diff mbox series

[v1,3/6] intel_iommu: Add a framework to check and sync host IOMMU cap/ecap

Message ID 20240228094432.1092748-4-zhenzhong.duan@intel.com (mailing list archive)
State New, archived
Headers show
Series Check and sync host IOMMU cap/ecap with vIOMMU | expand

Commit Message

Duan, Zhenzhong Feb. 28, 2024, 9:44 a.m. UTC
From: Yi Liu <yi.l.liu@intel.com>

Add a framework to check and synchronize host IOMMU cap/ecap with
vIOMMU cap/ecap.

The sequence will be:

vtd_cap_init() initializes iommu->cap/ecap.
vtd_check_hdev() update iommu->cap/ecap based on host cap/ecap.
iommu->cap_frozen set when machine create done, iommu->cap/ecap become readonly.

Implementation details for different backends will be in following patches.

Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 include/hw/i386/intel_iommu.h |  1 +
 hw/i386/intel_iommu.c         | 50 ++++++++++++++++++++++++++++++++++-
 2 files changed, 50 insertions(+), 1 deletion(-)

Comments

Michael S. Tsirkin March 12, 2024, 5:03 p.m. UTC | #1
On Wed, Feb 28, 2024 at 05:44:29PM +0800, Zhenzhong Duan wrote:
> From: Yi Liu <yi.l.liu@intel.com>
> 
> Add a framework to check and synchronize host IOMMU cap/ecap with
> vIOMMU cap/ecap.
> 
> The sequence will be:
> 
> vtd_cap_init() initializes iommu->cap/ecap.
> vtd_check_hdev() update iommu->cap/ecap based on host cap/ecap.
> iommu->cap_frozen set when machine create done, iommu->cap/ecap become readonly.
> 
> Implementation details for different backends will be in following patches.
> 
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---
>  include/hw/i386/intel_iommu.h |  1 +
>  hw/i386/intel_iommu.c         | 50 ++++++++++++++++++++++++++++++++++-
>  2 files changed, 50 insertions(+), 1 deletion(-)
> 
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index bbc7b96add..c71a133820 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -283,6 +283,7 @@ struct IntelIOMMUState {
>  
>      uint64_t cap;                   /* The value of capability reg */
>      uint64_t ecap;                  /* The value of extended capability reg */
> +    bool cap_frozen;                /* cap/ecap become read-only after frozen */
>  
>      uint32_t context_cache_gen;     /* Should be in [1,MAX] */
>      GHashTable *iotlb;              /* IOTLB */
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index ffa1ad6429..a9f9dfd6a7 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -35,6 +35,8 @@
>  #include "sysemu/kvm.h"
>  #include "sysemu/dma.h"
>  #include "sysemu/sysemu.h"
> +#include "hw/vfio/vfio-common.h"
> +#include "sysemu/iommufd.h"
>  #include "hw/i386/apic_internal.h"
>  #include "kvm/kvm_i386.h"
>  #include "migration/vmstate.h"
> @@ -3819,6 +3821,38 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus,
>      return vtd_dev_as;
>  }
>  
> +static int vtd_check_legacy_hdev(IntelIOMMUState *s,
> +                                 IOMMULegacyDevice *ldev,
> +                                 Error **errp)
> +{
> +    return 0;
> +}
> +
> +static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
> +                                  IOMMUFDDevice *idev,
> +                                  Error **errp)
> +{
> +    return 0;
> +}
> +
> +static int vtd_check_hdev(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hdev,
> +                          Error **errp)
> +{
> +    HostIOMMUDevice *base_dev = vtd_hdev->dev;
> +    IOMMUFDDevice *idev;
> +
> +    if (base_dev->type == HID_LEGACY) {
> +        IOMMULegacyDevice *ldev = container_of(base_dev,
> +                                               IOMMULegacyDevice, base);
> +
> +        return vtd_check_legacy_hdev(s, ldev, errp);
> +    }
> +
> +    idev = container_of(base_dev, IOMMUFDDevice, base);
> +
> +    return vtd_check_iommufd_hdev(s, idev, errp);
> +}
> +
>  static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
>                                      HostIOMMUDevice *base_dev, Error **errp)
>  {
> @@ -3829,6 +3863,7 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
>          .devfn = devfn,
>      };
>      struct vtd_as_key *new_key;
> +    int ret;
>  
>      assert(base_dev);
>  
> @@ -3848,6 +3883,13 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
>      vtd_hdev->iommu_state = s;
>      vtd_hdev->dev = base_dev;
>  
> +    ret = vtd_check_hdev(s, vtd_hdev, errp);
> +    if (ret) {
> +        g_free(vtd_hdev);
> +        vtd_iommu_unlock(s);
> +        return ret;
> +    }
> +
>      new_key = g_malloc(sizeof(*new_key));
>      new_key->bus = bus;
>      new_key->devfn = devfn;


Okay. So when VFIO device is created, it will call vtd_dev_set_iommu_device
and that in turn will update caps.




> @@ -4083,7 +4125,9 @@ static void vtd_init(IntelIOMMUState *s)
>      s->iq_dw = false;
>      s->next_frcd_reg = 0;
>  
> -    vtd_cap_init(s);
> +    if (!s->cap_frozen) {
> +        vtd_cap_init(s);
> +    }
>  

If it's fronzen it's because VFIO was added after machine done.
And then what? I think caps are just wrong?


I think the way to approach this is just by specifying this
as an option on command line.

So if one wants VFIO one has to sync caps with host.
No?



>      /*
>       * Rsvd field masks for spte
> @@ -4254,6 +4298,10 @@ static int vtd_machine_done_notify_one(Object *child, void *unused)
>  
>  static void vtd_machine_done_hook(Notifier *notifier, void *unused)
>  {
> +    IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default());
> +
> +    iommu->cap_frozen = true;
> +
>      object_child_foreach_recursive(object_get_root(),
>                                     vtd_machine_done_notify_one, NULL);
>  }
> -- 
> 2.34.1
Duan, Zhenzhong March 13, 2024, 2:52 a.m. UTC | #2
Hi Michael,

>-----Original Message-----
>From: Michael S. Tsirkin <mst@redhat.com>
>Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check and
>sync host IOMMU cap/ecap
>
>On Wed, Feb 28, 2024 at 05:44:29PM +0800, Zhenzhong Duan wrote:
>> From: Yi Liu <yi.l.liu@intel.com>
>>
>> Add a framework to check and synchronize host IOMMU cap/ecap with
>> vIOMMU cap/ecap.
>>
>> The sequence will be:
>>
>> vtd_cap_init() initializes iommu->cap/ecap.
>> vtd_check_hdev() update iommu->cap/ecap based on host cap/ecap.
>> iommu->cap_frozen set when machine create done, iommu->cap/ecap
>become readonly.
>>
>> Implementation details for different backends will be in following patches.
>>
>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>>  include/hw/i386/intel_iommu.h |  1 +
>>  hw/i386/intel_iommu.c         | 50
>++++++++++++++++++++++++++++++++++-
>>  2 files changed, 50 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/hw/i386/intel_iommu.h
>b/include/hw/i386/intel_iommu.h
>> index bbc7b96add..c71a133820 100644
>> --- a/include/hw/i386/intel_iommu.h
>> +++ b/include/hw/i386/intel_iommu.h
>> @@ -283,6 +283,7 @@ struct IntelIOMMUState {
>>
>>      uint64_t cap;                   /* The value of capability reg */
>>      uint64_t ecap;                  /* The value of extended capability reg */
>> +    bool cap_frozen;                /* cap/ecap become read-only after frozen */
>>
>>      uint32_t context_cache_gen;     /* Should be in [1,MAX] */
>>      GHashTable *iotlb;              /* IOTLB */
>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> index ffa1ad6429..a9f9dfd6a7 100644
>> --- a/hw/i386/intel_iommu.c
>> +++ b/hw/i386/intel_iommu.c
>> @@ -35,6 +35,8 @@
>>  #include "sysemu/kvm.h"
>>  #include "sysemu/dma.h"
>>  #include "sysemu/sysemu.h"
>> +#include "hw/vfio/vfio-common.h"
>> +#include "sysemu/iommufd.h"
>>  #include "hw/i386/apic_internal.h"
>>  #include "kvm/kvm_i386.h"
>>  #include "migration/vmstate.h"
>> @@ -3819,6 +3821,38 @@ VTDAddressSpace
>*vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus,
>>      return vtd_dev_as;
>>  }
>>
>> +static int vtd_check_legacy_hdev(IntelIOMMUState *s,
>> +                                 IOMMULegacyDevice *ldev,
>> +                                 Error **errp)
>> +{
>> +    return 0;
>> +}
>> +
>> +static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
>> +                                  IOMMUFDDevice *idev,
>> +                                  Error **errp)
>> +{
>> +    return 0;
>> +}
>> +
>> +static int vtd_check_hdev(IntelIOMMUState *s, VTDHostIOMMUDevice
>*vtd_hdev,
>> +                          Error **errp)
>> +{
>> +    HostIOMMUDevice *base_dev = vtd_hdev->dev;
>> +    IOMMUFDDevice *idev;
>> +
>> +    if (base_dev->type == HID_LEGACY) {
>> +        IOMMULegacyDevice *ldev = container_of(base_dev,
>> +                                               IOMMULegacyDevice, base);
>> +
>> +        return vtd_check_legacy_hdev(s, ldev, errp);
>> +    }
>> +
>> +    idev = container_of(base_dev, IOMMUFDDevice, base);
>> +
>> +    return vtd_check_iommufd_hdev(s, idev, errp);
>> +}
>> +
>>  static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int
>devfn,
>>                                      HostIOMMUDevice *base_dev, Error **errp)
>>  {
>> @@ -3829,6 +3863,7 @@ static int vtd_dev_set_iommu_device(PCIBus
>*bus, void *opaque, int devfn,
>>          .devfn = devfn,
>>      };
>>      struct vtd_as_key *new_key;
>> +    int ret;
>>
>>      assert(base_dev);
>>
>> @@ -3848,6 +3883,13 @@ static int vtd_dev_set_iommu_device(PCIBus
>*bus, void *opaque, int devfn,
>>      vtd_hdev->iommu_state = s;
>>      vtd_hdev->dev = base_dev;
>>
>> +    ret = vtd_check_hdev(s, vtd_hdev, errp);
>> +    if (ret) {
>> +        g_free(vtd_hdev);
>> +        vtd_iommu_unlock(s);
>> +        return ret;
>> +    }
>> +
>>      new_key = g_malloc(sizeof(*new_key));
>>      new_key->bus = bus;
>>      new_key->devfn = devfn;
>
>
>Okay. So when VFIO device is created, it will call vtd_dev_set_iommu_device
>and that in turn will update caps.
>
>
>
>
>> @@ -4083,7 +4125,9 @@ static void vtd_init(IntelIOMMUState *s)
>>      s->iq_dw = false;
>>      s->next_frcd_reg = 0;
>>
>> -    vtd_cap_init(s);
>> +    if (!s->cap_frozen) {
>> +        vtd_cap_init(s);
>> +    }
>>
>
>If it's fronzen it's because VFIO was added after machine done.
>And then what? I think caps are just wrong?

Not quite get your question on caps being wrong. But try to explains:

When a hot plugged vfio device's host iommu cap isn't compatible with
vIOMMU's, hotplug should fail. Currently there is no check for this and
allow hotplug to succeed, but then some issue will reveal later,
e.g., vIOMMU's MGAW > host IOMMU's MGAW, guest can setup iova
mapping beyond host supported iova range, then DMA will fail.

In fact, before this series, cap is not impacted by VFIO, so it's same effect of
frozen after machine done.

>
>
>I think the way to approach this is just by specifying this
>as an option on command line.

Do you mean add a cap_frozen property to intel_iommu?
Vtd_init() is called in realize() and system reset(), so I utilize realize() to init cap
and froze cap before system reset(). If cap_frozen is an option, when it's set to
false, cap could be updated every system reset and it's not a fix value any more.
This may break migration.

>
>So if one wants VFIO one has to sync caps with host.
>No?

Yes, check for compatibility. But it's not preventing the usage of VFIO
with vIOMMU, it finds the incompatible issue earlier and fail hotplug instead of
surprising guest driver failure.

Thanks
Zhenzhong

>
>
>
>>      /*
>>       * Rsvd field masks for spte
>> @@ -4254,6 +4298,10 @@ static int
>vtd_machine_done_notify_one(Object *child, void *unused)
>>
>>  static void vtd_machine_done_hook(Notifier *notifier, void *unused)
>>  {
>> +    IntelIOMMUState *iommu =
>INTEL_IOMMU_DEVICE(x86_iommu_get_default());
>> +
>> +    iommu->cap_frozen = true;
>> +
>>      object_child_foreach_recursive(object_get_root(),
>>                                     vtd_machine_done_notify_one, NULL);
>>  }
>> --
>> 2.34.1
Michael S. Tsirkin March 13, 2024, 7:07 a.m. UTC | #3
On Wed, Mar 13, 2024 at 02:52:39AM +0000, Duan, Zhenzhong wrote:
> Hi Michael,
> 
> >-----Original Message-----
> >From: Michael S. Tsirkin <mst@redhat.com>
> >Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check and
> >sync host IOMMU cap/ecap
> >
> >On Wed, Feb 28, 2024 at 05:44:29PM +0800, Zhenzhong Duan wrote:
> >> From: Yi Liu <yi.l.liu@intel.com>
> >>
> >> Add a framework to check and synchronize host IOMMU cap/ecap with
> >> vIOMMU cap/ecap.
> >>
> >> The sequence will be:
> >>
> >> vtd_cap_init() initializes iommu->cap/ecap.
> >> vtd_check_hdev() update iommu->cap/ecap based on host cap/ecap.
> >> iommu->cap_frozen set when machine create done, iommu->cap/ecap
> >become readonly.
> >>
> >> Implementation details for different backends will be in following patches.
> >>
> >> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> >> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> >> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> >> ---
> >>  include/hw/i386/intel_iommu.h |  1 +
> >>  hw/i386/intel_iommu.c         | 50
> >++++++++++++++++++++++++++++++++++-
> >>  2 files changed, 50 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/include/hw/i386/intel_iommu.h
> >b/include/hw/i386/intel_iommu.h
> >> index bbc7b96add..c71a133820 100644
> >> --- a/include/hw/i386/intel_iommu.h
> >> +++ b/include/hw/i386/intel_iommu.h
> >> @@ -283,6 +283,7 @@ struct IntelIOMMUState {
> >>
> >>      uint64_t cap;                   /* The value of capability reg */
> >>      uint64_t ecap;                  /* The value of extended capability reg */
> >> +    bool cap_frozen;                /* cap/ecap become read-only after frozen */
> >>
> >>      uint32_t context_cache_gen;     /* Should be in [1,MAX] */
> >>      GHashTable *iotlb;              /* IOTLB */
> >> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> >> index ffa1ad6429..a9f9dfd6a7 100644
> >> --- a/hw/i386/intel_iommu.c
> >> +++ b/hw/i386/intel_iommu.c
> >> @@ -35,6 +35,8 @@
> >>  #include "sysemu/kvm.h"
> >>  #include "sysemu/dma.h"
> >>  #include "sysemu/sysemu.h"
> >> +#include "hw/vfio/vfio-common.h"
> >> +#include "sysemu/iommufd.h"
> >>  #include "hw/i386/apic_internal.h"
> >>  #include "kvm/kvm_i386.h"
> >>  #include "migration/vmstate.h"
> >> @@ -3819,6 +3821,38 @@ VTDAddressSpace
> >*vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus,
> >>      return vtd_dev_as;
> >>  }
> >>
> >> +static int vtd_check_legacy_hdev(IntelIOMMUState *s,
> >> +                                 IOMMULegacyDevice *ldev,
> >> +                                 Error **errp)
> >> +{
> >> +    return 0;
> >> +}
> >> +
> >> +static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
> >> +                                  IOMMUFDDevice *idev,
> >> +                                  Error **errp)
> >> +{
> >> +    return 0;
> >> +}
> >> +
> >> +static int vtd_check_hdev(IntelIOMMUState *s, VTDHostIOMMUDevice
> >*vtd_hdev,
> >> +                          Error **errp)
> >> +{
> >> +    HostIOMMUDevice *base_dev = vtd_hdev->dev;
> >> +    IOMMUFDDevice *idev;
> >> +
> >> +    if (base_dev->type == HID_LEGACY) {
> >> +        IOMMULegacyDevice *ldev = container_of(base_dev,
> >> +                                               IOMMULegacyDevice, base);
> >> +
> >> +        return vtd_check_legacy_hdev(s, ldev, errp);
> >> +    }
> >> +
> >> +    idev = container_of(base_dev, IOMMUFDDevice, base);
> >> +
> >> +    return vtd_check_iommufd_hdev(s, idev, errp);
> >> +}
> >> +
> >>  static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int
> >devfn,
> >>                                      HostIOMMUDevice *base_dev, Error **errp)
> >>  {
> >> @@ -3829,6 +3863,7 @@ static int vtd_dev_set_iommu_device(PCIBus
> >*bus, void *opaque, int devfn,
> >>          .devfn = devfn,
> >>      };
> >>      struct vtd_as_key *new_key;
> >> +    int ret;
> >>
> >>      assert(base_dev);
> >>
> >> @@ -3848,6 +3883,13 @@ static int vtd_dev_set_iommu_device(PCIBus
> >*bus, void *opaque, int devfn,
> >>      vtd_hdev->iommu_state = s;
> >>      vtd_hdev->dev = base_dev;
> >>
> >> +    ret = vtd_check_hdev(s, vtd_hdev, errp);
> >> +    if (ret) {
> >> +        g_free(vtd_hdev);
> >> +        vtd_iommu_unlock(s);
> >> +        return ret;
> >> +    }
> >> +
> >>      new_key = g_malloc(sizeof(*new_key));
> >>      new_key->bus = bus;
> >>      new_key->devfn = devfn;
> >
> >
> >Okay. So when VFIO device is created, it will call vtd_dev_set_iommu_device
> >and that in turn will update caps.
> >
> >
> >
> >
> >> @@ -4083,7 +4125,9 @@ static void vtd_init(IntelIOMMUState *s)
> >>      s->iq_dw = false;
> >>      s->next_frcd_reg = 0;
> >>
> >> -    vtd_cap_init(s);
> >> +    if (!s->cap_frozen) {
> >> +        vtd_cap_init(s);
> >> +    }
> >>
> >
> >If it's fronzen it's because VFIO was added after machine done.
> >And then what? I think caps are just wrong?
> 
> Not quite get your question on caps being wrong. But try to explains:
> 
> When a hot plugged vfio device's host iommu cap isn't compatible with
> vIOMMU's, hotplug should fail. Currently there is no check for this and
> allow hotplug to succeed, but then some issue will reveal later,
> e.g., vIOMMU's MGAW > host IOMMU's MGAW, guest can setup iova
> mapping beyond host supported iova range, then DMA will fail.
> 
> In fact, before this series, cap is not impacted by VFIO, so it's same effect of
> frozen after machine done.
> 
> >
> >
> >I think the way to approach this is just by specifying this
> >as an option on command line.
> 
> Do you mean add a cap_frozen property to intel_iommu?
> Vtd_init() is called in realize() and system reset(), so I utilize realize() to init cap
> and froze cap before system reset(). If cap_frozen is an option, when it's set to
> false, cap could be updated every system reset and it's not a fix value any more.
> This may break migration.

No, I mean either
1. add some kind of vfio-iommu device that is not exposed to guest
   but is not hot pluggable

or

2. add a property to specify ecap, rely on management to set it correctly


> >
> >So if one wants VFIO one has to sync caps with host.
> >No?
> 
> Yes, check for compatibility. But it's not preventing the usage of VFIO
> with vIOMMU, it finds the incompatible issue earlier and fail hotplug instead of
> surprising guest driver failure.
> 
> Thanks
> Zhenzhong


I don't see where the check for compatibility and hotplug failure are.
Did I miss it?

> >
> >
> >
> >>      /*
> >>       * Rsvd field masks for spte
> >> @@ -4254,6 +4298,10 @@ static int
> >vtd_machine_done_notify_one(Object *child, void *unused)
> >>
> >>  static void vtd_machine_done_hook(Notifier *notifier, void *unused)
> >>  {
> >> +    IntelIOMMUState *iommu =
> >INTEL_IOMMU_DEVICE(x86_iommu_get_default());
> >> +
> >> +    iommu->cap_frozen = true;
> >> +
> >>      object_child_foreach_recursive(object_get_root(),
> >>                                     vtd_machine_done_notify_one, NULL);
> >>  }
> >> --
> >> 2.34.1
Duan, Zhenzhong March 13, 2024, 7:54 a.m. UTC | #4
>-----Original Message-----
>From: Michael S. Tsirkin <mst@redhat.com>
>Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check and
>sync host IOMMU cap/ecap
>
>On Wed, Mar 13, 2024 at 02:52:39AM +0000, Duan, Zhenzhong wrote:
>> Hi Michael,
>>
>> >-----Original Message-----
>> >From: Michael S. Tsirkin <mst@redhat.com>
>> >Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check and
>> >sync host IOMMU cap/ecap
>> >
>> >On Wed, Feb 28, 2024 at 05:44:29PM +0800, Zhenzhong Duan wrote:
>> >> From: Yi Liu <yi.l.liu@intel.com>
>> >>
>> >> Add a framework to check and synchronize host IOMMU cap/ecap with
>> >> vIOMMU cap/ecap.
>> >>
>> >> The sequence will be:
>> >>
>> >> vtd_cap_init() initializes iommu->cap/ecap.
>> >> vtd_check_hdev() update iommu->cap/ecap based on host cap/ecap.
>> >> iommu->cap_frozen set when machine create done, iommu->cap/ecap
>> >become readonly.
>> >>
>> >> Implementation details for different backends will be in following
>patches.
>> >>
>> >> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> >> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
>> >> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> >> ---
>> >>  include/hw/i386/intel_iommu.h |  1 +
>> >>  hw/i386/intel_iommu.c         | 50
>> >++++++++++++++++++++++++++++++++++-
>> >>  2 files changed, 50 insertions(+), 1 deletion(-)
>> >>
>> >> diff --git a/include/hw/i386/intel_iommu.h
>> >b/include/hw/i386/intel_iommu.h
>> >> index bbc7b96add..c71a133820 100644
>> >> --- a/include/hw/i386/intel_iommu.h
>> >> +++ b/include/hw/i386/intel_iommu.h
>> >> @@ -283,6 +283,7 @@ struct IntelIOMMUState {
>> >>
>> >>      uint64_t cap;                   /* The value of capability reg */
>> >>      uint64_t ecap;                  /* The value of extended capability reg */
>> >> +    bool cap_frozen;                /* cap/ecap become read-only after
>frozen */
>> >>
>> >>      uint32_t context_cache_gen;     /* Should be in [1,MAX] */
>> >>      GHashTable *iotlb;              /* IOTLB */
>> >> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> >> index ffa1ad6429..a9f9dfd6a7 100644
>> >> --- a/hw/i386/intel_iommu.c
>> >> +++ b/hw/i386/intel_iommu.c
>> >> @@ -35,6 +35,8 @@
>> >>  #include "sysemu/kvm.h"
>> >>  #include "sysemu/dma.h"
>> >>  #include "sysemu/sysemu.h"
>> >> +#include "hw/vfio/vfio-common.h"
>> >> +#include "sysemu/iommufd.h"
>> >>  #include "hw/i386/apic_internal.h"
>> >>  #include "kvm/kvm_i386.h"
>> >>  #include "migration/vmstate.h"
>> >> @@ -3819,6 +3821,38 @@ VTDAddressSpace
>> >*vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus,
>> >>      return vtd_dev_as;
>> >>  }
>> >>
>> >> +static int vtd_check_legacy_hdev(IntelIOMMUState *s,
>> >> +                                 IOMMULegacyDevice *ldev,
>> >> +                                 Error **errp)
>> >> +{
>> >> +    return 0;
>> >> +}
>> >> +
>> >> +static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
>> >> +                                  IOMMUFDDevice *idev,
>> >> +                                  Error **errp)
>> >> +{
>> >> +    return 0;
>> >> +}
>> >> +
>> >> +static int vtd_check_hdev(IntelIOMMUState *s,
>VTDHostIOMMUDevice
>> >*vtd_hdev,
>> >> +                          Error **errp)
>> >> +{
>> >> +    HostIOMMUDevice *base_dev = vtd_hdev->dev;
>> >> +    IOMMUFDDevice *idev;
>> >> +
>> >> +    if (base_dev->type == HID_LEGACY) {
>> >> +        IOMMULegacyDevice *ldev = container_of(base_dev,
>> >> +                                               IOMMULegacyDevice, base);
>> >> +
>> >> +        return vtd_check_legacy_hdev(s, ldev, errp);
>> >> +    }
>> >> +
>> >> +    idev = container_of(base_dev, IOMMUFDDevice, base);
>> >> +
>> >> +    return vtd_check_iommufd_hdev(s, idev, errp);
>> >> +}
>> >> +
>> >>  static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int
>> >devfn,
>> >>                                      HostIOMMUDevice *base_dev, Error **errp)
>> >>  {
>> >> @@ -3829,6 +3863,7 @@ static int
>vtd_dev_set_iommu_device(PCIBus
>> >*bus, void *opaque, int devfn,
>> >>          .devfn = devfn,
>> >>      };
>> >>      struct vtd_as_key *new_key;
>> >> +    int ret;
>> >>
>> >>      assert(base_dev);
>> >>
>> >> @@ -3848,6 +3883,13 @@ static int
>vtd_dev_set_iommu_device(PCIBus
>> >*bus, void *opaque, int devfn,
>> >>      vtd_hdev->iommu_state = s;
>> >>      vtd_hdev->dev = base_dev;
>> >>
>> >> +    ret = vtd_check_hdev(s, vtd_hdev, errp);
>> >> +    if (ret) {
>> >> +        g_free(vtd_hdev);
>> >> +        vtd_iommu_unlock(s);
>> >> +        return ret;
>> >> +    }
>> >> +
>> >>      new_key = g_malloc(sizeof(*new_key));
>> >>      new_key->bus = bus;
>> >>      new_key->devfn = devfn;
>> >
>> >
>> >Okay. So when VFIO device is created, it will call
>vtd_dev_set_iommu_device
>> >and that in turn will update caps.
>> >
>> >
>> >
>> >
>> >> @@ -4083,7 +4125,9 @@ static void vtd_init(IntelIOMMUState *s)
>> >>      s->iq_dw = false;
>> >>      s->next_frcd_reg = 0;
>> >>
>> >> -    vtd_cap_init(s);
>> >> +    if (!s->cap_frozen) {
>> >> +        vtd_cap_init(s);
>> >> +    }
>> >>
>> >
>> >If it's fronzen it's because VFIO was added after machine done.
>> >And then what? I think caps are just wrong?
>>
>> Not quite get your question on caps being wrong. But try to explains:
>>
>> When a hot plugged vfio device's host iommu cap isn't compatible with
>> vIOMMU's, hotplug should fail. Currently there is no check for this and
>> allow hotplug to succeed, but then some issue will reveal later,
>> e.g., vIOMMU's MGAW > host IOMMU's MGAW, guest can setup iova
>> mapping beyond host supported iova range, then DMA will fail.
>>
>> In fact, before this series, cap is not impacted by VFIO, so it's same effect of
>> frozen after machine done.
>>
>> >
>> >
>> >I think the way to approach this is just by specifying this
>> >as an option on command line.
>>
>> Do you mean add a cap_frozen property to intel_iommu?
>> Vtd_init() is called in realize() and system reset(), so I utilize realize() to init
>cap
>> and froze cap before system reset(). If cap_frozen is an option, when it's
>set to
>> false, cap could be updated every system reset and it's not a fix value any
>more.
>> This may break migration.
>
>No, I mean either
>1. add some kind of vfio-iommu device that is not exposed to guest
>   but is not hot pluggable

Not quite get, what will such vfio-iommu device be used for if not exposed to guest.
If we just want to avoid vfio device hotplug, we can use
TYPE_VFIO_PCI_NOHOTPLUG device. But even if we use coldplugged
vfio device, we will face same compatibility issue of vIOMMU caps vs. host caps.

>
>or
>
>2. add a property to specify ecap, rely on management to set it correctly

Yes. This deep customization could works, but I think it's difficult to teach user
about how to set each bit in cap/ecaps.

>
>
>> >
>> >So if one wants VFIO one has to sync caps with host.
>> >No?
>>
>> Yes, check for compatibility. But it's not preventing the usage of VFIO
>> with vIOMMU, it finds the incompatible issue earlier and fail hotplug
>instead of
>> surprising guest driver failure.
>>
>> Thanks
>> Zhenzhong
>
>
>I don't see where the check for compatibility and hotplug failure are.
>Did I miss it?

There is some check code in patch4, see https://lists.gnu.org/archive/html/qemu-devel/2024-02/msg06419.html

And more widely used in nesting series's patch6, see https://lists.gnu.org/archive/html/qemu-devel/2024-01/msg02746.html
I have not sent a new version of nesting series yet, but the code related to caps check/sync will be
similar.

Thanks
Zhenzhong
Michael S. Tsirkin March 13, 2024, 11:17 a.m. UTC | #5
On Wed, Mar 13, 2024 at 07:54:11AM +0000, Duan, Zhenzhong wrote:
> 
> 
> >-----Original Message-----
> >From: Michael S. Tsirkin <mst@redhat.com>
> >Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check and
> >sync host IOMMU cap/ecap
> >
> >On Wed, Mar 13, 2024 at 02:52:39AM +0000, Duan, Zhenzhong wrote:
> >> Hi Michael,
> >>
> >> >-----Original Message-----
> >> >From: Michael S. Tsirkin <mst@redhat.com>
> >> >Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check and
> >> >sync host IOMMU cap/ecap
> >> >
> >> >On Wed, Feb 28, 2024 at 05:44:29PM +0800, Zhenzhong Duan wrote:
> >> >> From: Yi Liu <yi.l.liu@intel.com>
> >> >>
> >> >> Add a framework to check and synchronize host IOMMU cap/ecap with
> >> >> vIOMMU cap/ecap.
> >> >>
> >> >> The sequence will be:
> >> >>
> >> >> vtd_cap_init() initializes iommu->cap/ecap.
> >> >> vtd_check_hdev() update iommu->cap/ecap based on host cap/ecap.
> >> >> iommu->cap_frozen set when machine create done, iommu->cap/ecap
> >> >become readonly.
> >> >>
> >> >> Implementation details for different backends will be in following
> >patches.
> >> >>
> >> >> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> >> >> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> >> >> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> >> >> ---
> >> >>  include/hw/i386/intel_iommu.h |  1 +
> >> >>  hw/i386/intel_iommu.c         | 50
> >> >++++++++++++++++++++++++++++++++++-
> >> >>  2 files changed, 50 insertions(+), 1 deletion(-)
> >> >>
> >> >> diff --git a/include/hw/i386/intel_iommu.h
> >> >b/include/hw/i386/intel_iommu.h
> >> >> index bbc7b96add..c71a133820 100644
> >> >> --- a/include/hw/i386/intel_iommu.h
> >> >> +++ b/include/hw/i386/intel_iommu.h
> >> >> @@ -283,6 +283,7 @@ struct IntelIOMMUState {
> >> >>
> >> >>      uint64_t cap;                   /* The value of capability reg */
> >> >>      uint64_t ecap;                  /* The value of extended capability reg */
> >> >> +    bool cap_frozen;                /* cap/ecap become read-only after
> >frozen */
> >> >>
> >> >>      uint32_t context_cache_gen;     /* Should be in [1,MAX] */
> >> >>      GHashTable *iotlb;              /* IOTLB */
> >> >> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> >> >> index ffa1ad6429..a9f9dfd6a7 100644
> >> >> --- a/hw/i386/intel_iommu.c
> >> >> +++ b/hw/i386/intel_iommu.c
> >> >> @@ -35,6 +35,8 @@
> >> >>  #include "sysemu/kvm.h"
> >> >>  #include "sysemu/dma.h"
> >> >>  #include "sysemu/sysemu.h"
> >> >> +#include "hw/vfio/vfio-common.h"
> >> >> +#include "sysemu/iommufd.h"
> >> >>  #include "hw/i386/apic_internal.h"
> >> >>  #include "kvm/kvm_i386.h"
> >> >>  #include "migration/vmstate.h"
> >> >> @@ -3819,6 +3821,38 @@ VTDAddressSpace
> >> >*vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus,
> >> >>      return vtd_dev_as;
> >> >>  }
> >> >>
> >> >> +static int vtd_check_legacy_hdev(IntelIOMMUState *s,
> >> >> +                                 IOMMULegacyDevice *ldev,
> >> >> +                                 Error **errp)
> >> >> +{
> >> >> +    return 0;
> >> >> +}
> >> >> +
> >> >> +static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
> >> >> +                                  IOMMUFDDevice *idev,
> >> >> +                                  Error **errp)
> >> >> +{
> >> >> +    return 0;
> >> >> +}
> >> >> +
> >> >> +static int vtd_check_hdev(IntelIOMMUState *s,
> >VTDHostIOMMUDevice
> >> >*vtd_hdev,
> >> >> +                          Error **errp)
> >> >> +{
> >> >> +    HostIOMMUDevice *base_dev = vtd_hdev->dev;
> >> >> +    IOMMUFDDevice *idev;
> >> >> +
> >> >> +    if (base_dev->type == HID_LEGACY) {
> >> >> +        IOMMULegacyDevice *ldev = container_of(base_dev,
> >> >> +                                               IOMMULegacyDevice, base);
> >> >> +
> >> >> +        return vtd_check_legacy_hdev(s, ldev, errp);
> >> >> +    }
> >> >> +
> >> >> +    idev = container_of(base_dev, IOMMUFDDevice, base);
> >> >> +
> >> >> +    return vtd_check_iommufd_hdev(s, idev, errp);
> >> >> +}
> >> >> +
> >> >>  static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int
> >> >devfn,
> >> >>                                      HostIOMMUDevice *base_dev, Error **errp)
> >> >>  {
> >> >> @@ -3829,6 +3863,7 @@ static int
> >vtd_dev_set_iommu_device(PCIBus
> >> >*bus, void *opaque, int devfn,
> >> >>          .devfn = devfn,
> >> >>      };
> >> >>      struct vtd_as_key *new_key;
> >> >> +    int ret;
> >> >>
> >> >>      assert(base_dev);
> >> >>
> >> >> @@ -3848,6 +3883,13 @@ static int
> >vtd_dev_set_iommu_device(PCIBus
> >> >*bus, void *opaque, int devfn,
> >> >>      vtd_hdev->iommu_state = s;
> >> >>      vtd_hdev->dev = base_dev;
> >> >>
> >> >> +    ret = vtd_check_hdev(s, vtd_hdev, errp);
> >> >> +    if (ret) {
> >> >> +        g_free(vtd_hdev);
> >> >> +        vtd_iommu_unlock(s);
> >> >> +        return ret;
> >> >> +    }
> >> >> +
> >> >>      new_key = g_malloc(sizeof(*new_key));
> >> >>      new_key->bus = bus;
> >> >>      new_key->devfn = devfn;
> >> >
> >> >
> >> >Okay. So when VFIO device is created, it will call
> >vtd_dev_set_iommu_device
> >> >and that in turn will update caps.
> >> >
> >> >
> >> >
> >> >
> >> >> @@ -4083,7 +4125,9 @@ static void vtd_init(IntelIOMMUState *s)
> >> >>      s->iq_dw = false;
> >> >>      s->next_frcd_reg = 0;
> >> >>
> >> >> -    vtd_cap_init(s);
> >> >> +    if (!s->cap_frozen) {
> >> >> +        vtd_cap_init(s);
> >> >> +    }
> >> >>
> >> >
> >> >If it's fronzen it's because VFIO was added after machine done.
> >> >And then what? I think caps are just wrong?
> >>
> >> Not quite get your question on caps being wrong. But try to explains:
> >>
> >> When a hot plugged vfio device's host iommu cap isn't compatible with
> >> vIOMMU's, hotplug should fail. Currently there is no check for this and
> >> allow hotplug to succeed, but then some issue will reveal later,
> >> e.g., vIOMMU's MGAW > host IOMMU's MGAW, guest can setup iova
> >> mapping beyond host supported iova range, then DMA will fail.
> >>
> >> In fact, before this series, cap is not impacted by VFIO, so it's same effect of
> >> frozen after machine done.
> >>
> >> >
> >> >
> >> >I think the way to approach this is just by specifying this
> >> >as an option on command line.
> >>
> >> Do you mean add a cap_frozen property to intel_iommu?
> >> Vtd_init() is called in realize() and system reset(), so I utilize realize() to init
> >cap
> >> and froze cap before system reset(). If cap_frozen is an option, when it's
> >set to
> >> false, cap could be updated every system reset and it's not a fix value any
> >more.
> >> This may break migration.
> >
> >No, I mean either
> >1. add some kind of vfio-iommu device that is not exposed to guest
> >   but is not hot pluggable
> 
> Not quite get, what will such vfio-iommu device be used for if not exposed to guest.

It will update the IOMMU.
And do so without need for tricky callbacks.
Duan, Zhenzhong March 14, 2024, 4:05 a.m. UTC | #6
>-----Original Message-----
>From: Michael S. Tsirkin <mst@redhat.com>
>Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check and
>sync host IOMMU cap/ecap
>
>On Wed, Mar 13, 2024 at 07:54:11AM +0000, Duan, Zhenzhong wrote:
>>
>>
>> >-----Original Message-----
>> >From: Michael S. Tsirkin <mst@redhat.com>
>> >Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check and
>> >sync host IOMMU cap/ecap
>> >
>> >On Wed, Mar 13, 2024 at 02:52:39AM +0000, Duan, Zhenzhong wrote:
>> >> Hi Michael,
>> >>
>> >> >-----Original Message-----
>> >> >From: Michael S. Tsirkin <mst@redhat.com>
>> >> >Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check
>and
>> >> >sync host IOMMU cap/ecap
>> >> >
>> >> >On Wed, Feb 28, 2024 at 05:44:29PM +0800, Zhenzhong Duan wrote:
>> >> >> From: Yi Liu <yi.l.liu@intel.com>
>> >> >>
>> >> >> Add a framework to check and synchronize host IOMMU cap/ecap
>with
>> >> >> vIOMMU cap/ecap.
>> >> >>
>> >> >> The sequence will be:
>> >> >>
>> >> >> vtd_cap_init() initializes iommu->cap/ecap.
>> >> >> vtd_check_hdev() update iommu->cap/ecap based on host cap/ecap.
>> >> >> iommu->cap_frozen set when machine create done, iommu-
>>cap/ecap
>> >> >become readonly.
>> >> >>
>> >> >> Implementation details for different backends will be in following
>> >patches.
>> >> >>
>> >> >> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> >> >> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
>> >> >> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> >> >> ---
>> >> >>  include/hw/i386/intel_iommu.h |  1 +
>> >> >>  hw/i386/intel_iommu.c         | 50
>> >> >++++++++++++++++++++++++++++++++++-
>> >> >>  2 files changed, 50 insertions(+), 1 deletion(-)
>> >> >>
>> >> >> diff --git a/include/hw/i386/intel_iommu.h
>> >> >b/include/hw/i386/intel_iommu.h
>> >> >> index bbc7b96add..c71a133820 100644
>> >> >> --- a/include/hw/i386/intel_iommu.h
>> >> >> +++ b/include/hw/i386/intel_iommu.h
>> >> >> @@ -283,6 +283,7 @@ struct IntelIOMMUState {
>> >> >>
>> >> >>      uint64_t cap;                   /* The value of capability reg */
>> >> >>      uint64_t ecap;                  /* The value of extended capability reg
>*/
>> >> >> +    bool cap_frozen;                /* cap/ecap become read-only after
>> >frozen */
>> >> >>
>> >> >>      uint32_t context_cache_gen;     /* Should be in [1,MAX] */
>> >> >>      GHashTable *iotlb;              /* IOTLB */
>> >> >> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> >> >> index ffa1ad6429..a9f9dfd6a7 100644
>> >> >> --- a/hw/i386/intel_iommu.c
>> >> >> +++ b/hw/i386/intel_iommu.c
>> >> >> @@ -35,6 +35,8 @@
>> >> >>  #include "sysemu/kvm.h"
>> >> >>  #include "sysemu/dma.h"
>> >> >>  #include "sysemu/sysemu.h"
>> >> >> +#include "hw/vfio/vfio-common.h"
>> >> >> +#include "sysemu/iommufd.h"
>> >> >>  #include "hw/i386/apic_internal.h"
>> >> >>  #include "kvm/kvm_i386.h"
>> >> >>  #include "migration/vmstate.h"
>> >> >> @@ -3819,6 +3821,38 @@ VTDAddressSpace
>> >> >*vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus,
>> >> >>      return vtd_dev_as;
>> >> >>  }
>> >> >>
>> >> >> +static int vtd_check_legacy_hdev(IntelIOMMUState *s,
>> >> >> +                                 IOMMULegacyDevice *ldev,
>> >> >> +                                 Error **errp)
>> >> >> +{
>> >> >> +    return 0;
>> >> >> +}
>> >> >> +
>> >> >> +static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
>> >> >> +                                  IOMMUFDDevice *idev,
>> >> >> +                                  Error **errp)
>> >> >> +{
>> >> >> +    return 0;
>> >> >> +}
>> >> >> +
>> >> >> +static int vtd_check_hdev(IntelIOMMUState *s,
>> >VTDHostIOMMUDevice
>> >> >*vtd_hdev,
>> >> >> +                          Error **errp)
>> >> >> +{
>> >> >> +    HostIOMMUDevice *base_dev = vtd_hdev->dev;
>> >> >> +    IOMMUFDDevice *idev;
>> >> >> +
>> >> >> +    if (base_dev->type == HID_LEGACY) {
>> >> >> +        IOMMULegacyDevice *ldev = container_of(base_dev,
>> >> >> +                                               IOMMULegacyDevice, base);
>> >> >> +
>> >> >> +        return vtd_check_legacy_hdev(s, ldev, errp);
>> >> >> +    }
>> >> >> +
>> >> >> +    idev = container_of(base_dev, IOMMUFDDevice, base);
>> >> >> +
>> >> >> +    return vtd_check_iommufd_hdev(s, idev, errp);
>> >> >> +}
>> >> >> +
>> >> >>  static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque,
>int
>> >> >devfn,
>> >> >>                                      HostIOMMUDevice *base_dev, Error **errp)
>> >> >>  {
>> >> >> @@ -3829,6 +3863,7 @@ static int
>> >vtd_dev_set_iommu_device(PCIBus
>> >> >*bus, void *opaque, int devfn,
>> >> >>          .devfn = devfn,
>> >> >>      };
>> >> >>      struct vtd_as_key *new_key;
>> >> >> +    int ret;
>> >> >>
>> >> >>      assert(base_dev);
>> >> >>
>> >> >> @@ -3848,6 +3883,13 @@ static int
>> >vtd_dev_set_iommu_device(PCIBus
>> >> >*bus, void *opaque, int devfn,
>> >> >>      vtd_hdev->iommu_state = s;
>> >> >>      vtd_hdev->dev = base_dev;
>> >> >>
>> >> >> +    ret = vtd_check_hdev(s, vtd_hdev, errp);
>> >> >> +    if (ret) {
>> >> >> +        g_free(vtd_hdev);
>> >> >> +        vtd_iommu_unlock(s);
>> >> >> +        return ret;
>> >> >> +    }
>> >> >> +
>> >> >>      new_key = g_malloc(sizeof(*new_key));
>> >> >>      new_key->bus = bus;
>> >> >>      new_key->devfn = devfn;
>> >> >
>> >> >
>> >> >Okay. So when VFIO device is created, it will call
>> >vtd_dev_set_iommu_device
>> >> >and that in turn will update caps.
>> >> >
>> >> >
>> >> >
>> >> >
>> >> >> @@ -4083,7 +4125,9 @@ static void vtd_init(IntelIOMMUState *s)
>> >> >>      s->iq_dw = false;
>> >> >>      s->next_frcd_reg = 0;
>> >> >>
>> >> >> -    vtd_cap_init(s);
>> >> >> +    if (!s->cap_frozen) {
>> >> >> +        vtd_cap_init(s);
>> >> >> +    }
>> >> >>
>> >> >
>> >> >If it's fronzen it's because VFIO was added after machine done.
>> >> >And then what? I think caps are just wrong?
>> >>
>> >> Not quite get your question on caps being wrong. But try to explains:
>> >>
>> >> When a hot plugged vfio device's host iommu cap isn't compatible with
>> >> vIOMMU's, hotplug should fail. Currently there is no check for this and
>> >> allow hotplug to succeed, but then some issue will reveal later,
>> >> e.g., vIOMMU's MGAW > host IOMMU's MGAW, guest can setup iova
>> >> mapping beyond host supported iova range, then DMA will fail.
>> >>
>> >> In fact, before this series, cap is not impacted by VFIO, so it's same
>effect of
>> >> frozen after machine done.
>> >>
>> >> >
>> >> >
>> >> >I think the way to approach this is just by specifying this
>> >> >as an option on command line.
>> >>
>> >> Do you mean add a cap_frozen property to intel_iommu?
>> >> Vtd_init() is called in realize() and system reset(), so I utilize realize() to
>init
>> >cap
>> >> and froze cap before system reset(). If cap_frozen is an option, when it's
>> >set to
>> >> false, cap could be updated every system reset and it's not a fix value
>any
>> >more.
>> >> This may break migration.
>> >
>> >No, I mean either
>> >1. add some kind of vfio-iommu device that is not exposed to guest
>> >   but is not hot pluggable
>>
>> Not quite get, what will such vfio-iommu device be used for if not exposed
>to guest.
>
>It will update the IOMMU.
>And do so without need for tricky callbacks.

Sorry to bother you again, just want to get clear understanding to your suggestions.

1. Is vfio-iommu device type inherited from TYPE_VFIO_PCI_NOHOTPLUG?

2. How to avoid tricky set/unset_iommu_device callbacks with vfio-iommu device,
    if we want to pass vfio-iommu device to vIOMMU to update caps?

3. Do you mean we can loop on vfio_device_list to get vfio-iommu device?
    We want to support vdpa device in future, also need to loop vdpa device list;
    also need to bypass vfio device whose host bridge configured IOMMU bypassed.

4. It looks, with vfio-iommu device hotplug is not supported, or I misunderstand?

Thanks
Zhenzhong
Eric Auger March 18, 2024, 1:20 p.m. UTC | #7
Hi Michael,

On 3/13/24 12:17, Michael S. Tsirkin wrote:
> On Wed, Mar 13, 2024 at 07:54:11AM +0000, Duan, Zhenzhong wrote:
>>
>>> -----Original Message-----
>>> From: Michael S. Tsirkin <mst@redhat.com>
>>> Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check and
>>> sync host IOMMU cap/ecap
>>>
>>> On Wed, Mar 13, 2024 at 02:52:39AM +0000, Duan, Zhenzhong wrote:
>>>> Hi Michael,
>>>>
>>>>> -----Original Message-----
>>>>> From: Michael S. Tsirkin <mst@redhat.com>
>>>>> Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check and
>>>>> sync host IOMMU cap/ecap
>>>>>
>>>>> On Wed, Feb 28, 2024 at 05:44:29PM +0800, Zhenzhong Duan wrote:
>>>>>> From: Yi Liu <yi.l.liu@intel.com>
>>>>>>
>>>>>> Add a framework to check and synchronize host IOMMU cap/ecap with
>>>>>> vIOMMU cap/ecap.
>>>>>>
>>>>>> The sequence will be:
>>>>>>
>>>>>> vtd_cap_init() initializes iommu->cap/ecap.
>>>>>> vtd_check_hdev() update iommu->cap/ecap based on host cap/ecap.
>>>>>> iommu->cap_frozen set when machine create done, iommu->cap/ecap
>>>>> become readonly.
>>>>>> Implementation details for different backends will be in following
>>> patches.
>>>>>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>>>>>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
>>>>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>>>>>> ---
>>>>>>  include/hw/i386/intel_iommu.h |  1 +
>>>>>>  hw/i386/intel_iommu.c         | 50
>>>>> ++++++++++++++++++++++++++++++++++-
>>>>>>  2 files changed, 50 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/include/hw/i386/intel_iommu.h
>>>>> b/include/hw/i386/intel_iommu.h
>>>>>> index bbc7b96add..c71a133820 100644
>>>>>> --- a/include/hw/i386/intel_iommu.h
>>>>>> +++ b/include/hw/i386/intel_iommu.h
>>>>>> @@ -283,6 +283,7 @@ struct IntelIOMMUState {
>>>>>>
>>>>>>      uint64_t cap;                   /* The value of capability reg */
>>>>>>      uint64_t ecap;                  /* The value of extended capability reg */
>>>>>> +    bool cap_frozen;                /* cap/ecap become read-only after
>>> frozen */
>>>>>>      uint32_t context_cache_gen;     /* Should be in [1,MAX] */
>>>>>>      GHashTable *iotlb;              /* IOTLB */
>>>>>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>>>>>> index ffa1ad6429..a9f9dfd6a7 100644
>>>>>> --- a/hw/i386/intel_iommu.c
>>>>>> +++ b/hw/i386/intel_iommu.c
>>>>>> @@ -35,6 +35,8 @@
>>>>>>  #include "sysemu/kvm.h"
>>>>>>  #include "sysemu/dma.h"
>>>>>>  #include "sysemu/sysemu.h"
>>>>>> +#include "hw/vfio/vfio-common.h"
>>>>>> +#include "sysemu/iommufd.h"
>>>>>>  #include "hw/i386/apic_internal.h"
>>>>>>  #include "kvm/kvm_i386.h"
>>>>>>  #include "migration/vmstate.h"
>>>>>> @@ -3819,6 +3821,38 @@ VTDAddressSpace
>>>>> *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus,
>>>>>>      return vtd_dev_as;
>>>>>>  }
>>>>>>
>>>>>> +static int vtd_check_legacy_hdev(IntelIOMMUState *s,
>>>>>> +                                 IOMMULegacyDevice *ldev,
>>>>>> +                                 Error **errp)
>>>>>> +{
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
>>>>>> +                                  IOMMUFDDevice *idev,
>>>>>> +                                  Error **errp)
>>>>>> +{
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static int vtd_check_hdev(IntelIOMMUState *s,
>>> VTDHostIOMMUDevice
>>>>> *vtd_hdev,
>>>>>> +                          Error **errp)
>>>>>> +{
>>>>>> +    HostIOMMUDevice *base_dev = vtd_hdev->dev;
>>>>>> +    IOMMUFDDevice *idev;
>>>>>> +
>>>>>> +    if (base_dev->type == HID_LEGACY) {
>>>>>> +        IOMMULegacyDevice *ldev = container_of(base_dev,
>>>>>> +                                               IOMMULegacyDevice, base);
>>>>>> +
>>>>>> +        return vtd_check_legacy_hdev(s, ldev, errp);
>>>>>> +    }
>>>>>> +
>>>>>> +    idev = container_of(base_dev, IOMMUFDDevice, base);
>>>>>> +
>>>>>> +    return vtd_check_iommufd_hdev(s, idev, errp);
>>>>>> +}
>>>>>> +
>>>>>>  static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int
>>>>> devfn,
>>>>>>                                      HostIOMMUDevice *base_dev, Error **errp)
>>>>>>  {
>>>>>> @@ -3829,6 +3863,7 @@ static int
>>> vtd_dev_set_iommu_device(PCIBus
>>>>> *bus, void *opaque, int devfn,
>>>>>>          .devfn = devfn,
>>>>>>      };
>>>>>>      struct vtd_as_key *new_key;
>>>>>> +    int ret;
>>>>>>
>>>>>>      assert(base_dev);
>>>>>>
>>>>>> @@ -3848,6 +3883,13 @@ static int
>>> vtd_dev_set_iommu_device(PCIBus
>>>>> *bus, void *opaque, int devfn,
>>>>>>      vtd_hdev->iommu_state = s;
>>>>>>      vtd_hdev->dev = base_dev;
>>>>>>
>>>>>> +    ret = vtd_check_hdev(s, vtd_hdev, errp);
>>>>>> +    if (ret) {
>>>>>> +        g_free(vtd_hdev);
>>>>>> +        vtd_iommu_unlock(s);
>>>>>> +        return ret;
>>>>>> +    }
>>>>>> +
>>>>>>      new_key = g_malloc(sizeof(*new_key));
>>>>>>      new_key->bus = bus;
>>>>>>      new_key->devfn = devfn;
>>>>>
>>>>> Okay. So when VFIO device is created, it will call
>>> vtd_dev_set_iommu_device
>>>>> and that in turn will update caps.
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>> @@ -4083,7 +4125,9 @@ static void vtd_init(IntelIOMMUState *s)
>>>>>>      s->iq_dw = false;
>>>>>>      s->next_frcd_reg = 0;
>>>>>>
>>>>>> -    vtd_cap_init(s);
>>>>>> +    if (!s->cap_frozen) {
>>>>>> +        vtd_cap_init(s);
>>>>>> +    }
>>>>>>
>>>>> If it's fronzen it's because VFIO was added after machine done.
>>>>> And then what? I think caps are just wrong?
>>>> Not quite get your question on caps being wrong. But try to explains:
>>>>
>>>> When a hot plugged vfio device's host iommu cap isn't compatible with
>>>> vIOMMU's, hotplug should fail. Currently there is no check for this and
>>>> allow hotplug to succeed, but then some issue will reveal later,
>>>> e.g., vIOMMU's MGAW > host IOMMU's MGAW, guest can setup iova
>>>> mapping beyond host supported iova range, then DMA will fail.
>>>>
>>>> In fact, before this series, cap is not impacted by VFIO, so it's same effect of
>>>> frozen after machine done.
>>>>
>>>>>
>>>>> I think the way to approach this is just by specifying this
>>>>> as an option on command line.
>>>> Do you mean add a cap_frozen property to intel_iommu?
>>>> Vtd_init() is called in realize() and system reset(), so I utilize realize() to init
>>> cap
>>>> and froze cap before system reset(). If cap_frozen is an option, when it's
>>> set to
>>>> false, cap could be updated every system reset and it's not a fix value any
>>> more.
>>>> This may break migration.
>>> No, I mean either
>>> 1. add some kind of vfio-iommu device that is not exposed to guest
>>>   but is not hot pluggable
>> Not quite get, what will such vfio-iommu device be used for if not exposed to guest.
> It will update the IOMMU.
> And do so without need for tricky callbacks.
>

At the moment the only way VFIO can pass info to vIOMMU is through the
IOMMU MR API (IOMMUMemoryRegionClass).
Unfortunately this API is not fully adapted to new use cases because it
relies on the IOMMU MR to be enabled which causes the VFIO

vfio_listener_region_add() to be called and call the relevant IOMMU MR callback. Before the IOMMU MR enablement there is no way to convey info from VFIO to the vIOMMU. That why, for several years and since the beginning of discussions related to nested IOMMU, Peter Xu, Yi Liu, myself headed towards the usage of PCIIOMMUOps instead.


You will find in 
[RFC 0/7] VIRTIO-IOMMU/VFIO: Fix host iommu geometry handling for hotplugged devices
https://lore.kernel.org/all/20240117080414.316890-1-eric.auger@redhat.com/
yet another example of such kind of PCIIOMMUOps. In that series instead of relying on a HostIOMMUDevice object as proposed by Zhenzhong it uses a direct callback but it is still the same principle and the HostIOMMUDevice looks better to accomodate both iommufd and VFIO backend.


host reserved memory regions is not something we can easiliy describe at vIOMMU level. The info are fetched from host and propagated to the vIOMMU

I think nested stage setup would also need this PCIIOMMUOps trick to setup stage 1 info. So the introduction of the so called HostIOMMUDevice object has a broader scope than a gaw option or ecap option set I think.

If you don't like this approach, please can you elaborate on the "vfio-iommu device" proposal above so that we can explore it.

Thank you in advance

Eric
Michael S. Tsirkin March 28, 2024, 7:20 a.m. UTC | #8
On Mon, Mar 18, 2024 at 02:20:50PM +0100, Eric Auger wrote:
> Hi Michael,
> 
> On 3/13/24 12:17, Michael S. Tsirkin wrote:
> > On Wed, Mar 13, 2024 at 07:54:11AM +0000, Duan, Zhenzhong wrote:
> >>
> >>> -----Original Message-----
> >>> From: Michael S. Tsirkin <mst@redhat.com>
> >>> Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check and
> >>> sync host IOMMU cap/ecap
> >>>
> >>> On Wed, Mar 13, 2024 at 02:52:39AM +0000, Duan, Zhenzhong wrote:
> >>>> Hi Michael,
> >>>>
> >>>>> -----Original Message-----
> >>>>> From: Michael S. Tsirkin <mst@redhat.com>
> >>>>> Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check and
> >>>>> sync host IOMMU cap/ecap
> >>>>>
> >>>>> On Wed, Feb 28, 2024 at 05:44:29PM +0800, Zhenzhong Duan wrote:
> >>>>>> From: Yi Liu <yi.l.liu@intel.com>
> >>>>>>
> >>>>>> Add a framework to check and synchronize host IOMMU cap/ecap with
> >>>>>> vIOMMU cap/ecap.
> >>>>>>
> >>>>>> The sequence will be:
> >>>>>>
> >>>>>> vtd_cap_init() initializes iommu->cap/ecap.
> >>>>>> vtd_check_hdev() update iommu->cap/ecap based on host cap/ecap.
> >>>>>> iommu->cap_frozen set when machine create done, iommu->cap/ecap
> >>>>> become readonly.
> >>>>>> Implementation details for different backends will be in following
> >>> patches.
> >>>>>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> >>>>>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> >>>>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> >>>>>> ---
> >>>>>>  include/hw/i386/intel_iommu.h |  1 +
> >>>>>>  hw/i386/intel_iommu.c         | 50
> >>>>> ++++++++++++++++++++++++++++++++++-
> >>>>>>  2 files changed, 50 insertions(+), 1 deletion(-)
> >>>>>>
> >>>>>> diff --git a/include/hw/i386/intel_iommu.h
> >>>>> b/include/hw/i386/intel_iommu.h
> >>>>>> index bbc7b96add..c71a133820 100644
> >>>>>> --- a/include/hw/i386/intel_iommu.h
> >>>>>> +++ b/include/hw/i386/intel_iommu.h
> >>>>>> @@ -283,6 +283,7 @@ struct IntelIOMMUState {
> >>>>>>
> >>>>>>      uint64_t cap;                   /* The value of capability reg */
> >>>>>>      uint64_t ecap;                  /* The value of extended capability reg */
> >>>>>> +    bool cap_frozen;                /* cap/ecap become read-only after
> >>> frozen */
> >>>>>>      uint32_t context_cache_gen;     /* Should be in [1,MAX] */
> >>>>>>      GHashTable *iotlb;              /* IOTLB */
> >>>>>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> >>>>>> index ffa1ad6429..a9f9dfd6a7 100644
> >>>>>> --- a/hw/i386/intel_iommu.c
> >>>>>> +++ b/hw/i386/intel_iommu.c
> >>>>>> @@ -35,6 +35,8 @@
> >>>>>>  #include "sysemu/kvm.h"
> >>>>>>  #include "sysemu/dma.h"
> >>>>>>  #include "sysemu/sysemu.h"
> >>>>>> +#include "hw/vfio/vfio-common.h"
> >>>>>> +#include "sysemu/iommufd.h"
> >>>>>>  #include "hw/i386/apic_internal.h"
> >>>>>>  #include "kvm/kvm_i386.h"
> >>>>>>  #include "migration/vmstate.h"
> >>>>>> @@ -3819,6 +3821,38 @@ VTDAddressSpace
> >>>>> *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus,
> >>>>>>      return vtd_dev_as;
> >>>>>>  }
> >>>>>>
> >>>>>> +static int vtd_check_legacy_hdev(IntelIOMMUState *s,
> >>>>>> +                                 IOMMULegacyDevice *ldev,
> >>>>>> +                                 Error **errp)
> >>>>>> +{
> >>>>>> +    return 0;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
> >>>>>> +                                  IOMMUFDDevice *idev,
> >>>>>> +                                  Error **errp)
> >>>>>> +{
> >>>>>> +    return 0;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static int vtd_check_hdev(IntelIOMMUState *s,
> >>> VTDHostIOMMUDevice
> >>>>> *vtd_hdev,
> >>>>>> +                          Error **errp)
> >>>>>> +{
> >>>>>> +    HostIOMMUDevice *base_dev = vtd_hdev->dev;
> >>>>>> +    IOMMUFDDevice *idev;
> >>>>>> +
> >>>>>> +    if (base_dev->type == HID_LEGACY) {
> >>>>>> +        IOMMULegacyDevice *ldev = container_of(base_dev,
> >>>>>> +                                               IOMMULegacyDevice, base);
> >>>>>> +
> >>>>>> +        return vtd_check_legacy_hdev(s, ldev, errp);
> >>>>>> +    }
> >>>>>> +
> >>>>>> +    idev = container_of(base_dev, IOMMUFDDevice, base);
> >>>>>> +
> >>>>>> +    return vtd_check_iommufd_hdev(s, idev, errp);
> >>>>>> +}
> >>>>>> +
> >>>>>>  static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int
> >>>>> devfn,
> >>>>>>                                      HostIOMMUDevice *base_dev, Error **errp)
> >>>>>>  {
> >>>>>> @@ -3829,6 +3863,7 @@ static int
> >>> vtd_dev_set_iommu_device(PCIBus
> >>>>> *bus, void *opaque, int devfn,
> >>>>>>          .devfn = devfn,
> >>>>>>      };
> >>>>>>      struct vtd_as_key *new_key;
> >>>>>> +    int ret;
> >>>>>>
> >>>>>>      assert(base_dev);
> >>>>>>
> >>>>>> @@ -3848,6 +3883,13 @@ static int
> >>> vtd_dev_set_iommu_device(PCIBus
> >>>>> *bus, void *opaque, int devfn,
> >>>>>>      vtd_hdev->iommu_state = s;
> >>>>>>      vtd_hdev->dev = base_dev;
> >>>>>>
> >>>>>> +    ret = vtd_check_hdev(s, vtd_hdev, errp);
> >>>>>> +    if (ret) {
> >>>>>> +        g_free(vtd_hdev);
> >>>>>> +        vtd_iommu_unlock(s);
> >>>>>> +        return ret;
> >>>>>> +    }
> >>>>>> +
> >>>>>>      new_key = g_malloc(sizeof(*new_key));
> >>>>>>      new_key->bus = bus;
> >>>>>>      new_key->devfn = devfn;
> >>>>>
> >>>>> Okay. So when VFIO device is created, it will call
> >>> vtd_dev_set_iommu_device
> >>>>> and that in turn will update caps.
> >>>>>
> >>>>>
> >>>>>
> >>>>>
> >>>>>> @@ -4083,7 +4125,9 @@ static void vtd_init(IntelIOMMUState *s)
> >>>>>>      s->iq_dw = false;
> >>>>>>      s->next_frcd_reg = 0;
> >>>>>>
> >>>>>> -    vtd_cap_init(s);
> >>>>>> +    if (!s->cap_frozen) {
> >>>>>> +        vtd_cap_init(s);
> >>>>>> +    }
> >>>>>>
> >>>>> If it's fronzen it's because VFIO was added after machine done.
> >>>>> And then what? I think caps are just wrong?
> >>>> Not quite get your question on caps being wrong. But try to explains:
> >>>>
> >>>> When a hot plugged vfio device's host iommu cap isn't compatible with
> >>>> vIOMMU's, hotplug should fail. Currently there is no check for this and
> >>>> allow hotplug to succeed, but then some issue will reveal later,
> >>>> e.g., vIOMMU's MGAW > host IOMMU's MGAW, guest can setup iova
> >>>> mapping beyond host supported iova range, then DMA will fail.
> >>>>
> >>>> In fact, before this series, cap is not impacted by VFIO, so it's same effect of
> >>>> frozen after machine done.
> >>>>
> >>>>>
> >>>>> I think the way to approach this is just by specifying this
> >>>>> as an option on command line.
> >>>> Do you mean add a cap_frozen property to intel_iommu?
> >>>> Vtd_init() is called in realize() and system reset(), so I utilize realize() to init
> >>> cap
> >>>> and froze cap before system reset(). If cap_frozen is an option, when it's
> >>> set to
> >>>> false, cap could be updated every system reset and it's not a fix value any
> >>> more.
> >>>> This may break migration.
> >>> No, I mean either
> >>> 1. add some kind of vfio-iommu device that is not exposed to guest
> >>>   but is not hot pluggable
> >> Not quite get, what will such vfio-iommu device be used for if not exposed to guest.
> > It will update the IOMMU.
> > And do so without need for tricky callbacks.
> >
> 
> At the moment the only way VFIO can pass info to vIOMMU is through the
> IOMMU MR API (IOMMUMemoryRegionClass).
> Unfortunately this API is not fully adapted to new use cases because it
> relies on the IOMMU MR to be enabled which causes the VFIO
> 
> vfio_listener_region_add() to be called and call the relevant IOMMU MR callback. Before the IOMMU MR enablement there is no way to convey info from VFIO to the vIOMMU. That why, for several years and since the beginning of discussions related to nested IOMMU, Peter Xu, Yi Liu, myself headed towards the usage of PCIIOMMUOps instead.
> 
> 
> You will find in 
> [RFC 0/7] VIRTIO-IOMMU/VFIO: Fix host iommu geometry handling for hotplugged devices
> https://lore.kernel.org/all/20240117080414.316890-1-eric.auger@redhat.com/
> yet another example of such kind of PCIIOMMUOps. In that series instead of relying on a HostIOMMUDevice object as proposed by Zhenzhong it uses a direct callback but it is still the same principle and the HostIOMMUDevice looks better to accomodate both iommufd and VFIO backend.
> 
> 
> host reserved memory regions is not something we can easiliy describe at vIOMMU level. The info are fetched from host and propagated to the vIOMMU
> 
> I think nested stage setup would also need this PCIIOMMUOps trick to setup stage 1 info. So the introduction of the so called HostIOMMUDevice object has a broader scope than a gaw option or ecap option set I think.
> 
> If you don't like this approach, please can you elaborate on the "vfio-iommu device" proposal above so that we can explore it.
> 
> Thank you in advance
> 
> Eric


Hi Eric,
Sorry about the delay in answering - was on vacation.

My concern is with user interface not with the internal API used.
The concern is simple - assymetry and complex rules in handling devices.
E.g. a non hotplugged vfio device adjusts the vIOMMU, then you can
remove it and hotplug another one and it works, but if you start
without vfio then hotplug then it might fail because it's too late
to adjust the vIOMMU.

And what I am saying, is that we either want something on the qemu command line
that will tell the vIOMMU "please use info from the host" or
have management take info from the host and supply that to the vIOMMU.
The former seems more user friendly, for sure. The disadvantage of it
is that one can't e.g. take the least common denominator between two
hosts and make things migrateable in this way.
If we limit our ambition to VFIO that might be acceptable, after all
VFIO already assumes very similar hardware on src and dst of migration.
The later is what we did for aw-bits.

I see lots of acceptable options, but yes it seems nice to have
something single that will work for all IOMMUs and also maybe live under
VFIO? Thus I came up with the idea of a stub device living under vfio
who's job is just to be present on command line and adjust the guest
machine (ATM the viommu but we might see other uses too) to match host.
Duan, Zhenzhong March 29, 2024, 3:22 a.m. UTC | #9
Hi Michael,

>-----Original Message-----
>From: Michael S. Tsirkin <mst@redhat.com>
>Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check and
>sync host IOMMU cap/ecap
>
>On Mon, Mar 18, 2024 at 02:20:50PM +0100, Eric Auger wrote:
>> Hi Michael,
>>
>> On 3/13/24 12:17, Michael S. Tsirkin wrote:
>> > On Wed, Mar 13, 2024 at 07:54:11AM +0000, Duan, Zhenzhong wrote:
>> >>
>> >>> -----Original Message-----
>> >>> From: Michael S. Tsirkin <mst@redhat.com>
>> >>> Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check
>and
>> >>> sync host IOMMU cap/ecap
>> >>>
>> >>> On Wed, Mar 13, 2024 at 02:52:39AM +0000, Duan, Zhenzhong wrote:
>> >>>> Hi Michael,
>> >>>>
>> >>>>> -----Original Message-----
>> >>>>> From: Michael S. Tsirkin <mst@redhat.com>
>> >>>>> Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to
>check and
>> >>>>> sync host IOMMU cap/ecap
>> >>>>>
>> >>>>> On Wed, Feb 28, 2024 at 05:44:29PM +0800, Zhenzhong Duan
>wrote:
>> >>>>>> From: Yi Liu <yi.l.liu@intel.com>
>> >>>>>>
>> >>>>>> Add a framework to check and synchronize host IOMMU cap/ecap
>with
>> >>>>>> vIOMMU cap/ecap.
>> >>>>>>
>> >>>>>> The sequence will be:
>> >>>>>>
>> >>>>>> vtd_cap_init() initializes iommu->cap/ecap.
>> >>>>>> vtd_check_hdev() update iommu->cap/ecap based on host
>cap/ecap.
>> >>>>>> iommu->cap_frozen set when machine create done, iommu-
>>cap/ecap
>> >>>>> become readonly.
>> >>>>>> Implementation details for different backends will be in following
>> >>> patches.
>> >>>>>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> >>>>>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
>> >>>>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> >>>>>> ---
>> >>>>>>  include/hw/i386/intel_iommu.h |  1 +
>> >>>>>>  hw/i386/intel_iommu.c         | 50
>> >>>>> ++++++++++++++++++++++++++++++++++-
>> >>>>>>  2 files changed, 50 insertions(+), 1 deletion(-)
>> >>>>>>
>> >>>>>> diff --git a/include/hw/i386/intel_iommu.h
>> >>>>> b/include/hw/i386/intel_iommu.h
>> >>>>>> index bbc7b96add..c71a133820 100644
>> >>>>>> --- a/include/hw/i386/intel_iommu.h
>> >>>>>> +++ b/include/hw/i386/intel_iommu.h
>> >>>>>> @@ -283,6 +283,7 @@ struct IntelIOMMUState {
>> >>>>>>
>> >>>>>>      uint64_t cap;                   /* The value of capability reg */
>> >>>>>>      uint64_t ecap;                  /* The value of extended capability reg
>*/
>> >>>>>> +    bool cap_frozen;                /* cap/ecap become read-only after
>> >>> frozen */
>> >>>>>>      uint32_t context_cache_gen;     /* Should be in [1,MAX] */
>> >>>>>>      GHashTable *iotlb;              /* IOTLB */
>> >>>>>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> >>>>>> index ffa1ad6429..a9f9dfd6a7 100644
>> >>>>>> --- a/hw/i386/intel_iommu.c
>> >>>>>> +++ b/hw/i386/intel_iommu.c
>> >>>>>> @@ -35,6 +35,8 @@
>> >>>>>>  #include "sysemu/kvm.h"
>> >>>>>>  #include "sysemu/dma.h"
>> >>>>>>  #include "sysemu/sysemu.h"
>> >>>>>> +#include "hw/vfio/vfio-common.h"
>> >>>>>> +#include "sysemu/iommufd.h"
>> >>>>>>  #include "hw/i386/apic_internal.h"
>> >>>>>>  #include "kvm/kvm_i386.h"
>> >>>>>>  #include "migration/vmstate.h"
>> >>>>>> @@ -3819,6 +3821,38 @@ VTDAddressSpace
>> >>>>> *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus,
>> >>>>>>      return vtd_dev_as;
>> >>>>>>  }
>> >>>>>>
>> >>>>>> +static int vtd_check_legacy_hdev(IntelIOMMUState *s,
>> >>>>>> +                                 IOMMULegacyDevice *ldev,
>> >>>>>> +                                 Error **errp)
>> >>>>>> +{
>> >>>>>> +    return 0;
>> >>>>>> +}
>> >>>>>> +
>> >>>>>> +static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
>> >>>>>> +                                  IOMMUFDDevice *idev,
>> >>>>>> +                                  Error **errp)
>> >>>>>> +{
>> >>>>>> +    return 0;
>> >>>>>> +}
>> >>>>>> +
>> >>>>>> +static int vtd_check_hdev(IntelIOMMUState *s,
>> >>> VTDHostIOMMUDevice
>> >>>>> *vtd_hdev,
>> >>>>>> +                          Error **errp)
>> >>>>>> +{
>> >>>>>> +    HostIOMMUDevice *base_dev = vtd_hdev->dev;
>> >>>>>> +    IOMMUFDDevice *idev;
>> >>>>>> +
>> >>>>>> +    if (base_dev->type == HID_LEGACY) {
>> >>>>>> +        IOMMULegacyDevice *ldev = container_of(base_dev,
>> >>>>>> +                                               IOMMULegacyDevice, base);
>> >>>>>> +
>> >>>>>> +        return vtd_check_legacy_hdev(s, ldev, errp);
>> >>>>>> +    }
>> >>>>>> +
>> >>>>>> +    idev = container_of(base_dev, IOMMUFDDevice, base);
>> >>>>>> +
>> >>>>>> +    return vtd_check_iommufd_hdev(s, idev, errp);
>> >>>>>> +}
>> >>>>>> +
>> >>>>>>  static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque,
>int
>> >>>>> devfn,
>> >>>>>>                                      HostIOMMUDevice *base_dev, Error **errp)
>> >>>>>>  {
>> >>>>>> @@ -3829,6 +3863,7 @@ static int
>> >>> vtd_dev_set_iommu_device(PCIBus
>> >>>>> *bus, void *opaque, int devfn,
>> >>>>>>          .devfn = devfn,
>> >>>>>>      };
>> >>>>>>      struct vtd_as_key *new_key;
>> >>>>>> +    int ret;
>> >>>>>>
>> >>>>>>      assert(base_dev);
>> >>>>>>
>> >>>>>> @@ -3848,6 +3883,13 @@ static int
>> >>> vtd_dev_set_iommu_device(PCIBus
>> >>>>> *bus, void *opaque, int devfn,
>> >>>>>>      vtd_hdev->iommu_state = s;
>> >>>>>>      vtd_hdev->dev = base_dev;
>> >>>>>>
>> >>>>>> +    ret = vtd_check_hdev(s, vtd_hdev, errp);
>> >>>>>> +    if (ret) {
>> >>>>>> +        g_free(vtd_hdev);
>> >>>>>> +        vtd_iommu_unlock(s);
>> >>>>>> +        return ret;
>> >>>>>> +    }
>> >>>>>> +
>> >>>>>>      new_key = g_malloc(sizeof(*new_key));
>> >>>>>>      new_key->bus = bus;
>> >>>>>>      new_key->devfn = devfn;
>> >>>>>
>> >>>>> Okay. So when VFIO device is created, it will call
>> >>> vtd_dev_set_iommu_device
>> >>>>> and that in turn will update caps.
>> >>>>>
>> >>>>>
>> >>>>>
>> >>>>>
>> >>>>>> @@ -4083,7 +4125,9 @@ static void vtd_init(IntelIOMMUState
>*s)
>> >>>>>>      s->iq_dw = false;
>> >>>>>>      s->next_frcd_reg = 0;
>> >>>>>>
>> >>>>>> -    vtd_cap_init(s);
>> >>>>>> +    if (!s->cap_frozen) {
>> >>>>>> +        vtd_cap_init(s);
>> >>>>>> +    }
>> >>>>>>
>> >>>>> If it's fronzen it's because VFIO was added after machine done.
>> >>>>> And then what? I think caps are just wrong?
>> >>>> Not quite get your question on caps being wrong. But try to explains:
>> >>>>
>> >>>> When a hot plugged vfio device's host iommu cap isn't compatible
>with
>> >>>> vIOMMU's, hotplug should fail. Currently there is no check for this
>and
>> >>>> allow hotplug to succeed, but then some issue will reveal later,
>> >>>> e.g., vIOMMU's MGAW > host IOMMU's MGAW, guest can setup
>iova
>> >>>> mapping beyond host supported iova range, then DMA will fail.
>> >>>>
>> >>>> In fact, before this series, cap is not impacted by VFIO, so it's same
>effect of
>> >>>> frozen after machine done.
>> >>>>
>> >>>>>
>> >>>>> I think the way to approach this is just by specifying this
>> >>>>> as an option on command line.
>> >>>> Do you mean add a cap_frozen property to intel_iommu?
>> >>>> Vtd_init() is called in realize() and system reset(), so I utilize realize()
>to init
>> >>> cap
>> >>>> and froze cap before system reset(). If cap_frozen is an option, when
>it's
>> >>> set to
>> >>>> false, cap could be updated every system reset and it's not a fix value
>any
>> >>> more.
>> >>>> This may break migration.
>> >>> No, I mean either
>> >>> 1. add some kind of vfio-iommu device that is not exposed to guest
>> >>>   but is not hot pluggable
>> >> Not quite get, what will such vfio-iommu device be used for if not
>exposed to guest.
>> > It will update the IOMMU.
>> > And do so without need for tricky callbacks.
>> >
>>
>> At the moment the only way VFIO can pass info to vIOMMU is through the
>> IOMMU MR API (IOMMUMemoryRegionClass).
>> Unfortunately this API is not fully adapted to new use cases because it
>> relies on the IOMMU MR to be enabled which causes the VFIO
>>
>> vfio_listener_region_add() to be called and call the relevant IOMMU MR
>callback. Before the IOMMU MR enablement there is no way to convey info
>from VFIO to the vIOMMU. That why, for several years and since the
>beginning of discussions related to nested IOMMU, Peter Xu, Yi Liu, myself
>headed towards the usage of PCIIOMMUOps instead.
>>
>>
>> You will find in
>> [RFC 0/7] VIRTIO-IOMMU/VFIO: Fix host iommu geometry handling for
>hotplugged devices
>> https://lore.kernel.org/all/20240117080414.316890-1-
>eric.auger@redhat.com/
>> yet another example of such kind of PCIIOMMUOps. In that series instead
>of relying on a HostIOMMUDevice object as proposed by Zhenzhong it uses a
>direct callback but it is still the same principle and the HostIOMMUDevice
>looks better to accomodate both iommufd and VFIO backend.
>>
>>
>> host reserved memory regions is not something we can easiliy describe at
>vIOMMU level. The info are fetched from host and propagated to the
>vIOMMU
>>
>> I think nested stage setup would also need this PCIIOMMUOps trick to
>setup stage 1 info. So the introduction of the so called HostIOMMUDevice
>object has a broader scope than a gaw option or ecap option set I think.
>>
>> If you don't like this approach, please can you elaborate on the "vfio-
>iommu device" proposal above so that we can explore it.
>>
>> Thank you in advance
>>
>> Eric
>
>
>Hi Eric,
>Sorry about the delay in answering - was on vacation.
>
>My concern is with user interface not with the internal API used.
>The concern is simple - assymetry and complex rules in handling devices.
>E.g. a non hotplugged vfio device adjusts the vIOMMU, then you can
>remove it and hotplug another one and it works, but if you start
>without vfio then hotplug then it might fail because it's too late
>to adjust the vIOMMU.

Just clarify the adjustment from vfio is only zeroing the cap/ecap's 1 bits,
never set an original 0 bits. So if hotplug fails, then coldplug should
also fail for the same device.

>
>And what I am saying, is that we either want something on the qemu
>command line
>that will tell the vIOMMU "please use info from the host" or
>have management take info from the host and supply that to the vIOMMU.
>The former seems more user friendly, for sure. The disadvantage of it
>is that one can't e.g. take the least common denominator between two
>hosts and make things migrateable in this way.
>If we limit our ambition to VFIO that might be acceptable, after all
>VFIO already assumes very similar hardware on src and dst of migration.
>The later is what we did for aw-bits.

We can get host IOMMU's hw cap/ecap by reading below file:

/sys/devices/virtual/iommu/dmar[*]/intel-iommu/[cap|ecap]

But it's not accurate as kernel command line can limit the support from
software aspect, e.g., intel_iommu=off/sm_off.

So I'd like to follow up the 2nd way, add two options which defaults 0,

+    DEFINE_PROP_UINT64("host_cap", IntelIOMMUState, host_cap, 0),
+    DEFINE_PROP_UINT64("host_ecap", IntelIOMMUState, host_ecap, 0),

When any is set by management, it overrides vIOMMU default value,
Or else use default.

With cap/ecap set by management, we still need to have the cap/ecap
check logic between host and vIOMMU, but no update, no frozen.

The suggestion for management is to set a value based on
/sys/devices/virtual/iommu/dmar[*]/intel-iommu/[cap|ecap]
But do not assign a random value.

Let me know if I misunderstand anything
diff mbox series

Patch

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index bbc7b96add..c71a133820 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -283,6 +283,7 @@  struct IntelIOMMUState {
 
     uint64_t cap;                   /* The value of capability reg */
     uint64_t ecap;                  /* The value of extended capability reg */
+    bool cap_frozen;                /* cap/ecap become read-only after frozen */
 
     uint32_t context_cache_gen;     /* Should be in [1,MAX] */
     GHashTable *iotlb;              /* IOTLB */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index ffa1ad6429..a9f9dfd6a7 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -35,6 +35,8 @@ 
 #include "sysemu/kvm.h"
 #include "sysemu/dma.h"
 #include "sysemu/sysemu.h"
+#include "hw/vfio/vfio-common.h"
+#include "sysemu/iommufd.h"
 #include "hw/i386/apic_internal.h"
 #include "kvm/kvm_i386.h"
 #include "migration/vmstate.h"
@@ -3819,6 +3821,38 @@  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus,
     return vtd_dev_as;
 }
 
+static int vtd_check_legacy_hdev(IntelIOMMUState *s,
+                                 IOMMULegacyDevice *ldev,
+                                 Error **errp)
+{
+    return 0;
+}
+
+static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
+                                  IOMMUFDDevice *idev,
+                                  Error **errp)
+{
+    return 0;
+}
+
+static int vtd_check_hdev(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hdev,
+                          Error **errp)
+{
+    HostIOMMUDevice *base_dev = vtd_hdev->dev;
+    IOMMUFDDevice *idev;
+
+    if (base_dev->type == HID_LEGACY) {
+        IOMMULegacyDevice *ldev = container_of(base_dev,
+                                               IOMMULegacyDevice, base);
+
+        return vtd_check_legacy_hdev(s, ldev, errp);
+    }
+
+    idev = container_of(base_dev, IOMMUFDDevice, base);
+
+    return vtd_check_iommufd_hdev(s, idev, errp);
+}
+
 static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
                                     HostIOMMUDevice *base_dev, Error **errp)
 {
@@ -3829,6 +3863,7 @@  static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
         .devfn = devfn,
     };
     struct vtd_as_key *new_key;
+    int ret;
 
     assert(base_dev);
 
@@ -3848,6 +3883,13 @@  static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
     vtd_hdev->iommu_state = s;
     vtd_hdev->dev = base_dev;
 
+    ret = vtd_check_hdev(s, vtd_hdev, errp);
+    if (ret) {
+        g_free(vtd_hdev);
+        vtd_iommu_unlock(s);
+        return ret;
+    }
+
     new_key = g_malloc(sizeof(*new_key));
     new_key->bus = bus;
     new_key->devfn = devfn;
@@ -4083,7 +4125,9 @@  static void vtd_init(IntelIOMMUState *s)
     s->iq_dw = false;
     s->next_frcd_reg = 0;
 
-    vtd_cap_init(s);
+    if (!s->cap_frozen) {
+        vtd_cap_init(s);
+    }
 
     /*
      * Rsvd field masks for spte
@@ -4254,6 +4298,10 @@  static int vtd_machine_done_notify_one(Object *child, void *unused)
 
 static void vtd_machine_done_hook(Notifier *notifier, void *unused)
 {
+    IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default());
+
+    iommu->cap_frozen = true;
+
     object_child_foreach_recursive(object_get_root(),
                                    vtd_machine_done_notify_one, NULL);
 }