diff mbox

[RFC,v3,13/14] intel_iommu: allow dynamic switch of IOMMU region

Message ID 1484276800-26814-14-git-send-email-peterx@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Peter Xu Jan. 13, 2017, 3:06 a.m. UTC
This is preparation work to finally enabled dynamic switching ON/OFF for
VT-d protection. The old VT-d codes is using static IOMMU address space,
and that won't satisfy vfio-pci device listeners.

Let me explain.

vfio-pci devices depend on the memory region listener and IOMMU replay
mechanism to make sure the device mapping is coherent with the guest
even if there are domain switches. And there are two kinds of domain
switches:

  (1) switch from domain A -> B
  (2) switch from domain A -> no domain (e.g., turn DMAR off)

Case (1) is handled by the context entry invalidation handling by the
VT-d replay logic. What the replay function should do here is to replay
the existing page mappings in domain B.

However for case (2), we don't want to replay any domain mappings - we
just need the default GPA->HPA mappings (the address_space_memory
mapping). And this patch helps on case (2) to build up the mapping
automatically by leveraging the vfio-pci memory listeners.

Another important thing that this patch does is to seperate
IR (Interrupt Remapping) from DMAR (DMA Remapping). IR region should not
depend on the DMAR region (like before this patch). It should be a
standalone region, and it should be able to be activated without
DMAR (which is a common behavior of Linux kernel - by default it enables
IR while disabled DMAR).

Signed-off-by: Peter Xu <peterx@redhat.com>
---
v3:
- fix another trivial style issue patchew reported but I missed in v2

v2:
- fix issues reported by patchew
- switch domain by enable/disable memory regions [David]
- provide vtd_switch_address_space{_all}()
- provide a better comment on the memory regions

test done: with intel_iommu device, boot vm with/without
"intel_iommu=on" parameter.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 hw/i386/intel_iommu.c         | 78 ++++++++++++++++++++++++++++++++++++++++---
 hw/i386/trace-events          |  2 +-
 include/hw/i386/intel_iommu.h |  2 ++
 3 files changed, 77 insertions(+), 5 deletions(-)

Comments

Jason Wang Jan. 16, 2017, 6:20 a.m. UTC | #1
On 2017年01月13日 11:06, Peter Xu wrote:
> This is preparation work to finally enabled dynamic switching ON/OFF for
> VT-d protection. The old VT-d codes is using static IOMMU address space,
> and that won't satisfy vfio-pci device listeners.
>
> Let me explain.
>
> vfio-pci devices depend on the memory region listener and IOMMU replay
> mechanism to make sure the device mapping is coherent with the guest
> even if there are domain switches. And there are two kinds of domain
> switches:
>
>    (1) switch from domain A -> B
>    (2) switch from domain A -> no domain (e.g., turn DMAR off)
>
> Case (1) is handled by the context entry invalidation handling by the
> VT-d replay logic. What the replay function should do here is to replay
> the existing page mappings in domain B.
>
> However for case (2), we don't want to replay any domain mappings - we
> just need the default GPA->HPA mappings (the address_space_memory
> mapping). And this patch helps on case (2) to build up the mapping
> automatically by leveraging the vfio-pci memory listeners.
>
> Another important thing that this patch does is to seperate
> IR (Interrupt Remapping) from DMAR (DMA Remapping). IR region should not
> depend on the DMAR region (like before this patch). It should be a
> standalone region, and it should be able to be activated without
> DMAR (which is a common behavior of Linux kernel - by default it enables
> IR while disabled DMAR).
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
> v3:
> - fix another trivial style issue patchew reported but I missed in v2
>
> v2:
> - fix issues reported by patchew
> - switch domain by enable/disable memory regions [David]
> - provide vtd_switch_address_space{_all}()
> - provide a better comment on the memory regions
>
> test done: with intel_iommu device, boot vm with/without
> "intel_iommu=on" parameter.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>   hw/i386/intel_iommu.c         | 78 ++++++++++++++++++++++++++++++++++++++++---
>   hw/i386/trace-events          |  2 +-
>   include/hw/i386/intel_iommu.h |  2 ++
>   3 files changed, 77 insertions(+), 5 deletions(-)
>
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index fd75112..2596f11 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -1343,9 +1343,49 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
>       vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
>   }
>   
> +static void vtd_switch_address_space(VTDAddressSpace *as, bool iommu_enabled)

Looks like you can check s->dmar_enabled here?

> +{
> +    assert(as);
> +
> +    trace_vtd_switch_address_space(pci_bus_num(as->bus),
> +                                   VTD_PCI_SLOT(as->devfn),
> +                                   VTD_PCI_FUNC(as->devfn),
> +                                   iommu_enabled);
> +
> +    /* Turn off first then on the other */
> +    if (iommu_enabled) {
> +        memory_region_set_enabled(&as->sys_alias, false);
> +        memory_region_set_enabled(&as->iommu, true);
> +    } else {
> +        memory_region_set_enabled(&as->iommu, false);
> +        memory_region_set_enabled(&as->sys_alias, true);
> +    }
> +}
> +
> +static void vtd_switch_address_space_all(IntelIOMMUState *s, bool enabled)
> +{
> +    GHashTableIter iter;
> +    VTDBus *vtd_bus;
> +    int i;
> +
> +    g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
> +    while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
> +        for (i = 0; i < X86_IOMMU_PCI_DEVFN_MAX; i++) {
> +            if (!vtd_bus->dev_as[i]) {
> +                continue;
> +            }
> +            vtd_switch_address_space(vtd_bus->dev_as[i], enabled);
> +        }
> +    }
> +}
> +
>   /* Handle Translation Enable/Disable */
>   static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
>   {
> +    if (s->dmar_enabled == en) {
> +        return;
> +    }
> +
>       VTD_DPRINTF(CSR, "Translation Enable %s", (en ? "on" : "off"));
>   
>       if (en) {
> @@ -1360,6 +1400,8 @@ static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
>           /* Ok - report back to driver */
>           vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0);
>       }
> +
> +    vtd_switch_address_space_all(s, en);
>   }
>   
>   /* Handle Interrupt Remap Enable/Disable */
> @@ -2586,15 +2628,43 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>           vtd_dev_as->devfn = (uint8_t)devfn;
>           vtd_dev_as->iommu_state = s;
>           vtd_dev_as->context_cache_entry.context_cache_gen = 0;
> +
> +        /*
> +         * Memory region relationships looks like (Address range shows
> +         * only lower 32 bits to make it short in length...):
> +         *
> +         * |-----------------+-------------------+----------|
> +         * | Name            | Address range     | Priority |
> +         * |-----------------+-------------------+----------+
> +         * | vtd_root        | 00000000-ffffffff |        0 |
> +         * |  intel_iommu    | 00000000-ffffffff |        1 |
> +         * |  vtd_sys_alias  | 00000000-ffffffff |        1 |
> +         * |  intel_iommu_ir | fee00000-feefffff |       64 |
> +         * |-----------------+-------------------+----------|
> +         *
> +         * We enable/disable DMAR by switching enablement for
> +         * vtd_sys_alias and intel_iommu regions. IR region is always
> +         * enabled.
> +         */
>           memory_region_init_iommu(&vtd_dev_as->iommu, OBJECT(s),
>                                    &s->iommu_ops, "intel_iommu", UINT64_MAX);

Then it's better to name this as "intel_iommu_dmar"?

> +        memory_region_init_alias(&vtd_dev_as->sys_alias, OBJECT(s),
> +                                 "vtd_sys_alias", get_system_memory(),
> +                                 0, memory_region_size(get_system_memory()));
>           memory_region_init_io(&vtd_dev_as->iommu_ir, OBJECT(s),
>                                 &vtd_mem_ir_ops, s, "intel_iommu_ir",
>                                 VTD_INTERRUPT_ADDR_SIZE);
> -        memory_region_add_subregion(&vtd_dev_as->iommu, VTD_INTERRUPT_ADDR_FIRST,
> -                                    &vtd_dev_as->iommu_ir);
> -        address_space_init(&vtd_dev_as->as,
> -                           &vtd_dev_as->iommu, name);
> +        memory_region_init(&vtd_dev_as->root, OBJECT(s),
> +                           "vtd_root", UINT64_MAX);
> +        memory_region_add_subregion_overlap(&vtd_dev_as->root,
> +                                            VTD_INTERRUPT_ADDR_FIRST,
> +                                            &vtd_dev_as->iommu_ir, 64);
> +        address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, name);
> +        memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
> +                                            &vtd_dev_as->sys_alias, 1);
> +        memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
> +                                            &vtd_dev_as->iommu, 1);
> +        vtd_switch_address_space(vtd_dev_as, s->dmar_enabled);
>       }
>       return vtd_dev_as;
>   }
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index 92d210d..beaef61 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -11,7 +11,6 @@ xen_pv_mmio_write(uint64_t addr) "WARNING: write to Xen PV Device MMIO space (ad
>   x86_iommu_iec_notify(bool global, uint32_t index, uint32_t mask) "Notify IEC invalidation: global=%d index=%" PRIu32 " mask=%" PRIu32
>   
>   # hw/i386/intel_iommu.c
> -vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
>   vtd_inv_desc(const char *type, uint64_t hi, uint64_t lo) "invalidate desc type %s high 0x%"PRIx64" low 0x%"PRIx64
>   vtd_inv_desc_cc_domain(uint16_t domain) "context invalidate domain 0x%"PRIx16
>   vtd_inv_desc_cc_global(void) "context invalidate globally"
> @@ -37,6 +36,7 @@ vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, in
>   vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
>   vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
>   vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
> +vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
>   
>   # hw/i386/amd_iommu.c
>   amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" +  offset 0x%"PRIx32
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index 749eef9..9c3f6c0 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -83,6 +83,8 @@ struct VTDAddressSpace {
>       uint8_t devfn;
>       AddressSpace as;
>       MemoryRegion iommu;
> +    MemoryRegion root;
> +    MemoryRegion sys_alias;
>       MemoryRegion iommu_ir;      /* Interrupt region: 0xfeeXXXXX */
>       IntelIOMMUState *iommu_state;
>       VTDContextCacheEntry context_cache_entry;
Peter Xu Jan. 16, 2017, 7:50 a.m. UTC | #2
On Mon, Jan 16, 2017 at 02:20:31PM +0800, Jason Wang wrote:

[...]

> >diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> >index fd75112..2596f11 100644
> >--- a/hw/i386/intel_iommu.c
> >+++ b/hw/i386/intel_iommu.c
> >@@ -1343,9 +1343,49 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
> >      vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
> >  }
> >+static void vtd_switch_address_space(VTDAddressSpace *as, bool iommu_enabled)
> 
> Looks like you can check s->dmar_enabled here?

Yes, we need to check old state in case we don't need a switch at all.
Actually I checked it...

> 
> >+{
> >+    assert(as);
> >+
> >+    trace_vtd_switch_address_space(pci_bus_num(as->bus),
> >+                                   VTD_PCI_SLOT(as->devfn),
> >+                                   VTD_PCI_FUNC(as->devfn),
> >+                                   iommu_enabled);
> >+
> >+    /* Turn off first then on the other */
> >+    if (iommu_enabled) {
> >+        memory_region_set_enabled(&as->sys_alias, false);
> >+        memory_region_set_enabled(&as->iommu, true);
> >+    } else {
> >+        memory_region_set_enabled(&as->iommu, false);
> >+        memory_region_set_enabled(&as->sys_alias, true);
> >+    }
> >+}

[...]

> >  /* Handle Translation Enable/Disable */
> >  static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
> >  {
> >+    if (s->dmar_enabled == en) {
> >+        return;
> >+    }
> >+

... here :) ... and ...

[...]

> >+        memory_region_init_alias(&vtd_dev_as->sys_alias, OBJECT(s),
> >+                                 "vtd_sys_alias", get_system_memory(),
> >+                                 0, memory_region_size(get_system_memory()));
> >          memory_region_init_io(&vtd_dev_as->iommu_ir, OBJECT(s),
> >                                &vtd_mem_ir_ops, s, "intel_iommu_ir",
> >                                VTD_INTERRUPT_ADDR_SIZE);
> >-        memory_region_add_subregion(&vtd_dev_as->iommu, VTD_INTERRUPT_ADDR_FIRST,
> >-                                    &vtd_dev_as->iommu_ir);
> >-        address_space_init(&vtd_dev_as->as,
> >-                           &vtd_dev_as->iommu, name);
> >+        memory_region_init(&vtd_dev_as->root, OBJECT(s),
> >+                           "vtd_root", UINT64_MAX);
> >+        memory_region_add_subregion_overlap(&vtd_dev_as->root,
> >+                                            VTD_INTERRUPT_ADDR_FIRST,
> >+                                            &vtd_dev_as->iommu_ir, 64);
> >+        address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, name);
> >+        memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
> >+                                            &vtd_dev_as->sys_alias, 1);
> >+        memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
> >+                                            &vtd_dev_as->iommu, 1);
> >+        vtd_switch_address_space(vtd_dev_as, s->dmar_enabled);

... here I also used vtd_switch_address_space() to setup the init
state of the regions (in order to share the codes). So how about I
rename vtd_switch_address_space() into something like
vtd_setup_address_space(), to avoid misunderstanding?

Thanks,

-- peterx
Jason Wang Jan. 16, 2017, 8:01 a.m. UTC | #3
On 2017年01月16日 15:50, Peter Xu wrote:
> On Mon, Jan 16, 2017 at 02:20:31PM +0800, Jason Wang wrote:
>
> [...]
>
>>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>>> index fd75112..2596f11 100644
>>> --- a/hw/i386/intel_iommu.c
>>> +++ b/hw/i386/intel_iommu.c
>>> @@ -1343,9 +1343,49 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
>>>       vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
>>>   }
>>> +static void vtd_switch_address_space(VTDAddressSpace *as, bool iommu_enabled)
>> Looks like you can check s->dmar_enabled here?
> Yes, we need to check old state in case we don't need a switch at all.
> Actually I checked it...
>

I mean is there a chance that iommu_enabled( better name should be 
dmar_enabled) is not equal to s->dmar_enabled? Looks not.

vtd_handle_gcmd_te() did:

     ...
     if (en) {
         s->dmar_enabled = true;
         /* Ok - report back to driver */
         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES);
     } else {
         s->dmar_enabled = false;
     ...

You can vtd_switch_address_space_all(s, en) after this which will call 
this function. And another caller like you've pointed out has already 
call this through s->dmar_enabled. So en here is always s->dmar_enalbed?

Thanks
Peter Xu Jan. 16, 2017, 8:12 a.m. UTC | #4
On Mon, Jan 16, 2017 at 04:01:00PM +0800, Jason Wang wrote:
> 
> 
> On 2017年01月16日 15:50, Peter Xu wrote:
> >On Mon, Jan 16, 2017 at 02:20:31PM +0800, Jason Wang wrote:
> >
> >[...]
> >
> >>>diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> >>>index fd75112..2596f11 100644
> >>>--- a/hw/i386/intel_iommu.c
> >>>+++ b/hw/i386/intel_iommu.c
> >>>@@ -1343,9 +1343,49 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
> >>>      vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
> >>>  }
> >>>+static void vtd_switch_address_space(VTDAddressSpace *as, bool iommu_enabled)
> >>Looks like you can check s->dmar_enabled here?
> >Yes, we need to check old state in case we don't need a switch at all.
> >Actually I checked it...
> >
> 
> I mean is there a chance that iommu_enabled( better name should be
> dmar_enabled) is not equal to s->dmar_enabled? Looks not.
> 
> vtd_handle_gcmd_te() did:
> 
>     ...
>     if (en) {
>         s->dmar_enabled = true;
>         /* Ok - report back to driver */
>         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES);
>     } else {
>         s->dmar_enabled = false;
>     ...
> 
> You can vtd_switch_address_space_all(s, en) after this which will call this
> function. And another caller like you've pointed out has already call this
> through s->dmar_enabled. So en here is always s->dmar_enalbed?

Hmm, yes...

(I would still prefer keeping this parameter for readablility.
 Though, I prefer your suggestion to rename it to dmar_enabled)

-- peterx
Jason Wang Jan. 16, 2017, 8:25 a.m. UTC | #5
On 2017年01月16日 16:12, Peter Xu wrote:
> On Mon, Jan 16, 2017 at 04:01:00PM +0800, Jason Wang wrote:
>>
>> On 2017年01月16日 15:50, Peter Xu wrote:
>>> On Mon, Jan 16, 2017 at 02:20:31PM +0800, Jason Wang wrote:
>>>
>>> [...]
>>>
>>>>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>>>>> index fd75112..2596f11 100644
>>>>> --- a/hw/i386/intel_iommu.c
>>>>> +++ b/hw/i386/intel_iommu.c
>>>>> @@ -1343,9 +1343,49 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
>>>>>       vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
>>>>>   }
>>>>> +static void vtd_switch_address_space(VTDAddressSpace *as, bool iommu_enabled)
>>>> Looks like you can check s->dmar_enabled here?
>>> Yes, we need to check old state in case we don't need a switch at all.
>>> Actually I checked it...
>>>
>> I mean is there a chance that iommu_enabled( better name should be
>> dmar_enabled) is not equal to s->dmar_enabled? Looks not.
>>
>> vtd_handle_gcmd_te() did:
>>
>>      ...
>>      if (en) {
>>          s->dmar_enabled = true;
>>          /* Ok - report back to driver */
>>          vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES);
>>      } else {
>>          s->dmar_enabled = false;
>>      ...
>>
>> You can vtd_switch_address_space_all(s, en) after this which will call this
>> function. And another caller like you've pointed out has already call this
>> through s->dmar_enabled. So en here is always s->dmar_enalbed?
> Hmm, yes...
>
> (I would still prefer keeping this parameter for readablility.
>   Though, I prefer your suggestion to rename it to dmar_enabled)
>
> -- peterx

I think this does not give more readability :) May I was wrong, let 
leave this for maintainer.

Thanks :)
Peter Xu Jan. 16, 2017, 8:32 a.m. UTC | #6
On Mon, Jan 16, 2017 at 04:25:35PM +0800, Jason Wang wrote:
> 
> 
> On 2017年01月16日 16:12, Peter Xu wrote:
> >On Mon, Jan 16, 2017 at 04:01:00PM +0800, Jason Wang wrote:
> >>
> >>On 2017年01月16日 15:50, Peter Xu wrote:
> >>>On Mon, Jan 16, 2017 at 02:20:31PM +0800, Jason Wang wrote:
> >>>
> >>>[...]
> >>>
> >>>>>diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> >>>>>index fd75112..2596f11 100644
> >>>>>--- a/hw/i386/intel_iommu.c
> >>>>>+++ b/hw/i386/intel_iommu.c
> >>>>>@@ -1343,9 +1343,49 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
> >>>>>      vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
> >>>>>  }
> >>>>>+static void vtd_switch_address_space(VTDAddressSpace *as, bool iommu_enabled)
> >>>>Looks like you can check s->dmar_enabled here?
> >>>Yes, we need to check old state in case we don't need a switch at all.
> >>>Actually I checked it...
> >>>
> >>I mean is there a chance that iommu_enabled( better name should be
> >>dmar_enabled) is not equal to s->dmar_enabled? Looks not.
> >>
> >>vtd_handle_gcmd_te() did:
> >>
> >>     ...
> >>     if (en) {
> >>         s->dmar_enabled = true;
> >>         /* Ok - report back to driver */
> >>         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES);
> >>     } else {
> >>         s->dmar_enabled = false;
> >>     ...
> >>
> >>You can vtd_switch_address_space_all(s, en) after this which will call this
> >>function. And another caller like you've pointed out has already call this
> >>through s->dmar_enabled. So en here is always s->dmar_enalbed?
> >Hmm, yes...
> >
> >(I would still prefer keeping this parameter for readablility.
> >  Though, I prefer your suggestion to rename it to dmar_enabled)
> >
> >-- peterx
> 
> I think this does not give more readability :) May I was wrong, let leave
> this for maintainer.
> 
> Thanks :)

Thanks for reviewing this series so fast!

I have no strong opinion as well. Maybe you are right. :-)

Michael, please let me know if you dislike this, so I can remove this
parameter (it equals to as->iommu_state->dmar_enabled).

Thanks,

-- peterx
Michael S. Tsirkin Jan. 16, 2017, 4:25 p.m. UTC | #7
On Mon, Jan 16, 2017 at 04:32:24PM +0800, Peter Xu wrote:
> On Mon, Jan 16, 2017 at 04:25:35PM +0800, Jason Wang wrote:
> > 
> > 
> > On 2017年01月16日 16:12, Peter Xu wrote:
> > >On Mon, Jan 16, 2017 at 04:01:00PM +0800, Jason Wang wrote:
> > >>
> > >>On 2017年01月16日 15:50, Peter Xu wrote:
> > >>>On Mon, Jan 16, 2017 at 02:20:31PM +0800, Jason Wang wrote:
> > >>>
> > >>>[...]
> > >>>
> > >>>>>diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > >>>>>index fd75112..2596f11 100644
> > >>>>>--- a/hw/i386/intel_iommu.c
> > >>>>>+++ b/hw/i386/intel_iommu.c
> > >>>>>@@ -1343,9 +1343,49 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
> > >>>>>      vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
> > >>>>>  }
> > >>>>>+static void vtd_switch_address_space(VTDAddressSpace *as, bool iommu_enabled)
> > >>>>Looks like you can check s->dmar_enabled here?
> > >>>Yes, we need to check old state in case we don't need a switch at all.
> > >>>Actually I checked it...
> > >>>
> > >>I mean is there a chance that iommu_enabled( better name should be
> > >>dmar_enabled) is not equal to s->dmar_enabled? Looks not.
> > >>
> > >>vtd_handle_gcmd_te() did:
> > >>
> > >>     ...
> > >>     if (en) {
> > >>         s->dmar_enabled = true;
> > >>         /* Ok - report back to driver */
> > >>         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES);
> > >>     } else {
> > >>         s->dmar_enabled = false;
> > >>     ...
> > >>
> > >>You can vtd_switch_address_space_all(s, en) after this which will call this
> > >>function. And another caller like you've pointed out has already call this
> > >>through s->dmar_enabled. So en here is always s->dmar_enalbed?
> > >Hmm, yes...
> > >
> > >(I would still prefer keeping this parameter for readablility.
> > >  Though, I prefer your suggestion to rename it to dmar_enabled)
> > >
> > >-- peterx
> > 
> > I think this does not give more readability :) May I was wrong, let leave
> > this for maintainer.
> > 
> > Thanks :)
> 
> Thanks for reviewing this series so fast!
> 
> I have no strong opinion as well. Maybe you are right. :-)
> 
> Michael, please let me know if you dislike this, so I can remove this
> parameter (it equals to as->iommu_state->dmar_enabled).
> 
> Thanks,
> 
> -- peterx

I prefer not to duplicate data, yes.
Alex Williamson Jan. 16, 2017, 7:53 p.m. UTC | #8
On Fri, 13 Jan 2017 11:06:39 +0800
Peter Xu <peterx@redhat.com> wrote:

> This is preparation work to finally enabled dynamic switching ON/OFF for
> VT-d protection. The old VT-d codes is using static IOMMU address space,
> and that won't satisfy vfio-pci device listeners.
> 
> Let me explain.
> 
> vfio-pci devices depend on the memory region listener and IOMMU replay
> mechanism to make sure the device mapping is coherent with the guest
> even if there are domain switches. And there are two kinds of domain
> switches:
> 
>   (1) switch from domain A -> B
>   (2) switch from domain A -> no domain (e.g., turn DMAR off)
> 
> Case (1) is handled by the context entry invalidation handling by the
> VT-d replay logic. What the replay function should do here is to replay
> the existing page mappings in domain B.

There's really 2 steps here, right?  Invalidate A, replay B.  I think
the code handles this, but I want to make sure.  We don't want to end
up with a superset of both A & B.

On the invalidation, a future optimization when disabling an entire
memory region might also be to invalidate the entire range at once
rather than each individual mapping within the range, which I think is
what happens now, right?

> However for case (2), we don't want to replay any domain mappings - we
> just need the default GPA->HPA mappings (the address_space_memory
> mapping). And this patch helps on case (2) to build up the mapping
> automatically by leveraging the vfio-pci memory listeners.

Have you thought about using this address space switching to emulate
ecap.PT?  ie. advertise hardware based passthrough so that the guest
doesn't need to waste pagetable entries for a direct mapped, static
identity domain.

Otherwise the series looks pretty good to me.  Thanks,

Alex

> Another important thing that this patch does is to seperate
> IR (Interrupt Remapping) from DMAR (DMA Remapping). IR region should not
> depend on the DMAR region (like before this patch). It should be a
> standalone region, and it should be able to be activated without
> DMAR (which is a common behavior of Linux kernel - by default it enables
> IR while disabled DMAR).
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
> v3:
> - fix another trivial style issue patchew reported but I missed in v2
> 
> v2:
> - fix issues reported by patchew
> - switch domain by enable/disable memory regions [David]
> - provide vtd_switch_address_space{_all}()
> - provide a better comment on the memory regions
> 
> test done: with intel_iommu device, boot vm with/without
> "intel_iommu=on" parameter.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  hw/i386/intel_iommu.c         | 78 ++++++++++++++++++++++++++++++++++++++++---
>  hw/i386/trace-events          |  2 +-
>  include/hw/i386/intel_iommu.h |  2 ++
>  3 files changed, 77 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index fd75112..2596f11 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -1343,9 +1343,49 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
>      vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
>  }
>  
> +static void vtd_switch_address_space(VTDAddressSpace *as, bool iommu_enabled)
> +{
> +    assert(as);
> +
> +    trace_vtd_switch_address_space(pci_bus_num(as->bus),
> +                                   VTD_PCI_SLOT(as->devfn),
> +                                   VTD_PCI_FUNC(as->devfn),
> +                                   iommu_enabled);
> +
> +    /* Turn off first then on the other */
> +    if (iommu_enabled) {
> +        memory_region_set_enabled(&as->sys_alias, false);
> +        memory_region_set_enabled(&as->iommu, true);
> +    } else {
> +        memory_region_set_enabled(&as->iommu, false);
> +        memory_region_set_enabled(&as->sys_alias, true);
> +    }
> +}
> +
> +static void vtd_switch_address_space_all(IntelIOMMUState *s, bool enabled)
> +{
> +    GHashTableIter iter;
> +    VTDBus *vtd_bus;
> +    int i;
> +
> +    g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
> +    while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
> +        for (i = 0; i < X86_IOMMU_PCI_DEVFN_MAX; i++) {
> +            if (!vtd_bus->dev_as[i]) {
> +                continue;
> +            }
> +            vtd_switch_address_space(vtd_bus->dev_as[i], enabled);
> +        }
> +    }
> +}
> +
>  /* Handle Translation Enable/Disable */
>  static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
>  {
> +    if (s->dmar_enabled == en) {
> +        return;
> +    }
> +
>      VTD_DPRINTF(CSR, "Translation Enable %s", (en ? "on" : "off"));
>  
>      if (en) {
> @@ -1360,6 +1400,8 @@ static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
>          /* Ok - report back to driver */
>          vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0);
>      }
> +
> +    vtd_switch_address_space_all(s, en);
>  }
>  
>  /* Handle Interrupt Remap Enable/Disable */
> @@ -2586,15 +2628,43 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>          vtd_dev_as->devfn = (uint8_t)devfn;
>          vtd_dev_as->iommu_state = s;
>          vtd_dev_as->context_cache_entry.context_cache_gen = 0;
> +
> +        /*
> +         * Memory region relationships looks like (Address range shows
> +         * only lower 32 bits to make it short in length...):
> +         *
> +         * |-----------------+-------------------+----------|
> +         * | Name            | Address range     | Priority |
> +         * |-----------------+-------------------+----------+
> +         * | vtd_root        | 00000000-ffffffff |        0 |
> +         * |  intel_iommu    | 00000000-ffffffff |        1 |
> +         * |  vtd_sys_alias  | 00000000-ffffffff |        1 |
> +         * |  intel_iommu_ir | fee00000-feefffff |       64 |
> +         * |-----------------+-------------------+----------|
> +         *
> +         * We enable/disable DMAR by switching enablement for
> +         * vtd_sys_alias and intel_iommu regions. IR region is always
> +         * enabled.
> +         */
>          memory_region_init_iommu(&vtd_dev_as->iommu, OBJECT(s),
>                                   &s->iommu_ops, "intel_iommu", UINT64_MAX);
> +        memory_region_init_alias(&vtd_dev_as->sys_alias, OBJECT(s),
> +                                 "vtd_sys_alias", get_system_memory(),
> +                                 0, memory_region_size(get_system_memory()));
>          memory_region_init_io(&vtd_dev_as->iommu_ir, OBJECT(s),
>                                &vtd_mem_ir_ops, s, "intel_iommu_ir",
>                                VTD_INTERRUPT_ADDR_SIZE);
> -        memory_region_add_subregion(&vtd_dev_as->iommu, VTD_INTERRUPT_ADDR_FIRST,
> -                                    &vtd_dev_as->iommu_ir);
> -        address_space_init(&vtd_dev_as->as,
> -                           &vtd_dev_as->iommu, name);
> +        memory_region_init(&vtd_dev_as->root, OBJECT(s),
> +                           "vtd_root", UINT64_MAX);
> +        memory_region_add_subregion_overlap(&vtd_dev_as->root,
> +                                            VTD_INTERRUPT_ADDR_FIRST,
> +                                            &vtd_dev_as->iommu_ir, 64);
> +        address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, name);
> +        memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
> +                                            &vtd_dev_as->sys_alias, 1);
> +        memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
> +                                            &vtd_dev_as->iommu, 1);
> +        vtd_switch_address_space(vtd_dev_as, s->dmar_enabled);
>      }
>      return vtd_dev_as;
>  }
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index 92d210d..beaef61 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -11,7 +11,6 @@ xen_pv_mmio_write(uint64_t addr) "WARNING: write to Xen PV Device MMIO space (ad
>  x86_iommu_iec_notify(bool global, uint32_t index, uint32_t mask) "Notify IEC invalidation: global=%d index=%" PRIu32 " mask=%" PRIu32
>  
>  # hw/i386/intel_iommu.c
> -vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
>  vtd_inv_desc(const char *type, uint64_t hi, uint64_t lo) "invalidate desc type %s high 0x%"PRIx64" low 0x%"PRIx64
>  vtd_inv_desc_cc_domain(uint16_t domain) "context invalidate domain 0x%"PRIx16
>  vtd_inv_desc_cc_global(void) "context invalidate globally"
> @@ -37,6 +36,7 @@ vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, in
>  vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
>  vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
>  vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
> +vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
>  
>  # hw/i386/amd_iommu.c
>  amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" +  offset 0x%"PRIx32
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index 749eef9..9c3f6c0 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -83,6 +83,8 @@ struct VTDAddressSpace {
>      uint8_t devfn;
>      AddressSpace as;
>      MemoryRegion iommu;
> +    MemoryRegion root;
> +    MemoryRegion sys_alias;
>      MemoryRegion iommu_ir;      /* Interrupt region: 0xfeeXXXXX */
>      IntelIOMMUState *iommu_state;
>      VTDContextCacheEntry context_cache_entry;
Peter Xu Jan. 17, 2017, 2 p.m. UTC | #9
On Mon, Jan 16, 2017 at 12:53:57PM -0700, Alex Williamson wrote:
> On Fri, 13 Jan 2017 11:06:39 +0800
> Peter Xu <peterx@redhat.com> wrote:
> 
> > This is preparation work to finally enabled dynamic switching ON/OFF for
> > VT-d protection. The old VT-d codes is using static IOMMU address space,
> > and that won't satisfy vfio-pci device listeners.
> > 
> > Let me explain.
> > 
> > vfio-pci devices depend on the memory region listener and IOMMU replay
> > mechanism to make sure the device mapping is coherent with the guest
> > even if there are domain switches. And there are two kinds of domain
> > switches:
> > 
> >   (1) switch from domain A -> B
> >   (2) switch from domain A -> no domain (e.g., turn DMAR off)
> > 
> > Case (1) is handled by the context entry invalidation handling by the
> > VT-d replay logic. What the replay function should do here is to replay
> > the existing page mappings in domain B.
> 
> There's really 2 steps here, right?  Invalidate A, replay B.  I think
> the code handles this, but I want to make sure.  We don't want to end
> up with a superset of both A & B.

First of all, this discussion should be beyond this patch's scope,
since this patch is currently only handling the case when guest
disables DMAR in general.

Then, my understanding for above question: when we do A -> B domain
switch, guest will not send specific context entry invalidations for
A, but will for sure send one when context entry is ready for B. In
that sense, IMO we don't have a clear "two steps", only one, which is
the latter "replay B". We do correct unmap based on the PSIs
(page-selective invalidations) of A when guest unmaps the pages in A.

So, for the use case of nested device assignment (which is the goal of
this series for now):

- L1 guest put device D1,D2,... of L2 guest into domain A
- L1 guest map the L2 memory into L1 address space (L2GPA -> L1GPA)
- ... (L2 guest runs, until it stops running)
- L1 guest unmap all the pages in domain A
- L1 guest move device D1,D2,... of L2 guest outside domain A

This series should work for above, since before any device leaves its
domain, the domain will be clean and without unmapped pages.

However, if we have the following scenario (which I don't know whether
this's achievable):

- guest iommu domain A has device D1, D2
- guest iommu domain B has device D3
- move device D2 from domain A into B

Here when D2 move from A to B, IIUC our current Linux IOMMU driver
code will not send any PSI (page-selected invalidations) for D2 or
domain A because domain A still has device in it, guest should only
send a context entry invalidation for device D2, telling that D2 has
switched to domain B. In that case, I am not sure whether current
series can work properly, and IMHO we may need to have the domain
knowledge in VT-d emulation code (while we don't have it yet) in the
future to further support this kind of domain switches.

> 
> On the invalidation, a future optimization when disabling an entire
> memory region might also be to invalidate the entire range at once
> rather than each individual mapping within the range, which I think is
> what happens now, right?

Right. IIUC this can be an enhancement to current page walk logic - we
can coalesce continuous IOTLB with same property and notify only once
for these coalesced entries.

Noted in my todo list.

> 
> > However for case (2), we don't want to replay any domain mappings - we
> > just need the default GPA->HPA mappings (the address_space_memory
> > mapping). And this patch helps on case (2) to build up the mapping
> > automatically by leveraging the vfio-pci memory listeners.
> 
> Have you thought about using this address space switching to emulate
> ecap.PT?  ie. advertise hardware based passthrough so that the guest
> doesn't need to waste pagetable entries for a direct mapped, static
> identity domain.

Kind of. Currently we still don't have iommu=pt for the emulated code.
We can achieve that by leveraging this patch.

> 
> Otherwise the series looks pretty good to me.  Thanks,

Your review comment is really important to me. Thanks!

I'll see whether we can get to a consensus on above issue, then repost
with existing fixes.

Thanks,

-- peterx
Peter Xu Jan. 17, 2017, 2:53 p.m. UTC | #10
On Mon, Jan 16, 2017 at 06:25:32PM +0200, Michael S. Tsirkin wrote:

[...]

> > > I think this does not give more readability :) May I was wrong, let leave
> > > this for maintainer.
> > > 
> > > Thanks :)
> > 
> > Thanks for reviewing this series so fast!
> > 
> > I have no strong opinion as well. Maybe you are right. :-)
> > 
> > Michael, please let me know if you dislike this, so I can remove this
> > parameter (it equals to as->iommu_state->dmar_enabled).
> > 
> > Thanks,
> > 
> > -- peterx
> 
> I prefer not to duplicate data, yes.

Let me remove it then. Thanks,

-- peterx
Alex Williamson Jan. 17, 2017, 3:46 p.m. UTC | #11
On Tue, 17 Jan 2017 22:00:00 +0800
Peter Xu <peterx@redhat.com> wrote:

> On Mon, Jan 16, 2017 at 12:53:57PM -0700, Alex Williamson wrote:
> > On Fri, 13 Jan 2017 11:06:39 +0800
> > Peter Xu <peterx@redhat.com> wrote:
> >   
> > > This is preparation work to finally enabled dynamic switching ON/OFF for
> > > VT-d protection. The old VT-d codes is using static IOMMU address space,
> > > and that won't satisfy vfio-pci device listeners.
> > > 
> > > Let me explain.
> > > 
> > > vfio-pci devices depend on the memory region listener and IOMMU replay
> > > mechanism to make sure the device mapping is coherent with the guest
> > > even if there are domain switches. And there are two kinds of domain
> > > switches:
> > > 
> > >   (1) switch from domain A -> B
> > >   (2) switch from domain A -> no domain (e.g., turn DMAR off)
> > > 
> > > Case (1) is handled by the context entry invalidation handling by the
> > > VT-d replay logic. What the replay function should do here is to replay
> > > the existing page mappings in domain B.  
> > 
> > There's really 2 steps here, right?  Invalidate A, replay B.  I think
> > the code handles this, but I want to make sure.  We don't want to end
> > up with a superset of both A & B.  
> 
> First of all, this discussion should be beyond this patch's scope,
> since this patch is currently only handling the case when guest
> disables DMAR in general.
> 
> Then, my understanding for above question: when we do A -> B domain
> switch, guest will not send specific context entry invalidations for
> A, but will for sure send one when context entry is ready for B. In
> that sense, IMO we don't have a clear "two steps", only one, which is
> the latter "replay B". We do correct unmap based on the PSIs
> (page-selective invalidations) of A when guest unmaps the pages in A.
> 
> So, for the use case of nested device assignment (which is the goal of
> this series for now):
> 
> - L1 guest put device D1,D2,... of L2 guest into domain A
> - L1 guest map the L2 memory into L1 address space (L2GPA -> L1GPA)
> - ... (L2 guest runs, until it stops running)
> - L1 guest unmap all the pages in domain A
> - L1 guest move device D1,D2,... of L2 guest outside domain A
> 
> This series should work for above, since before any device leaves its
> domain, the domain will be clean and without unmapped pages.
> 
> However, if we have the following scenario (which I don't know whether
> this's achievable):
> 
> - guest iommu domain A has device D1, D2
> - guest iommu domain B has device D3
> - move device D2 from domain A into B
> 
> Here when D2 move from A to B, IIUC our current Linux IOMMU driver
> code will not send any PSI (page-selected invalidations) for D2 or
> domain A because domain A still has device in it, guest should only
> send a context entry invalidation for device D2, telling that D2 has
> switched to domain B. In that case, I am not sure whether current
> series can work properly, and IMHO we may need to have the domain
> knowledge in VT-d emulation code (while we don't have it yet) in the
> future to further support this kind of domain switches.

This is a serious issue that needs to be resolved.  The context entry
invalidation when D2 is switched from A->B must unmap anything from
domain A before the replay of domain B.  Your example is easily
achieved, for instance what if domain A is the SI (static identity)
domain for the L1 guest, domain B is the device assignment domain for
the L2 guest with current device D3.  The user hot adds device D2 into
the L2 guest moving it from the L1 SI domain to the device assignment
domain.  vfio will not override existing mappings on replay, it will
error, giving the L2 guest a device with access to the static identity
mappings of the L1 host.  This isn't acceptable.
 
> > On the invalidation, a future optimization when disabling an entire
> > memory region might also be to invalidate the entire range at once
> > rather than each individual mapping within the range, which I think is
> > what happens now, right?  
> 
> Right. IIUC this can be an enhancement to current page walk logic - we
> can coalesce continuous IOTLB with same property and notify only once
> for these coalesced entries.
> 
> Noted in my todo list.

A context entry invalidation as in the example above might make use of
this to skip any sort of page walk logic, simply invalidate the entire
address space.

> >   
> > > However for case (2), we don't want to replay any domain mappings - we
> > > just need the default GPA->HPA mappings (the address_space_memory
> > > mapping). And this patch helps on case (2) to build up the mapping
> > > automatically by leveraging the vfio-pci memory listeners.  
> > 
> > Have you thought about using this address space switching to emulate
> > ecap.PT?  ie. advertise hardware based passthrough so that the guest
> > doesn't need to waste pagetable entries for a direct mapped, static
> > identity domain.  
> 
> Kind of. Currently we still don't have iommu=pt for the emulated code.
> We can achieve that by leveraging this patch.

Well, we have iommu=pt, but the L1 guest will implement this as a fully
populated SI domain rather than as a bit in the context entry to do
hardware direct translation.  Given the mapping overhead through vfio,
the L1 guest will always want to use iommu=pt as dynamic mapping
performance is going to be horrid.  Thanks,

Alex
Peter Xu Jan. 18, 2017, 7:49 a.m. UTC | #12
On Tue, Jan 17, 2017 at 08:46:04AM -0700, Alex Williamson wrote:
> On Tue, 17 Jan 2017 22:00:00 +0800
> Peter Xu <peterx@redhat.com> wrote:
> 
> > On Mon, Jan 16, 2017 at 12:53:57PM -0700, Alex Williamson wrote:
> > > On Fri, 13 Jan 2017 11:06:39 +0800
> > > Peter Xu <peterx@redhat.com> wrote:
> > >   
> > > > This is preparation work to finally enabled dynamic switching ON/OFF for
> > > > VT-d protection. The old VT-d codes is using static IOMMU address space,
> > > > and that won't satisfy vfio-pci device listeners.
> > > > 
> > > > Let me explain.
> > > > 
> > > > vfio-pci devices depend on the memory region listener and IOMMU replay
> > > > mechanism to make sure the device mapping is coherent with the guest
> > > > even if there are domain switches. And there are two kinds of domain
> > > > switches:
> > > > 
> > > >   (1) switch from domain A -> B
> > > >   (2) switch from domain A -> no domain (e.g., turn DMAR off)
> > > > 
> > > > Case (1) is handled by the context entry invalidation handling by the
> > > > VT-d replay logic. What the replay function should do here is to replay
> > > > the existing page mappings in domain B.  
> > > 
> > > There's really 2 steps here, right?  Invalidate A, replay B.  I think
> > > the code handles this, but I want to make sure.  We don't want to end
> > > up with a superset of both A & B.  
> > 
> > First of all, this discussion should be beyond this patch's scope,
> > since this patch is currently only handling the case when guest
> > disables DMAR in general.
> > 
> > Then, my understanding for above question: when we do A -> B domain
> > switch, guest will not send specific context entry invalidations for
> > A, but will for sure send one when context entry is ready for B. In
> > that sense, IMO we don't have a clear "two steps", only one, which is
> > the latter "replay B". We do correct unmap based on the PSIs
> > (page-selective invalidations) of A when guest unmaps the pages in A.
> > 
> > So, for the use case of nested device assignment (which is the goal of
> > this series for now):
> > 
> > - L1 guest put device D1,D2,... of L2 guest into domain A
> > - L1 guest map the L2 memory into L1 address space (L2GPA -> L1GPA)
> > - ... (L2 guest runs, until it stops running)
> > - L1 guest unmap all the pages in domain A
> > - L1 guest move device D1,D2,... of L2 guest outside domain A
> > 
> > This series should work for above, since before any device leaves its
> > domain, the domain will be clean and without unmapped pages.
> > 
> > However, if we have the following scenario (which I don't know whether
> > this's achievable):
> > 
> > - guest iommu domain A has device D1, D2
> > - guest iommu domain B has device D3
> > - move device D2 from domain A into B
> > 
> > Here when D2 move from A to B, IIUC our current Linux IOMMU driver
> > code will not send any PSI (page-selected invalidations) for D2 or
> > domain A because domain A still has device in it, guest should only
> > send a context entry invalidation for device D2, telling that D2 has
> > switched to domain B. In that case, I am not sure whether current
> > series can work properly, and IMHO we may need to have the domain
> > knowledge in VT-d emulation code (while we don't have it yet) in the
> > future to further support this kind of domain switches.
> 
> This is a serious issue that needs to be resolved.  The context entry
> invalidation when D2 is switched from A->B must unmap anything from
> domain A before the replay of domain B.  Your example is easily
> achieved, for instance what if domain A is the SI (static identity)
> domain for the L1 guest, domain B is the device assignment domain for
> the L2 guest with current device D3.  The user hot adds device D2 into
> the L2 guest moving it from the L1 SI domain to the device assignment
> domain.  vfio will not override existing mappings on replay, it will
> error, giving the L2 guest a device with access to the static identity
> mappings of the L1 host.  This isn't acceptable.
>  
> > > On the invalidation, a future optimization when disabling an entire
> > > memory region might also be to invalidate the entire range at once
> > > rather than each individual mapping within the range, which I think is
> > > what happens now, right?  
> > 
> > Right. IIUC this can be an enhancement to current page walk logic - we
> > can coalesce continuous IOTLB with same property and notify only once
> > for these coalesced entries.
> > 
> > Noted in my todo list.
> 
> A context entry invalidation as in the example above might make use of
> this to skip any sort of page walk logic, simply invalidate the entire
> address space.

Alex, I got one more thing to ask:

I was trying to invalidate the entire address space by sending a big
IOTLB notification to vfio-pci, which looks like:

  IOMMUTLBEntry entry = {
      .target_as = &address_space_memory,
      .iova = 0,
      .translated_addr = 0,
      .addr_mask = (1 << 63) - 1,
      .perm = IOMMU_NONE,     /* UNMAP */
  };

Then I feed this entry to vfio-pci IOMMU notifier.

However, this was blocked in vfio_iommu_map_notify(), with error:

  qemu-system-x86_64: iommu has granularity incompatible with target AS

Since we have:

  /*
   * The IOMMU TLB entry we have just covers translation through
   * this IOMMU to its immediate target.  We need to translate
   * it the rest of the way through to memory.
   */
  rcu_read_lock();
  mr = address_space_translate(&address_space_memory,
                               iotlb->translated_addr,
                               &xlat, &len, iotlb->perm & IOMMU_WO);
  if (!memory_region_is_ram(mr)) {
      error_report("iommu map to non memory area %"HWADDR_PRIx"",
                   xlat);
      goto out;
  }
  /*
   * Translation truncates length to the IOMMU page size,
   * check that it did not truncate too much.
   */
  if (len & iotlb->addr_mask) {
      error_report("iommu has granularity incompatible with target AS");
      goto out;
  }

In my case len == 0xa0000 (that's the translation result), and
iotlb->addr_mask == (1<<63)-1. So looks like the translation above
splitted the big region and a simple big UNMAP doesn't work for me.

Do you have any suggestion on how I can solve this? In what case will
we need the above address_space_translate()?

> 
> > >   
> > > > However for case (2), we don't want to replay any domain mappings - we
> > > > just need the default GPA->HPA mappings (the address_space_memory
> > > > mapping). And this patch helps on case (2) to build up the mapping
> > > > automatically by leveraging the vfio-pci memory listeners.  
> > > 
> > > Have you thought about using this address space switching to emulate
> > > ecap.PT?  ie. advertise hardware based passthrough so that the guest
> > > doesn't need to waste pagetable entries for a direct mapped, static
> > > identity domain.  
> > 
> > Kind of. Currently we still don't have iommu=pt for the emulated code.
> > We can achieve that by leveraging this patch.
> 
> Well, we have iommu=pt, but the L1 guest will implement this as a fully
> populated SI domain rather than as a bit in the context entry to do
> hardware direct translation.  Given the mapping overhead through vfio,
> the L1 guest will always want to use iommu=pt as dynamic mapping
> performance is going to be horrid.  Thanks,

I see, so we have iommu=pt in guest even VT-d emulation does not
provide that bit. Anyway, supporting ecap.pt is in my todo list.

Thanks,

-- peterx
Peter Xu Jan. 19, 2017, 8:20 a.m. UTC | #13
On Wed, Jan 18, 2017 at 03:49:44PM +0800, Peter Xu wrote:

[...]

> I was trying to invalidate the entire address space by sending a big
> IOTLB notification to vfio-pci, which looks like:
> 
>   IOMMUTLBEntry entry = {
>       .target_as = &address_space_memory,
>       .iova = 0,
>       .translated_addr = 0,
>       .addr_mask = (1 << 63) - 1,
>       .perm = IOMMU_NONE,     /* UNMAP */
>   };
> 
> Then I feed this entry to vfio-pci IOMMU notifier.
> 
> However, this was blocked in vfio_iommu_map_notify(), with error:
> 
>   qemu-system-x86_64: iommu has granularity incompatible with target AS
> 
> Since we have:
> 
>   /*
>    * The IOMMU TLB entry we have just covers translation through
>    * this IOMMU to its immediate target.  We need to translate
>    * it the rest of the way through to memory.
>    */
>   rcu_read_lock();
>   mr = address_space_translate(&address_space_memory,
>                                iotlb->translated_addr,
>                                &xlat, &len, iotlb->perm & IOMMU_WO);
>   if (!memory_region_is_ram(mr)) {
>       error_report("iommu map to non memory area %"HWADDR_PRIx"",
>                    xlat);
>       goto out;
>   }
>   /*
>    * Translation truncates length to the IOMMU page size,
>    * check that it did not truncate too much.
>    */
>   if (len & iotlb->addr_mask) {
>       error_report("iommu has granularity incompatible with target AS");
>       goto out;
>   }
> 
> In my case len == 0xa0000 (that's the translation result), and
> iotlb->addr_mask == (1<<63)-1. So looks like the translation above
> splitted the big region and a simple big UNMAP doesn't work for me.
> 
> Do you have any suggestion on how I can solve this? In what case will
> we need the above address_space_translate()?

Hmm... it should be checking that the translated address range is RAM.

However if with this, IOMMU notifiers won't be able to leverage the
vfio driver feature to unmap a very big region.

IMHO the check should only be meaningful for map operations. I'll try
to post a RFC patch for vfio-pci to allow unmap of very big regions,
to see whether that'll be a workable approach.

Thanks,

-- peterx
diff mbox

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index fd75112..2596f11 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1343,9 +1343,49 @@  static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
     vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
 }
 
+static void vtd_switch_address_space(VTDAddressSpace *as, bool iommu_enabled)
+{
+    assert(as);
+
+    trace_vtd_switch_address_space(pci_bus_num(as->bus),
+                                   VTD_PCI_SLOT(as->devfn),
+                                   VTD_PCI_FUNC(as->devfn),
+                                   iommu_enabled);
+
+    /* Turn off first then on the other */
+    if (iommu_enabled) {
+        memory_region_set_enabled(&as->sys_alias, false);
+        memory_region_set_enabled(&as->iommu, true);
+    } else {
+        memory_region_set_enabled(&as->iommu, false);
+        memory_region_set_enabled(&as->sys_alias, true);
+    }
+}
+
+static void vtd_switch_address_space_all(IntelIOMMUState *s, bool enabled)
+{
+    GHashTableIter iter;
+    VTDBus *vtd_bus;
+    int i;
+
+    g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
+    while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
+        for (i = 0; i < X86_IOMMU_PCI_DEVFN_MAX; i++) {
+            if (!vtd_bus->dev_as[i]) {
+                continue;
+            }
+            vtd_switch_address_space(vtd_bus->dev_as[i], enabled);
+        }
+    }
+}
+
 /* Handle Translation Enable/Disable */
 static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
 {
+    if (s->dmar_enabled == en) {
+        return;
+    }
+
     VTD_DPRINTF(CSR, "Translation Enable %s", (en ? "on" : "off"));
 
     if (en) {
@@ -1360,6 +1400,8 @@  static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
         /* Ok - report back to driver */
         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0);
     }
+
+    vtd_switch_address_space_all(s, en);
 }
 
 /* Handle Interrupt Remap Enable/Disable */
@@ -2586,15 +2628,43 @@  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
         vtd_dev_as->devfn = (uint8_t)devfn;
         vtd_dev_as->iommu_state = s;
         vtd_dev_as->context_cache_entry.context_cache_gen = 0;
+
+        /*
+         * Memory region relationships looks like (Address range shows
+         * only lower 32 bits to make it short in length...):
+         *
+         * |-----------------+-------------------+----------|
+         * | Name            | Address range     | Priority |
+         * |-----------------+-------------------+----------+
+         * | vtd_root        | 00000000-ffffffff |        0 |
+         * |  intel_iommu    | 00000000-ffffffff |        1 |
+         * |  vtd_sys_alias  | 00000000-ffffffff |        1 |
+         * |  intel_iommu_ir | fee00000-feefffff |       64 |
+         * |-----------------+-------------------+----------|
+         *
+         * We enable/disable DMAR by switching enablement for
+         * vtd_sys_alias and intel_iommu regions. IR region is always
+         * enabled.
+         */
         memory_region_init_iommu(&vtd_dev_as->iommu, OBJECT(s),
                                  &s->iommu_ops, "intel_iommu", UINT64_MAX);
+        memory_region_init_alias(&vtd_dev_as->sys_alias, OBJECT(s),
+                                 "vtd_sys_alias", get_system_memory(),
+                                 0, memory_region_size(get_system_memory()));
         memory_region_init_io(&vtd_dev_as->iommu_ir, OBJECT(s),
                               &vtd_mem_ir_ops, s, "intel_iommu_ir",
                               VTD_INTERRUPT_ADDR_SIZE);
-        memory_region_add_subregion(&vtd_dev_as->iommu, VTD_INTERRUPT_ADDR_FIRST,
-                                    &vtd_dev_as->iommu_ir);
-        address_space_init(&vtd_dev_as->as,
-                           &vtd_dev_as->iommu, name);
+        memory_region_init(&vtd_dev_as->root, OBJECT(s),
+                           "vtd_root", UINT64_MAX);
+        memory_region_add_subregion_overlap(&vtd_dev_as->root,
+                                            VTD_INTERRUPT_ADDR_FIRST,
+                                            &vtd_dev_as->iommu_ir, 64);
+        address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, name);
+        memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
+                                            &vtd_dev_as->sys_alias, 1);
+        memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
+                                            &vtd_dev_as->iommu, 1);
+        vtd_switch_address_space(vtd_dev_as, s->dmar_enabled);
     }
     return vtd_dev_as;
 }
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 92d210d..beaef61 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -11,7 +11,6 @@  xen_pv_mmio_write(uint64_t addr) "WARNING: write to Xen PV Device MMIO space (ad
 x86_iommu_iec_notify(bool global, uint32_t index, uint32_t mask) "Notify IEC invalidation: global=%d index=%" PRIu32 " mask=%" PRIu32
 
 # hw/i386/intel_iommu.c
-vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
 vtd_inv_desc(const char *type, uint64_t hi, uint64_t lo) "invalidate desc type %s high 0x%"PRIx64" low 0x%"PRIx64
 vtd_inv_desc_cc_domain(uint16_t domain) "context invalidate domain 0x%"PRIx16
 vtd_inv_desc_cc_global(void) "context invalidate globally"
@@ -37,6 +36,7 @@  vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, in
 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
 vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
 vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
+vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
 
 # hw/i386/amd_iommu.c
 amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" +  offset 0x%"PRIx32
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 749eef9..9c3f6c0 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -83,6 +83,8 @@  struct VTDAddressSpace {
     uint8_t devfn;
     AddressSpace as;
     MemoryRegion iommu;
+    MemoryRegion root;
+    MemoryRegion sys_alias;
     MemoryRegion iommu_ir;      /* Interrupt region: 0xfeeXXXXX */
     IntelIOMMUState *iommu_state;
     VTDContextCacheEntry context_cache_entry;