
[1/3] virtio-iommu: Add bypass mode support to assigned device

Message ID 20220613061010.2674054-2-zhenzhong.duan@intel.com (mailing list archive)
State New, archived
Series Add bypass mode support to assigned device

Commit Message

Duan, Zhenzhong June 13, 2022, 6:10 a.m. UTC
Currently assigned devices cannot work in virtio-iommu bypass mode:
the guest driver fails to probe the device due to DMA failures, which
are caused by the missing GPA -> HPA mappings when the VM is created.

Add a root container memory region to hold both the bypass memory
region and the iommu memory region, so that switching between them is
supported, just like the implementation in virtual VT-d.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 hw/virtio/trace-events           |   1 +
 hw/virtio/virtio-iommu.c         | 115 ++++++++++++++++++++++++++++++-
 include/hw/virtio/virtio-iommu.h |   2 +
 3 files changed, 116 insertions(+), 2 deletions(-)
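
In outline, the patch gives each device a root container with two
overlapping sub-regions and enables exactly one of them at a time; a
condensed sketch of that layering and of the switch (taken from the
hunks below, details omitted):

    sdev->root                   /* container, size UINT64_MAX                */
      +-- sdev->iommu_mr         /* IOMMU MR, enabled when translating        */
      +-- sdev->bypass_mr        /* alias of system memory, enabled in bypass */

    /* Turn one off before turning the other on */
    if (use_remapping) {
        memory_region_set_enabled(&sdev->bypass_mr, false);
        memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), true);
    } else {
        memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), false);
        memory_region_set_enabled(&sdev->bypass_mr, true);
    }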

Comments

Eric Auger June 23, 2022, 4:52 p.m. UTC | #1
Hi,
On 6/13/22 08:10, Zhenzhong Duan wrote:
> Currently assigned devices can not work in virtio-iommu bypass mode.
> Guest driver fails to probe the device due to DMA failure. And the
> reason is because of lacking GPA -> HPA mappings when VM is created.
>
> Add a root container memory region to hold both bypass memory region
> and iommu memory region, so the switch between them is supported
> just like the implementation in virtual VT-d.
>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---
>  hw/virtio/trace-events           |   1 +
>  hw/virtio/virtio-iommu.c         | 115 ++++++++++++++++++++++++++++++-
>  include/hw/virtio/virtio-iommu.h |   2 +
>  3 files changed, 116 insertions(+), 2 deletions(-)
>
> diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
> index ab8e095b73fa..20af2e7ebd78 100644
> --- a/hw/virtio/trace-events
> +++ b/hw/virtio/trace-events
> @@ -124,6 +124,7 @@ virtio_iommu_remap(const char *name, uint64_t virt_start, uint64_t virt_end, uin
>  virtio_iommu_set_page_size_mask(const char *name, uint64_t old, uint64_t new) "mr=%s old_mask=0x%"PRIx64" new_mask=0x%"PRIx64
>  virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s"
>  virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s"
> +virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
>  
>  # virtio-mem.c
>  virtio_mem_send_response(uint16_t type) "type=%" PRIu16
> diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
> index 2597e166f939..ff718107ee02 100644
> --- a/hw/virtio/virtio-iommu.c
> +++ b/hw/virtio/virtio-iommu.c
> @@ -69,6 +69,77 @@ static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev)
>      return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);
>  }
>  
> +static bool virtio_iommu_device_bypassed(IOMMUDevice *sdev)
> +{
> +    uint32_t sid;
> +    bool bypassed;
> +    VirtIOIOMMU *s = sdev->viommu;
> +    VirtIOIOMMUEndpoint *ep;
> +
> +    sid = virtio_iommu_get_bdf(sdev);
> +
> +    qemu_mutex_lock(&s->mutex);
> +    /* need to check bypass before system reset */
> +    if (!s->endpoints) {
> +        bypassed = s->config.bypass;
> +        goto unlock;
> +    }
> +
> +    ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
> +    if (!ep || !ep->domain) {
> +        bypassed = s->config.bypass;
> +    } else {
> +        bypassed = ep->domain->bypass;
> +    }
> +
> +unlock:
> +    qemu_mutex_unlock(&s->mutex);
> +    return bypassed;
> +}
> +
> +/* Return whether the device is using IOMMU translation. */
> +static bool virtio_iommu_switch_address_space(IOMMUDevice *sdev)
> +{
> +    bool use_remapping;
> +
> +    assert(sdev);
> +
> +    use_remapping = !virtio_iommu_device_bypassed(sdev);
> +
> +    trace_virtio_iommu_switch_address_space(pci_bus_num(sdev->bus),
> +                                            PCI_SLOT(sdev->devfn),
> +                                            PCI_FUNC(sdev->devfn),
> +                                            use_remapping);
> +
> +    /* Turn off first then on the other */
> +    if (use_remapping) {
> +        memory_region_set_enabled(&sdev->bypass_mr, false);
> +        memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), true);
> +    } else {
> +        memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), false);
> +        memory_region_set_enabled(&sdev->bypass_mr, true);
> +    }
> +
> +    return use_remapping;
> +}
> +
> +static void virtio_iommu_switch_address_space_all(VirtIOIOMMU *s)
> +{
> +    GHashTableIter iter;
> +    IOMMUPciBus *iommu_pci_bus;
> +    int i;
> +
> +    g_hash_table_iter_init(&iter, s->as_by_busptr);
> +    while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) {
> +        for (i = 0; i < PCI_DEVFN_MAX; i++) {
> +            if (!iommu_pci_bus->pbdev[i]) {
> +                continue;
> +            }
> +            virtio_iommu_switch_address_space(iommu_pci_bus->pbdev[i]);
> +        }
> +    }
> +}
> +
>  /**
>   * The bus number is used for lookup when SID based operations occur.
>   * In that case we lazily populate the IOMMUPciBus array from the bus hash
> @@ -213,6 +284,7 @@ static gboolean virtio_iommu_notify_map_cb(gpointer key, gpointer value,
>  static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep)
>  {
>      VirtIOIOMMUDomain *domain = ep->domain;
> +    IOMMUDevice *sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr);
>  
>      if (!ep->domain) {
>          return;
> @@ -221,6 +293,7 @@ static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep)
>                     ep->iommu_mr);
>      QLIST_REMOVE(ep, next);
>      ep->domain = NULL;
> +    virtio_iommu_switch_address_space(sdev);
>  }
>  
>  static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s,
> @@ -323,12 +396,39 @@ static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque,
>  
>          trace_virtio_iommu_init_iommu_mr(name);
>  
> +        memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX);
> +        address_space_init(&sdev->as, &sdev->root, TYPE_VIRTIO_IOMMU);
> +
> +        /*
> +         * Build the IOMMU disabled container with aliases to the
> +         * shared MRs.  Note that aliasing to a shared memory region
What do you call the 'disabled container' and the shared MRs?

> +         * could help the memory API to detect same FlatViews so we
> +         * can have devices to share the same FlatView when in bypass
> +         * mode. (either by not configuring virtio-iommu driver or with
> +         * "iommu=pt").  It will greatly reduce the total number of
> +         * FlatViews of the system hence VM runs faster.

Do you mean that we could have used a shared bypass MR instead of
creating one per device?
> +         */
> +        memory_region_init_alias(&sdev->bypass_mr, OBJECT(s),
> +                                 "system", get_system_memory(), 0,
> +                                 memory_region_size(get_system_memory()));
> +
>          memory_region_init_iommu(&sdev->iommu_mr, sizeof(sdev->iommu_mr),
>                                   TYPE_VIRTIO_IOMMU_MEMORY_REGION,
>                                   OBJECT(s), name,
>                                   UINT64_MAX);
> -        address_space_init(&sdev->as,
> -                           MEMORY_REGION(&sdev->iommu_mr), TYPE_VIRTIO_IOMMU);
> +
> +        /*
> +         * Hook both the containers under the root container, we
Did you mean "hook both sub-regions to the root container"?
> +         * switch between iommu & bypass MRs by enable/disable
> +         * corresponding sub-containers
> +         */
> +        memory_region_add_subregion_overlap(&sdev->root, 0,
> +                                            MEMORY_REGION(&sdev->iommu_mr),
> +                                            0);
> +        memory_region_add_subregion_overlap(&sdev->root, 0,
> +                                            &sdev->bypass_mr, 0);
> +
> +        virtio_iommu_switch_address_space(sdev);
>          g_free(name);
>      }
>      return &sdev->as;
> @@ -342,6 +442,7 @@ static int virtio_iommu_attach(VirtIOIOMMU *s,
>      uint32_t flags = le32_to_cpu(req->flags);
>      VirtIOIOMMUDomain *domain;
>      VirtIOIOMMUEndpoint *ep;
> +    IOMMUDevice *sdev;
>  
>      trace_virtio_iommu_attach(domain_id, ep_id);
>  
> @@ -375,6 +476,8 @@ static int virtio_iommu_attach(VirtIOIOMMU *s,
>      QLIST_INSERT_HEAD(&domain->endpoint_list, ep, next);
>  
>      ep->domain = domain;
> +    sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr);
> +    virtio_iommu_switch_address_space(sdev);
>  
>      /* Replay domain mappings on the associated memory region */
>      g_tree_foreach(domain->mappings, virtio_iommu_notify_map_cb,
> @@ -887,6 +990,7 @@ static void virtio_iommu_set_config(VirtIODevice *vdev,
>              return;
>          }
>          dev_config->bypass = in_config->bypass;
> +        virtio_iommu_switch_address_space_all(dev);
>      }
>  
>      trace_virtio_iommu_set_config(in_config->bypass);
> @@ -1026,6 +1130,8 @@ static void virtio_iommu_system_reset(void *opaque)
>       * system reset
>       */
>      s->config.bypass = s->boot_bypass;
> +    virtio_iommu_switch_address_space_all(s);
> +
>  }
>  
>  static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
> @@ -1041,6 +1147,11 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
>                               virtio_iommu_handle_command);
>      s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, NULL);
>  
> +    /*
> +     * config.bypass is needed to get initial address space early, such as
> +     * in vfio realize
> +     */
I don't understand this comment, please can you elaborate?
> +    s->config.bypass = s->boot_bypass;
>      s->config.page_size_mask = TARGET_PAGE_MASK;
>      s->config.input_range.end = UINT64_MAX;
>      s->config.domain_range.end = UINT32_MAX;
> diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h
> index 84391f844826..102eeefa731d 100644
> --- a/include/hw/virtio/virtio-iommu.h
> +++ b/include/hw/virtio/virtio-iommu.h
> @@ -37,6 +37,8 @@ typedef struct IOMMUDevice {
>      int           devfn;
>      IOMMUMemoryRegion  iommu_mr;
>      AddressSpace  as;
> +    MemoryRegion root;          /* The root container of the device */
> +    MemoryRegion bypass_mr;     /* The alias of shared memory MR */
>  } IOMMUDevice;
>  
>  typedef struct IOMMUPciBus {
Did you test migration? I wonder if we shouldn't call

virtio_iommu_switch_address_space_all(s)
after restoring the endpoints in iommu_post_load() as it is done in intel-iommu

Thanks

Eric
Duan, Zhenzhong June 24, 2022, 8:28 a.m. UTC | #2
>-----Original Message-----
>From: Eric Auger <eric.auger@redhat.com>
>Sent: Friday, June 24, 2022 12:52 AM
>To: Duan, Zhenzhong <zhenzhong.duan@intel.com>
>Cc: qemu-devel@nongnu.org; mst@redhat.com; jean-philippe@linaro.org;
>pbonzini@redhat.com; Zhang, Yu C <yu.c.zhang@intel.com>; Dong,
>Chuanxiao <chuanxiao.dong@intel.com>; Zhang, Tina
><tina.zhang@intel.com>
>Subject: Re: [PATCH 1/3] virtio-iommu: Add bypass mode support to
>assigned device
>
>Hi,
>On 6/13/22 08:10, Zhenzhong Duan wrote:
>> Currently assigned devices can not work in virtio-iommu bypass mode.
>> Guest driver fails to probe the device due to DMA failure. And the
>> reason is because of lacking GPA -> HPA mappings when VM is created.
>>
>> Add a root container memory region to hold both bypass memory region
>> and iommu memory region, so the switch between them is supported just
>> like the implementation in virtual VT-d.
>>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>>  hw/virtio/trace-events           |   1 +
>>  hw/virtio/virtio-iommu.c         | 115 ++++++++++++++++++++++++++++++-
>>  include/hw/virtio/virtio-iommu.h |   2 +
>>  3 files changed, 116 insertions(+), 2 deletions(-)
>>
>> diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index
>> ab8e095b73fa..20af2e7ebd78 100644
>> --- a/hw/virtio/trace-events
>> +++ b/hw/virtio/trace-events
>> @@ -124,6 +124,7 @@ virtio_iommu_remap(const char *name, uint64_t
>> virt_start, uint64_t virt_end, uin
>> virtio_iommu_set_page_size_mask(const char *name, uint64_t old,
>uint64_t new) "mr=%s old_mask=0x%"PRIx64" new_mask=0x%"PRIx64
>virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s"
>>  virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s"
>> +virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn,
>bool on) "Device %02x:%02x.%x switching address space (iommu
>enabled=%d)"
>>
>>  # virtio-mem.c
>>  virtio_mem_send_response(uint16_t type) "type=%" PRIu16 diff --git
>> a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index
>> 2597e166f939..ff718107ee02 100644
>> --- a/hw/virtio/virtio-iommu.c
>> +++ b/hw/virtio/virtio-iommu.c
>> @@ -69,6 +69,77 @@ static inline uint16_t
>virtio_iommu_get_bdf(IOMMUDevice *dev)
>>      return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);  }
>>
>> +static bool virtio_iommu_device_bypassed(IOMMUDevice *sdev) {
>> +    uint32_t sid;
>> +    bool bypassed;
>> +    VirtIOIOMMU *s = sdev->viommu;
>> +    VirtIOIOMMUEndpoint *ep;
>> +
>> +    sid = virtio_iommu_get_bdf(sdev);
>> +
>> +    qemu_mutex_lock(&s->mutex);
>> +    /* need to check bypass before system reset */
>> +    if (!s->endpoints) {
>> +        bypassed = s->config.bypass;
>> +        goto unlock;
>> +    }
>> +
>> +    ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
>> +    if (!ep || !ep->domain) {
>> +        bypassed = s->config.bypass;
>> +    } else {
>> +        bypassed = ep->domain->bypass;
>> +    }
>> +
>> +unlock:
>> +    qemu_mutex_unlock(&s->mutex);
>> +    return bypassed;
>> +}
>> +
>> +/* Return whether the device is using IOMMU translation. */ static
>> +bool virtio_iommu_switch_address_space(IOMMUDevice *sdev) {
>> +    bool use_remapping;
>> +
>> +    assert(sdev);
>> +
>> +    use_remapping = !virtio_iommu_device_bypassed(sdev);
>> +
>> +    trace_virtio_iommu_switch_address_space(pci_bus_num(sdev->bus),
>> +                                            PCI_SLOT(sdev->devfn),
>> +                                            PCI_FUNC(sdev->devfn),
>> +                                            use_remapping);
>> +
>> +    /* Turn off first then on the other */
>> +    if (use_remapping) {
>> +        memory_region_set_enabled(&sdev->bypass_mr, false);
>> +        memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr),
>true);
>> +    } else {
>> +        memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr),
>false);
>> +        memory_region_set_enabled(&sdev->bypass_mr, true);
>> +    }
>> +
>> +    return use_remapping;
>> +}
>> +
>> +static void virtio_iommu_switch_address_space_all(VirtIOIOMMU *s) {
>> +    GHashTableIter iter;
>> +    IOMMUPciBus *iommu_pci_bus;
>> +    int i;
>> +
>> +    g_hash_table_iter_init(&iter, s->as_by_busptr);
>> +    while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus))
>{
>> +        for (i = 0; i < PCI_DEVFN_MAX; i++) {
>> +            if (!iommu_pci_bus->pbdev[i]) {
>> +                continue;
>> +            }
>> +            virtio_iommu_switch_address_space(iommu_pci_bus->pbdev[i]);
>> +        }
>> +    }
>> +}
>> +
>>  /**
>>   * The bus number is used for lookup when SID based operations occur.
>>   * In that case we lazily populate the IOMMUPciBus array from the bus
>> hash @@ -213,6 +284,7 @@ static gboolean
>> virtio_iommu_notify_map_cb(gpointer key, gpointer value,  static void
>> virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep)
>{
>>      VirtIOIOMMUDomain *domain = ep->domain;
>> +    IOMMUDevice *sdev = container_of(ep->iommu_mr, IOMMUDevice,
>> + iommu_mr);
>>
>>      if (!ep->domain) {
>>          return;
>> @@ -221,6 +293,7 @@ static void
>virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep)
>>                     ep->iommu_mr);
>>      QLIST_REMOVE(ep, next);
>>      ep->domain = NULL;
>> +    virtio_iommu_switch_address_space(sdev);
>>  }
>>
>>  static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU
>*s,
>> @@ -323,12 +396,39 @@ static AddressSpace
>> *virtio_iommu_find_add_as(PCIBus *bus, void *opaque,
>>
>>          trace_virtio_iommu_init_iommu_mr(name);
>>
>> +        memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX);
>> +        address_space_init(&sdev->as, &sdev->root,
>> + TYPE_VIRTIO_IOMMU);
>> +
>> +        /*
>> +         * Build the IOMMU disabled container with aliases to the
>> +         * shared MRs.  Note that aliasing to a shared memory region
>What do you call the 'disabled container' and the shared MRs?

The 'IOMMU disabled container' refers to sdev->bypass_mr.
The shared MR is get_system_memory(), a.k.a. system_memory.
Every endpoint's sdev->bypass_mr is an alias of system_memory, which is
why sdev->bypass_mr is called 'IOMMU disabled' and system_memory the shared MR.
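
In code terms (this is the exact call from the hunk below), every
device's bypass MR is simply an alias of the one system memory MR, so
the memory API can fold all bypassed devices onto the same FlatView:

    memory_region_init_alias(&sdev->bypass_mr, OBJECT(s),
                             "system", get_system_memory(), 0,
                             memory_region_size(get_system_memory()));
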
>
>> +         * could help the memory API to detect same FlatViews so we
>> +         * can have devices to share the same FlatView when in bypass
>> +         * mode. (either by not configuring virtio-iommu driver or with
>> +         * "iommu=pt").  It will greatly reduce the total number of
>> +         * FlatViews of the system hence VM runs faster.
>
>Do you mean that we could have used a shared bypass MR instead of
>creatingone per device?

I think we still need to create an iommu MR per device, which is enabled
when the endpoint is attached to a DMA domain.
The per-device bypass MR (an alias of the shared system MR) is enabled when:
1. the endpoint is not attached to any domain yet and virtio-iommu.boot_bypass=true
2. the endpoint is attached to a bypass domain
(the resulting decision logic is sketched just below)
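
For reference, the decision made in the patch's
virtio_iommu_device_bypassed() boils down to the following (condensed
from the hunk above; locking and the pre-reset special case omitted):

    ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
    if (!ep || !ep->domain) {
        bypassed = s->config.bypass;    /* global bypass, boot_bypass after reset */
    } else {
        bypassed = ep->domain->bypass;  /* per-domain bypass requested at ATTACH  */
    }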

>> +         */
>> +        memory_region_init_alias(&sdev->bypass_mr, OBJECT(s),
>> +                                 "system", get_system_memory(), 0,
>> +
>> + memory_region_size(get_system_memory()));
>> +
>>          memory_region_init_iommu(&sdev->iommu_mr, sizeof(sdev-
>>iommu_mr),
>>                                   TYPE_VIRTIO_IOMMU_MEMORY_REGION,
>>                                   OBJECT(s), name,
>>                                   UINT64_MAX);
>> -        address_space_init(&sdev->as,
>> -                           MEMORY_REGION(&sdev->iommu_mr),
>TYPE_VIRTIO_IOMMU);
>> +
>> +        /*
>> +         * Hook both the containers under the root container, we
>did you mean "hook both sub-regions to the root container"?

Yes

>> +         * switch between iommu & bypass MRs by enable/disable
>> +         * corresponding sub-containers
>> +         */
>> +        memory_region_add_subregion_overlap(&sdev->root, 0,
>> +                                            MEMORY_REGION(&sdev->iommu_mr),
>> +                                            0);
>> +        memory_region_add_subregion_overlap(&sdev->root, 0,
>> +                                            &sdev->bypass_mr, 0);
>> +
>> +        virtio_iommu_switch_address_space(sdev);
>>          g_free(name);
>>      }
>>      return &sdev->as;
>> @@ -342,6 +442,7 @@ static int virtio_iommu_attach(VirtIOIOMMU *s,
>>      uint32_t flags = le32_to_cpu(req->flags);
>>      VirtIOIOMMUDomain *domain;
>>      VirtIOIOMMUEndpoint *ep;
>> +    IOMMUDevice *sdev;
>>
>>      trace_virtio_iommu_attach(domain_id, ep_id);
>>
>> @@ -375,6 +476,8 @@ static int virtio_iommu_attach(VirtIOIOMMU *s,
>>      QLIST_INSERT_HEAD(&domain->endpoint_list, ep, next);
>>
>>      ep->domain = domain;
>> +    sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr);
>> +    virtio_iommu_switch_address_space(sdev);
>>
>>      /* Replay domain mappings on the associated memory region */
>>      g_tree_foreach(domain->mappings, virtio_iommu_notify_map_cb, @@
>> -887,6 +990,7 @@ static void virtio_iommu_set_config(VirtIODevice *vdev,
>>              return;
>>          }
>>          dev_config->bypass = in_config->bypass;
>> +        virtio_iommu_switch_address_space_all(dev);
>>      }
>>
>>      trace_virtio_iommu_set_config(in_config->bypass);
>> @@ -1026,6 +1130,8 @@ static void virtio_iommu_system_reset(void
>*opaque)
>>       * system reset
>>       */
>>      s->config.bypass = s->boot_bypass;
>> +    virtio_iommu_switch_address_space_all(s);
>> +
>>  }
>>
>>  static void virtio_iommu_device_realize(DeviceState *dev, Error
>> **errp) @@ -1041,6 +1147,11 @@ static void
>virtio_iommu_device_realize(DeviceState *dev, Error **errp)
>>                               virtio_iommu_handle_command);
>>      s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE,
>> NULL);
>>
>> +    /*
>> +     * config.bypass is needed to get initial address space early, such as
>> +     * in vfio realize
>> +     */
>I don't understand this comment, please can you elaborate?

In vfio_realize(), virtio_iommu_find_add_as() is called to get the address space
from virtio-iommu. virtio_iommu_find_add_as() calls virtio_iommu_switch_address_space(),
which needs to check config.bypass to enable the right MR, either the shared
bypass MR or the iommu MR.

Alternatively, it would be OK to use s->boot_bypass directly in
virtio_iommu_device_bypassed(), in which case this change could be removed.
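
To illustrate, the call chain at realize time looks roughly like this
(paraphrased, not an exact quote of hw/vfio/pci.c; it runs before the
guest has written the virtio-iommu config space):

    vfio_realize(pdev)
      -> pci_device_iommu_address_space(pdev)
           -> virtio_iommu_find_add_as()           /* registered via pci_setup_iommu() */
                -> virtio_iommu_switch_address_space(sdev)
                     -> virtio_iommu_device_bypassed(sdev)
                          -> reads s->config.bypass

so config.bypass must already hold the boot_bypass value at that point.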

>> +    s->config.bypass = s->boot_bypass;
>>      s->config.page_size_mask = TARGET_PAGE_MASK;
>>      s->config.input_range.end = UINT64_MAX;
>>      s->config.domain_range.end = UINT32_MAX; diff --git
>> a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h
>> index 84391f844826..102eeefa731d 100644
>> --- a/include/hw/virtio/virtio-iommu.h
>> +++ b/include/hw/virtio/virtio-iommu.h
>> @@ -37,6 +37,8 @@ typedef struct IOMMUDevice {
>>      int           devfn;
>>      IOMMUMemoryRegion  iommu_mr;
>>      AddressSpace  as;
>> +    MemoryRegion root;          /* The root container of the device */
>> +    MemoryRegion bypass_mr;     /* The alias of shared memory MR */
>>  } IOMMUDevice;
>>
>>  typedef struct IOMMUPciBus {
>Did you test migration? I wonder if we shouldn't call
>
>virtio_iommu_switch_address_space_all(s)
>after restoring the endpoints in iommu_post_load() as it is done in intel-
>iommu

Thanks for pointing that out, I had missed the migration scenario. With your
suggested change, migration works without a pass-through device. With a
pass-through device I got "VFIO device doesn't support migration"; it is not
clear to me whether pass-through device migration is fully supported yet.

Zhenzhong
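
For reference, Eric's suggestion amounts to something like the following
in iommu_post_load() (a sketch only, assuming the existing
reconstruct_endpoints() helper; not part of the posted series):

    static int iommu_post_load(void *opaque, int version_id)
    {
        VirtIOIOMMU *s = opaque;

        g_tree_foreach(s->domains, reconstruct_endpoints, s);
        /*
         * The restored endpoints/domains may change the bypass decision
         * per device, so re-evaluate every address space, as intel-iommu
         * does after migration.
         */
        virtio_iommu_switch_address_space_all(s);

        return 0;
    }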

Patch

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index ab8e095b73fa..20af2e7ebd78 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -124,6 +124,7 @@  virtio_iommu_remap(const char *name, uint64_t virt_start, uint64_t virt_end, uin
 virtio_iommu_set_page_size_mask(const char *name, uint64_t old, uint64_t new) "mr=%s old_mask=0x%"PRIx64" new_mask=0x%"PRIx64
 virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s"
 virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s"
+virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
 
 # virtio-mem.c
 virtio_mem_send_response(uint16_t type) "type=%" PRIu16
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 2597e166f939..ff718107ee02 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -69,6 +69,77 @@  static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev)
     return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);
 }
 
+static bool virtio_iommu_device_bypassed(IOMMUDevice *sdev)
+{
+    uint32_t sid;
+    bool bypassed;
+    VirtIOIOMMU *s = sdev->viommu;
+    VirtIOIOMMUEndpoint *ep;
+
+    sid = virtio_iommu_get_bdf(sdev);
+
+    qemu_mutex_lock(&s->mutex);
+    /* need to check bypass before system reset */
+    if (!s->endpoints) {
+        bypassed = s->config.bypass;
+        goto unlock;
+    }
+
+    ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
+    if (!ep || !ep->domain) {
+        bypassed = s->config.bypass;
+    } else {
+        bypassed = ep->domain->bypass;
+    }
+
+unlock:
+    qemu_mutex_unlock(&s->mutex);
+    return bypassed;
+}
+
+/* Return whether the device is using IOMMU translation. */
+static bool virtio_iommu_switch_address_space(IOMMUDevice *sdev)
+{
+    bool use_remapping;
+
+    assert(sdev);
+
+    use_remapping = !virtio_iommu_device_bypassed(sdev);
+
+    trace_virtio_iommu_switch_address_space(pci_bus_num(sdev->bus),
+                                            PCI_SLOT(sdev->devfn),
+                                            PCI_FUNC(sdev->devfn),
+                                            use_remapping);
+
+    /* Turn off first then on the other */
+    if (use_remapping) {
+        memory_region_set_enabled(&sdev->bypass_mr, false);
+        memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), true);
+    } else {
+        memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), false);
+        memory_region_set_enabled(&sdev->bypass_mr, true);
+    }
+
+    return use_remapping;
+}
+
+static void virtio_iommu_switch_address_space_all(VirtIOIOMMU *s)
+{
+    GHashTableIter iter;
+    IOMMUPciBus *iommu_pci_bus;
+    int i;
+
+    g_hash_table_iter_init(&iter, s->as_by_busptr);
+    while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) {
+        for (i = 0; i < PCI_DEVFN_MAX; i++) {
+            if (!iommu_pci_bus->pbdev[i]) {
+                continue;
+            }
+            virtio_iommu_switch_address_space(iommu_pci_bus->pbdev[i]);
+        }
+    }
+}
+
 /**
  * The bus number is used for lookup when SID based operations occur.
  * In that case we lazily populate the IOMMUPciBus array from the bus hash
@@ -213,6 +284,7 @@  static gboolean virtio_iommu_notify_map_cb(gpointer key, gpointer value,
 static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep)
 {
     VirtIOIOMMUDomain *domain = ep->domain;
+    IOMMUDevice *sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr);
 
     if (!ep->domain) {
         return;
@@ -221,6 +293,7 @@  static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep)
                    ep->iommu_mr);
     QLIST_REMOVE(ep, next);
     ep->domain = NULL;
+    virtio_iommu_switch_address_space(sdev);
 }
 
 static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s,
@@ -323,12 +396,39 @@  static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque,
 
         trace_virtio_iommu_init_iommu_mr(name);
 
+        memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX);
+        address_space_init(&sdev->as, &sdev->root, TYPE_VIRTIO_IOMMU);
+
+        /*
+         * Build the IOMMU disabled container with aliases to the
+         * shared MRs.  Note that aliasing to a shared memory region
+         * could help the memory API to detect same FlatViews so we
+         * can have devices to share the same FlatView when in bypass
+         * mode. (either by not configuring virtio-iommu driver or with
+         * "iommu=pt").  It will greatly reduce the total number of
+         * FlatViews of the system hence VM runs faster.
+         */
+        memory_region_init_alias(&sdev->bypass_mr, OBJECT(s),
+                                 "system", get_system_memory(), 0,
+                                 memory_region_size(get_system_memory()));
+
         memory_region_init_iommu(&sdev->iommu_mr, sizeof(sdev->iommu_mr),
                                  TYPE_VIRTIO_IOMMU_MEMORY_REGION,
                                  OBJECT(s), name,
                                  UINT64_MAX);
-        address_space_init(&sdev->as,
-                           MEMORY_REGION(&sdev->iommu_mr), TYPE_VIRTIO_IOMMU);
+
+        /*
+         * Hook both the containers under the root container, we
+         * switch between iommu & bypass MRs by enable/disable
+         * corresponding sub-containers
+         */
+        memory_region_add_subregion_overlap(&sdev->root, 0,
+                                            MEMORY_REGION(&sdev->iommu_mr),
+                                            0);
+        memory_region_add_subregion_overlap(&sdev->root, 0,
+                                            &sdev->bypass_mr, 0);
+
+        virtio_iommu_switch_address_space(sdev);
         g_free(name);
     }
     return &sdev->as;
@@ -342,6 +442,7 @@  static int virtio_iommu_attach(VirtIOIOMMU *s,
     uint32_t flags = le32_to_cpu(req->flags);
     VirtIOIOMMUDomain *domain;
     VirtIOIOMMUEndpoint *ep;
+    IOMMUDevice *sdev;
 
     trace_virtio_iommu_attach(domain_id, ep_id);
 
@@ -375,6 +476,8 @@  static int virtio_iommu_attach(VirtIOIOMMU *s,
     QLIST_INSERT_HEAD(&domain->endpoint_list, ep, next);
 
     ep->domain = domain;
+    sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr);
+    virtio_iommu_switch_address_space(sdev);
 
     /* Replay domain mappings on the associated memory region */
     g_tree_foreach(domain->mappings, virtio_iommu_notify_map_cb,
@@ -887,6 +990,7 @@  static void virtio_iommu_set_config(VirtIODevice *vdev,
             return;
         }
         dev_config->bypass = in_config->bypass;
+        virtio_iommu_switch_address_space_all(dev);
     }
 
     trace_virtio_iommu_set_config(in_config->bypass);
@@ -1026,6 +1130,8 @@  static void virtio_iommu_system_reset(void *opaque)
      * system reset
      */
     s->config.bypass = s->boot_bypass;
+    virtio_iommu_switch_address_space_all(s);
+
 }
 
 static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
@@ -1041,6 +1147,11 @@  static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
                              virtio_iommu_handle_command);
     s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, NULL);
 
+    /*
+     * config.bypass is needed to get initial address space early, such as
+     * in vfio realize
+     */
+    s->config.bypass = s->boot_bypass;
     s->config.page_size_mask = TARGET_PAGE_MASK;
     s->config.input_range.end = UINT64_MAX;
     s->config.domain_range.end = UINT32_MAX;
diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h
index 84391f844826..102eeefa731d 100644
--- a/include/hw/virtio/virtio-iommu.h
+++ b/include/hw/virtio/virtio-iommu.h
@@ -37,6 +37,8 @@  typedef struct IOMMUDevice {
     int           devfn;
     IOMMUMemoryRegion  iommu_mr;
     AddressSpace  as;
+    MemoryRegion root;          /* The root container of the device */
+    MemoryRegion bypass_mr;     /* The alias of shared memory MR */
 } IOMMUDevice;
 
 typedef struct IOMMUPciBus {