Message ID | 20220613061010.2674054-2-zhenzhong.duan@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add bypass mode support to assigned device | expand |
Hi, On 6/13/22 08:10, Zhenzhong Duan wrote: > Currently assigned devices can not work in virtio-iommu bypass mode. > Guest driver fails to probe the device due to DMA failure. And the > reason is because of lacking GPA -> HPA mappings when VM is created. > > Add a root container memory region to hold both bypass memory region > and iommu memory region, so the switch between them is supported > just like the implementation in virtual VT-d. > > Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> > --- > hw/virtio/trace-events | 1 + > hw/virtio/virtio-iommu.c | 115 ++++++++++++++++++++++++++++++- > include/hw/virtio/virtio-iommu.h | 2 + > 3 files changed, 116 insertions(+), 2 deletions(-) > > diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events > index ab8e095b73fa..20af2e7ebd78 100644 > --- a/hw/virtio/trace-events > +++ b/hw/virtio/trace-events > @@ -124,6 +124,7 @@ virtio_iommu_remap(const char *name, uint64_t virt_start, uint64_t virt_end, uin > virtio_iommu_set_page_size_mask(const char *name, uint64_t old, uint64_t new) "mr=%s old_mask=0x%"PRIx64" new_mask=0x%"PRIx64 > virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s" > virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s" > +virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" > > # virtio-mem.c > virtio_mem_send_response(uint16_t type) "type=%" PRIu16 > diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c > index 2597e166f939..ff718107ee02 100644 > --- a/hw/virtio/virtio-iommu.c > +++ b/hw/virtio/virtio-iommu.c > @@ -69,6 +69,77 @@ static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev) > return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn); > } > > +static bool virtio_iommu_device_bypassed(IOMMUDevice *sdev) > +{ > + uint32_t sid; > + bool bypassed; > + VirtIOIOMMU *s = sdev->viommu; > + VirtIOIOMMUEndpoint *ep; > + > + sid = virtio_iommu_get_bdf(sdev); > + > + qemu_mutex_lock(&s->mutex); > + /* need to check bypass before system reset */ > + if (!s->endpoints) { > + bypassed = s->config.bypass; > + goto unlock; > + } > + > + ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid)); > + if (!ep || !ep->domain) { > + bypassed = s->config.bypass; > + } else { > + bypassed = ep->domain->bypass; > + } > + > +unlock: > + qemu_mutex_unlock(&s->mutex); > + return bypassed; > +} > + > +/* Return whether the device is using IOMMU translation. */ > +static bool virtio_iommu_switch_address_space(IOMMUDevice *sdev) > +{ > + bool use_remapping; > + > + assert(sdev); > + > + use_remapping = !virtio_iommu_device_bypassed(sdev); > + > + trace_virtio_iommu_switch_address_space(pci_bus_num(sdev->bus), > + PCI_SLOT(sdev->devfn), > + PCI_FUNC(sdev->devfn), > + use_remapping); > + > + /* Turn off first then on the other */ > + if (use_remapping) { > + memory_region_set_enabled(&sdev->bypass_mr, false); > + memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), true); > + } else { > + memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), false); > + memory_region_set_enabled(&sdev->bypass_mr, true); > + } > + > + return use_remapping; > +} > + > +static void virtio_iommu_switch_address_space_all(VirtIOIOMMU *s) > +{ > + GHashTableIter iter; > + IOMMUPciBus *iommu_pci_bus; > + int i; > + > + g_hash_table_iter_init(&iter, s->as_by_busptr); > + while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) { > + for (i = 0; i < PCI_DEVFN_MAX; i++) { > + if (!iommu_pci_bus->pbdev[i]) { > + continue; > + } > + virtio_iommu_switch_address_space(iommu_pci_bus->pbdev[i]); > + } > + } > +} > + > /** > * The bus number is used for lookup when SID based operations occur. > * In that case we lazily populate the IOMMUPciBus array from the bus hash > @@ -213,6 +284,7 @@ static gboolean virtio_iommu_notify_map_cb(gpointer key, gpointer value, > static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep) > { > VirtIOIOMMUDomain *domain = ep->domain; > + IOMMUDevice *sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr); > > if (!ep->domain) { > return; > @@ -221,6 +293,7 @@ static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep) > ep->iommu_mr); > QLIST_REMOVE(ep, next); > ep->domain = NULL; > + virtio_iommu_switch_address_space(sdev); > } > > static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s, > @@ -323,12 +396,39 @@ static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, > > trace_virtio_iommu_init_iommu_mr(name); > > + memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX); > + address_space_init(&sdev->as, &sdev->root, TYPE_VIRTIO_IOMMU); > + > + /* > + * Build the IOMMU disabled container with aliases to the > + * shared MRs. Note that aliasing to a shared memory region What do you call the 'disabled container' and the shared MRs? > + * could help the memory API to detect same FlatViews so we > + * can have devices to share the same FlatView when in bypass > + * mode. (either by not configuring virtio-iommu driver or with > + * "iommu=pt"). It will greatly reduce the total number of > + * FlatViews of the system hence VM runs faster. Do you mean that we could have used a shared bypass MR instead of creatingone per device? > + */ > + memory_region_init_alias(&sdev->bypass_mr, OBJECT(s), > + "system", get_system_memory(), 0, > + memory_region_size(get_system_memory())); > + > memory_region_init_iommu(&sdev->iommu_mr, sizeof(sdev->iommu_mr), > TYPE_VIRTIO_IOMMU_MEMORY_REGION, > OBJECT(s), name, > UINT64_MAX); > - address_space_init(&sdev->as, > - MEMORY_REGION(&sdev->iommu_mr), TYPE_VIRTIO_IOMMU); > + > + /* > + * Hook both the containers under the root container, we did you mean "hook both sub-regions to the root container"? > + * switch between iommu & bypass MRs by enable/disable > + * corresponding sub-containers > + */ > + memory_region_add_subregion_overlap(&sdev->root, 0, > + MEMORY_REGION(&sdev->iommu_mr), > + 0); > + memory_region_add_subregion_overlap(&sdev->root, 0, > + &sdev->bypass_mr, 0); > + > + virtio_iommu_switch_address_space(sdev); > g_free(name); > } > return &sdev->as; > @@ -342,6 +442,7 @@ static int virtio_iommu_attach(VirtIOIOMMU *s, > uint32_t flags = le32_to_cpu(req->flags); > VirtIOIOMMUDomain *domain; > VirtIOIOMMUEndpoint *ep; > + IOMMUDevice *sdev; > > trace_virtio_iommu_attach(domain_id, ep_id); > > @@ -375,6 +476,8 @@ static int virtio_iommu_attach(VirtIOIOMMU *s, > QLIST_INSERT_HEAD(&domain->endpoint_list, ep, next); > > ep->domain = domain; > + sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr); > + virtio_iommu_switch_address_space(sdev); > > /* Replay domain mappings on the associated memory region */ > g_tree_foreach(domain->mappings, virtio_iommu_notify_map_cb, > @@ -887,6 +990,7 @@ static void virtio_iommu_set_config(VirtIODevice *vdev, > return; > } > dev_config->bypass = in_config->bypass; > + virtio_iommu_switch_address_space_all(dev); > } > > trace_virtio_iommu_set_config(in_config->bypass); > @@ -1026,6 +1130,8 @@ static void virtio_iommu_system_reset(void *opaque) > * system reset > */ > s->config.bypass = s->boot_bypass; > + virtio_iommu_switch_address_space_all(s); > + > } > > static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) > @@ -1041,6 +1147,11 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) > virtio_iommu_handle_command); > s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, NULL); > > + /* > + * config.bypass is needed to get initial address space early, such as > + * in vfio realize > + */ I don't understand this comment, please can you elaborate? > + s->config.bypass = s->boot_bypass; > s->config.page_size_mask = TARGET_PAGE_MASK; > s->config.input_range.end = UINT64_MAX; > s->config.domain_range.end = UINT32_MAX; > diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h > index 84391f844826..102eeefa731d 100644 > --- a/include/hw/virtio/virtio-iommu.h > +++ b/include/hw/virtio/virtio-iommu.h > @@ -37,6 +37,8 @@ typedef struct IOMMUDevice { > int devfn; > IOMMUMemoryRegion iommu_mr; > AddressSpace as; > + MemoryRegion root; /* The root container of the device */ > + MemoryRegion bypass_mr; /* The alias of shared memory MR */ > } IOMMUDevice; > > typedef struct IOMMUPciBus { Did you test migration? I wonder if we shouldn't call virtio_iommu_switch_address_space_all(s) after restoring the endpoints in iommu_post_load() as it is done in intel-iommu Thanks Eric
>-----Original Message----- >From: Eric Auger <eric.auger@redhat.com> >Sent: Friday, June 24, 2022 12:52 AM >To: Duan, Zhenzhong <zhenzhong.duan@intel.com> >Cc: qemu-devel@nongnu.org; mst@redhat.com; jean-philippe@linaro.org; >pbonzini@redhat.com; Zhang, Yu C <yu.c.zhang@intel.com>; Dong, >Chuanxiao <chuanxiao.dong@intel.com>; Zhang, Tina ><tina.zhang@intel.com> >Subject: Re: [PATCH 1/3] virtio-iommu: Add bypass mode support to >assigned device > >Hi, >On 6/13/22 08:10, Zhenzhong Duan wrote: >> Currently assigned devices can not work in virtio-iommu bypass mode. >> Guest driver fails to probe the device due to DMA failure. And the >> reason is because of lacking GPA -> HPA mappings when VM is created. >> >> Add a root container memory region to hold both bypass memory region >> and iommu memory region, so the switch between them is supported just >> like the implementation in virtual VT-d. >> >> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> >> --- >> hw/virtio/trace-events | 1 + >> hw/virtio/virtio-iommu.c | 115 ++++++++++++++++++++++++++++++- >> include/hw/virtio/virtio-iommu.h | 2 + >> 3 files changed, 116 insertions(+), 2 deletions(-) >> >> diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index >> ab8e095b73fa..20af2e7ebd78 100644 >> --- a/hw/virtio/trace-events >> +++ b/hw/virtio/trace-events >> @@ -124,6 +124,7 @@ virtio_iommu_remap(const char *name, uint64_t >> virt_start, uint64_t virt_end, uin >> virtio_iommu_set_page_size_mask(const char *name, uint64_t old, >uint64_t new) "mr=%s old_mask=0x%"PRIx64" new_mask=0x%"PRIx64 >virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s" >> virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s" >> +virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, >bool on) "Device %02x:%02x.%x switching address space (iommu >enabled=%d)" >> >> # virtio-mem.c >> virtio_mem_send_response(uint16_t type) "type=%" PRIu16 diff --git >> a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index >> 2597e166f939..ff718107ee02 100644 >> --- a/hw/virtio/virtio-iommu.c >> +++ b/hw/virtio/virtio-iommu.c >> @@ -69,6 +69,77 @@ static inline uint16_t >virtio_iommu_get_bdf(IOMMUDevice *dev) >> return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn); } >> >> +static bool virtio_iommu_device_bypassed(IOMMUDevice *sdev) { >> + uint32_t sid; >> + bool bypassed; >> + VirtIOIOMMU *s = sdev->viommu; >> + VirtIOIOMMUEndpoint *ep; >> + >> + sid = virtio_iommu_get_bdf(sdev); >> + >> + qemu_mutex_lock(&s->mutex); >> + /* need to check bypass before system reset */ >> + if (!s->endpoints) { >> + bypassed = s->config.bypass; >> + goto unlock; >> + } >> + >> + ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid)); >> + if (!ep || !ep->domain) { >> + bypassed = s->config.bypass; >> + } else { >> + bypassed = ep->domain->bypass; >> + } >> + >> +unlock: >> + qemu_mutex_unlock(&s->mutex); >> + return bypassed; >> +} >> + >> +/* Return whether the device is using IOMMU translation. */ static >> +bool virtio_iommu_switch_address_space(IOMMUDevice *sdev) { >> + bool use_remapping; >> + >> + assert(sdev); >> + >> + use_remapping = !virtio_iommu_device_bypassed(sdev); >> + >> + trace_virtio_iommu_switch_address_space(pci_bus_num(sdev->bus), >> + PCI_SLOT(sdev->devfn), >> + PCI_FUNC(sdev->devfn), >> + use_remapping); >> + >> + /* Turn off first then on the other */ >> + if (use_remapping) { >> + memory_region_set_enabled(&sdev->bypass_mr, false); >> + memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), >true); >> + } else { >> + memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), >false); >> + memory_region_set_enabled(&sdev->bypass_mr, true); >> + } >> + >> + return use_remapping; >> +} >> + >> +static void virtio_iommu_switch_address_space_all(VirtIOIOMMU *s) { >> + GHashTableIter iter; >> + IOMMUPciBus *iommu_pci_bus; >> + int i; >> + >> + g_hash_table_iter_init(&iter, s->as_by_busptr); >> + while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) >{ >> + for (i = 0; i < PCI_DEVFN_MAX; i++) { >> + if (!iommu_pci_bus->pbdev[i]) { >> + continue; >> + } >> + virtio_iommu_switch_address_space(iommu_pci_bus->pbdev[i]); >> + } >> + } >> +} >> + >> /** >> * The bus number is used for lookup when SID based operations occur. >> * In that case we lazily populate the IOMMUPciBus array from the bus >> hash @@ -213,6 +284,7 @@ static gboolean >> virtio_iommu_notify_map_cb(gpointer key, gpointer value, static void >> virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep) >{ >> VirtIOIOMMUDomain *domain = ep->domain; >> + IOMMUDevice *sdev = container_of(ep->iommu_mr, IOMMUDevice, >> + iommu_mr); >> >> if (!ep->domain) { >> return; >> @@ -221,6 +293,7 @@ static void >virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep) >> ep->iommu_mr); >> QLIST_REMOVE(ep, next); >> ep->domain = NULL; >> + virtio_iommu_switch_address_space(sdev); >> } >> >> static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU >*s, >> @@ -323,12 +396,39 @@ static AddressSpace >> *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, >> >> trace_virtio_iommu_init_iommu_mr(name); >> >> + memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX); >> + address_space_init(&sdev->as, &sdev->root, >> + TYPE_VIRTIO_IOMMU); >> + >> + /* >> + * Build the IOMMU disabled container with aliases to the >> + * shared MRs. Note that aliasing to a shared memory region >What do you call the 'disabled container' and the shared MRs? ' IOMMU disabled container' means sdev->bypass_mr. Shared MRs means get_system_memory(), aka system_memory. All endpoint's sdev->bypass_mr alias to system_memory, so saying sdev->bypass_mr 'IOMMU disabled' and system_memory shared MRs. > >> + * could help the memory API to detect same FlatViews so we >> + * can have devices to share the same FlatView when in bypass >> + * mode. (either by not configuring virtio-iommu driver or with >> + * "iommu=pt"). It will greatly reduce the total number of >> + * FlatViews of the system hence VM runs faster. > >Do you mean that we could have used a shared bypass MR instead of >creatingone per device? I think we still need to create iommu MR per device which is enabled when endpoint attached to dma domain. Shared bypass MR is enabled when: 1. endpoint not attached to any domain yet and virtio-iommu.boot_bypass=true 2. endpoint is attached to bypass domain >> + */ >> + memory_region_init_alias(&sdev->bypass_mr, OBJECT(s), >> + "system", get_system_memory(), 0, >> + >> + memory_region_size(get_system_memory())); >> + >> memory_region_init_iommu(&sdev->iommu_mr, sizeof(sdev- >>iommu_mr), >> TYPE_VIRTIO_IOMMU_MEMORY_REGION, >> OBJECT(s), name, >> UINT64_MAX); >> - address_space_init(&sdev->as, >> - MEMORY_REGION(&sdev->iommu_mr), >TYPE_VIRTIO_IOMMU); >> + >> + /* >> + * Hook both the containers under the root container, we >did you mean "hook both sub-regions to the root container"? Yes >> + * switch between iommu & bypass MRs by enable/disable >> + * corresponding sub-containers >> + */ >> + memory_region_add_subregion_overlap(&sdev->root, 0, >> + MEMORY_REGION(&sdev->iommu_mr), >> + 0); >> + memory_region_add_subregion_overlap(&sdev->root, 0, >> + &sdev->bypass_mr, 0); >> + >> + virtio_iommu_switch_address_space(sdev); >> g_free(name); >> } >> return &sdev->as; >> @@ -342,6 +442,7 @@ static int virtio_iommu_attach(VirtIOIOMMU *s, >> uint32_t flags = le32_to_cpu(req->flags); >> VirtIOIOMMUDomain *domain; >> VirtIOIOMMUEndpoint *ep; >> + IOMMUDevice *sdev; >> >> trace_virtio_iommu_attach(domain_id, ep_id); >> >> @@ -375,6 +476,8 @@ static int virtio_iommu_attach(VirtIOIOMMU *s, >> QLIST_INSERT_HEAD(&domain->endpoint_list, ep, next); >> >> ep->domain = domain; >> + sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr); >> + virtio_iommu_switch_address_space(sdev); >> >> /* Replay domain mappings on the associated memory region */ >> g_tree_foreach(domain->mappings, virtio_iommu_notify_map_cb, @@ >> -887,6 +990,7 @@ static void virtio_iommu_set_config(VirtIODevice *vdev, >> return; >> } >> dev_config->bypass = in_config->bypass; >> + virtio_iommu_switch_address_space_all(dev); >> } >> >> trace_virtio_iommu_set_config(in_config->bypass); >> @@ -1026,6 +1130,8 @@ static void virtio_iommu_system_reset(void >*opaque) >> * system reset >> */ >> s->config.bypass = s->boot_bypass; >> + virtio_iommu_switch_address_space_all(s); >> + >> } >> >> static void virtio_iommu_device_realize(DeviceState *dev, Error >> **errp) @@ -1041,6 +1147,11 @@ static void >virtio_iommu_device_realize(DeviceState *dev, Error **errp) >> virtio_iommu_handle_command); >> s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, >> NULL); >> >> + /* >> + * config.bypass is needed to get initial address space early, such as >> + * in vfio realize >> + */ >I don't understand this comment, please can you elaborate? In vfio_realize(), virtio_iommu_find_add_as() is called to get the address space from virtio-iommu. virtio_iommu_find_add_as() calls virtio_iommu_switch_address_space() and virtio_iommu_switch_address_space() need to check config.bypass to enable the right MRs, either shared bypass MR or iommu MR. It could be ok to use s->boot_bypass directly in virtio_iommu_device_bypassed(), then this change could be removed. >> + s->config.bypass = s->boot_bypass; >> s->config.page_size_mask = TARGET_PAGE_MASK; >> s->config.input_range.end = UINT64_MAX; >> s->config.domain_range.end = UINT32_MAX; diff --git >> a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h >> index 84391f844826..102eeefa731d 100644 >> --- a/include/hw/virtio/virtio-iommu.h >> +++ b/include/hw/virtio/virtio-iommu.h >> @@ -37,6 +37,8 @@ typedef struct IOMMUDevice { >> int devfn; >> IOMMUMemoryRegion iommu_mr; >> AddressSpace as; >> + MemoryRegion root; /* The root container of the device */ >> + MemoryRegion bypass_mr; /* The alias of shared memory MR */ >> } IOMMUDevice; >> >> typedef struct IOMMUPciBus { >Did you test migration? I wonder if we shouldn't call > >virtio_iommu_switch_address_space_all(s) >after restoring the endpoints in iommu_post_load() as it is done in intel- >iommu Thanks for point out, I missed the migration scenario. After your suggested change, the migration works without pass-through device. With pass-through device I got "VFIO device doesn't support migration". Not clear if pass-through device migration is totally supported now. Zhenzhong
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index ab8e095b73fa..20af2e7ebd78 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -124,6 +124,7 @@ virtio_iommu_remap(const char *name, uint64_t virt_start, uint64_t virt_end, uin virtio_iommu_set_page_size_mask(const char *name, uint64_t old, uint64_t new) "mr=%s old_mask=0x%"PRIx64" new_mask=0x%"PRIx64 virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s" virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s" +virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" # virtio-mem.c virtio_mem_send_response(uint16_t type) "type=%" PRIu16 diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 2597e166f939..ff718107ee02 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -69,6 +69,77 @@ static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev) return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn); } +static bool virtio_iommu_device_bypassed(IOMMUDevice *sdev) +{ + uint32_t sid; + bool bypassed; + VirtIOIOMMU *s = sdev->viommu; + VirtIOIOMMUEndpoint *ep; + + sid = virtio_iommu_get_bdf(sdev); + + qemu_mutex_lock(&s->mutex); + /* need to check bypass before system reset */ + if (!s->endpoints) { + bypassed = s->config.bypass; + goto unlock; + } + + ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid)); + if (!ep || !ep->domain) { + bypassed = s->config.bypass; + } else { + bypassed = ep->domain->bypass; + } + +unlock: + qemu_mutex_unlock(&s->mutex); + return bypassed; +} + +/* Return whether the device is using IOMMU translation. */ +static bool virtio_iommu_switch_address_space(IOMMUDevice *sdev) +{ + bool use_remapping; + + assert(sdev); + + use_remapping = !virtio_iommu_device_bypassed(sdev); + + trace_virtio_iommu_switch_address_space(pci_bus_num(sdev->bus), + PCI_SLOT(sdev->devfn), + PCI_FUNC(sdev->devfn), + use_remapping); + + /* Turn off first then on the other */ + if (use_remapping) { + memory_region_set_enabled(&sdev->bypass_mr, false); + memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), true); + } else { + memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), false); + memory_region_set_enabled(&sdev->bypass_mr, true); + } + + return use_remapping; +} + +static void virtio_iommu_switch_address_space_all(VirtIOIOMMU *s) +{ + GHashTableIter iter; + IOMMUPciBus *iommu_pci_bus; + int i; + + g_hash_table_iter_init(&iter, s->as_by_busptr); + while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) { + for (i = 0; i < PCI_DEVFN_MAX; i++) { + if (!iommu_pci_bus->pbdev[i]) { + continue; + } + virtio_iommu_switch_address_space(iommu_pci_bus->pbdev[i]); + } + } +} + /** * The bus number is used for lookup when SID based operations occur. * In that case we lazily populate the IOMMUPciBus array from the bus hash @@ -213,6 +284,7 @@ static gboolean virtio_iommu_notify_map_cb(gpointer key, gpointer value, static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep) { VirtIOIOMMUDomain *domain = ep->domain; + IOMMUDevice *sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr); if (!ep->domain) { return; @@ -221,6 +293,7 @@ static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep) ep->iommu_mr); QLIST_REMOVE(ep, next); ep->domain = NULL; + virtio_iommu_switch_address_space(sdev); } static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s, @@ -323,12 +396,39 @@ static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, trace_virtio_iommu_init_iommu_mr(name); + memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX); + address_space_init(&sdev->as, &sdev->root, TYPE_VIRTIO_IOMMU); + + /* + * Build the IOMMU disabled container with aliases to the + * shared MRs. Note that aliasing to a shared memory region + * could help the memory API to detect same FlatViews so we + * can have devices to share the same FlatView when in bypass + * mode. (either by not configuring virtio-iommu driver or with + * "iommu=pt"). It will greatly reduce the total number of + * FlatViews of the system hence VM runs faster. + */ + memory_region_init_alias(&sdev->bypass_mr, OBJECT(s), + "system", get_system_memory(), 0, + memory_region_size(get_system_memory())); + memory_region_init_iommu(&sdev->iommu_mr, sizeof(sdev->iommu_mr), TYPE_VIRTIO_IOMMU_MEMORY_REGION, OBJECT(s), name, UINT64_MAX); - address_space_init(&sdev->as, - MEMORY_REGION(&sdev->iommu_mr), TYPE_VIRTIO_IOMMU); + + /* + * Hook both the containers under the root container, we + * switch between iommu & bypass MRs by enable/disable + * corresponding sub-containers + */ + memory_region_add_subregion_overlap(&sdev->root, 0, + MEMORY_REGION(&sdev->iommu_mr), + 0); + memory_region_add_subregion_overlap(&sdev->root, 0, + &sdev->bypass_mr, 0); + + virtio_iommu_switch_address_space(sdev); g_free(name); } return &sdev->as; @@ -342,6 +442,7 @@ static int virtio_iommu_attach(VirtIOIOMMU *s, uint32_t flags = le32_to_cpu(req->flags); VirtIOIOMMUDomain *domain; VirtIOIOMMUEndpoint *ep; + IOMMUDevice *sdev; trace_virtio_iommu_attach(domain_id, ep_id); @@ -375,6 +476,8 @@ static int virtio_iommu_attach(VirtIOIOMMU *s, QLIST_INSERT_HEAD(&domain->endpoint_list, ep, next); ep->domain = domain; + sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr); + virtio_iommu_switch_address_space(sdev); /* Replay domain mappings on the associated memory region */ g_tree_foreach(domain->mappings, virtio_iommu_notify_map_cb, @@ -887,6 +990,7 @@ static void virtio_iommu_set_config(VirtIODevice *vdev, return; } dev_config->bypass = in_config->bypass; + virtio_iommu_switch_address_space_all(dev); } trace_virtio_iommu_set_config(in_config->bypass); @@ -1026,6 +1130,8 @@ static void virtio_iommu_system_reset(void *opaque) * system reset */ s->config.bypass = s->boot_bypass; + virtio_iommu_switch_address_space_all(s); + } static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) @@ -1041,6 +1147,11 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) virtio_iommu_handle_command); s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, NULL); + /* + * config.bypass is needed to get initial address space early, such as + * in vfio realize + */ + s->config.bypass = s->boot_bypass; s->config.page_size_mask = TARGET_PAGE_MASK; s->config.input_range.end = UINT64_MAX; s->config.domain_range.end = UINT32_MAX; diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h index 84391f844826..102eeefa731d 100644 --- a/include/hw/virtio/virtio-iommu.h +++ b/include/hw/virtio/virtio-iommu.h @@ -37,6 +37,8 @@ typedef struct IOMMUDevice { int devfn; IOMMUMemoryRegion iommu_mr; AddressSpace as; + MemoryRegion root; /* The root container of the device */ + MemoryRegion bypass_mr; /* The alias of shared memory MR */ } IOMMUDevice; typedef struct IOMMUPciBus {
Currently assigned devices can not work in virtio-iommu bypass mode. Guest driver fails to probe the device due to DMA failure. And the reason is because of lacking GPA -> HPA mappings when VM is created. Add a root container memory region to hold both bypass memory region and iommu memory region, so the switch between them is supported just like the implementation in virtual VT-d. Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> --- hw/virtio/trace-events | 1 + hw/virtio/virtio-iommu.c | 115 ++++++++++++++++++++++++++++++- include/hw/virtio/virtio-iommu.h | 2 + 3 files changed, 116 insertions(+), 2 deletions(-)