Message ID | 1462344751-28281-19-git-send-email-aik@ozlabs.ru (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, 4 May 2016 16:52:30 +1000 Alexey Kardashevskiy <aik@ozlabs.ru> wrote: > New VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management. > This adds ability to VFIO common code to dynamically allocate/remove > DMA windows in the host kernel when new VFIO container is added/removed. > > This adds VFIO_IOMMU_SPAPR_TCE_CREATE ioctl to vfio_listener_region_add > and adds just created IOMMU into the host IOMMU list; the opposite > action is taken in vfio_listener_region_del. > > When creating a new window, this uses heuristic to decide on the TCE table > levels number. > > This should cause no guest visible change in behavior. > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > --- > Changes: > v16: > * used memory_region_iommu_get_page_sizes() in vfio_listener_region_add() > * enforced no intersections between windows > > v14: > * new to the series > --- > hw/vfio/common.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++----- > trace-events | 2 + > 2 files changed, 125 insertions(+), 10 deletions(-) > > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > index 03daf88..bd2dee8 100644 > --- a/hw/vfio/common.c > +++ b/hw/vfio/common.c > @@ -240,6 +240,18 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, > return -errno; > } > > +static bool range_contains(hwaddr start, hwaddr end, hwaddr addr) > +{ > + return start <= addr && addr <= end; > +} a) If you want a "range_foo" function then put it in range.h b) I suspect there are already range.h functions that can do this. > + > +static bool vfio_host_win_intersects(VFIOHostDMAWindow *hostwin, > + hwaddr min_iova, hwaddr max_iova) > +{ > + return range_contains(hostwin->min_iova, hostwin->min_iova, min_iova) || > + range_contains(min_iova, max_iova, hostwin->min_iova); > +} How is this different than ranges_overlap()? > + > static VFIOHostDMAWindow *vfio_host_win_lookup(VFIOContainer *container, > hwaddr min_iova, hwaddr max_iova) > { > @@ -279,6 +291,14 @@ static int vfio_host_win_add(VFIOContainer *container, > return 0; > } > > +static void vfio_host_win_del(VFIOContainer *container, hwaddr min_iova) > +{ > + VFIOHostDMAWindow *hostwin = vfio_host_win_lookup(container, min_iova, 1); > + > + g_assert(hostwin); Handle the error please. > + QLIST_REMOVE(hostwin, hostwin_next); > +} > + > static bool vfio_listener_skipped_section(MemoryRegionSection *section) > { > return (!memory_region_is_ram(section->mr) && > @@ -392,6 +412,69 @@ static void vfio_listener_region_add(MemoryListener *listener, > } > end = int128_get64(int128_sub(llend, int128_one())); > > + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { > + VFIOHostDMAWindow *hostwin; > + unsigned pagesizes = memory_region_iommu_get_page_sizes(section->mr); > + unsigned pagesize = (hwaddr)1 << ctz64(pagesizes); > + unsigned entries, pages; > + struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) }; > + > + trace_vfio_listener_region_add_iommu(iova, end); > + /* > + * FIXME: For VFIO iommu types which have KVM acceleration to > + * avoid bouncing all map/unmaps through qemu this way, this > + * would be the right place to wire that up (tell the KVM > + * device emulation the VFIO iommu handles to use). > + */ > + create.window_size = int128_get64(section->size); > + create.page_shift = ctz64(pagesize); > + /* > + * SPAPR host supports multilevel TCE tables, there is some > + * heuristic to decide how many levels we want for our table: > + * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4 > + */ > + entries = create.window_size >> create.page_shift; > + pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1); > + pages = MAX(pow2ceil(pages) - 1, 1); /* Round up */ > + create.levels = ctz64(pages) / 6 + 1; > + > + /* For now intersections are not allowed, we may relax this later */ > + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { > + if (vfio_host_win_intersects(hostwin, > + section->offset_within_address_space, > + section->offset_within_address_space + > + create.window_size - 1)) { > + goto fail; > + } > + } > + > + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); > + if (ret) { > + error_report("Failed to create a window, ret = %d (%m)", ret); > + goto fail; > + } > + > + if (create.start_addr != section->offset_within_address_space) { > + struct vfio_iommu_spapr_tce_remove remove = { > + .argsz = sizeof(remove), > + .start_addr = create.start_addr > + }; > + error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64, > + section->offset_within_address_space, > + create.start_addr); > + ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); > + ret = -EINVAL; > + goto fail; > + } > + trace_vfio_spapr_create_window(create.page_shift, > + create.window_size, > + create.start_addr); > + > + vfio_host_win_add(container, create.start_addr, > + create.start_addr + create.window_size - 1, > + 1ULL << create.page_shift); > + } This is a function on its own, split it out and why not stop pretending prereg is some sort of generic interface and let's just make a spapr support file. > + > if (!vfio_host_win_lookup(container, iova, end)) { > error_report("vfio: IOMMU container %p can't map guest IOVA region" > " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, > @@ -566,6 +649,22 @@ static void vfio_listener_region_del(MemoryListener *listener, > container, iova, int128_get64(llsize), ret); > } > > + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { > + struct vfio_iommu_spapr_tce_remove remove = { > + .argsz = sizeof(remove), > + .start_addr = section->offset_within_address_space, > + }; > + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); > + if (ret) { > + error_report("Failed to remove window at %"PRIx64, > + remove.start_addr); > + } > + > + vfio_host_win_del(container, section->offset_within_address_space); > + > + trace_vfio_spapr_remove_window(remove.start_addr); > + } This would be in that spapr file too. > + > if (iommu && iommu->iommu_ops && iommu->iommu_ops->vfio_stop) { > iommu->iommu_ops->vfio_stop(section->mr); > } > @@ -957,11 +1056,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) > } > } > > - /* > - * This only considers the host IOMMU's 32-bit window. At > - * some point we need to add support for the optional 64-bit > - * window and dynamic windows > - */ > info.argsz = sizeof(info); > ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); > if (ret) { > @@ -970,11 +1064,30 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) > goto listener_release_exit; > } > > - /* The default table uses 4K pages */ > - vfio_host_win_add(container, info.dma32_window_start, > - info.dma32_window_start + > - info.dma32_window_size - 1, > - 0x1000); > + if (v2) { > + /* > + * There is a default window in just created container. > + * To make region_add/del simpler, we better remove this > + * window now and let those iommu_listener callbacks > + * create/remove them when needed. > + */ > + struct vfio_iommu_spapr_tce_remove remove = { > + .argsz = sizeof(remove), > + .start_addr = info.dma32_window_start, > + }; > + ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); > + if (ret) { > + error_report("vfio: VFIO_IOMMU_SPAPR_TCE_REMOVE failed: %m"); > + ret = -errno; > + goto free_container_exit; > + } > + } else { > + /* The default table uses 4K pages */ > + vfio_host_win_add(container, info.dma32_window_start, > + info.dma32_window_start + > + info.dma32_window_size - 1, > + 0x1000); > + } > } else { > error_report("vfio: No available IOMMU models"); > ret = -EINVAL; > diff --git a/trace-events b/trace-events > index d0d8615..b5419de 100644 > --- a/trace-events > +++ b/trace-events > @@ -1739,6 +1739,8 @@ vfio_region_finalize(const char *name, int index) "Device %s, region %d" > vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d" > vfio_ram_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" > vfio_ram_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" > +vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64 > +vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64 > > # hw/vfio/platform.c > vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
On 05/14/2016 08:26 AM, Alex Williamson wrote: > On Wed, 4 May 2016 16:52:30 +1000 > Alexey Kardashevskiy <aik@ozlabs.ru> wrote: > >> New VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management. >> This adds ability to VFIO common code to dynamically allocate/remove >> DMA windows in the host kernel when new VFIO container is added/removed. >> >> This adds VFIO_IOMMU_SPAPR_TCE_CREATE ioctl to vfio_listener_region_add >> and adds just created IOMMU into the host IOMMU list; the opposite >> action is taken in vfio_listener_region_del. >> >> When creating a new window, this uses heuristic to decide on the TCE table >> levels number. >> >> This should cause no guest visible change in behavior. >> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> >> --- >> Changes: >> v16: >> * used memory_region_iommu_get_page_sizes() in vfio_listener_region_add() >> * enforced no intersections between windows >> >> v14: >> * new to the series >> --- >> hw/vfio/common.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++----- >> trace-events | 2 + >> 2 files changed, 125 insertions(+), 10 deletions(-) >> >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >> index 03daf88..bd2dee8 100644 >> --- a/hw/vfio/common.c >> +++ b/hw/vfio/common.c >> @@ -240,6 +240,18 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, >> return -errno; >> } >> >> +static bool range_contains(hwaddr start, hwaddr end, hwaddr addr) >> +{ >> + return start <= addr && addr <= end; >> +} > > a) If you want a "range_foo" function then put it in range.h > b) I suspect there are already range.h functions that can do this. > >> + >> +static bool vfio_host_win_intersects(VFIOHostDMAWindow *hostwin, >> + hwaddr min_iova, hwaddr max_iova) >> +{ >> + return range_contains(hostwin->min_iova, hostwin->min_iova, min_iova) || >> + range_contains(min_iova, max_iova, hostwin->min_iova); >> +} > > How is this different than ranges_overlap()? >> + >> static VFIOHostDMAWindow *vfio_host_win_lookup(VFIOContainer *container, >> hwaddr min_iova, hwaddr max_iova) >> { >> @@ -279,6 +291,14 @@ static int vfio_host_win_add(VFIOContainer *container, >> return 0; >> } >> >> +static void vfio_host_win_del(VFIOContainer *container, hwaddr min_iova) >> +{ >> + VFIOHostDMAWindow *hostwin = vfio_host_win_lookup(container, min_iova, 1); >> + >> + g_assert(hostwin); > > Handle the error please. Will this be enough? if (!hostwin) { error_report("%s: Cannot delete missing window at %"HWADDR_PRIx, __func__, min_iova); return; } > >> + QLIST_REMOVE(hostwin, hostwin_next); >> +} >> + >> static bool vfio_listener_skipped_section(MemoryRegionSection *section) >> { >> return (!memory_region_is_ram(section->mr) && >> @@ -392,6 +412,69 @@ static void vfio_listener_region_add(MemoryListener *listener, >> } >> end = int128_get64(int128_sub(llend, int128_one())); >> >> + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { >> + VFIOHostDMAWindow *hostwin; >> + unsigned pagesizes = memory_region_iommu_get_page_sizes(section->mr); >> + unsigned pagesize = (hwaddr)1 << ctz64(pagesizes); >> + unsigned entries, pages; >> + struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) }; >> + >> + trace_vfio_listener_region_add_iommu(iova, end); >> + /* >> + * FIXME: For VFIO iommu types which have KVM acceleration to >> + * avoid bouncing all map/unmaps through qemu this way, this >> + * would be the right place to wire that up (tell the KVM >> + * device emulation the VFIO iommu handles to use). >> + */ >> + create.window_size = int128_get64(section->size); >> + create.page_shift = ctz64(pagesize); >> + /* >> + * SPAPR host supports multilevel TCE tables, there is some >> + * heuristic to decide how many levels we want for our table: >> + * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4 >> + */ >> + entries = create.window_size >> create.page_shift; >> + pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1); >> + pages = MAX(pow2ceil(pages) - 1, 1); /* Round up */ >> + create.levels = ctz64(pages) / 6 + 1; >> + >> + /* For now intersections are not allowed, we may relax this later */ >> + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { >> + if (vfio_host_win_intersects(hostwin, >> + section->offset_within_address_space, >> + section->offset_within_address_space + >> + create.window_size - 1)) { >> + goto fail; >> + } >> + } >> + >> + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); >> + if (ret) { >> + error_report("Failed to create a window, ret = %d (%m)", ret); >> + goto fail; >> + } >> + >> + if (create.start_addr != section->offset_within_address_space) { >> + struct vfio_iommu_spapr_tce_remove remove = { >> + .argsz = sizeof(remove), >> + .start_addr = create.start_addr >> + }; >> + error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64, >> + section->offset_within_address_space, >> + create.start_addr); >> + ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); >> + ret = -EINVAL; >> + goto fail; >> + } >> + trace_vfio_spapr_create_window(create.page_shift, >> + create.window_size, >> + create.start_addr); >> + >> + vfio_host_win_add(container, create.start_addr, >> + create.start_addr + create.window_size - 1, >> + 1ULL << create.page_shift); >> + } > > This is a function on its own, split it out and why not stop pretending > prereg is some sort of generic interface and let's just make a spapr > support file. Yet another new file - spapr.c, or rename prereg.c to spapr.c and add this stuff there? Also, what bits? I'd keep VFIOHostDMAWindow business in here and moved vfio_iommu_spapr_tce_create and ioctl to the new place, will this be ok? Thanks. > >> + >> if (!vfio_host_win_lookup(container, iova, end)) { >> error_report("vfio: IOMMU container %p can't map guest IOVA region" >> " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, >> @@ -566,6 +649,22 @@ static void vfio_listener_region_del(MemoryListener *listener, >> container, iova, int128_get64(llsize), ret); >> } >> >> + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { >> + struct vfio_iommu_spapr_tce_remove remove = { >> + .argsz = sizeof(remove), >> + .start_addr = section->offset_within_address_space, >> + }; >> + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); >> + if (ret) { >> + error_report("Failed to remove window at %"PRIx64, >> + remove.start_addr); >> + } >> + >> + vfio_host_win_del(container, section->offset_within_address_space); >> + >> + trace_vfio_spapr_remove_window(remove.start_addr); >> + } > > This would be in that spapr file too. > >> + >> if (iommu && iommu->iommu_ops && iommu->iommu_ops->vfio_stop) { >> iommu->iommu_ops->vfio_stop(section->mr); >> } >> @@ -957,11 +1056,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) >> } >> } >> >> - /* >> - * This only considers the host IOMMU's 32-bit window. At >> - * some point we need to add support for the optional 64-bit >> - * window and dynamic windows >> - */ >> info.argsz = sizeof(info); >> ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); >> if (ret) { >> @@ -970,11 +1064,30 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) >> goto listener_release_exit; >> } >> >> - /* The default table uses 4K pages */ >> - vfio_host_win_add(container, info.dma32_window_start, >> - info.dma32_window_start + >> - info.dma32_window_size - 1, >> - 0x1000); >> + if (v2) { >> + /* >> + * There is a default window in just created container. >> + * To make region_add/del simpler, we better remove this >> + * window now and let those iommu_listener callbacks >> + * create/remove them when needed. >> + */ >> + struct vfio_iommu_spapr_tce_remove remove = { >> + .argsz = sizeof(remove), >> + .start_addr = info.dma32_window_start, >> + }; >> + ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); >> + if (ret) { >> + error_report("vfio: VFIO_IOMMU_SPAPR_TCE_REMOVE failed: %m"); >> + ret = -errno; >> + goto free_container_exit; >> + } >> + } else { >> + /* The default table uses 4K pages */ >> + vfio_host_win_add(container, info.dma32_window_start, >> + info.dma32_window_start + >> + info.dma32_window_size - 1, >> + 0x1000); >> + } >> } else { >> error_report("vfio: No available IOMMU models"); >> ret = -EINVAL; >> diff --git a/trace-events b/trace-events >> index d0d8615..b5419de 100644 >> --- a/trace-events >> +++ b/trace-events >> @@ -1739,6 +1739,8 @@ vfio_region_finalize(const char *name, int index) "Device %s, region %d" >> vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d" >> vfio_ram_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" >> vfio_ram_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" >> +vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64 >> +vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64 >> >> # hw/vfio/platform.c >> vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d" >
On Mon, 16 May 2016 14:52:41 +1000 Alexey Kardashevskiy <aik@ozlabs.ru> wrote: > On 05/14/2016 08:26 AM, Alex Williamson wrote: > > On Wed, 4 May 2016 16:52:30 +1000 > > Alexey Kardashevskiy <aik@ozlabs.ru> wrote: > > > >> New VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management. > >> This adds ability to VFIO common code to dynamically allocate/remove > >> DMA windows in the host kernel when new VFIO container is added/removed. > >> > >> This adds VFIO_IOMMU_SPAPR_TCE_CREATE ioctl to vfio_listener_region_add > >> and adds just created IOMMU into the host IOMMU list; the opposite > >> action is taken in vfio_listener_region_del. > >> > >> When creating a new window, this uses heuristic to decide on the TCE table > >> levels number. > >> > >> This should cause no guest visible change in behavior. > >> > >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > >> --- > >> Changes: > >> v16: > >> * used memory_region_iommu_get_page_sizes() in vfio_listener_region_add() > >> * enforced no intersections between windows > >> > >> v14: > >> * new to the series > >> --- > >> hw/vfio/common.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++----- > >> trace-events | 2 + > >> 2 files changed, 125 insertions(+), 10 deletions(-) > >> > >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c > >> index 03daf88..bd2dee8 100644 > >> --- a/hw/vfio/common.c > >> +++ b/hw/vfio/common.c > >> @@ -240,6 +240,18 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, > >> return -errno; > >> } > >> > >> +static bool range_contains(hwaddr start, hwaddr end, hwaddr addr) > >> +{ > >> + return start <= addr && addr <= end; > >> +} > > > > a) If you want a "range_foo" function then put it in range.h > > b) I suspect there are already range.h functions that can do this. > > > >> + > >> +static bool vfio_host_win_intersects(VFIOHostDMAWindow *hostwin, > >> + hwaddr min_iova, hwaddr max_iova) > >> +{ > >> + return range_contains(hostwin->min_iova, hostwin->min_iova, min_iova) || > >> + range_contains(min_iova, max_iova, hostwin->min_iova); > >> +} > > > > How is this different than ranges_overlap()? > >> + > >> static VFIOHostDMAWindow *vfio_host_win_lookup(VFIOContainer *container, > >> hwaddr min_iova, hwaddr max_iova) > >> { > >> @@ -279,6 +291,14 @@ static int vfio_host_win_add(VFIOContainer *container, > >> return 0; > >> } > >> > >> +static void vfio_host_win_del(VFIOContainer *container, hwaddr min_iova) > >> +{ > >> + VFIOHostDMAWindow *hostwin = vfio_host_win_lookup(container, min_iova, 1); > >> + > >> + g_assert(hostwin); > > > > Handle the error please. > > Will this be enough? > > if (!hostwin) { > error_report("%s: Cannot delete missing window at %"HWADDR_PRIx, > __func__, min_iova); > return; > } Better. I was really thinking to return error to the caller, but if the caller has no return path, perhaps this is as good as we can do. Expect that I will push back on any assert() calls added to vfio. > >> + QLIST_REMOVE(hostwin, hostwin_next); > >> +} > >> + > >> static bool vfio_listener_skipped_section(MemoryRegionSection *section) > >> { > >> return (!memory_region_is_ram(section->mr) && > >> @@ -392,6 +412,69 @@ static void vfio_listener_region_add(MemoryListener *listener, > >> } > >> end = int128_get64(int128_sub(llend, int128_one())); > >> > >> + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { > >> + VFIOHostDMAWindow *hostwin; > >> + unsigned pagesizes = memory_region_iommu_get_page_sizes(section->mr); > >> + unsigned pagesize = (hwaddr)1 << ctz64(pagesizes); > >> + unsigned entries, pages; > >> + struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) }; > >> + > >> + trace_vfio_listener_region_add_iommu(iova, end); > >> + /* > >> + * FIXME: For VFIO iommu types which have KVM acceleration to > >> + * avoid bouncing all map/unmaps through qemu this way, this > >> + * would be the right place to wire that up (tell the KVM > >> + * device emulation the VFIO iommu handles to use). > >> + */ > >> + create.window_size = int128_get64(section->size); > >> + create.page_shift = ctz64(pagesize); > >> + /* > >> + * SPAPR host supports multilevel TCE tables, there is some > >> + * heuristic to decide how many levels we want for our table: > >> + * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4 > >> + */ > >> + entries = create.window_size >> create.page_shift; > >> + pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1); > >> + pages = MAX(pow2ceil(pages) - 1, 1); /* Round up */ > >> + create.levels = ctz64(pages) / 6 + 1; > >> + > >> + /* For now intersections are not allowed, we may relax this later */ > >> + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { > >> + if (vfio_host_win_intersects(hostwin, > >> + section->offset_within_address_space, > >> + section->offset_within_address_space + > >> + create.window_size - 1)) { > >> + goto fail; > >> + } > >> + } > >> + > >> + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); > >> + if (ret) { > >> + error_report("Failed to create a window, ret = %d (%m)", ret); > >> + goto fail; > >> + } > >> + > >> + if (create.start_addr != section->offset_within_address_space) { > >> + struct vfio_iommu_spapr_tce_remove remove = { > >> + .argsz = sizeof(remove), > >> + .start_addr = create.start_addr > >> + }; > >> + error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64, > >> + section->offset_within_address_space, > >> + create.start_addr); > >> + ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); > >> + ret = -EINVAL; > >> + goto fail; > >> + } > >> + trace_vfio_spapr_create_window(create.page_shift, > >> + create.window_size, > >> + create.start_addr); > >> + > >> + vfio_host_win_add(container, create.start_addr, > >> + create.start_addr + create.window_size - 1, > >> + 1ULL << create.page_shift); > >> + } > > > > This is a function on its own, split it out and why not stop pretending > > prereg is some sort of generic interface and let's just make a spapr > > support file. > > > Yet another new file - spapr.c, or rename prereg.c to spapr.c and add this > stuff there? prereg.c is already spapr specific, so I'd rename it and potentially add this to it. > Also, what bits? I'd keep VFIOHostDMAWindow business in here and moved > vfio_iommu_spapr_tce_create and ioctl to the new place, will this be ok? > Thanks. VFIOHostDMAWindow is actually generic, so keeping it here makes sense. > >> + > >> if (!vfio_host_win_lookup(container, iova, end)) { > >> error_report("vfio: IOMMU container %p can't map guest IOVA region" > >> " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, > >> @@ -566,6 +649,22 @@ static void vfio_listener_region_del(MemoryListener *listener, > >> container, iova, int128_get64(llsize), ret); > >> } > >> > >> + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { > >> + struct vfio_iommu_spapr_tce_remove remove = { > >> + .argsz = sizeof(remove), > >> + .start_addr = section->offset_within_address_space, > >> + }; > >> + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); > >> + if (ret) { > >> + error_report("Failed to remove window at %"PRIx64, > >> + remove.start_addr); > >> + } > >> + > >> + vfio_host_win_del(container, section->offset_within_address_space); > >> + > >> + trace_vfio_spapr_remove_window(remove.start_addr); > >> + } > > > > This would be in that spapr file too. > > > >> + > >> if (iommu && iommu->iommu_ops && iommu->iommu_ops->vfio_stop) { > >> iommu->iommu_ops->vfio_stop(section->mr); > >> } > >> @@ -957,11 +1056,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) > >> } > >> } > >> > >> - /* > >> - * This only considers the host IOMMU's 32-bit window. At > >> - * some point we need to add support for the optional 64-bit > >> - * window and dynamic windows > >> - */ > >> info.argsz = sizeof(info); > >> ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); > >> if (ret) { > >> @@ -970,11 +1064,30 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) > >> goto listener_release_exit; > >> } > >> > >> - /* The default table uses 4K pages */ > >> - vfio_host_win_add(container, info.dma32_window_start, > >> - info.dma32_window_start + > >> - info.dma32_window_size - 1, > >> - 0x1000); > >> + if (v2) { > >> + /* > >> + * There is a default window in just created container. > >> + * To make region_add/del simpler, we better remove this > >> + * window now and let those iommu_listener callbacks > >> + * create/remove them when needed. > >> + */ > >> + struct vfio_iommu_spapr_tce_remove remove = { > >> + .argsz = sizeof(remove), > >> + .start_addr = info.dma32_window_start, > >> + }; > >> + ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); > >> + if (ret) { > >> + error_report("vfio: VFIO_IOMMU_SPAPR_TCE_REMOVE failed: %m"); > >> + ret = -errno; > >> + goto free_container_exit; > >> + } > >> + } else { > >> + /* The default table uses 4K pages */ > >> + vfio_host_win_add(container, info.dma32_window_start, > >> + info.dma32_window_start + > >> + info.dma32_window_size - 1, > >> + 0x1000); > >> + } > >> } else { > >> error_report("vfio: No available IOMMU models"); > >> ret = -EINVAL; > >> diff --git a/trace-events b/trace-events > >> index d0d8615..b5419de 100644 > >> --- a/trace-events > >> +++ b/trace-events > >> @@ -1739,6 +1739,8 @@ vfio_region_finalize(const char *name, int index) "Device %s, region %d" > >> vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d" > >> vfio_ram_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" > >> vfio_ram_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" > >> +vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64 > >> +vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64 > >> > >> # hw/vfio/platform.c > >> vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d" > > > >
On Mon, May 16, 2016 at 02:20:02PM -0600, Alex Williamson wrote: > On Mon, 16 May 2016 14:52:41 +1000 > Alexey Kardashevskiy <aik@ozlabs.ru> wrote: > > > On 05/14/2016 08:26 AM, Alex Williamson wrote: > > > On Wed, 4 May 2016 16:52:30 +1000 > > > Alexey Kardashevskiy <aik@ozlabs.ru> wrote: > > > > > >> New VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management. > > >> This adds ability to VFIO common code to dynamically allocate/remove > > >> DMA windows in the host kernel when new VFIO container is added/removed. > > >> > > >> This adds VFIO_IOMMU_SPAPR_TCE_CREATE ioctl to vfio_listener_region_add > > >> and adds just created IOMMU into the host IOMMU list; the opposite > > >> action is taken in vfio_listener_region_del. > > >> > > >> When creating a new window, this uses heuristic to decide on the TCE table > > >> levels number. > > >> > > >> This should cause no guest visible change in behavior. > > >> > > >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > > >> --- > > >> Changes: > > >> v16: > > >> * used memory_region_iommu_get_page_sizes() in vfio_listener_region_add() > > >> * enforced no intersections between windows > > >> > > >> v14: > > >> * new to the series > > >> --- > > >> hw/vfio/common.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++----- > > >> trace-events | 2 + > > >> 2 files changed, 125 insertions(+), 10 deletions(-) > > >> > > >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c > > >> index 03daf88..bd2dee8 100644 > > >> --- a/hw/vfio/common.c > > >> +++ b/hw/vfio/common.c > > >> @@ -240,6 +240,18 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, > > >> return -errno; > > >> } > > >> > > >> +static bool range_contains(hwaddr start, hwaddr end, hwaddr addr) > > >> +{ > > >> + return start <= addr && addr <= end; > > >> +} > > > > > > a) If you want a "range_foo" function then put it in range.h > > > b) I suspect there are already range.h functions that can do this. > > > > > >> + > > >> +static bool vfio_host_win_intersects(VFIOHostDMAWindow *hostwin, > > >> + hwaddr min_iova, hwaddr max_iova) > > >> +{ > > >> + return range_contains(hostwin->min_iova, hostwin->min_iova, min_iova) || > > >> + range_contains(min_iova, max_iova, hostwin->min_iova); > > >> +} > > > > > > How is this different than ranges_overlap()? > > >> + > > >> static VFIOHostDMAWindow *vfio_host_win_lookup(VFIOContainer *container, > > >> hwaddr min_iova, hwaddr max_iova) > > >> { > > >> @@ -279,6 +291,14 @@ static int vfio_host_win_add(VFIOContainer *container, > > >> return 0; > > >> } > > >> > > >> +static void vfio_host_win_del(VFIOContainer *container, hwaddr min_iova) > > >> +{ > > >> + VFIOHostDMAWindow *hostwin = vfio_host_win_lookup(container, min_iova, 1); > > >> + > > >> + g_assert(hostwin); > > > > > > Handle the error please. > > > > Will this be enough? > > > > if (!hostwin) { > > error_report("%s: Cannot delete missing window at %"HWADDR_PRIx, > > __func__, min_iova); > > return; > > } > > Better. I was really thinking to return error to the caller, but if > the caller has no return path, perhaps this is as good as we can do. > Expect that I will push back on any assert() calls added to vfio. In this case I think returning an error makes more sense. The caller understands the context and so can make reasonable error handling decisions. .. which will probably just be a fatal error, but it still makes more logical sense there.
On 17/05/16 06:20, Alex Williamson wrote: > On Mon, 16 May 2016 14:52:41 +1000 > Alexey Kardashevskiy <aik@ozlabs.ru> wrote: > >> On 05/14/2016 08:26 AM, Alex Williamson wrote: >>> On Wed, 4 May 2016 16:52:30 +1000 >>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote: >>> >>>> New VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management. >>>> This adds ability to VFIO common code to dynamically allocate/remove >>>> DMA windows in the host kernel when new VFIO container is added/removed. >>>> >>>> This adds VFIO_IOMMU_SPAPR_TCE_CREATE ioctl to vfio_listener_region_add >>>> and adds just created IOMMU into the host IOMMU list; the opposite >>>> action is taken in vfio_listener_region_del. >>>> >>>> When creating a new window, this uses heuristic to decide on the TCE table >>>> levels number. >>>> >>>> This should cause no guest visible change in behavior. >>>> >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> >>>> --- >>>> Changes: >>>> v16: >>>> * used memory_region_iommu_get_page_sizes() in vfio_listener_region_add() >>>> * enforced no intersections between windows >>>> >>>> v14: >>>> * new to the series >>>> --- >>>> hw/vfio/common.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++----- >>>> trace-events | 2 + >>>> 2 files changed, 125 insertions(+), 10 deletions(-) >>>> >>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >>>> index 03daf88..bd2dee8 100644 >>>> --- a/hw/vfio/common.c >>>> +++ b/hw/vfio/common.c >>>> @@ -240,6 +240,18 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, >>>> return -errno; >>>> } >>>> >>>> +static bool range_contains(hwaddr start, hwaddr end, hwaddr addr) >>>> +{ >>>> + return start <= addr && addr <= end; >>>> +} >>> >>> a) If you want a "range_foo" function then put it in range.h >>> b) I suspect there are already range.h functions that can do this. >>> >>>> + >>>> +static bool vfio_host_win_intersects(VFIOHostDMAWindow *hostwin, >>>> + hwaddr min_iova, hwaddr max_iova) >>>> +{ >>>> + return range_contains(hostwin->min_iova, hostwin->min_iova, min_iova) || >>>> + range_contains(min_iova, max_iova, hostwin->min_iova); >>>> +} >>> >>> How is this different than ranges_overlap()? >>>> + >>>> static VFIOHostDMAWindow *vfio_host_win_lookup(VFIOContainer *container, >>>> hwaddr min_iova, hwaddr max_iova) >>>> { >>>> @@ -279,6 +291,14 @@ static int vfio_host_win_add(VFIOContainer *container, >>>> return 0; >>>> } >>>> >>>> +static void vfio_host_win_del(VFIOContainer *container, hwaddr min_iova) >>>> +{ >>>> + VFIOHostDMAWindow *hostwin = vfio_host_win_lookup(container, min_iova, 1); >>>> + >>>> + g_assert(hostwin); >>> >>> Handle the error please. >> >> Will this be enough? >> >> if (!hostwin) { >> error_report("%s: Cannot delete missing window at %"HWADDR_PRIx, >> __func__, min_iova); >> return; >> } > > Better. I was really thinking to return error to the caller, but if > the caller has no return path, perhaps this is as good as we can do. > Expect that I will push back on any assert() calls added to vfio. > > >>>> + QLIST_REMOVE(hostwin, hostwin_next); >>>> +} >>>> + >>>> static bool vfio_listener_skipped_section(MemoryRegionSection *section) >>>> { >>>> return (!memory_region_is_ram(section->mr) && >>>> @@ -392,6 +412,69 @@ static void vfio_listener_region_add(MemoryListener *listener, >>>> } >>>> end = int128_get64(int128_sub(llend, int128_one())); >>>> >>>> + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { >>>> + VFIOHostDMAWindow *hostwin; >>>> + unsigned pagesizes = memory_region_iommu_get_page_sizes(section->mr); >>>> + unsigned pagesize = (hwaddr)1 << ctz64(pagesizes); >>>> + unsigned entries, pages; >>>> + struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) }; >>>> + >>>> + trace_vfio_listener_region_add_iommu(iova, end); >>>> + /* >>>> + * FIXME: For VFIO iommu types which have KVM acceleration to >>>> + * avoid bouncing all map/unmaps through qemu this way, this >>>> + * would be the right place to wire that up (tell the KVM >>>> + * device emulation the VFIO iommu handles to use). >>>> + */ >>>> + create.window_size = int128_get64(section->size); >>>> + create.page_shift = ctz64(pagesize); >>>> + /* >>>> + * SPAPR host supports multilevel TCE tables, there is some >>>> + * heuristic to decide how many levels we want for our table: >>>> + * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4 >>>> + */ >>>> + entries = create.window_size >> create.page_shift; >>>> + pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1); >>>> + pages = MAX(pow2ceil(pages) - 1, 1); /* Round up */ >>>> + create.levels = ctz64(pages) / 6 + 1; >>>> + >>>> + /* For now intersections are not allowed, we may relax this later */ >>>> + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { >>>> + if (vfio_host_win_intersects(hostwin, >>>> + section->offset_within_address_space, >>>> + section->offset_within_address_space + >>>> + create.window_size - 1)) { >>>> + goto fail; >>>> + } >>>> + } >>>> + >>>> + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); >>>> + if (ret) { >>>> + error_report("Failed to create a window, ret = %d (%m)", ret); >>>> + goto fail; >>>> + } >>>> + >>>> + if (create.start_addr != section->offset_within_address_space) { >>>> + struct vfio_iommu_spapr_tce_remove remove = { >>>> + .argsz = sizeof(remove), >>>> + .start_addr = create.start_addr >>>> + }; >>>> + error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64, >>>> + section->offset_within_address_space, >>>> + create.start_addr); >>>> + ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); >>>> + ret = -EINVAL; >>>> + goto fail; >>>> + } >>>> + trace_vfio_spapr_create_window(create.page_shift, >>>> + create.window_size, >>>> + create.start_addr); >>>> + >>>> + vfio_host_win_add(container, create.start_addr, >>>> + create.start_addr + create.window_size - 1, >>>> + 1ULL << create.page_shift); >>>> + } >>> >>> This is a function on its own, split it out and why not stop pretending >>> prereg is some sort of generic interface and let's just make a spapr >>> support file. >> >> >> Yet another new file - spapr.c, or rename prereg.c to spapr.c and add this >> stuff there? > > prereg.c is already spapr specific, so I'd rename it and potentially > add this to it. It would help if you two decided what I should do about prereg.c vs. spapr.c - merge or keep 2 files. Thanks.
On Fri, May 27, 2016 at 01:49:19PM +1000, Alexey Kardashevskiy wrote: > On 17/05/16 06:20, Alex Williamson wrote: > > On Mon, 16 May 2016 14:52:41 +1000 > > Alexey Kardashevskiy <aik@ozlabs.ru> wrote: > > > >> On 05/14/2016 08:26 AM, Alex Williamson wrote: > >>> On Wed, 4 May 2016 16:52:30 +1000 > >>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote: > >>> > >>>> New VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management. > >>>> This adds ability to VFIO common code to dynamically allocate/remove > >>>> DMA windows in the host kernel when new VFIO container is added/removed. > >>>> > >>>> This adds VFIO_IOMMU_SPAPR_TCE_CREATE ioctl to vfio_listener_region_add > >>>> and adds just created IOMMU into the host IOMMU list; the opposite > >>>> action is taken in vfio_listener_region_del. > >>>> > >>>> When creating a new window, this uses heuristic to decide on the TCE table > >>>> levels number. > >>>> > >>>> This should cause no guest visible change in behavior. > >>>> > >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > >>>> --- > >>>> Changes: > >>>> v16: > >>>> * used memory_region_iommu_get_page_sizes() in vfio_listener_region_add() > >>>> * enforced no intersections between windows > >>>> > >>>> v14: > >>>> * new to the series > >>>> --- > >>>> hw/vfio/common.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++----- > >>>> trace-events | 2 + > >>>> 2 files changed, 125 insertions(+), 10 deletions(-) > >>>> > >>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c > >>>> index 03daf88..bd2dee8 100644 > >>>> --- a/hw/vfio/common.c > >>>> +++ b/hw/vfio/common.c > >>>> @@ -240,6 +240,18 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, > >>>> return -errno; > >>>> } > >>>> > >>>> +static bool range_contains(hwaddr start, hwaddr end, hwaddr addr) > >>>> +{ > >>>> + return start <= addr && addr <= end; > >>>> +} > >>> > >>> a) If you want a "range_foo" function then put it in range.h > >>> b) I suspect there are already range.h functions that can do this. > >>> > >>>> + > >>>> +static bool vfio_host_win_intersects(VFIOHostDMAWindow *hostwin, > >>>> + hwaddr min_iova, hwaddr max_iova) > >>>> +{ > >>>> + return range_contains(hostwin->min_iova, hostwin->min_iova, min_iova) || > >>>> + range_contains(min_iova, max_iova, hostwin->min_iova); > >>>> +} > >>> > >>> How is this different than ranges_overlap()? > >>>> + > >>>> static VFIOHostDMAWindow *vfio_host_win_lookup(VFIOContainer *container, > >>>> hwaddr min_iova, hwaddr max_iova) > >>>> { > >>>> @@ -279,6 +291,14 @@ static int vfio_host_win_add(VFIOContainer *container, > >>>> return 0; > >>>> } > >>>> > >>>> +static void vfio_host_win_del(VFIOContainer *container, hwaddr min_iova) > >>>> +{ > >>>> + VFIOHostDMAWindow *hostwin = vfio_host_win_lookup(container, min_iova, 1); > >>>> + > >>>> + g_assert(hostwin); > >>> > >>> Handle the error please. > >> > >> Will this be enough? > >> > >> if (!hostwin) { > >> error_report("%s: Cannot delete missing window at %"HWADDR_PRIx, > >> __func__, min_iova); > >> return; > >> } > > > > Better. I was really thinking to return error to the caller, but if > > the caller has no return path, perhaps this is as good as we can do. > > Expect that I will push back on any assert() calls added to vfio. > > > > > >>>> + QLIST_REMOVE(hostwin, hostwin_next); > >>>> +} > >>>> + > >>>> static bool vfio_listener_skipped_section(MemoryRegionSection *section) > >>>> { > >>>> return (!memory_region_is_ram(section->mr) && > >>>> @@ -392,6 +412,69 @@ static void vfio_listener_region_add(MemoryListener *listener, > >>>> } > >>>> end = int128_get64(int128_sub(llend, int128_one())); > >>>> > >>>> + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { > >>>> + VFIOHostDMAWindow *hostwin; > >>>> + unsigned pagesizes = memory_region_iommu_get_page_sizes(section->mr); > >>>> + unsigned pagesize = (hwaddr)1 << ctz64(pagesizes); > >>>> + unsigned entries, pages; > >>>> + struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) }; > >>>> + > >>>> + trace_vfio_listener_region_add_iommu(iova, end); > >>>> + /* > >>>> + * FIXME: For VFIO iommu types which have KVM acceleration to > >>>> + * avoid bouncing all map/unmaps through qemu this way, this > >>>> + * would be the right place to wire that up (tell the KVM > >>>> + * device emulation the VFIO iommu handles to use). > >>>> + */ > >>>> + create.window_size = int128_get64(section->size); > >>>> + create.page_shift = ctz64(pagesize); > >>>> + /* > >>>> + * SPAPR host supports multilevel TCE tables, there is some > >>>> + * heuristic to decide how many levels we want for our table: > >>>> + * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4 > >>>> + */ > >>>> + entries = create.window_size >> create.page_shift; > >>>> + pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1); > >>>> + pages = MAX(pow2ceil(pages) - 1, 1); /* Round up */ > >>>> + create.levels = ctz64(pages) / 6 + 1; > >>>> + > >>>> + /* For now intersections are not allowed, we may relax this later */ > >>>> + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { > >>>> + if (vfio_host_win_intersects(hostwin, > >>>> + section->offset_within_address_space, > >>>> + section->offset_within_address_space + > >>>> + create.window_size - 1)) { > >>>> + goto fail; > >>>> + } > >>>> + } > >>>> + > >>>> + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); > >>>> + if (ret) { > >>>> + error_report("Failed to create a window, ret = %d (%m)", ret); > >>>> + goto fail; > >>>> + } > >>>> + > >>>> + if (create.start_addr != section->offset_within_address_space) { > >>>> + struct vfio_iommu_spapr_tce_remove remove = { > >>>> + .argsz = sizeof(remove), > >>>> + .start_addr = create.start_addr > >>>> + }; > >>>> + error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64, > >>>> + section->offset_within_address_space, > >>>> + create.start_addr); > >>>> + ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); > >>>> + ret = -EINVAL; > >>>> + goto fail; > >>>> + } > >>>> + trace_vfio_spapr_create_window(create.page_shift, > >>>> + create.window_size, > >>>> + create.start_addr); > >>>> + > >>>> + vfio_host_win_add(container, create.start_addr, > >>>> + create.start_addr + create.window_size - 1, > >>>> + 1ULL << create.page_shift); > >>>> + } > >>> > >>> This is a function on its own, split it out and why not stop pretending > >>> prereg is some sort of generic interface and let's just make a spapr > >>> support file. > >> > >> > >> Yet another new file - spapr.c, or rename prereg.c to spapr.c and add this > >> stuff there? > > > > prereg.c is already spapr specific, so I'd rename it and potentially > > add this to it. > > > It would help if you two decided what I should do about prereg.c vs. > spapr.c - merge or keep 2 files. Thanks. I'm not particularly fussed either way. So I suggest putting the prereg stuff in spapr.c for now, and we can move it out if/when another platform starts using the mechanism.
diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 03daf88..bd2dee8 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -240,6 +240,18 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, return -errno; } +static bool range_contains(hwaddr start, hwaddr end, hwaddr addr) +{ + return start <= addr && addr <= end; +} + +static bool vfio_host_win_intersects(VFIOHostDMAWindow *hostwin, + hwaddr min_iova, hwaddr max_iova) +{ + return range_contains(hostwin->min_iova, hostwin->min_iova, min_iova) || + range_contains(min_iova, max_iova, hostwin->min_iova); +} + static VFIOHostDMAWindow *vfio_host_win_lookup(VFIOContainer *container, hwaddr min_iova, hwaddr max_iova) { @@ -279,6 +291,14 @@ static int vfio_host_win_add(VFIOContainer *container, return 0; } +static void vfio_host_win_del(VFIOContainer *container, hwaddr min_iova) +{ + VFIOHostDMAWindow *hostwin = vfio_host_win_lookup(container, min_iova, 1); + + g_assert(hostwin); + QLIST_REMOVE(hostwin, hostwin_next); +} + static bool vfio_listener_skipped_section(MemoryRegionSection *section) { return (!memory_region_is_ram(section->mr) && @@ -392,6 +412,69 @@ static void vfio_listener_region_add(MemoryListener *listener, } end = int128_get64(int128_sub(llend, int128_one())); + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + VFIOHostDMAWindow *hostwin; + unsigned pagesizes = memory_region_iommu_get_page_sizes(section->mr); + unsigned pagesize = (hwaddr)1 << ctz64(pagesizes); + unsigned entries, pages; + struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) }; + + trace_vfio_listener_region_add_iommu(iova, end); + /* + * FIXME: For VFIO iommu types which have KVM acceleration to + * avoid bouncing all map/unmaps through qemu this way, this + * would be the right place to wire that up (tell the KVM + * device emulation the VFIO iommu handles to use). + */ + create.window_size = int128_get64(section->size); + create.page_shift = ctz64(pagesize); + /* + * SPAPR host supports multilevel TCE tables, there is some + * heuristic to decide how many levels we want for our table: + * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4 + */ + entries = create.window_size >> create.page_shift; + pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1); + pages = MAX(pow2ceil(pages) - 1, 1); /* Round up */ + create.levels = ctz64(pages) / 6 + 1; + + /* For now intersections are not allowed, we may relax this later */ + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (vfio_host_win_intersects(hostwin, + section->offset_within_address_space, + section->offset_within_address_space + + create.window_size - 1)) { + goto fail; + } + } + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); + if (ret) { + error_report("Failed to create a window, ret = %d (%m)", ret); + goto fail; + } + + if (create.start_addr != section->offset_within_address_space) { + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), + .start_addr = create.start_addr + }; + error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64, + section->offset_within_address_space, + create.start_addr); + ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + ret = -EINVAL; + goto fail; + } + trace_vfio_spapr_create_window(create.page_shift, + create.window_size, + create.start_addr); + + vfio_host_win_add(container, create.start_addr, + create.start_addr + create.window_size - 1, + 1ULL << create.page_shift); + } + if (!vfio_host_win_lookup(container, iova, end)) { error_report("vfio: IOMMU container %p can't map guest IOVA region" " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, @@ -566,6 +649,22 @@ static void vfio_listener_region_del(MemoryListener *listener, container, iova, int128_get64(llsize), ret); } + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), + .start_addr = section->offset_within_address_space, + }; + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + if (ret) { + error_report("Failed to remove window at %"PRIx64, + remove.start_addr); + } + + vfio_host_win_del(container, section->offset_within_address_space); + + trace_vfio_spapr_remove_window(remove.start_addr); + } + if (iommu && iommu->iommu_ops && iommu->iommu_ops->vfio_stop) { iommu->iommu_ops->vfio_stop(section->mr); } @@ -957,11 +1056,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) } } - /* - * This only considers the host IOMMU's 32-bit window. At - * some point we need to add support for the optional 64-bit - * window and dynamic windows - */ info.argsz = sizeof(info); ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); if (ret) { @@ -970,11 +1064,30 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) goto listener_release_exit; } - /* The default table uses 4K pages */ - vfio_host_win_add(container, info.dma32_window_start, - info.dma32_window_start + - info.dma32_window_size - 1, - 0x1000); + if (v2) { + /* + * There is a default window in just created container. + * To make region_add/del simpler, we better remove this + * window now and let those iommu_listener callbacks + * create/remove them when needed. + */ + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), + .start_addr = info.dma32_window_start, + }; + ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + if (ret) { + error_report("vfio: VFIO_IOMMU_SPAPR_TCE_REMOVE failed: %m"); + ret = -errno; + goto free_container_exit; + } + } else { + /* The default table uses 4K pages */ + vfio_host_win_add(container, info.dma32_window_start, + info.dma32_window_start + + info.dma32_window_size - 1, + 0x1000); + } } else { error_report("vfio: No available IOMMU models"); ret = -EINVAL; diff --git a/trace-events b/trace-events index d0d8615..b5419de 100644 --- a/trace-events +++ b/trace-events @@ -1739,6 +1739,8 @@ vfio_region_finalize(const char *name, int index) "Device %s, region %d" vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d" vfio_ram_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" vfio_ram_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" +vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64 +vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64 # hw/vfio/platform.c vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
New VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management. This adds ability to VFIO common code to dynamically allocate/remove DMA windows in the host kernel when new VFIO container is added/removed. This adds VFIO_IOMMU_SPAPR_TCE_CREATE ioctl to vfio_listener_region_add and adds just created IOMMU into the host IOMMU list; the opposite action is taken in vfio_listener_region_del. When creating a new window, this uses heuristic to decide on the TCE table levels number. This should cause no guest visible change in behavior. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> --- Changes: v16: * used memory_region_iommu_get_page_sizes() in vfio_listener_region_add() * enforced no intersections between windows v14: * new to the series --- hw/vfio/common.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++----- trace-events | 2 + 2 files changed, 125 insertions(+), 10 deletions(-)