
[qemu,v14,17/18] vfio/spapr: Use VFIO_SPAPR_TCE_v2_IOMMU

Message ID 1458546426-26222-18-git-send-email-aik@ozlabs.ru (mailing list archive)
State New, archived

Commit Message

Alexey Kardashevskiy March 21, 2016, 7:47 a.m. UTC
The new VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management.
This adds the ability for VFIO common code to dynamically allocate/remove
DMA windows in the host kernel when a new VFIO container is added/removed.

This adds a VFIO_IOMMU_SPAPR_TCE_CREATE ioctl call to vfio_listener_region_add
and adds the just-created IOMMU into the host IOMMU list; the opposite
action is taken in vfio_listener_region_del.

When creating a new window, this uses a heuristic to decide on the number
of TCE table levels.

This should cause no guest-visible change in behavior.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v14:
* new to the series

---
TODO:
* export levels to PHB
---
 hw/vfio/common.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 trace-events     |   2 ++
 2 files changed, 105 insertions(+), 5 deletions(-)

Comments

David Gibson March 22, 2016, 5:14 a.m. UTC | #1
On Mon, Mar 21, 2016 at 06:47:05PM +1100, Alexey Kardashevskiy wrote:
> The new VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management.
> This adds the ability for VFIO common code to dynamically allocate/remove
> DMA windows in the host kernel when a new VFIO container is added/removed.
>
> This adds a VFIO_IOMMU_SPAPR_TCE_CREATE ioctl call to vfio_listener_region_add
> and adds the just-created IOMMU into the host IOMMU list; the opposite
> action is taken in vfio_listener_region_del.
>
> When creating a new window, this uses a heuristic to decide on the number
> of TCE table levels.
>
> This should cause no guest-visible change in behavior.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v14:
> * new to the series
> 
> ---
> TODO:
> * export levels to PHB
> ---
>  hw/vfio/common.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  trace-events     |   2 ++
>  2 files changed, 105 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 4e873b7..421d6eb 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -279,6 +279,14 @@ static int vfio_host_iommu_add(VFIOContainer *container,
>      return 0;
>  }
>  
> +static void vfio_host_iommu_del(VFIOContainer *container, hwaddr min_iova)
> +{
> +    VFIOHostIOMMU *hiommu = vfio_host_iommu_lookup(container, min_iova, 0x1000);

The hard-coded 0x1000 looks dubious..

> +    g_assert(hiommu);
> +    QLIST_REMOVE(hiommu, hiommu_next);
> +}
> +
>  static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>  {
>      return (!memory_region_is_ram(section->mr) &&
> @@ -392,6 +400,61 @@ static void vfio_listener_region_add(MemoryListener *listener,
>      }
>      end = int128_get64(llend);
>  
> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {

I think this would be clearer split out into a helper function,
vfio_create_host_window() or something.
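
A sketch of what I mean, assuming the hunk below moves into the helper
mostly verbatim (the name, signature and error handling here are
illustrative, not part of the patch):

    static int vfio_create_host_window(VFIOContainer *container,
                                       MemoryRegionSection *section)
    {
        unsigned entries, pages;
        struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
        int ret;

        create.window_size = memory_region_size(section->mr);
        create.page_shift =
                ctz64(section->mr->iommu_ops->get_page_sizes(section->mr));
        /* Same level heuristic as in the hunk below */
        entries = create.window_size >> create.page_shift;
        pages = (entries * sizeof(uint64_t)) / getpagesize();
        create.levels = ctz64(pow2ceil(pages) - 1) / 6 + 1;

        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
        if (ret) {
            error_report("Failed to create a window, ret = %d (%m)", ret);
            return -errno;
        }
        /* The start_addr sanity check from the hunk below would go here */
        vfio_host_iommu_add(container, create.start_addr,
                            create.start_addr + create.window_size - 1,
                            1ULL << create.page_shift);
        return 0;
    }

The VFIO_SPAPR_TCE_v2_IOMMU branch in vfio_listener_region_add() would
then reduce to a single call.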

> +        unsigned entries, pages;
> +        struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
> +
> +        g_assert(section->mr->iommu_ops);
> +        g_assert(memory_region_is_iommu(section->mr));

I don't think you need these asserts.  AFAICT the same logic should
work if a RAM MR was added directly to PCI address space - this would
create the new host window, then the existing code for adding a RAM MR
would map that block of RAM statically into the new window.

> +        trace_vfio_listener_region_add_iommu(iova, end - 1);
> +        /*
> +         * FIXME: For VFIO iommu types which have KVM acceleration to
> +         * avoid bouncing all map/unmaps through qemu this way, this
> +         * would be the right place to wire that up (tell the KVM
> +         * device emulation the VFIO iommu handles to use).
> +         */
> +        create.window_size = memory_region_size(section->mr);
> +        create.page_shift =
> +                ctz64(section->mr->iommu_ops->get_page_sizes(section->mr));

Ah.. except that I guess you'd need to fall back to host page size
here to handle a RAM MR.
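
E.g. something like this, as a sketch (assuming memory_region_is_iommu()
is the right discriminator here):

    if (memory_region_is_iommu(section->mr)) {
        create.page_shift =
                ctz64(section->mr->iommu_ops->get_page_sizes(section->mr));
    } else {
        /* Plain RAM MR: no guest IOMMU to query, fall back to host pages */
        create.page_shift = ctz64(getpagesize());
    }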

> +        /*
> +         * SPAPR host supports multilevel TCE tables; there is a
> +         * heuristic to decide how many levels we want for our table:
> +         * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4
> +         */
> +        entries = create.window_size >> create.page_shift;
> +        pages = (entries * sizeof(uint64_t)) / getpagesize();
> +        create.levels = ctz64(pow2ceil(pages) - 1) / 6 + 1;
> +
> +        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
> +        if (ret) {
> +            error_report("Failed to create a window, ret = %d (%m)", ret);
> +            goto fail;
> +        }
> +
> +        if (create.start_addr != section->offset_within_address_space ||
> +            vfio_host_iommu_lookup(container, create.start_addr,
> +                                   create.start_addr + create.window_size - 1)) {

Under what circumstances can this trigger?  Is the kernel ioctl
allowed to return a different window start address than the one
requested?

The second check looks very strange - if it returns true, doesn't that
mean you *do* have a host window which can accommodate this guest region,
which is what you want?

> +            struct vfio_iommu_spapr_tce_remove remove = {
> +                .argsz = sizeof(remove),
> +                .start_addr = create.start_addr
> +            };
> +            error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
> +                         section->offset_within_address_space,
> +                         create.start_addr);
> +            ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
> +            ret = -EINVAL;
> +            goto fail;
> +        }
> +        trace_vfio_spapr_create_window(create.page_shift,
> +                                       create.window_size,
> +                                       create.start_addr);
> +
> +        vfio_host_iommu_add(container, create.start_addr,
> +                            create.start_addr + create.window_size - 1,
> +                            1ULL << create.page_shift);
> +    }
> +
>      if (!vfio_host_iommu_lookup(container, iova, end - 1)) {
>          error_report("vfio: IOMMU container %p can't map guest IOVA region"
>                       " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
> @@ -525,6 +588,22 @@ static void vfio_listener_region_del(MemoryListener *listener,
>                       container, iova, end - iova, ret);
>      }
>  
> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
> +        struct vfio_iommu_spapr_tce_remove remove = {
> +            .argsz = sizeof(remove),
> +            .start_addr = section->offset_within_address_space,
> +        };
> +        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
> +        if (ret) {
> +            error_report("Failed to remove window at %"PRIx64,
> +                         remove.start_addr);
> +        }
> +
> +        vfio_host_iommu_del(container, section->offset_within_address_space);
> +
> +        trace_vfio_spapr_remove_window(remove.start_addr);
> +    }
> +
>      if (iommu && iommu->iommu_ops && iommu->iommu_ops->vfio_stop) {
>          iommu->iommu_ops->vfio_stop(section->mr);
>      }
> @@ -928,11 +1007,30 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
>              goto listener_release_exit;
>          }
>  
> -        /* The default table uses 4K pages */
> -        vfio_host_iommu_add(container, info.dma32_window_start,
> -                            info.dma32_window_start +
> -                            info.dma32_window_size - 1,
> -                            0x1000);
> +        if (v2) {
> +            /*
> +             * There is a default window in just created container.
> +             * To make region_add/del simpler, we better remove this
> +             * window now and let those iommu_listener callbacks
> +             * create/remove them when needed.
> +             */
> +            struct vfio_iommu_spapr_tce_remove remove = {
> +                .argsz = sizeof(remove),
> +                .start_addr = info.dma32_window_start,
> +            };
> +            ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
> +            if (ret) {
> +                error_report("vfio: VFIO_IOMMU_SPAPR_TCE_REMOVE failed: %m");
> +                ret = -errno;
> +                goto free_container_exit;
> +            }
> +        } else {
> +            /* The default table uses 4K pages */
> +            vfio_host_iommu_add(container, info.dma32_window_start,
> +                                info.dma32_window_start +
> +                                info.dma32_window_size - 1,
> +                                0x1000);
> +        }
>      } else {
>          error_report("vfio: No available IOMMU models");
>          ret = -EINVAL;
> diff --git a/trace-events b/trace-events
> index cc619e1..f2b75a3 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -1736,6 +1736,8 @@ vfio_region_finalize(const char *name, int index) "Device %s, region %d"
>  vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d"
>  vfio_ram_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
>  vfio_ram_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
> +vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
> +vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
>  
>  # hw/vfio/platform.c
>  vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
Alexey Kardashevskiy March 22, 2016, 5:54 a.m. UTC | #2
On 03/22/2016 04:14 PM, David Gibson wrote:
> On Mon, Mar 21, 2016 at 06:47:05PM +1100, Alexey Kardashevskiy wrote:
>> The new VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management.
>> This adds the ability for VFIO common code to dynamically allocate/remove
>> DMA windows in the host kernel when a new VFIO container is added/removed.
>>
>> This adds a VFIO_IOMMU_SPAPR_TCE_CREATE ioctl call to vfio_listener_region_add
>> and adds the just-created IOMMU into the host IOMMU list; the opposite
>> action is taken in vfio_listener_region_del.
>>
>> When creating a new window, this uses a heuristic to decide on the number
>> of TCE table levels.
>>
>> This should cause no guest-visible change in behavior.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>> Changes:
>> v14:
>> * new to the series
>>
>> ---
>> TODO:
>> * export levels to PHB
>> ---
>>   hw/vfio/common.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>   trace-events     |   2 ++
>>   2 files changed, 105 insertions(+), 5 deletions(-)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 4e873b7..421d6eb 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -279,6 +279,14 @@ static int vfio_host_iommu_add(VFIOContainer *container,
>>       return 0;
>>   }
>>
>> +static void vfio_host_iommu_del(VFIOContainer *container, hwaddr min_iova)
>> +{
>> +    VFIOHostIOMMU *hiommu = vfio_host_iommu_lookup(container, min_iova, 0x1000);
>
> The hard-coded 0x1000 looks dubious..

Well, that's the minimal page size...


>
>> +    g_assert(hiommu);
>> +    QLIST_REMOVE(hiommu, hiommu_next);
>> +}
>> +
>>   static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>>   {
>>       return (!memory_region_is_ram(section->mr) &&
>> @@ -392,6 +400,61 @@ static void vfio_listener_region_add(MemoryListener *listener,
>>       }
>>       end = int128_get64(llend);
>>
>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>
> I think this would be clearer split out into a helper function,
> vfio_create_host_window() or something.


It is rather vfio_spapr_create_host_window() and we were avoiding 
xxx_spapr_xxx so far. I'd cut-n-paste the SPAPR PCI AS listener to a 
separate file but this usually triggers more discussion and never ends well.



>> +        unsigned entries, pages;
>> +        struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
>> +
>> +        g_assert(section->mr->iommu_ops);
>> +        g_assert(memory_region_is_iommu(section->mr));
>
> I don't think you need these asserts.  AFAICT the same logic should
> work if a RAM MR was added directly to PCI address space - this would
> create the new host window, then the existing code for adding a RAM MR
> would map that block of RAM statically into the new window.

In what configuration/machine can we do that on SPAPR?


>> +        trace_vfio_listener_region_add_iommu(iova, end - 1);
>> +        /*
>> +         * FIXME: For VFIO iommu types which have KVM acceleration to
>> +         * avoid bouncing all map/unmaps through qemu this way, this
>> +         * would be the right place to wire that up (tell the KVM
>> +         * device emulation the VFIO iommu handles to use).
>> +         */
>> +        create.window_size = memory_region_size(section->mr);
>> +        create.page_shift =
>> +                ctz64(section->mr->iommu_ops->get_page_sizes(section->mr));
>
> Ah.. except that I guess you'd need to fall back to host page size
> here to handle a RAM MR.

Can you give an example of such RAM MR being added to PCI AS on SPAPR?


>> +        /*
>> +         * SPAPR host supports multilevel TCE tables; there is a
>> +         * heuristic to decide how many levels we want for our table:
>> +         * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4
>> +         */
>> +        entries = create.window_size >> create.page_shift;
>> +        pages = (entries * sizeof(uint64_t)) / getpagesize();
>> +        create.levels = ctz64(pow2ceil(pages) - 1) / 6 + 1;
>> +
>> +        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
>> +        if (ret) {
>> +            error_report("Failed to create a window, ret = %d (%m)", ret);
>> +            goto fail;
>> +        }
>> +
>> +        if (create.start_addr != section->offset_within_address_space ||
>> +            vfio_host_iommu_lookup(container, create.start_addr,
>> +                                   create.start_addr + create.window_size - 1)) {
>
> Under what circumstances can this trigger?  Is the kernel ioctl
> allowed to return a different window start address than the one
> requested?

You already asked this some time ago :) Userspace cannot request an
address; the host kernel returns one.


> The second check looks very strange - if it returns true, doesn't that
> mean you *do* have a host window which can accommodate this guest region,
> which is what you want?

This should not happen; this is what this check is for. I can make it an
assert() or something like this.


>
>> +            struct vfio_iommu_spapr_tce_remove remove = {
>> +                .argsz = sizeof(remove),
>> +                .start_addr = create.start_addr
>> +            };
>> +            error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
>> +                         section->offset_within_address_space,
>> +                         create.start_addr);
>> +            ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
>> +            ret = -EINVAL;
>> +            goto fail;
>> +        }
>> +        trace_vfio_spapr_create_window(create.page_shift,
>> +                                       create.window_size,
>> +                                       create.start_addr);
>> +
>> +        vfio_host_iommu_add(container, create.start_addr,
>> +                            create.start_addr + create.window_size - 1,
>> +                            1ULL << create.page_shift);
>> +    }
>> +
>>       if (!vfio_host_iommu_lookup(container, iova, end - 1)) {
>>           error_report("vfio: IOMMU container %p can't map guest IOVA region"
>>                        " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
>> @@ -525,6 +588,22 @@ static void vfio_listener_region_del(MemoryListener *listener,
>>                        container, iova, end - iova, ret);
>>       }
>>
>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>> +        struct vfio_iommu_spapr_tce_remove remove = {
>> +            .argsz = sizeof(remove),
>> +            .start_addr = section->offset_within_address_space,
>> +        };
>> +        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
>> +        if (ret) {
>> +            error_report("Failed to remove window at %"PRIx64,
>> +                         remove.start_addr);
>> +        }
>> +
>> +        vfio_host_iommu_del(container, section->offset_within_address_space);
>> +
>> +        trace_vfio_spapr_remove_window(remove.start_addr);
>> +    }
>> +
>>       if (iommu && iommu->iommu_ops && iommu->iommu_ops->vfio_stop) {
>>           iommu->iommu_ops->vfio_stop(section->mr);
>>       }
>> @@ -928,11 +1007,30 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
>>               goto listener_release_exit;
>>           }
>>
>> -        /* The default table uses 4K pages */
>> -        vfio_host_iommu_add(container, info.dma32_window_start,
>> -                            info.dma32_window_start +
>> -                            info.dma32_window_size - 1,
>> -                            0x1000);
>> +        if (v2) {
>> +            /*
>> +             * There is a default window in the just-created container.
>> +             * To make region_add/del simpler, it is better to remove this
>> +             * window now and let the iommu_listener callbacks
>> +             * create/remove windows as needed.
>> +             */
>> +            struct vfio_iommu_spapr_tce_remove remove = {
>> +                .argsz = sizeof(remove),
>> +                .start_addr = info.dma32_window_start,
>> +            };
>> +            ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
>> +            if (ret) {
>> +                error_report("vfio: VFIO_IOMMU_SPAPR_TCE_REMOVE failed: %m");
>> +                ret = -errno;
>> +                goto free_container_exit;
>> +            }
>> +        } else {
>> +            /* The default table uses 4K pages */
>> +            vfio_host_iommu_add(container, info.dma32_window_start,
>> +                                info.dma32_window_start +
>> +                                info.dma32_window_size - 1,
>> +                                0x1000);
>> +        }
>>       } else {
>>           error_report("vfio: No available IOMMU models");
>>           ret = -EINVAL;
>> diff --git a/trace-events b/trace-events
>> index cc619e1..f2b75a3 100644
>> --- a/trace-events
>> +++ b/trace-events
>> @@ -1736,6 +1736,8 @@ vfio_region_finalize(const char *name, int index) "Device %s, region %d"
>>   vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d"
>>   vfio_ram_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
>>   vfio_ram_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
>> +vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
>> +vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
>>
>>   # hw/vfio/platform.c
>>   vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
>
David Gibson March 23, 2016, 1:08 a.m. UTC | #3
On Tue, Mar 22, 2016 at 04:54:07PM +1100, Alexey Kardashevskiy wrote:
> On 03/22/2016 04:14 PM, David Gibson wrote:
> >On Mon, Mar 21, 2016 at 06:47:05PM +1100, Alexey Kardashevskiy wrote:
> >>The new VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management.
> >>This adds the ability for VFIO common code to dynamically allocate/remove
> >>DMA windows in the host kernel when a new VFIO container is added/removed.
> >>
> >>This adds a VFIO_IOMMU_SPAPR_TCE_CREATE ioctl call to vfio_listener_region_add
> >>and adds the just-created IOMMU into the host IOMMU list; the opposite
> >>action is taken in vfio_listener_region_del.
> >>
> >>When creating a new window, this uses a heuristic to decide on the number
> >>of TCE table levels.
> >>
> >>This should cause no guest-visible change in behavior.
> >>
> >>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>---
> >>Changes:
> >>v14:
> >>* new to the series
> >>
> >>---
> >>TODO:
> >>* export levels to PHB
> >>---
> >>  hw/vfio/common.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
> >>  trace-events     |   2 ++
> >>  2 files changed, 105 insertions(+), 5 deletions(-)
> >>
> >>diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> >>index 4e873b7..421d6eb 100644
> >>--- a/hw/vfio/common.c
> >>+++ b/hw/vfio/common.c
> >>@@ -279,6 +279,14 @@ static int vfio_host_iommu_add(VFIOContainer *container,
> >>      return 0;
> >>  }
> >>
> >>+static void vfio_host_iommu_del(VFIOContainer *container, hwaddr min_iova)
> >>+{
> >>+    VFIOHostIOMMU *hiommu = vfio_host_iommu_lookup(container, min_iova, 0x1000);
> >
> >The hard-coded 0x1000 looks dubious..
> 
> Well, that's the minimal page size...

Really?  Some BookE CPUs support 1KiB page size..

> >>+    g_assert(hiommu);
> >>+    QLIST_REMOVE(hiommu, hiommu_next);
> >>+}
> >>+
> >>  static bool vfio_listener_skipped_section(MemoryRegionSection *section)
> >>  {
> >>      return (!memory_region_is_ram(section->mr) &&
> >>@@ -392,6 +400,61 @@ static void vfio_listener_region_add(MemoryListener *listener,
> >>      }
> >>      end = int128_get64(llend);
> >>
> >>+    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
> >
> >I think this would be clearer split out into a helper function,
> >vfio_create_host_window() or something.
> 
> 
> It is rather vfio_spapr_create_host_window() and we were avoiding
> xxx_spapr_xxx so far. I'd cut-n-paste the SPAPR PCI AS listener to a
> separate file but this usually triggers more discussion and never ends well.
> 
> 
> 
> >>+        unsigned entries, pages;
> >>+        struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
> >>+
> >>+        g_assert(section->mr->iommu_ops);
> >>+        g_assert(memory_region_is_iommu(section->mr));
> >
> >I don't think you need these asserts.  AFAICT the same logic should
> >work if a RAM MR was added directly to PCI address space - this would
> >create the new host window, then the existing code for adding a RAM MR
> >would map that block of RAM statically into the new window.
> 
> In what configuration/machine can we do that on SPAPR?

spapr guests won't ever do that.  But you can run an x86 guest on a
powernv host and this situation could come up.  In any case there's no
point asserting if the code is correct anyway.

> >>+        trace_vfio_listener_region_add_iommu(iova, end - 1);
> >>+        /*
> >>+         * FIXME: For VFIO iommu types which have KVM acceleration to
> >>+         * avoid bouncing all map/unmaps through qemu this way, this
> >>+         * would be the right place to wire that up (tell the KVM
> >>+         * device emulation the VFIO iommu handles to use).
> >>+         */
> >>+        create.window_size = memory_region_size(section->mr);
> >>+        create.page_shift =
> >>+                ctz64(section->mr->iommu_ops->get_page_sizes(section->mr));
> >
> >Ah.. except that I guess you'd need to fall back to host page size
> >here to handle a RAM MR.
> 
> Can you give an example of such RAM MR being added to PCI AS on
> SPAPR?

On spapr, no.  But you can run other machine types as guests (at least
with TCG) on a host with the spapr IOMMU.

> >>+        /*
> >>+         * SPAPR host supports multilevel TCE tables; there is a
> >>+         * heuristic to decide how many levels we want for our table:
> >>+         * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4
> >>+         */
> >>+        entries = create.window_size >> create.page_shift;
> >>+        pages = (entries * sizeof(uint64_t)) / getpagesize();
> >>+        create.levels = ctz64(pow2ceil(pages) - 1) / 6 + 1;
> >>+
> >>+        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
> >>+        if (ret) {
> >>+            error_report("Failed to create a window, ret = %d (%m)", ret);
> >>+            goto fail;
> >>+        }
> >>+
> >>+        if (create.start_addr != section->offset_within_address_space ||
> >>+            vfio_host_iommu_lookup(container, create.start_addr,
> >>+                                   create.start_addr + create.window_size - 1)) {
> >
> >Under what circumstances can this trigger?  Is the kernel ioctl
> >allowed to return a different window start address than the one
> >requested?
> 
> You already asked this some time ago :) Userspace cannot request an
> address; the host kernel returns one.

Ok.  For generality it would be nice if you could succeed here as long
as the new host window covers the requested guest window, even if it
doesn't match exactly.  And for that matter to not request the new
window if the host already has a window covering the guest region.
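
That is, a containment check rather than an equality check, roughly (a
sketch; guest_start/guest_end are illustrative names):

    hwaddr guest_start = section->offset_within_address_space;
    hwaddr guest_end = guest_start + memory_region_size(section->mr) - 1;

    if (create.start_addr > guest_start ||
        create.start_addr + create.window_size - 1 < guest_end) {
        /* The new host window does not contain the guest window */
        ret = -EINVAL;
        goto fail;
    }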

> >The second check looks very strange - if it returns true, doesn't that
> >mean you *do* have a host window which can accommodate this guest region,
> >which is what you want?
> 
> This should not happen; this is what this check is for. I can make it an
> assert() or something like this.

Oh.. I see.  Because you've done the ioctl, but not recorded the new
host window in the list yet.

No, I think the correct approach is to look for an existing host
window containing the requested guest window *before* you try to
create a new host window.  If one is already there, you can just carry
on.
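
I.e. reorder the SPAPR hunk roughly like this (a sketch reusing the
patch's own helpers; win_start/win_end stand for the section bounds
computed above):

    if (!vfio_host_iommu_lookup(container, win_start, win_end)) {
        /* No existing host window covers the guest window: create one */
        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
        if (ret) {
            error_report("Failed to create a window, ret = %d (%m)", ret);
            goto fail;
        }
        vfio_host_iommu_add(container, create.start_addr,
                            create.start_addr + create.window_size - 1,
                            1ULL << create.page_shift);
    }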

> >>+            struct vfio_iommu_spapr_tce_remove remove = {
> >>+                .argsz = sizeof(remove),
> >>+                .start_addr = create.start_addr
> >>+            };
> >>+            error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
> >>+                         section->offset_within_address_space,
> >>+                         create.start_addr);
> >>+            ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
> >>+            ret = -EINVAL;
> >>+            goto fail;
> >>+        }
> >>+        trace_vfio_spapr_create_window(create.page_shift,
> >>+                                       create.window_size,
> >>+                                       create.start_addr);
> >>+
> >>+        vfio_host_iommu_add(container, create.start_addr,
> >>+                            create.start_addr + create.window_size - 1,
> >>+                            1ULL << create.page_shift);
> >>+    }
> >>+
> >>      if (!vfio_host_iommu_lookup(container, iova, end - 1)) {
> >>          error_report("vfio: IOMMU container %p can't map guest IOVA region"
> >>                       " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
> >>@@ -525,6 +588,22 @@ static void vfio_listener_region_del(MemoryListener *listener,
> >>                       container, iova, end - iova, ret);
> >>      }
> >>
> >>+    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
> >>+        struct vfio_iommu_spapr_tce_remove remove = {
> >>+            .argsz = sizeof(remove),
> >>+            .start_addr = section->offset_within_address_space,
> >>+        };
> >>+        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
> >>+        if (ret) {
> >>+            error_report("Failed to remove window at %"PRIx64,
> >>+                         remove.start_addr);
> >>+        }
> >>+
> >>+        vfio_host_iommu_del(container, section->offset_within_address_space);
> >>+
> >>+        trace_vfio_spapr_remove_window(remove.start_addr);
> >>+    }
> >>+
> >>      if (iommu && iommu->iommu_ops && iommu->iommu_ops->vfio_stop) {
> >>          iommu->iommu_ops->vfio_stop(section->mr);
> >>      }
> >>@@ -928,11 +1007,30 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
> >>              goto listener_release_exit;
> >>          }
> >>
> >>-        /* The default table uses 4K pages */
> >>-        vfio_host_iommu_add(container, info.dma32_window_start,
> >>-                            info.dma32_window_start +
> >>-                            info.dma32_window_size - 1,
> >>-                            0x1000);
> >>+        if (v2) {
> >>+            /*
> >>+             * There is a default window in the just-created container.
> >>+             * To make region_add/del simpler, it is better to remove this
> >>+             * window now and let the iommu_listener callbacks
> >>+             * create/remove windows as needed.
> >>+             */
> >>+            struct vfio_iommu_spapr_tce_remove remove = {
> >>+                .argsz = sizeof(remove),
> >>+                .start_addr = info.dma32_window_start,
> >>+            };
> >>+            ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
> >>+            if (ret) {
> >>+                error_report("vfio: VFIO_IOMMU_SPAPR_TCE_REMOVE failed: %m");
> >>+                ret = -errno;
> >>+                goto free_container_exit;
> >>+            }
> >>+        } else {
> >>+            /* The default table uses 4K pages */
> >>+            vfio_host_iommu_add(container, info.dma32_window_start,
> >>+                                info.dma32_window_start +
> >>+                                info.dma32_window_size - 1,
> >>+                                0x1000);
> >>+        }
> >>      } else {
> >>          error_report("vfio: No available IOMMU models");
> >>          ret = -EINVAL;
> >>diff --git a/trace-events b/trace-events
> >>index cc619e1..f2b75a3 100644
> >>--- a/trace-events
> >>+++ b/trace-events
> >>@@ -1736,6 +1736,8 @@ vfio_region_finalize(const char *name, int index) "Device %s, region %d"
> >>  vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d"
> >>  vfio_ram_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
> >>  vfio_ram_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
> >>+vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
> >>+vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
> >>
> >>  # hw/vfio/platform.c
> >>  vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
> >
> 
>
Alexey Kardashevskiy March 23, 2016, 2:12 a.m. UTC | #4
On 03/23/2016 12:08 PM, David Gibson wrote:
> On Tue, Mar 22, 2016 at 04:54:07PM +1100, Alexey Kardashevskiy wrote:
>> On 03/22/2016 04:14 PM, David Gibson wrote:
>>> On Mon, Mar 21, 2016 at 06:47:05PM +1100, Alexey Kardashevskiy wrote:
>>>> The new VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management.
>>>> This adds the ability for VFIO common code to dynamically allocate/remove
>>>> DMA windows in the host kernel when a new VFIO container is added/removed.
>>>>
>>>> This adds a VFIO_IOMMU_SPAPR_TCE_CREATE ioctl call to vfio_listener_region_add
>>>> and adds the just-created IOMMU into the host IOMMU list; the opposite
>>>> action is taken in vfio_listener_region_del.
>>>>
>>>> When creating a new window, this uses a heuristic to decide on the number
>>>> of TCE table levels.
>>>>
>>>> This should cause no guest-visible change in behavior.
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>> Changes:
>>>> v14:
>>>> * new to the series
>>>>
>>>> ---
>>>> TODO:
>>>> * export levels to PHB
>>>> ---
>>>>   hw/vfio/common.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>>>   trace-events     |   2 ++
>>>>   2 files changed, 105 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>> index 4e873b7..421d6eb 100644
>>>> --- a/hw/vfio/common.c
>>>> +++ b/hw/vfio/common.c
>>>> @@ -279,6 +279,14 @@ static int vfio_host_iommu_add(VFIOContainer *container,
>>>>       return 0;
>>>>   }
>>>>
>>>> +static void vfio_host_iommu_del(VFIOContainer *container, hwaddr min_iova)
>>>> +{
>>>> +    VFIOHostIOMMU *hiommu = vfio_host_iommu_lookup(container, min_iova, 0x1000);
>>>
>>> The hard-coded 0x1000 looks dubious..
>>
>> Well, that's the minimal page size...
>
> Really?  Some BookE CPUs support 1KiB page size..


Hm. For IOMMU? Ok. s/0x1000/1/ should do then :)


>
>>>> +    g_assert(hiommu);
>>>> +    QLIST_REMOVE(hiommu, hiommu_next);
>>>> +}
>>>> +
>>>>   static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>>>>   {
>>>>       return (!memory_region_is_ram(section->mr) &&
>>>> @@ -392,6 +400,61 @@ static void vfio_listener_region_add(MemoryListener *listener,
>>>>       }
>>>>       end = int128_get64(llend);
>>>>
>>>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>>>
>>> I think this would be clearer split out into a helper function,
>>> vfio_create_host_window() or something.
>>
>>
>> It is rather vfio_spapr_create_host_window() and we were avoiding
>> xxx_spapr_xxx so far. I'd cut-n-paste the SPAPR PCI AS listener to a
>> separate file but this usually triggers more discussion and never ends well.
>>
>>
>>
>>>> +        unsigned entries, pages;
>>>> +        struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
>>>> +
>>>> +        g_assert(section->mr->iommu_ops);
>>>> +        g_assert(memory_region_is_iommu(section->mr));
>>>
>>> I don't think you need these asserts.  AFAICT the same logic should
>>> work if a RAM MR was added directly to PCI address space - this would
>>> create the new host window, then the existing code for adding a RAM MR
>>> would map that block of RAM statically into the new window.
>>
>> In what configuration/machine can we do that on SPAPR?
>
> spapr guests won't ever do that.  But you can run an x86 guest on a
> powernv host and this situation could come up.


I am pretty sure VFIO won't work in this case anyway.

> In any case there's no point asserting if the code is correct anyway.

Assert here says (at least) "not tested" or "not expected to happen".


>
>>>> +        trace_vfio_listener_region_add_iommu(iova, end - 1);
>>>> +        /*
>>>> +         * FIXME: For VFIO iommu types which have KVM acceleration to
>>>> +         * avoid bouncing all map/unmaps through qemu this way, this
>>>> +         * would be the right place to wire that up (tell the KVM
>>>> +         * device emulation the VFIO iommu handles to use).
>>>> +         */
>>>> +        create.window_size = memory_region_size(section->mr);
>>>> +        create.page_shift =
>>>> +                ctz64(section->mr->iommu_ops->get_page_sizes(section->mr));
>>>
>>> Ah.. except that I guess you'd need to fall back to host page size
>>> here to handle a RAM MR.
>>
>> Can you give an example of such RAM MR being added to PCI AS on
>> SPAPR?
>
> On spapr, no.  But you can run other machine types as guests (at least
> with TCG) on a host with the spapr IOMMU.
>
>>>> +        /*
>>>> +         * SPAPR host supports multilevel TCE tables; there is a
>>>> +         * heuristic to decide how many levels we want for our table:
>>>> +         * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4
>>>> +         */
>>>> +        entries = create.window_size >> create.page_shift;
>>>> +        pages = (entries * sizeof(uint64_t)) / getpagesize();
>>>> +        create.levels = ctz64(pow2ceil(pages) - 1) / 6 + 1;
>>>> +
>>>> +        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
>>>> +        if (ret) {
>>>> +            error_report("Failed to create a window, ret = %d (%m)", ret);
>>>> +            goto fail;
>>>> +        }
>>>> +
>>>> +        if (create.start_addr != section->offset_within_address_space ||
>>>> +            vfio_host_iommu_lookup(container, create.start_addr,
>>>> +                                   create.start_addr + create.window_size - 1)) {
>>>
>>> Under what circumstances can this trigger?  Is the kernel ioctl
>>> allowed to return a different window start address than the one
>>> requested?
>>
>> You already asked this some time ago :) Userspace cannot request an
>> address; the host kernel returns one.
>
> Ok.  For generality it would be nice if you could succeed here as long
> as the new host window covers the requested guest window, even if it
> doesn't match exactly.  And for that matter to not request the new
> window if the host already has a window covering the guest region.


That would be dead code - when would it possibly work? I mean I could 
instrument an artificial test but the actual user which might appear later 
will likely be soooo different so this won't help anyway.


>>> The second check looks very strange - if it returns true, doesn't that
>>> mean you *do* have a host window which can accommodate this guest region,
>>> which is what you want?
>>
>> This should not happen; this is what this check is for. I can make it an
>> assert() or something like this.
>
> Oh.. I see.  Because you've done the ioctl, but not recorded the new
> host window in the list yet.
>
> No, I think the correct approach is to look for an existing host
> window containing the requested guest window *before* you try to
> create a new host window.  If one is already there, you can just carry
> on.

Right, I'll change this.


>
>>>> +            struct vfio_iommu_spapr_tce_remove remove = {
>>>> +                .argsz = sizeof(remove),
>>>> +                .start_addr = create.start_addr
>>>> +            };
>>>> +            error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
>>>> +                         section->offset_within_address_space,
>>>> +                         create.start_addr);
>>>> +            ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
>>>> +            ret = -EINVAL;
>>>> +            goto fail;
>>>> +        }
>>>> +        trace_vfio_spapr_create_window(create.page_shift,
>>>> +                                       create.window_size,
>>>> +                                       create.start_addr);
>>>> +
>>>> +        vfio_host_iommu_add(container, create.start_addr,
>>>> +                            create.start_addr + create.window_size - 1,
>>>> +                            1ULL << create.page_shift);
>>>> +    }
>>>> +
>>>>       if (!vfio_host_iommu_lookup(container, iova, end - 1)) {
>>>>           error_report("vfio: IOMMU container %p can't map guest IOVA region"
>>>>                        " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
>>>> @@ -525,6 +588,22 @@ static void vfio_listener_region_del(MemoryListener *listener,
>>>>                        container, iova, end - iova, ret);
>>>>       }
>>>>
>>>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>>>> +        struct vfio_iommu_spapr_tce_remove remove = {
>>>> +            .argsz = sizeof(remove),
>>>> +            .start_addr = section->offset_within_address_space,
>>>> +        };
>>>> +        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
>>>> +        if (ret) {
>>>> +            error_report("Failed to remove window at %"PRIx64,
>>>> +                         remove.start_addr);
>>>> +        }
>>>> +
>>>> +        vfio_host_iommu_del(container, section->offset_within_address_space);
>>>> +
>>>> +        trace_vfio_spapr_remove_window(remove.start_addr);
>>>> +    }
>>>> +
>>>>       if (iommu && iommu->iommu_ops && iommu->iommu_ops->vfio_stop) {
>>>>           iommu->iommu_ops->vfio_stop(section->mr);
>>>>       }
>>>> @@ -928,11 +1007,30 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
>>>>               goto listener_release_exit;
>>>>           }
>>>>
>>>> -        /* The default table uses 4K pages */
>>>> -        vfio_host_iommu_add(container, info.dma32_window_start,
>>>> -                            info.dma32_window_start +
>>>> -                            info.dma32_window_size - 1,
>>>> -                            0x1000);
>>>> +        if (v2) {
>>>> +            /*
>>>> +             * There is a default window in the just-created container.
>>>> +             * To make region_add/del simpler, it is better to remove this
>>>> +             * window now and let the iommu_listener callbacks
>>>> +             * create/remove windows as needed.
>>>> +             */
>>>> +            struct vfio_iommu_spapr_tce_remove remove = {
>>>> +                .argsz = sizeof(remove),
>>>> +                .start_addr = info.dma32_window_start,
>>>> +            };
>>>> +            ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
>>>> +            if (ret) {
>>>> +                error_report("vfio: VFIO_IOMMU_SPAPR_TCE_REMOVE failed: %m");
>>>> +                ret = -errno;
>>>> +                goto free_container_exit;
>>>> +            }
>>>> +        } else {
>>>> +            /* The default table uses 4K pages */
>>>> +            vfio_host_iommu_add(container, info.dma32_window_start,
>>>> +                                info.dma32_window_start +
>>>> +                                info.dma32_window_size - 1,
>>>> +                                0x1000);
>>>> +        }
>>>>       } else {
>>>>           error_report("vfio: No available IOMMU models");
>>>>           ret = -EINVAL;
>>>> diff --git a/trace-events b/trace-events
>>>> index cc619e1..f2b75a3 100644
>>>> --- a/trace-events
>>>> +++ b/trace-events
>>>> @@ -1736,6 +1736,8 @@ vfio_region_finalize(const char *name, int index) "Device %s, region %d"
>>>>   vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d"
>>>>   vfio_ram_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
>>>>   vfio_ram_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
>>>> +vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
>>>> +vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
>>>>
>>>>   # hw/vfio/platform.c
>>>>   vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
>>>
>>
>>
>
David Gibson March 23, 2016, 2:53 a.m. UTC | #5
On Wed, Mar 23, 2016 at 01:12:59PM +1100, Alexey Kardashevskiy wrote:
> On 03/23/2016 12:08 PM, David Gibson wrote:
> >On Tue, Mar 22, 2016 at 04:54:07PM +1100, Alexey Kardashevskiy wrote:
> >>On 03/22/2016 04:14 PM, David Gibson wrote:
> >>>On Mon, Mar 21, 2016 at 06:47:05PM +1100, Alexey Kardashevskiy wrote:
> >>>>The new VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management.
> >>>>This adds the ability for VFIO common code to dynamically allocate/remove
> >>>>DMA windows in the host kernel when a new VFIO container is added/removed.
> >>>>
> >>>>This adds a VFIO_IOMMU_SPAPR_TCE_CREATE ioctl call to vfio_listener_region_add
> >>>>and adds the just-created IOMMU into the host IOMMU list; the opposite
> >>>>action is taken in vfio_listener_region_del.
> >>>>
> >>>>When creating a new window, this uses a heuristic to decide on the number
> >>>>of TCE table levels.
> >>>>
> >>>>This should cause no guest-visible change in behavior.
> >>>>
> >>>>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>---
> >>>>Changes:
> >>>>v14:
> >>>>* new to the series
> >>>>
> >>>>---
> >>>>TODO:
> >>>>* export levels to PHB
> >>>>---
> >>>>  hw/vfio/common.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
> >>>>  trace-events     |   2 ++
> >>>>  2 files changed, 105 insertions(+), 5 deletions(-)
> >>>>
> >>>>diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> >>>>index 4e873b7..421d6eb 100644
> >>>>--- a/hw/vfio/common.c
> >>>>+++ b/hw/vfio/common.c
> >>>>@@ -279,6 +279,14 @@ static int vfio_host_iommu_add(VFIOContainer *container,
> >>>>      return 0;
> >>>>  }
> >>>>
> >>>>+static void vfio_host_iommu_del(VFIOContainer *container, hwaddr min_iova)
> >>>>+{
> >>>>+    VFIOHostIOMMU *hiommu = vfio_host_iommu_lookup(container, min_iova, 0x1000);
> >>>
> >>>The hard-coded 0x1000 looks dubious..
> >>
> >>Well, that's the minimal page size...
> >
> >Really?  Some BookE CPUs support 1KiB page size..
> 
> Hm. For IOMMU? Ok. s/0x1000/1/ should do then :)

Uh.. actually I don't think those CPUs generally had an IOMMU.  But if
it's been done for CPU MMU I wouldn't count on it not being done for
IOMMU.

1 is a safer choice.
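
So the lookup in vfio_host_iommu_del() would become (assuming the third
argument of vfio_host_iommu_lookup() is the size of the range being
looked up):

    VFIOHostIOMMU *hiommu = vfio_host_iommu_lookup(container, min_iova, 1);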

> 
> 
> >
> >>>>+    g_assert(hiommu);
> >>>>+    QLIST_REMOVE(hiommu, hiommu_next);
> >>>>+}
> >>>>+
> >>>>  static bool vfio_listener_skipped_section(MemoryRegionSection *section)
> >>>>  {
> >>>>      return (!memory_region_is_ram(section->mr) &&
> >>>>@@ -392,6 +400,61 @@ static void vfio_listener_region_add(MemoryListener *listener,
> >>>>      }
> >>>>      end = int128_get64(llend);
> >>>>
> >>>>+    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
> >>>
> >>>I think this would be clearer split out into a helper function,
> >>>vfio_create_host_window() or something.
> >>
> >>
> >>It is rather vfio_spapr_create_host_window() and we were avoiding
> >>xxx_spapr_xxx so far. I'd cut-n-paste the SPAPR PCI AS listener to a
> >>separate file but this usually triggers more discussion and never ends well.
> >>
> >>
> >>
> >>>>+        unsigned entries, pages;
> >>>>+        struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
> >>>>+
> >>>>+        g_assert(section->mr->iommu_ops);
> >>>>+        g_assert(memory_region_is_iommu(section->mr));
> >>>
> >>>I don't think you need these asserts.  AFAICT the same logic should
> >>>work if a RAM MR was added directly to PCI address space - this would
> >>>create the new host window, then the existing code for adding a RAM MR
> >>>would map that block of RAM statically into the new window.
> >>
> >>In what configuration/machine can we do that on SPAPR?
> >
> >spapr guests won't ever do that.  But you can run an x86 guest on a
> >powernv host and this situation could come up.
> 
> 
> I am pretty sure VFIO won't work in this case anyway.

I'm not.  There's no fundamental reason VFIO shouldn't work with TCG.

> >In any case there's no point asserting if the code is correct anyway.
> 
> Assert here says (at least) "not tested" or "not expected to
> happen".

Hmmm..

> 
> 
> >
> >>>>+        trace_vfio_listener_region_add_iommu(iova, end - 1);
> >>>>+        /*
> >>>>+         * FIXME: For VFIO iommu types which have KVM acceleration to
> >>>>+         * avoid bouncing all map/unmaps through qemu this way, this
> >>>>+         * would be the right place to wire that up (tell the KVM
> >>>>+         * device emulation the VFIO iommu handles to use).
> >>>>+         */
> >>>>+        create.window_size = memory_region_size(section->mr);
> >>>>+        create.page_shift =
> >>>>+                ctz64(section->mr->iommu_ops->get_page_sizes(section->mr));
> >>>
> >>>Ah.. except that I guess you'd need to fall back to host page size
> >>>here to handle a RAM MR.
> >>
> >>Can you give an example of such RAM MR being added to PCI AS on
> >>SPAPR?
> >
> >On spapr, no.  But you can run other machine types as guests (at least
> >with TCG) on a host with the spapr IOMMU.
> >
> >>>>+        /*
> >>>>+         * SPAPR host supports multilevel TCE tables; there is a
> >>>>+         * heuristic to decide how many levels we want for our table:
> >>>>+         * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4
> >>>>+         */
> >>>>+        entries = create.window_size >> create.page_shift;
> >>>>+        pages = (entries * sizeof(uint64_t)) / getpagesize();
> >>>>+        create.levels = ctz64(pow2ceil(pages) - 1) / 6 + 1;
> >>>>+
> >>>>+        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
> >>>>+        if (ret) {
> >>>>+            error_report("Failed to create a window, ret = %d (%m)", ret);
> >>>>+            goto fail;
> >>>>+        }
> >>>>+
> >>>>+        if (create.start_addr != section->offset_within_address_space ||
> >>>>+            vfio_host_iommu_lookup(container, create.start_addr,
> >>>>+                                   create.start_addr + create.window_size - 1)) {
> >>>
> >>>Under what circumstances can this trigger?  Is the kernel ioctl
> >>>allowed to return a different window start address than the one
> >>>requested?
> >>
> >>You already asked this some time ago :) Userspace cannot request an
> >>address; the host kernel returns one.
> >
> >Ok.  For generality it would be nice if you could succeed here as long
> >as the new host window covers the requested guest window, even if it
> >doesn't match exactly.  And for that matter to not request the new
> >window if the host already has a window covering the guest region.
> 
> 
> That would be dead code - when would it possibly work? I mean I could
> instrument an artificial test but the actual user which might appear later
> will likely be soooo different so this won't help anyway.

Hmm, I suppose.  It actually shouldn't be that hard to trigger a case
like this, if you just bumped the bridge's dma64 base address property
up a little bit - above the host kernel's base address, but small
enough that you can still easily fit the guest memory in.

> >>>The second check looks very strange - if it returns true, doesn't that
> >>>mean you *do* have a host window which can accommodate this guest region,
> >>>which is what you want?
> >>
> >>This should not happen; this is what this check is for. I can make it an
> >>assert() or something like this.
> >
> >Oh.. I see.  Because you've done the ioctl, but not recorded the new
> >host window in the list yet.
> >
> >No, I think the correct approach is to look for an existing host
> >window containing the requested guest window *before* you try to
> >create a new host window.  If one is already there, you can just carry
> >on.
> 
> Right, I'll change this.
> 
> 
> >
> >>>>+            struct vfio_iommu_spapr_tce_remove remove = {
> >>>>+                .argsz = sizeof(remove),
> >>>>+                .start_addr = create.start_addr
> >>>>+            };
> >>>>+            error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
> >>>>+                         section->offset_within_address_space,
> >>>>+                         create.start_addr);
> >>>>+            ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
> >>>>+            ret = -EINVAL;
> >>>>+            goto fail;
> >>>>+        }
> >>>>+        trace_vfio_spapr_create_window(create.page_shift,
> >>>>+                                       create.window_size,
> >>>>+                                       create.start_addr);
> >>>>+
> >>>>+        vfio_host_iommu_add(container, create.start_addr,
> >>>>+                            create.start_addr + create.window_size - 1,
> >>>>+                            1ULL << create.page_shift);
> >>>>+    }
> >>>>+
> >>>>      if (!vfio_host_iommu_lookup(container, iova, end - 1)) {
> >>>>          error_report("vfio: IOMMU container %p can't map guest IOVA region"
> >>>>                       " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
> >>>>@@ -525,6 +588,22 @@ static void vfio_listener_region_del(MemoryListener *listener,
> >>>>                       container, iova, end - iova, ret);
> >>>>      }
> >>>>
> >>>>+    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
> >>>>+        struct vfio_iommu_spapr_tce_remove remove = {
> >>>>+            .argsz = sizeof(remove),
> >>>>+            .start_addr = section->offset_within_address_space,
> >>>>+        };
> >>>>+        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
> >>>>+        if (ret) {
> >>>>+            error_report("Failed to remove window at %"PRIx64,
> >>>>+                         remove.start_addr);
> >>>>+        }
> >>>>+
> >>>>+        vfio_host_iommu_del(container, section->offset_within_address_space);
> >>>>+
> >>>>+        trace_vfio_spapr_remove_window(remove.start_addr);
> >>>>+    }
> >>>>+
> >>>>      if (iommu && iommu->iommu_ops && iommu->iommu_ops->vfio_stop) {
> >>>>          iommu->iommu_ops->vfio_stop(section->mr);
> >>>>      }
> >>>>@@ -928,11 +1007,30 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
> >>>>              goto listener_release_exit;
> >>>>          }
> >>>>
> >>>>-        /* The default table uses 4K pages */
> >>>>-        vfio_host_iommu_add(container, info.dma32_window_start,
> >>>>-                            info.dma32_window_start +
> >>>>-                            info.dma32_window_size - 1,
> >>>>-                            0x1000);
> >>>>+        if (v2) {
> >>>>+            /*
> >>>>+             * There is a default window in the just-created container.
> >>>>+             * To make region_add/del simpler, it is better to remove this
> >>>>+             * window now and let the iommu_listener callbacks
> >>>>+             * create/remove windows as needed.
> >>>>+             */
> >>>>+            struct vfio_iommu_spapr_tce_remove remove = {
> >>>>+                .argsz = sizeof(remove),
> >>>>+                .start_addr = info.dma32_window_start,
> >>>>+            };
> >>>>+            ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
> >>>>+            if (ret) {
> >>>>+                error_report("vfio: VFIO_IOMMU_SPAPR_TCE_REMOVE failed: %m");
> >>>>+                ret = -errno;
> >>>>+                goto free_container_exit;
> >>>>+            }
> >>>>+        } else {
> >>>>+            /* The default table uses 4K pages */
> >>>>+            vfio_host_iommu_add(container, info.dma32_window_start,
> >>>>+                                info.dma32_window_start +
> >>>>+                                info.dma32_window_size - 1,
> >>>>+                                0x1000);
> >>>>+        }
> >>>>      } else {
> >>>>          error_report("vfio: No available IOMMU models");
> >>>>          ret = -EINVAL;
> >>>>diff --git a/trace-events b/trace-events
> >>>>index cc619e1..f2b75a3 100644
> >>>>--- a/trace-events
> >>>>+++ b/trace-events
> >>>>@@ -1736,6 +1736,8 @@ vfio_region_finalize(const char *name, int index) "Device %s, region %d"
> >>>>  vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d"
> >>>>  vfio_ram_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
> >>>>  vfio_ram_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
> >>>>+vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
> >>>>+vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
> >>>>
> >>>>  # hw/vfio/platform.c
> >>>>  vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
> >>>
> >>
> >>
> >
> 
>
Alexey Kardashevskiy March 23, 2016, 3:06 a.m. UTC | #6
On 03/23/2016 01:53 PM, David Gibson wrote:
> On Wed, Mar 23, 2016 at 01:12:59PM +1100, Alexey Kardashevskiy wrote:
>> On 03/23/2016 12:08 PM, David Gibson wrote:
>>> On Tue, Mar 22, 2016 at 04:54:07PM +1100, Alexey Kardashevskiy wrote:
>>>> On 03/22/2016 04:14 PM, David Gibson wrote:
>>>>> On Mon, Mar 21, 2016 at 06:47:05PM +1100, Alexey Kardashevskiy wrote:
>>>>>> The new VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management.
>>>>>> This adds the ability for VFIO common code to dynamically allocate/remove
>>>>>> DMA windows in the host kernel when a new VFIO container is added/removed.
>>>>>>
>>>>>> This adds a VFIO_IOMMU_SPAPR_TCE_CREATE ioctl call to vfio_listener_region_add
>>>>>> and adds the just-created IOMMU into the host IOMMU list; the opposite
>>>>>> action is taken in vfio_listener_region_del.
>>>>>>
>>>>>> When creating a new window, this uses a heuristic to decide on the number
>>>>>> of TCE table levels.
>>>>>>
>>>>>> This should cause no guest-visible change in behavior.
>>>>>>
>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>> ---
>>>>>> Changes:
>>>>>> v14:
>>>>>> * new to the series
>>>>>>
>>>>>> ---
>>>>>> TODO:
>>>>>> * export levels to PHB
>>>>>> ---
>>>>>>   hw/vfio/common.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>>>>>   trace-events     |   2 ++
>>>>>>   2 files changed, 105 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>>>> index 4e873b7..421d6eb 100644
>>>>>> --- a/hw/vfio/common.c
>>>>>> +++ b/hw/vfio/common.c
>>>>>> @@ -279,6 +279,14 @@ static int vfio_host_iommu_add(VFIOContainer *container,
>>>>>>       return 0;
>>>>>>   }
>>>>>>
>>>>>> +static void vfio_host_iommu_del(VFIOContainer *container, hwaddr min_iova)
>>>>>> +{
>>>>>> +    VFIOHostIOMMU *hiommu = vfio_host_iommu_lookup(container, min_iova, 0x1000);
>>>>>
>>>>> The hard-coded 0x1000 looks dubious..
>>>>
>>>> Well, that's the minimal page size...
>>>
>>> Really?  Some BookE CPUs support 1KiB page size..
>>
>> Hm. For IOMMU? Ok. s/0x1000/1/ should do then :)
>
> Uh.. actually I don't think those CPUs generally had an IOMMU.  But if
> it's been done for CPU MMU I wouldn't count on it not being done for
> IOMMU.
>
> 1 is a safer choice.
>
>>
>>
>>>
>>>>>> +    g_assert(hiommu);
>>>>>> +    QLIST_REMOVE(hiommu, hiommu_next);
>>>>>> +}
>>>>>> +
>>>>>>   static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>>>>>>   {
>>>>>>       return (!memory_region_is_ram(section->mr) &&
>>>>>> @@ -392,6 +400,61 @@ static void vfio_listener_region_add(MemoryListener *listener,
>>>>>>       }
>>>>>>       end = int128_get64(llend);
>>>>>>
>>>>>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>>>>>
>>>>> I think this would be clearer split out into a helper function,
>>>>> vfio_create_host_window() or something.
>>>>
>>>>
>>>> It is rather vfio_spapr_create_host_window() and we were avoiding
>>>> xxx_spapr_xxx so far. I'd cut-n-paste the SPAPR PCI AS listener to a
>>>> separate file but this usually triggers more discussion and never ends well.
>>>>
>>>>
>>>>
>>>>>> +        unsigned entries, pages;
>>>>>> +        struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
>>>>>> +
>>>>>> +        g_assert(section->mr->iommu_ops);
>>>>>> +        g_assert(memory_region_is_iommu(section->mr));
>>>>>
>>>>> I don't think you need these asserts.  AFAICT the same logic should
>>>>> work if a RAM MR was added directly to PCI address space - this would
>>>>> create the new host window, then the existing code for adding a RAM MR
>>>>> would map that block of RAM statically into the new window.
>>>>
>>>> In what configuration/machine can we do that on SPAPR?
>>>
>>> spapr guests won't ever do that.  But you can run an x86 guest on a
>>> powernv host and this situation could come up.
>>
>>
>> I am pretty sure VFIO won't work in this case anyway.
>
> I'm not.  There's no fundamental reason VFIO shouldn't work with TCG.

This is not about TCG (a pseries TCG guest works with VFIO on a powernv 
host); this is about things like VFIO_IOMMU_GET_INFO vs. 
VFIO_IOMMU_SPAPR_TCE_GET_INFO ioctls. But yes, fundamentally, it can work.
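
For illustration only - a minimal sketch of the two "get info" paths (the 
structs and ioctls are from the VFIO uapi headers; the surrounding code is 
assumed, not taken from this series):

if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
    struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
    /* info.dma32_window_start/size describe the default DMA window */
} else {
    struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };

    ret = ioctl(container->fd, VFIO_IOMMU_GET_INFO, &info);
    /* info.iova_pgsizes is a bitmap of supported IOMMU page sizes */
}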

Should I add such support in this patchset?


>
>>> In any case there's no point asserting if the code is correct anyway.
>>
>> Assert here says (at least) "not tested" or "not expected to
>> happen".
>
> Hmmm..
>
>>
>>
>>>
>>>>>> +        trace_vfio_listener_region_add_iommu(iova, end - 1);
>>>>>> +        /*
>>>>>> +         * FIXME: For VFIO iommu types which have KVM acceleration to
>>>>>> +         * avoid bouncing all map/unmaps through qemu this way, this
>>>>>> +         * would be the right place to wire that up (tell the KVM
>>>>>> +         * device emulation the VFIO iommu handles to use).
>>>>>> +         */
>>>>>> +        create.window_size = memory_region_size(section->mr);
>>>>>> +        create.page_shift =
>>>>>> +                ctz64(section->mr->iommu_ops->get_page_sizes(section->mr));
>>>>>
>>>>> Ah.. except that I guess you'd need to fall back to host page size
>>>>> here to handle a RAM MR.
>>>>
>>>> Can you give an example of such RAM MR being added to PCI AS on
>>>> SPAPR?
>>>
>>> On spapr, no.  But you can run other machine types as guests (at least
>>> with TCG) on a host with the spapr IOMMU.
>>>
>>>>>> +        /*
>>>>>> +         * SPAPR host supports multilevel TCE tables, there is some
>>>>>> +         * heuristic to decide how many levels we want for our table:
>>>>>> +         * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4
>>>>>> +         */
>>>>>> +        entries = create.window_size >> create.page_shift;
>>>>>> +        pages = (entries * sizeof(uint64_t)) / getpagesize();
>>>>>> +        create.levels = ctz64(pow2ceil(pages) - 1) / 6 + 1;
>>>>>> +
>>>>>> +        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
>>>>>> +        if (ret) {
>>>>>> +            error_report("Failed to create a window, ret = %d (%m)", ret);
>>>>>> +            goto fail;
>>>>>> +        }
>>>>>> +
>>>>>> +        if (create.start_addr != section->offset_within_address_space ||
>>>>>> +            vfio_host_iommu_lookup(container, create.start_addr,
>>>>>> +                                   create.start_addr + create.window_size - 1)) {
>>>>>
>>>>> Under what circumstances can this trigger?  Is the kernel ioctl
>>>>> allowed to return a different window start address than the one
>>>>> requested?
>>>>
>>>> You already asked this some time ago :) The userspace cannot request
>>>> address, the host kernel returns one.
>>>
>>> Ok.  For generality it would be nice if you could succeed here as long
>>> as the new host window covers the requested guest window, even if it
>>> doesn't match exactly.  And for that matter to not request the new
>>> window if the host already has a window covering the guest region.
>>
>>
>> That would be dead code - when would it possibly work? I mean I could
>> instrument an artificial test but the actual user which might appear later
>> will likely be soooo different so this won't help anyway.
>
> Hmm, I suppose.  It actually shouldn't be that hard to trigger a case
> like this, if you just bumped the bridge's dma64 base address property
> up a little bit - above the host kernel's base address, but small
> enough that you can still easily fit the guest memory in.


I can test it today for sure, but once committed, we will have to support 
it - which I am trying to avoid until we get a clear picture of what we 
are supporting here.
David Gibson March 23, 2016, 6:03 a.m. UTC | #7
On Wed, Mar 23, 2016 at 02:06:36PM +1100, Alexey Kardashevskiy wrote:
> On 03/23/2016 01:53 PM, David Gibson wrote:
> >On Wed, Mar 23, 2016 at 01:12:59PM +1100, Alexey Kardashevskiy wrote:
> >>On 03/23/2016 12:08 PM, David Gibson wrote:
> >>>On Tue, Mar 22, 2016 at 04:54:07PM +1100, Alexey Kardashevskiy wrote:
> >>>>On 03/22/2016 04:14 PM, David Gibson wrote:
> >>>>>On Mon, Mar 21, 2016 at 06:47:05PM +1100, Alexey Kardashevskiy wrote:
> >>>>>>[... commit message, patch, and earlier discussion snipped ...]
> >>>>>>+        unsigned entries, pages;
> >>>>>>+        struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
> >>>>>>+
> >>>>>>+        g_assert(section->mr->iommu_ops);
> >>>>>>+        g_assert(memory_region_is_iommu(section->mr));
> >>>>>
> >>>>>I don't think you need these asserts.  AFAICT the same logic should
> >>>>>work if a RAM MR was added directly to PCI address space - this would
> >>>>>create the new host window, then the existing code for adding a RAM MR
> >>>>>would map that block of RAM statically into the new window.
> >>>>
> >>>>In what configuration/machine can we do that on SPAPR?
> >>>
> >>>spapr guests won't ever do that.  But you can run an x86 guest on a
> >>>powernv host and this situation could come up.
> >>
> >>
> >>I am pretty sure VFIO won't work in this case anyway.
> >
> >I'm not.  There's no fundamental reason VFIO shouldn't work with TCG.
> 
> This is not about TCG (a pseries TCG guest works with VFIO on a powernv
> host); this is about things like VFIO_IOMMU_GET_INFO vs.
> VFIO_IOMMU_SPAPR_TCE_GET_INFO ioctls. But yes, fundamentally, it can work.
> 
> Should I add such support in this patchset?

Unless adding the generality is really complex, and so far I haven't
seen a reason for it to be.

> [... remainder of quoted message snipped ...]
Alexey Kardashevskiy March 24, 2016, 12:03 a.m. UTC | #8
On 03/23/2016 05:03 PM, David Gibson wrote:
> On Wed, Mar 23, 2016 at 02:06:36PM +1100, Alexey Kardashevskiy wrote:
>> On 03/23/2016 01:53 PM, David Gibson wrote:
>>> On Wed, Mar 23, 2016 at 01:12:59PM +1100, Alexey Kardashevskiy wrote:
>>>> On 03/23/2016 12:08 PM, David Gibson wrote:
>>>>> On Tue, Mar 22, 2016 at 04:54:07PM +1100, Alexey Kardashevskiy wrote:
>>>>>> On 03/22/2016 04:14 PM, David Gibson wrote:
>>>>>>> On Mon, Mar 21, 2016 at 06:47:05PM +1100, Alexey Kardashevskiy wrote:
>>>>>>>> [... commit message, patch, and earlier discussion snipped ...]
>>>>>>>> +        unsigned entries, pages;
>>>>>>>> +        struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
>>>>>>>> +
>>>>>>>> +        g_assert(section->mr->iommu_ops);
>>>>>>>> +        g_assert(memory_region_is_iommu(section->mr));
>>>>>>>
>>>>>>> I don't think you need these asserts.  AFAICT the same logic should
>>>>>>> work if a RAM MR was added directly to PCI address space - this would
>>>>>>> create the new host window, then the existing code for adding a RAM MR
>>>>>>> would map that block of RAM statically into the new window.
>>>>>>
>>>>>> In what configuration/machine can we do that on SPAPR?
>>>>>
>>>>> spapr guests won't ever do that.  But you can run an x86 guest on a
>>>>> powernv host and this situation could come up.
>>>>
>>>>
>>>> I am pretty sure VFIO won't work in this case anyway.
>>>
>>> I'm not.  There's no fundamental reason VFIO shouldn't work with TCG.
>>
>> This is not about TCG (a pseries TCG guest works with VFIO on a powernv
>> host); this is about things like VFIO_IOMMU_GET_INFO vs.
>> VFIO_IOMMU_SPAPR_TCE_GET_INFO ioctls. But yes, fundamentally, it can work.
>>
>> Should I add such support in this patchset?
>
> Unless adding the generality is really complex, and so far I haven't
> seen a reason for it to be.

Seriously? :(
Alexey Kardashevskiy March 24, 2016, 9:10 a.m. UTC | #9
On 03/24/2016 11:03 AM, Alexey Kardashevskiy wrote:
> On 03/23/2016 05:03 PM, David Gibson wrote:
>> On Wed, Mar 23, 2016 at 02:06:36PM +1100, Alexey Kardashevskiy wrote:
>>> On 03/23/2016 01:53 PM, David Gibson wrote:
>>>> On Wed, Mar 23, 2016 at 01:12:59PM +1100, Alexey Kardashevskiy wrote:
>>>>> On 03/23/2016 12:08 PM, David Gibson wrote:
>>>>>> On Tue, Mar 22, 2016 at 04:54:07PM +1100, Alexey Kardashevskiy wrote:
>>>>>>> On 03/22/2016 04:14 PM, David Gibson wrote:
>>>>>>>> On Mon, Mar 21, 2016 at 06:47:05PM +1100, Alexey Kardashevskiy wrote:
>>>>>>>>> [... commit message, patch, and earlier discussion snipped ...]
>>>>>>>>> +        unsigned entries, pages;
>>>>>>>>> +        struct vfio_iommu_spapr_tce_create create = { .argsz =
>>>>>>>>> sizeof(create) };
>>>>>>>>> +
>>>>>>>>> +        g_assert(section->mr->iommu_ops);
>>>>>>>>> +        g_assert(memory_region_is_iommu(section->mr));
>>>>>>>>
>>>>>>>> I don't think you need these asserts.  AFAICT the same logic should
>>>>>>>> work if a RAM MR was added directly to PCI address space - this would
>>>>>>>> create the new host window, then the existing code for adding a RAM MR
>>>>>>>> would map that block of RAM statically into the new window.
>>>>>>>
>>>>>>> In what configuration/machine can we do that on SPAPR?
>>>>>>
>>>>>> spapr guests won't ever do that.  But you can run an x86 guest on a
>>>>>> powernv host and this situation could come up.
>>>>>
>>>>>
>>>>> I am pretty sure VFIO won't work in this case anyway.
>>>>
>>>> I'm not.  There's no fundamental reason VFIO shouldn't work with TCG.
>>>
>>> This is not about TCG (a pseries TCG guest works with VFIO on a powernv
>>> host); this is about things like VFIO_IOMMU_GET_INFO vs.
>>> VFIO_IOMMU_SPAPR_TCE_GET_INFO ioctls. But yes, fundamentally, it can work.
>>>
>>> Should I add such support in this patchset?
>>
>> Unless adding the generality is really complex, and so far I haven't
>> seen a reason for it to be.
>
> Seriously? :(


So, I tried.

With the q35 machine (pc-i440fx-2.6 has an even worse memory tree), there 
are several RAM blocks - 0..0xc0000, then pc.rom, then RAM again up to 2GB, 
then a gap (PCI MMIO?), then the PC BIOS at 0xfffc0000 (which is RAM), and 
after 4GB the rest of the RAM:

memory-region: system
   0000000000000000-ffffffffffffffff (prio 0, RW): system
     0000000000000000-000000007fffffff (prio 0, RW): alias ram-below-4g @pc.ram 0000000000000000-000000007fffffff
     0000000000000000-ffffffffffffffff (prio -1, RW): pci
       00000000000c0000-00000000000dffff (prio 1, RW): pc.rom
       00000000000e0000-00000000000fffff (prio 1, R-): alias isa-bios @pc.bios 0000000000020000-000000000003ffff
       00000000febe0000-00000000febeffff (prio 1, RW): 0003:09:00.0 BAR 0
         00000000febe0000-00000000febeffff (prio 0, RW): 0003:09:00.0 BAR 0 mmaps[0]
       00000000febf0000-00000000febf1fff (prio 1, RW): 0003:09:00.0 BAR 2
         00000000febf0000-00000000febf007f (prio 0, RW): msix-table
         00000000febf1000-00000000febf1007 (prio 0, RW): msix-pba [disabled]
       00000000febf2000-00000000febf2fff (prio 1, RW): ahci
       00000000fffc0000-00000000ffffffff (prio 0, R-): pc.bios
[...]
     0000000100000000-000000027fffffff (prio 0, RW): alias ram-above-4g @pc.ram 0000000080000000-00000001ffffffff


Ok, let's say we filter out "pc.bios", "pc.rom" and the BARs (aka "skip 
dump" regions); we end up with RAM below 2GB and above 4GB, i.e. 2 windows. 
The problem is that a window needs to be aligned to a power of two.

Ok, I change my test from -m8G to -m10G to have 2 aligned regions (2GB and 
8GB); next problem - the second window needs to start at 1<<59, not at 4GB 
or any other random offset.

Ok, I change my test to start with -m1G to have a single window.
Now it fails because here is what happens:
region_add: pc.ram: 0 40000000
region_del: pc.ram: 0 40000000
region_add: pc.ram: 0 c0000
The second "add" is not power of two -> fail. And I cannot avoid this - 
"pc.rom" is still there, it is a separate region so RAM gets split into 
smaller chunks. I do not know to to fix this properly.


So, in order to make it (x86/tcg on a powernv host) work at all, I had to 
create a single huge window (as it is a single window, it starts from @0) 
unconditionally in vfio_connect_container(), not in 
vfio_listener_region_add(); and I added filtering (skip "pc.bios", 
"pc.rom" and BARs - these things start above 1GB). Then I could boot an 
x86_64 guest and even pass through a Mellanox ConnectX-3; it would bring 2 
interfaces up, and dhclient assigned IPs to them (it is quite amusing that 
this even works).
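
Roughly, that filtering hack looks like this (a sketch only - the helper 
name and the name matching are mine, not code from this series):

static bool vfio_section_is_dma_candidate(MemoryRegionSection *section)
{
    const char *name = memory_region_name(section->mr);

    /* skip firmware and BAR chunks that must not become DMA windows */
    if (!strcmp(name, "pc.bios") || !strcmp(name, "pc.rom") ||
        strstr(name, "BAR")) {
        return false;
    }
    return true;
}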


So, I think I will replace assert() with:

unsigned pagesize = qemu_real_host_page_size;
if (section->mr->iommu_ops) {
     pagesize = section->mr->iommu_ops->get_page_sizes(section->mr);
}

but there is no practical use for this anyway.


What am I missing, and what more do I need to try to proceed with this 
patch? Thanks.
David Gibson March 29, 2016, 5:30 a.m. UTC | #10
On Thu, Mar 24, 2016 at 08:10:44PM +1100, Alexey Kardashevskiy wrote:
> On 03/24/2016 11:03 AM, Alexey Kardashevskiy wrote:
> >On 03/23/2016 05:03 PM, David Gibson wrote:
> >>On Wed, Mar 23, 2016 at 02:06:36PM +1100, Alexey Kardashevskiy wrote:
> >>>On 03/23/2016 01:53 PM, David Gibson wrote:
> >>>>On Wed, Mar 23, 2016 at 01:12:59PM +1100, Alexey Kardashevskiy wrote:
> >>>>>On 03/23/2016 12:08 PM, David Gibson wrote:
> >>>>>>On Tue, Mar 22, 2016 at 04:54:07PM +1100, Alexey Kardashevskiy wrote:
> >>>>>>>On 03/22/2016 04:14 PM, David Gibson wrote:
> >>>>>>>>On Mon, Mar 21, 2016 at 06:47:05PM +1100, Alexey Kardashevskiy wrote:
> >>>>>>>>>[... commit message, patch, and earlier discussion snipped ...]
> >>>>>>>>>+        unsigned entries, pages;
> >>>>>>>>>+        struct vfio_iommu_spapr_tce_create create = { .argsz =
> >>>>>>>>>sizeof(create) };
> >>>>>>>>>+
> >>>>>>>>>+        g_assert(section->mr->iommu_ops);
> >>>>>>>>>+        g_assert(memory_region_is_iommu(section->mr));
> >>>>>>>>
> >>>>>>>>I don't think you need these asserts.  AFAICT the same logic should
> >>>>>>>>work if a RAM MR was added directly to PCI address space - this would
> >>>>>>>>create the new host window, then the existing code for adding a RAM MR
> >>>>>>>>would map that block of RAM statically into the new window.
> >>>>>>>
> >>>>>>>In what configuration/machine can we do that on SPAPR?
> >>>>>>
> >>>>>>spapr guests won't ever do that.  But you can run an x86 guest on a
> >>>>>>powernv host and this situation could come up.
> >>>>>
> >>>>>
> >>>>>I am pretty sure VFIO won't work in this case anyway.
> >>>>
> >>>>I'm not.  There's no fundamental reason VFIO shouldn't work with TCG.
> >>>
> >>>This is not about TCG (a pseries TCG guest works with VFIO on a powernv
> >>>host); this is about things like VFIO_IOMMU_GET_INFO vs.
> >>>VFIO_IOMMU_SPAPR_TCE_GET_INFO ioctls. But yes, fundamentally, it can work.
> >>>
> >>>Should I add such support in this patchset?
> >>
> >>Unless adding the generality is really complex, and so far I haven't
> >>seen a reason for it to be.
> >
> >Seriously? :(
> 
> 
> So, I tried.
> 
> With the q35 machine (pc-i440fx-2.6 has an even worse memory tree), there
> are several RAM blocks - 0..0xc0000, then pc.rom, then RAM again up to 2GB,
> then a gap (PCI MMIO?), then the PC BIOS at 0xfffc0000 (which is RAM), and
> after 4GB the rest of the RAM:
> 
> [... memory tree snipped ...]
> 
> 
> Ok, let's say we filter out "pc.bios", "pc.rom" and the BARs (aka "skip
> dump" regions); we end up with RAM below 2GB and above 4GB, i.e. 2 windows.
> The problem is that a window needs to be aligned to a power of two.

Window size?  That should be ok - you can just round up the requested
size to a power of two.  We don't have to populate the whole host window.
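
In terms of the patch below, that would be something like (a sketch, 
assuming pow2ceil() from qemu/host-utils.h):

/* round the host window up to a power of two; only the part that
 * actually gets mapped needs to be populated */
create.window_size = pow2ceil(memory_region_size(section->mr));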

> Ok, I change my test from -m8G to -m10G to have 2 aligned regions - 2GB and
> 8GB), next problem - the second window needs to start from 1<<<59, not 4G or
> any other random offset.

Yeah, that's harder.  With the IODA is it always that the 32-bit
window is at 0 and the 64-bit window is at 1<<59, or can you have a
64-bit window at 0?

If it's the second, then this is theoretically possible, but the host
kernel - on seeing the second window request - would need to recognize
that, instead of adding a new host window, it can extend the first
host window so that it covers both guest windows.

That does sound reasonably tricky and I don't think we should try to
implement it any time soon.  BUT - we should design our *interfaces*
so that it's reasonable to add that in future.

> Ok, I change my test to start with -m1G to have a single window.
> Now it fails because here is what happens:
> region_add: pc.ram: 0 40000000
> region_del: pc.ram: 0 40000000
> region_add: pc.ram: 0 c0000
> The second "add" is not power of two -> fail. And I cannot avoid this -
> "pc.rom" is still there, it is a separate region so RAM gets split into
> smaller chunks. I do not know to to fix this properly.
> 
> 
> So, in order to make it (x86/tcg on a powernv host) work at all, I had to
> create a single huge window (as it is a single window, it starts from @0)
> unconditionally in vfio_connect_container(), not in
> vfio_listener_region_add(); and I added filtering (skip "pc.bios", "pc.rom"
> and BARs - these things start above 1GB). Then I could boot an x86_64 guest
> and even pass through a Mellanox ConnectX-3; it would bring 2 interfaces up,
> and dhclient assigned IPs to them (it is quite amusing that this even works).
> 
> 
> So, I think I will replace assert() with:
> 
> unsigned pagesize = qemu_real_host_page_size;
> if (section->mr->iommu_ops) {
>     pagesize = section->mr->iommu_ops->get_page_sizes(section->mr);
> }
> 
> but there is no practical use for this anyway.

So, I think what you need here is to compute the effective IOMMU page
size (see other email for details).  That will be getrampagesize() for
RAM regions, and the minimum of that and the guest IOMMU page size for
IOMMU regions.
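
A rough sketch of that computation (the helper name and the choice of the
largest IOMMU page are assumptions, not code from this series):

static uint64_t vfio_effective_pagesize(MemoryRegionSection *section)
{
    uint64_t pagesize = getrampagesize();  /* host RAM backing page size */

    if (memory_region_is_iommu(section->mr)) {
        /* the callback returns a bitmap of sizes; assuming it is
         * non-empty, clamp to the largest guest IOMMU page size */
        uint64_t mask = section->mr->iommu_ops->get_page_sizes(section->mr);

        pagesize = MIN(pagesize, 0x8000000000000000ULL >> clz64(mask));
    }
    return pagesize;
}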
Alexey Kardashevskiy March 29, 2016, 5:44 a.m. UTC | #11
On 03/29/2016 04:30 PM, David Gibson wrote:
> On Thu, Mar 24, 2016 at 08:10:44PM +1100, Alexey Kardashevskiy wrote:
>> On 03/24/2016 11:03 AM, Alexey Kardashevskiy wrote:
>>> On 03/23/2016 05:03 PM, David Gibson wrote:
>>>> On Wed, Mar 23, 2016 at 02:06:36PM +1100, Alexey Kardashevskiy wrote:
>>>>> On 03/23/2016 01:53 PM, David Gibson wrote:
>>>>>> On Wed, Mar 23, 2016 at 01:12:59PM +1100, Alexey Kardashevskiy wrote:
>>>>>>> On 03/23/2016 12:08 PM, David Gibson wrote:
>>>>>>>> On Tue, Mar 22, 2016 at 04:54:07PM +1100, Alexey Kardashevskiy wrote:
>>>>>>>>> On 03/22/2016 04:14 PM, David Gibson wrote:
>>>>>>>>>> On Mon, Mar 21, 2016 at 06:47:05PM +1100, Alexey Kardashevskiy wrote:
>>>>>>>>>>> [... commit message, patch, and earlier discussion snipped ...]
>>
>> Ok, let's say we filter out "pc.bios", "pc.rom" and the BARs (aka "skip
>> dump" regions); we end up with RAM below 2GB and above 4GB, i.e. 2 windows.
>> The problem is that a window needs to be aligned to a power of two.
>
> Window size?  That should be ok - you can just round up the requested
> size to a power of two.  We don't have to populate the whole host window.

>> Ok, I change my test from -m8G to -m10G to have 2 aligned regions (2GB and
>> 8GB); next problem - the second window needs to start at 1<<59, not at 4GB
>> or any other random offset.
>
> Yeah, that's harder.  With the IODA is it always that the 32-bit
> window is at 0 and the 64-bit window is at 1<<59, or can you have a
> 64-bit window at 0?

I can have a single window at 0. This is why my (hacked) version works at all.

IODA2 allows having 2 windows; each window can be backed by a TCE table 
with a variable page size or be a bypass, and the two windows' capabilities 
are exactly the same.
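
For illustration (made-up numbers; the levels follow the heuristic in the 
patch below, and the table-size arithmetic assumes a 64K host page size):

/* a 2GB 32-bit window with 4K IOMMU pages ... */
struct vfio_iommu_spapr_tce_create create32 = {
    .argsz = sizeof(create32),
    .page_shift = 12,
    .window_size = 2ULL << 30,
    .levels = 1,
};
/* ... and a 64GB 64-bit window with 64K IOMMU pages */
struct vfio_iommu_spapr_tce_create create64 = {
    .argsz = sizeof(create64),
    .page_shift = 16,
    .window_size = 64ULL << 30,
    .levels = 2,
};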


> If it's the second, then this is theoretically possible, but the host
> kernel - on seeing the second window request - would need to recognize
> that, instead of adding a new host window, it can extend the first
> host window so that it covers both guest windows.
> That does sound reasonably tricky and I don't think we should try to
> implement it any time soon.  BUT - we should design our *interfaces*
> so that it's reasonable to add that in future.
>
>> Ok, I change my test to start with -m1G to have a single window.
>> Now it fails because here is what happens:
>> region_add: pc.ram: 0 40000000
>> region_del: pc.ram: 0 40000000
>> region_add: pc.ram: 0 c0000
>> The second "add" is not power of two -> fail. And I cannot avoid this -
>> "pc.rom" is still there, it is a separate region so RAM gets split into
>> smaller chunks. I do not know to to fix this properly.


Still, how to fix this? Do I need to fix this now? Practically, when do I 
create this single huge window?

>> [... remainder of quoted message snipped ...]
David Gibson March 29, 2016, 6:44 a.m. UTC | #12
On Tue, Mar 29, 2016 at 04:44:04PM +1100, Alexey Kardashevskiy wrote:
> On 03/29/2016 04:30 PM, David Gibson wrote:
> >On Thu, Mar 24, 2016 at 08:10:44PM +1100, Alexey Kardashevskiy wrote:
> >>On 03/24/2016 11:03 AM, Alexey Kardashevskiy wrote:
> >>>On 03/23/2016 05:03 PM, David Gibson wrote:
> >>>>On Wed, Mar 23, 2016 at 02:06:36PM +1100, Alexey Kardashevskiy wrote:
> >>>>>On 03/23/2016 01:53 PM, David Gibson wrote:
> >>>>>>On Wed, Mar 23, 2016 at 01:12:59PM +1100, Alexey Kardashevskiy wrote:
> >>>>>>>On 03/23/2016 12:08 PM, David Gibson wrote:
> >>>>>>>>On Tue, Mar 22, 2016 at 04:54:07PM +1100, Alexey Kardashevskiy wrote:
> >>>>>>>>>On 03/22/2016 04:14 PM, David Gibson wrote:
> >>>>>>>>>>On Mon, Mar 21, 2016 at 06:47:05PM +1100, Alexey Kardashevskiy wrote:
> >>>>>>>>>>>[... commit message, patch, and earlier discussion snipped ...]
> >>Ok, I change my test from -m8G to -m10G to have 2 aligned regions (2GB
> >>and 8GB); next problem - the second window needs to start at 1<<59, not
> >>at 4GB or any other random offset.
> >
> >Yeah, that's harder.  With the IODA is it always that the 32-bit
> >window is at 0 and the 64-bit window is at 1<<59, or can you have a
> >64-bit window at 0?
> 
> I can have a single window at 0. This is why my (hacked) version works at all.
> 
> IODA2 allows having 2 windows; each window can be backed by a TCE table
> with a variable page size or be a bypass, and the two windows' capabilities
> are exactly the same.

Ok.

> >If it's the second, then this is theoretically possible, but the host
> >kernel - on seeing the second window request - would need to recognize
> >that, instead of adding a new host window, it can extend the first
> >host window so that it covers both guest windows.
> >That does sound reasonably tricky and I don't think we should try to
> >implement it any time soon.  BUT - we should design our *interfaces*
> >so that it's reasonable to add that in future.
> >
> >>Ok, I change my test to start with -m1G to have a single window.
> >>Now it fails because here is what happens:
> >>region_add: pc.ram: 0 40000000
> >>region_del: pc.ram: 0 40000000
> >>region_add: pc.ram: 0 c0000
> >>The second "add" is not power of two -> fail. And I cannot avoid this -
> >>"pc.rom" is still there, it is a separate region so RAM gets split into
> >>smaller chunks. I do not know to to fix this properly.
> 
> 
> Still, how to fix this? Do I need to fix this now? Practically, when do I
> create this single huge window?

I don't think you want to do this now.  For now, I think qemu should
just request a window matching the MR, and if the kernel can't supply
it we fail.

In future we can look at extending the kernel so that we try harder to
satisfy VFIO requests for new windows, by merging and/or extending
their requests to cover multiple areas.
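
In terms of the patch below, "request a window matching the MR" is just 
(a sketch; the full hunk is in vfio_listener_region_add() at the bottom):

create.window_size = memory_region_size(section->mr);
ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
if (ret || create.start_addr != section->offset_within_address_space) {
    goto fail;   /* the host cannot supply a matching window */
}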

> >>[... remainder of quoted message snipped ...]
diff mbox

Patch

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 4e873b7..421d6eb 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -279,6 +279,14 @@  static int vfio_host_iommu_add(VFIOContainer *container,
     return 0;
 }
 
+static void vfio_host_iommu_del(VFIOContainer *container, hwaddr min_iova)
+{
+    VFIOHostIOMMU *hiommu = vfio_host_iommu_lookup(container, min_iova, 0x1000);
+
+    g_assert(hiommu);
+    QLIST_REMOVE(hiommu, hiommu_next);
+}
+
 static bool vfio_listener_skipped_section(MemoryRegionSection *section)
 {
     return (!memory_region_is_ram(section->mr) &&
@@ -392,6 +400,61 @@  static void vfio_listener_region_add(MemoryListener *listener,
     }
     end = int128_get64(llend);
 
+    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
+        unsigned entries, pages;
+        struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
+
+        g_assert(section->mr->iommu_ops);
+        g_assert(memory_region_is_iommu(section->mr));
+
+        trace_vfio_listener_region_add_iommu(iova, end - 1);
+        /*
+         * FIXME: For VFIO iommu types which have KVM acceleration to
+         * avoid bouncing all map/unmaps through qemu this way, this
+         * would be the right place to wire that up (tell the KVM
+         * device emulation the VFIO iommu handles to use).
+         */
+        create.window_size = memory_region_size(section->mr);
+        create.page_shift =
+                ctz64(section->mr->iommu_ops->get_page_sizes(section->mr));
+        /*
+         * SPAPR host supports multilevel TCE tables, there is some
+         * heuristic to decide how many levels we want for our table:
+         * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4
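+         * (e.g. a 32GB window of 64K IOMMU pages needs 512K TCEs, i.e.
+         * 4MB of table; on a 64K-page host that is 64 pages = 1 level)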
+         */
+        entries = create.window_size >> create.page_shift;
+        pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1);
+        create.levels = (ctz64(pow2ceil(pages)) - 1) / 6 + 1;
+
+        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
+        if (ret) {
+            error_report("Failed to create a window, ret = %d (%m)", ret);
+            goto fail;
+        }
+
+        if (create.start_addr != section->offset_within_address_space ||
+            vfio_host_iommu_lookup(container, create.start_addr,
+                                   create.start_addr + create.window_size - 1)) {
+            struct vfio_iommu_spapr_tce_remove remove = {
+                .argsz = sizeof(remove),
+                .start_addr = create.start_addr
+            };
+            error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
+                         section->offset_within_address_space,
+                         create.start_addr);
+            ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+            ret = -EINVAL;
+            goto fail;
+        }
+        trace_vfio_spapr_create_window(create.page_shift,
+                                       create.window_size,
+                                       create.start_addr);
+
+        vfio_host_iommu_add(container, create.start_addr,
+                            create.start_addr + create.window_size - 1,
+                            1ULL << create.page_shift);
+    }
+
     if (!vfio_host_iommu_lookup(container, iova, end - 1)) {
         error_report("vfio: IOMMU container %p can't map guest IOVA region"
                      " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
@@ -525,6 +588,22 @@  static void vfio_listener_region_del(MemoryListener *listener,
                      container, iova, end - iova, ret);
     }
 
+    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
+        struct vfio_iommu_spapr_tce_remove remove = {
+            .argsz = sizeof(remove),
+            .start_addr = section->offset_within_address_space,
+        };
+        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+        if (ret) {
+            error_report("Failed to remove window at %"PRIx64,
+                         remove.start_addr);
+        }
+
+        vfio_host_iommu_del(container, section->offset_within_address_space);
+
+        trace_vfio_spapr_remove_window(remove.start_addr);
+    }
+
     if (iommu && iommu->iommu_ops && iommu->iommu_ops->vfio_stop) {
         iommu->iommu_ops->vfio_stop(section->mr);
     }
@@ -928,11 +1007,30 @@  static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
             goto listener_release_exit;
         }
 
-        /* The default table uses 4K pages */
-        vfio_host_iommu_add(container, info.dma32_window_start,
-                            info.dma32_window_start +
-                            info.dma32_window_size - 1,
-                            0x1000);
+        if (v2) {
+            /*
+             * There is a default window in the just created container.
+             * To make region_add/del simpler, we remove this window
+             * now and let the iommu_listener callbacks create/remove
+             * windows when needed.
+             */
+            struct vfio_iommu_spapr_tce_remove remove = {
+                .argsz = sizeof(remove),
+                .start_addr = info.dma32_window_start,
+            };
+            ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+            if (ret) {
+                error_report("vfio: VFIO_IOMMU_SPAPR_TCE_REMOVE failed: %m");
+                ret = -errno;
+                goto free_container_exit;
+            }
+        } else {
+            /* The default table uses 4K pages */
+            vfio_host_iommu_add(container, info.dma32_window_start,
+                                info.dma32_window_start +
+                                info.dma32_window_size - 1,
+                                0x1000);
+        }
     } else {
         error_report("vfio: No available IOMMU models");
         ret = -EINVAL;
diff --git a/trace-events b/trace-events
index cc619e1..f2b75a3 100644
--- a/trace-events
+++ b/trace-events
@@ -1736,6 +1736,8 @@  vfio_region_finalize(const char *name, int index) "Device %s, region %d"
 vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d"
 vfio_ram_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
 vfio_ram_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
+vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
+vfio_spapr_remove_window(uint64_t off) "offset=0x%"PRIx64
 
 # hw/vfio/platform.c
 vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"