[v4,12/15] vfio/common: Support device dirty page tracking with vIOMMU

Message ID 20230622214845.3980-13-joao.m.martins@oracle.com (mailing list archive)
State New, archived
Series vfio: VFIO migration support with vIOMMU

Commit Message

Joao Martins June 22, 2023, 9:48 p.m. UTC
Currently, device dirty page tracking with a vIOMMU is not supported;
a migration blocker is added and migration is prevented.

When a vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly
as requested by the vIOMMU. These IOVA ranges can potentially be mapped
anywhere in the vIOMMU IOVA space as advertised by the VMM.

To support device dirty tracking when a vIOMMU is enabled, instead create
the dirty ranges based on the vIOMMU-provided limits, which leads to
tracking of the whole IOVA space regardless of what the devices actually
use.

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 include/hw/vfio/vfio-common.h |  1 +
 hw/vfio/common.c              | 58 +++++++++++++++++++++++++++++------
 hw/vfio/pci.c                 |  7 +++++
 3 files changed, 56 insertions(+), 10 deletions(-)

Comments

Avihai Horon July 9, 2023, 3:24 p.m. UTC | #1
On 23/06/2023 0:48, Joao Martins wrote:
> Currently, device dirty page tracking with a vIOMMU is not supported;
> a migration blocker is added and migration is prevented.
>
> When a vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly
> as requested by the vIOMMU. These IOVA ranges can potentially be mapped
> anywhere in the vIOMMU IOVA space as advertised by the VMM.
>
> To support device dirty tracking when a vIOMMU is enabled, instead create
> the dirty ranges based on the vIOMMU-provided limits, which leads to
> tracking of the whole IOVA space regardless of what the devices actually
> use.
>
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>   include/hw/vfio/vfio-common.h |  1 +
>   hw/vfio/common.c              | 58 +++++++++++++++++++++++++++++------
>   hw/vfio/pci.c                 |  7 +++++
>   3 files changed, 56 insertions(+), 10 deletions(-)
>
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index f41860988d6b..c4bafad084b4 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -71,6 +71,7 @@ typedef struct VFIOMigration {
>   typedef struct VFIOAddressSpace {
>       AddressSpace *as;
>       bool no_dma_translation;
> +    hwaddr max_iova;
>       QLIST_HEAD(, VFIOContainer) containers;
>       QLIST_ENTRY(VFIOAddressSpace) list;
>   } VFIOAddressSpace;
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index ecfb9afb3fb6..85fddef24026 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -428,6 +428,25 @@ static bool vfio_viommu_preset(void)
>       return false;
>   }
>
> +static int vfio_viommu_get_max_iova(hwaddr *max_iova)
> +{
> +    VFIOAddressSpace *space;
> +
> +    *max_iova = 0;
> +
> +    QLIST_FOREACH(space, &vfio_address_spaces, list) {
> +        if (space->as == &address_space_memory) {
> +            continue;
> +        }
> +
> +        if (*max_iova < space->max_iova) {
> +            *max_iova = space->max_iova;
> +        }
> +    }

Looks like max_iova is a per VFIOAddressSpace property, so why do we 
need to iterate over all address spaces?

Thanks.

> +
> +    return *max_iova == 0;
> +}
> +
>   int vfio_block_giommu_migration(Error **errp)
>   {
>       int ret;
> @@ -1464,10 +1483,11 @@ static const MemoryListener vfio_dirty_tracking_listener = {
>       .region_add = vfio_listener_dirty_tracking_update,
>   };
>
> -static void vfio_dirty_tracking_init(VFIOContainer *container,
> +static int vfio_dirty_tracking_init(VFIOContainer *container,
>                                        VFIODirtyRanges *ranges)
>   {
>       VFIODirtyRangesListener dirty;
> +    int ret;
>
>       memset(&dirty, 0, sizeof(dirty));
>       dirty.ranges.min32 = UINT32_MAX;
> @@ -1475,17 +1495,29 @@ static void vfio_dirty_tracking_init(VFIOContainer *container,
>       dirty.listener = vfio_dirty_tracking_listener;
>       dirty.container = container;
>
> -    memory_listener_register(&dirty.listener,
> -                             container->space->as);
> +    if (vfio_viommu_preset()) {
> +        hwaddr iommu_max_iova;
> +
> +        ret = vfio_viommu_get_max_iova(&iommu_max_iova);
> +        if (ret) {
> +            return -EINVAL;
> +        }
> +
> +        vfio_dirty_tracking_update(0, iommu_max_iova, &dirty.ranges);
> +    } else {
> +        memory_listener_register(&dirty.listener,
> +                                 container->space->as);
> +        /*
> +         * The memory listener is synchronous, and used to calculate the range
> +         * to dirty tracking. Unregister it after we are done as we are not
> +         * interested in any follow-up updates.
> +         */
> +        memory_listener_unregister(&dirty.listener);
> +    }
>
>       *ranges = dirty.ranges;
>
> -    /*
> -     * The memory listener is synchronous, and used to calculate the range
> -     * to dirty tracking. Unregister it after we are done as we are not
> -     * interested in any follow-up updates.
> -     */
> -    memory_listener_unregister(&dirty.listener);
> +    return 0;
>   }
>
>   static void vfio_devices_dma_logging_stop(VFIOContainer *container)
> @@ -1590,7 +1622,13 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container)
>       VFIOGroup *group;
>       int ret = 0;
>
> -    vfio_dirty_tracking_init(container, &ranges);
> +    ret = vfio_dirty_tracking_init(container, &ranges);
> +    if (ret) {
> +        error_report("Failed to init DMA logging ranges, err %d",
> +                      ret);
> +        return -EOPNOTSUPP;
> +    }
> +
>       feature = vfio_device_feature_dma_logging_start_create(container,
>                                                              &ranges);
>       if (!feature) {
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 8a98e6ffc480..3bda5618c5b5 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2974,6 +2974,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>                                 &dma_translation);
>       space->no_dma_translation = !dma_translation;
>
> +    /*
> +     * Support for advertised IOMMU address space boundaries is optional.
> +     * By default, it is not advertised i.e. space::max_iova is 0.
> +     */
> +    pci_device_iommu_get_attr(pdev, IOMMU_ATTR_MAX_IOVA,
> +                              &space->max_iova);
> +
>       QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
>           if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
>               error_setg(errp, "device is already attached");
> --
> 2.17.2
>
Joao Martins July 10, 2023, 1:49 p.m. UTC | #2
On 09/07/2023 16:24, Avihai Horon wrote:
> On 23/06/2023 0:48, Joao Martins wrote:
>> Currently, device dirty page tracking with a vIOMMU is not supported;
>> a migration blocker is added and migration is prevented.
>>
>> When a vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly
>> as requested by the vIOMMU. These IOVA ranges can potentially be mapped
>> anywhere in the vIOMMU IOVA space as advertised by the VMM.
>>
>> To support device dirty tracking when a vIOMMU is enabled, instead create
>> the dirty ranges based on the vIOMMU-provided limits, which leads to
>> tracking of the whole IOVA space regardless of what the devices actually
>> use.
>>
>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>>   include/hw/vfio/vfio-common.h |  1 +
>>   hw/vfio/common.c              | 58 +++++++++++++++++++++++++++++------
>>   hw/vfio/pci.c                 |  7 +++++
>>   3 files changed, 56 insertions(+), 10 deletions(-)
>>
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index f41860988d6b..c4bafad084b4 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -71,6 +71,7 @@ typedef struct VFIOMigration {
>>   typedef struct VFIOAddressSpace {
>>       AddressSpace *as;
>>       bool no_dma_translation;
>> +    hwaddr max_iova;
>>       QLIST_HEAD(, VFIOContainer) containers;
>>       QLIST_ENTRY(VFIOAddressSpace) list;
>>   } VFIOAddressSpace;
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index ecfb9afb3fb6..85fddef24026 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -428,6 +428,25 @@ static bool vfio_viommu_preset(void)
>>       return false;
>>   }
>>
>> +static int vfio_viommu_get_max_iova(hwaddr *max_iova)
>> +{
>> +    VFIOAddressSpace *space;
>> +
>> +    *max_iova = 0;
>> +
>> +    QLIST_FOREACH(space, &vfio_address_spaces, list) {
>> +        if (space->as == &address_space_memory) {
>> +            continue;
>> +        }
>> +
>> +        if (*max_iova < space->max_iova) {
>> +            *max_iova = space->max_iova;
>> +        }
>> +    }
> 
> Looks like max_iova is a per VFIOAddressSpace property, so why do we need to
> iterate over all address spaces?
> 

This was more future-proofing for when QEMU supports multiple vIOMMUs. In
theory this tracks the device address space, and if two different devices stand
behind different vIOMMUs, then this loop would compute the highest IOVA that
the host device dirty tracker would need to cover.

But I realize this might introduce unnecessary complexity, and we should 'obey'
the advertised vIOMMU max_iova for the device. With Zhenzhong's blocker cleanup
I can make this just fetch the max_iova of the device's own space and be done
with it.
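
i.e. something like this minimal sketch (the exact signature is hypothetical
and depends on how that cleanup lands):

    static int vfio_viommu_get_max_iova(VFIOContainer *container,
                                        hwaddr *max_iova)
    {
        /* Obey only the limit advertised for this device's space. */
        *max_iova = container->space->max_iova;

        return *max_iova == 0;
    }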

	Joao

> Thanks.
> 
>> +
>> +    return *max_iova == 0;
>> +}
>> +
>>   int vfio_block_giommu_migration(Error **errp)
>>   {
>>       int ret;
>> @@ -1464,10 +1483,11 @@ static const MemoryListener
>> vfio_dirty_tracking_listener = {
>>       .region_add = vfio_listener_dirty_tracking_update,
>>   };
>>
>> -static void vfio_dirty_tracking_init(VFIOContainer *container,
>> +static int vfio_dirty_tracking_init(VFIOContainer *container,
>>                                        VFIODirtyRanges *ranges)
>>   {
>>       VFIODirtyRangesListener dirty;
>> +    int ret;
>>
>>       memset(&dirty, 0, sizeof(dirty));
>>       dirty.ranges.min32 = UINT32_MAX;
>> @@ -1475,17 +1495,29 @@ static void vfio_dirty_tracking_init(VFIOContainer
>> *container,
>>       dirty.listener = vfio_dirty_tracking_listener;
>>       dirty.container = container;
>>
>> -    memory_listener_register(&dirty.listener,
>> -                             container->space->as);
>> +    if (vfio_viommu_preset()) {
>> +        hwaddr iommu_max_iova;
>> +
>> +        ret = vfio_viommu_get_max_iova(&iommu_max_iova);
>> +        if (ret) {
>> +            return -EINVAL;
>> +        }
>> +
>> +        vfio_dirty_tracking_update(0, iommu_max_iova, &dirty.ranges);
>> +    } else {
>> +        memory_listener_register(&dirty.listener,
>> +                                 container->space->as);
>> +        /*
>> +         * The memory listener is synchronous, and used to calculate the range
>> +         * to dirty tracking. Unregister it after we are done as we are not
>> +         * interested in any follow-up updates.
>> +         */
>> +        memory_listener_unregister(&dirty.listener);
>> +    }
>>
>>       *ranges = dirty.ranges;
>>
>> -    /*
>> -     * The memory listener is synchronous, and used to calculate the range
>> -     * to dirty tracking. Unregister it after we are done as we are not
>> -     * interested in any follow-up updates.
>> -     */
>> -    memory_listener_unregister(&dirty.listener);
>> +    return 0;
>>   }
>>
>>   static void vfio_devices_dma_logging_stop(VFIOContainer *container)
>> @@ -1590,7 +1622,13 @@ static int vfio_devices_dma_logging_start(VFIOContainer
>> *container)
>>       VFIOGroup *group;
>>       int ret = 0;
>>
>> -    vfio_dirty_tracking_init(container, &ranges);
>> +    ret = vfio_dirty_tracking_init(container, &ranges);
>> +    if (ret) {
>> +        error_report("Failed to init DMA logging ranges, err %d",
>> +                      ret);
>> +        return -EOPNOTSUPP;
>> +    }
>> +
>>       feature = vfio_device_feature_dma_logging_start_create(container,
>>                                                              &ranges);
>>       if (!feature) {
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 8a98e6ffc480..3bda5618c5b5 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -2974,6 +2974,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>>                                 &dma_translation);
>>       space->no_dma_translation = !dma_translation;
>>
>> +    /*
>> +     * Support for advertised IOMMU address space boundaries is optional.
>> +     * By default, it is not advertised i.e. space::max_iova is 0.
>> +     */
>> +    pci_device_iommu_get_attr(pdev, IOMMU_ATTR_MAX_IOVA,
>> +                              &space->max_iova);
>> +
>>       QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
>>           if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
>>               error_setg(errp, "device is already attached");
>> -- 
>> 2.17.2
>>
Duan, Zhenzhong Sept. 8, 2023, 6:11 a.m. UTC | #3
Hi Joao,

On 6/23/2023 5:48 AM, Joao Martins wrote:
> Currently, device dirty page tracking with a vIOMMU is not supported;
> a migration blocker is added and migration is prevented.
>
> When a vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly
> as requested by the vIOMMU. These IOVA ranges can potentially be mapped
> anywhere in the vIOMMU IOVA space as advertised by the VMM.
>
> To support device dirty tracking when a vIOMMU is enabled, instead create
> the dirty ranges based on the vIOMMU-provided limits, which leads to
> tracking of the whole IOVA space regardless of what the devices actually
> use.
>
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>   include/hw/vfio/vfio-common.h |  1 +
>   hw/vfio/common.c              | 58 +++++++++++++++++++++++++++++------
>   hw/vfio/pci.c                 |  7 +++++
>   3 files changed, 56 insertions(+), 10 deletions(-)
>
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index f41860988d6b..c4bafad084b4 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -71,6 +71,7 @@ typedef struct VFIOMigration {
>   typedef struct VFIOAddressSpace {
>       AddressSpace *as;
>       bool no_dma_translation;
> +    hwaddr max_iova;
>       QLIST_HEAD(, VFIOContainer) containers;
>       QLIST_ENTRY(VFIOAddressSpace) list;
>   } VFIOAddressSpace;
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index ecfb9afb3fb6..85fddef24026 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -428,6 +428,25 @@ static bool vfio_viommu_preset(void)
>       return false;
>   }
>   
> +static int vfio_viommu_get_max_iova(hwaddr *max_iova)
> +{
> +    VFIOAddressSpace *space;
> +
> +    *max_iova = 0;
> +
> +    QLIST_FOREACH(space, &vfio_address_spaces, list) {
> +        if (space->as == &address_space_memory) {
> +            continue;
> +        }

Just curious why address_space_memory is bypassed?

Imagine two vfio devices linked to two host bridges, one with bypass_iommu set
and the other not. Don't we need to include the guest memory ranges in
address_space_memory?

> +
> +        if (*max_iova < space->max_iova) {
> +            *max_iova = space->max_iova;
> +        }
> +    }
> +
> +    return *max_iova == 0;
> +}
> +
>   int vfio_block_giommu_migration(Error **errp)
>   {
>       int ret;
> @@ -1464,10 +1483,11 @@ static const MemoryListener vfio_dirty_tracking_listener = {
>       .region_add = vfio_listener_dirty_tracking_update,
>   };
>   
> -static void vfio_dirty_tracking_init(VFIOContainer *container,
> +static int vfio_dirty_tracking_init(VFIOContainer *container,
>                                        VFIODirtyRanges *ranges)
>   {
>       VFIODirtyRangesListener dirty;
> +    int ret;
>   
>       memset(&dirty, 0, sizeof(dirty));
>       dirty.ranges.min32 = UINT32_MAX;
> @@ -1475,17 +1495,29 @@ static void vfio_dirty_tracking_init(VFIOContainer *container,
>       dirty.listener = vfio_dirty_tracking_listener;
>       dirty.container = container;
>   
> -    memory_listener_register(&dirty.listener,
> -                             container->space->as);
> +    if (vfio_viommu_preset()) {
> +        hwaddr iommu_max_iova;
> +
> +        ret = vfio_viommu_get_max_iova(&iommu_max_iova);
> +        if (ret) {
> +            return -EINVAL;
> +        }
> +
> +        vfio_dirty_tracking_update(0, iommu_max_iova, &dirty.ranges);
> +    } else {
> +        memory_listener_register(&dirty.listener,
> +                                 container->space->as);
> +        /*
> +         * The memory listener is synchronous, and used to calculate the range
> +         * to dirty tracking. Unregister it after we are done as we are not
> +         * interested in any follow-up updates.
> +         */
> +        memory_listener_unregister(&dirty.listener);
> +    }
>   
>       *ranges = dirty.ranges;
>   
> -    /*
> -     * The memory listener is synchronous, and used to calculate the range
> -     * to dirty tracking. Unregister it after we are done as we are not
> -     * interested in any follow-up updates.
> -     */
> -    memory_listener_unregister(&dirty.listener);
> +    return 0;
>   }
>   
>   static void vfio_devices_dma_logging_stop(VFIOContainer *container)
> @@ -1590,7 +1622,13 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container)
>       VFIOGroup *group;
>       int ret = 0;
>   
> -    vfio_dirty_tracking_init(container, &ranges);
> +    ret = vfio_dirty_tracking_init(container, &ranges);
> +    if (ret) {
> +        error_report("Failed to init DMA logging ranges, err %d",
> +                      ret);
> +        return -EOPNOTSUPP;
> +    }
> +
>       feature = vfio_device_feature_dma_logging_start_create(container,
>                                                              &ranges);

It's not clear how much the dirty range size could impact device dirty
tracking. Maybe some devices linked to a vIOMMU with small aw_bits, or
bypassing the vIOMMU with small guest memory, could benefit if we used
per-address-space dirty ranges.

Thanks

Zhenzhong
Joao Martins Sept. 8, 2023, 10:11 a.m. UTC | #4
On 08/09/2023 07:11, Duan, Zhenzhong wrote:
> Hi Joao,
> 
> On 6/23/2023 5:48 AM, Joao Martins wrote:
>> Currently, device dirty page tracking with a vIOMMU is not supported;
>> a migration blocker is added and migration is prevented.
>>
>> When a vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly
>> as requested by the vIOMMU. These IOVA ranges can potentially be mapped
>> anywhere in the vIOMMU IOVA space as advertised by the VMM.
>>
>> To support device dirty tracking when a vIOMMU is enabled, instead create
>> the dirty ranges based on the vIOMMU-provided limits, which leads to
>> tracking of the whole IOVA space regardless of what the devices actually
>> use.
>>
>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>>   include/hw/vfio/vfio-common.h |  1 +
>>   hw/vfio/common.c              | 58 +++++++++++++++++++++++++++++------
>>   hw/vfio/pci.c                 |  7 +++++
>>   3 files changed, 56 insertions(+), 10 deletions(-)
>>
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index f41860988d6b..c4bafad084b4 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -71,6 +71,7 @@ typedef struct VFIOMigration {
>>   typedef struct VFIOAddressSpace {
>>       AddressSpace *as;
>>       bool no_dma_translation;
>> +    hwaddr max_iova;
>>       QLIST_HEAD(, VFIOContainer) containers;
>>       QLIST_ENTRY(VFIOAddressSpace) list;
>>   } VFIOAddressSpace;
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index ecfb9afb3fb6..85fddef24026 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -428,6 +428,25 @@ static bool vfio_viommu_preset(void)
>>       return false;
>>   }
>>   +static int vfio_viommu_get_max_iova(hwaddr *max_iova)
>> +{
>> +    VFIOAddressSpace *space;
>> +
>> +    *max_iova = 0;
>> +
>> +    QLIST_FOREACH(space, &vfio_address_spaces, list) {
>> +        if (space->as == &address_space_memory) {
>> +            continue;
>> +        }
> 
> Just curious why address_space_memory is bypassed?
> 

The address_space_memory part is done by the memory listener, but I see your
point conceptually about not considering it.

> Imagine two vfio devices linked to two host bridges, one with bypass_iommu
> set and the other not. Don't we need to include the guest memory ranges in
> address_space_memory?

I am probably making a bad assumption that the vIOMMU maximum address space is
a superset of the CPU address space. But as you just reminded me, there is a
use case where all it takes is one bypass_iommu=true on a 2T guest setup with
aw-bits=39 (aw-bits=39 caps the tracked range at 512G, so the bypassing device
could dirty guest memory above that).

I'll rework this to consider this.

> 
>> +
>> +        if (*max_iova < space->max_iova) {
>> +            *max_iova = space->max_iova;
>> +        }
>> +    }
>> +
>> +    return *max_iova == 0;
>> +}
>> +
>>   int vfio_block_giommu_migration(Error **errp)
>>   {
>>       int ret;
>> @@ -1464,10 +1483,11 @@ static const MemoryListener
>> vfio_dirty_tracking_listener = {
>>       .region_add = vfio_listener_dirty_tracking_update,
>>   };
>>   -static void vfio_dirty_tracking_init(VFIOContainer *container,
>> +static int vfio_dirty_tracking_init(VFIOContainer *container,
>>                                        VFIODirtyRanges *ranges)
>>   {
>>       VFIODirtyRangesListener dirty;
>> +    int ret;
>>         memset(&dirty, 0, sizeof(dirty));
>>       dirty.ranges.min32 = UINT32_MAX;
>> @@ -1475,17 +1495,29 @@ static void vfio_dirty_tracking_init(VFIOContainer
>> *container,
>>       dirty.listener = vfio_dirty_tracking_listener;
>>       dirty.container = container;
>>   -    memory_listener_register(&dirty.listener,
>> -                             container->space->as);
>> +    if (vfio_viommu_preset()) {
>> +        hwaddr iommu_max_iova;
>> +
>> +        ret = vfio_viommu_get_max_iova(&iommu_max_iova);
>> +        if (ret) {
>> +            return -EINVAL;
>> +        }
>> +
>> +        vfio_dirty_tracking_update(0, iommu_max_iova, &dirty.ranges);
>> +    } else {
>> +        memory_listener_register(&dirty.listener,
>> +                                 container->space->as);
>> +        /*
>> +         * The memory listener is synchronous, and used to calculate the range
>> +         * to dirty tracking. Unregister it after we are done as we are not
>> +         * interested in any follow-up updates.
>> +         */
>> +        memory_listener_unregister(&dirty.listener);
>> +    }
>>         *ranges = dirty.ranges;
>>   -    /*
>> -     * The memory listener is synchronous, and used to calculate the range
>> -     * to dirty tracking. Unregister it after we are done as we are not
>> -     * interested in any follow-up updates.
>> -     */
>> -    memory_listener_unregister(&dirty.listener);
>> +    return 0;
>>   }
>>     static void vfio_devices_dma_logging_stop(VFIOContainer *container)
>> @@ -1590,7 +1622,13 @@ static int vfio_devices_dma_logging_start(VFIOContainer
>> *container)
>>       VFIOGroup *group;
>>       int ret = 0;
>>   -    vfio_dirty_tracking_init(container, &ranges);
>> +    ret = vfio_dirty_tracking_init(container, &ranges);
>> +    if (ret) {
>> +        error_report("Failed to init DMA logging ranges, err %d",
>> +                      ret);
>> +        return -EOPNOTSUPP;
>> +    }
>> +
>>       feature = vfio_device_feature_dma_logging_start_create(container,
>>                                                              &ranges);
> 
> It's not clear how much the dirty range size could impact device dirty
> tracking. Maybe some devices linked to a vIOMMU with small aw_bits, or
> bypassing the vIOMMU
> 

So, my intended use case with this series starts with DMA_TRANSLATION=off,
where the vIOMMU is restricted to interrupt remapping and the guest only uses
it for that. Right now this is only supported by intel-iommu, but my
understanding of the AMD IOMMU is that it is also possible there (haven't
checked smmu-v3). This unblocks guests with more than 255 vCPUs. I have
another patch separate from this one that hopefully relaxes the vIOMMU blocker
for these older guests.

Now with real emulated vIOMMU DMA translation usage, intel-iommu only offers
39, 48 or 57 bits of address space width (mimicking the page table levels).
And it's true that only the first value is supportable, and somewhat limiting
as you say.
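
(For reference, the intel-iommu answer to IOMMU_ATTR_MAX_IOVA presumably
boils down to the configured width; a hypothetical sketch, not the actual
hunk from earlier in this series:

    static hwaddr vtd_max_iova(IntelIOMMUState *s)
    {
        /* aw_bits is the configured address width: 39, 48 or 57. */
        return (1ULL << s->aw_bits) - 1;
    }

so aw-bits=39 caps the trackable IOVA space at 512G.)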

virtio-iommu has more going for it there, as you can limit the input IOVA
range to the size of the CPU address space. Eric's latest series[0] is
actually nice on that end of fixing that issue (the alternative I had there
was to introduce a gaw-bits equivalent, but it's better done like [0]).

[0]
https://lore.kernel.org/qemu-devel/20230904080451.424731-1-eric.auger@redhat.com/

> with small guest memory, could benefit if we used per-address-space dirty
> ranges.
> 


Yeah, I'll need to fix that for when there's big guest memory and a small
vIOMMU address space. Hopefully I picked up on your comment right :)
Duan, Zhenzhong Sept. 8, 2023, 11:52 a.m. UTC | #5
On 9/8/2023 6:11 PM, Joao Martins wrote:
> On 08/09/2023 07:11, Duan, Zhenzhong wrote:
>> Hi Joao,
>>
>> On 6/23/2023 5:48 AM, Joao Martins wrote:
>>> Currently, device dirty page tracking with a vIOMMU is not supported;
>>> a migration blocker is added and migration is prevented.
>>>
>>> When a vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly
>>> as requested by the vIOMMU. These IOVA ranges can potentially be mapped
>>> anywhere in the vIOMMU IOVA space as advertised by the VMM.
>>>
>>> To support device dirty tracking when a vIOMMU is enabled, instead create
>>> the dirty ranges based on the vIOMMU-provided limits, which leads to
>>> tracking of the whole IOVA space regardless of what the devices actually
>>> use.
>>>
>>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>> ---
>>>    include/hw/vfio/vfio-common.h |  1 +
>>>    hw/vfio/common.c              | 58 +++++++++++++++++++++++++++++------
>>>    hw/vfio/pci.c                 |  7 +++++
>>>    3 files changed, 56 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>>> index f41860988d6b..c4bafad084b4 100644
>>> --- a/include/hw/vfio/vfio-common.h
>>> +++ b/include/hw/vfio/vfio-common.h
>>> @@ -71,6 +71,7 @@ typedef struct VFIOMigration {
>>>    typedef struct VFIOAddressSpace {
>>>        AddressSpace *as;
>>>        bool no_dma_translation;
>>> +    hwaddr max_iova;
>>>        QLIST_HEAD(, VFIOContainer) containers;
>>>        QLIST_ENTRY(VFIOAddressSpace) list;
>>>    } VFIOAddressSpace;
>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>> index ecfb9afb3fb6..85fddef24026 100644
>>> --- a/hw/vfio/common.c
>>> +++ b/hw/vfio/common.c
>>> @@ -428,6 +428,25 @@ static bool vfio_viommu_preset(void)
>>>        return false;
>>>    }
>>>    +static int vfio_viommu_get_max_iova(hwaddr *max_iova)
>>> +{
>>> +    VFIOAddressSpace *space;
>>> +
>>> +    *max_iova = 0;
>>> +
>>> +    QLIST_FOREACH(space, &vfio_address_spaces, list) {
>>> +        if (space->as == &address_space_memory) {
>>> +            continue;
>>> +        }
>> Just curious why address_space_memory is bypassed?
>>
> The address_space_memory part is done by the memory listener

Only this part. Still think about the case with two vfio devices, one
bypassing the iommu, the other not.

The device bypassing the iommu will get address_space_memory, the other gets
an iommu address space. vfio_viommu_preset() returns true as long as any
device uses a vIOMMU, so we never run into the memory listener even for the
device bypassing the iommu?


Thanks

Zhenzhong
Joao Martins Sept. 8, 2023, 11:54 a.m. UTC | #6
On 08/09/2023 12:52, Duan, Zhenzhong wrote:
> On 9/8/2023 6:11 PM, Joao Martins wrote:
>> On 08/09/2023 07:11, Duan, Zhenzhong wrote:
>>> Hi Joao,
>>>
>>> On 6/23/2023 5:48 AM, Joao Martins wrote:
>>>> Currently, device dirty page tracking with a vIOMMU is not supported;
>>>> a migration blocker is added and migration is prevented.
>>>>
>>>> When a vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly
>>>> as requested by the vIOMMU. These IOVA ranges can potentially be mapped
>>>> anywhere in the vIOMMU IOVA space as advertised by the VMM.
>>>>
>>>> To support device dirty tracking when a vIOMMU is enabled, instead create
>>>> the dirty ranges based on the vIOMMU-provided limits, which leads to
>>>> tracking of the whole IOVA space regardless of what the devices actually
>>>> use.
>>>>
>>>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>>> ---
>>>>    include/hw/vfio/vfio-common.h |  1 +
>>>>    hw/vfio/common.c              | 58 +++++++++++++++++++++++++++++------
>>>>    hw/vfio/pci.c                 |  7 +++++
>>>>    3 files changed, 56 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>>>> index f41860988d6b..c4bafad084b4 100644
>>>> --- a/include/hw/vfio/vfio-common.h
>>>> +++ b/include/hw/vfio/vfio-common.h
>>>> @@ -71,6 +71,7 @@ typedef struct VFIOMigration {
>>>>    typedef struct VFIOAddressSpace {
>>>>        AddressSpace *as;
>>>>        bool no_dma_translation;
>>>> +    hwaddr max_iova;
>>>>        QLIST_HEAD(, VFIOContainer) containers;
>>>>        QLIST_ENTRY(VFIOAddressSpace) list;
>>>>    } VFIOAddressSpace;
>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>> index ecfb9afb3fb6..85fddef24026 100644
>>>> --- a/hw/vfio/common.c
>>>> +++ b/hw/vfio/common.c
>>>> @@ -428,6 +428,25 @@ static bool vfio_viommu_preset(void)
>>>>        return false;
>>>>    }
>>>>    +static int vfio_viommu_get_max_iova(hwaddr *max_iova)
>>>> +{
>>>> +    VFIOAddressSpace *space;
>>>> +
>>>> +    *max_iova = 0;
>>>> +
>>>> +    QLIST_FOREACH(space, &vfio_address_spaces, list) {
>>>> +        if (space->as == &address_space_memory) {
>>>> +            continue;
>>>> +        }
>>> Just curious why address_space_memory is bypassed?
>>>
>> The address_space_memory part is done by the memory listener
> 
> Only this part. Still think about the case with two vfio devices, one
> bypassing the iommu, the other not.
> 
> The device bypassing the iommu will get address_space_memory, the other gets
> an iommu address space. vfio_viommu_preset() returns true as long as any
> device uses a vIOMMU, so we never run into the memory listener even for the
> device bypassing the iommu?

Yeap, that's correct. When I said earlier 'reworking this', I meant exactly
this part: to do both.
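
One possible reading of 'do both', as an untested sketch on top of
vfio_dirty_tracking_init() (not the actual rework):

    memory_listener_register(&dirty.listener, container->space->as);
    /* Pick up the address_space_memory ranges synchronously, as before. */
    memory_listener_unregister(&dirty.listener);

    if (vfio_viommu_preset()) {
        hwaddr iommu_max_iova;

        ret = vfio_viommu_get_max_iova(&iommu_max_iova);
        if (ret) {
            return -EINVAL;
        }

        /* Widen with the vIOMMU-advertised limit instead of replacing. */
        vfio_dirty_tracking_update(0, iommu_max_iova, &dirty.ranges);
    }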

Patch

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index f41860988d6b..c4bafad084b4 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -71,6 +71,7 @@  typedef struct VFIOMigration {
 typedef struct VFIOAddressSpace {
     AddressSpace *as;
     bool no_dma_translation;
+    hwaddr max_iova;
     QLIST_HEAD(, VFIOContainer) containers;
     QLIST_ENTRY(VFIOAddressSpace) list;
 } VFIOAddressSpace;
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index ecfb9afb3fb6..85fddef24026 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -428,6 +428,25 @@  static bool vfio_viommu_preset(void)
     return false;
 }
 
+static int vfio_viommu_get_max_iova(hwaddr *max_iova)
+{
+    VFIOAddressSpace *space;
+
+    *max_iova = 0;
+
+    QLIST_FOREACH(space, &vfio_address_spaces, list) {
+        if (space->as == &address_space_memory) {
+            continue;
+        }
+
+        if (*max_iova < space->max_iova) {
+            *max_iova = space->max_iova;
+        }
+    }
+
+    return *max_iova == 0;
+}
+
 int vfio_block_giommu_migration(Error **errp)
 {
     int ret;
@@ -1464,10 +1483,11 @@  static const MemoryListener vfio_dirty_tracking_listener = {
     .region_add = vfio_listener_dirty_tracking_update,
 };
 
-static void vfio_dirty_tracking_init(VFIOContainer *container,
+static int vfio_dirty_tracking_init(VFIOContainer *container,
                                      VFIODirtyRanges *ranges)
 {
     VFIODirtyRangesListener dirty;
+    int ret;
 
     memset(&dirty, 0, sizeof(dirty));
     dirty.ranges.min32 = UINT32_MAX;
@@ -1475,17 +1495,29 @@  static void vfio_dirty_tracking_init(VFIOContainer *container,
     dirty.listener = vfio_dirty_tracking_listener;
     dirty.container = container;
 
-    memory_listener_register(&dirty.listener,
-                             container->space->as);
+    if (vfio_viommu_preset()) {
+        hwaddr iommu_max_iova;
+
+        ret = vfio_viommu_get_max_iova(&iommu_max_iova);
+        if (ret) {
+            return -EINVAL;
+        }
+
+        vfio_dirty_tracking_update(0, iommu_max_iova, &dirty.ranges);
+    } else {
+        memory_listener_register(&dirty.listener,
+                                 container->space->as);
+        /*
+         * The memory listener is synchronous, and used to calculate the range
+         * to dirty tracking. Unregister it after we are done as we are not
+         * interested in any follow-up updates.
+         */
+        memory_listener_unregister(&dirty.listener);
+    }
 
     *ranges = dirty.ranges;
 
-    /*
-     * The memory listener is synchronous, and used to calculate the range
-     * to dirty tracking. Unregister it after we are done as we are not
-     * interested in any follow-up updates.
-     */
-    memory_listener_unregister(&dirty.listener);
+    return 0;
 }
 
 static void vfio_devices_dma_logging_stop(VFIOContainer *container)
@@ -1590,7 +1622,13 @@  static int vfio_devices_dma_logging_start(VFIOContainer *container)
     VFIOGroup *group;
     int ret = 0;
 
-    vfio_dirty_tracking_init(container, &ranges);
+    ret = vfio_dirty_tracking_init(container, &ranges);
+    if (ret) {
+        error_report("Failed to init DMA logging ranges, err %d",
+                      ret);
+        return -EOPNOTSUPP;
+    }
+
     feature = vfio_device_feature_dma_logging_start_create(container,
                                                            &ranges);
     if (!feature) {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 8a98e6ffc480..3bda5618c5b5 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2974,6 +2974,13 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
                               &dma_translation);
     space->no_dma_translation = !dma_translation;
 
+    /*
+     * Support for advertised IOMMU address space boundaries is optional.
+     * By default, it is not advertised i.e. space::max_iova is 0.
+     */
+    pci_device_iommu_get_attr(pdev, IOMMU_ATTR_MAX_IOVA,
+                              &space->max_iova);
+
     QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
         if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
             error_setg(errp, "device is already attached");