diff mbox series

[v2,10/20] vfio/common: Record DMA mapped IOVA ranges

Message ID 20230222174915.5647-11-avihaih@nvidia.com (mailing list archive)
State New, archived
Headers show
Series vfio: Add migration pre-copy support and device dirty tracking | expand

Commit Message

Avihai Horon Feb. 22, 2023, 5:49 p.m. UTC
From: Joao Martins <joao.m.martins@oracle.com>

According to the device DMA logging uAPI, IOVA ranges to be logged by
the device must be provided all at once upon DMA logging start.

As preparation for the following patches which will add device dirty
page tracking, keep a record of all DMA mapped IOVA ranges so later they
can be used for DMA logging start.

Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked.
This is due to the dynamic nature of vIOMMU DMA mapping/unmapping.
Following patches will address the vIOMMU case specifically.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
 include/hw/vfio/vfio-common.h |  3 ++
 hw/vfio/common.c              | 86 +++++++++++++++++++++++++++++++++--
 2 files changed, 86 insertions(+), 3 deletions(-)
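
For reference, a minimal sketch of the recording convention the patch relies on, using the qemu/iova-tree.h API that appears in the diff below (the helper name record_range is illustrative, not part of the patch): DMAMap sizes are inclusive, so a mapping of size bytes starting at iova is stored as the range [iova, iova + size - 1].

#include "qemu/osdep.h"
#include "qemu/iova-tree.h"

/*
 * Sketch only: record one DMA mapping in an IOVATree, mirroring the
 * vfio_record_mapping() added by this patch. IOVATree ranges are
 * inclusive, hence the size - 1.
 */
static int record_range(IOVATree *mappings, hwaddr iova, hwaddr size,
                        bool readonly)
{
    DMAMap map = {
        .iova = iova,
        .size = size - 1,                      /* inclusive upper bound */
        .perm = readonly ? IOMMU_RO : IOMMU_RW,
    };

    /* Returns IOVA_ERR_OVERLAP if the range intersects an existing entry */
    return iova_tree_insert(mappings, &map);
}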

Comments

Alex Williamson Feb. 22, 2023, 10:10 p.m. UTC | #1
On Wed, 22 Feb 2023 19:49:05 +0200
Avihai Horon <avihaih@nvidia.com> wrote:

> From: Joao Martins <joao.m.martins@oracle.com>
> 
> According to the device DMA logging uAPI, IOVA ranges to be logged by
> the device must be provided all at once upon DMA logging start.
> 
> As preparation for the following patches which will add device dirty
> page tracking, keep a record of all DMA mapped IOVA ranges so later they
> can be used for DMA logging start.
> 
> Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked.
> This is due to the dynamic nature of vIOMMU DMA mapping/unmapping.
> Following patches will address the vIOMMU case specifically.
> 
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> ---
>  include/hw/vfio/vfio-common.h |  3 ++
>  hw/vfio/common.c              | 86 +++++++++++++++++++++++++++++++++--
>  2 files changed, 86 insertions(+), 3 deletions(-)
> 
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index ee55d442b4..6f36876ce0 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -23,6 +23,7 @@
>  
>  #include "exec/memory.h"
>  #include "qemu/queue.h"
> +#include "qemu/iova-tree.h"
>  #include "qemu/notify.h"
>  #include "ui/console.h"
>  #include "hw/display/ramfb.h"
> @@ -92,6 +93,8 @@ typedef struct VFIOContainer {
>      uint64_t max_dirty_bitmap_size;
>      unsigned long pgsizes;
>      unsigned int dma_max_mappings;
> +    IOVATree *mappings;
> +    QemuMutex mappings_mutex;
>      QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>      QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
>      QLIST_HEAD(, VFIOGroup) group_list;
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 84f08bdbbb..6041da6c7e 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -44,6 +44,7 @@
>  #include "migration/blocker.h"
>  #include "migration/qemu-file.h"
>  #include "sysemu/tpm.h"
> +#include "qemu/iova-tree.h"
>  
>  VFIOGroupList vfio_group_list =
>      QLIST_HEAD_INITIALIZER(vfio_group_list);
> @@ -426,6 +427,11 @@ void vfio_unblock_multiple_devices_migration(void)
>      multiple_devices_migration_blocker = NULL;
>  }
>  
> +static bool vfio_have_giommu(VFIOContainer *container)
> +{
> +    return !QLIST_EMPTY(&container->giommu_list);
> +}
> +
>  static void vfio_set_migration_error(int err)
>  {
>      MigrationState *ms = migrate_get_current();
> @@ -499,6 +505,51 @@ static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
>      return true;
>  }
>  
> +static int vfio_record_mapping(VFIOContainer *container, hwaddr iova,
> +                               hwaddr size, bool readonly)
> +{
> +    DMAMap map = {
> +        .iova = iova,
> +        .size = size - 1, /* IOVATree is inclusive, so subtract 1 from size */
> +        .perm = readonly ? IOMMU_RO : IOMMU_RW,
> +    };
> +    int ret;
> +
> +    if (vfio_have_giommu(container)) {
> +        return 0;
> +    }
> +
> +    WITH_QEMU_LOCK_GUARD(&container->mappings_mutex) {
> +        ret = iova_tree_insert(container->mappings, &map);
> +        if (ret) {
> +            if (ret == IOVA_ERR_INVALID) {
> +                ret = -EINVAL;
> +            } else if (ret == IOVA_ERR_OVERLAP) {
> +                ret = -EEXIST;
> +            }
> +        }
> +    }
> +
> +    return ret;
> +}
> +
> +static void vfio_erase_mapping(VFIOContainer *container, hwaddr iova,
> +                                hwaddr size)
> +{
> +    DMAMap map = {
> +        .iova = iova,
> +        .size = size - 1, /* IOVATree is inclusive, so subtract 1 from size */
> +    };
> +
> +    if (vfio_have_giommu(container)) {
> +        return;
> +    }
> +
> +    WITH_QEMU_LOCK_GUARD(&container->mappings_mutex) {
> +        iova_tree_remove(container->mappings, map);
> +    }
> +}

Nit, 'insert' and 'remove' to match the IOVATree semantics?

>  static int vfio_dma_unmap_bitmap(VFIOContainer *container,
>                                   hwaddr iova, ram_addr_t size,
>                                   IOMMUTLBEntry *iotlb)
> @@ -599,6 +650,8 @@ static int vfio_dma_unmap(VFIOContainer *container,
>                                              DIRTY_CLIENTS_NOCODE);
>      }
>  
> +    vfio_erase_mapping(container, iova, size);
> +
>      return 0;
>  }
>  
> @@ -612,6 +665,16 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>          .iova = iova,
>          .size = size,
>      };
> +    int ret;
> +
> +    ret = vfio_record_mapping(container, iova, size, readonly);
> +    if (ret) {
> +        error_report("vfio: Failed to record mapping, iova: 0x%" HWADDR_PRIx
> +                     ", size: 0x" RAM_ADDR_FMT ", ret: %d (%s)",
> +                     iova, size, ret, strerror(-ret));
> +
> +        return ret;
> +    }

Is there no way to replay the mappings when a migration is started?
This seems like a horrible latency and bloat trade-off for the
possibility that the VM might migrate and the device might support
these features.  Our performance with vIOMMU is already terrible, I
can't help but believe this makes it worse.  Thanks,

Alex
Joao Martins Feb. 23, 2023, 10:37 a.m. UTC | #2
On 22/02/2023 22:10, Alex Williamson wrote:
> On Wed, 22 Feb 2023 19:49:05 +0200
> Avihai Horon <avihaih@nvidia.com> wrote:
>> From: Joao Martins <joao.m.martins@oracle.com>
>> @@ -612,6 +665,16 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>>          .iova = iova,
>>          .size = size,
>>      };
>> +    int ret;
>> +
>> +    ret = vfio_record_mapping(container, iova, size, readonly);
>> +    if (ret) {
>> +        error_report("vfio: Failed to record mapping, iova: 0x%" HWADDR_PRIx
>> +                     ", size: 0x" RAM_ADDR_FMT ", ret: %d (%s)",
>> +                     iova, size, ret, strerror(-ret));
>> +
>> +        return ret;
>> +    }
> 
> Is there no way to replay the mappings when a migration is started?
> This seems like a horrible latency and bloat trade-off for the
> possibility that the VM might migrate and the device might support
> these features.  Our performance with vIOMMU is already terrible, I
> can't help but believe this makes it worse.  Thanks,
> 

It is a no-op if the vIOMMU is being used (entries in container->giommu_list), as
that case uses a max-IOVA-based IOVA range. So this is really for IOMMU identity
mapping and no-vIOMMU. We could replay them if they were tracked/stored anywhere.

I suppose we could move vfio_devices_all_device_dirty_tracking() into this
patch and then skip the vfio_{record,erase}_mapping() calls when we are passing
through a device that doesn't have live-migration support? Would that address
the impact you're concerned about for non-live-migratable devices?

On the other hand, the PCI device hotplug hypothetical makes this a bit
complicated, as we can still attempt to hotplug a device before migration is
even attempted. Meaning that we could start with live-migratable devices and add
the tracking, and then hotplug a device without such support (adding a blocker),
leaving the mappings there with no further use. So it felt simpler to just track
always and skip recording mappings if the vIOMMU is in active use?
Alex Williamson Feb. 23, 2023, 9:05 p.m. UTC | #3
On Thu, 23 Feb 2023 10:37:10 +0000
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 22/02/2023 22:10, Alex Williamson wrote:
> > On Wed, 22 Feb 2023 19:49:05 +0200
> > Avihai Horon <avihaih@nvidia.com> wrote:  
> >> From: Joao Martins <joao.m.martins@oracle.com>
> >> @@ -612,6 +665,16 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
> >>          .iova = iova,
> >>          .size = size,
> >>      };
> >> +    int ret;
> >> +
> >> +    ret = vfio_record_mapping(container, iova, size, readonly);
> >> +    if (ret) {
> >> +        error_report("vfio: Failed to record mapping, iova: 0x%" HWADDR_PRIx
> >> +                     ", size: 0x" RAM_ADDR_FMT ", ret: %d (%s)",
> >> +                     iova, size, ret, strerror(-ret));
> >> +
> >> +        return ret;
> >> +    }  
> > 
> > Is there no way to replay the mappings when a migration is started?
> > This seems like a horrible latency and bloat trade-off for the
> > possibility that the VM might migrate and the device might support
> > these features.  Our performance with vIOMMU is already terrible, I
> > can't help but believe this makes it worse.  Thanks,
> >   
> 
> It is a nop if the vIOMMU is being used (entries in container->giommu_list) as
> that uses a max-iova based IOVA range. So this is really for iommu identity
> mapping and no-VIOMMU.

Ok, yes, there are no mappings recorded for any containers that have a
non-empty giommu_list.

> We could replay them if they were tracked/stored anywhere.

Rather than piggybacking on vfio_memory_listener, why not simply
register a new MemoryListener when migration is started?  That will
replay all the existing ranges and allow tracking to happen separate
from mapping, and only when needed.

> I suppose we could move the vfio_devices_all_device_dirty_tracking() into this
> patch and then conditionally call this vfio_{record,erase}_mapping() in case we
> are passing through a device that doesn't have live-migration support? Would
> that address the impact you're concerned wrt to non-live-migrateable devices?
> 
> On the other hand, the PCI device hotplug hypothetical even makes this a bit
> complicated as we can still attempt to hotplug a device before migration is even
> attempted. Meaning that we start with live-migrateable devices, and we added the
> tracking, up to hotpluging a device without such support (adding a blocker)
> leaving the mappings there with no further use. So it felt simpler to just track
> always and avoid any mappings recording if the vIOMMU is in active use?

My preference would be that there's no runtime overhead for migration
support until a migration is initiated.  I currently don't see why we
can't achieve that by dynamically adding a new MemoryListener around
migration for that purpose.  Do you?  Thanks,

Alex
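
As a concrete illustration of the suggestion above, a minimal sketch assuming QEMU's MemoryListener API (memory_listener_register()/memory_listener_unregister() are existing calls; RangeCollector, collect_section_range() and start_device_dirty_tracking() are hypothetical names used only for illustration):

typedef struct {
    MemoryListener listener;
    GList *ranges;                  /* collected [iova, end] pairs */
} RangeCollector;

static void collect_region_add(MemoryListener *listener,
                               MemoryRegionSection *section)
{
    RangeCollector *rc = container_of(listener, RangeCollector, listener);

    /* Record the section's IOVA range for later use */
    collect_section_range(rc, section);
}

static void vfio_dirty_tracking_start(VFIOContainer *container)
{
    RangeCollector rc = {
        .listener = {
            .name = "vfio-dirty-tracking",
            .region_add = collect_region_add,
        },
    };

    /*
     * Registration synchronously replays ->region_add for every section
     * already present in the address space, so no runtime bookkeeping is
     * needed before migration starts.
     */
    memory_listener_register(&rc.listener, container->space->as);
    memory_listener_unregister(&rc.listener);

    /* All ranges are now known, so DMA logging can be started in one shot */
    start_device_dirty_tracking(container, rc.ranges);
}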
Joao Martins Feb. 23, 2023, 9:19 p.m. UTC | #4
On 23/02/2023 21:05, Alex Williamson wrote:
> On Thu, 23 Feb 2023 10:37:10 +0000
> Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 22/02/2023 22:10, Alex Williamson wrote:
>>> On Wed, 22 Feb 2023 19:49:05 +0200
>>> Avihai Horon <avihaih@nvidia.com> wrote:  
>>>> From: Joao Martins <joao.m.martins@oracle.com>
>>>> @@ -612,6 +665,16 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>>>>          .iova = iova,
>>>>          .size = size,
>>>>      };
>>>> +    int ret;
>>>> +
>>>> +    ret = vfio_record_mapping(container, iova, size, readonly);
>>>> +    if (ret) {
>>>> +        error_report("vfio: Failed to record mapping, iova: 0x%" HWADDR_PRIx
>>>> +                     ", size: 0x" RAM_ADDR_FMT ", ret: %d (%s)",
>>>> +                     iova, size, ret, strerror(-ret));
>>>> +
>>>> +        return ret;
>>>> +    }  
>>>
>>> Is there no way to replay the mappings when a migration is started?
>>> This seems like a horrible latency and bloat trade-off for the
>>> possibility that the VM might migrate and the device might support
>>> these features.  Our performance with vIOMMU is already terrible, I
>>> can't help but believe this makes it worse.  Thanks,
>>>   
>>
>> It is a nop if the vIOMMU is being used (entries in container->giommu_list) as
>> that uses a max-iova based IOVA range. So this is really for iommu identity
>> mapping and no-VIOMMU.
> 
> Ok, yes, there are no mappings recorded for any containers that have a
> non-empty giommu_list.
> 
>> We could replay them if they were tracked/stored anywhere.
> 
> Rather than piggybacking on vfio_memory_listener, why not simply
> register a new MemoryListener when migration is started?  That will
> replay all the existing ranges and allow tracking to happen separate
> from mapping, and only when needed.
> 

The problem with that is that *starting* dirty tracking needs to have all the
ranges; we aren't supposed to start each range separately. So in a memory
listener callback we don't have a way to know when we are dealing with the
last range, do we?

>> I suppose we could move the vfio_devices_all_device_dirty_tracking() into this
>> patch and then conditionally call this vfio_{record,erase}_mapping() in case we
>> are passing through a device that doesn't have live-migration support? Would
>> that address the impact you're concerned wrt to non-live-migrateable devices?
>>
>> On the other hand, the PCI device hotplug hypothetical even makes this a bit
>> complicated as we can still attempt to hotplug a device before migration is even
>> attempted. Meaning that we start with live-migrateable devices, and we added the
>> tracking, up to hotpluging a device without such support (adding a blocker)
>> leaving the mappings there with no further use. So it felt simpler to just track
>> always and avoid any mappings recording if the vIOMMU is in active use?
> 
> My preference would be that there's no runtime overhead for migration
> support until a migration is initiated.  I currently don't see why we
> can't achieve that by dynamically adding a new MemoryListener around
> migration for that purpose.  Do you?  Thanks,

I definitely agree with the general sentiment of being more dynamic, but perhaps
I am not seeing how.
Alex Williamson Feb. 23, 2023, 9:50 p.m. UTC | #5
On Thu, 23 Feb 2023 21:19:12 +0000
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 23/02/2023 21:05, Alex Williamson wrote:
> > On Thu, 23 Feb 2023 10:37:10 +0000
> > Joao Martins <joao.m.martins@oracle.com> wrote:  
> >> On 22/02/2023 22:10, Alex Williamson wrote:  
> >>> On Wed, 22 Feb 2023 19:49:05 +0200
> >>> Avihai Horon <avihaih@nvidia.com> wrote:    
> >>>> From: Joao Martins <joao.m.martins@oracle.com>
> >>>> @@ -612,6 +665,16 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
> >>>>          .iova = iova,
> >>>>          .size = size,
> >>>>      };
> >>>> +    int ret;
> >>>> +
> >>>> +    ret = vfio_record_mapping(container, iova, size, readonly);
> >>>> +    if (ret) {
> >>>> +        error_report("vfio: Failed to record mapping, iova: 0x%" HWADDR_PRIx
> >>>> +                     ", size: 0x" RAM_ADDR_FMT ", ret: %d (%s)",
> >>>> +                     iova, size, ret, strerror(-ret));
> >>>> +
> >>>> +        return ret;
> >>>> +    }    
> >>>
> >>> Is there no way to replay the mappings when a migration is started?
> >>> This seems like a horrible latency and bloat trade-off for the
> >>> possibility that the VM might migrate and the device might support
> >>> these features.  Our performance with vIOMMU is already terrible, I
> >>> can't help but believe this makes it worse.  Thanks,
> >>>     
> >>
> >> It is a nop if the vIOMMU is being used (entries in container->giommu_list) as
> >> that uses a max-iova based IOVA range. So this is really for iommu identity
> >> mapping and no-VIOMMU.  
> > 
> > Ok, yes, there are no mappings recorded for any containers that have a
> > non-empty giommu_list.
> >   
> >> We could replay them if they were tracked/stored anywhere.  
> > 
> > Rather than piggybacking on vfio_memory_listener, why not simply
> > register a new MemoryListener when migration is started?  That will
> > replay all the existing ranges and allow tracking to happen separate
> > from mapping, and only when needed.
> >   
> 
> The problem with that is that *starting* dirty tracking needs to have all the
> range, we aren't supposed to start each range separately. So on a memory
> listener callback you don't have introspection when you are dealing with the
> last range, do we?

As soon as memory_listener_register() returns, all your callbacks to
build the IOVATree have been called and you can act on the result the
same as if you were relying on the vfio mapping MemoryListener.  I'm
not seeing the problem.  Thanks,

Alex
Joao Martins Feb. 23, 2023, 9:54 p.m. UTC | #6
On 23/02/2023 21:50, Alex Williamson wrote:
> On Thu, 23 Feb 2023 21:19:12 +0000
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> On 23/02/2023 21:05, Alex Williamson wrote:
>>> On Thu, 23 Feb 2023 10:37:10 +0000
>>> Joao Martins <joao.m.martins@oracle.com> wrote:  
>>>> On 22/02/2023 22:10, Alex Williamson wrote:  
>>>>> On Wed, 22 Feb 2023 19:49:05 +0200
>>>>> Avihai Horon <avihaih@nvidia.com> wrote:    
>>>>>> From: Joao Martins <joao.m.martins@oracle.com>
>>>>>> @@ -612,6 +665,16 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>>>>>>          .iova = iova,
>>>>>>          .size = size,
>>>>>>      };
>>>>>> +    int ret;
>>>>>> +
>>>>>> +    ret = vfio_record_mapping(container, iova, size, readonly);
>>>>>> +    if (ret) {
>>>>>> +        error_report("vfio: Failed to record mapping, iova: 0x%" HWADDR_PRIx
>>>>>> +                     ", size: 0x" RAM_ADDR_FMT ", ret: %d (%s)",
>>>>>> +                     iova, size, ret, strerror(-ret));
>>>>>> +
>>>>>> +        return ret;
>>>>>> +    }    
>>>>>
>>>>> Is there no way to replay the mappings when a migration is started?
>>>>> This seems like a horrible latency and bloat trade-off for the
>>>>> possibility that the VM might migrate and the device might support
>>>>> these features.  Our performance with vIOMMU is already terrible, I
>>>>> can't help but believe this makes it worse.  Thanks,
>>>>>     
>>>>
>>>> It is a nop if the vIOMMU is being used (entries in container->giommu_list) as
>>>> that uses a max-iova based IOVA range. So this is really for iommu identity
>>>> mapping and no-VIOMMU.  
>>>
>>> Ok, yes, there are no mappings recorded for any containers that have a
>>> non-empty giommu_list.
>>>   
>>>> We could replay them if they were tracked/stored anywhere.  
>>>
>>> Rather than piggybacking on vfio_memory_listener, why not simply
>>> register a new MemoryListener when migration is started?  That will
>>> replay all the existing ranges and allow tracking to happen separate
>>> from mapping, and only when needed.
>>>   
>>
>> The problem with that is that *starting* dirty tracking needs to have all the
>> range, we aren't supposed to start each range separately. So on a memory
>> listener callback you don't have introspection when you are dealing with the
>> last range, do we?
> 
> As soon as memory_listener_register() returns, all your callbacks to
> build the IOVATree have been called and you can act on the result the
> same as if you were relying on the vfio mapping MemoryListener.  I'm
> not seeing the problem.  Thanks,

I was just checking memory_global_dirty_log_start() (i.e. where dirty tracking
starts getting enabled) and yes, you're definitely right.

I thought this was asynchronous given that there are so many memory regions, but
I must be confusing it with something else.
Joao Martins Feb. 28, 2023, 12:11 p.m. UTC | #7
On 23/02/2023 21:50, Alex Williamson wrote:
> On Thu, 23 Feb 2023 21:19:12 +0000
> Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 23/02/2023 21:05, Alex Williamson wrote:
>>> On Thu, 23 Feb 2023 10:37:10 +0000
>>> Joao Martins <joao.m.martins@oracle.com> wrote:  
>>>> On 22/02/2023 22:10, Alex Williamson wrote:  
>>>>> On Wed, 22 Feb 2023 19:49:05 +0200
>>>>> Avihai Horon <avihaih@nvidia.com> wrote:    
>>>>>> From: Joao Martins <joao.m.martins@oracle.com>
>>>>>> @@ -612,6 +665,16 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>>>>>>          .iova = iova,
>>>>>>          .size = size,
>>>>>>      };
>>>>>> +    int ret;
>>>>>> +
>>>>>> +    ret = vfio_record_mapping(container, iova, size, readonly);
>>>>>> +    if (ret) {
>>>>>> +        error_report("vfio: Failed to record mapping, iova: 0x%" HWADDR_PRIx
>>>>>> +                     ", size: 0x" RAM_ADDR_FMT ", ret: %d (%s)",
>>>>>> +                     iova, size, ret, strerror(-ret));
>>>>>> +
>>>>>> +        return ret;
>>>>>> +    }    
>>>>>
>>>>> Is there no way to replay the mappings when a migration is started?
>>>>> This seems like a horrible latency and bloat trade-off for the
>>>>> possibility that the VM might migrate and the device might support
>>>>> these features.  Our performance with vIOMMU is already terrible, I
>>>>> can't help but believe this makes it worse.  Thanks,
>>>>>     
>>>>
>>>> It is a nop if the vIOMMU is being used (entries in container->giommu_list) as
>>>> that uses a max-iova based IOVA range. So this is really for iommu identity
>>>> mapping and no-VIOMMU.  
>>>
>>> Ok, yes, there are no mappings recorded for any containers that have a
>>> non-empty giommu_list.
>>>   
>>>> We could replay them if they were tracked/stored anywhere.  
>>>
>>> Rather than piggybacking on vfio_memory_listener, why not simply
>>> register a new MemoryListener when migration is started?  That will
>>> replay all the existing ranges and allow tracking to happen separate
>>> from mapping, and only when needed.
>>>   
>>
>> The problem with that is that *starting* dirty tracking needs to have all the
>> range, we aren't supposed to start each range separately. So on a memory
>> listener callback you don't have introspection when you are dealing with the
>> last range, do we?
> 
> As soon as memory_listener_register() returns, all your callbacks to
> build the IOVATree have been called and you can act on the result the
> same as if you were relying on the vfio mapping MemoryListener.  I'm
> not seeing the problem.  Thanks,
> 

While doing these changes, the nice thing about the current patch is that
whatever changes apply to vfio_listener_region_add() will be reflected in the
mappings tree that stores what we will dirty track. If we move the mappings
calculation needed for dirty tracking to the point where tracking starts, we
will have to duplicate the same checks and open the door to bugs where we ask
for things to be dirty-tracked that haven't been DMA mapped. These two aren't
necessarily tied, but I felt I should raise the potential duplication of the
checks (and the same applies to handling virtio-mem and whatnot).

I understand that if we were going to store *a lot* of mappings this would add
up in space requirements. But for the no-vIOMMU (or iommu=pt) case this is only
about 12 ranges or so, so it is much simpler to piggyback on the existing
listener. Would you still want to move this to its own dedicated memory listener?

	Joao
Alex Williamson Feb. 28, 2023, 8:36 p.m. UTC | #8
On Tue, 28 Feb 2023 12:11:06 +0000
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 23/02/2023 21:50, Alex Williamson wrote:
> > On Thu, 23 Feb 2023 21:19:12 +0000
> > Joao Martins <joao.m.martins@oracle.com> wrote:  
> >> On 23/02/2023 21:05, Alex Williamson wrote:  
> >>> On Thu, 23 Feb 2023 10:37:10 +0000
> >>> Joao Martins <joao.m.martins@oracle.com> wrote:    
> >>>> On 22/02/2023 22:10, Alex Williamson wrote:    
> >>>>> On Wed, 22 Feb 2023 19:49:05 +0200
> >>>>> Avihai Horon <avihaih@nvidia.com> wrote:      
> >>>>>> From: Joao Martins <joao.m.martins@oracle.com>
> >>>>>> @@ -612,6 +665,16 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
> >>>>>>          .iova = iova,
> >>>>>>          .size = size,
> >>>>>>      };
> >>>>>> +    int ret;
> >>>>>> +
> >>>>>> +    ret = vfio_record_mapping(container, iova, size, readonly);
> >>>>>> +    if (ret) {
> >>>>>> +        error_report("vfio: Failed to record mapping, iova: 0x%" HWADDR_PRIx
> >>>>>> +                     ", size: 0x" RAM_ADDR_FMT ", ret: %d (%s)",
> >>>>>> +                     iova, size, ret, strerror(-ret));
> >>>>>> +
> >>>>>> +        return ret;
> >>>>>> +    }      
> >>>>>
> >>>>> Is there no way to replay the mappings when a migration is started?
> >>>>> This seems like a horrible latency and bloat trade-off for the
> >>>>> possibility that the VM might migrate and the device might support
> >>>>> these features.  Our performance with vIOMMU is already terrible, I
> >>>>> can't help but believe this makes it worse.  Thanks,
> >>>>>       
> >>>>
> >>>> It is a nop if the vIOMMU is being used (entries in container->giommu_list) as
> >>>> that uses a max-iova based IOVA range. So this is really for iommu identity
> >>>> mapping and no-VIOMMU.    
> >>>
> >>> Ok, yes, there are no mappings recorded for any containers that have a
> >>> non-empty giommu_list.
> >>>     
> >>>> We could replay them if they were tracked/stored anywhere.    
> >>>
> >>> Rather than piggybacking on vfio_memory_listener, why not simply
> >>> register a new MemoryListener when migration is started?  That will
> >>> replay all the existing ranges and allow tracking to happen separate
> >>> from mapping, and only when needed.
> >>>     
> >>
> >> The problem with that is that *starting* dirty tracking needs to have all the
> >> range, we aren't supposed to start each range separately. So on a memory
> >> listener callback you don't have introspection when you are dealing with the
> >> last range, do we?  
> > 
> > As soon as memory_listener_register() returns, all your callbacks to
> > build the IOVATree have been called and you can act on the result the
> > same as if you were relying on the vfio mapping MemoryListener.  I'm
> > not seeing the problem.  Thanks,
> >   
> 
> While doing these changes, the nice thing of the current patch is that whatever
> changes apply to vfio_listener_region_add() will be reflected in the mappings
> tree that stores what we will dirty track. If we move the mappings calculation
> necessary for dirty tracking only when we start, we will have to duplicate the
> same checks, and open for bugs where we ask things to be dirty track-ed that
> haven't been DMA mapped. These two aren't necessarily tied, but felt like I
> should raise the potentially duplication of the checks (and the same thing
> applies for handling virtio-mem and what not).
> 
> I understand that if we were going to store *a lot* of mappings that this would
> add up in space requirements. But for no-vIOMMU (or iommu=pt) case this is only
> about 12ranges or so, it is much simpler to piggyback the existing listener.
> Would you still want to move this to its own dedicated memory listener?

Code duplication and bugs are good points, but while typically we're
only seeing a few handfuls of ranges, doesn't virtio-mem in particular
allow that we could be seeing quite a lot more?

We used to be limited to a fairly small number of KVM memory slots,
which effectively bounded non-vIOMMU DMA mappings, but that value is
now 2^15, so we need to anticipate that we could see many more than a
dozen mappings.

Can we make the same argument that the overhead is negligible if a VM
makes use of 10s of GB of virtio-mem with 2MB block size?

But then on a 4KB host we're limited to 256 tracking entries, so
wasting all that time and space on a runtime IOVATree is even more
dubious.

In fact, it doesn't really matter that vfio_listener_region_add and
this potentially new listener come to the same result, as long as the
new listener is a superset of the existing listener.  So I think we can
simplify out a lot of the places we'd see duplication and bugs.  I'm
not even really sure why we wouldn't simplify things further and only
record a single range covering the low and high memory marks for a
non-vIOMMU VMs, or potentially an approximation removing gaps of 1GB or
more, for example.  Thanks,

Alex
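
To make the "approximation removing gaps of 1GB or more" idea concrete, a small sketch in plain C (not existing QEMU code), assuming the collected ranges are sorted by start address and non-overlapping:

#include <stdint.h>
#include <stddef.h>

typedef struct {
    uint64_t start;
    uint64_t end;      /* inclusive */
} IovaRange;

/*
 * Merge neighbouring ranges whose gap is at most max_gap (e.g. 1 GiB),
 * so the number of ranges handed to the device stays small. Returns the
 * new number of ranges, compacted in place.
 */
static size_t coalesce_ranges(IovaRange *ranges, size_t n, uint64_t max_gap)
{
    size_t out = 0;

    for (size_t i = 0; i < n; i++) {
        if (out && ranges[i].start - ranges[out - 1].end - 1 <= max_gap) {
            /* Close enough to the previous range: extend it instead */
            if (ranges[i].end > ranges[out - 1].end) {
                ranges[out - 1].end = ranges[i].end;
            }
        } else {
            ranges[out++] = ranges[i];
        }
    }
    return out;
}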
Joao Martins March 2, 2023, 12:07 a.m. UTC | #9
On 28/02/2023 20:36, Alex Williamson wrote:
> On Tue, 28 Feb 2023 12:11:06 +0000
> Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 23/02/2023 21:50, Alex Williamson wrote:
>>> On Thu, 23 Feb 2023 21:19:12 +0000
>>> Joao Martins <joao.m.martins@oracle.com> wrote:  
>>>> On 23/02/2023 21:05, Alex Williamson wrote:  
>>>>> On Thu, 23 Feb 2023 10:37:10 +0000
>>>>> Joao Martins <joao.m.martins@oracle.com> wrote:    
>>>>>> On 22/02/2023 22:10, Alex Williamson wrote:    
>>>>>>> On Wed, 22 Feb 2023 19:49:05 +0200
>>>>>>> Avihai Horon <avihaih@nvidia.com> wrote:      
>>>>>>>> From: Joao Martins <joao.m.martins@oracle.com>
>>>>>>>> @@ -612,6 +665,16 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>>>>>>>>          .iova = iova,
>>>>>>>>          .size = size,
>>>>>>>>      };
>>>>>>>> +    int ret;
>>>>>>>> +
>>>>>>>> +    ret = vfio_record_mapping(container, iova, size, readonly);
>>>>>>>> +    if (ret) {
>>>>>>>> +        error_report("vfio: Failed to record mapping, iova: 0x%" HWADDR_PRIx
>>>>>>>> +                     ", size: 0x" RAM_ADDR_FMT ", ret: %d (%s)",
>>>>>>>> +                     iova, size, ret, strerror(-ret));
>>>>>>>> +
>>>>>>>> +        return ret;
>>>>>>>> +    }      
>>>>>>>
>>>>>>> Is there no way to replay the mappings when a migration is started?
>>>>>>> This seems like a horrible latency and bloat trade-off for the
>>>>>>> possibility that the VM might migrate and the device might support
>>>>>>> these features.  Our performance with vIOMMU is already terrible, I
>>>>>>> can't help but believe this makes it worse.  Thanks,
>>>>>>>       
>>>>>>
>>>>>> It is a nop if the vIOMMU is being used (entries in container->giommu_list) as
>>>>>> that uses a max-iova based IOVA range. So this is really for iommu identity
>>>>>> mapping and no-VIOMMU.    
>>>>>
>>>>> Ok, yes, there are no mappings recorded for any containers that have a
>>>>> non-empty giommu_list.
>>>>>     
>>>>>> We could replay them if they were tracked/stored anywhere.    
>>>>>
>>>>> Rather than piggybacking on vfio_memory_listener, why not simply
>>>>> register a new MemoryListener when migration is started?  That will
>>>>> replay all the existing ranges and allow tracking to happen separate
>>>>> from mapping, and only when needed.
>>>>
>>>> The problem with that is that *starting* dirty tracking needs to have all the
>>>> range, we aren't supposed to start each range separately. So on a memory
>>>> listener callback you don't have introspection when you are dealing with the
>>>> last range, do we?  
>>>
>>> As soon as memory_listener_register() returns, all your callbacks to
>>> build the IOVATree have been called and you can act on the result the
>>> same as if you were relying on the vfio mapping MemoryListener.  I'm
>>> not seeing the problem.  Thanks,
>>>   
>>
>> While doing these changes, the nice thing of the current patch is that whatever
>> changes apply to vfio_listener_region_add() will be reflected in the mappings
>> tree that stores what we will dirty track. If we move the mappings calculation
>> necessary for dirty tracking only when we start, we will have to duplicate the
>> same checks, and open for bugs where we ask things to be dirty track-ed that
>> haven't been DMA mapped. These two aren't necessarily tied, but felt like I
>> should raise the potentially duplication of the checks (and the same thing
>> applies for handling virtio-mem and what not).
>>
>> I understand that if we were going to store *a lot* of mappings that this would
>> add up in space requirements. But for no-vIOMMU (or iommu=pt) case this is only
>> about 12ranges or so, it is much simpler to piggyback the existing listener.
>> Would you still want to move this to its own dedicated memory listener?
> 
> Code duplication and bugs are good points, but while typically we're
> only seeing a few handfuls of ranges, doesn't virtio-mem in particular
> allow that we could be seeing quite a lot more?
> 
Ugh yes, it could be.

> We used to be limited to a fairly small number of KVM memory slots,
> which effectively bounded non-vIOMMU DMA mappings, but that value is
> now 2^15, so we need to anticipate that we could see many more than a
> dozen mappings.
> 

Even with 32k memory slots, today we are still down to a handful in practice.
The hv-balloon and virtio-mem approaches, though, are the ones that may stress
such a limit prior to starting migration, IIUC.

> Can we make the same argument that the overhead is negligible if a VM
> makes use of 10s of GB of virtio-mem with 2MB block size?
> 
> But then on a 4KB host we're limited to 256 tracking entries, so
> wasting all that time and space on a runtime IOVATree is even more
> dubious.
>
> In fact, it doesn't really matter that vfio_listener_region_add and
> this potentially new listener come to the same result, as long as the
> new listener is a superset of the existing listener. 

I am trying to structure this in a way that's not too ugly while reusing as much
as possible between vfio_listener_region_add() and vfio_migration_mapping_add().

For you to have an idea, here's so far how it looks thus far:

https://github.com/jpemartins/qemu/commits/vfio-dirty-tracking

Particularly this one:

https://github.com/jpemartins/qemu/commit/3b11fa0e4faa0f9c0f42689a7367284a25d1b585

vfio_get_section_iova_range() is where most of these checks live; they are
effectively a subset of the ones in vfio_listener_region_add().

> So I think we can
> simplify out a lot of the places we'd see duplication and bugs.  I'm
> not even really sure why we wouldn't simplify things further and only
> record a single range covering the low and high memory marks for a
> non-vIOMMU VMs, or potentially an approximation removing gaps of 1GB or
> more, for example.  Thanks,

Yes, for QEMU, having one single artificial range with a computed min and max
IOVA is the simplest to implement. It would avoid maintaining an IOVATree, as we
would only track a min/max pair (maybe max_below).

My concern with a reduced single range is 1) big holes in the address space
leading to asking to track more than you need[0], and 2) device dirty tracking
limits, e.g. hardware may have upper limits, so you may prematurely exercise
those. So giving the vfio drivers more choice in how to cope with the mapped
address space description looks to have a bit more longevity.

Anyway, the temptation with a single range is that all of this can go away if
vfio_listener_region_add() just tracks a min/max IOVA pair.

Below the scissors mark is how this patch looks in the commit above, still
keeping a full list of mappings. It's also stored here:

https://github.com/jpemartins/qemu/commits/vfio-dirty-tracking

I'll respond here with a patch showing what it looks like with the range
watermark approach.

	Joao

[0] The AMD 1T boundary is what comes to mind, where QEMU relocates memory above
4G to start after 1T.

------------------>8--------------------

From: Joao Martins <joao.m.martins@oracle.com>
Date: Wed, 22 Feb 2023 19:49:05 +0200
Subject: [PATCH wip 7/12] vfio/common: Record DMA mapped IOVA ranges

According to the device DMA logging uAPI, IOVA ranges to be logged by
the device must be provided all at once upon DMA logging start.

As preparation for the following patches which will add device dirty
page tracking, keep a record of all DMA mapped IOVA ranges so later they
can be used for DMA logging start.

Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked.
This is due to the dynamic nature of vIOMMU DMA mapping/unmapping.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
 hw/vfio/common.c              | 147 +++++++++++++++++++++++++++++++++-
 hw/vfio/trace-events          |   2 +
 include/hw/vfio/vfio-common.h |   4 +
 3 files changed, 150 insertions(+), 3 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 655e8dbb74d4..17971e6dbaeb 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -44,6 +44,7 @@
 #include "migration/blocker.h"
 #include "migration/qemu-file.h"
 #include "sysemu/tpm.h"
+#include "qemu/iova-tree.h"

 VFIOGroupList vfio_group_list =
     QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -426,6 +427,11 @@ void vfio_unblock_multiple_devices_migration(void)
     multiple_devices_migration_blocker = NULL;
 }

+static bool vfio_have_giommu(VFIOContainer *container)
+{
+    return !QLIST_EMPTY(&container->giommu_list);
+}
+
 static void vfio_set_migration_error(int err)
 {
     MigrationState *ms = migrate_get_current();
@@ -610,6 +616,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
         .iova = iova,
         .size = size,
     };
+    int ret;

     if (!readonly) {
         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
@@ -626,8 +633,10 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
         return 0;
     }

+    ret = -errno;
     error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
-    return -errno;
+
+    return ret;
 }

 static void vfio_host_win_add(VFIOContainer *container,
@@ -1326,11 +1335,127 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
     return ret;
 }

+static bool vfio_get_section_iova_range(VFIOContainer *container,
+                                        MemoryRegionSection *section,
+                                        hwaddr *out_iova, hwaddr *out_end)
+{
+    Int128 llend, llsize;
+    hwaddr iova, end;
+
+    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
+    llend = int128_make64(section->offset_within_address_space);
+    llend = int128_add(llend, section->size);
+    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
+
+    if (int128_ge(int128_make64(iova), llend)) {
+        return false;
+    }
+    end = int128_get64(int128_sub(llend, int128_one()));
+
+    if (memory_region_is_iommu(section->mr) ||
+        memory_region_has_ram_discard_manager(section->mr)) {
+        return false;
+    }
+
+    llsize = int128_sub(llend, int128_make64(iova));
+
+    if (memory_region_is_ram_device(section->mr)) {
+        VFIOHostDMAWindow *hostwin;
+        hwaddr pgmask;
+
+        hostwin = vfio_find_hostwin(container, iova, end);
+        if (!hostwin) {
+            return false;
+        }
+
+        pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
+        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
+            return false;
+        }
+    }
+
+    *out_iova = iova;
+    *out_end = int128_get64(llend);
+    return true;
+}
+
+static void vfio_migration_add_mapping(MemoryListener *listener,
+                                       MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer, mappings_listener);
+    hwaddr end = 0;
+    DMAMap map;
+    int ret;
+
+    if (vfio_have_giommu(container)) {
+        vfio_set_migration_error(-EOPNOTSUPP);
+        return;
+    }
+
+    if (!vfio_listener_valid_section(section) ||
+        !vfio_get_section_iova_range(container, section, &map.iova, &end)) {
+        return;
+    }
+
+    map.size = end - map.iova - 1; // IOVATree is inclusive, so subtract 1 from size
+    map.perm = section->readonly ? IOMMU_RO : IOMMU_RW;
+
+    WITH_QEMU_LOCK_GUARD(&container->mappings_mutex) {
+        ret = iova_tree_insert(container->mappings, &map);
+        if (ret) {
+            if (ret == IOVA_ERR_INVALID) {
+                ret = -EINVAL;
+            } else if (ret == IOVA_ERR_OVERLAP) {
+                ret = -EEXIST;
+            }
+        }
+    }
+
+    trace_vfio_migration_mapping_add(map.iova, map.iova + map.size, ret);
+
+    if (ret)
+        vfio_set_migration_error(ret);
+    return;
+}
+
+static void vfio_migration_remove_mapping(MemoryListener *listener,
+                                          MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer, mappings_listener);
+    hwaddr end = 0;
+    DMAMap map;
+
+    if (vfio_have_giommu(container)) {
+        vfio_set_migration_error(-EOPNOTSUPP);
+        return;
+    }
+
+    if (!vfio_listener_valid_section(section) ||
+        !vfio_get_section_iova_range(container, section, &map.iova, &end)) {
+        return;
+    }
+
+    WITH_QEMU_LOCK_GUARD(&container->mappings_mutex) {
+        iova_tree_remove(container->mappings, map);
+    }
+
+    trace_vfio_migration_mapping_del(map.iova, map.iova + map.size);
+}
+
+
+static const MemoryListener vfio_dirty_tracking_listener = {
+    .name = "vfio-migration",
+    .region_add = vfio_migration_add_mapping,
+    .region_del = vfio_migration_remove_mapping,
+};
+
 static void vfio_listener_log_global_start(MemoryListener *listener)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
     int ret;

+    memory_listener_register(&container->mappings_listener, container->space->as);
+
     ret = vfio_set_dirty_page_tracking(container, true);
     if (ret) {
         vfio_set_migration_error(ret);
@@ -1346,6 +1471,8 @@ static void vfio_listener_log_global_stop(MemoryListener *listener)
     if (ret) {
         vfio_set_migration_error(ret);
     }
+
+    memory_listener_unregister(&container->mappings_listener);
 }

 static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
@@ -2172,16 +2299,24 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
     QLIST_INIT(&container->giommu_list);
     QLIST_INIT(&container->hostwin_list);
     QLIST_INIT(&container->vrdl_list);
+    container->mappings = iova_tree_new();
+    if (!container->mappings) {
+        error_setg(errp, "Cannot allocate DMA mappings tree");
+        ret = -ENOMEM;
+        goto free_container_exit;
+    }
+    qemu_mutex_init(&container->mappings_mutex);
+    container->mappings_listener = vfio_dirty_tracking_listener;

     ret = vfio_init_container(container, group->fd, errp);
     if (ret) {
-        goto free_container_exit;
+        goto destroy_mappings_exit;
     }

     ret = vfio_ram_block_discard_disable(container, true);
     if (ret) {
         error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
-        goto free_container_exit;
+        goto destroy_mappings_exit;
     }

     switch (container->iommu_type) {
@@ -2317,6 +2452,10 @@ listener_release_exit:
 enable_discards_exit:
     vfio_ram_block_discard_disable(container, false);

+destroy_mappings_exit:
+    qemu_mutex_destroy(&container->mappings_mutex);
+    iova_tree_destroy(container->mappings);
+
 free_container_exit:
     g_free(container);

@@ -2371,6 +2510,8 @@ static void vfio_disconnect_container(VFIOGroup *group)
         }

         trace_vfio_disconnect_container(container->fd);
+        qemu_mutex_destroy(&container->mappings_mutex);
+        iova_tree_destroy(container->mappings);
         close(container->fd);
         g_free(container);

diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 669d9fe07cd9..c92eaadcc7c4 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -104,6 +104,8 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
 vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
 vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
 vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
+vfio_migration_mapping_add(uint64_t start, uint64_t end, int err) "mapping_add 0x%"PRIx64" - 0x%"PRIx64" err=%d"
+vfio_migration_mapping_del(uint64_t start, uint64_t end) "mapping_del 0x%"PRIx64" - 0x%"PRIx64
 vfio_disconnect_container(int fd) "close container->fd=%d"
 vfio_put_group(int fd) "close group->fd=%d"
 vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 87524c64a443..48951da11ab4 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -23,6 +23,7 @@

 #include "exec/memory.h"
 #include "qemu/queue.h"
+#include "qemu/iova-tree.h"
 #include "qemu/notify.h"
 #include "ui/console.h"
 #include "hw/display/ramfb.h"
@@ -81,6 +82,7 @@ typedef struct VFIOContainer {
     int fd; /* /dev/vfio/vfio, empowered by the attached groups */
     MemoryListener listener;
     MemoryListener prereg_listener;
+    MemoryListener mappings_listener;
     unsigned iommu_type;
     Error *error;
     bool initialized;
@@ -89,6 +91,8 @@ typedef struct VFIOContainer {
     uint64_t max_dirty_bitmap_size;
     unsigned long pgsizes;
     unsigned int dma_max_mappings;
+    IOVATree *mappings;
+    QemuMutex mappings_mutex;
     QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
     QLIST_HEAD(, VFIOGroup) group_list;
--
2.17.2
Joao Martins March 2, 2023, 12:13 a.m. UTC | #10
On 02/03/2023 00:07, Joao Martins wrote:
> On 28/02/2023 20:36, Alex Williamson wrote:

[...]

>> Can we make the same argument that the overhead is negligible if a VM
>> makes use of 10s of GB of virtio-mem with 2MB block size?
>>
>> But then on a 4KB host we're limited to 256 tracking entries, so
>> wasting all that time and space on a runtime IOVATree is even more
>> dubious.
>>
>> In fact, it doesn't really matter that vfio_listener_region_add and
>> this potentially new listener come to the same result, as long as the
>> new listener is a superset of the existing listener. 
> 
> I am trying to put this in a way that's not too ugly to reuse the most between
> vfio_listener_region_add() and the vfio_migration_mapping_add().
> 
> For you to have an idea, here's so far how it looks thus far:
> 
> https://github.com/jpemartins/qemu/commits/vfio-dirty-tracking
> 
> Particularly this one:
> 
> https://github.com/jpemartins/qemu/commit/3b11fa0e4faa0f9c0f42689a7367284a25d1b585
> 
> vfio_get_section_iova_range() is where most of these checks are that are sort of
> a subset of the ones in vfio_listener_region_add().
> 
>> So I think we can
>> simplify out a lot of the places we'd see duplication and bugs.  I'm
>> not even really sure why we wouldn't simplify things further and only
>> record a single range covering the low and high memory marks for a
>> non-vIOMMU VMs, or potentially an approximation removing gaps of 1GB or
>> more, for example.  Thanks,
> 
> Yes, for Qemu, to have one single artificial range with a computed min IOVA and
> max IOVA is the simplest to get it implemented. It would avoid us maintaining an
> IOVATree as you would only track min/max pair (maybe max_below).
> 
> My concern with a reduced single range is 1) big holes in address space leading
> to asking more than you need[*] and then 2) device dirty tracking limits e.g.
> hardware may have upper limits, so you may prematurely exercise those. So giving
> more choice to the vfio drivers to decide how to cope with the mapped address
> space description looks to have a bit more longevity.
> 
> Anyway the temptation with having a single range is that this can all go away if
> the vfio_listener_region_add() tracks just min/max IOVA pair.
> 
> Below scissors mark it's how this patch is looking like in the commit above
> while being a full list of mappings. It's also stored here:
> 
> https://github.com/jpemartins/qemu/commits/vfio-dirty-tracking
> 
> I'll respond here with a patch on what it looks like with the range watermark
> approach.
> 

... Which is here:

https://github.com/jpemartins/qemu/commits/vfio-dirty-tracking-range

And below the scissors mark at the end is this patch in the series. It is
smaller; most of the churn is the new checks. I need to adjust the commit
messages depending on which way the group decides to go, so take those with a
grain of salt.

> 
> [0] AMD 1T boundary is what comes to mind, which on Qemu relocates memory above
> 4G into after 1T.

---------------->8-----------------

From: Joao Martins <joao.m.martins@oracle.com>
Date: Wed, 22 Feb 2023 19:49:05 +0200
Subject: [PATCH wip 7/12] vfio/common: Record DMA mapped IOVA ranges

According to the device DMA logging uAPI, IOVA ranges to be logged by
the device must be provided all at once upon DMA logging start.

As preparation for the following patches which will add device dirty
page tracking, keep a record of all DMA mapped IOVA ranges so later they
can be used for DMA logging start.

Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked.
This is due to the dynamic nature of vIOMMU DMA mapping/unmapping.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
 hw/vfio/common.c              | 110 ++++++++++++++++++++++++++++++++--
 hw/vfio/trace-events          |   1 +
 include/hw/vfio/vfio-common.h |   5 ++
 3 files changed, 112 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 655e8dbb74d4..ff4a2aa0e14b 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -44,6 +44,7 @@
 #include "migration/blocker.h"
 #include "migration/qemu-file.h"
 #include "sysemu/tpm.h"
+#include "qemu/iova-tree.h"

 VFIOGroupList vfio_group_list =
     QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -426,6 +427,11 @@ void vfio_unblock_multiple_devices_migration(void)
     multiple_devices_migration_blocker = NULL;
 }

+static bool vfio_have_giommu(VFIOContainer *container)
+{
+    return !QLIST_EMPTY(&container->giommu_list);
+}
+
 static void vfio_set_migration_error(int err)
 {
     MigrationState *ms = migrate_get_current();
@@ -610,6 +616,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
         .iova = iova,
         .size = size,
     };
+    int ret;

     if (!readonly) {
         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
@@ -626,8 +633,10 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
         return 0;
     }

+    ret = -errno;
     error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
-    return -errno;
+
+    return ret;
 }

 static void vfio_host_win_add(VFIOContainer *container,
@@ -1326,11 +1335,93 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
     return ret;
 }

+static bool vfio_get_section_iova_range(VFIOContainer *container,
+                                        MemoryRegionSection *section,
+                                        hwaddr *out_iova, hwaddr *out_end)
+{
+    Int128 llend, llsize;
+    hwaddr iova, end;
+
+    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
+    llend = int128_make64(section->offset_within_address_space);
+    llend = int128_add(llend, section->size);
+    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
+
+    if (int128_ge(int128_make64(iova), llend)) {
+        return false;
+    }
+    end = int128_get64(int128_sub(llend, int128_one()));
+
+    if (memory_region_is_iommu(section->mr) ||
+        memory_region_has_ram_discard_manager(section->mr)) {
+        return false;
+    }
+
+    llsize = int128_sub(llend, int128_make64(iova));
+
+    if (memory_region_is_ram_device(section->mr)) {
+        VFIOHostDMAWindow *hostwin;
+        hwaddr pgmask;
+
+        hostwin = vfio_find_hostwin(container, iova, end);
+        if (!hostwin) {
+            return false;
+        }
+
+        pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
+        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
+            return false;
+        }
+    }
+
+    *out_iova = iova;
+    *out_end = int128_get64(llend);
+    return true;
+}
+
+static void vfio_dma_tracking_update(MemoryListener *listener,
+                                     MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer, mappings_listener);
+    hwaddr iova, end;
+
+    if (vfio_have_giommu(container)) {
+        vfio_set_migration_error(-EOPNOTSUPP);
+        return;
+    }
+
+    if (!vfio_listener_valid_section(section) ||
+        !vfio_get_section_iova_range(container, section, &iova, &end)) {
+        return;
+    }
+
+    WITH_QEMU_LOCK_GUARD(&container->mappings_mutex) {
+        if (container->min_tracking_iova > iova) {
+            container->min_tracking_iova = iova;
+        }
+        if (container->max_tracking_iova < end) {
+            container->max_tracking_iova = end;
+        }
+    }
+
+    trace_vfio_dma_tracking_update(iova, end,
+                                   container->min_tracking_iova,
+                                   container->max_tracking_iova);
+    return;
+}
+
+static const MemoryListener vfio_dirty_tracking_listener = {
+    .name = "vfio-tracking",
+    .region_add = vfio_dma_tracking_update,
+};
+
 static void vfio_listener_log_global_start(MemoryListener *listener)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
     int ret;

+    memory_listener_register(&container->mappings_listener, container->space->as);
+
     ret = vfio_set_dirty_page_tracking(container, true);
     if (ret) {
         vfio_set_migration_error(ret);
@@ -1346,6 +1437,13 @@ static void vfio_listener_log_global_stop(MemoryListener *listener)
     if (ret) {
         vfio_set_migration_error(ret);
     }
+
+    memory_listener_unregister(&container->mappings_listener);
+
+    WITH_QEMU_LOCK_GUARD(&container->mappings_mutex) {
+        container->min_tracking_iova = 0;
+        container->max_tracking_iova = 0;
+    }
 }

 static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
@@ -2172,16 +2270,18 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
     QLIST_INIT(&container->giommu_list);
     QLIST_INIT(&container->hostwin_list);
     QLIST_INIT(&container->vrdl_list);
+    qemu_mutex_init(&container->mappings_mutex);
+    container->mappings_listener = vfio_dirty_tracking_listener;

     ret = vfio_init_container(container, group->fd, errp);
     if (ret) {
-        goto free_container_exit;
+        goto destroy_mappings_exit;
     }

     ret = vfio_ram_block_discard_disable(container, true);
     if (ret) {
         error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
-        goto free_container_exit;
+        goto destroy_mappings_exit;
     }

     switch (container->iommu_type) {
@@ -2317,7 +2417,8 @@ listener_release_exit:
 enable_discards_exit:
     vfio_ram_block_discard_disable(container, false);

-free_container_exit:
+destroy_mappings_exit:
+    qemu_mutex_destroy(&container->mappings_mutex);
     g_free(container);

 close_fd_exit:
@@ -2371,6 +2472,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
         }

         trace_vfio_disconnect_container(container->fd);
+        qemu_mutex_destroy(&container->mappings_mutex);
         close(container->fd);
         g_free(container);

diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 669d9fe07cd9..8591f660595b 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -104,6 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
 vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
 vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
 vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
+vfio_dma_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "tracking_update 0x%"PRIx64" - 0x%"PRIx64" -> [0x%"PRIx64" - 0x%"PRIx64"]"
 vfio_disconnect_container(int fd) "close container->fd=%d"
 vfio_put_group(int fd) "close group->fd=%d"
 vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 87524c64a443..bb54f204ab8b 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -23,6 +23,7 @@

 #include "exec/memory.h"
 #include "qemu/queue.h"
+#include "qemu/iova-tree.h"
 #include "qemu/notify.h"
 #include "ui/console.h"
 #include "hw/display/ramfb.h"
@@ -81,6 +82,7 @@ typedef struct VFIOContainer {
     int fd; /* /dev/vfio/vfio, empowered by the attached groups */
     MemoryListener listener;
     MemoryListener prereg_listener;
+    MemoryListener mappings_listener;
     unsigned iommu_type;
     Error *error;
     bool initialized;
@@ -89,6 +91,9 @@ typedef struct VFIOContainer {
     uint64_t max_dirty_bitmap_size;
     unsigned long pgsizes;
     unsigned int dma_max_mappings;
+    hwaddr min_tracking_iova;
+    hwaddr max_tracking_iova;
+    QemuMutex mappings_mutex;
     QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
     QLIST_HEAD(, VFIOGroup) group_list;
--
2.17.2
Alex Williamson March 2, 2023, 6:42 p.m. UTC | #11
On Thu, 2 Mar 2023 00:07:35 +0000
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 28/02/2023 20:36, Alex Williamson wrote:
> > On Tue, 28 Feb 2023 12:11:06 +0000
> > Joao Martins <joao.m.martins@oracle.com> wrote:  
> >> On 23/02/2023 21:50, Alex Williamson wrote:  
> >>> On Thu, 23 Feb 2023 21:19:12 +0000
> >>> Joao Martins <joao.m.martins@oracle.com> wrote:    
> >>>> On 23/02/2023 21:05, Alex Williamson wrote:    
> >>>>> On Thu, 23 Feb 2023 10:37:10 +0000
> >>>>> Joao Martins <joao.m.martins@oracle.com> wrote:      
> >>>>>> On 22/02/2023 22:10, Alex Williamson wrote:      
> >>>>>>> On Wed, 22 Feb 2023 19:49:05 +0200
> >>>>>>> Avihai Horon <avihaih@nvidia.com> wrote:        
> >>>>>>>> From: Joao Martins <joao.m.martins@oracle.com>
> >>>>>>>> @@ -612,6 +665,16 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
> >>>>>>>>          .iova = iova,
> >>>>>>>>          .size = size,
> >>>>>>>>      };
> >>>>>>>> +    int ret;
> >>>>>>>> +
> >>>>>>>> +    ret = vfio_record_mapping(container, iova, size, readonly);
> >>>>>>>> +    if (ret) {
> >>>>>>>> +        error_report("vfio: Failed to record mapping, iova: 0x%" HWADDR_PRIx
> >>>>>>>> +                     ", size: 0x" RAM_ADDR_FMT ", ret: %d (%s)",
> >>>>>>>> +                     iova, size, ret, strerror(-ret));
> >>>>>>>> +
> >>>>>>>> +        return ret;
> >>>>>>>> +    }        
> >>>>>>>
> >>>>>>> Is there no way to replay the mappings when a migration is started?
> >>>>>>> This seems like a horrible latency and bloat trade-off for the
> >>>>>>> possibility that the VM might migrate and the device might support
> >>>>>>> these features.  Our performance with vIOMMU is already terrible, I
> >>>>>>> can't help but believe this makes it worse.  Thanks,
> >>>>>>>         
> >>>>>>
> >>>>>> It is a nop if the vIOMMU is being used (entries in container->giommu_list) as
> >>>>>> that uses a max-iova based IOVA range. So this is really for iommu identity
> >>>>>> mapping and no-VIOMMU.      
> >>>>>
> >>>>> Ok, yes, there are no mappings recorded for any containers that have a
> >>>>> non-empty giommu_list.
> >>>>>       
> >>>>>> We could replay them if they were tracked/stored anywhere.      
> >>>>>
> >>>>> Rather than piggybacking on vfio_memory_listener, why not simply
> >>>>> register a new MemoryListener when migration is started?  That will
> >>>>> replay all the existing ranges and allow tracking to happen separate
> >>>>> from mapping, and only when needed.  
> >>>>
> >>>> The problem with that is that *starting* dirty tracking needs to have all the
> >>>> range, we aren't supposed to start each range separately. So on a memory
> >>>> listener callback you don't have introspection when you are dealing with the
> >>>> last range, do we?    
> >>>
> >>> As soon as memory_listener_register() returns, all your callbacks to
> >>> build the IOVATree have been called and you can act on the result the
> >>> same as if you were relying on the vfio mapping MemoryListener.  I'm
> >>> not seeing the problem.  Thanks,
> >>>     
> >>
> >> While doing these changes, the nice thing of the current patch is that whatever
> >> changes apply to vfio_listener_region_add() will be reflected in the mappings
> >> tree that stores what we will dirty track. If we move the mappings calculation
> >> necessary for dirty tracking only when we start, we will have to duplicate the
> >> same checks, and open for bugs where we ask things to be dirty track-ed that
> >> haven't been DMA mapped. These two aren't necessarily tied, but felt like I
> >> should raise the potentially duplication of the checks (and the same thing
> >> applies for handling virtio-mem and what not).
> >>
> >> I understand that if we were going to store *a lot* of mappings that this would
> >> add up in space requirements. But for no-vIOMMU (or iommu=pt) case this is only
> >> about 12ranges or so, it is much simpler to piggyback the existing listener.
> >> Would you still want to move this to its own dedicated memory listener?  
> > 
> > Code duplication and bugs are good points, but while typically we're
> > only seeing a few handfuls of ranges, doesn't virtio-mem in particular
> > allow that we could be seeing quite a lot more?
> >   
> Ugh yes, it could be.
> 
> > We used to be limited to a fairly small number of KVM memory slots,
> > which effectively bounded non-vIOMMU DMA mappings, but that value is
> > now 2^15, so we need to anticipate that we could see many more than a
> > dozen mappings.
> >   
> 
> Even with 32k memory slots today we are still reduced on a handful. hv-balloon
> and virtio-mem approaches though are the ones that may stress such limit IIUC
> prior to starting migration.
> 
> > Can we make the same argument that the overhead is negligible if a VM
> > makes use of 10s of GB of virtio-mem with 2MB block size?
> > 
> > But then on a 4KB host we're limited to 256 tracking entries, so
> > wasting all that time and space on a runtime IOVATree is even more
> > dubious.
> >
> > In fact, it doesn't really matter that vfio_listener_region_add and
> > this potentially new listener come to the same result, as long as the
> > new listener is a superset of the existing listener.   
> 
> I am trying to put this in a way that's not too ugly to reuse the most between
> vfio_listener_region_add() and the vfio_migration_mapping_add().
> 
> For you to have an idea, here's so far how it looks thus far:
> 
> https://github.com/jpemartins/qemu/commits/vfio-dirty-tracking
> 
> Particularly this one:
> 
> https://github.com/jpemartins/qemu/commit/3b11fa0e4faa0f9c0f42689a7367284a25d1b585
> 
> vfio_get_section_iova_range() is where most of these checks are that are sort of
> a subset of the ones in vfio_listener_region_add().
> 
> > So I think we can
> > simplify out a lot of the places we'd see duplication and bugs.  I'm
> > not even really sure why we wouldn't simplify things further and only
> > record a single range covering the low and high memory marks for a
> > non-vIOMMU VMs, or potentially an approximation removing gaps of 1GB or
> > more, for example.  Thanks,  
> 
> Yes, for Qemu, to have one single artificial range with a computed min IOVA and
> max IOVA is the simplest to get it implemented. It would avoid us maintaining an
> IOVATree as you would only track min/max pair (maybe max_below).
> 
> My concern with a reduced single range is 1) big holes in address space leading
> to asking more than you need[*] and then 2) device dirty tracking limits e.g.
> hardware may have upper limits, so you may prematurely exercise those. So giving
> more choice to the vfio drivers to decide how to cope with the mapped address
> space description looks to have a bit more longevity.

The fact that we don't know anything about the device dirty tracking
limits worries me.  If QEMU reports the VM is migratable, ie. lacking
migration blockers, then we really shouldn't have non-extraordinary
things like the VM actually having a bigger address space than the
device can support, or enabling a vIOMMU, suddenly make the VM
non-migratable.

If we only needed to worry about scenarios like the AMD hypertransport
memory relocation, then tracking ranges for 32-bit and 64-bit RAM
separately would be an easy solution, we always have 1-2 ranges for the
device to track.  That's still a big simplification from tracking every
DMA mapping.

AIUI, we really can't even rely on the device supporting a full host
page size worth of mappings, the uAPI only stipulates that the core
kernel code will support such a request.  So it seems prudent that
userspace should conserve entries wherever it can.  For the
alternative, to provide ranges that closely match actual mappings, I
think we'd need to be able to collapse IOVATree entries with the
smallest gap when we reach the limit, and continue to collapse each time
the driver rejects the number of ranges provided.  That's obviously
much more complicated and I'd prefer to avoid it if there are easier
approximations.
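
For illustration only, the collapse step I have in mind over a plain sorted,
non-overlapping array of ranges (rough sketch, untested; not the actual
IOVATree API, names and types are made up):

/*
 * Sketch only: ranges[] is sorted by start and non-overlapping; merge the
 * pair separated by the smallest gap until we fit the device's limit.
 */
typedef struct {
    uint64_t start;
    uint64_t end;   /* inclusive */
} DirtyRange;

static void collapse_ranges(DirtyRange *ranges, unsigned int *nr,
                            unsigned int limit)
{
    while (*nr > limit && *nr > 1) {
        unsigned int i, best = 0;
        uint64_t best_gap = UINT64_MAX;

        for (i = 0; i + 1 < *nr; i++) {
            uint64_t gap = ranges[i + 1].start - ranges[i].end;

            if (gap < best_gap) {
                best_gap = gap;
                best = i;
            }
        }
        /* swallow the gap between best and best + 1 */
        ranges[best].end = ranges[best + 1].end;
        memmove(&ranges[best + 1], &ranges[best + 2],
                (*nr - best - 2) * sizeof(*ranges));
        (*nr)--;
    }
}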

> Anyway the temptation with having a single range is that this can all go away if
> the vfio_listener_region_add() tracks just min/max IOVA pair.
> 
> Below scissors mark it's how this patch is looking like in the commit above
> while being a full list of mappings. It's also stored here:
> 
> https://github.com/jpemartins/qemu/commits/vfio-dirty-tracking
> 
> I'll respond here with a patch on what it looks like with the range watermark
> approach.
> 
> 	Joao
> 
> [0] AMD 1T boundary is what comes to mind, which on Qemu relocates memory above
> 4G into after 1T.
> 
> ------------------>8--------------------  
> 
> From: Joao Martins <joao.m.martins@oracle.com>
> Date: Wed, 22 Feb 2023 19:49:05 +0200
> Subject: [PATCH wip 7/12] vfio/common: Record DMA mapped IOVA ranges
> 
> According to the device DMA logging uAPI, IOVA ranges to be logged by
> the device must be provided all at once upon DMA logging start.
> 
> As preparation for the following patches which will add device dirty
> page tracking, keep a record of all DMA mapped IOVA ranges so later they
> can be used for DMA logging start.
> 
> Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked.
> This is due to the dynamic nature of vIOMMU DMA mapping/unmapping.
> 
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> ---
>  hw/vfio/common.c              | 147 +++++++++++++++++++++++++++++++++-
>  hw/vfio/trace-events          |   2 +
>  include/hw/vfio/vfio-common.h |   4 +
>  3 files changed, 150 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 655e8dbb74d4..17971e6dbaeb 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -44,6 +44,7 @@
>  #include "migration/blocker.h"
>  #include "migration/qemu-file.h"
>  #include "sysemu/tpm.h"
> +#include "qemu/iova-tree.h"
> 
>  VFIOGroupList vfio_group_list =
>      QLIST_HEAD_INITIALIZER(vfio_group_list);
> @@ -426,6 +427,11 @@ void vfio_unblock_multiple_devices_migration(void)
>      multiple_devices_migration_blocker = NULL;
>  }
> 
> +static bool vfio_have_giommu(VFIOContainer *container)
> +{
> +    return !QLIST_EMPTY(&container->giommu_list);
> +}

I think it's the case, but can you confirm we build the giommu_list
regardless of whether the vIOMMU is actually enabled?

> +
>  static void vfio_set_migration_error(int err)
>  {
>      MigrationState *ms = migrate_get_current();
> @@ -610,6 +616,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>          .iova = iova,
>          .size = size,
>      };
> +    int ret;
> 
>      if (!readonly) {
>          map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
> @@ -626,8 +633,10 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>          return 0;
>      }
> 
> +    ret = -errno;
>      error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
> -    return -errno;
> +
> +    return ret;
>  }
> 
>  static void vfio_host_win_add(VFIOContainer *container,
> @@ -1326,11 +1335,127 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
>      return ret;
>  }
> 
> +static bool vfio_get_section_iova_range(VFIOContainer *container,
> +                                        MemoryRegionSection *section,
> +                                        hwaddr *out_iova, hwaddr *out_end)
> +{
> +    Int128 llend, llsize;
> +    hwaddr iova, end;
> +
> +    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
> +    llend = int128_make64(section->offset_within_address_space);
> +    llend = int128_add(llend, section->size);
> +    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
> +
> +    if (int128_ge(int128_make64(iova), llend)) {
> +        return false;
> +    }
> +    end = int128_get64(int128_sub(llend, int128_one()));
> +
> +    if (memory_region_is_iommu(section->mr) ||

Shouldn't there already be a migration blocker in place preventing this
from being possible?

> +        memory_region_has_ram_discard_manager(section->mr)) {

Are we claiming not to support virtio-mem VMs as well?  The current
comment in vfio/common.c that states we only want to map actually
populated parts seems like it doesn't apply here, we'd want dirty
tracking ranges to include these regardless.  Unless there's some
reason virtio-mem changes are blocked during pre-copy.

> +	return false;
> +    }
> +
> +    llsize = int128_sub(llend, int128_make64(iova));
> +
> +    if (memory_region_is_ram_device(section->mr)) {
> +        VFIOHostDMAWindow *hostwin;
> +        hwaddr pgmask;
> +
> +        hostwin = vfio_find_hostwin(container, iova, end);
> +        if (!hostwin) {
> +            return false;
> +        }
> +
> +	pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
> +        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
> +            return false;
> +        }
> +    }

ram_device is intended to be an address range on another device, so do
we really need it in DMA dirty tracking?  ex. we don't include device
BARs in the dirty bitmap, we expect modified device state to be
reported by the device, so it seems like there's no case where we'd
include this in the device dirty tracking ranges.

> +
> +    *out_iova = iova;
> +    *out_end = int128_get64(llend);
> +    return true;
> +}
> +
> +static void vfio_migration_add_mapping(MemoryListener *listener,
> +                                       MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer, mappings_listener);
> +    hwaddr end = 0;
> +    DMAMap map;
> +    int ret;
> +
> +    if (vfio_have_giommu(container)) {
> +        vfio_set_migration_error(-EOPNOTSUPP);

There should be a migration blocker that prevents this from ever being
called in this case.

> +        return;
> +    }
> +
> +    if (!vfio_listener_valid_section(section) ||
> +        !vfio_get_section_iova_range(container, section, &map.iova, &end)) {
> +        return;
> +    }
> +
> +    map.size = end - map.iova - 1; // IOVATree is inclusive, so subtract 1 from size
> +    map.perm = section->readonly ? IOMMU_RO : IOMMU_RW;
> +
> +    WITH_QEMU_LOCK_GUARD(&container->mappings_mutex) {
> +        ret = iova_tree_insert(container->mappings, &map);
> +        if (ret) {
> +            if (ret == IOVA_ERR_INVALID) {
> +                ret = -EINVAL;
> +            } else if (ret == IOVA_ERR_OVERLAP) {
> +                ret = -EEXIST;
> +            }
> +        }
> +    }
> +
> +    trace_vfio_migration_mapping_add(map.iova, map.iova + map.size, ret);
> +
> +    if (ret)
> +        vfio_set_migration_error(ret);
> +    return;
> +}
> +
> +static void vfio_migration_remove_mapping(MemoryListener *listener,
> +                                          MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer, mappings_listener);
> +    hwaddr end = 0;
> +    DMAMap map;
> +
> +    if (vfio_have_giommu(container)) {
> +        vfio_set_migration_error(-EOPNOTSUPP);
> +        return;
> +    }
> +
> +    if (!vfio_listener_valid_section(section) ||
> +        !vfio_get_section_iova_range(container, section, &map.iova, &end)) {
> +        return;
> +    }
> +
> +    WITH_QEMU_LOCK_GUARD(&container->mappings_mutex) {
> +        iova_tree_remove(container->mappings, map);
> +    }
> +
> +    trace_vfio_migration_mapping_del(map.iova, map.iova + map.size);
> +}

Why do we need a region_del callback?  We don't support modifying the
dirty tracking ranges we've provided to the device.

> +
> +
> +static const MemoryListener vfio_dirty_tracking_listener = {
> +    .name = "vfio-migration",
> +    .region_add = vfio_migration_add_mapping,
> +    .region_del = vfio_migration_remove_mapping,
> +};
> +
>  static void vfio_listener_log_global_start(MemoryListener *listener)
>  {
>      VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>      int ret;
> 
> +    memory_listener_register(&container->mappings_listener, container->space->as);
> +
>      ret = vfio_set_dirty_page_tracking(container, true);
>      if (ret) {
>          vfio_set_migration_error(ret);
> @@ -1346,6 +1471,8 @@ static void vfio_listener_log_global_stop(MemoryListener *listener)
>      if (ret) {
>          vfio_set_migration_error(ret);
>      }
> +
> +    memory_listener_unregister(&container->mappings_listener);

We don't have a way to update dirty tracking ranges for a device once
dirty tracking is enabled, so what's the point of this listener running
in more than a one-shot mode?  The only purpose of a listener
continuing to run seems like it would be to generate an error for
untracked ranges and either generate a migration error or mark them
perpetually dirty.
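
i.e. something along these lines (rough, untested sketch; the function name is
made up, the listener field and address space come from the patch):

/*
 * One-shot idea: register only to replay the existing ranges, act on the
 * result, then drop the listener again -- nothing needs to keep running.
 */
static void vfio_dirty_tracking_init(VFIOContainer *container)
{
    memory_listener_register(&container->mappings_listener,
                             container->space->as);
    /*
     * By the time memory_listener_register() returns, region_add has been
     * called for every existing section, so the recorded ranges are
     * complete here and can be handed to DMA logging start.
     */
    memory_listener_unregister(&container->mappings_listener);
}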

>  }
> 
>  static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
> @@ -2172,16 +2299,24 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
>      QLIST_INIT(&container->giommu_list);
>      QLIST_INIT(&container->hostwin_list);
>      QLIST_INIT(&container->vrdl_list);
> +    container->mappings = iova_tree_new();
> +    if (!container->mappings) {
> +        error_setg(errp, "Cannot allocate DMA mappings tree");
> +        ret = -ENOMEM;
> +        goto free_container_exit;
> +    }
> +    qemu_mutex_init(&container->mappings_mutex);
> +    container->mappings_listener = vfio_dirty_tracking_listener;

This all seems like code that would only be necessary before starting
the listener.

> 
>      ret = vfio_init_container(container, group->fd, errp);
>      if (ret) {
> -        goto free_container_exit;
> +        goto destroy_mappings_exit;
>      }
> 
>      ret = vfio_ram_block_discard_disable(container, true);
>      if (ret) {
>          error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
> -        goto free_container_exit;
> +        goto destroy_mappings_exit;
>      }
> 
>      switch (container->iommu_type) {
> @@ -2317,6 +2452,10 @@ listener_release_exit:
>  enable_discards_exit:
>      vfio_ram_block_discard_disable(container, false);
> 
> +destroy_mappings_exit:
> +    qemu_mutex_destroy(&container->mappings_mutex);
> +    iova_tree_destroy(container->mappings);
> +
>  free_container_exit:
>      g_free(container);
> 
> @@ -2371,6 +2510,8 @@ static void vfio_disconnect_container(VFIOGroup *group)
>          }
> 
>          trace_vfio_disconnect_container(container->fd);
> +        qemu_mutex_destroy(&container->mappings_mutex);
> +        iova_tree_destroy(container->mappings);

The IOVATree should be destroyed as soon as we're done processing the
result upon starting logging.  It serves no purpose to keep it around.

Comparing with the follow-up, setting {min,max}_tracking_iova, many of
the same comments apply.  Both of these are only preparing for the
question of what do we actually do with this data.  In the IOVATree
approach, we have more fine grained information, but we can also exceed
what the device supports and we need to be able to handle that.  If our
fallback is to simply identify the min and max based on the IOVATree,
and we expect that to work better than the more granular approach, why
not start with just min/max?  If we expect there's value to the more
granular approach, then why not proceed to collapse the IOVATree until
we find a set of ranges the device can support?  Thanks,

Alex

>          close(container->fd);
>          g_free(container);
> 
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 669d9fe07cd9..c92eaadcc7c4 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -104,6 +104,8 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
>  vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
>  vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
>  vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
> +vfio_migration_mapping_add(uint64_t start, uint64_t end, int err) "mapping_add 0x%"PRIx64" - 0x%"PRIx64" err=%d"
> +vfio_migration_mapping_del(uint64_t start, uint64_t end) "mapping_del 0x%"PRIx64" - 0x%"PRIx64
>  vfio_disconnect_container(int fd) "close container->fd=%d"
>  vfio_put_group(int fd) "close group->fd=%d"
>  vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 87524c64a443..48951da11ab4 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -23,6 +23,7 @@
> 
>  #include "exec/memory.h"
>  #include "qemu/queue.h"
> +#include "qemu/iova-tree.h"
>  #include "qemu/notify.h"
>  #include "ui/console.h"
>  #include "hw/display/ramfb.h"
> @@ -81,6 +82,7 @@ typedef struct VFIOContainer {
>      int fd; /* /dev/vfio/vfio, empowered by the attached groups */
>      MemoryListener listener;
>      MemoryListener prereg_listener;
> +    MemoryListener mappings_listener;
>      unsigned iommu_type;
>      Error *error;
>      bool initialized;
> @@ -89,6 +91,8 @@ typedef struct VFIOContainer {
>      uint64_t max_dirty_bitmap_size;
>      unsigned long pgsizes;
>      unsigned int dma_max_mappings;
> +    IOVATree *mappings;
> +    QemuMutex mappings_mutex;
>      QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>      QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
>      QLIST_HEAD(, VFIOGroup) group_list;
> --
> 2.17.2
>
Joao Martins March 3, 2023, 12:19 a.m. UTC | #12
On 02/03/2023 18:42, Alex Williamson wrote:
> On Thu, 2 Mar 2023 00:07:35 +0000
> Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 28/02/2023 20:36, Alex Williamson wrote:
>>> On Tue, 28 Feb 2023 12:11:06 +0000
>>> Joao Martins <joao.m.martins@oracle.com> wrote:  
>>>> On 23/02/2023 21:50, Alex Williamson wrote:  
>>>>> On Thu, 23 Feb 2023 21:19:12 +0000
>>>>> Joao Martins <joao.m.martins@oracle.com> wrote:    
>>>>>> On 23/02/2023 21:05, Alex Williamson wrote:    
>>>>>>> On Thu, 23 Feb 2023 10:37:10 +0000
>>>>>>> Joao Martins <joao.m.martins@oracle.com> wrote:      
>>>>>>>> On 22/02/2023 22:10, Alex Williamson wrote:      
>>>>>>>>> On Wed, 22 Feb 2023 19:49:05 +0200
>>>>>>>>> Avihai Horon <avihaih@nvidia.com> wrote:        
>>>>>>>>>> From: Joao Martins <joao.m.martins@oracle.com>
>>>>>>>>>> @@ -612,6 +665,16 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>>>>>>>>>>          .iova = iova,
>>>>>>>>>>          .size = size,
>>>>>>>>>>      };
>>>>>>>>>> +    int ret;
>>>>>>>>>> +
>>>>>>>>>> +    ret = vfio_record_mapping(container, iova, size, readonly);
>>>>>>>>>> +    if (ret) {
>>>>>>>>>> +        error_report("vfio: Failed to record mapping, iova: 0x%" HWADDR_PRIx
>>>>>>>>>> +                     ", size: 0x" RAM_ADDR_FMT ", ret: %d (%s)",
>>>>>>>>>> +                     iova, size, ret, strerror(-ret));
>>>>>>>>>> +
>>>>>>>>>> +        return ret;
>>>>>>>>>> +    }        
>>>>>>>>>
>>>>>>>>> Is there no way to replay the mappings when a migration is started?
>>>>>>>>> This seems like a horrible latency and bloat trade-off for the
>>>>>>>>> possibility that the VM might migrate and the device might support
>>>>>>>>> these features.  Our performance with vIOMMU is already terrible, I
>>>>>>>>> can't help but believe this makes it worse.  Thanks,
>>>>>>>>>         
>>>>>>>>
>>>>>>>> It is a nop if the vIOMMU is being used (entries in container->giommu_list) as
>>>>>>>> that uses a max-iova based IOVA range. So this is really for iommu identity
>>>>>>>> mapping and no-VIOMMU.      
>>>>>>>
>>>>>>> Ok, yes, there are no mappings recorded for any containers that have a
>>>>>>> non-empty giommu_list.
>>>>>>>       
>>>>>>>> We could replay them if they were tracked/stored anywhere.      
>>>>>>>
>>>>>>> Rather than piggybacking on vfio_memory_listener, why not simply
>>>>>>> register a new MemoryListener when migration is started?  That will
>>>>>>> replay all the existing ranges and allow tracking to happen separate
>>>>>>> from mapping, and only when needed.  
>>>>>>
>>>>>> The problem with that is that *starting* dirty tracking needs to have all the
>>>>>> range, we aren't supposed to start each range separately. So on a memory
>>>>>> listener callback you don't have introspection when you are dealing with the
>>>>>> last range, do we?    
>>>>>
>>>>> As soon as memory_listener_register() returns, all your callbacks to
>>>>> build the IOVATree have been called and you can act on the result the
>>>>> same as if you were relying on the vfio mapping MemoryListener.  I'm
>>>>> not seeing the problem.  Thanks,
>>>>>     
>>>>
>>>> While doing these changes, the nice thing of the current patch is that whatever
>>>> changes apply to vfio_listener_region_add() will be reflected in the mappings
>>>> tree that stores what we will dirty track. If we move the mappings calculation
>>>> necessary for dirty tracking only when we start, we will have to duplicate the
>>>> same checks, and open for bugs where we ask things to be dirty track-ed that
>>>> haven't been DMA mapped. These two aren't necessarily tied, but felt like I
>>>> should raise the potentially duplication of the checks (and the same thing
>>>> applies for handling virtio-mem and what not).
>>>>
>>>> I understand that if we were going to store *a lot* of mappings that this would
>>>> add up in space requirements. But for no-vIOMMU (or iommu=pt) case this is only
>>>> about 12ranges or so, it is much simpler to piggyback the existing listener.
>>>> Would you still want to move this to its own dedicated memory listener?  
>>>
>>> Code duplication and bugs are good points, but while typically we're
>>> only seeing a few handfuls of ranges, doesn't virtio-mem in particular
>>> allow that we could be seeing quite a lot more?
>>>   
>> Ugh yes, it could be.
>>
>>> We used to be limited to a fairly small number of KVM memory slots,
>>> which effectively bounded non-vIOMMU DMA mappings, but that value is
>>> now 2^15, so we need to anticipate that we could see many more than a
>>> dozen mappings.
>>>   
>>
>> Even with 32k memory slots today we are still reduced on a handful. hv-balloon
>> and virtio-mem approaches though are the ones that may stress such limit IIUC
>> prior to starting migration.
>>
>>> Can we make the same argument that the overhead is negligible if a VM
>>> makes use of 10s of GB of virtio-mem with 2MB block size?
>>>
>>> But then on a 4KB host we're limited to 256 tracking entries, so
>>> wasting all that time and space on a runtime IOVATree is even more
>>> dubious.
>>>
>>> In fact, it doesn't really matter that vfio_listener_region_add and
>>> this potentially new listener come to the same result, as long as the
>>> new listener is a superset of the existing listener.   
>>
>> I am trying to put this in a way that's not too ugly to reuse the most between
>> vfio_listener_region_add() and the vfio_migration_mapping_add().
>>
>> For you to have an idea, here's so far how it looks thus far:
>>
>> https://github.com/jpemartins/qemu/commits/vfio-dirty-tracking
>>
>> Particularly this one:
>>
>> https://github.com/jpemartins/qemu/commit/3b11fa0e4faa0f9c0f42689a7367284a25d1b585
>>
>> vfio_get_section_iova_range() is where most of these checks are that are sort of
>> a subset of the ones in vfio_listener_region_add().
>>
>>> So I think we can
>>> simplify out a lot of the places we'd see duplication and bugs.  I'm
>>> not even really sure why we wouldn't simplify things further and only
>>> record a single range covering the low and high memory marks for a
>>> non-vIOMMU VM, or potentially an approximation removing gaps of 1GB or
>>> more, for example.  Thanks,  
>>
>> Yes, for Qemu, to have one single artificial range with a computed min IOVA and
>> max IOVA is the simplest to get it implemented. It would avoid us maintaining an
>> IOVATree as you would only track min/max pair (maybe max_below).
>>
>> My concern with a reduced single range is 1) big holes in address space leading
>> to asking more than you need[*] and then 2) device dirty tracking limits e.g.
>> hardware may have upper limits, so you may prematurely exercise those. So giving
>> more choice to the vfio drivers to decide how to cope with the mapped address
>> space description looks to have a bit more longevity.
> 
> The fact that we don't know anything about the device dirty tracking
> limits worries me.  If QEMU reports the VM is migratable, ie. lacking
> migration blockers, then we really shouldn't have non-extraordinary
> things like the VM actually having a bigger address space than the
> device can support, or enabling a vIOMMU, suddenly make the VM
> non-migratable.
> 
Makes sense.

> If we only needed to worry about scenarios like the AMD hypertransport
> memory relocation, 

So far that's my major concern, considering it's still a 1T gap.

> then tracking ranges for 32-bit and 64-bit RAM
> separately would be an easy solution, we always have 1-2 ranges for the
> device to track.  That's still a big simplification from tracking every
> DMA mappings.
> 
I was too stuck on the naming in the PC code and thought it would be too
x86-specific. But in reality we would just be covering 32-bit and 64-bit
limits, so it should cover everybody without being specific to a target.

The kernel (and device) will ultimately take that info and adjust it to its
own limits, splitting if needed, as long as it can track what's requested.

That should work, and greatly simplify things here as you say. And later on
for vIOMMU we can expand the max limit to cover the 39-bit/48-bit Intel max
(or whatever equivalent max another IOMMU model defines).
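
Roughly what I have in mind (sketch only, untested; struct and function names
are provisional, and a section is assumed not to straddle the 4G boundary):

typedef struct VFIODirtyRanges {
    hwaddr min32;
    hwaddr max32;
    hwaddr min64;
    hwaddr max64;
} VFIODirtyRanges;

/*
 * min fields are expected to start at HWADDR_MAX and max fields at 0, so
 * the first section to land in a bucket wins.
 */
static void vfio_dirty_tracking_update(VFIODirtyRanges *range,
                                       hwaddr iova, hwaddr end)
{
    /* Bucket the section into the 32-bit or the 64-bit watermark pair. */
    hwaddr *min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
    hwaddr *max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;

    if (*min > iova) {
        *min = iova;
    }
    if (*max < end) {
        *max = end;
    }
}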

> AIUI, we really can't even rely on the device supporting a full host
> page size worth of mappings, the uAPI only stipulates that the core
> kernel code will support such a request. 

Yes. Limited to LOG_MAX_RANGES, which is 256 ranges IIRC.

> So it seems prudent that
> userspace should conserve entries wherever it can.

Indeed.

> For the
> alternative, to provide ranges that closely match actual mappings, I
> think we'd need to be able to collapse IOVATree entries with the
> smallest gap when we reach the limit, and continue to collapse each time
> the driver rejects the number of ranges provided.  That's obviously
> much more complicated and I'd prefer to avoid it if there are easier
> approximations.
>

I'm starting to like the min/max {32,64} range limit approach, given the much
lower runtime overhead and complexity to maintain [while supporting the same
things].

>> Anyway the temptation with having a single range is that this can all go away if
>> the vfio_listener_region_add() tracks just min/max IOVA pair.
>>
>> Below scissors mark it's how this patch is looking like in the commit above
>> while being a full list of mappings. It's also stored here:
>>
>> https://github.com/jpemartins/qemu/commits/vfio-dirty-tracking
>>
>> I'll respond here with a patch on what it looks like with the range watermark
>> approach.
>>
>> 	Joao
>>
>> [0] AMD 1T boundary is what comes to mind, which on Qemu relocates memory above
>> 4G into after 1T.
>>
>> ------------------>8--------------------  
>>
>> From: Joao Martins <joao.m.martins@oracle.com>
>> Date: Wed, 22 Feb 2023 19:49:05 +0200
>> Subject: [PATCH wip 7/12] vfio/common: Record DMA mapped IOVA ranges
>>
>> According to the device DMA logging uAPI, IOVA ranges to be logged by
>> the device must be provided all at once upon DMA logging start.
>>
>> As preparation for the following patches which will add device dirty
>> page tracking, keep a record of all DMA mapped IOVA ranges so later they
>> can be used for DMA logging start.
>>
>> Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked.
>> This is due to the dynamic nature of vIOMMU DMA mapping/unmapping.
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>> ---
>>  hw/vfio/common.c              | 147 +++++++++++++++++++++++++++++++++-
>>  hw/vfio/trace-events          |   2 +
>>  include/hw/vfio/vfio-common.h |   4 +
>>  3 files changed, 150 insertions(+), 3 deletions(-)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 655e8dbb74d4..17971e6dbaeb 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -44,6 +44,7 @@
>>  #include "migration/blocker.h"
>>  #include "migration/qemu-file.h"
>>  #include "sysemu/tpm.h"
>> +#include "qemu/iova-tree.h"
>>
>>  VFIOGroupList vfio_group_list =
>>      QLIST_HEAD_INITIALIZER(vfio_group_list);
>> @@ -426,6 +427,11 @@ void vfio_unblock_multiple_devices_migration(void)
>>      multiple_devices_migration_blocker = NULL;
>>  }
>>
>> +static bool vfio_have_giommu(VFIOContainer *container)
>> +{
>> +    return !QLIST_EMPTY(&container->giommu_list);
>> +}
> 
> I think it's the case, but can you confirm we build the giommu_list
> regardless of whether the vIOMMU is actually enabled?
> 
I think that is only non-empty when we have the first IOVA mappings e.g. on
IOMMU passthrough mode *I think* it's empty. Let me confirm.

Otherwise I'll have to find a TYPE_IOMMU_MEMORY_REGION object to determine if
the VM was configured with a vIOMMU or not. That is to create the LM blocker.

>> +
>>  static void vfio_set_migration_error(int err)
>>  {
>>      MigrationState *ms = migrate_get_current();
>> @@ -610,6 +616,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>>          .iova = iova,
>>          .size = size,
>>      };
>> +    int ret;
>>
>>      if (!readonly) {
>>          map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
>> @@ -626,8 +633,10 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>>          return 0;
>>      }
>>
>> +    ret = -errno;
>>      error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
>> -    return -errno;
>> +
>> +    return ret;
>>  }
>>
>>  static void vfio_host_win_add(VFIOContainer *container,
>> @@ -1326,11 +1335,127 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
>>      return ret;
>>  }
>>
>> +static bool vfio_get_section_iova_range(VFIOContainer *container,
>> +                                        MemoryRegionSection *section,
>> +                                        hwaddr *out_iova, hwaddr *out_end)
>> +{
>> +    Int128 llend, llsize;
>> +    hwaddr iova, end;
>> +
>> +    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
>> +    llend = int128_make64(section->offset_within_address_space);
>> +    llend = int128_add(llend, section->size);
>> +    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
>> +
>> +    if (int128_ge(int128_make64(iova), llend)) {
>> +        return false;
>> +    }
>> +    end = int128_get64(int128_sub(llend, int128_one()));
>> +
>> +    if (memory_region_is_iommu(section->mr) ||
> 
> Shouldn't there already be a migration blocker in place preventing this
> from being possible?
> 
Yes. As per my previous comment, I am still working it out.

>> +        memory_region_has_ram_discard_manager(section->mr)) {
> 
> Are we claiming not to support virtio-mem VMs as well? 

That was not the intention. From the explanation below, I likely misunderstood
the handling of unpopulated parts and included that check.

> The current
> comment in vfio/common.c that states we only want to map actually
> populated parts seems like it doesn't apply here, we'd want dirty
> tracking ranges to include these regardless.  Unless there's some
> reason virtio-mem changes are blocked during pre-copy.
> 
As far as I am aware, virtio-mem is deemed busy when !migration_is_idle(),
hence plug and unplug requests are blocked throughout migration (device
add/del is also blocked during migration, so hotplug of memory/CPUs/devices is
blocked as well).

Anyway, your point still holds; I'll drop the check.

>> +	return false;
>> +    }
>> +
>> +    llsize = int128_sub(llend, int128_make64(iova));
>> +
>> +    if (memory_region_is_ram_device(section->mr)) {
>> +        VFIOHostDMAWindow *hostwin;
>> +        hwaddr pgmask;
>> +
>> +        hostwin = vfio_find_hostwin(container, iova, end);
>> +        if (!hostwin) {
>> +            return false;
>> +        }
>> +
>> +	pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
>> +        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
>> +            return false;
>> +        }
>> +    }
> 
> ram_device is intended to be an address range on another device, so do
> we really need it in DMA dirty tracking?

I don't think so.

> ex. we don't include device
> BARs in the dirty bitmap, we expect modified device state to be
> reported by the device, so it seems like there's no case where we'd
> include this in the device dirty tracking ranges.
> 
/me nods

>> +
>> +    *out_iova = iova;
>> +    *out_end = int128_get64(llend);
>> +    return true;
>> +}
>> +
>> +static void vfio_migration_add_mapping(MemoryListener *listener,
>> +                                       MemoryRegionSection *section)
>> +{
>> +    VFIOContainer *container = container_of(listener, VFIOContainer, mappings_listener);
>> +    hwaddr end = 0;
>> +    DMAMap map;
>> +    int ret;
>> +
>> +    if (vfio_have_giommu(container)) {
>> +        vfio_set_migration_error(-EOPNOTSUPP);
> 
> There should be a migration blocker that prevents this from ever being
> called in this case.
> 
Correct.

>> +        return;
>> +    }
>> +
>> +    if (!vfio_listener_valid_section(section) ||
>> +        !vfio_get_section_iova_range(container, section, &map.iova, &end)) {
>> +        return;
>> +    }
>> +
>> +    map.size = end - map.iova - 1; // IOVATree is inclusive, so subtract 1 from size
>> +    map.perm = section->readonly ? IOMMU_RO : IOMMU_RW;
>> +
>> +    WITH_QEMU_LOCK_GUARD(&container->mappings_mutex) {
>> +        ret = iova_tree_insert(container->mappings, &map);
>> +        if (ret) {
>> +            if (ret == IOVA_ERR_INVALID) {
>> +                ret = -EINVAL;
>> +            } else if (ret == IOVA_ERR_OVERLAP) {
>> +                ret = -EEXIST;
>> +            }
>> +        }
>> +    }
>> +
>> +    trace_vfio_migration_mapping_add(map.iova, map.iova + map.size, ret);
>> +
>> +    if (ret)
>> +        vfio_set_migration_error(ret);
>> +    return;
>> +}
>> +
>> +static void vfio_migration_remove_mapping(MemoryListener *listener,
>> +                                          MemoryRegionSection *section)
>> +{
>> +    VFIOContainer *container = container_of(listener, VFIOContainer, mappings_listener);
>> +    hwaddr end = 0;
>> +    DMAMap map;
>> +
>> +    if (vfio_have_giommu(container)) {
>> +        vfio_set_migration_error(-EOPNOTSUPP);
>> +        return;
>> +    }
>> +
>> +    if (!vfio_listener_valid_section(section) ||
>> +        !vfio_get_section_iova_range(container, section, &map.iova, &end)) {
>> +        return;
>> +    }
>> +
>> +    WITH_QEMU_LOCK_GUARD(&container->mappings_mutex) {
>> +        iova_tree_remove(container->mappings, map);
>> +    }
>> +
>> +    trace_vfio_migration_mapping_del(map.iova, map.iova + map.size);
>> +}
> 
> Why do we need a region_del callback?  We don't support modifying the
> dirty tracking ranges we've provided to the device.
> 

My intention with a region_del callback was the simple case where migration
fails or is cancelled and you want to try again later, which could mean a
different min/max on each retry depending on the setup. Given that most
operations that change the address space are blocked during migration, this
would work.

In the range alternative I was only clearing the min/max to zeroes.

>> +
>> +
>> +static const MemoryListener vfio_dirty_tracking_listener = {
>> +    .name = "vfio-migration",
>> +    .region_add = vfio_migration_add_mapping,
>> +    .region_del = vfio_migration_remove_mapping,
>> +};
>> +
>>  static void vfio_listener_log_global_start(MemoryListener *listener)
>>  {
>>      VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>>      int ret;
>>
>> +    memory_listener_register(&container->mappings_listener, container->space->as);
>> +
>>      ret = vfio_set_dirty_page_tracking(container, true);
>>      if (ret) {
>>          vfio_set_migration_error(ret);
>> @@ -1346,6 +1471,8 @@ static void vfio_listener_log_global_stop(MemoryListener *listener)
>>      if (ret) {
>>          vfio_set_migration_error(ret);
>>      }
>> +
>> +    memory_listener_unregister(&container->mappings_listener);
> 
> We don't have a way to update dirty tracking ranges for a device once
> dirty tracking is enabled, so what's the point of this listener running
> in more than a one-shot mode? 

See above.

> The only purpose of a listener
> continuing to run seems like it would be to generate an error for
> untracked ranges and either generate a migration error or mark them
> perpetually dirty.
> 
True for vIOMMU. But my intention was the "migration failure and later retry"
case. I am not sure we should outright delete what we requested tracking for.
But we certainly shouldn't be changing it, yes.

I can delete the region_del callback, and on migration (re)start clear/destroy
the mapping tracking info if it is already there.

>>  }
>>
>>  static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
>> @@ -2172,16 +2299,24 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
>>      QLIST_INIT(&container->giommu_list);
>>      QLIST_INIT(&container->hostwin_list);
>>      QLIST_INIT(&container->vrdl_list);
>> +    container->mappings = iova_tree_new();
>> +    if (!container->mappings) {
>> +        error_setg(errp, "Cannot allocate DMA mappings tree");
>> +        ret = -ENOMEM;
>> +        goto free_container_exit;
>> +    }
>> +    qemu_mutex_init(&container->mappings_mutex);
>> +    container->mappings_listener = vfio_dirty_tracking_listener;
> 
> This all seems like code that would only be necessary before starting
> the listener.
> 
I can move it there.

>>
>>      ret = vfio_init_container(container, group->fd, errp);
>>      if (ret) {
>> -        goto free_container_exit;
>> +        goto destroy_mappings_exit;
>>      }
>>
>>      ret = vfio_ram_block_discard_disable(container, true);
>>      if (ret) {
>>          error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
>> -        goto free_container_exit;
>> +        goto destroy_mappings_exit;
>>      }
>>
>>      switch (container->iommu_type) {
>> @@ -2317,6 +2452,10 @@ listener_release_exit:
>>  enable_discards_exit:
>>      vfio_ram_block_discard_disable(container, false);
>>
>> +destroy_mappings_exit:
>> +    qemu_mutex_destroy(&container->mappings_mutex);
>> +    iova_tree_destroy(container->mappings);
>> +
>>  free_container_exit:
>>      g_free(container);
>>
>> @@ -2371,6 +2510,8 @@ static void vfio_disconnect_container(VFIOGroup *group)
>>          }
>>
>>          trace_vfio_disconnect_container(container->fd);
>> +        qemu_mutex_destroy(&container->mappings_mutex);
>> +        iova_tree_destroy(container->mappings);
> 
> The IOVATree should be destroyed as soon as we're done processing the
> result upon starting logging.  It serves no purpose to keep it around.
> 
OK

> Comparing with the follow-up, setting {min,max}_tracking_iova, many of
> the same comments apply.

/me nods

> Both of these are only preparing for the
> question of what do we actually do with this data.  In the IOVATree
> approach, we have more fine grained information, but we can also exceed
> what the device supports and we need to be able to handle that.  If our
> fallback is to simply identify the min and max based on the IOVATree,
> and we expect that to work better than the more granular approach, why
> not start with just min/max?  If we expect there's value to the more
> granular approach, then why not proceed to collapse the IOVATree until
> we find a set of ranges the device can support?  Thanks,
> 

Your proposed solution for the HT address space gap handles my bigger concern.
I think it makes more sense to adopt the simpler approach, especially as we
know it is also applicable to the vIOMMU case that a later series will handle,
and it is less fragile against the limited number of ranges in the uAPI. On
second thought, while a two-range watermark isn't faithful to the real guest
address space, it still lets the VFIO driver pick and split it to accommodate
its own requirements. The range approach also doesn't affect how we ask for
the logging reports, which still honor the real address space (hence no
difference in runtime bitmap sizes).

With the tree approach, I fear that collapsing the tree already adds too much
complexity just to accommodate the reduced number of ranges we can pass, on
top of the runtime overhead it implies (especially for virtio-mem-like cases).
I am happy to resurrect that approach, should it be deemed required.

Unless there are objections, we should be able to send v3 of this series, with
all the comments above (and those already given in this series) addressed and
using the min/max range approach, *I hope* no later than tomorrow.

> Alex
> 
>>          close(container->fd);
>>          g_free(container);
>>
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index 669d9fe07cd9..c92eaadcc7c4 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -104,6 +104,8 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
>>  vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
>>  vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
>>  vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
>> +vfio_migration_mapping_add(uint64_t start, uint64_t end, int err) "mapping_add 0x%"PRIx64" - 0x%"PRIx64" err=%d"
>> +vfio_migration_mapping_del(uint64_t start, uint64_t end) "mapping_del 0x%"PRIx64" - 0x%"PRIx64
>>  vfio_disconnect_container(int fd) "close container->fd=%d"
>>  vfio_put_group(int fd) "close group->fd=%d"
>>  vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 87524c64a443..48951da11ab4 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -23,6 +23,7 @@
>>
>>  #include "exec/memory.h"
>>  #include "qemu/queue.h"
>> +#include "qemu/iova-tree.h"
>>  #include "qemu/notify.h"
>>  #include "ui/console.h"
>>  #include "hw/display/ramfb.h"
>> @@ -81,6 +82,7 @@ typedef struct VFIOContainer {
>>      int fd; /* /dev/vfio/vfio, empowered by the attached groups */
>>      MemoryListener listener;
>>      MemoryListener prereg_listener;
>> +    MemoryListener mappings_listener;
>>      unsigned iommu_type;
>>      Error *error;
>>      bool initialized;
>> @@ -89,6 +91,8 @@ typedef struct VFIOContainer {
>>      uint64_t max_dirty_bitmap_size;
>>      unsigned long pgsizes;
>>      unsigned int dma_max_mappings;
>> +    IOVATree *mappings;
>> +    QemuMutex mappings_mutex;
>>      QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>>      QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
>>      QLIST_HEAD(, VFIOGroup) group_list;
>> --
>> 2.17.2
>>
>
Joao Martins March 3, 2023, 4:58 p.m. UTC | #13
On 03/03/2023 00:19, Joao Martins wrote:
> On 02/03/2023 18:42, Alex Williamson wrote:
>> On Thu, 2 Mar 2023 00:07:35 +0000
>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>> @@ -426,6 +427,11 @@ void vfio_unblock_multiple_devices_migration(void)
>>>      multiple_devices_migration_blocker = NULL;
>>>  }
>>>
>>> +static bool vfio_have_giommu(VFIOContainer *container)
>>> +{
>>> +    return !QLIST_EMPTY(&container->giommu_list);
>>> +}
>>
>> I think it's the case, but can you confirm we build the giommu_list
>> regardless of whether the vIOMMU is actually enabled?
>>
> I think that is only non-empty when we have the first IOVA mappings e.g. on
> IOMMU passthrough mode *I think* it's empty. Let me confirm.
> 
Yeap, it's empty.

> Otherwise I'll have to find a TYPE_IOMMU_MEMORY_REGION object to determine if
> the VM was configured with a vIOMMU or not. That is to create the LM blocker.
> 
I am trying it this way, with something like the below, but neither
x86_iommu_get_default() nor the snippet below is really working out yet. I'm a
little afraid of having to add the live migration blocker in each
machine_init_done hook, unless there's a more obvious way. vfio_realize should
run at a much later stage, so I am surprised that an IOMMU object doesn't
exist at that time.

@@ -416,9 +421,26 @@ void vfio_unblock_multiple_devices_migration(void)
     multiple_devices_migration_blocker = NULL;
 }

-static bool vfio_have_giommu(VFIOContainer *container)
+int vfio_block_giommu_migration(Error **errp)
 {
-    return !QLIST_EMPTY(&container->giommu_list);
+    int ret;
+
+    if (!object_resolve_path_type("", TYPE_INTEL_IOMMU_DEVICE, NULL) &&
+        !object_resolve_path_type("", TYPE_AMD_IOMMU_DEVICE, NULL) &&
+        !object_resolve_path_type("", TYPE_ARM_SMMU, NULL) &&
+        !object_resolve_path_type("", TYPE_VIRTIO_IOMMU, NULL)) {
+        return 0;
+    }
+
+    error_setg(&giommu_migration_blocker,
+               "Migration is currently not supported with vIOMMU enabled");
+    ret = migrate_add_blocker(giommu_migration_blocker, errp);
+    if (ret < 0) {
+        error_free(giommu_migration_blocker);
+        giommu_migration_blocker = NULL;
+    }
+
+    return ret;
 }

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 8981ae71a6f8..127a44ccaf19 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -649,6 +649,11 @@ int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
         return ret;
     }

+    ret = vfio_block_giommu_migration(errp);
+    if (ret) {
+        return ret;
+    }
+
     trace_vfio_migration_probe(vbasedev->name);
     return 0;
Alex Williamson March 3, 2023, 5:05 p.m. UTC | #14
On Fri, 3 Mar 2023 16:58:55 +0000
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 03/03/2023 00:19, Joao Martins wrote:
> > On 02/03/2023 18:42, Alex Williamson wrote:  
> >> On Thu, 2 Mar 2023 00:07:35 +0000
> >> Joao Martins <joao.m.martins@oracle.com> wrote:  
> >>> @@ -426,6 +427,11 @@ void vfio_unblock_multiple_devices_migration(void)
> >>>      multiple_devices_migration_blocker = NULL;
> >>>  }
> >>>
> >>> +static bool vfio_have_giommu(VFIOContainer *container)
> >>> +{
> >>> +    return !QLIST_EMPTY(&container->giommu_list);
> >>> +}  
> >>
> >> I think it's the case, but can you confirm we build the giommu_list
> >> regardless of whether the vIOMMU is actually enabled?
> >>  
> > I think that is only non-empty when we have the first IOVA mappings e.g. on
> > IOMMU passthrough mode *I think* it's empty. Let me confirm.
> >   
> Yeap, it's empty.
> 
> > Otherwise I'll have to find a TYPE_IOMMU_MEMORY_REGION object to determine if
> > the VM was configured with a vIOMMU or not. That is to create the LM blocker.
> >   
> I am trying this way, with something like this, but neither
> x86_iommu_get_default() nor below is really working out yet. A little afraid of
> having to add the live migration blocker on each machine_init_done hook, unless
> t here's a more obvious way. vfio_realize should be at a much later stage, so I
> am surprised how an IOMMU object doesn't exist at that time.

Can we just test whether the container address space is system_memory?
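
e.g. (rough sketch, untested; the helper name is made up):

/*
 * A vIOMMU, even when not yet enabled by the guest, puts the device into
 * its own address space, so anything other than the system address space
 * implies a vIOMMU is configured.
 */
static bool vfio_viommu_preset(VFIOContainer *container)
{
    return container->space->as != &address_space_memory;
}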
Thanks,

Alex
Joao Martins March 3, 2023, 7:14 p.m. UTC | #15
On 03/03/2023 17:05, Alex Williamson wrote:
> On Fri, 3 Mar 2023 16:58:55 +0000
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> On 03/03/2023 00:19, Joao Martins wrote:
>>> On 02/03/2023 18:42, Alex Williamson wrote:  
>>>> On Thu, 2 Mar 2023 00:07:35 +0000
>>>> Joao Martins <joao.m.martins@oracle.com> wrote:  
>>>>> @@ -426,6 +427,11 @@ void vfio_unblock_multiple_devices_migration(void)
>>>>>      multiple_devices_migration_blocker = NULL;
>>>>>  }
>>>>>
>>>>> +static bool vfio_have_giommu(VFIOContainer *container)
>>>>> +{
>>>>> +    return !QLIST_EMPTY(&container->giommu_list);
>>>>> +}  
>>>>
>>>> I think it's the case, but can you confirm we build the giommu_list
>>>> regardless of whether the vIOMMU is actually enabled?
>>>>  
>>> I think that is only non-empty when we have the first IOVA mappings e.g. on
>>> IOMMU passthrough mode *I think* it's empty. Let me confirm.
>>>   
>> Yeap, it's empty.
>>
>>> Otherwise I'll have to find a TYPE_IOMMU_MEMORY_REGION object to determine if
>>> the VM was configured with a vIOMMU or not. That is to create the LM blocker.
>>>   
>> I am trying this way, with something like this, but neither
>> x86_iommu_get_default() nor below is really working out yet. A little afraid of
>> having to add the live migration blocker on each machine_init_done hook, unless
>> t here's a more obvious way. vfio_realize should be at a much later stage, so I
>> am surprised how an IOMMU object doesn't exist at that time.
> 
> Can we just test whether the container address space is system_memory?

IIUC, it doesn't work (see below snippet).

The problem is that you start as a regular VFIO guest, and when the guest boots
is when new mappings get established/invalidated and propagated into listeners
(vfio_listener_region_add) and they morph into having a giommu. And that's when
you can figure out in higher layers that 'you have a vIOMMU' as that's when the
address space gets changed? That is without being specific to a particular IOMMU
model. Maybe region_add is where to add it, but then it depends on the guest.

I was going to attempt at vtd_machine_done_notify_one() ?

@@ -416,9 +416,25 @@ void vfio_unblock_multiple_devices_migration(void)
     multiple_devices_migration_blocker = NULL;
 }

-static bool vfio_have_giommu(VFIOContainer *container)
+static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as);
+
+int vfio_block_giommu_migration(Error **errp)
 {
-    return !QLIST_EMPTY(&container->giommu_list);
+    int ret;
+
+    if (vfio_get_address_space(&address_space_memory)) {
+        return 0;
+    }
+
+    error_setg(&giommu_migration_blocker,
+               "Migration is currently not supported with vIOMMU enabled");
+    ret = migrate_add_blocker(giommu_migration_blocker, errp);
+    if (ret < 0) {
+        error_free(giommu_migration_blocker);
+        giommu_migration_blocker = NULL;
+    }
+
+    return ret;
 }
Alex Williamson March 3, 2023, 7:40 p.m. UTC | #16
On Fri, 3 Mar 2023 19:14:50 +0000
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 03/03/2023 17:05, Alex Williamson wrote:
> > On Fri, 3 Mar 2023 16:58:55 +0000
> > Joao Martins <joao.m.martins@oracle.com> wrote:
> >   
> >> On 03/03/2023 00:19, Joao Martins wrote:  
> >>> On 02/03/2023 18:42, Alex Williamson wrote:    
> >>>> On Thu, 2 Mar 2023 00:07:35 +0000
> >>>> Joao Martins <joao.m.martins@oracle.com> wrote:    
> >>>>> @@ -426,6 +427,11 @@ void vfio_unblock_multiple_devices_migration(void)
> >>>>>      multiple_devices_migration_blocker = NULL;
> >>>>>  }
> >>>>>
> >>>>> +static bool vfio_have_giommu(VFIOContainer *container)
> >>>>> +{
> >>>>> +    return !QLIST_EMPTY(&container->giommu_list);
> >>>>> +}    
> >>>>
> >>>> I think it's the case, but can you confirm we build the giommu_list
> >>>> regardless of whether the vIOMMU is actually enabled?
> >>>>    
> >>> I think that is only non-empty when we have the first IOVA mappings e.g. on
> >>> IOMMU passthrough mode *I think* it's empty. Let me confirm.
> >>>     
> >> Yeap, it's empty.
> >>  
> >>> Otherwise I'll have to find a TYPE_IOMMU_MEMORY_REGION object to determine if
> >>> the VM was configured with a vIOMMU or not. That is to create the LM blocker.
> >>>     
> >> I am trying this way, with something like this, but neither
> >> x86_iommu_get_default() nor below is really working out yet. A little afraid of
> >> having to add the live migration blocker on each machine_init_done hook, unless
> >> t here's a more obvious way. vfio_realize should be at a much later stage, so I
> >> am surprised how an IOMMU object doesn't exist at that time.  
> > 
> > Can we just test whether the container address space is system_memory?  
> 
> IIUC, it doesn't work (see below snippet).
> 
> The problem is that you start as a regular VFIO guest, and when the guest boot
> is when new mappings get established/invalidated and propagated into listeners
> (vfio_listener_region_add) and they morph into having a giommu. And that's when
> you can figure out in higher layers that 'you have a vIOMMU' as that's when the
> address space gets changed? That is without being specific to a particular IOMMU
> model. Maybe region_add is where to add, but then it then depends on the guest.

This doesn't seem right to me, look for instance at
pci_device_iommu_address_space() which returns address_space_memory
when there is no vIOMMU.  If devices share an address space, they can
share a container.  When a vIOMMU is present (not even enabled), each
device gets its own container due to the fact that it's in its own
address space (modulo devices within the same address space due to
aliasing).  Thanks,

Alex
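
A minimal sketch of the check this suggests, assuming only the existing
pci_device_iommu_address_space() helper; the vfio_device_has_viommu() name is
made up here (the next message implements essentially the same check as
vfio_has_iommu()):

/* Sketch only: anything other than plain system memory implies a vIOMMU. */
static bool vfio_device_has_viommu(PCIDevice *pdev)
{
    return pci_device_iommu_address_space(pdev) != &address_space_memory;
}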
Joao Martins March 3, 2023, 8:16 p.m. UTC | #17
On 03/03/2023 19:40, Alex Williamson wrote:
> On Fri, 3 Mar 2023 19:14:50 +0000
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> On 03/03/2023 17:05, Alex Williamson wrote:
>>> On Fri, 3 Mar 2023 16:58:55 +0000
>>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>>   
>>>> On 03/03/2023 00:19, Joao Martins wrote:  
>>>>> On 02/03/2023 18:42, Alex Williamson wrote:    
>>>>>> On Thu, 2 Mar 2023 00:07:35 +0000
>>>>>> Joao Martins <joao.m.martins@oracle.com> wrote:    
>>>>>>> @@ -426,6 +427,11 @@ void vfio_unblock_multiple_devices_migration(void)
>>>>>>>      multiple_devices_migration_blocker = NULL;
>>>>>>>  }
>>>>>>>
>>>>>>> +static bool vfio_have_giommu(VFIOContainer *container)
>>>>>>> +{
>>>>>>> +    return !QLIST_EMPTY(&container->giommu_list);
>>>>>>> +}    
>>>>>>
>>>>>> I think it's the case, but can you confirm we build the giommu_list
>>>>>> regardless of whether the vIOMMU is actually enabled?
>>>>>>    
>>>>> I think that is only non-empty when we have the first IOVA mappings e.g. on
>>>>> IOMMU passthrough mode *I think* it's empty. Let me confirm.
>>>>>     
>>>> Yeap, it's empty.
>>>>  
>>>>> Otherwise I'll have to find a TYPE_IOMMU_MEMORY_REGION object to determine if
>>>>> the VM was configured with a vIOMMU or not. That is to create the LM blocker.
>>>>>     
>>>> I am trying this way, with something like this, but neither
>>>> x86_iommu_get_default() nor below is really working out yet. A little afraid of
>>>> having to add the live migration blocker on each machine_init_done hook, unless
>>>> t here's a more obvious way. vfio_realize should be at a much later stage, so I
>>>> am surprised how an IOMMU object doesn't exist at that time.  
>>>
>>> Can we just test whether the container address space is system_memory?  
>>
>> IIUC, it doesn't work (see below snippet).
>>
>> The problem is that you start as a regular VFIO guest, and when the guest boot
>> is when new mappings get established/invalidated and propagated into listeners
>> (vfio_listener_region_add) and they morph into having a giommu. And that's when
>> you can figure out in higher layers that 'you have a vIOMMU' as that's when the
>> address space gets changed? That is without being specific to a particular IOMMU
>> model. Maybe region_add is where to add, but then it then depends on the guest.
> 
> This doesn't seem right to me, look for instance at
> pci_device_iommu_address_space() which returns address_space_memory
> when there is no vIOMMU.  If devices share an address space, they can
> share a container.  When a vIOMMU is present (not even enabled), each
> device gets it's own container due to the fact that it's in its own
> address space (modulo devices within the same address space due to
> aliasing).

You're obviously right; I was reading this whole thing wrong. This works as far
as I tested with an iommu=pt guest (and without a vIOMMU).

I am gonna shape this up, and hopefully submit v3 overnight.

@@ -416,9 +416,26 @@ void vfio_unblock_multiple_devices_migration(void)
     multiple_devices_migration_blocker = NULL;
 }

-static bool vfio_have_giommu(VFIOContainer *container)
+static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as);
+
+int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp)
 {
-    return !QLIST_EMPTY(&container->giommu_list);
+    int ret;
+
+    if (vbasedev->type == VFIO_DEVICE_TYPE_PCI &&
+       !vfio_has_iommu(vbasedev)) {
+       return 0;
+    }
+
+    error_setg(&giommu_migration_blocker,
+               "Migration is currently not supported with vIOMMU enabled");
+    ret = migrate_add_blocker(giommu_migration_blocker, errp);
+    if (ret < 0) {
+        error_free(giommu_migration_blocker);
+        giommu_migration_blocker = NULL;
+    }
+
+    return ret;
 }
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 939dcc3d4a9e..f4cf0b41a157 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2843,6 +2843,15 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
     vdev->req_enabled = false;
 }

+bool vfio_has_iommu(VFIODevice *vbasedev)
+{
+    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+    PCIDevice *pdev = &vdev->pdev;
+    AddressSpace *as = &address_space_memory;
+
+    return !(pci_device_iommu_address_space(pdev) == as);
+}
+
Alex Williamson March 3, 2023, 11:47 p.m. UTC | #18
On Fri, 3 Mar 2023 20:16:19 +0000
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 03/03/2023 19:40, Alex Williamson wrote:
> > On Fri, 3 Mar 2023 19:14:50 +0000
> > Joao Martins <joao.m.martins@oracle.com> wrote:
> >   
> >> On 03/03/2023 17:05, Alex Williamson wrote:  
> >>> On Fri, 3 Mar 2023 16:58:55 +0000
> >>> Joao Martins <joao.m.martins@oracle.com> wrote:
> >>>     
> >>>> On 03/03/2023 00:19, Joao Martins wrote:    
> >>>>> On 02/03/2023 18:42, Alex Williamson wrote:      
> >>>>>> On Thu, 2 Mar 2023 00:07:35 +0000
> >>>>>> Joao Martins <joao.m.martins@oracle.com> wrote:      
> >>>>>>> @@ -426,6 +427,11 @@ void vfio_unblock_multiple_devices_migration(void)
> >>>>>>>      multiple_devices_migration_blocker = NULL;
> >>>>>>>  }
> >>>>>>>
> >>>>>>> +static bool vfio_have_giommu(VFIOContainer *container)
> >>>>>>> +{
> >>>>>>> +    return !QLIST_EMPTY(&container->giommu_list);
> >>>>>>> +}      
> >>>>>>
> >>>>>> I think it's the case, but can you confirm we build the giommu_list
> >>>>>> regardless of whether the vIOMMU is actually enabled?
> >>>>>>      
> >>>>> I think that is only non-empty when we have the first IOVA mappings e.g. on
> >>>>> IOMMU passthrough mode *I think* it's empty. Let me confirm.
> >>>>>       
> >>>> Yeap, it's empty.
> >>>>    
> >>>>> Otherwise I'll have to find a TYPE_IOMMU_MEMORY_REGION object to determine if
> >>>>> the VM was configured with a vIOMMU or not. That is to create the LM blocker.
> >>>>>       
> >>>> I am trying this way, with something like this, but neither
> >>>> x86_iommu_get_default() nor below is really working out yet. A little afraid of
> >>>> having to add the live migration blocker on each machine_init_done hook, unless
> >>>> t here's a more obvious way. vfio_realize should be at a much later stage, so I
> >>>> am surprised how an IOMMU object doesn't exist at that time.    
> >>>
> >>> Can we just test whether the container address space is system_memory?    
> >>
> >> IIUC, it doesn't work (see below snippet).
> >>
> >> The problem is that you start as a regular VFIO guest, and when the guest boot
> >> is when new mappings get established/invalidated and propagated into listeners
> >> (vfio_listener_region_add) and they morph into having a giommu. And that's when
> >> you can figure out in higher layers that 'you have a vIOMMU' as that's when the
> >> address space gets changed? That is without being specific to a particular IOMMU
> >> model. Maybe region_add is where to add, but then it then depends on the guest.  
> > 
> > This doesn't seem right to me, look for instance at
> > pci_device_iommu_address_space() which returns address_space_memory
> > when there is no vIOMMU.  If devices share an address space, they can
> > share a container.  When a vIOMMU is present (not even enabled), each
> > device gets it's own container due to the fact that it's in its own
> > address space (modulo devices within the same address space due to
> > aliasing).  
> 
> You're obviously right, I was reading this whole thing wrong. This works as far
> as I tested with an iommu=pt guest (and without an vIOMMU).
> 
> I am gonna shape this up, and hopefully submit v3 during over night.
> 
> @@ -416,9 +416,26 @@ void vfio_unblock_multiple_devices_migration(void)
>      multiple_devices_migration_blocker = NULL;
>  }
> 
> -static bool vfio_have_giommu(VFIOContainer *container)
> +static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as);
> +
> +int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp)
>  {
> -    return !QLIST_EMPTY(&container->giommu_list);
> +    int ret;
> +
> +    if (vbasedev->type == VFIO_DEVICE_TYPE_PCI &&
> +       !vfio_has_iommu(vbasedev)) {
> +       return 0;
> +    }
> +
> +    error_setg(&giommu_migration_blocker,
> +               "Migration is currently not supported with vIOMMU enabled");
> +    ret = migrate_add_blocker(giommu_migration_blocker, errp);
> +    if (ret < 0) {
> +        error_free(giommu_migration_blocker);
> +        giommu_migration_blocker = NULL;
> +    }
> +
> +    return ret;
>  }
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 939dcc3d4a9e..f4cf0b41a157 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2843,6 +2843,15 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>      vdev->req_enabled = false;
>  }
> 
> +bool vfio_has_iommu(VFIODevice *vbasedev)
> +{
> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +    PCIDevice *pdev = &vdev->pdev;
> +    AddressSpace *as = &address_space_memory;
> +
> +    return !(pci_device_iommu_address_space(pdev) == as);
> +}


Shouldn't this be something non-PCI specific like:

    return vbasedev->group->container->space != &address_space_memory;

Thanks,
Alex
Joao Martins March 3, 2023, 11:57 p.m. UTC | #19
On 03/03/2023 23:47, Alex Williamson wrote:
> On Fri, 3 Mar 2023 20:16:19 +0000
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> On 03/03/2023 19:40, Alex Williamson wrote:
>>> On Fri, 3 Mar 2023 19:14:50 +0000
>>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>>   
>>>> On 03/03/2023 17:05, Alex Williamson wrote:  
>>>>> On Fri, 3 Mar 2023 16:58:55 +0000
>>>>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>>     
>>>>>> On 03/03/2023 00:19, Joao Martins wrote:    
>>>>>>> On 02/03/2023 18:42, Alex Williamson wrote:      
>>>>>>>> On Thu, 2 Mar 2023 00:07:35 +0000
>>>>>>>> Joao Martins <joao.m.martins@oracle.com> wrote:      
>>>>>>>>> @@ -426,6 +427,11 @@ void vfio_unblock_multiple_devices_migration(void)
>>>>>>>>>      multiple_devices_migration_blocker = NULL;
>>>>>>>>>  }
>>>>>>>>>
>>>>>>>>> +static bool vfio_have_giommu(VFIOContainer *container)
>>>>>>>>> +{
>>>>>>>>> +    return !QLIST_EMPTY(&container->giommu_list);
>>>>>>>>> +}      
>>>>>>>>
>>>>>>>> I think it's the case, but can you confirm we build the giommu_list
>>>>>>>> regardless of whether the vIOMMU is actually enabled?
>>>>>>>>      
>>>>>>> I think that is only non-empty when we have the first IOVA mappings e.g. on
>>>>>>> IOMMU passthrough mode *I think* it's empty. Let me confirm.
>>>>>>>       
>>>>>> Yeap, it's empty.
>>>>>>    
>>>>>>> Otherwise I'll have to find a TYPE_IOMMU_MEMORY_REGION object to determine if
>>>>>>> the VM was configured with a vIOMMU or not. That is to create the LM blocker.
>>>>>>>       
>>>>>> I am trying this way, with something like this, but neither
>>>>>> x86_iommu_get_default() nor below is really working out yet. A little afraid of
>>>>>> having to add the live migration blocker on each machine_init_done hook, unless
>>>>>> t here's a more obvious way. vfio_realize should be at a much later stage, so I
>>>>>> am surprised how an IOMMU object doesn't exist at that time.    
>>>>>
>>>>> Can we just test whether the container address space is system_memory?    
>>>>
>>>> IIUC, it doesn't work (see below snippet).
>>>>
>>>> The problem is that you start as a regular VFIO guest, and when the guest boot
>>>> is when new mappings get established/invalidated and propagated into listeners
>>>> (vfio_listener_region_add) and they morph into having a giommu. And that's when
>>>> you can figure out in higher layers that 'you have a vIOMMU' as that's when the
>>>> address space gets changed? That is without being specific to a particular IOMMU
>>>> model. Maybe region_add is where to add, but then it then depends on the guest.  
>>>
>>> This doesn't seem right to me, look for instance at
>>> pci_device_iommu_address_space() which returns address_space_memory
>>> when there is no vIOMMU.  If devices share an address space, they can
>>> share a container.  When a vIOMMU is present (not even enabled), each
>>> device gets it's own container due to the fact that it's in its own
>>> address space (modulo devices within the same address space due to
>>> aliasing).  
>>
>> You're obviously right, I was reading this whole thing wrong. This works as far
>> as I tested with an iommu=pt guest (and without an vIOMMU).
>>
>> I am gonna shape this up, and hopefully submit v3 during over night.
>>
>> @@ -416,9 +416,26 @@ void vfio_unblock_multiple_devices_migration(void)
>>      multiple_devices_migration_blocker = NULL;
>>  }
>>
>> -static bool vfio_have_giommu(VFIOContainer *container)
>> +static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as);
>> +
>> +int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp)
>>  {
>> -    return !QLIST_EMPTY(&container->giommu_list);
>> +    int ret;
>> +
>> +    if (vbasedev->type == VFIO_DEVICE_TYPE_PCI &&
>> +       !vfio_has_iommu(vbasedev)) {
>> +       return 0;
>> +    }
>> +
>> +    error_setg(&giommu_migration_blocker,
>> +               "Migration is currently not supported with vIOMMU enabled");
>> +    ret = migrate_add_blocker(giommu_migration_blocker, errp);
>> +    if (ret < 0) {
>> +        error_free(giommu_migration_blocker);
>> +        giommu_migration_blocker = NULL;
>> +    }
>> +
>> +    return ret;
>>  }
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 939dcc3d4a9e..f4cf0b41a157 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -2843,6 +2843,15 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>      vdev->req_enabled = false;
>>  }
>>
>> +bool vfio_has_iommu(VFIODevice *vbasedev)
>> +{
>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>> +    PCIDevice *pdev = &vdev->pdev;
>> +    AddressSpace *as = &address_space_memory;
>> +
>> +    return !(pci_device_iommu_address_space(pdev) == as);
>> +}
> 
> 
> Shouldn't this be something non-PCI specific like:
> 
>     return vbasedev->group->container->space != &address_space_memory;
> 

Yes, much better. I've applied the following (partial diff below):

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 6cd0100bbe09..60af3c3018dc 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -421,8 +421,7 @@ int vfio_block_giommu_migration(VFIODevice *vbasedev, Error
**errp)
 {
     int ret;

-    if (vbasedev->type == VFIO_DEVICE_TYPE_PCI &&
-       !vfio_has_iommu(vbasedev)) {
+    if (vbasedev->group->container->space->as == &address_space_memory) {
        return 0;
     }
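
For reference, the structure relationship behind the ->space->as dereference
above, abridged from include/hw/vfio/vfio-common.h (only the relevant fields
are shown):

typedef struct VFIOAddressSpace {
    AddressSpace *as;
    QLIST_HEAD(, VFIOContainer) containers;
    QLIST_ENTRY(VFIOAddressSpace) list;
} VFIOAddressSpace;

typedef struct VFIOContainer {
    VFIOAddressSpace *space;    /* a VFIO wrapper, not an AddressSpace */
    /* ... */
} VFIOContainer;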
Joao Martins March 4, 2023, 12:21 a.m. UTC | #20
On 03/03/2023 23:57, Joao Martins wrote:
> On 03/03/2023 23:47, Alex Williamson wrote:
>> On Fri, 3 Mar 2023 20:16:19 +0000
>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>
>>> On 03/03/2023 19:40, Alex Williamson wrote:
>>>> On Fri, 3 Mar 2023 19:14:50 +0000
>>>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>   
>>>>> On 03/03/2023 17:05, Alex Williamson wrote:  
>>>>>> On Fri, 3 Mar 2023 16:58:55 +0000
>>>>>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>>>     
>>>>>>> On 03/03/2023 00:19, Joao Martins wrote:    
>>>>>>>> On 02/03/2023 18:42, Alex Williamson wrote:      
>>>>>>>>> On Thu, 2 Mar 2023 00:07:35 +0000
>>>>>>>>> Joao Martins <joao.m.martins@oracle.com> wrote:      
>>>>>>>>>> @@ -426,6 +427,11 @@ void vfio_unblock_multiple_devices_migration(void)
>>>>>>>>>>      multiple_devices_migration_blocker = NULL;
>>>>>>>>>>  }
>>>>>>>>>>
>>>>>>>>>> +static bool vfio_have_giommu(VFIOContainer *container)
>>>>>>>>>> +{
>>>>>>>>>> +    return !QLIST_EMPTY(&container->giommu_list);
>>>>>>>>>> +}      
>>>>>>>>>
>>>>>>>>> I think it's the case, but can you confirm we build the giommu_list
>>>>>>>>> regardless of whether the vIOMMU is actually enabled?
>>>>>>>>>      
>>>>>>>> I think that is only non-empty when we have the first IOVA mappings e.g. on
>>>>>>>> IOMMU passthrough mode *I think* it's empty. Let me confirm.
>>>>>>>>       
>>>>>>> Yeap, it's empty.
>>>>>>>    
>>>>>>>> Otherwise I'll have to find a TYPE_IOMMU_MEMORY_REGION object to determine if
>>>>>>>> the VM was configured with a vIOMMU or not. That is to create the LM blocker.
>>>>>>>>       
>>>>>>> I am trying this way, with something like this, but neither
>>>>>>> x86_iommu_get_default() nor below is really working out yet. A little afraid of
>>>>>>> having to add the live migration blocker on each machine_init_done hook, unless
>>>>>>> t here's a more obvious way. vfio_realize should be at a much later stage, so I
>>>>>>> am surprised how an IOMMU object doesn't exist at that time.    
>>>>>>
>>>>>> Can we just test whether the container address space is system_memory?    
>>>>>
>>>>> IIUC, it doesn't work (see below snippet).
>>>>>
>>>>> The problem is that you start as a regular VFIO guest, and when the guest boot
>>>>> is when new mappings get established/invalidated and propagated into listeners
>>>>> (vfio_listener_region_add) and they morph into having a giommu. And that's when
>>>>> you can figure out in higher layers that 'you have a vIOMMU' as that's when the
>>>>> address space gets changed? That is without being specific to a particular IOMMU
>>>>> model. Maybe region_add is where to add, but then it then depends on the guest.  
>>>>
>>>> This doesn't seem right to me, look for instance at
>>>> pci_device_iommu_address_space() which returns address_space_memory
>>>> when there is no vIOMMU.  If devices share an address space, they can
>>>> share a container.  When a vIOMMU is present (not even enabled), each
>>>> device gets it's own container due to the fact that it's in its own
>>>> address space (modulo devices within the same address space due to
>>>> aliasing).  
>>>
>>> You're obviously right, I was reading this whole thing wrong. This works as far
>>> as I tested with an iommu=pt guest (and without an vIOMMU).
>>>
>>> I am gonna shape this up, and hopefully submit v3 during over night.
>>>
>>> @@ -416,9 +416,26 @@ void vfio_unblock_multiple_devices_migration(void)
>>>      multiple_devices_migration_blocker = NULL;
>>>  }
>>>
>>> -static bool vfio_have_giommu(VFIOContainer *container)
>>> +static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as);
>>> +
>>> +int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp)
>>>  {
>>> -    return !QLIST_EMPTY(&container->giommu_list);
>>> +    int ret;
>>> +
>>> +    if (vbasedev->type == VFIO_DEVICE_TYPE_PCI &&
>>> +       !vfio_has_iommu(vbasedev)) {
>>> +       return 0;
>>> +    }
>>> +
>>> +    error_setg(&giommu_migration_blocker,
>>> +               "Migration is currently not supported with vIOMMU enabled");
>>> +    ret = migrate_add_blocker(giommu_migration_blocker, errp);
>>> +    if (ret < 0) {
>>> +        error_free(giommu_migration_blocker);
>>> +        giommu_migration_blocker = NULL;
>>> +    }
>>> +
>>> +    return ret;
>>>  }
>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>> index 939dcc3d4a9e..f4cf0b41a157 100644
>>> --- a/hw/vfio/pci.c
>>> +++ b/hw/vfio/pci.c
>>> @@ -2843,6 +2843,15 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>>      vdev->req_enabled = false;
>>>  }
>>>
>>> +bool vfio_has_iommu(VFIODevice *vbasedev)
>>> +{
>>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>>> +    PCIDevice *pdev = &vdev->pdev;
>>> +    AddressSpace *as = &address_space_memory;
>>> +
>>> +    return !(pci_device_iommu_address_space(pdev) == as);
>>> +}
>>
>>
>> Shouldn't this be something non-PCI specific like:
>>
>>     return vbasedev->group->container->space != &address_space_memory;
>>
> 
> Yes, much better, I've applied the following (partial diff below):
> 
I've also structured this similarly to the other blocker, wrt multiple vfio devices.

	Joao
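
A sketch of what 'structured similarly to the other blocker' implies: a paired
unblock helper mirroring vfio_unblock_multiple_devices_migration(). Names and
exact shape are guesses, not the v3 code:

/* Sketch only; mirrors the multiple-devices blocker teardown. */
void vfio_unblock_giommu_migration(void)
{
    if (!giommu_migration_blocker) {
        return;
    }

    migrate_del_blocker(giommu_migration_blocker);
    error_free(giommu_migration_blocker);
    giommu_migration_blocker = NULL;
}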
diff mbox series

Patch

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index ee55d442b4..6f36876ce0 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -23,6 +23,7 @@ 
 
 #include "exec/memory.h"
 #include "qemu/queue.h"
+#include "qemu/iova-tree.h"
 #include "qemu/notify.h"
 #include "ui/console.h"
 #include "hw/display/ramfb.h"
@@ -92,6 +93,8 @@  typedef struct VFIOContainer {
     uint64_t max_dirty_bitmap_size;
     unsigned long pgsizes;
     unsigned int dma_max_mappings;
+    IOVATree *mappings;
+    QemuMutex mappings_mutex;
     QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
     QLIST_HEAD(, VFIOGroup) group_list;
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 84f08bdbbb..6041da6c7e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -44,6 +44,7 @@ 
 #include "migration/blocker.h"
 #include "migration/qemu-file.h"
 #include "sysemu/tpm.h"
+#include "qemu/iova-tree.h"
 
 VFIOGroupList vfio_group_list =
     QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -426,6 +427,11 @@  void vfio_unblock_multiple_devices_migration(void)
     multiple_devices_migration_blocker = NULL;
 }
 
+static bool vfio_have_giommu(VFIOContainer *container)
+{
+    return !QLIST_EMPTY(&container->giommu_list);
+}
+
 static void vfio_set_migration_error(int err)
 {
     MigrationState *ms = migrate_get_current();
@@ -499,6 +505,51 @@  static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
     return true;
 }
 
+static int vfio_record_mapping(VFIOContainer *container, hwaddr iova,
+                               hwaddr size, bool readonly)
+{
+    DMAMap map = {
+        .iova = iova,
+        .size = size - 1, /* IOVATree is inclusive, so subtract 1 from size */
+        .perm = readonly ? IOMMU_RO : IOMMU_RW,
+    };
+    int ret;
+
+    if (vfio_have_giommu(container)) {
+        return 0;
+    }
+
+    WITH_QEMU_LOCK_GUARD(&container->mappings_mutex) {
+        ret = iova_tree_insert(container->mappings, &map);
+        if (ret) {
+            if (ret == IOVA_ERR_INVALID) {
+                ret = -EINVAL;
+            } else if (ret == IOVA_ERR_OVERLAP) {
+                ret = -EEXIST;
+            }
+        }
+    }
+
+    return ret;
+}
+
+static void vfio_erase_mapping(VFIOContainer *container, hwaddr iova,
+                                hwaddr size)
+{
+    DMAMap map = {
+        .iova = iova,
+        .size = size - 1, /* IOVATree is inclusive, so subtract 1 from size */
+    };
+
+    if (vfio_have_giommu(container)) {
+        return;
+    }
+
+    WITH_QEMU_LOCK_GUARD(&container->mappings_mutex) {
+        iova_tree_remove(container->mappings, map);
+    }
+}
+
 static int vfio_dma_unmap_bitmap(VFIOContainer *container,
                                  hwaddr iova, ram_addr_t size,
                                  IOMMUTLBEntry *iotlb)
@@ -599,6 +650,8 @@  static int vfio_dma_unmap(VFIOContainer *container,
                                             DIRTY_CLIENTS_NOCODE);
     }
 
+    vfio_erase_mapping(container, iova, size);
+
     return 0;
 }
 
@@ -612,6 +665,16 @@  static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
         .iova = iova,
         .size = size,
     };
+    int ret;
+
+    ret = vfio_record_mapping(container, iova, size, readonly);
+    if (ret) {
+        error_report("vfio: Failed to record mapping, iova: 0x%" HWADDR_PRIx
+                     ", size: 0x" RAM_ADDR_FMT ", ret: %d (%s)",
+                     iova, size, ret, strerror(-ret));
+
+        return ret;
+    }
 
     if (!readonly) {
         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
@@ -628,8 +691,12 @@  static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
         return 0;
     }
 
+    ret = -errno;
     error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
-    return -errno;
+
+    vfio_erase_mapping(container, iova, size);
+
+    return ret;
 }
 
 static void vfio_host_win_add(VFIOContainer *container,
@@ -2183,16 +2250,23 @@  static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
     QLIST_INIT(&container->giommu_list);
     QLIST_INIT(&container->hostwin_list);
     QLIST_INIT(&container->vrdl_list);
+    container->mappings = iova_tree_new();
+    if (!container->mappings) {
+        error_setg(errp, "Cannot allocate DMA mappings tree");
+        ret = -ENOMEM;
+        goto free_container_exit;
+    }
+    qemu_mutex_init(&container->mappings_mutex);
 
     ret = vfio_init_container(container, group->fd, errp);
     if (ret) {
-        goto free_container_exit;
+        goto destroy_mappings_exit;
     }
 
     ret = vfio_ram_block_discard_disable(container, true);
     if (ret) {
         error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
-        goto free_container_exit;
+        goto destroy_mappings_exit;
     }
 
     switch (container->iommu_type) {
@@ -2328,6 +2402,10 @@  listener_release_exit:
 enable_discards_exit:
     vfio_ram_block_discard_disable(container, false);
 
+destroy_mappings_exit:
+    qemu_mutex_destroy(&container->mappings_mutex);
+    iova_tree_destroy(container->mappings);
+
 free_container_exit:
     g_free(container);
 
@@ -2382,6 +2460,8 @@  static void vfio_disconnect_container(VFIOGroup *group)
         }
 
         trace_vfio_disconnect_container(container->fd);
+        qemu_mutex_destroy(&container->mappings_mutex);
+        iova_tree_destroy(container->mappings);
         close(container->fd);
         g_free(container);