diff mbox series

[v4,08/14] vfio/common: Record DMA mapped IOVA ranges

Message ID 20230307020258.58215-9-joao.m.martins@oracle.com (mailing list archive)
State New, archived
Headers show
Series vfio/migration: Device dirty page tracking | expand

Commit Message

Joao Martins March 7, 2023, 2:02 a.m. UTC
According to the device DMA logging uAPI, IOVA ranges to be logged by
the device must be provided all at once upon DMA logging start.

As preparation for the following patches which will add device dirty
page tracking, keep a record of all DMA mapped IOVA ranges so later they
can be used for DMA logging start.

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 hw/vfio/common.c              | 76 +++++++++++++++++++++++++++++++++++
 hw/vfio/trace-events          |  1 +
 include/hw/vfio/vfio-common.h | 13 ++++++
 3 files changed, 90 insertions(+)

Comments

Alex Williamson March 7, 2023, 2:57 a.m. UTC | #1
On Tue,  7 Mar 2023 02:02:52 +0000
Joao Martins <joao.m.martins@oracle.com> wrote:

> According to the device DMA logging uAPI, IOVA ranges to be logged by
> the device must be provided all at once upon DMA logging start.
> 
> As preparation for the following patches which will add device dirty
> page tracking, keep a record of all DMA mapped IOVA ranges so later they
> can be used for DMA logging start.
> 
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>  hw/vfio/common.c              | 76 +++++++++++++++++++++++++++++++++++
>  hw/vfio/trace-events          |  1 +
>  include/hw/vfio/vfio-common.h | 13 ++++++
>  3 files changed, 90 insertions(+)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 3a6491dbc523..a9b1fc999121 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -1334,11 +1334,87 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
>      return ret;
>  }
>  
> +static void vfio_dirty_tracking_update(MemoryListener *listener,
> +                                       MemoryRegionSection *section)
> +{
> +    VFIODirtyRanges *dirty = container_of(listener, VFIODirtyRanges, listener);
> +    VFIODirtyTrackingRange *range = &dirty->ranges;
> +    hwaddr max32 = UINT32_MAX - 1ULL;

The -1 is wrong here, UINT32_MAX is (2^32 - 1)

> +    hwaddr iova, end;
> +
> +    if (!vfio_listener_valid_section(section) ||
> +        !vfio_get_section_iova_range(dirty->container, section,
> +                                     &iova, &end, NULL)) {
> +        return;
> +    }
> +
> +    /*
> +     * The address space passed to the dirty tracker is reduced to two ranges:
> +     * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
> +     * The underlying reports of dirty will query a sub-interval of each of
> +     * these ranges.
> +     *
> +     * The purpose of the dual range handling is to handle known cases of big
> +     * holes in the address space, like the x86 AMD 1T hole. The alternative
> +     * would be an IOVATree but that has a much bigger runtime overhead and
> +     * unnecessary complexity.
> +     */
> +    if (iova < max32 && end <= max32) {

Nit, the first test is redundant, iova is necessarily less than end.

> +        if (range->min32 > iova) {
> +            range->min32 = iova;
> +        }
> +        if (range->max32 < end) {
> +            range->max32 = end;
> +        }
> +        trace_vfio_device_dirty_tracking_update(iova, end,
> +                                    range->min32, range->max32);
> +    } else {
> +        if (!range->min64 || range->min64 > iova) {

The first test should be removed, we're initializing min64 to a
non-zero value now, so if it's zero it's been set and we can't
de-prioritize that set value.

> +            range->min64 = iova;
> +        }
> +        if (range->max64 < end) {
> +            range->max64 = end;
> +        }
> +        trace_vfio_device_dirty_tracking_update(iova, end,
> +                                    range->min64, range->max64);
> +    }
> +
> +    return;
> +}
> +
> +static const MemoryListener vfio_dirty_tracking_listener = {
> +    .name = "vfio-tracking",
> +    .region_add = vfio_dirty_tracking_update,
> +};
> +
> +static void vfio_dirty_tracking_init(VFIOContainer *container,
> +                                     VFIODirtyRanges *dirty)
> +{
> +    memset(dirty, 0, sizeof(*dirty));
> +    dirty->ranges.min32 = UINT32_MAX;
> +    dirty->ranges.min64 = UINT64_MAX;
> +    dirty->listener = vfio_dirty_tracking_listener;
> +    dirty->container = container;
> +

I was actually thinking the caller would just pass
VFIODirtyTrackingRange and VFIODirtyRanges would be allocated on the
stack here, perhaps both are defined private to this file, but this
works and we can refine later if we so decide.  Thanks,

Alex


> +    memory_listener_register(&dirty->listener,
> +                             container->space->as);
> +
> +    /*
> +     * The memory listener is synchronous, and used to calculate the range
> +     * to dirty tracking. Unregister it after we are done as we are not
> +     * interested in any follow-up updates.
> +     */
> +    memory_listener_unregister(&dirty->listener);
> +}
> +
>  static void vfio_listener_log_global_start(MemoryListener *listener)
>  {
>      VFIOContainer *container = container_of(listener, VFIOContainer, listener);
> +    VFIODirtyRanges dirty;
>      int ret;
>  
> +    vfio_dirty_tracking_init(container, &dirty);
> +
>      ret = vfio_set_dirty_page_tracking(container, true);
>      if (ret) {
>          vfio_set_migration_error(ret);
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 669d9fe07cd9..d97a6de17921 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -104,6 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
>  vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
>  vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
>  vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
> +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
>  vfio_disconnect_container(int fd) "close container->fd=%d"
>  vfio_put_group(int fd) "close group->fd=%d"
>  vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 87524c64a443..0f84136cceb5 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -96,6 +96,19 @@ typedef struct VFIOContainer {
>      QLIST_ENTRY(VFIOContainer) next;
>  } VFIOContainer;
>  
> +typedef struct VFIODirtyTrackingRange {
> +    hwaddr min32;
> +    hwaddr max32;
> +    hwaddr min64;
> +    hwaddr max64;
> +} VFIODirtyTrackingRange;
> +
> +typedef struct VFIODirtyRanges {
> +    VFIOContainer *container;
> +    VFIODirtyTrackingRange ranges;
> +    MemoryListener listener;
> +} VFIODirtyRanges;
> +
>  typedef struct VFIOGuestIOMMU {
>      VFIOContainer *container;
>      IOMMUMemoryRegion *iommu_mr;
Cédric Le Goater March 7, 2023, 10:08 a.m. UTC | #2
On 3/7/23 03:57, Alex Williamson wrote:
> On Tue,  7 Mar 2023 02:02:52 +0000
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> According to the device DMA logging uAPI, IOVA ranges to be logged by
>> the device must be provided all at once upon DMA logging start.
>>
>> As preparation for the following patches which will add device dirty
>> page tracking, keep a record of all DMA mapped IOVA ranges so later they
>> can be used for DMA logging start.
>>
>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>>   hw/vfio/common.c              | 76 +++++++++++++++++++++++++++++++++++
>>   hw/vfio/trace-events          |  1 +
>>   include/hw/vfio/vfio-common.h | 13 ++++++
>>   3 files changed, 90 insertions(+)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 3a6491dbc523..a9b1fc999121 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -1334,11 +1334,87 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
>>       return ret;
>>   }
>>   
>> +static void vfio_dirty_tracking_update(MemoryListener *listener,
>> +                                       MemoryRegionSection *section)
>> +{
>> +    VFIODirtyRanges *dirty = container_of(listener, VFIODirtyRanges, listener);
>> +    VFIODirtyTrackingRange *range = &dirty->ranges;
>> +    hwaddr max32 = UINT32_MAX - 1ULL;
> 
> The -1 is wrong here, UINT32_MAX is (2^32 - 1)
> 
>> +    hwaddr iova, end;
>> +
>> +    if (!vfio_listener_valid_section(section) ||
>> +        !vfio_get_section_iova_range(dirty->container, section,
>> +                                     &iova, &end, NULL)) {
>> +        return;
>> +    }
>> +
>> +    /*
>> +     * The address space passed to the dirty tracker is reduced to two ranges:
>> +     * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
>> +     * The underlying reports of dirty will query a sub-interval of each of
>> +     * these ranges.
>> +     *
>> +     * The purpose of the dual range handling is to handle known cases of big
>> +     * holes in the address space, like the x86 AMD 1T hole. The alternative
>> +     * would be an IOVATree but that has a much bigger runtime overhead and
>> +     * unnecessary complexity.
>> +     */
>> +    if (iova < max32 && end <= max32) {
> 
> Nit, the first test is redundant, iova is necessarily less than end.
> 
>> +        if (range->min32 > iova) {
>> +            range->min32 = iova;
>> +        }
>> +        if (range->max32 < end) {
>> +            range->max32 = end;
>> +        }
>> +        trace_vfio_device_dirty_tracking_update(iova, end,
>> +                                    range->min32, range->max32);
>> +    } else {
>> +        if (!range->min64 || range->min64 > iova) {
> 
> The first test should be removed, we're initializing min64 to a
> non-zero value now, so if it's zero it's been set and we can't
> de-prioritize that set value.
> 
>> +            range->min64 = iova;
>> +        }
>> +        if (range->max64 < end) {
>> +            range->max64 = end;
>> +        }
>> +        trace_vfio_device_dirty_tracking_update(iova, end,
>> +                                    range->min64, range->max64);
>> +    }
>> +
>> +    return;
>> +}
>> +
>> +static const MemoryListener vfio_dirty_tracking_listener = {
>> +    .name = "vfio-tracking",
>> +    .region_add = vfio_dirty_tracking_update,
>> +};
>> +
>> +static void vfio_dirty_tracking_init(VFIOContainer *container,
>> +                                     VFIODirtyRanges *dirty)
>> +{
>> +    memset(dirty, 0, sizeof(*dirty));
>> +    dirty->ranges.min32 = UINT32_MAX;
>> +    dirty->ranges.min64 = UINT64_MAX;
>> +    dirty->listener = vfio_dirty_tracking_listener;
>> +    dirty->container = container;
>> +
> 
> I was actually thinking the caller would just pass
> VFIODirtyTrackingRange and VFIODirtyRanges would be allocated on the
> stack here, perhaps both are defined private to this file, but this
> works and we can refine later if we so decide.  

It is true that vfio_devices_dma_logging_start() only needs
a VFIODirtyTrackingRange struct and not the VFIODirtyRanges struct
which is a temporary structure for the dirty ranges calculation.
That would be nicer to have if you respin a v5.

I would rename VFIODirtyRanges to VFIODirtyRangesListener and
VFIODirtyTrackingRange to VFIODirtyRanges.

I am not sure they need to be in include/hw/vfio/vfio-common.h but
that seems to be the VFIO practice.

Thanks,

C.


> 
> Alex
> 
> 
>> +    memory_listener_register(&dirty->listener,
>> +                             container->space->as);
>> +
>> +    /*
>> +     * The memory listener is synchronous, and used to calculate the range
>> +     * to dirty tracking. Unregister it after we are done as we are not
>> +     * interested in any follow-up updates.
>> +     */
>> +    memory_listener_unregister(&dirty->listener);
>> +}
>> +
>>   static void vfio_listener_log_global_start(MemoryListener *listener)
>>   {
>>       VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>> +    VFIODirtyRanges dirty;
>>       int ret;
>>   
>> +    vfio_dirty_tracking_init(container, &dirty);
>> +
>>       ret = vfio_set_dirty_page_tracking(container, true);
>>       if (ret) {
>>           vfio_set_migration_error(ret);
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index 669d9fe07cd9..d97a6de17921 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -104,6 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
>>   vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
>>   vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
>>   vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
>> +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
>>   vfio_disconnect_container(int fd) "close container->fd=%d"
>>   vfio_put_group(int fd) "close group->fd=%d"
>>   vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 87524c64a443..0f84136cceb5 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -96,6 +96,19 @@ typedef struct VFIOContainer {
>>       QLIST_ENTRY(VFIOContainer) next;
>>   } VFIOContainer;
>>   
>> +typedef struct VFIODirtyTrackingRange {
>> +    hwaddr min32;
>> +    hwaddr max32;
>> +    hwaddr min64;
>> +    hwaddr max64;
>> +} VFIODirtyTrackingRange;
>> +
>> +typedef struct VFIODirtyRanges {
>> +    VFIOContainer *container;
>> +    VFIODirtyTrackingRange ranges;
>> +    MemoryListener listener;
>> +} VFIODirtyRanges;
>> +
>>   typedef struct VFIOGuestIOMMU {
>>       VFIOContainer *container;
>>       IOMMUMemoryRegion *iommu_mr;
>
Joao Martins March 7, 2023, 10:16 a.m. UTC | #3
On 07/03/2023 02:57, Alex Williamson wrote:
> On Tue,  7 Mar 2023 02:02:52 +0000
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> According to the device DMA logging uAPI, IOVA ranges to be logged by
>> the device must be provided all at once upon DMA logging start.
>>
>> As preparation for the following patches which will add device dirty
>> page tracking, keep a record of all DMA mapped IOVA ranges so later they
>> can be used for DMA logging start.
>>
>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>>  hw/vfio/common.c              | 76 +++++++++++++++++++++++++++++++++++
>>  hw/vfio/trace-events          |  1 +
>>  include/hw/vfio/vfio-common.h | 13 ++++++
>>  3 files changed, 90 insertions(+)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 3a6491dbc523..a9b1fc999121 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -1334,11 +1334,87 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
>>      return ret;
>>  }
>>  
>> +static void vfio_dirty_tracking_update(MemoryListener *listener,
>> +                                       MemoryRegionSection *section)
>> +{
>> +    VFIODirtyRanges *dirty = container_of(listener, VFIODirtyRanges, listener);
>> +    VFIODirtyTrackingRange *range = &dirty->ranges;
>> +    hwaddr max32 = UINT32_MAX - 1ULL;
> 
> The -1 is wrong here, UINT32_MAX is (2^32 - 1)
> 
Ugh, what a distraction.

The reason it worked in my tests is because there's a whole at the boundary,
being off by one wouldn't change the end range.

>> +    hwaddr iova, end;
>> +
>> +    if (!vfio_listener_valid_section(section) ||
>> +        !vfio_get_section_iova_range(dirty->container, section,
>> +                                     &iova, &end, NULL)) {
>> +        return;
>> +    }
>> +
>> +    /*
>> +     * The address space passed to the dirty tracker is reduced to two ranges:
>> +     * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
>> +     * The underlying reports of dirty will query a sub-interval of each of
>> +     * these ranges.
>> +     *
>> +     * The purpose of the dual range handling is to handle known cases of big
>> +     * holes in the address space, like the x86 AMD 1T hole. The alternative
>> +     * would be an IOVATree but that has a much bigger runtime overhead and
>> +     * unnecessary complexity.
>> +     */
>> +    if (iova < max32 && end <= max32) {
> 
> Nit, the first test is redundant, iova is necessarily less than end.
>

I'll delete it.

>> +        if (range->min32 > iova) {
>> +            range->min32 = iova;
>> +        }
>> +        if (range->max32 < end) {
>> +            range->max32 = end;
>> +        }
>> +        trace_vfio_device_dirty_tracking_update(iova, end,
>> +                                    range->min32, range->max32);
>> +    } else {
>> +        if (!range->min64 || range->min64 > iova) {
> 
> The first test should be removed, we're initializing min64 to a
> non-zero value now, so if it's zero it's been set and we can't
> de-prioritize that set value.
> 
Distraction again, I am sure I removed all. and the test is pretty useless as
this will never be true.

>> +            range->min64 = iova;
>> +        }
>> +        if (range->max64 < end) {
>> +            range->max64 = end;
>> +        }
>> +        trace_vfio_device_dirty_tracking_update(iova, end,
>> +                                    range->min64, range->max64);
>> +    }
>> +
>> +    return;
>> +}
>> +
>> +static const MemoryListener vfio_dirty_tracking_listener = {
>> +    .name = "vfio-tracking",
>> +    .region_add = vfio_dirty_tracking_update,
>> +};
>> +
>> +static void vfio_dirty_tracking_init(VFIOContainer *container,
>> +                                     VFIODirtyRanges *dirty)
>> +{
>> +    memset(dirty, 0, sizeof(*dirty));
>> +    dirty->ranges.min32 = UINT32_MAX;
>> +    dirty->ranges.min64 = UINT64_MAX;
>> +    dirty->listener = vfio_dirty_tracking_listener;
>> +    dirty->container = container;
>> +
> 
> I was actually thinking the caller would just pass
> VFIODirtyTrackingRange and VFIODirtyRanges would be allocated on the
> stack here, perhaps both are defined private to this file, but this
> works and we can refine later if we so decide.  Thanks,
>
OK, I see what you mean. Since I have to respin v5, I'll fix this.

> 
>> +    memory_listener_register(&dirty->listener,
>> +                             container->space->as);
>> +
>> +    /*
>> +     * The memory listener is synchronous, and used to calculate the range
>> +     * to dirty tracking. Unregister it after we are done as we are not
>> +     * interested in any follow-up updates.
>> +     */
>> +    memory_listener_unregister(&dirty->listener);
>> +}
>> +
>>  static void vfio_listener_log_global_start(MemoryListener *listener)
>>  {
>>      VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>> +    VFIODirtyRanges dirty;
>>      int ret;
>>  
>> +    vfio_dirty_tracking_init(container, &dirty);
>> +
>>      ret = vfio_set_dirty_page_tracking(container, true);
>>      if (ret) {
>>          vfio_set_migration_error(ret);
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index 669d9fe07cd9..d97a6de17921 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -104,6 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
>>  vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
>>  vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
>>  vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
>> +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
>>  vfio_disconnect_container(int fd) "close container->fd=%d"
>>  vfio_put_group(int fd) "close group->fd=%d"
>>  vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 87524c64a443..0f84136cceb5 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -96,6 +96,19 @@ typedef struct VFIOContainer {
>>      QLIST_ENTRY(VFIOContainer) next;
>>  } VFIOContainer;
>>  
>> +typedef struct VFIODirtyTrackingRange {
>> +    hwaddr min32;
>> +    hwaddr max32;
>> +    hwaddr min64;
>> +    hwaddr max64;
>> +} VFIODirtyTrackingRange;
>> +
>> +typedef struct VFIODirtyRanges {
>> +    VFIOContainer *container;
>> +    VFIODirtyTrackingRange ranges;
>> +    MemoryListener listener;
>> +} VFIODirtyRanges;
>> +
>>  typedef struct VFIOGuestIOMMU {
>>      VFIOContainer *container;
>>      IOMMUMemoryRegion *iommu_mr;
>
Joao Martins March 7, 2023, 10:30 a.m. UTC | #4
On 07/03/2023 10:08, Cédric Le Goater wrote:
> On 3/7/23 03:57, Alex Williamson wrote:
>> On Tue,  7 Mar 2023 02:02:52 +0000
>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>> +static void vfio_dirty_tracking_init(VFIOContainer *container,
>>> +                                     VFIODirtyRanges *dirty)
>>> +{
>>> +    memset(dirty, 0, sizeof(*dirty));
>>> +    dirty->ranges.min32 = UINT32_MAX;
>>> +    dirty->ranges.min64 = UINT64_MAX;
>>> +    dirty->listener = vfio_dirty_tracking_listener;
>>> +    dirty->container = container;
>>> +
>>
>> I was actually thinking the caller would just pass
>> VFIODirtyTrackingRange and VFIODirtyRanges would be allocated on the
>> stack here, perhaps both are defined private to this file, but this
>> works and we can refine later if we so decide.  
> 
> It is true that vfio_devices_dma_logging_start() only needs
> a VFIODirtyTrackingRange struct and not the VFIODirtyRanges struct
> which is a temporary structure for the dirty ranges calculation.
> That would be nicer to have if you respin a v5.
> 
I can.

> I would rename VFIODirtyRanges to VFIODirtyRangesListener and
> VFIODirtyTrackingRange to VFIODirtyRanges.
> 
Better naming indeed.

> I am not sure they need to be in include/hw/vfio/vfio-common.h but
> that seems to be the VFIO practice.
> 
I can move as Alex also suggested it. There's already have
vfio_giommu_dirty_notifier and VFIOBitmap as private structures. I don't expect
that this will be used by other files
Joao Martins March 7, 2023, 12:13 p.m. UTC | #5
On 07/03/2023 10:16, Joao Martins wrote:
> On 07/03/2023 02:57, Alex Williamson wrote:
>> On Tue,  7 Mar 2023 02:02:52 +0000
>> Joao Martins <joao.m.martins@oracle.com> wrote:
>>
>>> According to the device DMA logging uAPI, IOVA ranges to be logged by
>>> the device must be provided all at once upon DMA logging start.
>>>
>>> As preparation for the following patches which will add device dirty
>>> page tracking, keep a record of all DMA mapped IOVA ranges so later they
>>> can be used for DMA logging start.
>>>
>>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>> ---
>>>  hw/vfio/common.c              | 76 +++++++++++++++++++++++++++++++++++
>>>  hw/vfio/trace-events          |  1 +
>>>  include/hw/vfio/vfio-common.h | 13 ++++++
>>>  3 files changed, 90 insertions(+)
>>>
>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>> index 3a6491dbc523..a9b1fc999121 100644
>>> --- a/hw/vfio/common.c
>>> +++ b/hw/vfio/common.c
>>> @@ -1334,11 +1334,87 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
>>>      return ret;
>>>  }
>>>  
>>> +static void vfio_dirty_tracking_update(MemoryListener *listener,
>>> +                                       MemoryRegionSection *section)
>>> +{
>>> +    VFIODirtyRanges *dirty = container_of(listener, VFIODirtyRanges, listener);
>>> +    VFIODirtyTrackingRange *range = &dirty->ranges;
>>> +    hwaddr max32 = UINT32_MAX - 1ULL;
>>
>> The -1 is wrong here, UINT32_MAX is (2^32 - 1)
>>
> Ugh, what a distraction.
> 
> The reason it worked in my tests is because there's a whole at the boundary,
> being off by one wouldn't change the end range.
> 
>>> +    hwaddr iova, end;
>>> +
>>> +    if (!vfio_listener_valid_section(section) ||
>>> +        !vfio_get_section_iova_range(dirty->container, section,
>>> +                                     &iova, &end, NULL)) {
>>> +        return;
>>> +    }
>>> +
>>> +    /*
>>> +     * The address space passed to the dirty tracker is reduced to two ranges:
>>> +     * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
>>> +     * The underlying reports of dirty will query a sub-interval of each of
>>> +     * these ranges.
>>> +     *
>>> +     * The purpose of the dual range handling is to handle known cases of big
>>> +     * holes in the address space, like the x86 AMD 1T hole. The alternative
>>> +     * would be an IOVATree but that has a much bigger runtime overhead and
>>> +     * unnecessary complexity.
>>> +     */
>>> +    if (iova < max32 && end <= max32) {
>>
>> Nit, the first test is redundant, iova is necessarily less than end.
>>
> 
> I'll delete it.
> 
>>> +        if (range->min32 > iova) {
>>> +            range->min32 = iova;
>>> +        }
>>> +        if (range->max32 < end) {
>>> +            range->max32 = end;
>>> +        }
>>> +        trace_vfio_device_dirty_tracking_update(iova, end,
>>> +                                    range->min32, range->max32);
>>> +    } else {
>>> +        if (!range->min64 || range->min64 > iova) {
>>
>> The first test should be removed, we're initializing min64 to a
>> non-zero value now, so if it's zero it's been set and we can't
>> de-prioritize that set value.
>>
> Distraction again, I am sure I removed all. and the test is pretty useless as
> this will never be true.
> 

Meanwhile I rewrote that as below for readability. I have one level off of
indentation, and it reads better to me with less repetition of checks.
We select the min/max pair and apply it and update the tracking range.

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index a03a2fdfafc5..2ba8fa9043d2 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1329,8 +1329,7 @@ static void vfio_dirty_tracking_update(MemoryListener
*listener,
 {
     VFIODirtyRanges *dirty = container_of(listener, VFIODirtyRanges, listener);
     VFIODirtyTrackingRange *range = &dirty->ranges;
-    hwaddr max32 = UINT32_MAX - 1ULL;
-    hwaddr iova, end;
+    hwaddr iova, end, *min, *max;

     if (!vfio_listener_valid_section(section) ||
         !vfio_get_section_iova_range(dirty->container, section,
@@ -1349,26 +1348,17 @@ static void vfio_dirty_tracking_update(MemoryListener
*listener,
      * would be an IOVATree but that has a much bigger runtime overhead and
      * unnecessary complexity.
      */
-    if (iova < max32 && end <= max32) {
-        if (range->min32 > iova) {
-            range->min32 = iova;
-        }
-        if (range->max32 < end) {
-            range->max32 = end;
-        }
-        trace_vfio_device_dirty_tracking_update(iova, end,
-                                    range->min32, range->max32);
-    } else {
-        if (!range->min64 || range->min64 > iova) {
-            range->min64 = iova;
-        }
-        if (range->max64 < end) {
-            range->max64 = end;
-        }
-        trace_vfio_device_dirty_tracking_update(iova, end,
-                                    range->min64, range->max64);
+    min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
+    max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
+
+    if (*min > iova) {
+        *min = iova;
+    }
+    if (*max < end) {
+        *max = end;
     }

+    trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
     return;
 }

>>> +            range->min64 = iova;
>>> +        }
>>> +        if (range->max64 < end) {
>>> +            range->max64 = end;
>>> +        }
>>> +        trace_vfio_device_dirty_tracking_update(iova, end,
>>> +                                    range->min64, range->max64);
>>> +    }
>>> +
>>> +    return;
>>> +}
>>> +
>>> +static const MemoryListener vfio_dirty_tracking_listener = {
>>> +    .name = "vfio-tracking",
>>> +    .region_add = vfio_dirty_tracking_update,
>>> +};
>>> +
>>> +static void vfio_dirty_tracking_init(VFIOContainer *container,
>>> +                                     VFIODirtyRanges *dirty)
>>> +{
>>> +    memset(dirty, 0, sizeof(*dirty));
>>> +    dirty->ranges.min32 = UINT32_MAX;
>>> +    dirty->ranges.min64 = UINT64_MAX;
>>> +    dirty->listener = vfio_dirty_tracking_listener;
>>> +    dirty->container = container;
>>> +
>>
>> I was actually thinking the caller would just pass
>> VFIODirtyTrackingRange and VFIODirtyRanges would be allocated on the
>> stack here, perhaps both are defined private to this file, but this
>> works and we can refine later if we so decide.  Thanks,
>>
> OK, I see what you mean. Since I have to respin v5, I'll fix this.
> 

I've done this, and made the declarations private (using Cedric's naming
suggestions).

>>
>>> +    memory_listener_register(&dirty->listener,
>>> +                             container->space->as);
>>> +
>>> +    /*
>>> +     * The memory listener is synchronous, and used to calculate the range
>>> +     * to dirty tracking. Unregister it after we are done as we are not
>>> +     * interested in any follow-up updates.
>>> +     */
>>> +    memory_listener_unregister(&dirty->listener);
>>> +}
>>> +
>>>  static void vfio_listener_log_global_start(MemoryListener *listener)
>>>  {
>>>      VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>>> +    VFIODirtyRanges dirty;
>>>      int ret;
>>>  
>>> +    vfio_dirty_tracking_init(container, &dirty);
>>> +
>>>      ret = vfio_set_dirty_page_tracking(container, true);
>>>      if (ret) {
>>>          vfio_set_migration_error(ret);
>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>>> index 669d9fe07cd9..d97a6de17921 100644
>>> --- a/hw/vfio/trace-events
>>> +++ b/hw/vfio/trace-events
>>> @@ -104,6 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
>>>  vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
>>>  vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
>>>  vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
>>> +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
>>>  vfio_disconnect_container(int fd) "close container->fd=%d"
>>>  vfio_put_group(int fd) "close group->fd=%d"
>>>  vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>>> index 87524c64a443..0f84136cceb5 100644
>>> --- a/include/hw/vfio/vfio-common.h
>>> +++ b/include/hw/vfio/vfio-common.h
>>> @@ -96,6 +96,19 @@ typedef struct VFIOContainer {
>>>      QLIST_ENTRY(VFIOContainer) next;
>>>  } VFIOContainer;
>>>  
>>> +typedef struct VFIODirtyTrackingRange {
>>> +    hwaddr min32;
>>> +    hwaddr max32;
>>> +    hwaddr min64;
>>> +    hwaddr max64;
>>> +} VFIODirtyTrackingRange;
>>> +
>>> +typedef struct VFIODirtyRanges {
>>> +    VFIOContainer *container;
>>> +    VFIODirtyTrackingRange ranges;
>>> +    MemoryListener listener;
>>> +} VFIODirtyRanges;
>>> +
>>>  typedef struct VFIOGuestIOMMU {
>>>      VFIOContainer *container;
>>>      IOMMUMemoryRegion *iommu_mr;
>>
diff mbox series

Patch

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 3a6491dbc523..a9b1fc999121 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1334,11 +1334,87 @@  static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
     return ret;
 }
 
+static void vfio_dirty_tracking_update(MemoryListener *listener,
+                                       MemoryRegionSection *section)
+{
+    VFIODirtyRanges *dirty = container_of(listener, VFIODirtyRanges, listener);
+    VFIODirtyTrackingRange *range = &dirty->ranges;
+    hwaddr max32 = UINT32_MAX - 1ULL;
+    hwaddr iova, end;
+
+    if (!vfio_listener_valid_section(section) ||
+        !vfio_get_section_iova_range(dirty->container, section,
+                                     &iova, &end, NULL)) {
+        return;
+    }
+
+    /*
+     * The address space passed to the dirty tracker is reduced to two ranges:
+     * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
+     * The underlying reports of dirty will query a sub-interval of each of
+     * these ranges.
+     *
+     * The purpose of the dual range handling is to handle known cases of big
+     * holes in the address space, like the x86 AMD 1T hole. The alternative
+     * would be an IOVATree but that has a much bigger runtime overhead and
+     * unnecessary complexity.
+     */
+    if (iova < max32 && end <= max32) {
+        if (range->min32 > iova) {
+            range->min32 = iova;
+        }
+        if (range->max32 < end) {
+            range->max32 = end;
+        }
+        trace_vfio_device_dirty_tracking_update(iova, end,
+                                    range->min32, range->max32);
+    } else {
+        if (!range->min64 || range->min64 > iova) {
+            range->min64 = iova;
+        }
+        if (range->max64 < end) {
+            range->max64 = end;
+        }
+        trace_vfio_device_dirty_tracking_update(iova, end,
+                                    range->min64, range->max64);
+    }
+
+    return;
+}
+
+static const MemoryListener vfio_dirty_tracking_listener = {
+    .name = "vfio-tracking",
+    .region_add = vfio_dirty_tracking_update,
+};
+
+static void vfio_dirty_tracking_init(VFIOContainer *container,
+                                     VFIODirtyRanges *dirty)
+{
+    memset(dirty, 0, sizeof(*dirty));
+    dirty->ranges.min32 = UINT32_MAX;
+    dirty->ranges.min64 = UINT64_MAX;
+    dirty->listener = vfio_dirty_tracking_listener;
+    dirty->container = container;
+
+    memory_listener_register(&dirty->listener,
+                             container->space->as);
+
+    /*
+     * The memory listener is synchronous, and used to calculate the range
+     * to dirty tracking. Unregister it after we are done as we are not
+     * interested in any follow-up updates.
+     */
+    memory_listener_unregister(&dirty->listener);
+}
+
 static void vfio_listener_log_global_start(MemoryListener *listener)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+    VFIODirtyRanges dirty;
     int ret;
 
+    vfio_dirty_tracking_init(container, &dirty);
+
     ret = vfio_set_dirty_page_tracking(container, true);
     if (ret) {
         vfio_set_migration_error(ret);
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 669d9fe07cd9..d97a6de17921 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -104,6 +104,7 @@  vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
 vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
 vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
 vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
+vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
 vfio_disconnect_container(int fd) "close container->fd=%d"
 vfio_put_group(int fd) "close group->fd=%d"
 vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 87524c64a443..0f84136cceb5 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -96,6 +96,19 @@  typedef struct VFIOContainer {
     QLIST_ENTRY(VFIOContainer) next;
 } VFIOContainer;
 
+typedef struct VFIODirtyTrackingRange {
+    hwaddr min32;
+    hwaddr max32;
+    hwaddr min64;
+    hwaddr max64;
+} VFIODirtyTrackingRange;
+
+typedef struct VFIODirtyRanges {
+    VFIOContainer *container;
+    VFIODirtyTrackingRange ranges;
+    MemoryListener listener;
+} VFIODirtyRanges;
+
 typedef struct VFIOGuestIOMMU {
     VFIOContainer *container;
     IOMMUMemoryRegion *iommu_mr;