
[v1] vfio/common: Separate vfio-pci ranges

Message ID 20230908092944.47589-1-joao.m.martins@oracle.com (mailing list archive)
State New, archived
Series [v1] vfio/common: Separate vfio-pci ranges

Commit Message

Joao Martins Sept. 8, 2023, 9:29 a.m. UTC
QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
QEMU includes in the 64-bit range the RAM regions at the lower part
and vfio-pci device RAM regions which are at the top of the address
space. This range contains a large gap and the size can be bigger than
the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).

To avoid such large ranges, introduce a new PCI range covering the
vfio-pci device RAM regions, but only if the addresses are above 4GB,
to avoid breaking potential SeaBIOS guests.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
[ clg: - wrote commit log
       - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
Signed-off-by: Cédric Le Goater <clg@redhat.com>
---
v2:
* s/minpci/minpci64/
* s/maxpci/maxpci64/
* Expand comment to cover the pci-hole64 and why we don't do special
  handling of pci-hole32.
---
 hw/vfio/common.c     | 71 +++++++++++++++++++++++++++++++++++++-------
 hw/vfio/trace-events |  2 +-
 2 files changed, 61 insertions(+), 12 deletions(-)
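
For a concrete sense of the size problem described above, consider an
illustrative layout (numbers are examples, not taken from the patch): RAM
above 4G starts at 0x1_0000_0000, and OVMF's dynamic MMIO window places the
64-bit PCI hole, and thus the vfio-pci BARs, near the top of a 46-bit guest
physical address space. A single combined 64-bit range then spans roughly

    2^46 - 2^32 bytes  ~=  64 TiB

which exceeds the 4 TiB (2^42) limit of a tracker like MLX5's, even though
the RAM and BAR portions are each small. Tracking the vfio-pci regions in a
separate pci64 range keeps every range well below that limit.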

Comments

Joao Martins Sept. 9, 2023, 2:39 p.m. UTC | #1
On 08/09/2023 10:29, Joao Martins wrote:
> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
> QEMU includes in the 64-bit range the RAM regions at the lower part
> and vfio-pci device RAM regions which are at the top of the address
> space. This range contains a large gap and the size can be bigger than
> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
> 
> To avoid such large ranges, introduce a new PCI range covering the
> vfio-pci device RAM regions, this only if the addresses are above 4GB
> to avoid breaking potential SeaBIOS guests.
> 
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> [ clg: - wrote commit log
>        - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
> Signed-off-by: Cédric Le Goater <clg@redhat.com>

This should have a:

Fixes: 5255bbf4ec16 ("vfio/common: Add device dirty page tracking start/stop")

> ---
> v2:
> * s/minpci/minpci64/
> * s/maxpci/maxpci64/
> * Expand comment to cover the pci-hole64 and why we don't do special
>   handling of pci-hole32.
> ---
>  hw/vfio/common.c     | 71 +++++++++++++++++++++++++++++++++++++-------
>  hw/vfio/trace-events |  2 +-
>  2 files changed, 61 insertions(+), 12 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 237101d03844..134649226d43 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -27,6 +27,7 @@
>  
>  #include "hw/vfio/vfio-common.h"
>  #include "hw/vfio/vfio.h"
> +#include "hw/vfio/pci.h"
>  #include "exec/address-spaces.h"
>  #include "exec/memory.h"
>  #include "exec/ram_addr.h"
> @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
>      hwaddr max32;
>      hwaddr min64;
>      hwaddr max64;
> +    hwaddr minpci64;
> +    hwaddr maxpci64;
>  } VFIODirtyRanges;
>  
>  typedef struct VFIODirtyRangesListener {
> @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
>      MemoryListener listener;
>  } VFIODirtyRangesListener;
>  
> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
> +                                     VFIOContainer *container)
> +{
> +    VFIOPCIDevice *pcidev;
> +    VFIODevice *vbasedev;
> +    VFIOGroup *group;
> +    Object *owner;
> +
> +    owner = memory_region_owner(section->mr);
> +
> +    QLIST_FOREACH(group, &container->group_list, container_next) {
> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> +            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
> +                continue;
> +            }
> +            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +            if (OBJECT(pcidev) == owner) {
> +                return true;
> +            }
> +        }
> +    }
> +
> +    return false;
> +}
> +
>  static void vfio_dirty_tracking_update(MemoryListener *listener,
>                                         MemoryRegionSection *section)
>  {
> @@ -1424,19 +1452,32 @@ static void vfio_dirty_tracking_update(MemoryListener *listener,
>      }
>  
>      /*
> -     * The address space passed to the dirty tracker is reduced to two ranges:
> -     * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
> +     * The address space passed to the dirty tracker is reduced to three ranges:
> +     * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the
> +     * PCI 64-bit hole.
> +     *
>       * The underlying reports of dirty will query a sub-interval of each of
>       * these ranges.
>       *
> -     * The purpose of the dual range handling is to handle known cases of big
> -     * holes in the address space, like the x86 AMD 1T hole. The alternative
> -     * would be an IOVATree but that has a much bigger runtime overhead and
> -     * unnecessary complexity.
> +     * The purpose of the three range handling is to handle known cases of big
> +     * holes in the address space, like the x86 AMD 1T hole, and firmware (like
> +     * OVMF) which may relocate the pci-hole64 to the end of the address space.
> +     * The latter would otherwise generate large ranges for tracking, stressing
> +     * the limits of supported hardware. The pci-hole32 will always be below 4G
> +     * (overlapping or not) so it doesn't need special handling and is part of
> +     * the 32-bit range.
> +     *
> +     * The alternative would be an IOVATree but that has a much bigger runtime
> +     * overhead and unnecessary complexity.
>       */
> -    min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
> -    max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
> -
> +    if (vfio_section_is_vfio_pci(section, dirty->container) &&
> +        iova >= UINT32_MAX) {
> +        min = &range->minpci64;
> +        max = &range->maxpci64;
> +    } else {
> +        min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
> +        max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
> +    }
>      if (*min > iova) {
>          *min = iova;
>      }
> @@ -1461,6 +1502,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container,
>      memset(&dirty, 0, sizeof(dirty));
>      dirty.ranges.min32 = UINT32_MAX;
>      dirty.ranges.min64 = UINT64_MAX;
> +    dirty.ranges.minpci64 = UINT64_MAX;
>      dirty.listener = vfio_dirty_tracking_listener;
>      dirty.container = container;
>  
> @@ -1531,7 +1573,8 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
>       * DMA logging uAPI guarantees to support at least a number of ranges that
>       * fits into a single host kernel base page.
>       */
> -    control->num_ranges = !!tracking->max32 + !!tracking->max64;
> +    control->num_ranges = !!tracking->max32 + !!tracking->max64 +
> +        !!tracking->maxpci64;
>      ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
>                          control->num_ranges);
>      if (!ranges) {
> @@ -1550,11 +1593,17 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
>      if (tracking->max64) {
>          ranges->iova = tracking->min64;
>          ranges->length = (tracking->max64 - tracking->min64) + 1;
> +        ranges++;
> +    }
> +    if (tracking->maxpci64) {
> +        ranges->iova = tracking->minpci64;
> +        ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1;
>      }
>  
>      trace_vfio_device_dirty_tracking_start(control->num_ranges,
>                                             tracking->min32, tracking->max32,
> -                                           tracking->min64, tracking->max64);
> +                                           tracking->min64, tracking->max64,
> +                                           tracking->minpci64, tracking->maxpci64);
>  
>      return feature;
>  }
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index ce61b10827b6..cc7c21365c92 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -104,7 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
>  vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
>  vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
>  vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
> -vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"]"
> +vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"], pci64:[0x%"PRIx64" - 0x%"PRIx64"]"
>  vfio_disconnect_container(int fd) "close container->fd=%d"
>  vfio_put_group(int fd) "close group->fd=%d"
>  vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
Cédric Le Goater Sept. 11, 2023, 6:44 a.m. UTC | #2
On 9/8/23 11:29, Joao Martins wrote:
> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
> QEMU includes in the 64-bit range the RAM regions at the lower part
> and vfio-pci device RAM regions which are at the top of the address
> space. This range contains a large gap and the size can be bigger than
> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
> 
> To avoid such large ranges, introduce a new PCI range covering the
> vfio-pci device RAM regions, this only if the addresses are above 4GB
> to avoid breaking potential SeaBIOS guests.
> 
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> [ clg: - wrote commit log
>         - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
> Signed-off-by: Cédric Le Goater <clg@redhat.com>
> ---
> v2:
> * s/minpci/minpci64/
> * s/maxpci/maxpci64/
> * Expand comment to cover the pci-hole64 and why we don't do special
>    handling of pci-hole32.

This is a valuable fix for the latest OVMF enabling dynamic MMIO
window. Applied to vfio-next.

Thanks,

C.


> ---
>   hw/vfio/common.c     | 71 +++++++++++++++++++++++++++++++++++++-------
>   hw/vfio/trace-events |  2 +-
>   2 files changed, 61 insertions(+), 12 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 237101d03844..134649226d43 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -27,6 +27,7 @@
>   
>   #include "hw/vfio/vfio-common.h"
>   #include "hw/vfio/vfio.h"
> +#include "hw/vfio/pci.h"
>   #include "exec/address-spaces.h"
>   #include "exec/memory.h"
>   #include "exec/ram_addr.h"
> @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
>       hwaddr max32;
>       hwaddr min64;
>       hwaddr max64;
> +    hwaddr minpci64;
> +    hwaddr maxpci64;
>   } VFIODirtyRanges;
>   
>   typedef struct VFIODirtyRangesListener {
> @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
>       MemoryListener listener;
>   } VFIODirtyRangesListener;
>   
> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
> +                                     VFIOContainer *container)
> +{
> +    VFIOPCIDevice *pcidev;
> +    VFIODevice *vbasedev;
> +    VFIOGroup *group;
> +    Object *owner;
> +
> +    owner = memory_region_owner(section->mr);
> +
> +    QLIST_FOREACH(group, &container->group_list, container_next) {
> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> +            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
> +                continue;
> +            }
> +            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +            if (OBJECT(pcidev) == owner) {
> +                return true;
> +            }
> +        }
> +    }
> +
> +    return false;
> +}
> +
>   static void vfio_dirty_tracking_update(MemoryListener *listener,
>                                          MemoryRegionSection *section)
>   {
> @@ -1424,19 +1452,32 @@ static void vfio_dirty_tracking_update(MemoryListener *listener,
>       }
>   
>       /*
> -     * The address space passed to the dirty tracker is reduced to two ranges:
> -     * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
> +     * The address space passed to the dirty tracker is reduced to three ranges:
> +     * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the
> +     * PCI 64-bit hole.
> +     *
>        * The underlying reports of dirty will query a sub-interval of each of
>        * these ranges.
>        *
> -     * The purpose of the dual range handling is to handle known cases of big
> -     * holes in the address space, like the x86 AMD 1T hole. The alternative
> -     * would be an IOVATree but that has a much bigger runtime overhead and
> -     * unnecessary complexity.
> +     * The purpose of the three range handling is to handle known cases of big
> +     * holes in the address space, like the x86 AMD 1T hole, and firmware (like
> +     * OVMF) which may relocate the pci-hole64 to the end of the address space.
> +     * The latter would otherwise generate large ranges for tracking, stressing
> +     * the limits of supported hardware. The pci-hole32 will always be below 4G
> +     * (overlapping or not) so it doesn't need special handling and is part of
> +     * the 32-bit range.
> +     *
> +     * The alternative would be an IOVATree but that has a much bigger runtime
> +     * overhead and unnecessary complexity.
>        */
> -    min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
> -    max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
> -
> +    if (vfio_section_is_vfio_pci(section, dirty->container) &&
> +        iova >= UINT32_MAX) {
> +        min = &range->minpci64;
> +        max = &range->maxpci64;
> +    } else {
> +        min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
> +        max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
> +    }
>       if (*min > iova) {
>           *min = iova;
>       }
> @@ -1461,6 +1502,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container,
>       memset(&dirty, 0, sizeof(dirty));
>       dirty.ranges.min32 = UINT32_MAX;
>       dirty.ranges.min64 = UINT64_MAX;
> +    dirty.ranges.minpci64 = UINT64_MAX;
>       dirty.listener = vfio_dirty_tracking_listener;
>       dirty.container = container;
>   
> @@ -1531,7 +1573,8 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
>        * DMA logging uAPI guarantees to support at least a number of ranges that
>        * fits into a single host kernel base page.
>        */
> -    control->num_ranges = !!tracking->max32 + !!tracking->max64;
> +    control->num_ranges = !!tracking->max32 + !!tracking->max64 +
> +        !!tracking->maxpci64;
>       ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
>                           control->num_ranges);
>       if (!ranges) {
> @@ -1550,11 +1593,17 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
>       if (tracking->max64) {
>           ranges->iova = tracking->min64;
>           ranges->length = (tracking->max64 - tracking->min64) + 1;
> +        ranges++;
> +    }
> +    if (tracking->maxpci64) {
> +        ranges->iova = tracking->minpci64;
> +        ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1;
>       }
>   
>       trace_vfio_device_dirty_tracking_start(control->num_ranges,
>                                              tracking->min32, tracking->max32,
> -                                           tracking->min64, tracking->max64);
> +                                           tracking->min64, tracking->max64,
> +                                           tracking->minpci64, tracking->maxpci64);
>   
>       return feature;
>   }
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index ce61b10827b6..cc7c21365c92 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -104,7 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
>   vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
>   vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
>   vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
> -vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"]"
> +vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"], pci64:[0x%"PRIx64" - 0x%"PRIx64"]"
>   vfio_disconnect_container(int fd) "close container->fd=%d"
>   vfio_put_group(int fd) "close group->fd=%d"
>   vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
Duan, Zhenzhong Sept. 11, 2023, 8:57 a.m. UTC | #3
>-----Original Message-----
>From: qemu-devel-bounces+zhenzhong.duan=intel.com@nongnu.org <qemu-
>devel-bounces+zhenzhong.duan=intel.com@nongnu.org> On Behalf Of Joao
>Martins
>Sent: Friday, September 8, 2023 5:30 PM
>Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>
>QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
>QEMU includes in the 64-bit range the RAM regions at the lower part
>and vfio-pci device RAM regions which are at the top of the address
>space. This range contains a large gap and the size can be bigger than
>the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
>
>To avoid such large ranges, introduce a new PCI range covering the
>vfio-pci device RAM regions, this only if the addresses are above 4GB
>to avoid breaking potential SeaBIOS guests.
>
>Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>[ clg: - wrote commit log
>       - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
>Signed-off-by: Cédric Le Goater <clg@redhat.com>
>---
>v2:
>* s/minpci/minpci64/
>* s/maxpci/maxpci64/
>* Expand comment to cover the pci-hole64 and why we don't do special
>  handling of pci-hole32.
>---
> hw/vfio/common.c     | 71 +++++++++++++++++++++++++++++++++++++-------
> hw/vfio/trace-events |  2 +-
> 2 files changed, 61 insertions(+), 12 deletions(-)
>
>diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>index 237101d03844..134649226d43 100644
>--- a/hw/vfio/common.c
>+++ b/hw/vfio/common.c
>@@ -27,6 +27,7 @@
>
> #include "hw/vfio/vfio-common.h"
> #include "hw/vfio/vfio.h"
>+#include "hw/vfio/pci.h"
> #include "exec/address-spaces.h"
> #include "exec/memory.h"
> #include "exec/ram_addr.h"
>@@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
>     hwaddr max32;
>     hwaddr min64;
>     hwaddr max64;
>+    hwaddr minpci64;
>+    hwaddr maxpci64;
> } VFIODirtyRanges;
>
> typedef struct VFIODirtyRangesListener {
>@@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
>     MemoryListener listener;
> } VFIODirtyRangesListener;
>
>+static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>+                                     VFIOContainer *container)
>+{
>+    VFIOPCIDevice *pcidev;
>+    VFIODevice *vbasedev;
>+    VFIOGroup *group;
>+    Object *owner;
>+
>+    owner = memory_region_owner(section->mr);
>+
>+    QLIST_FOREACH(group, &container->group_list, container_next) {
>+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>+            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
>+                continue;
>+            }
>+            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>+            if (OBJECT(pcidev) == owner) {
>+                return true;
>+            }
>+        }
>+    }
>+
>+    return false;
>+}

What about simplifying it with memory_region_is_ram_device()?
That way vdpa devices could also be included.
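
For illustration, the suggested check might look roughly like this (a sketch
only: the helper name is made up, and memory_region_is_ram_device() from
QEMU's memory API matches any ram_device region, not just vfio-pci BARs):

/*
 * Hypothetical simplification, not part of the patch: any ram_device
 * region (vfio-pci BARs, but also vdpa/vhost-user host notifiers)
 * would be classified into the pci64 range, so the per-container
 * device walk above would no longer be needed.
 */
static bool vfio_section_is_device_ram(MemoryRegionSection *section)
{
    return memory_region_is_ram_device(section->mr);
}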

Thanks
Zhenzhong
Joao Martins Sept. 11, 2023, 9:06 a.m. UTC | #4
On 11/09/2023 09:57, Duan, Zhenzhong wrote:
>> -----Original Message-----
>> From: qemu-devel-bounces+zhenzhong.duan=intel.com@nongnu.org <qemu-
>> devel-bounces+zhenzhong.duan=intel.com@nongnu.org> On Behalf Of Joao
>> Martins
>> Sent: Friday, September 8, 2023 5:30 PM
>> Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>
>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
>> QEMU includes in the 64-bit range the RAM regions at the lower part
>> and vfio-pci device RAM regions which are at the top of the address
>> space. This range contains a large gap and the size can be bigger than
>> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
>>
>> To avoid such large ranges, introduce a new PCI range covering the
>> vfio-pci device RAM regions, this only if the addresses are above 4GB
>> to avoid breaking potential SeaBIOS guests.
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> [ clg: - wrote commit log
>>       - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
>> Signed-off-by: Cédric Le Goater <clg@redhat.com>
>> ---
>> v2:
>> * s/minpci/minpci64/
>> * s/maxpci/maxpci64/
>> * Expand comment to cover the pci-hole64 and why we don't do special
>>  handling of pci-hole32.
>> ---
>> hw/vfio/common.c     | 71 +++++++++++++++++++++++++++++++++++++-------
>> hw/vfio/trace-events |  2 +-
>> 2 files changed, 61 insertions(+), 12 deletions(-)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 237101d03844..134649226d43 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -27,6 +27,7 @@
>>
>> #include "hw/vfio/vfio-common.h"
>> #include "hw/vfio/vfio.h"
>> +#include "hw/vfio/pci.h"
>> #include "exec/address-spaces.h"
>> #include "exec/memory.h"
>> #include "exec/ram_addr.h"
>> @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
>>     hwaddr max32;
>>     hwaddr min64;
>>     hwaddr max64;
>> +    hwaddr minpci64;
>> +    hwaddr maxpci64;
>> } VFIODirtyRanges;
>>
>> typedef struct VFIODirtyRangesListener {
>> @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
>>     MemoryListener listener;
>> } VFIODirtyRangesListener;
>>
>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>> +                                     VFIOContainer *container)
>> +{
>> +    VFIOPCIDevice *pcidev;
>> +    VFIODevice *vbasedev;
>> +    VFIOGroup *group;
>> +    Object *owner;
>> +
>> +    owner = memory_region_owner(section->mr);
>> +
>> +    QLIST_FOREACH(group, &container->group_list, container_next) {
>> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>> +            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
>> +                continue;
>> +            }
>> +            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>> +            if (OBJECT(pcidev) == owner) {
>> +                return true;
>> +            }
>> +        }
>> +    }
>> +
>> +    return false;
>> +}
> 
> What about simplify it with memory_region_is_ram_device()?
> This way vdpa device could also be included.
> 

Note that the check is not interested in RAM (or any other kind of memory like
VGA), which is covered by the 32-bit/64-bit ranges, but rather in any PCI device
RAM that would fall in the 64-bit PCI hole. Would memory_region_is_ram_device()
really cover that? If so, I am all for the simplification.

	Joao
Duan, Zhenzhong Sept. 11, 2023, 9:48 a.m. UTC | #5
>-----Original Message-----
>From: Joao Martins <joao.m.martins@oracle.com>
>Sent: Monday, September 11, 2023 5:07 PM
>Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>
>On 11/09/2023 09:57, Duan, Zhenzhong wrote:
>>> -----Original Message-----
>>> From: qemu-devel-bounces+zhenzhong.duan=intel.com@nongnu.org <qemu-
>>> devel-bounces+zhenzhong.duan=intel.com@nongnu.org> On Behalf Of Joao
>>> Martins
>>> Sent: Friday, September 8, 2023 5:30 PM
>>> Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>>
>>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>>> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
>>> QEMU includes in the 64-bit range the RAM regions at the lower part
>>> and vfio-pci device RAM regions which are at the top of the address
>>> space. This range contains a large gap and the size can be bigger than
>>> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
>>>
>>> To avoid such large ranges, introduce a new PCI range covering the
>>> vfio-pci device RAM regions, this only if the addresses are above 4GB
>>> to avoid breaking potential SeaBIOS guests.
>>>
>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>> [ clg: - wrote commit log
>>>       - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
>>> Signed-off-by: Cédric Le Goater <clg@redhat.com>
>>> ---
>>> v2:
>>> * s/minpci/minpci64/
>>> * s/maxpci/maxpci64/
>>> * Expand comment to cover the pci-hole64 and why we don't do special
>>>  handling of pci-hole32.
>>> ---
>>> hw/vfio/common.c     | 71 +++++++++++++++++++++++++++++++++++++-------
>>> hw/vfio/trace-events |  2 +-
>>> 2 files changed, 61 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>> index 237101d03844..134649226d43 100644
>>> --- a/hw/vfio/common.c
>>> +++ b/hw/vfio/common.c
>>> @@ -27,6 +27,7 @@
>>>
>>> #include "hw/vfio/vfio-common.h"
>>> #include "hw/vfio/vfio.h"
>>> +#include "hw/vfio/pci.h"
>>> #include "exec/address-spaces.h"
>>> #include "exec/memory.h"
>>> #include "exec/ram_addr.h"
>>> @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
>>>     hwaddr max32;
>>>     hwaddr min64;
>>>     hwaddr max64;
>>> +    hwaddr minpci64;
>>> +    hwaddr maxpci64;
>>> } VFIODirtyRanges;
>>>
>>> typedef struct VFIODirtyRangesListener {
>>> @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
>>>     MemoryListener listener;
>>> } VFIODirtyRangesListener;
>>>
>>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>>> +                                     VFIOContainer *container)
>>> +{
>>> +    VFIOPCIDevice *pcidev;
>>> +    VFIODevice *vbasedev;
>>> +    VFIOGroup *group;
>>> +    Object *owner;
>>> +
>>> +    owner = memory_region_owner(section->mr);
>>> +
>>> +    QLIST_FOREACH(group, &container->group_list, container_next) {
>>> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>>> +            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
>>> +                continue;
>>> +            }
>>> +            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>>> +            if (OBJECT(pcidev) == owner) {
>>> +                return true;
>>> +            }
>>> +        }
>>> +    }
>>> +
>>> +    return false;
>>> +}
>>
>> What about simplify it with memory_region_is_ram_device()?
>> This way vdpa device could also be included.
>>
>
>Note that the check is not interested in RAM (or any other kinda of memory like
>VGA). That's covered in the 32-64 ranges. But rather in any PCI device RAM that
>would fall in the 64-bit PCI hole. Would memory_region_is_ram_device() really
>cover it? If so, I am all for the simplification.

Ram device is used not only by vfio-pci BARs but also by the host notifiers of
vdpa and vhost-user.

I have another question: previously I thought vfio-pci BARs were device state,
saved/restored through the VFIO migration protocol, so we don't need to dirty
track them. Do I understand it wrong?

Thanks
Zhenzhong
Joao Martins Sept. 11, 2023, 10:12 a.m. UTC | #6
On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>> -----Original Message-----
>> From: Joao Martins <joao.m.martins@oracle.com>
>> Sent: Monday, September 11, 2023 5:07 PM
>> Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>
>> On 11/09/2023 09:57, Duan, Zhenzhong wrote:
>>>> -----Original Message-----
>>>> From: qemu-devel-bounces+zhenzhong.duan=intel.com@nongnu.org <qemu-
>>>> devel-bounces+zhenzhong.duan=intel.com@nongnu.org> On Behalf Of Joao
>>>> Martins
>>>> Sent: Friday, September 8, 2023 5:30 PM
>>>> Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>>>
>>>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>>>> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
>>>> QEMU includes in the 64-bit range the RAM regions at the lower part
>>>> and vfio-pci device RAM regions which are at the top of the address
>>>> space. This range contains a large gap and the size can be bigger than
>>>> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
>>>>
>>>> To avoid such large ranges, introduce a new PCI range covering the
>>>> vfio-pci device RAM regions, this only if the addresses are above 4GB
>>>> to avoid breaking potential SeaBIOS guests.
>>>>
>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>>> [ clg: - wrote commit log
>>>>       - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
>>>> Signed-off-by: Cédric Le Goater <clg@redhat.com>
>>>> ---
>>>> v2:
>>>> * s/minpci/minpci64/
>>>> * s/maxpci/maxpci64/
>>>> * Expand comment to cover the pci-hole64 and why we don't do special
>>>>  handling of pci-hole32.
>>>> ---
>>>> hw/vfio/common.c     | 71 +++++++++++++++++++++++++++++++++++++-------
>>>> hw/vfio/trace-events |  2 +-
>>>> 2 files changed, 61 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>> index 237101d03844..134649226d43 100644
>>>> --- a/hw/vfio/common.c
>>>> +++ b/hw/vfio/common.c
>>>> @@ -27,6 +27,7 @@
>>>>
>>>> #include "hw/vfio/vfio-common.h"
>>>> #include "hw/vfio/vfio.h"
>>>> +#include "hw/vfio/pci.h"
>>>> #include "exec/address-spaces.h"
>>>> #include "exec/memory.h"
>>>> #include "exec/ram_addr.h"
>>>> @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
>>>>     hwaddr max32;
>>>>     hwaddr min64;
>>>>     hwaddr max64;
>>>> +    hwaddr minpci64;
>>>> +    hwaddr maxpci64;
>>>> } VFIODirtyRanges;
>>>>
>>>> typedef struct VFIODirtyRangesListener {
>>>> @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
>>>>     MemoryListener listener;
>>>> } VFIODirtyRangesListener;
>>>>
>>>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>>>> +                                     VFIOContainer *container)
>>>> +{
>>>> +    VFIOPCIDevice *pcidev;
>>>> +    VFIODevice *vbasedev;
>>>> +    VFIOGroup *group;
>>>> +    Object *owner;
>>>> +
>>>> +    owner = memory_region_owner(section->mr);
>>>> +
>>>> +    QLIST_FOREACH(group, &container->group_list, container_next) {
>>>> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>>>> +            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
>>>> +                continue;
>>>> +            }
>>>> +            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>>>> +            if (OBJECT(pcidev) == owner) {
>>>> +                return true;
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return false;
>>>> +}
>>>
>>> What about simplify it with memory_region_is_ram_device()?
>>> This way vdpa device could also be included.
>>>
>>
>> Note that the check is not interested in RAM (or any other kinda of memory like
>> VGA). That's covered in the 32-64 ranges. But rather in any PCI device RAM that
>> would fall in the 64-bit PCI hole. Would memory_region_is_ram_device() really
>> cover it? If so, I am all for the simplification.
> 
> Ram device is used not only by vfio pci bars but also host notifier of vdpa and vhost-user.
> 
> I have another question, previously I think vfio pci bars are device states and
> save/restored through VFIO migration protocol, so we don't need to dirty
> tracking them. Do I understand wrong?

The general thinking of device dirty tracking is to track all addressable IOVAs.
But you do raise a good question. My understanding is that migrating the bars
*as is* might be device migration specific (not a guarantee?); the save file and
precopy interface are the only places we transfer from/to the data and it's just
opaque data, not bars or anything formatted specifically -- so if we migrate
bars it is hidden in what device f/w wants to move. Might be that BARs aren't
even needed as they are sort of scratch space from h/w side. Ultimately, the
dirty tracker is the one reporting the values, and if the device h/w chooses not
to report those IOVAs as dirty then nothing changes.
Joao Martins Sept. 11, 2023, 10:20 a.m. UTC | #7
On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>>>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>>>> +                                     VFIOContainer *container)
>>>> +{
>>>> +    VFIOPCIDevice *pcidev;
>>>> +    VFIODevice *vbasedev;
>>>> +    VFIOGroup *group;
>>>> +    Object *owner;
>>>> +
>>>> +    owner = memory_region_owner(section->mr);
>>>> +
>>>> +    QLIST_FOREACH(group, &container->group_list, container_next) {
>>>> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>>>> +            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
>>>> +                continue;
>>>> +            }
>>>> +            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>>>> +            if (OBJECT(pcidev) == owner) {
>>>> +                return true;
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return false;
>>>> +}
>>>
>>> What about simplify it with memory_region_is_ram_device()?
>>> This way vdpa device could also be included.
>>>
>>
>> Note that the check is not interested in RAM (or any other kinda of memory like
>> VGA). That's covered in the 32-64 ranges. But rather in any PCI device RAM that
>> would fall in the 64-bit PCI hole. Would memory_region_is_ram_device() really
>> cover it? If so, I am all for the simplification.
> 
> Ram device is used not only by vfio pci bars but also host notifier of vdpa and vhost-user.

My only concern is whether this is all part of the pci-hole64 or not, e.g. if we
expand to the general memory_region_is_ram_device() would we go back to the
initial bug where we create an enormous range. The latter that you mentioned
should be mainly virtio-net devices as presented to the guest (regardless of
whether the backend is vdpa or vhost-user), and perhaps they are all in the
32-bit PCI hole?

	Joao
Duan, Zhenzhong Sept. 11, 2023, 11:03 a.m. UTC | #8
>-----Original Message-----
>From: Joao Martins <joao.m.martins@oracle.com>
>Sent: Monday, September 11, 2023 6:13 PM
>Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>
>On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>>> -----Original Message-----
>>> From: Joao Martins <joao.m.martins@oracle.com>
>>> Sent: Monday, September 11, 2023 5:07 PM
>>> Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
...
>> I have another question, previously I think vfio pci bars are device states and
>> save/restored through VFIO migration protocol, so we don't need to dirty
>> tracking them. Do I understand wrong?
>
>The general thinking of device dirty tracking is to track all addressable IOVAs.
>But you do raise a good question. My understanding is that migrating the bars
>*as is* might be device migration specific (not a guarantee?); the save file and
>precopy interface are the only places we transfer from/to the data and it's just
>opaque data, not bars or anything formatted specifically -- so if we migrate
>bars it is hidden in what device f/w wants to move. Might be that BARs aren't
>even needed as they are sort of scratch space from h/w side. Ultimately, the
>dirty tracker is the one reporting the values, and the device h/w chooses to not
>report those IOVAs as dirty then nothing changes.

Understood, thanks Joao.
Duan, Zhenzhong Sept. 11, 2023, 11:18 a.m. UTC | #9
>-----Original Message-----
>From: Joao Martins <joao.m.martins@oracle.com>
>Sent: Monday, September 11, 2023 6:20 PM
>To: Duan, Zhenzhong <zhenzhong.duan@intel.com>; qemu-devel@nongnu.org
>Cc: Alex Williamson <alex.williamson@redhat.com>; Cedric Le Goater
><clg@redhat.com>; Avihai Horon <avihaih@nvidia.com>; Gerd Hoffmann
><kraxel@redhat.com>
>Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>
>On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>>>>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>>>>> +                                     VFIOContainer *container)
>>>>> +{
>>>>> +    VFIOPCIDevice *pcidev;
>>>>> +    VFIODevice *vbasedev;
>>>>> +    VFIOGroup *group;
>>>>> +    Object *owner;
>>>>> +
>>>>> +    owner = memory_region_owner(section->mr);
>>>>> +
>>>>> +    QLIST_FOREACH(group, &container->group_list, container_next) {
>>>>> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>>>>> +            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
>>>>> +                continue;
>>>>> +            }
>>>>> +            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>>>>> +            if (OBJECT(pcidev) == owner) {
>>>>> +                return true;
>>>>> +            }
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    return false;
>>>>> +}
>>>>
>>>> What about simplify it with memory_region_is_ram_device()?
>>>> This way vdpa device could also be included.
>>>>
>>>
>>> Note that the check is not interested in RAM (or any other kinda of memory
>like
>>> VGA). That's covered in the 32-64 ranges. But rather in any PCI device RAM
>that
>>> would fall in the 64-bit PCI hole. Would memory_region_is_ram_device() really
>>> cover it? If so, I am all for the simplification.
>>
>> Ram device is used not only by vfio pci bars but also host notifier of vdpa and
>vhost-user.
>
>My only concern is whether this is all part of the pci-hole64 or not e.g. if we
>expand to general memory_region_is_ram_device() would we go back to the
>initial bug where we create an enourmous range.

Ok, I have no idea if memory_region_is_ram_device() will be expanded for other
usages in the future. Anyway, it looks safer to keep your code as is.

> The latter that you mentioned should be
>mainly virtio-net devices as presented to the guest (regardless of backend is
>vdpa, or vhost-user) and perhaps they are all in the hole32 PCI hole?
Agree, that's possible.

Thanks
Zhenzhong
Alex Williamson Sept. 11, 2023, 6:35 p.m. UTC | #10
On Mon, 11 Sep 2023 11:12:55 +0100
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 11/09/2023 10:48, Duan, Zhenzhong wrote:
> >> -----Original Message-----
> >> From: Joao Martins <joao.m.martins@oracle.com>
> >> Sent: Monday, September 11, 2023 5:07 PM
> >> Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
> >>
> >> On 11/09/2023 09:57, Duan, Zhenzhong wrote:  
> >>>> -----Original Message-----
> >>>> From: qemu-devel-bounces+zhenzhong.duan=intel.com@nongnu.org <qemu-
> >>>> devel-bounces+zhenzhong.duan=intel.com@nongnu.org> On Behalf Of Joao
> >>>> Martins
> >>>> Sent: Friday, September 8, 2023 5:30 PM
> >>>> Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
> >>>>
> >>>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
> >>>> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
> >>>> QEMU includes in the 64-bit range the RAM regions at the lower part
> >>>> and vfio-pci device RAM regions which are at the top of the address
> >>>> space. This range contains a large gap and the size can be bigger than
> >>>> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
> >>>>
> >>>> To avoid such large ranges, introduce a new PCI range covering the
> >>>> vfio-pci device RAM regions, this only if the addresses are above 4GB
> >>>> to avoid breaking potential SeaBIOS guests.
> >>>>
> >>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> >>>> [ clg: - wrote commit log
> >>>>       - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
> >>>> Signed-off-by: Cédric Le Goater <clg@redhat.com>
> >>>> ---
> >>>> v2:
> >>>> * s/minpci/minpci64/
> >>>> * s/maxpci/maxpci64/
> >>>> * Expand comment to cover the pci-hole64 and why we don't do special
> >>>>  handling of pci-hole32.
> >>>> ---
> >>>> hw/vfio/common.c     | 71 +++++++++++++++++++++++++++++++++++++-------
> >>>> hw/vfio/trace-events |  2 +-
> >>>> 2 files changed, 61 insertions(+), 12 deletions(-)
> >>>>
> >>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> >>>> index 237101d03844..134649226d43 100644
> >>>> --- a/hw/vfio/common.c
> >>>> +++ b/hw/vfio/common.c
> >>>> @@ -27,6 +27,7 @@
> >>>>
> >>>> #include "hw/vfio/vfio-common.h"
> >>>> #include "hw/vfio/vfio.h"
> >>>> +#include "hw/vfio/pci.h"
> >>>> #include "exec/address-spaces.h"
> >>>> #include "exec/memory.h"
> >>>> #include "exec/ram_addr.h"
> >>>> @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
> >>>>     hwaddr max32;
> >>>>     hwaddr min64;
> >>>>     hwaddr max64;
> >>>> +    hwaddr minpci64;
> >>>> +    hwaddr maxpci64;
> >>>> } VFIODirtyRanges;
> >>>>
> >>>> typedef struct VFIODirtyRangesListener {
> >>>> @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
> >>>>     MemoryListener listener;
> >>>> } VFIODirtyRangesListener;
> >>>>
> >>>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
> >>>> +                                     VFIOContainer *container)
> >>>> +{
> >>>> +    VFIOPCIDevice *pcidev;
> >>>> +    VFIODevice *vbasedev;
> >>>> +    VFIOGroup *group;
> >>>> +    Object *owner;
> >>>> +
> >>>> +    owner = memory_region_owner(section->mr);
> >>>> +
> >>>> +    QLIST_FOREACH(group, &container->group_list, container_next) {
> >>>> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> >>>> +            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
> >>>> +                continue;
> >>>> +            }
> >>>> +            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> >>>> +            if (OBJECT(pcidev) == owner) {
> >>>> +                return true;
> >>>> +            }
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>> +    return false;
> >>>> +}  
> >>>
> >>> What about simplify it with memory_region_is_ram_device()?
> >>> This way vdpa device could also be included.
> >>>  
> >>
> >> Note that the check is not interested in RAM (or any other kinda of memory like
> >> VGA). That's covered in the 32-64 ranges. But rather in any PCI device RAM that
> >> would fall in the 64-bit PCI hole. Would memory_region_is_ram_device() really
> >> cover it? If so, I am all for the simplification.  
> > 
> > Ram device is used not only by vfio pci bars but also host notifier of vdpa and vhost-user.
> > 
> > I have another question, previously I think vfio pci bars are device states and
> > save/restored through VFIO migration protocol, so we don't need to dirty
> > tracking them. Do I understand wrong?  
> 
> The general thinking of device dirty tracking is to track all addressable IOVAs.
> But you do raise a good question. My understanding is that migrating the bars
> *as is* might be device migration specific (not a guarantee?); the save file and
> precopy interface are the only places we transfer from/to the data and it's just
> opaque data, not bars or anything formatted specifically -- so if we migrate
> bars it is hidden in what device f/w wants to move. Might be that BARs aren't
> even needed as they are sort of scratch space from h/w side. Ultimately, the
> dirty tracker is the one reporting the values, and the device h/w chooses to not
> report those IOVAs as dirty then nothing changes.

I think ram_device is somewhat controversial; only a few select device
types make use of it, so for example an emulated NIC not making use of
ram_device might still exist within the IOVA aperture of the device.

It does however seem strange to me to include the MMIO space of other
devices in the dirty tracking of the vfio device.  I'm not sure it's
really the vfio device's place to report MMIO within another device as
dirty.  Saving grace being that this likely doesn't really occur
currently, but should the vfio device even exert any resources for
tracking such ranges?

A concern with simply trying to define RAM vs PCI ranges is that QEMU
is pretty fluid about the VM layout.  For example, what prevents us
from having a NUMA configuration where one memory node might be
similarly disjoint as between RAM and PCI memory here?  TBH, I thought
this was why we had the combine_iova_ranges() function in the kernel so
that QEMU simply wrote through the dirty tracking ranges and the driver
combined them as necessary based on their own constraints.  Thanks,

Alex
Joao Martins Sept. 11, 2023, 7:17 p.m. UTC | #11
On 11/09/2023 19:35, Alex Williamson wrote:
> On Mon, 11 Sep 2023 11:12:55 +0100
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>>>> -----Original Message-----
>>>> From: Joao Martins <joao.m.martins@oracle.com>
>>>> Sent: Monday, September 11, 2023 5:07 PM
>>>> Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>>>
>>>> On 11/09/2023 09:57, Duan, Zhenzhong wrote:  
>>>>>> -----Original Message-----
>>>>>> From: qemu-devel-bounces+zhenzhong.duan=intel.com@nongnu.org <qemu-
>>>>>> devel-bounces+zhenzhong.duan=intel.com@nongnu.org> On Behalf Of Joao
>>>>>> Martins
>>>>>> Sent: Friday, September 8, 2023 5:30 PM
>>>>>> Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>>>>>
>>>>>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>>>>>> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
>>>>>> QEMU includes in the 64-bit range the RAM regions at the lower part
>>>>>> and vfio-pci device RAM regions which are at the top of the address
>>>>>> space. This range contains a large gap and the size can be bigger than
>>>>>> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
>>>>>>
>>>>>> To avoid such large ranges, introduce a new PCI range covering the
>>>>>> vfio-pci device RAM regions, this only if the addresses are above 4GB
>>>>>> to avoid breaking potential SeaBIOS guests.
>>>>>>
>>>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>>>>> [ clg: - wrote commit log
>>>>>>       - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
>>>>>> Signed-off-by: Cédric Le Goater <clg@redhat.com>
>>>>>> ---
>>>>>> v2:
>>>>>> * s/minpci/minpci64/
>>>>>> * s/maxpci/maxpci64/
>>>>>> * Expand comment to cover the pci-hole64 and why we don't do special
>>>>>>  handling of pci-hole32.
>>>>>> ---
>>>>>> hw/vfio/common.c     | 71 +++++++++++++++++++++++++++++++++++++-------
>>>>>> hw/vfio/trace-events |  2 +-
>>>>>> 2 files changed, 61 insertions(+), 12 deletions(-)
>>>>>>
>>>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>>>> index 237101d03844..134649226d43 100644
>>>>>> --- a/hw/vfio/common.c
>>>>>> +++ b/hw/vfio/common.c
>>>>>> @@ -27,6 +27,7 @@
>>>>>>
>>>>>> #include "hw/vfio/vfio-common.h"
>>>>>> #include "hw/vfio/vfio.h"
>>>>>> +#include "hw/vfio/pci.h"
>>>>>> #include "exec/address-spaces.h"
>>>>>> #include "exec/memory.h"
>>>>>> #include "exec/ram_addr.h"
>>>>>> @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
>>>>>>     hwaddr max32;
>>>>>>     hwaddr min64;
>>>>>>     hwaddr max64;
>>>>>> +    hwaddr minpci64;
>>>>>> +    hwaddr maxpci64;
>>>>>> } VFIODirtyRanges;
>>>>>>
>>>>>> typedef struct VFIODirtyRangesListener {
>>>>>> @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
>>>>>>     MemoryListener listener;
>>>>>> } VFIODirtyRangesListener;
>>>>>>
>>>>>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>>>>>> +                                     VFIOContainer *container)
>>>>>> +{
>>>>>> +    VFIOPCIDevice *pcidev;
>>>>>> +    VFIODevice *vbasedev;
>>>>>> +    VFIOGroup *group;
>>>>>> +    Object *owner;
>>>>>> +
>>>>>> +    owner = memory_region_owner(section->mr);
>>>>>> +
>>>>>> +    QLIST_FOREACH(group, &container->group_list, container_next) {
>>>>>> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>>>>>> +            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
>>>>>> +                continue;
>>>>>> +            }
>>>>>> +            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>>>>>> +            if (OBJECT(pcidev) == owner) {
>>>>>> +                return true;
>>>>>> +            }
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    return false;
>>>>>> +}  
>>>>>
>>>>> What about simplify it with memory_region_is_ram_device()?
>>>>> This way vdpa device could also be included.
>>>>>  
>>>>
>>>> Note that the check is not interested in RAM (or any other kinda of memory like
>>>> VGA). That's covered in the 32-64 ranges. But rather in any PCI device RAM that
>>>> would fall in the 64-bit PCI hole. Would memory_region_is_ram_device() really
>>>> cover it? If so, I am all for the simplification.  
>>>
>>> Ram device is used not only by vfio pci bars but also host notifier of vdpa and vhost-user.
>>>
>>> I have another question, previously I think vfio pci bars are device states and
>>> save/restored through VFIO migration protocol, so we don't need to dirty
>>> tracking them. Do I understand wrong?  
>>
>> The general thinking of device dirty tracking is to track all addressable IOVAs.
>> But you do raise a good question. My understanding is that migrating the bars
>> *as is* might be device migration specific (not a guarantee?); the save file and
>> precopy interface are the only places we transfer from/to the data and it's just
>> opaque data, not bars or anything formatted specifically -- so if we migrate
>> bars it is hidden in what device f/w wants to move. Might be that BARs aren't
>> even needed as they are sort of scratch space from h/w side. Ultimately, the
>> dirty tracker is the one reporting the values, and the device h/w chooses to not
>> report those IOVAs as dirty then nothing changes.
> 
> I think ram_device is somewhat controversial, only a few select device
> types make use of it, so for example an emulated NIC not making use of
> ram_device might still exist within the IOVA aperture of the device.
> 
> It does however seem strange to me to include the MMIO space of other
> devices in the dirty tracking of the vfio device.  I'm not sure it's
> really the vfio device's place to report MMIO within another device as
> dirty.  Saving grace being that this likely doesn't really occur
> currently, but should the vfio device even exert any resources for
> tracking such ranges?
> 
I suppose we could restrict it to RAM and the device's own PCI ranges
(excluding others like VGA, etc.); it makes conceptual sense. And when mixing
in devices with big RAM ranges (*if any*) it may stress the dirty tracker more
than it should.

> A concern with simply trying to define RAM vs PCI ranges is that QEMU
> is pretty fluid about the VM layout.  For example, what prevents us
> from having a NUMA configuration where one memory node might be
> similarly disjoint as between RAM and PCI memory here?  TBH, I thought
> this was why we had the combine_iova_ranges() function in the kernel so
> that QEMU simply wrote through the dirty tracking ranges and the driver
> combined them as necessary based on their own constraints.

And I think it was ... as userspace can still pass a fairly arbitrary set of
ranges, so the h/w driver may need to do some filtering. The initial versions
took that approach, i.e. store everything and let the kernel select what it
wants to pass to hardware. But we ended up with this dual/three range tracking
to simplify the logic, as I recall. Perhaps how we started (via IOVATree) was
the wrong data structure on my end, and even switching to a simpler data
structure, there will always be the dynamic behaviour of needing to re-allocate
the ranges as we go.
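
To make the combining idea concrete, here is a minimal array-based sketch of
what a driver-side merge could look like (illustrative only; this is neither
the kernel's combine_iova_ranges() nor QEMU code, and the names are made up):
given more ranges than the hardware supports, repeatedly merge the two
adjacent ranges separated by the smallest gap until the count fits.

#include <stddef.h>
#include <stdint.h>

struct iova_range {
    uint64_t iova;
    uint64_t length;
};

/* 'ranges' must be sorted by iova and non-overlapping. */
static size_t combine_ranges(struct iova_range *ranges, size_t nr,
                             size_t max_nr)
{
    while (nr > max_nr && nr > 1) {
        size_t best = 0;
        uint64_t best_gap = UINT64_MAX;

        /* Find the pair of neighbouring ranges with the smallest gap. */
        for (size_t i = 0; i + 1 < nr; i++) {
            uint64_t end = ranges[i].iova + ranges[i].length;
            uint64_t gap = ranges[i + 1].iova - end;

            if (gap < best_gap) {
                best_gap = gap;
                best = i;
            }
        }

        /* Merge the pair into a single span covering both and the gap. */
        ranges[best].length = ranges[best + 1].iova + ranges[best + 1].length
                              - ranges[best].iova;
        for (size_t i = best + 1; i + 1 < nr; i++) {
            ranges[i] = ranges[i + 1];
        }
        nr--;
    }

    return nr;
}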

	Joao

Patch

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 237101d03844..134649226d43 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -27,6 +27,7 @@ 
 
 #include "hw/vfio/vfio-common.h"
 #include "hw/vfio/vfio.h"
+#include "hw/vfio/pci.h"
 #include "exec/address-spaces.h"
 #include "exec/memory.h"
 #include "exec/ram_addr.h"
@@ -1400,6 +1401,8 @@  typedef struct VFIODirtyRanges {
     hwaddr max32;
     hwaddr min64;
     hwaddr max64;
+    hwaddr minpci64;
+    hwaddr maxpci64;
 } VFIODirtyRanges;
 
 typedef struct VFIODirtyRangesListener {
@@ -1408,6 +1411,31 @@  typedef struct VFIODirtyRangesListener {
     MemoryListener listener;
 } VFIODirtyRangesListener;
 
+static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
+                                     VFIOContainer *container)
+{
+    VFIOPCIDevice *pcidev;
+    VFIODevice *vbasedev;
+    VFIOGroup *group;
+    Object *owner;
+
+    owner = memory_region_owner(section->mr);
+
+    QLIST_FOREACH(group, &container->group_list, container_next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
+                continue;
+            }
+            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+            if (OBJECT(pcidev) == owner) {
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
 static void vfio_dirty_tracking_update(MemoryListener *listener,
                                        MemoryRegionSection *section)
 {
@@ -1424,19 +1452,32 @@  static void vfio_dirty_tracking_update(MemoryListener *listener,
     }
 
     /*
-     * The address space passed to the dirty tracker is reduced to two ranges:
-     * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
+     * The address space passed to the dirty tracker is reduced to three ranges:
+     * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the
+     * PCI 64-bit hole.
+     *
      * The underlying reports of dirty will query a sub-interval of each of
      * these ranges.
      *
-     * The purpose of the dual range handling is to handle known cases of big
-     * holes in the address space, like the x86 AMD 1T hole. The alternative
-     * would be an IOVATree but that has a much bigger runtime overhead and
-     * unnecessary complexity.
+     * The purpose of the three range handling is to handle known cases of big
+     * holes in the address space, like the x86 AMD 1T hole, and firmware (like
+     * OVMF) which may relocate the pci-hole64 to the end of the address space.
+     * The latter would otherwise generate large ranges for tracking, stressing
+     * the limits of supported hardware. The pci-hole32 will always be below 4G
+     * (overlapping or not) so it doesn't need special handling and is part of
+     * the 32-bit range.
+     *
+     * The alternative would be an IOVATree but that has a much bigger runtime
+     * overhead and unnecessary complexity.
      */
-    min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
-    max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
-
+    if (vfio_section_is_vfio_pci(section, dirty->container) &&
+        iova >= UINT32_MAX) {
+        min = &range->minpci64;
+        max = &range->maxpci64;
+    } else {
+        min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
+        max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
+    }
     if (*min > iova) {
         *min = iova;
     }
@@ -1461,6 +1502,7 @@  static void vfio_dirty_tracking_init(VFIOContainer *container,
     memset(&dirty, 0, sizeof(dirty));
     dirty.ranges.min32 = UINT32_MAX;
     dirty.ranges.min64 = UINT64_MAX;
+    dirty.ranges.minpci64 = UINT64_MAX;
     dirty.listener = vfio_dirty_tracking_listener;
     dirty.container = container;
 
@@ -1531,7 +1573,8 @@  vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
      * DMA logging uAPI guarantees to support at least a number of ranges that
      * fits into a single host kernel base page.
      */
-    control->num_ranges = !!tracking->max32 + !!tracking->max64;
+    control->num_ranges = !!tracking->max32 + !!tracking->max64 +
+        !!tracking->maxpci64;
     ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
                         control->num_ranges);
     if (!ranges) {
@@ -1550,11 +1593,17 @@  vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
     if (tracking->max64) {
         ranges->iova = tracking->min64;
         ranges->length = (tracking->max64 - tracking->min64) + 1;
+        ranges++;
+    }
+    if (tracking->maxpci64) {
+        ranges->iova = tracking->minpci64;
+        ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1;
     }
 
     trace_vfio_device_dirty_tracking_start(control->num_ranges,
                                            tracking->min32, tracking->max32,
-                                           tracking->min64, tracking->max64);
+                                           tracking->min64, tracking->max64,
+                                           tracking->minpci64, tracking->maxpci64);
 
     return feature;
 }
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index ce61b10827b6..cc7c21365c92 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -104,7 +104,7 @@  vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
 vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
 vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
 vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
-vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"]"
+vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"], pci64:[0x%"PRIx64" - 0x%"PRIx64"]"
 vfio_disconnect_container(int fd) "close container->fd=%d"
 vfio_put_group(int fd) "close group->fd=%d"
 vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"