Message ID | 20230908071438.86136-1-clg@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [RFC] vfio/common: Separate vfio-pci ranges | expand |
On 08/09/2023 08:14, Cédric Le Goater wrote: > From: Joao Martins <joao.m.martins@oracle.com> > > QEMU computes the DMA logging ranges for two predefined ranges: 32-bit > and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled, > QEMU includes in the 64-bit range the RAM regions at the lower part > and vfio-pci device RAM regions which are at the top of the address > space. This range contains a large gap and the size can be bigger than > the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit). > > To avoid such large ranges, introduce a new PCI range covering the > vfio-pci device RAM regions, this only if the addresses are above 4GB > to avoid breaking potential SeaBIOS guests. > > Signed-off-by: Joao Martins <joao.m.martins@oracle.com> > [ clg: - wrote commit log > - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ] > Signed-off-by: Cédric Le Goater <clg@redhat.com> > --- > hw/vfio/common.c | 51 +++++++++++++++++++++++++++++++++++++++----- > hw/vfio/trace-events | 2 +- > 2 files changed, 47 insertions(+), 6 deletions(-) > > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > index 237101d03844273f653d98b6d053a1ae9c05a247..a5548e3bebf999e6d9cef08bdaf1fbc3b437e5eb 100644 > --- a/hw/vfio/common.c > +++ b/hw/vfio/common.c > @@ -27,6 +27,7 @@ > > #include "hw/vfio/vfio-common.h" > #include "hw/vfio/vfio.h" > +#include "hw/vfio/pci.h" > #include "exec/address-spaces.h" > #include "exec/memory.h" > #include "exec/ram_addr.h" > @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges { > hwaddr max32; > hwaddr min64; > hwaddr max64; > + hwaddr minpci; > + hwaddr maxpci; Considering this is about pci64 hole relocation, I wondered post-reading your feedback, that maybe we should rename {min,max}pci to {min,max}pci64 (...) > } VFIODirtyRanges; > > typedef struct VFIODirtyRangesListener { > @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener { > MemoryListener listener; > } VFIODirtyRangesListener; > > +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, > + VFIOContainer *container) > +{ > + VFIOPCIDevice *pcidev; > + VFIODevice *vbasedev; > + VFIOGroup *group; > + Object *owner; > + > + owner = memory_region_owner(section->mr); > + > + QLIST_FOREACH(group, &container->group_list, container_next) { > + QLIST_FOREACH(vbasedev, &group->device_list, next) { > + if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { > + continue; > + } > + pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > + if (OBJECT(pcidev) == owner) { > + return true; > + } > + } > + } > + > + return false; > +} > + > static void vfio_dirty_tracking_update(MemoryListener *listener, > MemoryRegionSection *section) > { > @@ -1434,9 +1462,14 @@ static void vfio_dirty_tracking_update(MemoryListener *listener, > * would be an IOVATree but that has a much bigger runtime overhead and > * unnecessary complexity. > */ > - min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; > - max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; > - > + if (vfio_section_is_vfio_pci(section, dirty->container) && > + iova >= UINT32_MAX) { > + min = &range->minpci; > + max = &range->maxpci; (...) specially considering this check of making sure we skip the pci-hole32 (as that one is fixed) > + } else { > + min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; > + max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; > + } > if (*min > iova) { > *min = iova; > } > @@ -1461,6 +1494,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, > memset(&dirty, 0, sizeof(dirty)); > dirty.ranges.min32 = UINT32_MAX; > dirty.ranges.min64 = UINT64_MAX; > + dirty.ranges.minpci = UINT64_MAX; > dirty.listener = vfio_dirty_tracking_listener; > dirty.container = container; > > @@ -1531,7 +1565,8 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container, > * DMA logging uAPI guarantees to support at least a number of ranges that > * fits into a single host kernel base page. > */ > - control->num_ranges = !!tracking->max32 + !!tracking->max64; > + control->num_ranges = !!tracking->max32 + !!tracking->max64 + > + !!tracking->maxpci; > ranges = g_try_new0(struct vfio_device_feature_dma_logging_range, > control->num_ranges); > if (!ranges) { > @@ -1550,11 +1585,17 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container, > if (tracking->max64) { > ranges->iova = tracking->min64; > ranges->length = (tracking->max64 - tracking->min64) + 1; > + ranges++; > + } > + if (tracking->maxpci) { > + ranges->iova = tracking->minpci; > + ranges->length = (tracking->maxpci - tracking->minpci) + 1; > } > > trace_vfio_device_dirty_tracking_start(control->num_ranges, > tracking->min32, tracking->max32, > - tracking->min64, tracking->max64); > + tracking->min64, tracking->max64, > + tracking->minpci, tracking->maxpci); > > return feature; > } > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events > index ce61b10827b6a1203a5fe1a87a76d96f25c11345..ab52c6bb7f0c11e51fefef231c108d0c9381547e 100644 > --- a/hw/vfio/trace-events > +++ b/hw/vfio/trace-events > @@ -104,7 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi > vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA" > vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 > vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]" > -vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"]" > +vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"], pci:[0x%"PRIx64" - 0x%"PRIx64"]" > vfio_disconnect_container(int fd) "close container->fd=%d" > vfio_put_group(int fd) "close group->fd=%d" > vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
On 9/8/23 10:16, Joao Martins wrote: > > > On 08/09/2023 08:14, Cédric Le Goater wrote: >> From: Joao Martins <joao.m.martins@oracle.com> >> >> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit >> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled, >> QEMU includes in the 64-bit range the RAM regions at the lower part >> and vfio-pci device RAM regions which are at the top of the address >> space. This range contains a large gap and the size can be bigger than >> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit). >> >> To avoid such large ranges, introduce a new PCI range covering the >> vfio-pci device RAM regions, this only if the addresses are above 4GB >> to avoid breaking potential SeaBIOS guests. >> >> Signed-off-by: Joao Martins <joao.m.martins@oracle.com> >> [ clg: - wrote commit log >> - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ] >> Signed-off-by: Cédric Le Goater <clg@redhat.com> >> --- >> hw/vfio/common.c | 51 +++++++++++++++++++++++++++++++++++++++----- >> hw/vfio/trace-events | 2 +- >> 2 files changed, 47 insertions(+), 6 deletions(-) >> >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >> index 237101d03844273f653d98b6d053a1ae9c05a247..a5548e3bebf999e6d9cef08bdaf1fbc3b437e5eb 100644 >> --- a/hw/vfio/common.c >> +++ b/hw/vfio/common.c >> @@ -27,6 +27,7 @@ >> >> #include "hw/vfio/vfio-common.h" >> #include "hw/vfio/vfio.h" >> +#include "hw/vfio/pci.h" >> #include "exec/address-spaces.h" >> #include "exec/memory.h" >> #include "exec/ram_addr.h" >> @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges { >> hwaddr max32; >> hwaddr min64; >> hwaddr max64; >> + hwaddr minpci; >> + hwaddr maxpci; > > Considering this is about pci64 hole relocation, I wondered post-reading your > feedback, that maybe we should rename {min,max}pci to {min,max}pci64 (...) yes. > >> } VFIODirtyRanges; >> >> typedef struct VFIODirtyRangesListener { >> @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener { >> MemoryListener listener; >> } VFIODirtyRangesListener; >> >> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, >> + VFIOContainer *container) >> +{ >> + VFIOPCIDevice *pcidev; >> + VFIODevice *vbasedev; >> + VFIOGroup *group; >> + Object *owner; >> + >> + owner = memory_region_owner(section->mr); >> + >> + QLIST_FOREACH(group, &container->group_list, container_next) { >> + QLIST_FOREACH(vbasedev, &group->device_list, next) { >> + if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { >> + continue; >> + } >> + pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); >> + if (OBJECT(pcidev) == owner) { >> + return true; >> + } >> + } >> + } >> + >> + return false; >> +} >> + >> static void vfio_dirty_tracking_update(MemoryListener *listener, >> MemoryRegionSection *section) >> { >> @@ -1434,9 +1462,14 @@ static void vfio_dirty_tracking_update(MemoryListener *listener, >> * would be an IOVATree but that has a much bigger runtime overhead and >> * unnecessary complexity. >> */ >> - min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; >> - max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; >> - >> + if (vfio_section_is_vfio_pci(section, dirty->container) && >> + iova >= UINT32_MAX) { >> + min = &range->minpci; >> + max = &range->maxpci; > > (...) specially considering this check of making sure we skip the pci-hole32 (as > that one is fixed) yep. That check above might deserve a comment also. Could you resend please ? Thanks, C. > >> + } else { >> + min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; >> + max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; >> + } >> if (*min > iova) { >> *min = iova; >> } >> @@ -1461,6 +1494,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, >> memset(&dirty, 0, sizeof(dirty)); >> dirty.ranges.min32 = UINT32_MAX; >> dirty.ranges.min64 = UINT64_MAX; >> + dirty.ranges.minpci = UINT64_MAX; >> dirty.listener = vfio_dirty_tracking_listener; >> dirty.container = container; >> >> @@ -1531,7 +1565,8 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container, >> * DMA logging uAPI guarantees to support at least a number of ranges that >> * fits into a single host kernel base page. >> */ >> - control->num_ranges = !!tracking->max32 + !!tracking->max64; >> + control->num_ranges = !!tracking->max32 + !!tracking->max64 + >> + !!tracking->maxpci; >> ranges = g_try_new0(struct vfio_device_feature_dma_logging_range, >> control->num_ranges); >> if (!ranges) { >> @@ -1550,11 +1585,17 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container, >> if (tracking->max64) { >> ranges->iova = tracking->min64; >> ranges->length = (tracking->max64 - tracking->min64) + 1; >> + ranges++; >> + } >> + if (tracking->maxpci) { >> + ranges->iova = tracking->minpci; >> + ranges->length = (tracking->maxpci - tracking->minpci) + 1; >> } >> >> trace_vfio_device_dirty_tracking_start(control->num_ranges, >> tracking->min32, tracking->max32, >> - tracking->min64, tracking->max64); >> + tracking->min64, tracking->max64, >> + tracking->minpci, tracking->maxpci); >> >> return feature; >> } >> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events >> index ce61b10827b6a1203a5fe1a87a76d96f25c11345..ab52c6bb7f0c11e51fefef231c108d0c9381547e 100644 >> --- a/hw/vfio/trace-events >> +++ b/hw/vfio/trace-events >> @@ -104,7 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi >> vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA" >> vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 >> vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]" >> -vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"]" >> +vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"], pci:[0x%"PRIx64" - 0x%"PRIx64"]" >> vfio_disconnect_container(int fd) "close container->fd=%d" >> vfio_put_group(int fd) "close group->fd=%d" >> vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u" >
On 08/09/2023 09:28, Cédric Le Goater wrote: > On 9/8/23 10:16, Joao Martins wrote: >> On 08/09/2023 08:14, Cédric Le Goater wrote: >>> From: Joao Martins <joao.m.martins@oracle.com> >>> >>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit >>> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled, >>> QEMU includes in the 64-bit range the RAM regions at the lower part >>> and vfio-pci device RAM regions which are at the top of the address >>> space. This range contains a large gap and the size can be bigger than >>> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit). >>> >>> To avoid such large ranges, introduce a new PCI range covering the >>> vfio-pci device RAM regions, this only if the addresses are above 4GB >>> to avoid breaking potential SeaBIOS guests. >>> >>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com> >>> [ clg: - wrote commit log >>> - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ] >>> Signed-off-by: Cédric Le Goater <clg@redhat.com> >>> --- >>> hw/vfio/common.c | 51 +++++++++++++++++++++++++++++++++++++++----- >>> hw/vfio/trace-events | 2 +- >>> 2 files changed, 47 insertions(+), 6 deletions(-) >>> >>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >>> index >>> 237101d03844273f653d98b6d053a1ae9c05a247..a5548e3bebf999e6d9cef08bdaf1fbc3b437e5eb 100644 >>> --- a/hw/vfio/common.c >>> +++ b/hw/vfio/common.c >>> @@ -27,6 +27,7 @@ >>> #include "hw/vfio/vfio-common.h" >>> #include "hw/vfio/vfio.h" >>> +#include "hw/vfio/pci.h" >>> #include "exec/address-spaces.h" >>> #include "exec/memory.h" >>> #include "exec/ram_addr.h" >>> @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges { >>> hwaddr max32; >>> hwaddr min64; >>> hwaddr max64; >>> + hwaddr minpci; >>> + hwaddr maxpci; >> >> Considering this is about pci64 hole relocation, I wondered post-reading your >> feedback, that maybe we should rename {min,max}pci to {min,max}pci64 (...) > > yes. > >> >>> } VFIODirtyRanges; >>> typedef struct VFIODirtyRangesListener { >>> @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener { >>> MemoryListener listener; >>> } VFIODirtyRangesListener; >>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, >>> + VFIOContainer *container) >>> +{ >>> + VFIOPCIDevice *pcidev; >>> + VFIODevice *vbasedev; >>> + VFIOGroup *group; >>> + Object *owner; >>> + >>> + owner = memory_region_owner(section->mr); >>> + >>> + QLIST_FOREACH(group, &container->group_list, container_next) { >>> + QLIST_FOREACH(vbasedev, &group->device_list, next) { >>> + if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { >>> + continue; >>> + } >>> + pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); >>> + if (OBJECT(pcidev) == owner) { >>> + return true; >>> + } >>> + } >>> + } >>> + >>> + return false; >>> +} >>> + >>> static void vfio_dirty_tracking_update(MemoryListener *listener, >>> MemoryRegionSection *section) >>> { >>> @@ -1434,9 +1462,14 @@ static void vfio_dirty_tracking_update(MemoryListener >>> *listener, >>> * would be an IOVATree but that has a much bigger runtime overhead and >>> * unnecessary complexity. >>> */ >>> - min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; >>> - max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; >>> - >>> + if (vfio_section_is_vfio_pci(section, dirty->container) && >>> + iova >= UINT32_MAX) { >>> + min = &range->minpci; >>> + max = &range->maxpci; >> >> (...) specially considering this check of making sure we skip the pci-hole32 (as >> that one is fixed) > > yep. That check above might deserve a comment also. > > Could you resend please ? > yes. This is on top of your vfio-8.2 branch right? > Thanks, > > C. > > >> >>> + } else { >>> + min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; >>> + max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; >>> + } >>> if (*min > iova) { >>> *min = iova; >>> } >>> @@ -1461,6 +1494,7 @@ static void vfio_dirty_tracking_init(VFIOContainer >>> *container, >>> memset(&dirty, 0, sizeof(dirty)); >>> dirty.ranges.min32 = UINT32_MAX; >>> dirty.ranges.min64 = UINT64_MAX; >>> + dirty.ranges.minpci = UINT64_MAX; >>> dirty.listener = vfio_dirty_tracking_listener; >>> dirty.container = container; >>> @@ -1531,7 +1565,8 @@ >>> vfio_device_feature_dma_logging_start_create(VFIOContainer *container, >>> * DMA logging uAPI guarantees to support at least a number of ranges that >>> * fits into a single host kernel base page. >>> */ >>> - control->num_ranges = !!tracking->max32 + !!tracking->max64; >>> + control->num_ranges = !!tracking->max32 + !!tracking->max64 + >>> + !!tracking->maxpci; >>> ranges = g_try_new0(struct vfio_device_feature_dma_logging_range, >>> control->num_ranges); >>> if (!ranges) { >>> @@ -1550,11 +1585,17 @@ >>> vfio_device_feature_dma_logging_start_create(VFIOContainer *container, >>> if (tracking->max64) { >>> ranges->iova = tracking->min64; >>> ranges->length = (tracking->max64 - tracking->min64) + 1; >>> + ranges++; >>> + } >>> + if (tracking->maxpci) { >>> + ranges->iova = tracking->minpci; >>> + ranges->length = (tracking->maxpci - tracking->minpci) + 1; >>> } >>> trace_vfio_device_dirty_tracking_start(control->num_ranges, >>> tracking->min32, tracking->max32, >>> - tracking->min64, tracking->max64); >>> + tracking->min64, tracking->max64, >>> + tracking->minpci, tracking->maxpci); >>> return feature; >>> } >>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events >>> index >>> ce61b10827b6a1203a5fe1a87a76d96f25c11345..ab52c6bb7f0c11e51fefef231c108d0c9381547e 100644 >>> --- a/hw/vfio/trace-events >>> +++ b/hw/vfio/trace-events >>> @@ -104,7 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t >>> iova, uint64_t offset_wi >>> vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, >>> uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" >>> size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA" >>> vfio_listener_region_del(uint64_t start, uint64_t end) "region_del >>> 0x%"PRIx64" - 0x%"PRIx64 >>> vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t >>> min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" >>> - 0x%"PRIx64"]" >>> -vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t >>> max32, uint64_t min64, uint64_t max64) "nr_ranges %d 32:[0x%"PRIx64" - >>> 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"]" >>> +vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t >>> max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci) >>> "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"], >>> pci:[0x%"PRIx64" - 0x%"PRIx64"]" >>> vfio_disconnect_container(int fd) "close container->fd=%d" >>> vfio_put_group(int fd) "close group->fd=%d" >>> vfio_get_device(const char * name, unsigned int flags, unsigned int >>> num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u" >> >
On 9/8/23 10:35, Joao Martins wrote: > On 08/09/2023 09:28, Cédric Le Goater wrote: >> On 9/8/23 10:16, Joao Martins wrote: >>> On 08/09/2023 08:14, Cédric Le Goater wrote: >>>> From: Joao Martins <joao.m.martins@oracle.com> >>>> >>>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit >>>> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled, >>>> QEMU includes in the 64-bit range the RAM regions at the lower part >>>> and vfio-pci device RAM regions which are at the top of the address >>>> space. This range contains a large gap and the size can be bigger than >>>> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit). >>>> >>>> To avoid such large ranges, introduce a new PCI range covering the >>>> vfio-pci device RAM regions, this only if the addresses are above 4GB >>>> to avoid breaking potential SeaBIOS guests. >>>> >>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com> >>>> [ clg: - wrote commit log >>>> - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ] >>>> Signed-off-by: Cédric Le Goater <clg@redhat.com> >>>> --- >>>> hw/vfio/common.c | 51 +++++++++++++++++++++++++++++++++++++++----- >>>> hw/vfio/trace-events | 2 +- >>>> 2 files changed, 47 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >>>> index >>>> 237101d03844273f653d98b6d053a1ae9c05a247..a5548e3bebf999e6d9cef08bdaf1fbc3b437e5eb 100644 >>>> --- a/hw/vfio/common.c >>>> +++ b/hw/vfio/common.c >>>> @@ -27,6 +27,7 @@ >>>> #include "hw/vfio/vfio-common.h" >>>> #include "hw/vfio/vfio.h" >>>> +#include "hw/vfio/pci.h" >>>> #include "exec/address-spaces.h" >>>> #include "exec/memory.h" >>>> #include "exec/ram_addr.h" >>>> @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges { >>>> hwaddr max32; >>>> hwaddr min64; >>>> hwaddr max64; >>>> + hwaddr minpci; >>>> + hwaddr maxpci; >>> >>> Considering this is about pci64 hole relocation, I wondered post-reading your >>> feedback, that maybe we should rename {min,max}pci to {min,max}pci64 (...) >> >> yes. >> >>> >>>> } VFIODirtyRanges; >>>> typedef struct VFIODirtyRangesListener { >>>> @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener { >>>> MemoryListener listener; >>>> } VFIODirtyRangesListener; >>>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, >>>> + VFIOContainer *container) >>>> +{ >>>> + VFIOPCIDevice *pcidev; >>>> + VFIODevice *vbasedev; >>>> + VFIOGroup *group; >>>> + Object *owner; >>>> + >>>> + owner = memory_region_owner(section->mr); >>>> + >>>> + QLIST_FOREACH(group, &container->group_list, container_next) { >>>> + QLIST_FOREACH(vbasedev, &group->device_list, next) { >>>> + if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { >>>> + continue; >>>> + } >>>> + pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); >>>> + if (OBJECT(pcidev) == owner) { >>>> + return true; >>>> + } >>>> + } >>>> + } >>>> + >>>> + return false; >>>> +} >>>> + >>>> static void vfio_dirty_tracking_update(MemoryListener *listener, >>>> MemoryRegionSection *section) >>>> { >>>> @@ -1434,9 +1462,14 @@ static void vfio_dirty_tracking_update(MemoryListener >>>> *listener, >>>> * would be an IOVATree but that has a much bigger runtime overhead and >>>> * unnecessary complexity. >>>> */ >>>> - min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; >>>> - max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; >>>> - >>>> + if (vfio_section_is_vfio_pci(section, dirty->container) && >>>> + iova >= UINT32_MAX) { >>>> + min = &range->minpci; >>>> + max = &range->maxpci; >>> >>> (...) specially considering this check of making sure we skip the pci-hole32 (as >>> that one is fixed) >> >> yep. That check above might deserve a comment also. >> >> Could you resend please ? >> > > yes. This is on top of your vfio-8.2 branch right? yes. Just reset the HEAD, I will merge the new version. C.
diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 237101d03844273f653d98b6d053a1ae9c05a247..a5548e3bebf999e6d9cef08bdaf1fbc3b437e5eb 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -27,6 +27,7 @@ #include "hw/vfio/vfio-common.h" #include "hw/vfio/vfio.h" +#include "hw/vfio/pci.h" #include "exec/address-spaces.h" #include "exec/memory.h" #include "exec/ram_addr.h" @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges { hwaddr max32; hwaddr min64; hwaddr max64; + hwaddr minpci; + hwaddr maxpci; } VFIODirtyRanges; typedef struct VFIODirtyRangesListener { @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener { MemoryListener listener; } VFIODirtyRangesListener; +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, + VFIOContainer *container) +{ + VFIOPCIDevice *pcidev; + VFIODevice *vbasedev; + VFIOGroup *group; + Object *owner; + + owner = memory_region_owner(section->mr); + + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } + pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + if (OBJECT(pcidev) == owner) { + return true; + } + } + } + + return false; +} + static void vfio_dirty_tracking_update(MemoryListener *listener, MemoryRegionSection *section) { @@ -1434,9 +1462,14 @@ static void vfio_dirty_tracking_update(MemoryListener *listener, * would be an IOVATree but that has a much bigger runtime overhead and * unnecessary complexity. */ - min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; - max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; - + if (vfio_section_is_vfio_pci(section, dirty->container) && + iova >= UINT32_MAX) { + min = &range->minpci; + max = &range->maxpci; + } else { + min = (end <= UINT32_MAX) ? &range->min32 : &range->min64; + max = (end <= UINT32_MAX) ? &range->max32 : &range->max64; + } if (*min > iova) { *min = iova; } @@ -1461,6 +1494,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, memset(&dirty, 0, sizeof(dirty)); dirty.ranges.min32 = UINT32_MAX; dirty.ranges.min64 = UINT64_MAX; + dirty.ranges.minpci = UINT64_MAX; dirty.listener = vfio_dirty_tracking_listener; dirty.container = container; @@ -1531,7 +1565,8 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container, * DMA logging uAPI guarantees to support at least a number of ranges that * fits into a single host kernel base page. */ - control->num_ranges = !!tracking->max32 + !!tracking->max64; + control->num_ranges = !!tracking->max32 + !!tracking->max64 + + !!tracking->maxpci; ranges = g_try_new0(struct vfio_device_feature_dma_logging_range, control->num_ranges); if (!ranges) { @@ -1550,11 +1585,17 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container, if (tracking->max64) { ranges->iova = tracking->min64; ranges->length = (tracking->max64 - tracking->min64) + 1; + ranges++; + } + if (tracking->maxpci) { + ranges->iova = tracking->minpci; + ranges->length = (tracking->maxpci - tracking->minpci) + 1; } trace_vfio_device_dirty_tracking_start(control->num_ranges, tracking->min32, tracking->max32, - tracking->min64, tracking->max64); + tracking->min64, tracking->max64, + tracking->minpci, tracking->maxpci); return feature; } diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index ce61b10827b6a1203a5fe1a87a76d96f25c11345..ab52c6bb7f0c11e51fefef231c108d0c9381547e 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -104,7 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA" vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]" -vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"]" +vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"], pci:[0x%"PRIx64" - 0x%"PRIx64"]" vfio_disconnect_container(int fd) "close container->fd=%d" vfio_put_group(int fd) "close group->fd=%d" vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"