[v2,2/4] intel_iommu: Fix a potential issue in VFIO dirty page sync

Message ID 20230601063320.139308-3-zhenzhong.duan@intel.com (mailing list archive)
State New, archived
Series Optimize UNMAP call and bug fix

Commit Message

Duan, Zhenzhong June 1, 2023, 6:33 a.m. UTC
Peter Xu found a potential issue:

"The other thing is when I am looking at the new code I found that we
actually extended the replay() to be used also in dirty tracking of vfio,
in vfio_sync_dirty_bitmap().  For that maybe it's already broken if
unmap_all() because afaiu log_sync() can be called in migration thread
anytime during DMA so I think it means the device is prone to DMA with the
IOMMU pgtable quickly erased and rebuilt here, which means the DMA could
fail unexpectedly.  Copy Alex, Kirti and Neo."

To eliminate this small window with an empty mapping, we should remove
the call to unmap_all(). Besides that, introduce a new notifier type,
IOMMU_NOTIFIER_FULL_MAP, to retrieve the full set of mappings:
intel_iommu only notifies changed mappings, while VFIO dirty page sync
needs all of them. Thanks to the current implementation of the iova
tree, we can pick mappings from the iova tree directly instead of
walking through the guest IOMMU page table.

IOMMU_NOTIFIER_MAP is still used to get changed mappings as an
optimization. As long as the IOMMU_NOTIFIER_MAP notifications keep the
shadow page table in sync, that is sufficient.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 hw/i386/intel_iommu.c | 49 +++++++++++++++++++++++++++++++++++--------
 hw/vfio/common.c      |  2 +-
 include/exec/memory.h | 13 ++++++++++++
 softmmu/memory.c      |  4 ++++
 4 files changed, 58 insertions(+), 10 deletions(-)
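
For context (not part of this patch), the VFIO dirty page sync path drives
the vIOMMU replay through memory_region_iommu_replay(). Below is a minimal
sketch of that call site, paraphrased from vfio_sync_dirty_bitmap() in
hw/vfio/common.c with error handling and the surrounding loop elided;
treat the exact shape as illustrative rather than authoritative:

    /*
     * Sketch: how the dirty-sync notifier reaches vtd_iommu_replay().
     * With this patch the notifier requests IOMMU_NOTIFIER_FULL_MAP, so
     * the replay can be served from the cached iova tree instead of a
     * fresh walk of the guest IOMMU page table.
     */
    vfio_giommu_dirty_notifier gdn = { .giommu = giommu };

    iommu_notifier_init(&gdn.n, vfio_iommu_map_dirty_notify,
                        IOMMU_NOTIFIER_FULL_MAP,  /* was IOMMU_NOTIFIER_MAP */
                        section->offset_within_region,
                        int128_get64(llend), idx);
    /* Invokes IOMMUMemoryRegionClass::replay, i.e. vtd_iommu_replay() */
    memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);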

Comments

Peter Xu June 5, 2023, 6:39 p.m. UTC | #1
On Thu, Jun 01, 2023 at 02:33:18PM +0800, Zhenzhong Duan wrote:
> ...
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 94d52f4205d2..061fcded0dfb 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -3819,6 +3819,41 @@ static int vtd_replay_hook(IOMMUTLBEvent *event, void *private)
>      return 0;
>  }
>  
> +static gboolean vtd_replay_full_map(DMAMap *map, gpointer *private)
> +{
> +    IOMMUTLBEvent event;
> +
> +    event.type = IOMMU_NOTIFIER_MAP;
> +    event.entry.iova = map->iova;
> +    event.entry.addr_mask = map->size;
> +    event.entry.target_as = &address_space_memory;
> +    event.entry.perm = map->perm;
> +    event.entry.translated_addr = map->translated_addr;
> +
> +    return vtd_replay_hook(&event, private);
> +}
> +
> +/*
> + * This is a fast path to notify the full mappings falling in the scope
> + * of IOMMU notifier. The call site should ensure no iova tree update by
> + * taking necessary locks(e.x. BQL).

We should be accurate on the locking - I think it's the BQL so far.

> + */
> +static int vtd_page_walk_full_map_fast_path(IOVATree *iova_tree,
> +                                            IOMMUNotifier *n)
> +{
> +    DMAMap map;
> +
> +    map.iova = n->start;
> +    map.size = n->end - n->start;
> +    if (!iova_tree_find(iova_tree, &map)) {
> +        return 0;
> +    }
> +
> +    iova_tree_foreach_range_data(iova_tree, &map, vtd_replay_full_map,
> +                                 (gpointer *)n);
> +    return 0;
> +}
> +
>  static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
>  {
>      VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu);
> @@ -3826,13 +3861,6 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
>      uint8_t bus_n = pci_bus_num(vtd_as->bus);
>      VTDContextEntry ce;
>  
> -    /*
> -     * The replay can be triggered by either a invalidation or a newly
> -     * created entry. No matter what, we release existing mappings
> -     * (it means flushing caches for UNMAP-only registers).
> -     */
> -    vtd_address_space_unmap(vtd_as, n);
> -
>      if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
>          trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" :
>                                    "legacy mode",
> @@ -3850,8 +3878,11 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
>                  .as = vtd_as,
>                  .domain_id = vtd_get_domain_id(s, &ce, vtd_as->pasid),
>              };
> -
> -            vtd_page_walk(s, &ce, 0, ~0ULL, &info, vtd_as->pasid);
> +            if (n->notifier_flags & IOMMU_NOTIFIER_FULL_MAP) {
> +                vtd_page_walk_full_map_fast_path(vtd_as->iova_tree, n);
> +            } else {
> +                vtd_page_walk(s, &ce, 0, ~0ULL, &info, vtd_as->pasid);
> +            }
>          }
>      } else {
>          trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 78358ede2764..5dae4502b908 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -1890,7 +1890,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
>  
>                  iommu_notifier_init(&gdn.n,
>                                      vfio_iommu_map_dirty_notify,
> -                                    IOMMU_NOTIFIER_MAP,
> +                                    IOMMU_NOTIFIER_FULL_MAP,
>                                      section->offset_within_region,
>                                      int128_get64(llend),
>                                      idx);
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index c3661b2276c7..eecc3eec6702 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -142,6 +142,10 @@ struct IOMMUTLBEntry {
>   *       events (e.g. VFIO). Both notifications must be accurate so that
>   *       the shadow page table is fully in sync with the guest view.
>   *
> + *       Besides MAP, there is a special use case called FULL_MAP which
> + *       requests notification for all the existent mappings (e.g. VFIO
> + *       dirty page sync).

Why do we need FULL_MAP?  Can we simply reimpl MAP?

> + *
>   *   (2) When the device doesn't need accurate synchronizations of the
>   *       vIOMMU page tables, it needs to register only with UNMAP or
>   *       DEVIOTLB_UNMAP notifies.
> @@ -164,6 +168,8 @@ typedef enum {
>      IOMMU_NOTIFIER_MAP = 0x2,
>      /* Notify changes on device IOTLB entries */
>      IOMMU_NOTIFIER_DEVIOTLB_UNMAP = 0x04,
> +    /* Notify every existent entries */
> +    IOMMU_NOTIFIER_FULL_MAP = 0x8,
>  } IOMMUNotifierFlag;
>  
>  #define IOMMU_NOTIFIER_IOTLB_EVENTS (IOMMU_NOTIFIER_MAP | IOMMU_NOTIFIER_UNMAP)
> @@ -237,6 +243,13 @@ static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
>                                         hwaddr start, hwaddr end,
>                                         int iommu_idx)
>  {
> +    /*
> +     * memory_region_notify_iommu_one() needs IOMMU_NOTIFIER_MAP set to
> +     * trigger notifier.
> +     */
> +    if (flags & IOMMU_NOTIFIER_FULL_MAP) {
> +        flags |= IOMMU_NOTIFIER_MAP;
> +    }
>      n->notify = fn;
>      n->notifier_flags = flags;
>      n->start = start;
> diff --git a/softmmu/memory.c b/softmmu/memory.c
> index 7d9494ce7028..0a8465007c66 100644
> --- a/softmmu/memory.c
> +++ b/softmmu/memory.c
> @@ -1922,6 +1922,10 @@ int memory_region_register_iommu_notifier(MemoryRegion *mr,
>      assert(n->iommu_idx >= 0 &&
>             n->iommu_idx < memory_region_iommu_num_indexes(iommu_mr));
>  
> +    if (n->notifier_flags & IOMMU_NOTIFIER_FULL_MAP) {
> +        error_setg(errp, "FULL_MAP could only be used in replay");
> +    }
> +
>      QLIST_INSERT_HEAD(&iommu_mr->iommu_notify, n, node);
>      ret = memory_region_update_iommu_notify_flags(iommu_mr, errp);
>      if (ret) {
> -- 
> 2.34.1
>
Duan, Zhenzhong June 6, 2023, 2:35 a.m. UTC | #2
>-----Original Message-----
>From: Peter Xu <peterx@redhat.com>
>Sent: Tuesday, June 6, 2023 2:39 AM
>Subject: Re: [PATCH v2 2/4] intel_iommu: Fix a potential issue in VFIO dirty
>page sync
>
>On Thu, Jun 01, 2023 at 02:33:18PM +0800, Zhenzhong Duan wrote:
...
>> +/*
>> + * This is a fast path to notify the full mappings falling in the scope
>> + * of IOMMU notifier. The call site should ensure no iova tree update by
>> + * taking necessary locks(e.x. BQL).
>
>We should be accurate on the locking - I think it's the BQL so far.

Will update comments.

...
>> @@ -142,6 +142,10 @@ struct IOMMUTLBEntry {
>>   *       events (e.g. VFIO). Both notifications must be accurate so that
>>   *       the shadow page table is fully in sync with the guest view.
>>   *
>> + *       Besides MAP, there is a special use case called FULL_MAP which
>> + *       requests notification for all the existent mappings (e.g. VFIO
>> + *       dirty page sync).
>
>Why do we need FULL_MAP?  Can we simply reimpl MAP?

Sorry, I just realized IOMMU_NOTIFIER_FULL_MAP is confusing.
Maybe IOMMU_NOTIFIER_MAP_FAST_PATH would be a bit more accurate.

IIUC, replay() is currently called from two paths. One is the VFIO device
address space switch, which walks the IOMMU page table to set up the
initial mappings and cache them in the IOVA tree. The other is VFIO dirty
sync, which walks the IOMMU page table to notify the mappings; because we
already cache the mappings in the IOVA tree and VFIO dirty sync is
protected by the BQL, I think it's fine to pick mappings from the IOVA
tree directly instead of walking the IOMMU page table. That's the reason
for FULL_MAP (IOMMU_NOTIFIER_MAP_FAST_PATH would be better).

About "reimpl MAP", do you mean to walk the IOMMU page table and notify
all existing MAP events without checking the IOVA tree for differences?
If you prefer, I'll rewrite the implementation this way.

Thanks
Zhenzhong
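
For readers following along, these are the two replay() call chains being
contrasted above, abbreviated from the QEMU tree around this series (the
intermediate helpers are elided, so double-check against the source):

    /* Path 1: VFIO device address space switch */
    vfio_listener_region_add()
      -> memory_region_iommu_replay()
        -> vtd_iommu_replay()   /* walks the pgtable, seeds the iova tree */

    /* Path 2: VFIO dirty page sync, the fast path this patch adds */
    vfio_sync_dirty_bitmap()
      -> memory_region_iommu_replay()
        -> vtd_iommu_replay()   /* with FULL_MAP: reads the iova tree */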
Peter Xu June 6, 2023, 3:41 p.m. UTC | #3
On Tue, Jun 06, 2023 at 02:35:41AM +0000, Duan, Zhenzhong wrote:
...
> >> @@ -142,6 +142,10 @@ struct IOMMUTLBEntry {
> >>   *       events (e.g. VFIO). Both notifications must be accurate so that
> >>   *       the shadow page table is fully in sync with the guest view.
> >>   *
> >> + *       Besides MAP, there is a special use case called FULL_MAP which
> >> + *       requests notification for all the existent mappings (e.g. VFIO
> >> + *       dirty page sync).
> >
> >Why do we need FULL_MAP?  Can we simply reimpl MAP?
> 
> Sorry, I just realized IOMMU_NOTIFIER_FULL_MAP is confusing.
> Maybe IOMMU_NOTIFIER_MAP_FAST_PATH would be a bit more accurate.
> 
> IIUC, replay() is currently called from two paths. One is the VFIO device
> address space switch, which walks the IOMMU page table to set up the
> initial mappings and cache them in the IOVA tree. The other is VFIO dirty
> sync, which walks the IOMMU page table to notify the mappings; because we
> already cache the mappings in the IOVA tree and VFIO dirty sync is
> protected by the BQL, I think it's fine to pick mappings from the IOVA
> tree directly instead of walking the IOMMU page table. That's the reason
> for FULL_MAP (IOMMU_NOTIFIER_MAP_FAST_PATH would be better).
> 
> About "reimpl MAP", do you mean to walk the IOMMU page table and notify
> all existing MAP events without checking the IOVA tree for differences?
> If you prefer, I'll rewrite the implementation this way.

We still need to maintain the iova tree. IIUC that's the major complexity
of vt-d emulation, because we have that extra cache layer to keep in sync
with the real guest iommu pgtables.

But I think we were just wrong to also notify in the unmap_all() procedure.

IIUC the right thing to do (keeping the replay() interface as-is, per how
it used to be defined) is to replace the unmap_all() with a step that only
evacuates the iova tree (keeping all host mappings untouched, IOW, without
notifying UNMAP), and do a full resync there, which will notify all
existing mappings as MAP.  Then we don't disturb any existing mapping if
there is one (e.g. for the dirty sync case), meanwhile we also keep in
sync with the latest state (for moving a vfio device into an existing
iommu group).

Do you think that'll work for us?
Duan, Zhenzhong June 7, 2023, 3:14 a.m. UTC | #4
>-----Original Message-----
>From: Peter Xu <peterx@redhat.com>
>Sent: Tuesday, June 6, 2023 11:42 PM
>Subject: Re: [PATCH v2 2/4] intel_iommu: Fix a potential issue in VFIO dirty
>page sync
>
...
>> >
>> >Why do we need FULL_MAP?  Can we simply reimpl MAP?
>>
>> Sorry, I just realized IOMMU_NOTIFIER_FULL_MAP is confusing.
>> Maybe IOMMU_NOTIFIER_MAP_FAST_PATH could be a bit more accurate.
>>
>> IIUC, currently replay() is called from two paths, one is VFIO device
>> address space switch which walks over the IOMMU page table to setup
>> initial mapping and cache it in IOVA tree. The other is VFIO dirty
>> sync which walks over the IOMMU page table to notify the mapping,
>> because we already cache the mapping in IOVA tree and VFIO dirty sync
>> is protected by BQL, so I think it's fine to pick mapping from IOVA
>> tree directly instead of walking over IOMMU page table. That's the
>> reason of FULL_MAP (IOMMU_NOTIFIER_MAP_FAST_PATH better).
>>
>> About "reimpl MAP", do you mean to walk over IOMMU page table to
>> notify all existing MAP events without checking with the IOVA tree for
>> difference? If you prefer, I'll rewrite an implementation this way.
>
>We still need to maintain the iova tree. IIUC that's the major complexity of
>vt-d emulation, because we have that extra cache layer to keep in sync with
>the real guest iommu pgtables.

Can't agree more; it looks like only intel-iommu and virtio-iommu implement
such an optimization for now.

>
>But I think we were just wrong to also notify in the unmap_all() procedure.
>
>IIUC the right thing to do (keeping the replay() interface as-is, per how it
>used to be defined) is to replace the unmap_all() with a step that only
>evacuates the iova tree (keeping all host mappings untouched, IOW, without
>notifying UNMAP), and do a full resync there, which will notify all existing
>mappings as MAP.  Then we don't disturb any existing mapping if there is one
>(e.g. for the dirty sync case), meanwhile we also keep in sync with the
>latest state (for moving a vfio device into an existing iommu group).
>
>Do you think that'll work for us?

Yes, I think I get your point.
The simple change below should work in the way you suggested, do you agree?

@@ -3825,13 +3833,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
     IntelIOMMUState *s = vtd_as->iommu_state;
     uint8_t bus_n = pci_bus_num(vtd_as->bus);
     VTDContextEntry ce;
+    DMAMap map = { .iova = 0, .size = HWADDR_MAX };

-    /*
-     * The replay can be triggered by either a invalidation or a newly
-     * created entry. No matter what, we release existing mappings
-     * (it means flushing caches for UNMAP-only registers).
-     */
-    vtd_address_space_unmap(vtd_as, n);
+    /* replay is protected by BQL, page walk will re-setup IOVA tree safely */
+    iova_tree_remove(vtd_as->iova_tree, map);

     if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
         trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" :

Thanks
Zhenzhong
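
As a side note on why the evacuate-then-walk approach above notifies every
live mapping exactly once without tearing anything down on the host: the
page walk diffs each guest entry against the cached iova tree and only
notifies differences. A simplified paraphrase of that logic from
vtd_page_walk_one() in hw/i386/intel_iommu.c (the changed-mapping and
UNMAP branches are elided here):

    const DMAMap *mapped = iova_tree_find(as->iova_tree, &target);

    if (event->type == IOMMU_NOTIFIER_MAP) {
        if (mapped && !memcmp(mapped, &target, sizeof(target))) {
            return 0;                             /* unchanged: skip */
        }
        iova_tree_insert(as->iova_tree, &target); /* cache miss: re-add */
    }
    return hook_fn(event, private);               /* notify MAP listener */

With the tree evacuated beforehand, every live guest mapping misses the
cache, so it is re-inserted and notified as MAP, and no UNMAP is ever sent.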
Peter Xu June 7, 2023, 2:06 p.m. UTC | #5
On Wed, Jun 07, 2023 at 03:14:07AM +0000, Duan, Zhenzhong wrote:
> 
> ...
> 
> Yes, I think I get your point.
> The simple change below should work in the way you suggested, do you agree?
> 
> @@ -3825,13 +3833,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
>      IntelIOMMUState *s = vtd_as->iommu_state;
>      uint8_t bus_n = pci_bus_num(vtd_as->bus);
>      VTDContextEntry ce;
> +    DMAMap map = { .iova = 0, .size = HWADDR_MAX };
> 
> -    /*
> -     * The replay can be triggered by either a invalidation or a newly
> -     * created entry. No matter what, we release existing mappings
> -     * (it means flushing caches for UNMAP-only registers).
> -     */
> -    vtd_address_space_unmap(vtd_as, n);
> +    /* replay is protected by BQL, page walk will re-setup IOVA tree safely */
> +    iova_tree_remove(vtd_as->iova_tree, map);
> 
>      if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
>          trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" :

Yes, thanks!

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 94d52f4205d2..061fcded0dfb 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3819,6 +3819,41 @@  static int vtd_replay_hook(IOMMUTLBEvent *event, void *private)
     return 0;
 }
 
+static gboolean vtd_replay_full_map(DMAMap *map, gpointer *private)
+{
+    IOMMUTLBEvent event;
+
+    event.type = IOMMU_NOTIFIER_MAP;
+    event.entry.iova = map->iova;
+    event.entry.addr_mask = map->size;
+    event.entry.target_as = &address_space_memory;
+    event.entry.perm = map->perm;
+    event.entry.translated_addr = map->translated_addr;
+
+    return vtd_replay_hook(&event, private);
+}
+
+/*
+ * This is a fast path to notify the full mappings falling in the scope
+ * of IOMMU notifier. The call site should ensure no iova tree update by
+ * taking necessary locks(e.x. BQL).
+ */
+static int vtd_page_walk_full_map_fast_path(IOVATree *iova_tree,
+                                            IOMMUNotifier *n)
+{
+    DMAMap map;
+
+    map.iova = n->start;
+    map.size = n->end - n->start;
+    if (!iova_tree_find(iova_tree, &map)) {
+        return 0;
+    }
+
+    iova_tree_foreach_range_data(iova_tree, &map, vtd_replay_full_map,
+                                 (gpointer *)n);
+    return 0;
+}
+
 static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
 {
     VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu);
@@ -3826,13 +3861,6 @@  static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
     uint8_t bus_n = pci_bus_num(vtd_as->bus);
     VTDContextEntry ce;
 
-    /*
-     * The replay can be triggered by either a invalidation or a newly
-     * created entry. No matter what, we release existing mappings
-     * (it means flushing caches for UNMAP-only registers).
-     */
-    vtd_address_space_unmap(vtd_as, n);
-
     if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
         trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" :
                                   "legacy mode",
@@ -3850,8 +3878,11 @@  static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
                 .as = vtd_as,
                 .domain_id = vtd_get_domain_id(s, &ce, vtd_as->pasid),
             };
-
-            vtd_page_walk(s, &ce, 0, ~0ULL, &info, vtd_as->pasid);
+            if (n->notifier_flags & IOMMU_NOTIFIER_FULL_MAP) {
+                vtd_page_walk_full_map_fast_path(vtd_as->iova_tree, n);
+            } else {
+                vtd_page_walk(s, &ce, 0, ~0ULL, &info, vtd_as->pasid);
+            }
         }
     } else {
         trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 78358ede2764..5dae4502b908 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1890,7 +1890,7 @@  static int vfio_sync_dirty_bitmap(VFIOContainer *container,
 
                 iommu_notifier_init(&gdn.n,
                                     vfio_iommu_map_dirty_notify,
-                                    IOMMU_NOTIFIER_MAP,
+                                    IOMMU_NOTIFIER_FULL_MAP,
                                     section->offset_within_region,
                                     int128_get64(llend),
                                     idx);
diff --git a/include/exec/memory.h b/include/exec/memory.h
index c3661b2276c7..eecc3eec6702 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -142,6 +142,10 @@  struct IOMMUTLBEntry {
  *       events (e.g. VFIO). Both notifications must be accurate so that
  *       the shadow page table is fully in sync with the guest view.
  *
+ *       Besides MAP, there is a special use case called FULL_MAP which
+ *       requests notification for all the existent mappings (e.g. VFIO
+ *       dirty page sync).
+ *
  *   (2) When the device doesn't need accurate synchronizations of the
  *       vIOMMU page tables, it needs to register only with UNMAP or
  *       DEVIOTLB_UNMAP notifies.
@@ -164,6 +168,8 @@  typedef enum {
     IOMMU_NOTIFIER_MAP = 0x2,
     /* Notify changes on device IOTLB entries */
     IOMMU_NOTIFIER_DEVIOTLB_UNMAP = 0x04,
+    /* Notify every existent entries */
+    IOMMU_NOTIFIER_FULL_MAP = 0x8,
 } IOMMUNotifierFlag;
 
 #define IOMMU_NOTIFIER_IOTLB_EVENTS (IOMMU_NOTIFIER_MAP | IOMMU_NOTIFIER_UNMAP)
@@ -237,6 +243,13 @@  static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
                                        hwaddr start, hwaddr end,
                                        int iommu_idx)
 {
+    /*
+     * memory_region_notify_iommu_one() needs IOMMU_NOTIFIER_MAP set to
+     * trigger notifier.
+     */
+    if (flags & IOMMU_NOTIFIER_FULL_MAP) {
+        flags |= IOMMU_NOTIFIER_MAP;
+    }
     n->notify = fn;
     n->notifier_flags = flags;
     n->start = start;
diff --git a/softmmu/memory.c b/softmmu/memory.c
index 7d9494ce7028..0a8465007c66 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -1922,6 +1922,10 @@  int memory_region_register_iommu_notifier(MemoryRegion *mr,
     assert(n->iommu_idx >= 0 &&
            n->iommu_idx < memory_region_iommu_num_indexes(iommu_mr));
 
+    if (n->notifier_flags & IOMMU_NOTIFIER_FULL_MAP) {
+        error_setg(errp, "FULL_MAP could only be used in replay");
+    }
+
     QLIST_INSERT_HEAD(&iommu_mr->iommu_notify, n, node);
     ret = memory_region_update_iommu_notify_flags(iommu_mr, errp);
     if (ret) {