diff mbox series

[RFC,v9,15/29] vfio: Set up nested stage mappings

Message ID 20210411120912.15770-16-eric.auger@redhat.com (mailing list archive)
State New, archived
Headers show
Series vSMMUv3/pSMMUv3 2 stage VFIO integration | expand

Commit Message

Eric Auger April 11, 2021, 12:08 p.m. UTC
In nested mode, legacy vfio_iommu_map_notify cannot be used as
there is no "caching" mode and we do not trap on map.

On Intel, vfio_iommu_map_notify was used to DMA map the RAM
through the host single stage.

With nested mode, we need to setup the stage 2 and the stage 1
separately. This patch introduces a prereg_listener to setup
the stage 2 mapping.

The stage 1 mapping, owned by the guest, is passed to the host
when the guest invalidates the stage 1 configuration, through
a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
are cascaded down to the host through another IOMMU MR UNMAP
notifier.

Signed-off-by: Eric Auger <eric.auger@redhat.com>

---

v7 -> v8:
- properly handle new IOMMUTLBEntry fields and especially
  propagate DOMAIN and PASID based invalidations

v6 -> v7:
- remove PASID based invalidation

v5 -> v6:
- add error_report_err()
- remove the abort in case of nested stage case

v4 -> v5:
- use VFIO_IOMMU_SET_PASID_TABLE
- use PCIPASIDOps for config notification

v3 -> v4:
- use iommu_inv_pasid_info for ASID invalidation

v2 -> v3:
- use VFIO_IOMMU_ATTACH_PASID_TABLE
- new user API
- handle leaf

v1 -> v2:
- adapt to uapi changes
- pass the asid
- pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier
---
 hw/vfio/common.c     | 139 +++++++++++++++++++++++++++++++++++++++++--
 hw/vfio/pci.c        |  21 +++++++
 hw/vfio/trace-events |   2 +
 3 files changed, 157 insertions(+), 5 deletions(-)

Comments

Kunkun Jiang April 13, 2021, 12:10 p.m. UTC | #1
Hi Eric,

On 2021/4/11 20:08, Eric Auger wrote:
> In nested mode, legacy vfio_iommu_map_notify cannot be used as
> there is no "caching" mode and we do not trap on map.
>
> On Intel, vfio_iommu_map_notify was used to DMA map the RAM
> through the host single stage.
>
> With nested mode, we need to setup the stage 2 and the stage 1
> separately. This patch introduces a prereg_listener to setup
> the stage 2 mapping.
>
> The stage 1 mapping, owned by the guest, is passed to the host
> when the guest invalidates the stage 1 configuration, through
> a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
> are cascaded downto the host through another IOMMU MR UNMAP
> notifier.
>
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>
> ---
>
> v7 -> v8:
> - properly handle new IOMMUTLBEntry fields and especially
>    propagate DOMAIN and PASID based invalidations
>
> v6 -> v7:
> - remove PASID based invalidation
>
> v5 -> v6:
> - add error_report_err()
> - remove the abort in case of nested stage case
>
> v4 -> v5:
> - use VFIO_IOMMU_SET_PASID_TABLE
> - use PCIPASIDOps for config notification
>
> v3 -> v4:
> - use iommu_inv_pasid_info for ASID invalidation
>
> v2 -> v3:
> - use VFIO_IOMMU_ATTACH_PASID_TABLE
> - new user API
> - handle leaf
>
> v1 -> v2:
> - adapt to uapi changes
> - pass the asid
> - pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier
> ---
>   hw/vfio/common.c     | 139 +++++++++++++++++++++++++++++++++++++++++--
>   hw/vfio/pci.c        |  21 +++++++
>   hw/vfio/trace-events |   2 +
>   3 files changed, 157 insertions(+), 5 deletions(-)
>
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 0cd7ef2139..e369d451e7 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -595,6 +595,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
>       return true;
>   }
>   
> +/* Propagate a guest IOTLB invalidation to the host (nested mode) */
> +static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> +{
> +    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
> +    struct vfio_iommu_type1_cache_invalidate ustruct = {};
> +    VFIOContainer *container = giommu->container;
> +    int ret;
> +
> +    assert(iotlb->perm == IOMMU_NONE);
> +
> +    ustruct.argsz = sizeof(ustruct);
> +    ustruct.flags = 0;
> +    ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info);
> +    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
> +    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
> +
> +    switch (iotlb->granularity) {
> +    case IOMMU_INV_GRAN_DOMAIN:
> +        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
> +        break;
> +    case IOMMU_INV_GRAN_PASID:
> +    {
> +        struct iommu_inv_pasid_info *pasid_info;
> +        int archid = -1;
> +
> +        pasid_info = &ustruct.info.granu.pasid_info;
> +        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
> +            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
> +            archid = iotlb->arch_id;
> +        }
> +        pasid_info->archid = archid;
> +        trace_vfio_iommu_asid_inv_iotlb(archid);
> +        break;
> +    }
> +    case IOMMU_INV_GRAN_ADDR:
> +    {
> +        hwaddr start = iotlb->iova + giommu->iommu_offset;
> +        struct iommu_inv_addr_info *addr_info;
> +        size_t size = iotlb->addr_mask + 1;
> +        int archid = -1;
> +
> +        addr_info = &ustruct.info.granu.addr_info;
> +        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
> +        if (iotlb->leaf) {
> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
> +        }
> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
> +            archid = iotlb->arch_id;
> +        }
> +        addr_info->archid = archid;
> +        addr_info->addr = start;
> +        addr_info->granule_size = size;
> +        addr_info->nb_granules = 1;
> +        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
> +                                        1, iotlb->leaf);
> +        break;
> +    }
Should we pass a size to the host kernel here, even if the vSMMU doesn't
support RIL or the guest kernel doesn't use RIL?

It will cause a TLBI issue in this scenario: the guest kernel issues a
TLBI cmd without "range" (tg = 0) to invalidate a 2M huge page. Then QEMU
passes the iova and size (4K) to the host kernel. Finally, the host
kernel issues a TLBI cmd
with "range" (4K), which cannot invalidate the TLB entry of the 2M huge page.
(pSMMU supports RIL)

Thanks,
Kunkun Jiang
> +    }
> +
> +    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, &ustruct);
> +    if (ret) {
> +        error_report("%p: failed to invalidate CACHE (%d)", container, ret);
> +    }
> +}
> +
>   static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>   {
>       VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
> @@ -776,6 +843,35 @@ static void vfio_dma_unmap_ram_section(VFIOContainer *container,
>       }
>   }
>   
> +static void vfio_prereg_listener_region_add(MemoryListener *listener,
> +                                            MemoryRegionSection *section)
> +{
> +    VFIOContainer *container =
> +        container_of(listener, VFIOContainer, prereg_listener);
> +    Error *err = NULL;
> +
> +    if (!memory_region_is_ram(section->mr)) {
> +        return;
> +    }
> +
> +    vfio_dma_map_ram_section(container, section, &err);
> +    if (err) {
> +        error_report_err(err);
> +    }
> +}
> +static void vfio_prereg_listener_region_del(MemoryListener *listener,
> +                                     MemoryRegionSection *section)
> +{
> +    VFIOContainer *container =
> +        container_of(listener, VFIOContainer, prereg_listener);
> +
> +    if (!memory_region_is_ram(section->mr)) {
> +        return;
> +    }
> +
> +    vfio_dma_unmap_ram_section(container, section);
> +}
> +
>   static void vfio_listener_region_add(MemoryListener *listener,
>                                        MemoryRegionSection *section)
>   {
> @@ -879,9 +975,10 @@ static void vfio_listener_region_add(MemoryListener *listener,
>       memory_region_ref(section->mr);
>   
>       if (memory_region_is_iommu(section->mr)) {
> +        IOMMUNotify notify;
>           VFIOGuestIOMMU *giommu;
>           IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
> -        int iommu_idx;
> +        int iommu_idx, flags;
>   
>           trace_vfio_listener_region_add_iommu(iova, end);
>           /*
> @@ -900,8 +997,18 @@ static void vfio_listener_region_add(MemoryListener *listener,
>           llend = int128_sub(llend, int128_one());
>           iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
>                                                          MEMTXATTRS_UNSPECIFIED);
> -        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
> -                            IOMMU_NOTIFIER_IOTLB_EVENTS,
> +
> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
> +            /* IOTLB unmap notifier to propagate guest IOTLB invalidations */
> +            flags = IOMMU_NOTIFIER_UNMAP;
> +            notify = vfio_iommu_unmap_notify;
> +        } else {
> +            /* MAP/UNMAP IOTLB notifier */
> +            flags = IOMMU_NOTIFIER_IOTLB_EVENTS;
> +            notify = vfio_iommu_map_notify;
> +        }
> +
> +        iommu_notifier_init(&giommu->n, notify, flags,
>                               section->offset_within_region,
>                               int128_get64(llend),
>                               iommu_idx);
> @@ -921,7 +1028,9 @@ static void vfio_listener_region_add(MemoryListener *listener,
>               goto fail;
>           }
>           QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
> -        memory_region_iommu_replay(giommu->iommu, &giommu->n);
> +        if (flags & IOMMU_NOTIFIER_MAP) {
> +            memory_region_iommu_replay(giommu->iommu, &giommu->n);
> +        }
>   
>           return;
>       }
> @@ -1205,10 +1314,16 @@ static const MemoryListener vfio_memory_listener = {
>       .log_sync = vfio_listener_log_sync,
>   };
>   
> +static MemoryListener vfio_memory_prereg_listener = {
> +    .region_add = vfio_prereg_listener_region_add,
> +    .region_del = vfio_prereg_listener_region_del,
> +};
> +
>   static void vfio_listener_release(VFIOContainer *container)
>   {
>       memory_listener_unregister(&container->listener);
> -    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
> +        container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>           memory_listener_unregister(&container->prereg_listener);
>       }
>   }
> @@ -1858,6 +1973,20 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
>               vfio_get_iommu_info_migration(container, info);
>           }
>           g_free(info);
> +
> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
> +            container->prereg_listener = vfio_memory_prereg_listener;
> +            memory_listener_register(&container->prereg_listener,
> +                                     &address_space_memory);
> +            if (container->error) {
> +                memory_listener_unregister(&container->prereg_listener);
> +                ret = -1;
> +                error_propagate_prepend(errp, container->error,
> +                                    "RAM memory listener initialization failed "
> +                                    "for container");
> +                goto free_container_exit;
> +            }
> +        }
>           break;
>       }
>       case VFIO_SPAPR_TCE_v2_IOMMU:
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 5c65aa0a98..cad7deec71 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2773,6 +2773,25 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>       vdev->req_enabled = false;
>   }
>   
> +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn,
> +                                      IOMMUConfig *config)
> +{
> +    PCIDevice *pdev = bus->devices[devfn];
> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> +    VFIOContainer *container = vdev->vbasedev.group->container;
> +    struct vfio_iommu_type1_set_pasid_table info;
> +
> +    info.argsz = sizeof(info);
> +    info.flags = VFIO_PASID_TABLE_FLAG_SET;
> +    memcpy(&info.config, &config->pasid_cfg, sizeof(config->pasid_cfg));
> +
> +    return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info);
> +}
> +
> +static PCIPASIDOps vfio_pci_pasid_ops = {
> +    .set_pasid_table = vfio_iommu_set_pasid_table,
> +};
> +
>   static void vfio_realize(PCIDevice *pdev, Error **errp)
>   {
>       VFIOPCIDevice *vdev = VFIO_PCI(pdev);
> @@ -3084,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>       vfio_register_req_notifier(vdev);
>       vfio_setup_resetfn_quirk(vdev);
>   
> +    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
> +
>       return;
>   
>   out_deregister:
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 936d29d150..43696afc15 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -120,6 +120,8 @@ vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Devic
>   vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
>   vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8"
>   vfio_dma_unmap_overflow_workaround(void) ""
> +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d"
> +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
>   
>   # platform.c
>   vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
Eric Auger April 13, 2021, 12:57 p.m. UTC | #2
Hi Kunkun,

On 4/13/21 2:10 PM, Kunkun Jiang wrote:
> Hi Eric,
> 
> On 2021/4/11 20:08, Eric Auger wrote:
>> In nested mode, legacy vfio_iommu_map_notify cannot be used as
>> there is no "caching" mode and we do not trap on map.
>>
>> On Intel, vfio_iommu_map_notify was used to DMA map the RAM
>> through the host single stage.
>>
>> With nested mode, we need to setup the stage 2 and the stage 1
>> separately. This patch introduces a prereg_listener to setup
>> the stage 2 mapping.
>>
>> The stage 1 mapping, owned by the guest, is passed to the host
>> when the guest invalidates the stage 1 configuration, through
>> a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
>> are cascaded downto the host through another IOMMU MR UNMAP
>> notifier.
>>
>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>
>> ---
>>
>> v7 -> v8:
>> - properly handle new IOMMUTLBEntry fields and especially
>>    propagate DOMAIN and PASID based invalidations
>>
>> v6 -> v7:
>> - remove PASID based invalidation
>>
>> v5 -> v6:
>> - add error_report_err()
>> - remove the abort in case of nested stage case
>>
>> v4 -> v5:
>> - use VFIO_IOMMU_SET_PASID_TABLE
>> - use PCIPASIDOps for config notification
>>
>> v3 -> v4:
>> - use iommu_inv_pasid_info for ASID invalidation
>>
>> v2 -> v3:
>> - use VFIO_IOMMU_ATTACH_PASID_TABLE
>> - new user API
>> - handle leaf
>>
>> v1 -> v2:
>> - adapt to uapi changes
>> - pass the asid
>> - pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier
>> ---
>>   hw/vfio/common.c     | 139 +++++++++++++++++++++++++++++++++++++++++--
>>   hw/vfio/pci.c        |  21 +++++++
>>   hw/vfio/trace-events |   2 +
>>   3 files changed, 157 insertions(+), 5 deletions(-)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 0cd7ef2139..e369d451e7 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -595,6 +595,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry
>> *iotlb, void **vaddr,
>>       return true;
>>   }
>>   +/* Propagate a guest IOTLB invalidation to the host (nested mode) */
>> +static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry
>> *iotlb)
>> +{
>> +    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>> +    struct vfio_iommu_type1_cache_invalidate ustruct = {};
>> +    VFIOContainer *container = giommu->container;
>> +    int ret;
>> +
>> +    assert(iotlb->perm == IOMMU_NONE);
>> +
>> +    ustruct.argsz = sizeof(ustruct);
>> +    ustruct.flags = 0;
>> +    ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info);
>> +    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
>> +    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
>> +
>> +    switch (iotlb->granularity) {
>> +    case IOMMU_INV_GRAN_DOMAIN:
>> +        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
>> +        break;
>> +    case IOMMU_INV_GRAN_PASID:
>> +    {
>> +        struct iommu_inv_pasid_info *pasid_info;
>> +        int archid = -1;
>> +
>> +        pasid_info = &ustruct.info.granu.pasid_info;
>> +        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>> +            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>> +            archid = iotlb->arch_id;
>> +        }
>> +        pasid_info->archid = archid;
>> +        trace_vfio_iommu_asid_inv_iotlb(archid);
>> +        break;
>> +    }
>> +    case IOMMU_INV_GRAN_ADDR:
>> +    {
>> +        hwaddr start = iotlb->iova + giommu->iommu_offset;
>> +        struct iommu_inv_addr_info *addr_info;
>> +        size_t size = iotlb->addr_mask + 1;
>> +        int archid = -1;
>> +
>> +        addr_info = &ustruct.info.granu.addr_info;
>> +        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
>> +        if (iotlb->leaf) {
>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
>> +        }
>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>> +            archid = iotlb->arch_id;
>> +        }
>> +        addr_info->archid = archid;
>> +        addr_info->addr = start;
>> +        addr_info->granule_size = size;
>> +        addr_info->nb_granules = 1;
>> +        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
>> +                                        1, iotlb->leaf);
>> +        break;
>> +    }
> Should we pass a size to  host kernel here, even if vSMMU doesn't support
> RIL or guest kernel doesn't use RIL?
> 
> It will cause TLBI issue in  this scenario: Guest kernel issues a TLBI cmd
> without "range" (tg = 0) to invalidate a 2M huge page. Then qemu passed
> the iova and size (4K) to host kernel. Finally, host kernel issues a
> TLBI cmd
> with "range" (4K) which can not invalidate the TLB entry of 2M huge page.
> (pSMMU supports RIL)

In that case the guest will loop over all the 4K pages belonging to the 2M
huge page and invalidate each of them. This should turn into QEMU
notifications for each 4kB page, no? This is totally inefficient, hence
the support of RIL on the guest side and in the QEMU device.

What do I miss?

Thanks

Eric
> 
> Thanks,
> Kunkun Jiang
>> +    }
>> +
>> +    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, &ustruct);
>> +    if (ret) {
>> +        error_report("%p: failed to invalidate CACHE (%d)",
>> container, ret);
>> +    }
>> +}
>> +
>>   static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry
>> *iotlb)
>>   {
>>       VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>> @@ -776,6 +843,35 @@ static void
>> vfio_dma_unmap_ram_section(VFIOContainer *container,
>>       }
>>   }
>>   +static void vfio_prereg_listener_region_add(MemoryListener *listener,
>> +                                            MemoryRegionSection
>> *section)
>> +{
>> +    VFIOContainer *container =
>> +        container_of(listener, VFIOContainer, prereg_listener);
>> +    Error *err = NULL;
>> +
>> +    if (!memory_region_is_ram(section->mr)) {
>> +        return;
>> +    }
>> +
>> +    vfio_dma_map_ram_section(container, section, &err);
>> +    if (err) {
>> +        error_report_err(err);
>> +    }
>> +}
>> +static void vfio_prereg_listener_region_del(MemoryListener *listener,
>> +                                     MemoryRegionSection *section)
>> +{
>> +    VFIOContainer *container =
>> +        container_of(listener, VFIOContainer, prereg_listener);
>> +
>> +    if (!memory_region_is_ram(section->mr)) {
>> +        return;
>> +    }
>> +
>> +    vfio_dma_unmap_ram_section(container, section);
>> +}
>> +
>>   static void vfio_listener_region_add(MemoryListener *listener,
>>                                        MemoryRegionSection *section)
>>   {
>> @@ -879,9 +975,10 @@ static void
>> vfio_listener_region_add(MemoryListener *listener,
>>       memory_region_ref(section->mr);
>>         if (memory_region_is_iommu(section->mr)) {
>> +        IOMMUNotify notify;
>>           VFIOGuestIOMMU *giommu;
>>           IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
>> -        int iommu_idx;
>> +        int iommu_idx, flags;
>>             trace_vfio_listener_region_add_iommu(iova, end);
>>           /*
>> @@ -900,8 +997,18 @@ static void
>> vfio_listener_region_add(MemoryListener *listener,
>>           llend = int128_sub(llend, int128_one());
>>           iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
>>                                                         
>> MEMTXATTRS_UNSPECIFIED);
>> -        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
>> -                            IOMMU_NOTIFIER_IOTLB_EVENTS,
>> +
>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>> +            /* IOTLB unmap notifier to propagate guest IOTLB
>> invalidations */
>> +            flags = IOMMU_NOTIFIER_UNMAP;
>> +            notify = vfio_iommu_unmap_notify;
>> +        } else {
>> +            /* MAP/UNMAP IOTLB notifier */
>> +            flags = IOMMU_NOTIFIER_IOTLB_EVENTS;
>> +            notify = vfio_iommu_map_notify;
>> +        }
>> +
>> +        iommu_notifier_init(&giommu->n, notify, flags,
>>                               section->offset_within_region,
>>                               int128_get64(llend),
>>                               iommu_idx);
>> @@ -921,7 +1028,9 @@ static void
>> vfio_listener_region_add(MemoryListener *listener,
>>               goto fail;
>>           }
>>           QLIST_INSERT_HEAD(&container->giommu_list, giommu,
>> giommu_next);
>> -        memory_region_iommu_replay(giommu->iommu, &giommu->n);
>> +        if (flags & IOMMU_NOTIFIER_MAP) {
>> +            memory_region_iommu_replay(giommu->iommu, &giommu->n);
>> +        }
>>             return;
>>       }
>> @@ -1205,10 +1314,16 @@ static const MemoryListener
>> vfio_memory_listener = {
>>       .log_sync = vfio_listener_log_sync,
>>   };
>>   +static MemoryListener vfio_memory_prereg_listener = {
>> +    .region_add = vfio_prereg_listener_region_add,
>> +    .region_del = vfio_prereg_listener_region_del,
>> +};
>> +
>>   static void vfio_listener_release(VFIOContainer *container)
>>   {
>>       memory_listener_unregister(&container->listener);
>> -    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
>> +        container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>           memory_listener_unregister(&container->prereg_listener);
>>       }
>>   }
>> @@ -1858,6 +1973,20 @@ static int vfio_connect_container(VFIOGroup
>> *group, AddressSpace *as,
>>               vfio_get_iommu_info_migration(container, info);
>>           }
>>           g_free(info);
>> +
>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>> +            container->prereg_listener = vfio_memory_prereg_listener;
>> +            memory_listener_register(&container->prereg_listener,
>> +                                     &address_space_memory);
>> +            if (container->error) {
>> +                memory_listener_unregister(&container->prereg_listener);
>> +                ret = -1;
>> +                error_propagate_prepend(errp, container->error,
>> +                                    "RAM memory listener
>> initialization failed "
>> +                                    "for container");
>> +                goto free_container_exit;
>> +            }
>> +        }
>>           break;
>>       }
>>       case VFIO_SPAPR_TCE_v2_IOMMU:
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 5c65aa0a98..cad7deec71 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -2773,6 +2773,25 @@ static void
>> vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>       vdev->req_enabled = false;
>>   }
>>   +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn,
>> +                                      IOMMUConfig *config)
>> +{
>> +    PCIDevice *pdev = bus->devices[devfn];
>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>> +    VFIOContainer *container = vdev->vbasedev.group->container;
>> +    struct vfio_iommu_type1_set_pasid_table info;
>> +
>> +    info.argsz = sizeof(info);
>> +    info.flags = VFIO_PASID_TABLE_FLAG_SET;
>> +    memcpy(&info.config, &config->pasid_cfg, sizeof(config->pasid_cfg));
>> +
>> +    return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info);
>> +}
>> +
>> +static PCIPASIDOps vfio_pci_pasid_ops = {
>> +    .set_pasid_table = vfio_iommu_set_pasid_table,
>> +};
>> +
>>   static void vfio_realize(PCIDevice *pdev, Error **errp)
>>   {
>>       VFIOPCIDevice *vdev = VFIO_PCI(pdev);
>> @@ -3084,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error
>> **errp)
>>       vfio_register_req_notifier(vdev);
>>       vfio_setup_resetfn_quirk(vdev);
>>   +    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
>> +
>>       return;
>>     out_deregister:
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index 936d29d150..43696afc15 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -120,6 +120,8 @@ vfio_region_sparse_mmap_header(const char *name,
>> int index, int nr_areas) "Devic
>>   vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned
>> long end) "sparse entry %d [0x%lx - 0x%lx]"
>>   vfio_get_dev_region(const char *name, int index, uint32_t type,
>> uint32_t subtype) "%s index %d, %08x/%0x8"
>>   vfio_dma_unmap_overflow_workaround(void) ""
>> +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size,
>> uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d,
>> addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64"
>> leaf=%d"
>> +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
>>     # platform.c
>>   vfio_platform_base_device_init(char *name, int groupid) "%s belongs
>> to group #%d"
> 
> 
>
Kunkun Jiang April 14, 2021, 1:45 a.m. UTC | #3
On 2021/4/13 20:57, Auger Eric wrote:
> Hi Kunkun,
>
> On 4/13/21 2:10 PM, Kunkun Jiang wrote:
>> Hi Eric,
>>
>> On 2021/4/11 20:08, Eric Auger wrote:
>>> In nested mode, legacy vfio_iommu_map_notify cannot be used as
>>> there is no "caching" mode and we do not trap on map.
>>>
>>> On Intel, vfio_iommu_map_notify was used to DMA map the RAM
>>> through the host single stage.
>>>
>>> With nested mode, we need to setup the stage 2 and the stage 1
>>> separately. This patch introduces a prereg_listener to setup
>>> the stage 2 mapping.
>>>
>>> The stage 1 mapping, owned by the guest, is passed to the host
>>> when the guest invalidates the stage 1 configuration, through
>>> a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
>>> are cascaded downto the host through another IOMMU MR UNMAP
>>> notifier.
>>>
>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>
>>> ---
>>>
>>> v7 -> v8:
>>> - properly handle new IOMMUTLBEntry fields and especially
>>>     propagate DOMAIN and PASID based invalidations
>>>
>>> v6 -> v7:
>>> - remove PASID based invalidation
>>>
>>> v5 -> v6:
>>> - add error_report_err()
>>> - remove the abort in case of nested stage case
>>>
>>> v4 -> v5:
>>> - use VFIO_IOMMU_SET_PASID_TABLE
>>> - use PCIPASIDOps for config notification
>>>
>>> v3 -> v4:
>>> - use iommu_inv_pasid_info for ASID invalidation
>>>
>>> v2 -> v3:
>>> - use VFIO_IOMMU_ATTACH_PASID_TABLE
>>> - new user API
>>> - handle leaf
>>>
>>> v1 -> v2:
>>> - adapt to uapi changes
>>> - pass the asid
>>> - pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier
>>> ---
>>>    hw/vfio/common.c     | 139 +++++++++++++++++++++++++++++++++++++++++--
>>>    hw/vfio/pci.c        |  21 +++++++
>>>    hw/vfio/trace-events |   2 +
>>>    3 files changed, 157 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>> index 0cd7ef2139..e369d451e7 100644
>>> --- a/hw/vfio/common.c
>>> +++ b/hw/vfio/common.c
>>> @@ -595,6 +595,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry
>>> *iotlb, void **vaddr,
>>>        return true;
>>>    }
>>>    +/* Propagate a guest IOTLB invalidation to the host (nested mode) */
>>> +static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>> *iotlb)
>>> +{
>>> +    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>> +    struct vfio_iommu_type1_cache_invalidate ustruct = {};
>>> +    VFIOContainer *container = giommu->container;
>>> +    int ret;
>>> +
>>> +    assert(iotlb->perm == IOMMU_NONE);
>>> +
>>> +    ustruct.argsz = sizeof(ustruct);
>>> +    ustruct.flags = 0;
>>> +    ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info);
>>> +    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
>>> +    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
>>> +
>>> +    switch (iotlb->granularity) {
>>> +    case IOMMU_INV_GRAN_DOMAIN:
>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
>>> +        break;
>>> +    case IOMMU_INV_GRAN_PASID:
>>> +    {
>>> +        struct iommu_inv_pasid_info *pasid_info;
>>> +        int archid = -1;
>>> +
>>> +        pasid_info = &ustruct.info.granu.pasid_info;
>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>> +            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>> +            archid = iotlb->arch_id;
>>> +        }
>>> +        pasid_info->archid = archid;
>>> +        trace_vfio_iommu_asid_inv_iotlb(archid);
>>> +        break;
>>> +    }
>>> +    case IOMMU_INV_GRAN_ADDR:
>>> +    {
>>> +        hwaddr start = iotlb->iova + giommu->iommu_offset;
>>> +        struct iommu_inv_addr_info *addr_info;
>>> +        size_t size = iotlb->addr_mask + 1;
>>> +        int archid = -1;
>>> +
>>> +        addr_info = &ustruct.info.granu.addr_info;
>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
>>> +        if (iotlb->leaf) {
>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
>>> +        }
>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>> +            archid = iotlb->arch_id;
>>> +        }
>>> +        addr_info->archid = archid;
>>> +        addr_info->addr = start;
>>> +        addr_info->granule_size = size;
>>> +        addr_info->nb_granules = 1;
>>> +        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
>>> +                                        1, iotlb->leaf);
>>> +        break;
>>> +    }
>> Should we pass a size to  host kernel here, even if vSMMU doesn't support
>> RIL or guest kernel doesn't use RIL?
>>
>> It will cause TLBI issue in  this scenario: Guest kernel issues a TLBI cmd
>> without "range" (tg = 0) to invalidate a 2M huge page. Then qemu passed
>> the iova and size (4K) to host kernel. Finally, host kernel issues a
>> TLBI cmd
>> with "range" (4K) which can not invalidate the TLB entry of 2M huge page.
>> (pSMMU supports RIL)
> In that case the guest will loop over all 4K images belonging to the 2M
> huge page and invalidate each of them. This should turn into qemu
> notifications for each 4kB page, no? This is totally inefficient, hence
The guest will not loop over all 4K images belonging to the 2M huge page.
The iommu_iotlb_gather->pgsize will be 2M if a page is a 2M huge page. The
gather->pgsize will be passed to __arm_smmu_tlb_inv_range as "granule":

iommu_iotlb_gather_add_page
     iommu_iotlb_sync
         domain->ops->iotlb_sync
             arm_smmu_iotlb_sync
                 arm_smmu_tlb_inv_range_domain
                     __arm_smmu_tlb_inv_range

In the above-mentioned scenario, the guest kernel will issue a TLBI cmd
with only
"iova" (tg = 0).

Thanks,
Kunkun Jiang
> the support of RIL on guest side and QEMU device.
>
> What do I miss?
>
> Thanks
>
> Eric
>> Thanks,
>> Kunkun Jiang
>>> +    }
>>> +
>>> +    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, &ustruct);
>>> +    if (ret) {
>>> +        error_report("%p: failed to invalidate CACHE (%d)",
>>> container, ret);
>>> +    }
>>> +}
>>> +
>>>    static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>> *iotlb)
>>>    {
>>>        VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>> @@ -776,6 +843,35 @@ static void
>>> vfio_dma_unmap_ram_section(VFIOContainer *container,
>>>        }
>>>    }
>>>    +static void vfio_prereg_listener_region_add(MemoryListener *listener,
>>> +                                            MemoryRegionSection
>>> *section)
>>> +{
>>> +    VFIOContainer *container =
>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>> +    Error *err = NULL;
>>> +
>>> +    if (!memory_region_is_ram(section->mr)) {
>>> +        return;
>>> +    }
>>> +
>>> +    vfio_dma_map_ram_section(container, section, &err);
>>> +    if (err) {
>>> +        error_report_err(err);
>>> +    }
>>> +}
>>> +static void vfio_prereg_listener_region_del(MemoryListener *listener,
>>> +                                     MemoryRegionSection *section)
>>> +{
>>> +    VFIOContainer *container =
>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>> +
>>> +    if (!memory_region_is_ram(section->mr)) {
>>> +        return;
>>> +    }
>>> +
>>> +    vfio_dma_unmap_ram_section(container, section);
>>> +}
>>> +
>>>    static void vfio_listener_region_add(MemoryListener *listener,
>>>                                         MemoryRegionSection *section)
>>>    {
>>> @@ -879,9 +975,10 @@ static void
>>> vfio_listener_region_add(MemoryListener *listener,
>>>        memory_region_ref(section->mr);
>>>          if (memory_region_is_iommu(section->mr)) {
>>> +        IOMMUNotify notify;
>>>            VFIOGuestIOMMU *giommu;
>>>            IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
>>> -        int iommu_idx;
>>> +        int iommu_idx, flags;
>>>              trace_vfio_listener_region_add_iommu(iova, end);
>>>            /*
>>> @@ -900,8 +997,18 @@ static void
>>> vfio_listener_region_add(MemoryListener *listener,
>>>            llend = int128_sub(llend, int128_one());
>>>            iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
>>>                                                          
>>> MEMTXATTRS_UNSPECIFIED);
>>> -        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
>>> -                            IOMMU_NOTIFIER_IOTLB_EVENTS,
>>> +
>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>> +            /* IOTLB unmap notifier to propagate guest IOTLB
>>> invalidations */
>>> +            flags = IOMMU_NOTIFIER_UNMAP;
>>> +            notify = vfio_iommu_unmap_notify;
>>> +        } else {
>>> +            /* MAP/UNMAP IOTLB notifier */
>>> +            flags = IOMMU_NOTIFIER_IOTLB_EVENTS;
>>> +            notify = vfio_iommu_map_notify;
>>> +        }
>>> +
>>> +        iommu_notifier_init(&giommu->n, notify, flags,
>>>                                section->offset_within_region,
>>>                                int128_get64(llend),
>>>                                iommu_idx);
>>> @@ -921,7 +1028,9 @@ static void
>>> vfio_listener_region_add(MemoryListener *listener,
>>>                goto fail;
>>>            }
>>>            QLIST_INSERT_HEAD(&container->giommu_list, giommu,
>>> giommu_next);
>>> -        memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>> +        if (flags & IOMMU_NOTIFIER_MAP) {
>>> +            memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>> +        }
>>>              return;
>>>        }
>>> @@ -1205,10 +1314,16 @@ static const MemoryListener
>>> vfio_memory_listener = {
>>>        .log_sync = vfio_listener_log_sync,
>>>    };
>>>    +static MemoryListener vfio_memory_prereg_listener = {
>>> +    .region_add = vfio_prereg_listener_region_add,
>>> +    .region_del = vfio_prereg_listener_region_del,
>>> +};
>>> +
>>>    static void vfio_listener_release(VFIOContainer *container)
>>>    {
>>>        memory_listener_unregister(&container->listener);
>>> -    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
>>> +        container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>            memory_listener_unregister(&container->prereg_listener);
>>>        }
>>>    }
>>> @@ -1858,6 +1973,20 @@ static int vfio_connect_container(VFIOGroup
>>> *group, AddressSpace *as,
>>>                vfio_get_iommu_info_migration(container, info);
>>>            }
>>>            g_free(info);
>>> +
>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>> +            container->prereg_listener = vfio_memory_prereg_listener;
>>> +            memory_listener_register(&container->prereg_listener,
>>> +                                     &address_space_memory);
>>> +            if (container->error) {
>>> +                memory_listener_unregister(&container->prereg_listener);
>>> +                ret = -1;
>>> +                error_propagate_prepend(errp, container->error,
>>> +                                    "RAM memory listener
>>> initialization failed "
>>> +                                    "for container");
>>> +                goto free_container_exit;
>>> +            }
>>> +        }
>>>            break;
>>>        }
>>>        case VFIO_SPAPR_TCE_v2_IOMMU:
>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>> index 5c65aa0a98..cad7deec71 100644
>>> --- a/hw/vfio/pci.c
>>> +++ b/hw/vfio/pci.c
>>> @@ -2773,6 +2773,25 @@ static void
>>> vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>>        vdev->req_enabled = false;
>>>    }
>>>    +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn,
>>> +                                      IOMMUConfig *config)
>>> +{
>>> +    PCIDevice *pdev = bus->devices[devfn];
>>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>> +    VFIOContainer *container = vdev->vbasedev.group->container;
>>> +    struct vfio_iommu_type1_set_pasid_table info;
>>> +
>>> +    info.argsz = sizeof(info);
>>> +    info.flags = VFIO_PASID_TABLE_FLAG_SET;
>>> +    memcpy(&info.config, &config->pasid_cfg, sizeof(config->pasid_cfg));
>>> +
>>> +    return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info);
>>> +}
>>> +
>>> +static PCIPASIDOps vfio_pci_pasid_ops = {
>>> +    .set_pasid_table = vfio_iommu_set_pasid_table,
>>> +};
>>> +
>>>    static void vfio_realize(PCIDevice *pdev, Error **errp)
>>>    {
>>>        VFIOPCIDevice *vdev = VFIO_PCI(pdev);
>>> @@ -3084,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error
>>> **errp)
>>>        vfio_register_req_notifier(vdev);
>>>        vfio_setup_resetfn_quirk(vdev);
>>>    +    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
>>> +
>>>        return;
>>>      out_deregister:
>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>>> index 936d29d150..43696afc15 100644
>>> --- a/hw/vfio/trace-events
>>> +++ b/hw/vfio/trace-events
>>> @@ -120,6 +120,8 @@ vfio_region_sparse_mmap_header(const char *name,
>>> int index, int nr_areas) "Devic
>>>    vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned
>>> long end) "sparse entry %d [0x%lx - 0x%lx]"
>>>    vfio_get_dev_region(const char *name, int index, uint32_t type,
>>> uint32_t subtype) "%s index %d, %08x/%0x8"
>>>    vfio_dma_unmap_overflow_workaround(void) ""
>>> +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size,
>>> uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d,
>>> addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64"
>>> leaf=%d"
>>> +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
>>>      # platform.c
>>>    vfio_platform_base_device_init(char *name, int groupid) "%s belongs
>>> to group #%d"
>>
>>
> .
Eric Auger April 14, 2021, 8:05 a.m. UTC | #4
Hi Kunkun,

On 4/14/21 3:45 AM, Kunkun Jiang wrote:
> On 2021/4/13 20:57, Auger Eric wrote:
>> Hi Kunkun,
>>
>> On 4/13/21 2:10 PM, Kunkun Jiang wrote:
>>> Hi Eric,
>>>
>>> On 2021/4/11 20:08, Eric Auger wrote:
>>>> In nested mode, legacy vfio_iommu_map_notify cannot be used as
>>>> there is no "caching" mode and we do not trap on map.
>>>>
>>>> On Intel, vfio_iommu_map_notify was used to DMA map the RAM
>>>> through the host single stage.
>>>>
>>>> With nested mode, we need to setup the stage 2 and the stage 1
>>>> separately. This patch introduces a prereg_listener to setup
>>>> the stage 2 mapping.
>>>>
>>>> The stage 1 mapping, owned by the guest, is passed to the host
>>>> when the guest invalidates the stage 1 configuration, through
>>>> a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
>>>> are cascaded downto the host through another IOMMU MR UNMAP
>>>> notifier.
>>>>
>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>
>>>> ---
>>>>
>>>> v7 -> v8:
>>>> - properly handle new IOMMUTLBEntry fields and especially
>>>>     propagate DOMAIN and PASID based invalidations
>>>>
>>>> v6 -> v7:
>>>> - remove PASID based invalidation
>>>>
>>>> v5 -> v6:
>>>> - add error_report_err()
>>>> - remove the abort in case of nested stage case
>>>>
>>>> v4 -> v5:
>>>> - use VFIO_IOMMU_SET_PASID_TABLE
>>>> - use PCIPASIDOps for config notification
>>>>
>>>> v3 -> v4:
>>>> - use iommu_inv_pasid_info for ASID invalidation
>>>>
>>>> v2 -> v3:
>>>> - use VFIO_IOMMU_ATTACH_PASID_TABLE
>>>> - new user API
>>>> - handle leaf
>>>>
>>>> v1 -> v2:
>>>> - adapt to uapi changes
>>>> - pass the asid
>>>> - pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier
>>>> ---
>>>>    hw/vfio/common.c     | 139
>>>> +++++++++++++++++++++++++++++++++++++++++--
>>>>    hw/vfio/pci.c        |  21 +++++++
>>>>    hw/vfio/trace-events |   2 +
>>>>    3 files changed, 157 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>> index 0cd7ef2139..e369d451e7 100644
>>>> --- a/hw/vfio/common.c
>>>> +++ b/hw/vfio/common.c
>>>> @@ -595,6 +595,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry
>>>> *iotlb, void **vaddr,
>>>>        return true;
>>>>    }
>>>>    +/* Propagate a guest IOTLB invalidation to the host (nested
>>>> mode) */
>>>> +static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>> *iotlb)
>>>> +{
>>>> +    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>> +    struct vfio_iommu_type1_cache_invalidate ustruct = {};
>>>> +    VFIOContainer *container = giommu->container;
>>>> +    int ret;
>>>> +
>>>> +    assert(iotlb->perm == IOMMU_NONE);
>>>> +
>>>> +    ustruct.argsz = sizeof(ustruct);
>>>> +    ustruct.flags = 0;
>>>> +    ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info);
>>>> +    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
>>>> +    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
>>>> +
>>>> +    switch (iotlb->granularity) {
>>>> +    case IOMMU_INV_GRAN_DOMAIN:
>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
>>>> +        break;
>>>> +    case IOMMU_INV_GRAN_PASID:
>>>> +    {
>>>> +        struct iommu_inv_pasid_info *pasid_info;
>>>> +        int archid = -1;
>>>> +
>>>> +        pasid_info = &ustruct.info.granu.pasid_info;
>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>> +            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>> +            archid = iotlb->arch_id;
>>>> +        }
>>>> +        pasid_info->archid = archid;
>>>> +        trace_vfio_iommu_asid_inv_iotlb(archid);
>>>> +        break;
>>>> +    }
>>>> +    case IOMMU_INV_GRAN_ADDR:
>>>> +    {
>>>> +        hwaddr start = iotlb->iova + giommu->iommu_offset;
>>>> +        struct iommu_inv_addr_info *addr_info;
>>>> +        size_t size = iotlb->addr_mask + 1;
>>>> +        int archid = -1;
>>>> +
>>>> +        addr_info = &ustruct.info.granu.addr_info;
>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
>>>> +        if (iotlb->leaf) {
>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
>>>> +        }
>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>> +            archid = iotlb->arch_id;
>>>> +        }
>>>> +        addr_info->archid = archid;
>>>> +        addr_info->addr = start;
>>>> +        addr_info->granule_size = size;
>>>> +        addr_info->nb_granules = 1;
>>>> +        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
>>>> +                                        1, iotlb->leaf);
>>>> +        break;
>>>> +    }
>>> Should we pass a size to  host kernel here, even if vSMMU doesn't
>>> support
>>> RIL or guest kernel doesn't use RIL?
>>>
>>> It will cause TLBI issue in  this scenario: Guest kernel issues a
>>> TLBI cmd
>>> without "range" (tg = 0) to invalidate a 2M huge page. Then qemu passed
>>> the iova and size (4K) to host kernel. Finally, host kernel issues a
>>> TLBI cmd
>>> with "range" (4K) which can not invalidate the TLB entry of 2M huge
>>> page.
>>> (pSMMU supports RIL)
>> In that case the guest will loop over all 4K images belonging to the 2M
>> huge page and invalidate each of them. This should turn into qemu
>> notifications for each 4kB page, no? This is totally inefficient, hence
> The guest will not loop over all 4K images belonging to the 2M huge page.
> The iommu_iotlb_gather->pgsize will be 2M, if a page is 2M huge page. The
> gather->pgsize will be passed to __arm_smmu_tlb_inv_range as "granule":
> 
> iommu_iotlb_gather_add_page
>     iommu_iotlb_sync
>         domain->ops->iotlb_sync
>             arm_smmu_iotlb_sync
>                 arm_smmu_tlb_inv_range_domain
>                     __arm_smmu_tlb_inv_range
> 
> In the above mentioned scenario, guest kernel will issue a TLBI cmd only
> with
> "iova" (tg = 0).

OK, I get it now. In that case I think I should do a TG=0 invalidation
on the host even if the host does support RIL. Does that sound correct?

The trouble is that the uapi struct does not convey such info explicitly.
Maybe I should use nb_granules = 0 to detect that case.

I think for this use case RIL should be supported by the guest. Thoughts?

Thanks

Eric
> 
> Thanks,
> Kunkun Jiang
>> the support of RIL on guest side and QEMU device.
>>
>> What do I miss?
>>
>> Thanks
>>
>> Eric
>>> Thanks,
>>> Kunkun Jiang
>>>> +    }
>>>> +
>>>> +    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, &ustruct);
>>>> +    if (ret) {
>>>> +        error_report("%p: failed to invalidate CACHE (%d)",
>>>> container, ret);
>>>> +    }
>>>> +}
>>>> +
>>>>    static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>> *iotlb)
>>>>    {
>>>>        VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>> @@ -776,6 +843,35 @@ static void
>>>> vfio_dma_unmap_ram_section(VFIOContainer *container,
>>>>        }
>>>>    }
>>>>    +static void vfio_prereg_listener_region_add(MemoryListener
>>>> *listener,
>>>> +                                            MemoryRegionSection
>>>> *section)
>>>> +{
>>>> +    VFIOContainer *container =
>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>> +    Error *err = NULL;
>>>> +
>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    vfio_dma_map_ram_section(container, section, &err);
>>>> +    if (err) {
>>>> +        error_report_err(err);
>>>> +    }
>>>> +}
>>>> +static void vfio_prereg_listener_region_del(MemoryListener *listener,
>>>> +                                     MemoryRegionSection *section)
>>>> +{
>>>> +    VFIOContainer *container =
>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>> +
>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    vfio_dma_unmap_ram_section(container, section);
>>>> +}
>>>> +
>>>>    static void vfio_listener_region_add(MemoryListener *listener,
>>>>                                         MemoryRegionSection *section)
>>>>    {
>>>> @@ -879,9 +975,10 @@ static void
>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>        memory_region_ref(section->mr);
>>>>          if (memory_region_is_iommu(section->mr)) {
>>>> +        IOMMUNotify notify;
>>>>            VFIOGuestIOMMU *giommu;
>>>>            IOMMUMemoryRegion *iommu_mr =
>>>> IOMMU_MEMORY_REGION(section->mr);
>>>> -        int iommu_idx;
>>>> +        int iommu_idx, flags;
>>>>              trace_vfio_listener_region_add_iommu(iova, end);
>>>>            /*
>>>> @@ -900,8 +997,18 @@ static void
>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>            llend = int128_sub(llend, int128_one());
>>>>            iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
>>>>                                                         
>>>> MEMTXATTRS_UNSPECIFIED);
>>>> -        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
>>>> -                            IOMMU_NOTIFIER_IOTLB_EVENTS,
>>>> +
>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>> +            /* IOTLB unmap notifier to propagate guest IOTLB
>>>> invalidations */
>>>> +            flags = IOMMU_NOTIFIER_UNMAP;
>>>> +            notify = vfio_iommu_unmap_notify;
>>>> +        } else {
>>>> +            /* MAP/UNMAP IOTLB notifier */
>>>> +            flags = IOMMU_NOTIFIER_IOTLB_EVENTS;
>>>> +            notify = vfio_iommu_map_notify;
>>>> +        }
>>>> +
>>>> +        iommu_notifier_init(&giommu->n, notify, flags,
>>>>                                section->offset_within_region,
>>>>                                int128_get64(llend),
>>>>                                iommu_idx);
>>>> @@ -921,7 +1028,9 @@ static void
>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>                goto fail;
>>>>            }
>>>>            QLIST_INSERT_HEAD(&container->giommu_list, giommu,
>>>> giommu_next);
>>>> -        memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>> +        if (flags & IOMMU_NOTIFIER_MAP) {
>>>> +            memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>> +        }
>>>>              return;
>>>>        }
>>>> @@ -1205,10 +1314,16 @@ static const MemoryListener
>>>> vfio_memory_listener = {
>>>>        .log_sync = vfio_listener_log_sync,
>>>>    };
>>>>    +static MemoryListener vfio_memory_prereg_listener = {
>>>> +    .region_add = vfio_prereg_listener_region_add,
>>>> +    .region_del = vfio_prereg_listener_region_del,
>>>> +};
>>>> +
>>>>    static void vfio_listener_release(VFIOContainer *container)
>>>>    {
>>>>        memory_listener_unregister(&container->listener);
>>>> -    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>>>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
>>>> +        container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>            memory_listener_unregister(&container->prereg_listener);
>>>>        }
>>>>    }
>>>> @@ -1858,6 +1973,20 @@ static int vfio_connect_container(VFIOGroup
>>>> *group, AddressSpace *as,
>>>>                vfio_get_iommu_info_migration(container, info);
>>>>            }
>>>>            g_free(info);
>>>> +
>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>> +            container->prereg_listener = vfio_memory_prereg_listener;
>>>> +            memory_listener_register(&container->prereg_listener,
>>>> +                                     &address_space_memory);
>>>> +            if (container->error) {
>>>> +               
>>>> memory_listener_unregister(&container->prereg_listener);
>>>> +                ret = -1;
>>>> +                error_propagate_prepend(errp, container->error,
>>>> +                                    "RAM memory listener
>>>> initialization failed "
>>>> +                                    "for container");
>>>> +                goto free_container_exit;
>>>> +            }
>>>> +        }
>>>>            break;
>>>>        }
>>>>        case VFIO_SPAPR_TCE_v2_IOMMU:
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index 5c65aa0a98..cad7deec71 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -2773,6 +2773,25 @@ static void
>>>> vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>>>        vdev->req_enabled = false;
>>>>    }
>>>>    +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn,
>>>> +                                      IOMMUConfig *config)
>>>> +{
>>>> +    PCIDevice *pdev = bus->devices[devfn];
>>>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>> +    VFIOContainer *container = vdev->vbasedev.group->container;
>>>> +    struct vfio_iommu_type1_set_pasid_table info;
>>>> +
>>>> +    info.argsz = sizeof(info);
>>>> +    info.flags = VFIO_PASID_TABLE_FLAG_SET;
>>>> +    memcpy(&info.config, &config->pasid_cfg,
>>>> sizeof(config->pasid_cfg));
>>>> +
>>>> +    return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info);
>>>> +}
>>>> +
>>>> +static PCIPASIDOps vfio_pci_pasid_ops = {
>>>> +    .set_pasid_table = vfio_iommu_set_pasid_table,
>>>> +};
>>>> +
>>>>    static void vfio_realize(PCIDevice *pdev, Error **errp)
>>>>    {
>>>>        VFIOPCIDevice *vdev = VFIO_PCI(pdev);
>>>> @@ -3084,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error
>>>> **errp)
>>>>        vfio_register_req_notifier(vdev);
>>>>        vfio_setup_resetfn_quirk(vdev);
>>>>    +    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
>>>> +
>>>>        return;
>>>>      out_deregister:
>>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>>>> index 936d29d150..43696afc15 100644
>>>> --- a/hw/vfio/trace-events
>>>> +++ b/hw/vfio/trace-events
>>>> @@ -120,6 +120,8 @@ vfio_region_sparse_mmap_header(const char *name,
>>>> int index, int nr_areas) "Devic
>>>>    vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned
>>>> long end) "sparse entry %d [0x%lx - 0x%lx]"
>>>>    vfio_get_dev_region(const char *name, int index, uint32_t type,
>>>> uint32_t subtype) "%s index %d, %08x/%0x8"
>>>>    vfio_dma_unmap_overflow_workaround(void) ""
>>>> +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size,
>>>> uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d,
>>>> addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64"
>>>> leaf=%d"
>>>> +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
>>>>      # platform.c
>>>>    vfio_platform_base_device_init(char *name, int groupid) "%s belongs
>>>> to group #%d"
>>>
>>>
>> .
> 
>
Kunkun Jiang April 15, 2021, 2:03 a.m. UTC | #5
Hi Eric,

On 2021/4/14 16:05, Auger Eric wrote:
> Hi Kunkun,
>
> On 4/14/21 3:45 AM, Kunkun Jiang wrote:
>> On 2021/4/13 20:57, Auger Eric wrote:
>>> Hi Kunkun,
>>>
>>> On 4/13/21 2:10 PM, Kunkun Jiang wrote:
>>>> Hi Eric,
>>>>
>>>> On 2021/4/11 20:08, Eric Auger wrote:
>>>>> In nested mode, legacy vfio_iommu_map_notify cannot be used as
>>>>> there is no "caching" mode and we do not trap on map.
>>>>>
>>>>> On Intel, vfio_iommu_map_notify was used to DMA map the RAM
>>>>> through the host single stage.
>>>>>
>>>>> With nested mode, we need to setup the stage 2 and the stage 1
>>>>> separately. This patch introduces a prereg_listener to setup
>>>>> the stage 2 mapping.
>>>>>
>>>>> The stage 1 mapping, owned by the guest, is passed to the host
>>>>> when the guest invalidates the stage 1 configuration, through
>>>>> a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
>>>>> are cascaded downto the host through another IOMMU MR UNMAP
>>>>> notifier.
>>>>>
>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>>
>>>>> ---
>>>>>
>>>>> v7 -> v8:
>>>>> - properly handle new IOMMUTLBEntry fields and especially
>>>>>      propagate DOMAIN and PASID based invalidations
>>>>>
>>>>> v6 -> v7:
>>>>> - remove PASID based invalidation
>>>>>
>>>>> v5 -> v6:
>>>>> - add error_report_err()
>>>>> - remove the abort in case of nested stage case
>>>>>
>>>>> v4 -> v5:
>>>>> - use VFIO_IOMMU_SET_PASID_TABLE
>>>>> - use PCIPASIDOps for config notification
>>>>>
>>>>> v3 -> v4:
>>>>> - use iommu_inv_pasid_info for ASID invalidation
>>>>>
>>>>> v2 -> v3:
>>>>> - use VFIO_IOMMU_ATTACH_PASID_TABLE
>>>>> - new user API
>>>>> - handle leaf
>>>>>
>>>>> v1 -> v2:
>>>>> - adapt to uapi changes
>>>>> - pass the asid
>>>>> - pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier
>>>>> ---
>>>>>     hw/vfio/common.c     | 139
>>>>> +++++++++++++++++++++++++++++++++++++++++--
>>>>>     hw/vfio/pci.c        |  21 +++++++
>>>>>     hw/vfio/trace-events |   2 +
>>>>>     3 files changed, 157 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>>> index 0cd7ef2139..e369d451e7 100644
>>>>> --- a/hw/vfio/common.c
>>>>> +++ b/hw/vfio/common.c
>>>>> @@ -595,6 +595,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry
>>>>> *iotlb, void **vaddr,
>>>>>         return true;
>>>>>     }
>>>>>     +/* Propagate a guest IOTLB invalidation to the host (nested
>>>>> mode) */
>>>>> +static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>>> *iotlb)
>>>>> +{
>>>>> +    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>>> +    struct vfio_iommu_type1_cache_invalidate ustruct = {};
>>>>> +    VFIOContainer *container = giommu->container;
>>>>> +    int ret;
>>>>> +
>>>>> +    assert(iotlb->perm == IOMMU_NONE);
>>>>> +
>>>>> +    ustruct.argsz = sizeof(ustruct);
>>>>> +    ustruct.flags = 0;
>>>>> +    ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info);
>>>>> +    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
>>>>> +    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
>>>>> +
>>>>> +    switch (iotlb->granularity) {
>>>>> +    case IOMMU_INV_GRAN_DOMAIN:
>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
>>>>> +        break;
>>>>> +    case IOMMU_INV_GRAN_PASID:
>>>>> +    {
>>>>> +        struct iommu_inv_pasid_info *pasid_info;
>>>>> +        int archid = -1;
>>>>> +
>>>>> +        pasid_info = &ustruct.info.granu.pasid_info;
>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
>>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>>> +            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>>> +            archid = iotlb->arch_id;
>>>>> +        }
>>>>> +        pasid_info->archid = archid;
>>>>> +        trace_vfio_iommu_asid_inv_iotlb(archid);
>>>>> +        break;
>>>>> +    }
>>>>> +    case IOMMU_INV_GRAN_ADDR:
>>>>> +    {
>>>>> +        hwaddr start = iotlb->iova + giommu->iommu_offset;
>>>>> +        struct iommu_inv_addr_info *addr_info;
>>>>> +        size_t size = iotlb->addr_mask + 1;
>>>>> +        int archid = -1;
>>>>> +
>>>>> +        addr_info = &ustruct.info.granu.addr_info;
>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
>>>>> +        if (iotlb->leaf) {
>>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
>>>>> +        }
>>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>>> +            archid = iotlb->arch_id;
>>>>> +        }
>>>>> +        addr_info->archid = archid;
>>>>> +        addr_info->addr = start;
>>>>> +        addr_info->granule_size = size;
>>>>> +        addr_info->nb_granules = 1;
>>>>> +        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
>>>>> +                                        1, iotlb->leaf);
>>>>> +        break;
>>>>> +    }
>>>> Should we pass a size to  host kernel here, even if vSMMU doesn't
>>>> support
>>>> RIL or guest kernel doesn't use RIL?
>>>>
>>>> It will cause TLBI issue in  this scenario: Guest kernel issues a
>>>> TLBI cmd
>>>> without "range" (tg = 0) to invalidate a 2M huge page. Then qemu passed
>>>> the iova and size (4K) to host kernel. Finally, host kernel issues a
>>>> TLBI cmd
>>>> with "range" (4K) which can not invalidate the TLB entry of 2M huge
>>>> page.
>>>> (pSMMU supports RIL)
>>> In that case the guest will loop over all 4K images belonging to the 2M
>>> huge page and invalidate each of them. This should turn into qemu
>>> notifications for each 4kB page, no? This is totally inefficient, hence
>> The guest will not loop over all 4K images belonging to the 2M huge page.
>> The iommu_iotlb_gather->pgsize will be 2M, if a page is 2M huge page. The
>> gather->pgsize will be passed to __arm_smmu_tlb_inv_range as "granule":
>>
>> iommu_iotlb_gather_add_page
>>      iommu_iotlb_sync
>>          domain->ops->iotlb_sync
>>              arm_smmu_iotlb_sync
>>                  arm_smmu_tlb_inv_range_domain
>>                      __arm_smmu_tlb_inv_range
>>
>> In the above mentioned scenario, guest kernel will issue a TLBI cmd only
>> with
>> "iova" (tg = 0).
> OK I get it now. In that case I think I should do a TG=0 invalidation
> on host even if the host does support RIL. Does that sound correct?
Yeah, that's what I thought.
> The trouble is the uapi struct does not convey such info explicitly.
> Maybe I should use nb_granules = 0 to detect that case.
It is troublesome to correct this issue. Using nb_granules = 0 may be
a good choice.
> I think for this use case RIL should be supported by the guest. Thoughts?
Yes. If the guest supports RIL, the scenario mentioned above does not exist.

Thanks,
Kunkun Jiang
>
> Thanks
>
> Eric
>> Thanks,
>> Kunkun Jiang
>>> the support of RIL on guest side and QEMU device.
>>>
>>> What do I miss?
>>>
>>> Thanks
>>>
>>> Eric
>>>> Thanks,
>>>> Kunkun Jiang
>>>>> +    }
>>>>> +
>>>>> +    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, &ustruct);
>>>>> +    if (ret) {
>>>>> +        error_report("%p: failed to invalidate CACHE (%d)",
>>>>> container, ret);
>>>>> +    }
>>>>> +}
>>>>> +
>>>>>     static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>>> *iotlb)
>>>>>     {
>>>>>         VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>>> @@ -776,6 +843,35 @@ static void
>>>>> vfio_dma_unmap_ram_section(VFIOContainer *container,
>>>>>         }
>>>>>     }
>>>>>     +static void vfio_prereg_listener_region_add(MemoryListener
>>>>> *listener,
>>>>> +                                            MemoryRegionSection
>>>>> *section)
>>>>> +{
>>>>> +    VFIOContainer *container =
>>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>>> +    Error *err = NULL;
>>>>> +
>>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    vfio_dma_map_ram_section(container, section, &err);
>>>>> +    if (err) {
>>>>> +        error_report_err(err);
>>>>> +    }
>>>>> +}
>>>>> +static void vfio_prereg_listener_region_del(MemoryListener *listener,
>>>>> +                                     MemoryRegionSection *section)
>>>>> +{
>>>>> +    VFIOContainer *container =
>>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>>> +
>>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    vfio_dma_unmap_ram_section(container, section);
>>>>> +}
>>>>> +
>>>>>     static void vfio_listener_region_add(MemoryListener *listener,
>>>>>                                          MemoryRegionSection *section)
>>>>>     {
>>>>> @@ -879,9 +975,10 @@ static void
>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>         memory_region_ref(section->mr);
>>>>>           if (memory_region_is_iommu(section->mr)) {
>>>>> +        IOMMUNotify notify;
>>>>>             VFIOGuestIOMMU *giommu;
>>>>>             IOMMUMemoryRegion *iommu_mr =
>>>>> IOMMU_MEMORY_REGION(section->mr);
>>>>> -        int iommu_idx;
>>>>> +        int iommu_idx, flags;
>>>>>               trace_vfio_listener_region_add_iommu(iova, end);
>>>>>             /*
>>>>> @@ -900,8 +997,18 @@ static void
>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>             llend = int128_sub(llend, int128_one());
>>>>>             iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
>>>>>                                                          
>>>>> MEMTXATTRS_UNSPECIFIED);
>>>>> -        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
>>>>> -                            IOMMU_NOTIFIER_IOTLB_EVENTS,
>>>>> +
>>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>> +            /* IOTLB unmap notifier to propagate guest IOTLB
>>>>> invalidations */
>>>>> +            flags = IOMMU_NOTIFIER_UNMAP;
>>>>> +            notify = vfio_iommu_unmap_notify;
>>>>> +        } else {
>>>>> +            /* MAP/UNMAP IOTLB notifier */
>>>>> +            flags = IOMMU_NOTIFIER_IOTLB_EVENTS;
>>>>> +            notify = vfio_iommu_map_notify;
>>>>> +        }
>>>>> +
>>>>> +        iommu_notifier_init(&giommu->n, notify, flags,
>>>>>                                 section->offset_within_region,
>>>>>                                 int128_get64(llend),
>>>>>                                 iommu_idx);
>>>>> @@ -921,7 +1028,9 @@ static void
>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>                 goto fail;
>>>>>             }
>>>>>             QLIST_INSERT_HEAD(&container->giommu_list, giommu,
>>>>> giommu_next);
>>>>> -        memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>>> +        if (flags & IOMMU_NOTIFIER_MAP) {
>>>>> +            memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>>> +        }
>>>>>               return;
>>>>>         }
>>>>> @@ -1205,10 +1314,16 @@ static const MemoryListener
>>>>> vfio_memory_listener = {
>>>>>         .log_sync = vfio_listener_log_sync,
>>>>>     };
>>>>>     +static MemoryListener vfio_memory_prereg_listener = {
>>>>> +    .region_add = vfio_prereg_listener_region_add,
>>>>> +    .region_del = vfio_prereg_listener_region_del,
>>>>> +};
>>>>> +
>>>>>     static void vfio_listener_release(VFIOContainer *container)
>>>>>     {
>>>>>         memory_listener_unregister(&container->listener);
>>>>> -    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>>>>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
>>>>> +        container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>>             memory_listener_unregister(&container->prereg_listener);
>>>>>         }
>>>>>     }
>>>>> @@ -1858,6 +1973,20 @@ static int vfio_connect_container(VFIOGroup
>>>>> *group, AddressSpace *as,
>>>>>                 vfio_get_iommu_info_migration(container, info);
>>>>>             }
>>>>>             g_free(info);
>>>>> +
>>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>> +            container->prereg_listener = vfio_memory_prereg_listener;
>>>>> +            memory_listener_register(&container->prereg_listener,
>>>>> +                                     &address_space_memory);
>>>>> +            if (container->error) {
>>>>> +
>>>>> memory_listener_unregister(&container->prereg_listener);
>>>>> +                ret = -1;
>>>>> +                error_propagate_prepend(errp, container->error,
>>>>> +                                    "RAM memory listener
>>>>> initialization failed "
>>>>> +                                    "for container");
>>>>> +                goto free_container_exit;
>>>>> +            }
>>>>> +        }
>>>>>             break;
>>>>>         }
>>>>>         case VFIO_SPAPR_TCE_v2_IOMMU:
>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>> index 5c65aa0a98..cad7deec71 100644
>>>>> --- a/hw/vfio/pci.c
>>>>> +++ b/hw/vfio/pci.c
>>>>> @@ -2773,6 +2773,25 @@ static void
>>>>> vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>>>>         vdev->req_enabled = false;
>>>>>     }
>>>>>     +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn,
>>>>> +                                      IOMMUConfig *config)
>>>>> +{
>>>>> +    PCIDevice *pdev = bus->devices[devfn];
>>>>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>>> +    VFIOContainer *container = vdev->vbasedev.group->container;
>>>>> +    struct vfio_iommu_type1_set_pasid_table info;
>>>>> +
>>>>> +    info.argsz = sizeof(info);
>>>>> +    info.flags = VFIO_PASID_TABLE_FLAG_SET;
>>>>> +    memcpy(&info.config, &config->pasid_cfg,
>>>>> sizeof(config->pasid_cfg));
>>>>> +
>>>>> +    return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info);
>>>>> +}
>>>>> +
>>>>> +static PCIPASIDOps vfio_pci_pasid_ops = {
>>>>> +    .set_pasid_table = vfio_iommu_set_pasid_table,
>>>>> +};
>>>>> +
>>>>>     static void vfio_realize(PCIDevice *pdev, Error **errp)
>>>>>     {
>>>>>         VFIOPCIDevice *vdev = VFIO_PCI(pdev);
>>>>> @@ -3084,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error
>>>>> **errp)
>>>>>         vfio_register_req_notifier(vdev);
>>>>>         vfio_setup_resetfn_quirk(vdev);
>>>>>     +    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
>>>>> +
>>>>>         return;
>>>>>       out_deregister:
>>>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>>>>> index 936d29d150..43696afc15 100644
>>>>> --- a/hw/vfio/trace-events
>>>>> +++ b/hw/vfio/trace-events
>>>>> @@ -120,6 +120,8 @@ vfio_region_sparse_mmap_header(const char *name,
>>>>> int index, int nr_areas) "Devic
>>>>>     vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned
>>>>> long end) "sparse entry %d [0x%lx - 0x%lx]"
>>>>>     vfio_get_dev_region(const char *name, int index, uint32_t type,
>>>>> uint32_t subtype) "%s index %d, %08x/%0x8"
>>>>>     vfio_dma_unmap_overflow_workaround(void) ""
>>>>> +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size,
>>>>> uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d,
>>>>> addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64"
>>>>> leaf=%d"
>>>>> +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
>>>>>       # platform.c
>>>>>     vfio_platform_base_device_init(char *name, int groupid) "%s belongs
>>>>> to group #%d"
>>>>
>>> .
>>
> .
Eric Auger April 26, 2021, 12:30 p.m. UTC | #6
Hi Kunkun,

On 4/14/21 3:45 AM, Kunkun Jiang wrote:
> On 2021/4/13 20:57, Auger Eric wrote:
>> Hi Kunkun,
>>
>> On 4/13/21 2:10 PM, Kunkun Jiang wrote:
>>> Hi Eric,
>>>
>>> On 2021/4/11 20:08, Eric Auger wrote:
>>>> In nested mode, legacy vfio_iommu_map_notify cannot be used as
>>>> there is no "caching" mode and we do not trap on map.
>>>>
>>>> On Intel, vfio_iommu_map_notify was used to DMA map the RAM
>>>> through the host single stage.
>>>>
>>>> With nested mode, we need to setup the stage 2 and the stage 1
>>>> separately. This patch introduces a prereg_listener to setup
>>>> the stage 2 mapping.
>>>>
>>>> The stage 1 mapping, owned by the guest, is passed to the host
>>>> when the guest invalidates the stage 1 configuration, through
>>>> a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
>>>> are cascaded downto the host through another IOMMU MR UNMAP
>>>> notifier.
>>>>
>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>
>>>> ---
>>>>
>>>> v7 -> v8:
>>>> - properly handle new IOMMUTLBEntry fields and especially
>>>>     propagate DOMAIN and PASID based invalidations
>>>>
>>>> v6 -> v7:
>>>> - remove PASID based invalidation
>>>>
>>>> v5 -> v6:
>>>> - add error_report_err()
>>>> - remove the abort in case of nested stage case
>>>>
>>>> v4 -> v5:
>>>> - use VFIO_IOMMU_SET_PASID_TABLE
>>>> - use PCIPASIDOps for config notification
>>>>
>>>> v3 -> v4:
>>>> - use iommu_inv_pasid_info for ASID invalidation
>>>>
>>>> v2 -> v3:
>>>> - use VFIO_IOMMU_ATTACH_PASID_TABLE
>>>> - new user API
>>>> - handle leaf
>>>>
>>>> v1 -> v2:
>>>> - adapt to uapi changes
>>>> - pass the asid
>>>> - pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier
>>>> ---
>>>>    hw/vfio/common.c     | 139
>>>> +++++++++++++++++++++++++++++++++++++++++--
>>>>    hw/vfio/pci.c        |  21 +++++++
>>>>    hw/vfio/trace-events |   2 +
>>>>    3 files changed, 157 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>> index 0cd7ef2139..e369d451e7 100644
>>>> --- a/hw/vfio/common.c
>>>> +++ b/hw/vfio/common.c
>>>> @@ -595,6 +595,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry
>>>> *iotlb, void **vaddr,
>>>>        return true;
>>>>    }
>>>>    +/* Propagate a guest IOTLB invalidation to the host (nested
>>>> mode) */
>>>> +static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>> *iotlb)
>>>> +{
>>>> +    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>> +    struct vfio_iommu_type1_cache_invalidate ustruct = {};
>>>> +    VFIOContainer *container = giommu->container;
>>>> +    int ret;
>>>> +
>>>> +    assert(iotlb->perm == IOMMU_NONE);
>>>> +
>>>> +    ustruct.argsz = sizeof(ustruct);
>>>> +    ustruct.flags = 0;
>>>> +    ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info);
>>>> +    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
>>>> +    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
>>>> +
>>>> +    switch (iotlb->granularity) {
>>>> +    case IOMMU_INV_GRAN_DOMAIN:
>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
>>>> +        break;
>>>> +    case IOMMU_INV_GRAN_PASID:
>>>> +    {
>>>> +        struct iommu_inv_pasid_info *pasid_info;
>>>> +        int archid = -1;
>>>> +
>>>> +        pasid_info = &ustruct.info.granu.pasid_info;
>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>> +            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>> +            archid = iotlb->arch_id;
>>>> +        }
>>>> +        pasid_info->archid = archid;
>>>> +        trace_vfio_iommu_asid_inv_iotlb(archid);
>>>> +        break;
>>>> +    }
>>>> +    case IOMMU_INV_GRAN_ADDR:
>>>> +    {
>>>> +        hwaddr start = iotlb->iova + giommu->iommu_offset;
>>>> +        struct iommu_inv_addr_info *addr_info;
>>>> +        size_t size = iotlb->addr_mask + 1;
>>>> +        int archid = -1;
>>>> +
>>>> +        addr_info = &ustruct.info.granu.addr_info;
>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
>>>> +        if (iotlb->leaf) {
>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
>>>> +        }
>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>> +            archid = iotlb->arch_id;
>>>> +        }
>>>> +        addr_info->archid = archid;
>>>> +        addr_info->addr = start;
>>>> +        addr_info->granule_size = size;
>>>> +        addr_info->nb_granules = 1;
>>>> +        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
>>>> +                                        1, iotlb->leaf);
>>>> +        break;
>>>> +    }
>>> Should we pass a size to  host kernel here, even if vSMMU doesn't
>>> support
>>> RIL or guest kernel doesn't use RIL?
>>>
>>> It will cause TLBI issue in  this scenario: Guest kernel issues a
>>> TLBI cmd
>>> without "range" (tg = 0) to invalidate a 2M huge page. Then qemu passed
>>> the iova and size (4K) to host kernel. Finally, host kernel issues a
>>> TLBI cmd
>>> with "range" (4K) which can not invalidate the TLB entry of 2M huge
>>> page.
>>> (pSMMU supports RIL)
>> In that case the guest will loop over all 4K images belonging to the 2M
>> huge page and invalidate each of them. This should turn into qemu
>> notifications for each 4kB page, no? This is totally inefficient, hence
> The guest will not loop over all 4K images belonging to the 2M huge page.
> The iommu_iotlb_gather->pgsize will be 2M, if a page is 2M huge page. The
> gather->pgsize will be passed to __arm_smmu_tlb_inv_range as "granule":
> 
> iommu_iotlb_gather_add_page
>     iommu_iotlb_sync
>         domain->ops->iotlb_sync
>             arm_smmu_iotlb_sync
>                 arm_smmu_tlb_inv_range_domain
>                     __arm_smmu_tlb_inv_range
> 
> In the above mentioned scenario, guest kernel will issue a TLBI cmd only
> with
> "iova" (tg = 0).

I am busy fixing this case. Could you share your guest use case? Is it
DPDK? If so, I would be interested in getting your DPDK
setup/commands.

Thank you in advance

Eric
> 
> Thanks,
> Kunkun Jiang
>> the support of RIL on guest side and QEMU device.
>>
>> What do I miss?
>>
>> Thanks
>>
>> Eric
>>> Thanks,
>>> Kunkun Jiang
>>>> +    }
>>>> +
>>>> +    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, &ustruct);
>>>> +    if (ret) {
>>>> +        error_report("%p: failed to invalidate CACHE (%d)",
>>>> container, ret);
>>>> +    }
>>>> +}
>>>> +
>>>>    static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>> *iotlb)
>>>>    {
>>>>        VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>> @@ -776,6 +843,35 @@ static void
>>>> vfio_dma_unmap_ram_section(VFIOContainer *container,
>>>>        }
>>>>    }
>>>>    +static void vfio_prereg_listener_region_add(MemoryListener
>>>> *listener,
>>>> +                                            MemoryRegionSection
>>>> *section)
>>>> +{
>>>> +    VFIOContainer *container =
>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>> +    Error *err = NULL;
>>>> +
>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    vfio_dma_map_ram_section(container, section, &err);
>>>> +    if (err) {
>>>> +        error_report_err(err);
>>>> +    }
>>>> +}
>>>> +static void vfio_prereg_listener_region_del(MemoryListener *listener,
>>>> +                                     MemoryRegionSection *section)
>>>> +{
>>>> +    VFIOContainer *container =
>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>> +
>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    vfio_dma_unmap_ram_section(container, section);
>>>> +}
>>>> +
>>>>    static void vfio_listener_region_add(MemoryListener *listener,
>>>>                                         MemoryRegionSection *section)
>>>>    {
>>>> @@ -879,9 +975,10 @@ static void
>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>        memory_region_ref(section->mr);
>>>>          if (memory_region_is_iommu(section->mr)) {
>>>> +        IOMMUNotify notify;
>>>>            VFIOGuestIOMMU *giommu;
>>>>            IOMMUMemoryRegion *iommu_mr =
>>>> IOMMU_MEMORY_REGION(section->mr);
>>>> -        int iommu_idx;
>>>> +        int iommu_idx, flags;
>>>>              trace_vfio_listener_region_add_iommu(iova, end);
>>>>            /*
>>>> @@ -900,8 +997,18 @@ static void
>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>            llend = int128_sub(llend, int128_one());
>>>>            iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
>>>>                                                         
>>>> MEMTXATTRS_UNSPECIFIED);
>>>> -        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
>>>> -                            IOMMU_NOTIFIER_IOTLB_EVENTS,
>>>> +
>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>> +            /* IOTLB unmap notifier to propagate guest IOTLB
>>>> invalidations */
>>>> +            flags = IOMMU_NOTIFIER_UNMAP;
>>>> +            notify = vfio_iommu_unmap_notify;
>>>> +        } else {
>>>> +            /* MAP/UNMAP IOTLB notifier */
>>>> +            flags = IOMMU_NOTIFIER_IOTLB_EVENTS;
>>>> +            notify = vfio_iommu_map_notify;
>>>> +        }
>>>> +
>>>> +        iommu_notifier_init(&giommu->n, notify, flags,
>>>>                                section->offset_within_region,
>>>>                                int128_get64(llend),
>>>>                                iommu_idx);
>>>> @@ -921,7 +1028,9 @@ static void
>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>                goto fail;
>>>>            }
>>>>            QLIST_INSERT_HEAD(&container->giommu_list, giommu,
>>>> giommu_next);
>>>> -        memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>> +        if (flags & IOMMU_NOTIFIER_MAP) {
>>>> +            memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>> +        }
>>>>              return;
>>>>        }
>>>> @@ -1205,10 +1314,16 @@ static const MemoryListener
>>>> vfio_memory_listener = {
>>>>        .log_sync = vfio_listener_log_sync,
>>>>    };
>>>>    +static MemoryListener vfio_memory_prereg_listener = {
>>>> +    .region_add = vfio_prereg_listener_region_add,
>>>> +    .region_del = vfio_prereg_listener_region_del,
>>>> +};
>>>> +
>>>>    static void vfio_listener_release(VFIOContainer *container)
>>>>    {
>>>>        memory_listener_unregister(&container->listener);
>>>> -    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>>>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
>>>> +        container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>            memory_listener_unregister(&container->prereg_listener);
>>>>        }
>>>>    }
>>>> @@ -1858,6 +1973,20 @@ static int vfio_connect_container(VFIOGroup
>>>> *group, AddressSpace *as,
>>>>                vfio_get_iommu_info_migration(container, info);
>>>>            }
>>>>            g_free(info);
>>>> +
>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>> +            container->prereg_listener = vfio_memory_prereg_listener;
>>>> +            memory_listener_register(&container->prereg_listener,
>>>> +                                     &address_space_memory);
>>>> +            if (container->error) {
>>>> +               
>>>> memory_listener_unregister(&container->prereg_listener);
>>>> +                ret = -1;
>>>> +                error_propagate_prepend(errp, container->error,
>>>> +                                    "RAM memory listener
>>>> initialization failed "
>>>> +                                    "for container");
>>>> +                goto free_container_exit;
>>>> +            }
>>>> +        }
>>>>            break;
>>>>        }
>>>>        case VFIO_SPAPR_TCE_v2_IOMMU:
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index 5c65aa0a98..cad7deec71 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -2773,6 +2773,25 @@ static void
>>>> vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>>>        vdev->req_enabled = false;
>>>>    }
>>>>    +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn,
>>>> +                                      IOMMUConfig *config)
>>>> +{
>>>> +    PCIDevice *pdev = bus->devices[devfn];
>>>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>> +    VFIOContainer *container = vdev->vbasedev.group->container;
>>>> +    struct vfio_iommu_type1_set_pasid_table info;
>>>> +
>>>> +    info.argsz = sizeof(info);
>>>> +    info.flags = VFIO_PASID_TABLE_FLAG_SET;
>>>> +    memcpy(&info.config, &config->pasid_cfg,
>>>> sizeof(config->pasid_cfg));
>>>> +
>>>> +    return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info);
>>>> +}
>>>> +
>>>> +static PCIPASIDOps vfio_pci_pasid_ops = {
>>>> +    .set_pasid_table = vfio_iommu_set_pasid_table,
>>>> +};
>>>> +
>>>>    static void vfio_realize(PCIDevice *pdev, Error **errp)
>>>>    {
>>>>        VFIOPCIDevice *vdev = VFIO_PCI(pdev);
>>>> @@ -3084,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error
>>>> **errp)
>>>>        vfio_register_req_notifier(vdev);
>>>>        vfio_setup_resetfn_quirk(vdev);
>>>>    +    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
>>>> +
>>>>        return;
>>>>      out_deregister:
>>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>>>> index 936d29d150..43696afc15 100644
>>>> --- a/hw/vfio/trace-events
>>>> +++ b/hw/vfio/trace-events
>>>> @@ -120,6 +120,8 @@ vfio_region_sparse_mmap_header(const char *name,
>>>> int index, int nr_areas) "Devic
>>>>    vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned
>>>> long end) "sparse entry %d [0x%lx - 0x%lx]"
>>>>    vfio_get_dev_region(const char *name, int index, uint32_t type,
>>>> uint32_t subtype) "%s index %d, %08x/%0x8"
>>>>    vfio_dma_unmap_overflow_workaround(void) ""
>>>> +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size,
>>>> uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d,
>>>> addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64"
>>>> leaf=%d"
>>>> +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
>>>>      # platform.c
>>>>    vfio_platform_base_device_init(char *name, int groupid) "%s belongs
>>>> to group #%d"
>>>
>>>
>> .
> 
>
Eric Auger April 26, 2021, 7:16 p.m. UTC | #7
Hi Kunkun,

On 4/15/21 4:03 AM, Kunkun Jiang wrote:
> Hi Eric,
> 
> On 2021/4/14 16:05, Auger Eric wrote:
>> Hi Kunkun,
>>
>> On 4/14/21 3:45 AM, Kunkun Jiang wrote:
>>> On 2021/4/13 20:57, Auger Eric wrote:
>>>> Hi Kunkun,
>>>>
>>>> On 4/13/21 2:10 PM, Kunkun Jiang wrote:
>>>>> Hi Eric,
>>>>>
>>>>> On 2021/4/11 20:08, Eric Auger wrote:
>>>>>> In nested mode, legacy vfio_iommu_map_notify cannot be used as
>>>>>> there is no "caching" mode and we do not trap on map.
>>>>>>
>>>>>> On Intel, vfio_iommu_map_notify was used to DMA map the RAM
>>>>>> through the host single stage.
>>>>>>
>>>>>> With nested mode, we need to setup the stage 2 and the stage 1
>>>>>> separately. This patch introduces a prereg_listener to setup
>>>>>> the stage 2 mapping.
>>>>>>
>>>>>> The stage 1 mapping, owned by the guest, is passed to the host
>>>>>> when the guest invalidates the stage 1 configuration, through
>>>>>> a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
>>>>>> are cascaded downto the host through another IOMMU MR UNMAP
>>>>>> notifier.
>>>>>>
>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>>>
>>>>>> ---
>>>>>>
>>>>>> v7 -> v8:
>>>>>> - properly handle new IOMMUTLBEntry fields and especially
>>>>>>      propagate DOMAIN and PASID based invalidations
>>>>>>
>>>>>> v6 -> v7:
>>>>>> - remove PASID based invalidation
>>>>>>
>>>>>> v5 -> v6:
>>>>>> - add error_report_err()
>>>>>> - remove the abort in case of nested stage case
>>>>>>
>>>>>> v4 -> v5:
>>>>>> - use VFIO_IOMMU_SET_PASID_TABLE
>>>>>> - use PCIPASIDOps for config notification
>>>>>>
>>>>>> v3 -> v4:
>>>>>> - use iommu_inv_pasid_info for ASID invalidation
>>>>>>
>>>>>> v2 -> v3:
>>>>>> - use VFIO_IOMMU_ATTACH_PASID_TABLE
>>>>>> - new user API
>>>>>> - handle leaf
>>>>>>
>>>>>> v1 -> v2:
>>>>>> - adapt to uapi changes
>>>>>> - pass the asid
>>>>>> - pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier
>>>>>> ---
>>>>>>     hw/vfio/common.c     | 139
>>>>>> +++++++++++++++++++++++++++++++++++++++++--
>>>>>>     hw/vfio/pci.c        |  21 +++++++
>>>>>>     hw/vfio/trace-events |   2 +
>>>>>>     3 files changed, 157 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>>>> index 0cd7ef2139..e369d451e7 100644
>>>>>> --- a/hw/vfio/common.c
>>>>>> +++ b/hw/vfio/common.c
>>>>>> @@ -595,6 +595,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry
>>>>>> *iotlb, void **vaddr,
>>>>>>         return true;
>>>>>>     }
>>>>>>     +/* Propagate a guest IOTLB invalidation to the host (nested
>>>>>> mode) */
>>>>>> +static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>>>> *iotlb)
>>>>>> +{
>>>>>> +    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>>>> +    struct vfio_iommu_type1_cache_invalidate ustruct = {};
>>>>>> +    VFIOContainer *container = giommu->container;
>>>>>> +    int ret;
>>>>>> +
>>>>>> +    assert(iotlb->perm == IOMMU_NONE);
>>>>>> +
>>>>>> +    ustruct.argsz = sizeof(ustruct);
>>>>>> +    ustruct.flags = 0;
>>>>>> +    ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info);
>>>>>> +    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
>>>>>> +    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
>>>>>> +
>>>>>> +    switch (iotlb->granularity) {
>>>>>> +    case IOMMU_INV_GRAN_DOMAIN:
>>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
>>>>>> +        break;
>>>>>> +    case IOMMU_INV_GRAN_PASID:
>>>>>> +    {
>>>>>> +        struct iommu_inv_pasid_info *pasid_info;
>>>>>> +        int archid = -1;
>>>>>> +
>>>>>> +        pasid_info = &ustruct.info.granu.pasid_info;
>>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
>>>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>>>> +            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>>>> +            archid = iotlb->arch_id;
>>>>>> +        }
>>>>>> +        pasid_info->archid = archid;
>>>>>> +        trace_vfio_iommu_asid_inv_iotlb(archid);
>>>>>> +        break;
>>>>>> +    }
>>>>>> +    case IOMMU_INV_GRAN_ADDR:
>>>>>> +    {
>>>>>> +        hwaddr start = iotlb->iova + giommu->iommu_offset;
>>>>>> +        struct iommu_inv_addr_info *addr_info;
>>>>>> +        size_t size = iotlb->addr_mask + 1;
>>>>>> +        int archid = -1;
>>>>>> +
>>>>>> +        addr_info = &ustruct.info.granu.addr_info;
>>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
>>>>>> +        if (iotlb->leaf) {
>>>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
>>>>>> +        }
>>>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>>>> +            archid = iotlb->arch_id;
>>>>>> +        }
>>>>>> +        addr_info->archid = archid;
>>>>>> +        addr_info->addr = start;
>>>>>> +        addr_info->granule_size = size;
>>>>>> +        addr_info->nb_granules = 1;
>>>>>> +        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
>>>>>> +                                        1, iotlb->leaf);
>>>>>> +        break;
>>>>>> +    }
>>>>> Should we pass a size to  host kernel here, even if vSMMU doesn't
>>>>> support
>>>>> RIL or guest kernel doesn't use RIL?
>>>>>
>>>>> It will cause TLBI issue in  this scenario: Guest kernel issues a
>>>>> TLBI cmd
>>>>> without "range" (tg = 0) to invalidate a 2M huge page. Then qemu
>>>>> passed
>>>>> the iova and size (4K) to host kernel. Finally, host kernel issues a
>>>>> TLBI cmd
>>>>> with "range" (4K) which can not invalidate the TLB entry of 2M huge
>>>>> page.
>>>>> (pSMMU supports RIL)
>>>> In that case the guest will loop over all 4K images belonging to the 2M
>>>> huge page and invalidate each of them. This should turn into qemu
>>>> notifications for each 4kB page, no? This is totally inefficient, hence
>>> The guest will not loop over all 4K images belonging to the 2M huge
>>> page.
>>> The iommu_iotlb_gather->pgsize will be 2M, if a page is 2M huge page.
>>> The
>>> gather->pgsize will be passed to __arm_smmu_tlb_inv_range as "granule":
>>>
>>> iommu_iotlb_gather_add_page
>>>      iommu_iotlb_sync
>>>          domain->ops->iotlb_sync
>>>              arm_smmu_iotlb_sync
>>>                  arm_smmu_tlb_inv_range_domain
>>>                      __arm_smmu_tlb_inv_range
>>>
>>> In the above mentioned scenario, guest kernel will issue a TLBI cmd only
>>> with
>>> "iova" (tg = 0).
>> OK I get it now. In that case I think I should do a TG=0 invalidation
>> on host even if the host does support RIL. Does that sound correct?
> Yeah, that's what I thought.
>> The trouble is the uapi struct does not convey such info explicitly.
>> Maybe I should use nb_granules = 0 to detect that case.
> It is troublesome to correct this issue. Using nb_granules = 0 may be
> a good choice.
>> I think for this use case RIL should be supported by the guest. Thoughts?
> Yes. If guest supports RIL, the scenario mentioned above does not exist.

After further study I really wonder if it is worth supporting the case
where the guest does not use range inval. Passing a nb_granules = 0 (~
size) would be OK to perform the TG=0 range invalidation on the host.
However, the host arm_smmu_tlb_inv_range_domain() then calls
arm_smmu_atc_inv_domain(smmu_domain, 0, iova, size) which needs a size.
We would need to trap guest CMD_ATC_INV and use different code paths on
host smmu driver to cascade the various cache invalidations. At the
moment, without maintainer guidance, I am a bit reluctant to add this
extra complexity.

So I would be inclined to fail in QEMU if we detect TG=0 is being used
by the guest. Recent guest kernels would be a prerequisite for this use
case. What do you think?

Thanks

Eric
> 
> Thanks,
> Kunkun Jiang
>>
>> Thanks
>>
>> Eric
>>> Thanks,
>>> Kunkun Jiang
>>>> the support of RIL on guest side and QEMU device.
>>>>
>>>> What do I miss?
>>>>
>>>> Thanks
>>>>
>>>> Eric
>>>>> Thanks,
>>>>> Kunkun Jiang
>>>>>> +    }
>>>>>> +
>>>>>> +    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE,
>>>>>> &ustruct);
>>>>>> +    if (ret) {
>>>>>> +        error_report("%p: failed to invalidate CACHE (%d)",
>>>>>> container, ret);
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>>     static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>>>> *iotlb)
>>>>>>     {
>>>>>>         VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>>>> @@ -776,6 +843,35 @@ static void
>>>>>> vfio_dma_unmap_ram_section(VFIOContainer *container,
>>>>>>         }
>>>>>>     }
>>>>>>     +static void vfio_prereg_listener_region_add(MemoryListener
>>>>>> *listener,
>>>>>> +                                            MemoryRegionSection
>>>>>> *section)
>>>>>> +{
>>>>>> +    VFIOContainer *container =
>>>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>>>> +    Error *err = NULL;
>>>>>> +
>>>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>>>> +        return;
>>>>>> +    }
>>>>>> +
>>>>>> +    vfio_dma_map_ram_section(container, section, &err);
>>>>>> +    if (err) {
>>>>>> +        error_report_err(err);
>>>>>> +    }
>>>>>> +}
>>>>>> +static void vfio_prereg_listener_region_del(MemoryListener
>>>>>> *listener,
>>>>>> +                                     MemoryRegionSection *section)
>>>>>> +{
>>>>>> +    VFIOContainer *container =
>>>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>>>> +
>>>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>>>> +        return;
>>>>>> +    }
>>>>>> +
>>>>>> +    vfio_dma_unmap_ram_section(container, section);
>>>>>> +}
>>>>>> +
>>>>>>     static void vfio_listener_region_add(MemoryListener *listener,
>>>>>>                                          MemoryRegionSection
>>>>>> *section)
>>>>>>     {
>>>>>> @@ -879,9 +975,10 @@ static void
>>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>>         memory_region_ref(section->mr);
>>>>>>           if (memory_region_is_iommu(section->mr)) {
>>>>>> +        IOMMUNotify notify;
>>>>>>             VFIOGuestIOMMU *giommu;
>>>>>>             IOMMUMemoryRegion *iommu_mr =
>>>>>> IOMMU_MEMORY_REGION(section->mr);
>>>>>> -        int iommu_idx;
>>>>>> +        int iommu_idx, flags;
>>>>>>               trace_vfio_listener_region_add_iommu(iova, end);
>>>>>>             /*
>>>>>> @@ -900,8 +997,18 @@ static void
>>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>>             llend = int128_sub(llend, int128_one());
>>>>>>             iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
>>>>>>                                                         
>>>>>> MEMTXATTRS_UNSPECIFIED);
>>>>>> -        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
>>>>>> -                            IOMMU_NOTIFIER_IOTLB_EVENTS,
>>>>>> +
>>>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>>> +            /* IOTLB unmap notifier to propagate guest IOTLB
>>>>>> invalidations */
>>>>>> +            flags = IOMMU_NOTIFIER_UNMAP;
>>>>>> +            notify = vfio_iommu_unmap_notify;
>>>>>> +        } else {
>>>>>> +            /* MAP/UNMAP IOTLB notifier */
>>>>>> +            flags = IOMMU_NOTIFIER_IOTLB_EVENTS;
>>>>>> +            notify = vfio_iommu_map_notify;
>>>>>> +        }
>>>>>> +
>>>>>> +        iommu_notifier_init(&giommu->n, notify, flags,
>>>>>>                                 section->offset_within_region,
>>>>>>                                 int128_get64(llend),
>>>>>>                                 iommu_idx);
>>>>>> @@ -921,7 +1028,9 @@ static void
>>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>>                 goto fail;
>>>>>>             }
>>>>>>             QLIST_INSERT_HEAD(&container->giommu_list, giommu,
>>>>>> giommu_next);
>>>>>> -        memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>>>> +        if (flags & IOMMU_NOTIFIER_MAP) {
>>>>>> +            memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>>>> +        }
>>>>>>               return;
>>>>>>         }
>>>>>> @@ -1205,10 +1314,16 @@ static const MemoryListener
>>>>>> vfio_memory_listener = {
>>>>>>         .log_sync = vfio_listener_log_sync,
>>>>>>     };
>>>>>>     +static MemoryListener vfio_memory_prereg_listener = {
>>>>>> +    .region_add = vfio_prereg_listener_region_add,
>>>>>> +    .region_del = vfio_prereg_listener_region_del,
>>>>>> +};
>>>>>> +
>>>>>>     static void vfio_listener_release(VFIOContainer *container)
>>>>>>     {
>>>>>>         memory_listener_unregister(&container->listener);
>>>>>> -    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>>>>>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
>>>>>> +        container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>>>             memory_listener_unregister(&container->prereg_listener);
>>>>>>         }
>>>>>>     }
>>>>>> @@ -1858,6 +1973,20 @@ static int vfio_connect_container(VFIOGroup
>>>>>> *group, AddressSpace *as,
>>>>>>                 vfio_get_iommu_info_migration(container, info);
>>>>>>             }
>>>>>>             g_free(info);
>>>>>> +
>>>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>>> +            container->prereg_listener =
>>>>>> vfio_memory_prereg_listener;
>>>>>> +            memory_listener_register(&container->prereg_listener,
>>>>>> +                                     &address_space_memory);
>>>>>> +            if (container->error) {
>>>>>> +
>>>>>> memory_listener_unregister(&container->prereg_listener);
>>>>>> +                ret = -1;
>>>>>> +                error_propagate_prepend(errp, container->error,
>>>>>> +                                    "RAM memory listener
>>>>>> initialization failed "
>>>>>> +                                    "for container");
>>>>>> +                goto free_container_exit;
>>>>>> +            }
>>>>>> +        }
>>>>>>             break;
>>>>>>         }
>>>>>>         case VFIO_SPAPR_TCE_v2_IOMMU:
>>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>>> index 5c65aa0a98..cad7deec71 100644
>>>>>> --- a/hw/vfio/pci.c
>>>>>> +++ b/hw/vfio/pci.c
>>>>>> @@ -2773,6 +2773,25 @@ static void
>>>>>> vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>>>>>         vdev->req_enabled = false;
>>>>>>     }
>>>>>>     +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t
>>>>>> devfn,
>>>>>> +                                      IOMMUConfig *config)
>>>>>> +{
>>>>>> +    PCIDevice *pdev = bus->devices[devfn];
>>>>>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>>>> +    VFIOContainer *container = vdev->vbasedev.group->container;
>>>>>> +    struct vfio_iommu_type1_set_pasid_table info;
>>>>>> +
>>>>>> +    info.argsz = sizeof(info);
>>>>>> +    info.flags = VFIO_PASID_TABLE_FLAG_SET;
>>>>>> +    memcpy(&info.config, &config->pasid_cfg,
>>>>>> sizeof(config->pasid_cfg));
>>>>>> +
>>>>>> +    return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info);
>>>>>> +}
>>>>>> +
>>>>>> +static PCIPASIDOps vfio_pci_pasid_ops = {
>>>>>> +    .set_pasid_table = vfio_iommu_set_pasid_table,
>>>>>> +};
>>>>>> +
>>>>>>     static void vfio_realize(PCIDevice *pdev, Error **errp)
>>>>>>     {
>>>>>>         VFIOPCIDevice *vdev = VFIO_PCI(pdev);
>>>>>> @@ -3084,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error
>>>>>> **errp)
>>>>>>         vfio_register_req_notifier(vdev);
>>>>>>         vfio_setup_resetfn_quirk(vdev);
>>>>>>     +    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
>>>>>> +
>>>>>>         return;
>>>>>>       out_deregister:
>>>>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>>>>>> index 936d29d150..43696afc15 100644
>>>>>> --- a/hw/vfio/trace-events
>>>>>> +++ b/hw/vfio/trace-events
>>>>>> @@ -120,6 +120,8 @@ vfio_region_sparse_mmap_header(const char *name,
>>>>>> int index, int nr_areas) "Devic
>>>>>>     vfio_region_sparse_mmap_entry(int i, unsigned long start,
>>>>>> unsigned
>>>>>> long end) "sparse entry %d [0x%lx - 0x%lx]"
>>>>>>     vfio_get_dev_region(const char *name, int index, uint32_t type,
>>>>>> uint32_t subtype) "%s index %d, %08x/%0x8"
>>>>>>     vfio_dma_unmap_overflow_workaround(void) ""
>>>>>> +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size,
>>>>>> uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d,
>>>>>> addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64"
>>>>>> leaf=%d"
>>>>>> +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate
>>>>>> asid=%d"
>>>>>>       # platform.c
>>>>>>     vfio_platform_base_device_init(char *name, int groupid) "%s
>>>>>> belongs
>>>>>> to group #%d"
>>>>>
>>>> .
>>>
>> .
> 
>
Kunkun Jiang April 27, 2021, 8:58 a.m. UTC | #8
Hi Eric,

On 2021/4/26 20:30, Auger Eric wrote:
> Hi Kunkun,
>
> On 4/14/21 3:45 AM, Kunkun Jiang wrote:
>> On 2021/4/13 20:57, Auger Eric wrote:
>>> Hi Kunkun,
>>>
>>> On 4/13/21 2:10 PM, Kunkun Jiang wrote:
>>>> Hi Eric,
>>>>
>>>> On 2021/4/11 20:08, Eric Auger wrote:
>>>>> In nested mode, legacy vfio_iommu_map_notify cannot be used as
>>>>> there is no "caching" mode and we do not trap on map.
>>>>>
>>>>> On Intel, vfio_iommu_map_notify was used to DMA map the RAM
>>>>> through the host single stage.
>>>>>
>>>>> With nested mode, we need to setup the stage 2 and the stage 1
>>>>> separately. This patch introduces a prereg_listener to setup
>>>>> the stage 2 mapping.
>>>>>
>>>>> The stage 1 mapping, owned by the guest, is passed to the host
>>>>> when the guest invalidates the stage 1 configuration, through
>>>>> a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
>>>>> are cascaded downto the host through another IOMMU MR UNMAP
>>>>> notifier.
>>>>>
>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>>
>>>>> ---
>>>>>
>>>>> v7 -> v8:
>>>>> - properly handle new IOMMUTLBEntry fields and especially
>>>>>      propagate DOMAIN and PASID based invalidations
>>>>>
>>>>> v6 -> v7:
>>>>> - remove PASID based invalidation
>>>>>
>>>>> v5 -> v6:
>>>>> - add error_report_err()
>>>>> - remove the abort in case of nested stage case
>>>>>
>>>>> v4 -> v5:
>>>>> - use VFIO_IOMMU_SET_PASID_TABLE
>>>>> - use PCIPASIDOps for config notification
>>>>>
>>>>> v3 -> v4:
>>>>> - use iommu_inv_pasid_info for ASID invalidation
>>>>>
>>>>> v2 -> v3:
>>>>> - use VFIO_IOMMU_ATTACH_PASID_TABLE
>>>>> - new user API
>>>>> - handle leaf
>>>>>
>>>>> v1 -> v2:
>>>>> - adapt to uapi changes
>>>>> - pass the asid
>>>>> - pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier
>>>>> ---
>>>>>     hw/vfio/common.c     | 139
>>>>> +++++++++++++++++++++++++++++++++++++++++--
>>>>>     hw/vfio/pci.c        |  21 +++++++
>>>>>     hw/vfio/trace-events |   2 +
>>>>>     3 files changed, 157 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>>> index 0cd7ef2139..e369d451e7 100644
>>>>> --- a/hw/vfio/common.c
>>>>> +++ b/hw/vfio/common.c
>>>>> @@ -595,6 +595,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry
>>>>> *iotlb, void **vaddr,
>>>>>         return true;
>>>>>     }
>>>>>     +/* Propagate a guest IOTLB invalidation to the host (nested
>>>>> mode) */
>>>>> +static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>>> *iotlb)
>>>>> +{
>>>>> +    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>>> +    struct vfio_iommu_type1_cache_invalidate ustruct = {};
>>>>> +    VFIOContainer *container = giommu->container;
>>>>> +    int ret;
>>>>> +
>>>>> +    assert(iotlb->perm == IOMMU_NONE);
>>>>> +
>>>>> +    ustruct.argsz = sizeof(ustruct);
>>>>> +    ustruct.flags = 0;
>>>>> +    ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info);
>>>>> +    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
>>>>> +    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
>>>>> +
>>>>> +    switch (iotlb->granularity) {
>>>>> +    case IOMMU_INV_GRAN_DOMAIN:
>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
>>>>> +        break;
>>>>> +    case IOMMU_INV_GRAN_PASID:
>>>>> +    {
>>>>> +        struct iommu_inv_pasid_info *pasid_info;
>>>>> +        int archid = -1;
>>>>> +
>>>>> +        pasid_info = &ustruct.info.granu.pasid_info;
>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
>>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>>> +            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>>> +            archid = iotlb->arch_id;
>>>>> +        }
>>>>> +        pasid_info->archid = archid;
>>>>> +        trace_vfio_iommu_asid_inv_iotlb(archid);
>>>>> +        break;
>>>>> +    }
>>>>> +    case IOMMU_INV_GRAN_ADDR:
>>>>> +    {
>>>>> +        hwaddr start = iotlb->iova + giommu->iommu_offset;
>>>>> +        struct iommu_inv_addr_info *addr_info;
>>>>> +        size_t size = iotlb->addr_mask + 1;
>>>>> +        int archid = -1;
>>>>> +
>>>>> +        addr_info = &ustruct.info.granu.addr_info;
>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
>>>>> +        if (iotlb->leaf) {
>>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
>>>>> +        }
>>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>>> +            archid = iotlb->arch_id;
>>>>> +        }
>>>>> +        addr_info->archid = archid;
>>>>> +        addr_info->addr = start;
>>>>> +        addr_info->granule_size = size;
>>>>> +        addr_info->nb_granules = 1;
>>>>> +        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
>>>>> +                                        1, iotlb->leaf);
>>>>> +        break;
>>>>> +    }
>>>> Should we pass a size to  host kernel here, even if vSMMU doesn't
>>>> support
>>>> RIL or guest kernel doesn't use RIL?
>>>>
>>>> It will cause TLBI issue in  this scenario: Guest kernel issues a
>>>> TLBI cmd
>>>> without "range" (tg = 0) to invalidate a 2M huge page. Then qemu passed
>>>> the iova and size (4K) to host kernel. Finally, host kernel issues a
>>>> TLBI cmd
>>>> with "range" (4K) which can not invalidate the TLB entry of 2M huge
>>>> page.
>>>> (pSMMU supports RIL)
>>> In that case the guest will loop over all 4K images belonging to the 2M
>>> huge page and invalidate each of them. This should turn into qemu
>>> notifications for each 4kB page, no? This is totally inefficient, hence
>> The guest will not loop over all 4K images belonging to the 2M huge page.
>> The iommu_iotlb_gather->pgsize will be 2M, if a page is 2M huge page. The
>> gather->pgsize will be passed to __arm_smmu_tlb_inv_range as "granule":
>>
>> iommu_iotlb_gather_add_page
>>      iommu_iotlb_sync
>>          domain->ops->iotlb_sync
>>              arm_smmu_iotlb_sync
>>                  arm_smmu_tlb_inv_range_domain
>>                      __arm_smmu_tlb_inv_range
>>
>> In the above mentioned scenario, guest kernel will issue a TLBI cmd only
>> with
>> "iova" (tg = 0).
> I am busy fixing this case. Could you share your guest use case? Is it
> DPDK? If so, I would be interested in getting your DPDK
> setup/commands.
I use UADK[1] with the accelerator Hisilicon SEC.
./test_hisi_sec --perf --sync --pktlen 1024 --block 1024 --blknum 1 
--times 1 --multi 1 --ctxnum 1

By the way, some UADK code needs to be modified in the vSVA scenario
to map 2M huge pages. The modified UADK I used was provided by
@Xingang Wang

[1] User Space Accelerator Development Kit
https://github.com/Linaro/uadk

Thanks,
Kunkun Jiang
> Thank you in advance
>
> Eric
>> Thanks,
>> Kunkun Jiang
>>> the support of RIL on guest side and QEMU device.
>>>
>>> What do I miss?
>>>
>>> Thanks
>>>
>>> Eric
>>>> Thanks,
>>>> Kunkun Jiang
>>>>> +    }
>>>>> +
>>>>> +    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, &ustruct);
>>>>> +    if (ret) {
>>>>> +        error_report("%p: failed to invalidate CACHE (%d)",
>>>>> container, ret);
>>>>> +    }
>>>>> +}
>>>>> +
>>>>>     static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>>> *iotlb)
>>>>>     {
>>>>>         VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>>> @@ -776,6 +843,35 @@ static void
>>>>> vfio_dma_unmap_ram_section(VFIOContainer *container,
>>>>>         }
>>>>>     }
>>>>>     +static void vfio_prereg_listener_region_add(MemoryListener
>>>>> *listener,
>>>>> +                                            MemoryRegionSection
>>>>> *section)
>>>>> +{
>>>>> +    VFIOContainer *container =
>>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>>> +    Error *err = NULL;
>>>>> +
>>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    vfio_dma_map_ram_section(container, section, &err);
>>>>> +    if (err) {
>>>>> +        error_report_err(err);
>>>>> +    }
>>>>> +}
>>>>> +static void vfio_prereg_listener_region_del(MemoryListener *listener,
>>>>> +                                     MemoryRegionSection *section)
>>>>> +{
>>>>> +    VFIOContainer *container =
>>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>>> +
>>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    vfio_dma_unmap_ram_section(container, section);
>>>>> +}
>>>>> +
>>>>>     static void vfio_listener_region_add(MemoryListener *listener,
>>>>>                                          MemoryRegionSection *section)
>>>>>     {
>>>>> @@ -879,9 +975,10 @@ static void
>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>         memory_region_ref(section->mr);
>>>>>           if (memory_region_is_iommu(section->mr)) {
>>>>> +        IOMMUNotify notify;
>>>>>             VFIOGuestIOMMU *giommu;
>>>>>             IOMMUMemoryRegion *iommu_mr =
>>>>> IOMMU_MEMORY_REGION(section->mr);
>>>>> -        int iommu_idx;
>>>>> +        int iommu_idx, flags;
>>>>>               trace_vfio_listener_region_add_iommu(iova, end);
>>>>>             /*
>>>>> @@ -900,8 +997,18 @@ static void
>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>             llend = int128_sub(llend, int128_one());
>>>>>             iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
>>>>>                                                          
>>>>> MEMTXATTRS_UNSPECIFIED);
>>>>> -        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
>>>>> -                            IOMMU_NOTIFIER_IOTLB_EVENTS,
>>>>> +
>>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>> +            /* IOTLB unmap notifier to propagate guest IOTLB
>>>>> invalidations */
>>>>> +            flags = IOMMU_NOTIFIER_UNMAP;
>>>>> +            notify = vfio_iommu_unmap_notify;
>>>>> +        } else {
>>>>> +            /* MAP/UNMAP IOTLB notifier */
>>>>> +            flags = IOMMU_NOTIFIER_IOTLB_EVENTS;
>>>>> +            notify = vfio_iommu_map_notify;
>>>>> +        }
>>>>> +
>>>>> +        iommu_notifier_init(&giommu->n, notify, flags,
>>>>>                                 section->offset_within_region,
>>>>>                                 int128_get64(llend),
>>>>>                                 iommu_idx);
>>>>> @@ -921,7 +1028,9 @@ static void
>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>                 goto fail;
>>>>>             }
>>>>>             QLIST_INSERT_HEAD(&container->giommu_list, giommu,
>>>>> giommu_next);
>>>>> -        memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>>> +        if (flags & IOMMU_NOTIFIER_MAP) {
>>>>> +            memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>>> +        }
>>>>>               return;
>>>>>         }
>>>>> @@ -1205,10 +1314,16 @@ static const MemoryListener
>>>>> vfio_memory_listener = {
>>>>>         .log_sync = vfio_listener_log_sync,
>>>>>     };
>>>>>     +static MemoryListener vfio_memory_prereg_listener = {
>>>>> +    .region_add = vfio_prereg_listener_region_add,
>>>>> +    .region_del = vfio_prereg_listener_region_del,
>>>>> +};
>>>>> +
>>>>>     static void vfio_listener_release(VFIOContainer *container)
>>>>>     {
>>>>>         memory_listener_unregister(&container->listener);
>>>>> -    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>>>>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
>>>>> +        container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>>             memory_listener_unregister(&container->prereg_listener);
>>>>>         }
>>>>>     }
>>>>> @@ -1858,6 +1973,20 @@ static int vfio_connect_container(VFIOGroup
>>>>> *group, AddressSpace *as,
>>>>>                 vfio_get_iommu_info_migration(container, info);
>>>>>             }
>>>>>             g_free(info);
>>>>> +
>>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>> +            container->prereg_listener = vfio_memory_prereg_listener;
>>>>> +            memory_listener_register(&container->prereg_listener,
>>>>> +                                     &address_space_memory);
>>>>> +            if (container->error) {
>>>>> +
>>>>> memory_listener_unregister(&container->prereg_listener);
>>>>> +                ret = -1;
>>>>> +                error_propagate_prepend(errp, container->error,
>>>>> +                                    "RAM memory listener
>>>>> initialization failed "
>>>>> +                                    "for container");
>>>>> +                goto free_container_exit;
>>>>> +            }
>>>>> +        }
>>>>>             break;
>>>>>         }
>>>>>         case VFIO_SPAPR_TCE_v2_IOMMU:
>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>> index 5c65aa0a98..cad7deec71 100644
>>>>> --- a/hw/vfio/pci.c
>>>>> +++ b/hw/vfio/pci.c
>>>>> @@ -2773,6 +2773,25 @@ static void
>>>>> vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>>>>         vdev->req_enabled = false;
>>>>>     }
>>>>>     +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn,
>>>>> +                                      IOMMUConfig *config)
>>>>> +{
>>>>> +    PCIDevice *pdev = bus->devices[devfn];
>>>>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>>> +    VFIOContainer *container = vdev->vbasedev.group->container;
>>>>> +    struct vfio_iommu_type1_set_pasid_table info;
>>>>> +
>>>>> +    info.argsz = sizeof(info);
>>>>> +    info.flags = VFIO_PASID_TABLE_FLAG_SET;
>>>>> +    memcpy(&info.config, &config->pasid_cfg,
>>>>> sizeof(config->pasid_cfg));
>>>>> +
>>>>> +    return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info);
>>>>> +}
>>>>> +
>>>>> +static PCIPASIDOps vfio_pci_pasid_ops = {
>>>>> +    .set_pasid_table = vfio_iommu_set_pasid_table,
>>>>> +};
>>>>> +
>>>>>     static void vfio_realize(PCIDevice *pdev, Error **errp)
>>>>>     {
>>>>>         VFIOPCIDevice *vdev = VFIO_PCI(pdev);
>>>>> @@ -3084,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error
>>>>> **errp)
>>>>>         vfio_register_req_notifier(vdev);
>>>>>         vfio_setup_resetfn_quirk(vdev);
>>>>>     +    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
>>>>> +
>>>>>         return;
>>>>>       out_deregister:
>>>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>>>>> index 936d29d150..43696afc15 100644
>>>>> --- a/hw/vfio/trace-events
>>>>> +++ b/hw/vfio/trace-events
>>>>> @@ -120,6 +120,8 @@ vfio_region_sparse_mmap_header(const char *name,
>>>>> int index, int nr_areas) "Devic
>>>>>     vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned
>>>>> long end) "sparse entry %d [0x%lx - 0x%lx]"
>>>>>     vfio_get_dev_region(const char *name, int index, uint32_t type,
>>>>> uint32_t subtype) "%s index %d, %08x/%0x8"
>>>>>     vfio_dma_unmap_overflow_workaround(void) ""
>>>>> +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size,
>>>>> uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d,
>>>>> addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64"
>>>>> leaf=%d"
>>>>> +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
>>>>>       # platform.c
>>>>>     vfio_platform_base_device_init(char *name, int groupid) "%s belongs
>>>>> to group #%d"
>>>>
>>> .
>>
> .
Kunkun Jiang April 28, 2021, 9:51 a.m. UTC | #9
Hi Eric,

On 2021/4/27 3:16, Auger Eric wrote:
> Hi Kunkun,
>
> On 4/15/21 4:03 AM, Kunkun Jiang wrote:
>> Hi Eric,
>>
>> On 2021/4/14 16:05, Auger Eric wrote:
>>> Hi Kunkun,
>>>
>>> On 4/14/21 3:45 AM, Kunkun Jiang wrote:
>>>> On 2021/4/13 20:57, Auger Eric wrote:
>>>>> Hi Kunkun,
>>>>>
>>>>> On 4/13/21 2:10 PM, Kunkun Jiang wrote:
>>>>>> Hi Eric,
>>>>>>
>>>>>> On 2021/4/11 20:08, Eric Auger wrote:
>>>>>>> In nested mode, legacy vfio_iommu_map_notify cannot be used as
>>>>>>> there is no "caching" mode and we do not trap on map.
>>>>>>>
>>>>>>> On Intel, vfio_iommu_map_notify was used to DMA map the RAM
>>>>>>> through the host single stage.
>>>>>>>
>>>>>>> With nested mode, we need to setup the stage 2 and the stage 1
>>>>>>> separately. This patch introduces a prereg_listener to setup
>>>>>>> the stage 2 mapping.
>>>>>>>
>>>>>>> The stage 1 mapping, owned by the guest, is passed to the host
>>>>>>> when the guest invalidates the stage 1 configuration, through
>>>>>>> a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
>>>>>>> are cascaded downto the host through another IOMMU MR UNMAP
>>>>>>> notifier.
>>>>>>>
>>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>>>>
>>>>>>> ---
>>>>>>>
>>>>>>> v7 -> v8:
>>>>>>> - properly handle new IOMMUTLBEntry fields and especially
>>>>>>>       propagate DOMAIN and PASID based invalidations
>>>>>>>
>>>>>>> v6 -> v7:
>>>>>>> - remove PASID based invalidation
>>>>>>>
>>>>>>> v5 -> v6:
>>>>>>> - add error_report_err()
>>>>>>> - remove the abort in case of nested stage case
>>>>>>>
>>>>>>> v4 -> v5:
>>>>>>> - use VFIO_IOMMU_SET_PASID_TABLE
>>>>>>> - use PCIPASIDOps for config notification
>>>>>>>
>>>>>>> v3 -> v4:
>>>>>>> - use iommu_inv_pasid_info for ASID invalidation
>>>>>>>
>>>>>>> v2 -> v3:
>>>>>>> - use VFIO_IOMMU_ATTACH_PASID_TABLE
>>>>>>> - new user API
>>>>>>> - handle leaf
>>>>>>>
>>>>>>> v1 -> v2:
>>>>>>> - adapt to uapi changes
>>>>>>> - pass the asid
>>>>>>> - pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier
>>>>>>> ---
>>>>>>>      hw/vfio/common.c     | 139
>>>>>>> +++++++++++++++++++++++++++++++++++++++++--
>>>>>>>      hw/vfio/pci.c        |  21 +++++++
>>>>>>>      hw/vfio/trace-events |   2 +
>>>>>>>      3 files changed, 157 insertions(+), 5 deletions(-)
>>>>>>>
>>>>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>>>>> index 0cd7ef2139..e369d451e7 100644
>>>>>>> --- a/hw/vfio/common.c
>>>>>>> +++ b/hw/vfio/common.c
>>>>>>> @@ -595,6 +595,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry
>>>>>>> *iotlb, void **vaddr,
>>>>>>>          return true;
>>>>>>>      }
>>>>>>>      +/* Propagate a guest IOTLB invalidation to the host (nested
>>>>>>> mode) */
>>>>>>> +static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>>>>> *iotlb)
>>>>>>> +{
>>>>>>> +    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>>>>> +    struct vfio_iommu_type1_cache_invalidate ustruct = {};
>>>>>>> +    VFIOContainer *container = giommu->container;
>>>>>>> +    int ret;
>>>>>>> +
>>>>>>> +    assert(iotlb->perm == IOMMU_NONE);
>>>>>>> +
>>>>>>> +    ustruct.argsz = sizeof(ustruct);
>>>>>>> +    ustruct.flags = 0;
>>>>>>> +    ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info);
>>>>>>> +    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
>>>>>>> +    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
>>>>>>> +
>>>>>>> +    switch (iotlb->granularity) {
>>>>>>> +    case IOMMU_INV_GRAN_DOMAIN:
>>>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
>>>>>>> +        break;
>>>>>>> +    case IOMMU_INV_GRAN_PASID:
>>>>>>> +    {
>>>>>>> +        struct iommu_inv_pasid_info *pasid_info;
>>>>>>> +        int archid = -1;
>>>>>>> +
>>>>>>> +        pasid_info = &ustruct.info.granu.pasid_info;
>>>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
>>>>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>>>>> +            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>>>>> +            archid = iotlb->arch_id;
>>>>>>> +        }
>>>>>>> +        pasid_info->archid = archid;
>>>>>>> +        trace_vfio_iommu_asid_inv_iotlb(archid);
>>>>>>> +        break;
>>>>>>> +    }
>>>>>>> +    case IOMMU_INV_GRAN_ADDR:
>>>>>>> +    {
>>>>>>> +        hwaddr start = iotlb->iova + giommu->iommu_offset;
>>>>>>> +        struct iommu_inv_addr_info *addr_info;
>>>>>>> +        size_t size = iotlb->addr_mask + 1;
>>>>>>> +        int archid = -1;
>>>>>>> +
>>>>>>> +        addr_info = &ustruct.info.granu.addr_info;
>>>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
>>>>>>> +        if (iotlb->leaf) {
>>>>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
>>>>>>> +        }
>>>>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>>>>> +            archid = iotlb->arch_id;
>>>>>>> +        }
>>>>>>> +        addr_info->archid = archid;
>>>>>>> +        addr_info->addr = start;
>>>>>>> +        addr_info->granule_size = size;
>>>>>>> +        addr_info->nb_granules = 1;
>>>>>>> +        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
>>>>>>> +                                        1, iotlb->leaf);
>>>>>>> +        break;
>>>>>>> +    }
>>>>>> Should we pass a size to  host kernel here, even if vSMMU doesn't
>>>>>> support
>>>>>> RIL or guest kernel doesn't use RIL?
>>>>>>
>>>>>> It will cause TLBI issue in  this scenario: Guest kernel issues a
>>>>>> TLBI cmd
>>>>>> without "range" (tg = 0) to invalidate a 2M huge page. Then qemu
>>>>>> passed
>>>>>> the iova and size (4K) to host kernel. Finally, host kernel issues a
>>>>>> TLBI cmd
>>>>>> with "range" (4K) which can not invalidate the TLB entry of 2M huge
>>>>>> page.
>>>>>> (pSMMU supports RIL)
>>>>> In that case the guest will loop over all 4K images belonging to the 2M
>>>>> huge page and invalidate each of them. This should turn into qemu
>>>>> notifications for each 4kB page, no? This is totally inefficient, hence
>>>> The guest will not loop over all 4K images belonging to the 2M huge
>>>> page.
>>>> The iommu_iotlb_gather->pgsize will be 2M, if a page is 2M huge page.
>>>> The
>>>> gather->pgsize will be passed to __arm_smmu_tlb_inv_range as "granule":
>>>>
>>>> iommu_iotlb_gather_add_page
>>>>       iommu_iotlb_sync
>>>>           domain->ops->iotlb_sync
>>>>               arm_smmu_iotlb_sync
>>>>                   arm_smmu_tlb_inv_range_domain
>>>>                       __arm_smmu_tlb_inv_range
>>>>
>>>> In the above mentioned scenario, guest kernel will issue a TLBI cmd only
>>>> with
>>>> "iova" (tg = 0).
>>> OK I get it now. In that case I think I should do a TG=0 invalidation
>>> on host even if the host does support RIL. Does that sound correct?
>> Yeah, that's what I thought.
>>> The trouble is the uapi struct does not convey such info explicitly.
>>> Maybe I should use nb_granules = 0 to detect that case.
>> It is troublesome to correct this issue. Using nb_granules = 0 may be
>> a good choice.
>>> I think for this use case RIL should be supported by the guest. Thoughts?
>> Yes. If guest supports RIL, the scenario mentioned above does not exist.
> After further study I really wonder if it is worth supporting the case
> where the guest does not use range inval. Passing a nb_granules = 0 (~
> size) would be OK to perform the TG=0 range invalidation on the host.
> however host arm_smmu_tlb_inv_range_domain() then calls
> arm_smmu_atc_inv_domain(smmu_domain, 0, iova, size) which needs a size.
> We would need to trap guest CMD_ATC_INV and use different code paths on
> host smmu driver to cascade the various cache invalidations. At the
> moment, without maintainer giodance, I am a bit reluctant to add this
> extra complexity.
>
> So I would be inclined to fail in QEMU if we detect TG=0 is being used
> by the guest. Recent guest kernels would be a prerequisite for this use
> case. What do you think?
Sorry for the late reply.

How do I determine whether the guest kernel is a recent guest kernel?
If I use a modified kernel or an old kernel, this method will cause problems
when the VM runs real workloads.

I didn't think of the problem of CMD_ATC_INV. In my opinion, adding
processing of CMD_ATC_INV is the best choice. Also, some functions of the
SMMU driver issue a separate CMD_ATC_INV, for example
arm_smmu_enable_ats(). Is it possible that QEMU will need to support the
processing of CMD_ATC_INV in the future?

I also thought of another not-so-good idea. We can use the granule_size
to give CMD_ATC_INV a corresponding maximum page size. This will
result in poor performance.
For example:
granule_size          maximum_page_size
       4K                               1G

Thanks,
Kunkun Jiang
> Thanks
>
> Eric
>> Thanks,
>> Kunkun Jiang
>>> Thanks
>>>
>>> Eric
>>>> Thanks,
>>>> Kunkun Jiang
>>>>> the support of RIL on guest side and QEMU device.
>>>>>
>>>>> What do I miss?
>>>>>
>>>>> Thanks
>>>>>
>>>>> Eric
>>>>>> Thanks,
>>>>>> Kunkun Jiang
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE,
>>>>>>> &ustruct);
>>>>>>> +    if (ret) {
>>>>>>> +        error_report("%p: failed to invalidate CACHE (%d)",
>>>>>>> container, ret);
>>>>>>> +    }
>>>>>>> +}
>>>>>>> +
>>>>>>>      static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>>>>> *iotlb)
>>>>>>>      {
>>>>>>>          VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>>>>> @@ -776,6 +843,35 @@ static void
>>>>>>> vfio_dma_unmap_ram_section(VFIOContainer *container,
>>>>>>>          }
>>>>>>>      }
>>>>>>>      +static void vfio_prereg_listener_region_add(MemoryListener
>>>>>>> *listener,
>>>>>>> +                                            MemoryRegionSection
>>>>>>> *section)
>>>>>>> +{
>>>>>>> +    VFIOContainer *container =
>>>>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>>>>> +    Error *err = NULL;
>>>>>>> +
>>>>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>>>>> +        return;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    vfio_dma_map_ram_section(container, section, &err);
>>>>>>> +    if (err) {
>>>>>>> +        error_report_err(err);
>>>>>>> +    }
>>>>>>> +}
>>>>>>> +static void vfio_prereg_listener_region_del(MemoryListener
>>>>>>> *listener,
>>>>>>> +                                     MemoryRegionSection *section)
>>>>>>> +{
>>>>>>> +    VFIOContainer *container =
>>>>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>>>>> +
>>>>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>>>>> +        return;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    vfio_dma_unmap_ram_section(container, section);
>>>>>>> +}
>>>>>>> +
>>>>>>>      static void vfio_listener_region_add(MemoryListener *listener,
>>>>>>>                                           MemoryRegionSection
>>>>>>> *section)
>>>>>>>      {
>>>>>>> @@ -879,9 +975,10 @@ static void
>>>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>>>          memory_region_ref(section->mr);
>>>>>>>            if (memory_region_is_iommu(section->mr)) {
>>>>>>> +        IOMMUNotify notify;
>>>>>>>              VFIOGuestIOMMU *giommu;
>>>>>>>              IOMMUMemoryRegion *iommu_mr =
>>>>>>> IOMMU_MEMORY_REGION(section->mr);
>>>>>>> -        int iommu_idx;
>>>>>>> +        int iommu_idx, flags;
>>>>>>>                trace_vfio_listener_region_add_iommu(iova, end);
>>>>>>>              /*
>>>>>>> @@ -900,8 +997,18 @@ static void
>>>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>>>              llend = int128_sub(llend, int128_one());
>>>>>>>              iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
>>>>>>>                                                          
>>>>>>> MEMTXATTRS_UNSPECIFIED);
>>>>>>> -        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
>>>>>>> -                            IOMMU_NOTIFIER_IOTLB_EVENTS,
>>>>>>> +
>>>>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>>>> +            /* IOTLB unmap notifier to propagate guest IOTLB
>>>>>>> invalidations */
>>>>>>> +            flags = IOMMU_NOTIFIER_UNMAP;
>>>>>>> +            notify = vfio_iommu_unmap_notify;
>>>>>>> +        } else {
>>>>>>> +            /* MAP/UNMAP IOTLB notifier */
>>>>>>> +            flags = IOMMU_NOTIFIER_IOTLB_EVENTS;
>>>>>>> +            notify = vfio_iommu_map_notify;
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        iommu_notifier_init(&giommu->n, notify, flags,
>>>>>>>                                  section->offset_within_region,
>>>>>>>                                  int128_get64(llend),
>>>>>>>                                  iommu_idx);
>>>>>>> @@ -921,7 +1028,9 @@ static void
>>>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>>>                  goto fail;
>>>>>>>              }
>>>>>>>              QLIST_INSERT_HEAD(&container->giommu_list, giommu,
>>>>>>> giommu_next);
>>>>>>> -        memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>>>>> +        if (flags & IOMMU_NOTIFIER_MAP) {
>>>>>>> +            memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>>>>> +        }
>>>>>>>                return;
>>>>>>>          }
>>>>>>> @@ -1205,10 +1314,16 @@ static const MemoryListener
>>>>>>> vfio_memory_listener = {
>>>>>>>          .log_sync = vfio_listener_log_sync,
>>>>>>>      };
>>>>>>>      +static MemoryListener vfio_memory_prereg_listener = {
>>>>>>> +    .region_add = vfio_prereg_listener_region_add,
>>>>>>> +    .region_del = vfio_prereg_listener_region_del,
>>>>>>> +};
>>>>>>> +
>>>>>>>      static void vfio_listener_release(VFIOContainer *container)
>>>>>>>      {
>>>>>>>          memory_listener_unregister(&container->listener);
>>>>>>> -    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>>>>>>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
>>>>>>> +        container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>>>>              memory_listener_unregister(&container->prereg_listener);
>>>>>>>          }
>>>>>>>      }
>>>>>>> @@ -1858,6 +1973,20 @@ static int vfio_connect_container(VFIOGroup
>>>>>>> *group, AddressSpace *as,
>>>>>>>                  vfio_get_iommu_info_migration(container, info);
>>>>>>>              }
>>>>>>>              g_free(info);
>>>>>>> +
>>>>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>>>> +            container->prereg_listener =
>>>>>>> vfio_memory_prereg_listener;
>>>>>>> +            memory_listener_register(&container->prereg_listener,
>>>>>>> +                                     &address_space_memory);
>>>>>>> +            if (container->error) {
>>>>>>> +
>>>>>>> memory_listener_unregister(&container->prereg_listener);
>>>>>>> +                ret = -1;
>>>>>>> +                error_propagate_prepend(errp, container->error,
>>>>>>> +                                    "RAM memory listener
>>>>>>> initialization failed "
>>>>>>> +                                    "for container");
>>>>>>> +                goto free_container_exit;
>>>>>>> +            }
>>>>>>> +        }
>>>>>>>              break;
>>>>>>>          }
>>>>>>>          case VFIO_SPAPR_TCE_v2_IOMMU:
>>>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>>>> index 5c65aa0a98..cad7deec71 100644
>>>>>>> --- a/hw/vfio/pci.c
>>>>>>> +++ b/hw/vfio/pci.c
>>>>>>> @@ -2773,6 +2773,25 @@ static void
>>>>>>> vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>>>>>>          vdev->req_enabled = false;
>>>>>>>      }
>>>>>>>      +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t
>>>>>>> devfn,
>>>>>>> +                                      IOMMUConfig *config)
>>>>>>> +{
>>>>>>> +    PCIDevice *pdev = bus->devices[devfn];
>>>>>>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>>>>> +    VFIOContainer *container = vdev->vbasedev.group->container;
>>>>>>> +    struct vfio_iommu_type1_set_pasid_table info;
>>>>>>> +
>>>>>>> +    info.argsz = sizeof(info);
>>>>>>> +    info.flags = VFIO_PASID_TABLE_FLAG_SET;
>>>>>>> +    memcpy(&info.config, &config->pasid_cfg,
>>>>>>> sizeof(config->pasid_cfg));
>>>>>>> +
>>>>>>> +    return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static PCIPASIDOps vfio_pci_pasid_ops = {
>>>>>>> +    .set_pasid_table = vfio_iommu_set_pasid_table,
>>>>>>> +};
>>>>>>> +
>>>>>>>      static void vfio_realize(PCIDevice *pdev, Error **errp)
>>>>>>>      {
>>>>>>>          VFIOPCIDevice *vdev = VFIO_PCI(pdev);
>>>>>>> @@ -3084,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error
>>>>>>> **errp)
>>>>>>>          vfio_register_req_notifier(vdev);
>>>>>>>          vfio_setup_resetfn_quirk(vdev);
>>>>>>>      +    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
>>>>>>> +
>>>>>>>          return;
>>>>>>>        out_deregister:
>>>>>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>>>>>>> index 936d29d150..43696afc15 100644
>>>>>>> --- a/hw/vfio/trace-events
>>>>>>> +++ b/hw/vfio/trace-events
>>>>>>> @@ -120,6 +120,8 @@ vfio_region_sparse_mmap_header(const char *name,
>>>>>>> int index, int nr_areas) "Devic
>>>>>>>      vfio_region_sparse_mmap_entry(int i, unsigned long start,
>>>>>>> unsigned
>>>>>>> long end) "sparse entry %d [0x%lx - 0x%lx]"
>>>>>>>      vfio_get_dev_region(const char *name, int index, uint32_t type,
>>>>>>> uint32_t subtype) "%s index %d, %08x/%0x8"
>>>>>>>      vfio_dma_unmap_overflow_workaround(void) ""
>>>>>>> +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size,
>>>>>>> uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d,
>>>>>>> addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64"
>>>>>>> leaf=%d"
>>>>>>> +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate
>>>>>>> asid=%d"
>>>>>>>        # platform.c
>>>>>>>      vfio_platform_base_device_init(char *name, int groupid) "%s
>>>>>>> belongs
>>>>>>> to group #%d"
>>>>> .
>>> .
>>
> .
Eric Auger April 29, 2021, 1:58 p.m. UTC | #10
Hi Kunkun,

On 4/28/21 11:51 AM, Kunkun Jiang wrote:
> Hi Eric,
> 
> On 2021/4/27 3:16, Auger Eric wrote:
>> Hi Kunkun,
>>
>> On 4/15/21 4:03 AM, Kunkun Jiang wrote:
>>> Hi Eric,
>>>
>>> On 2021/4/14 16:05, Auger Eric wrote:
>>>> Hi Kunkun,
>>>>
>>>> On 4/14/21 3:45 AM, Kunkun Jiang wrote:
>>>>> On 2021/4/13 20:57, Auger Eric wrote:
>>>>>> Hi Kunkun,
>>>>>>
>>>>>> On 4/13/21 2:10 PM, Kunkun Jiang wrote:
>>>>>>> Hi Eric,
>>>>>>>
>>>>>>> On 2021/4/11 20:08, Eric Auger wrote:
>>>>>>>> In nested mode, legacy vfio_iommu_map_notify cannot be used as
>>>>>>>> there is no "caching" mode and we do not trap on map.
>>>>>>>>
>>>>>>>> On Intel, vfio_iommu_map_notify was used to DMA map the RAM
>>>>>>>> through the host single stage.
>>>>>>>>
>>>>>>>> With nested mode, we need to setup the stage 2 and the stage 1
>>>>>>>> separately. This patch introduces a prereg_listener to setup
>>>>>>>> the stage 2 mapping.
>>>>>>>>
>>>>>>>> The stage 1 mapping, owned by the guest, is passed to the host
>>>>>>>> when the guest invalidates the stage 1 configuration, through
>>>>>>>> a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
>>>>>>>> are cascaded downto the host through another IOMMU MR UNMAP
>>>>>>>> notifier.
>>>>>>>>
>>>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>>>>>
>>>>>>>> ---
>>>>>>>>
>>>>>>>> v7 -> v8:
>>>>>>>> - properly handle new IOMMUTLBEntry fields and especially
>>>>>>>>       propagate DOMAIN and PASID based invalidations
>>>>>>>>
>>>>>>>> v6 -> v7:
>>>>>>>> - remove PASID based invalidation
>>>>>>>>
>>>>>>>> v5 -> v6:
>>>>>>>> - add error_report_err()
>>>>>>>> - remove the abort in case of nested stage case
>>>>>>>>
>>>>>>>> v4 -> v5:
>>>>>>>> - use VFIO_IOMMU_SET_PASID_TABLE
>>>>>>>> - use PCIPASIDOps for config notification
>>>>>>>>
>>>>>>>> v3 -> v4:
>>>>>>>> - use iommu_inv_pasid_info for ASID invalidation
>>>>>>>>
>>>>>>>> v2 -> v3:
>>>>>>>> - use VFIO_IOMMU_ATTACH_PASID_TABLE
>>>>>>>> - new user API
>>>>>>>> - handle leaf
>>>>>>>>
>>>>>>>> v1 -> v2:
>>>>>>>> - adapt to uapi changes
>>>>>>>> - pass the asid
>>>>>>>> - pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier
>>>>>>>> ---
>>>>>>>>      hw/vfio/common.c     | 139
>>>>>>>> +++++++++++++++++++++++++++++++++++++++++--
>>>>>>>>      hw/vfio/pci.c        |  21 +++++++
>>>>>>>>      hw/vfio/trace-events |   2 +
>>>>>>>>      3 files changed, 157 insertions(+), 5 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>>>>>> index 0cd7ef2139..e369d451e7 100644
>>>>>>>> --- a/hw/vfio/common.c
>>>>>>>> +++ b/hw/vfio/common.c
>>>>>>>> @@ -595,6 +595,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry
>>>>>>>> *iotlb, void **vaddr,
>>>>>>>>          return true;
>>>>>>>>      }
>>>>>>>>      +/* Propagate a guest IOTLB invalidation to the host (nested
>>>>>>>> mode) */
>>>>>>>> +static void vfio_iommu_unmap_notify(IOMMUNotifier *n,
>>>>>>>> IOMMUTLBEntry
>>>>>>>> *iotlb)
>>>>>>>> +{
>>>>>>>> +    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>>>>>> +    struct vfio_iommu_type1_cache_invalidate ustruct = {};
>>>>>>>> +    VFIOContainer *container = giommu->container;
>>>>>>>> +    int ret;
>>>>>>>> +
>>>>>>>> +    assert(iotlb->perm == IOMMU_NONE);
>>>>>>>> +
>>>>>>>> +    ustruct.argsz = sizeof(ustruct);
>>>>>>>> +    ustruct.flags = 0;
>>>>>>>> +    ustruct.info.argsz = sizeof(struct
>>>>>>>> iommu_cache_invalidate_info);
>>>>>>>> +    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
>>>>>>>> +    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
>>>>>>>> +
>>>>>>>> +    switch (iotlb->granularity) {
>>>>>>>> +    case IOMMU_INV_GRAN_DOMAIN:
>>>>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
>>>>>>>> +        break;
>>>>>>>> +    case IOMMU_INV_GRAN_PASID:
>>>>>>>> +    {
>>>>>>>> +        struct iommu_inv_pasid_info *pasid_info;
>>>>>>>> +        int archid = -1;
>>>>>>>> +
>>>>>>>> +        pasid_info = &ustruct.info.granu.pasid_info;
>>>>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
>>>>>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>>>>>> +            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>>>>>> +            archid = iotlb->arch_id;
>>>>>>>> +        }
>>>>>>>> +        pasid_info->archid = archid;
>>>>>>>> +        trace_vfio_iommu_asid_inv_iotlb(archid);
>>>>>>>> +        break;
>>>>>>>> +    }
>>>>>>>> +    case IOMMU_INV_GRAN_ADDR:
>>>>>>>> +    {
>>>>>>>> +        hwaddr start = iotlb->iova + giommu->iommu_offset;
>>>>>>>> +        struct iommu_inv_addr_info *addr_info;
>>>>>>>> +        size_t size = iotlb->addr_mask + 1;
>>>>>>>> +        int archid = -1;
>>>>>>>> +
>>>>>>>> +        addr_info = &ustruct.info.granu.addr_info;
>>>>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
>>>>>>>> +        if (iotlb->leaf) {
>>>>>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
>>>>>>>> +        }
>>>>>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>>>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>>>>>> +            archid = iotlb->arch_id;
>>>>>>>> +        }
>>>>>>>> +        addr_info->archid = archid;
>>>>>>>> +        addr_info->addr = start;
>>>>>>>> +        addr_info->granule_size = size;
>>>>>>>> +        addr_info->nb_granules = 1;
>>>>>>>> +        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
>>>>>>>> +                                        1, iotlb->leaf);
>>>>>>>> +        break;
>>>>>>>> +    }
>>>>>>> Should we pass a size to  host kernel here, even if vSMMU doesn't
>>>>>>> support
>>>>>>> RIL or guest kernel doesn't use RIL?
>>>>>>>
>>>>>>> It will cause TLBI issue in  this scenario: Guest kernel issues a
>>>>>>> TLBI cmd
>>>>>>> without "range" (tg = 0) to invalidate a 2M huge page. Then qemu
>>>>>>> passed
>>>>>>> the iova and size (4K) to host kernel. Finally, host kernel issues a
>>>>>>> TLBI cmd
>>>>>>> with "range" (4K) which can not invalidate the TLB entry of 2M huge
>>>>>>> page.
>>>>>>> (pSMMU supports RIL)
>>>>>> In that case the guest will loop over all 4K images belonging to
>>>>>> the 2M
>>>>>> huge page and invalidate each of them. This should turn into qemu
>>>>>> notifications for each 4kB page, no? This is totally inefficient,
>>>>>> hence
>>>>> The guest will not loop over all 4K images belonging to the 2M huge
>>>>> page.
>>>>> The iommu_iotlb_gather->pgsize will be 2M, if a page is 2M huge page.
>>>>> The
>>>>> gather->pgsize will be passed to __arm_smmu_tlb_inv_range as
>>>>> "granule":
>>>>>
>>>>> iommu_iotlb_gather_add_page
>>>>>       iommu_iotlb_sync
>>>>>           domain->ops->iotlb_sync
>>>>>               arm_smmu_iotlb_sync
>>>>>                   arm_smmu_tlb_inv_range_domain
>>>>>                       __arm_smmu_tlb_inv_range
>>>>>
>>>>> In the above mentioned scenario, guest kernel will issue a TLBI cmd
>>>>> only
>>>>> with
>>>>> "iova" (tg = 0).
>>>> OK I get it now. In that case I think I should do a TG=0 invalidation
>>>> on host even if the host does support RIL. Does that sound correct?
>>> Yeah, that's what I thought.
>>>> The trouble is the uapi struct does not convey such info explicitly.
>>>> Maybe I should use nb_granules = 0 to detect that case.
>>> It is troublesome to correct this issue. Using nb_granules = 0 may be
>>> a good choice.
>>>> I think for this use case RIL should be supported by the guest.
>>>> Thoughts?
>>> Yes. If guest supports RIL, the scenario mentioned above does not exist.
>> After further study I really wonder if it is worth supporting the case
>> where the guest does not use range inval. Passing a nb_granules = 0 (~
>> size) would be OK to perform the TG=0 range invalidation on the host.
>> however host arm_smmu_tlb_inv_range_domain() then calls
>> arm_smmu_atc_inv_domain(smmu_domain, 0, iova, size) which needs a size.
>> We would need to trap guest CMD_ATC_INV and use different code paths on
>> host smmu driver to cascade the various cache invalidations. At the
>> moment, without maintainer giodance, I am a bit reluctant to add this
>> extra complexity.
>>
>> So I would be inclined to fail in QEMU if we detect TG=0 is being used
>> by the guest. Recent guest kernels would be a prerequisite for this use
>> case. What do you think?
> Sorry for late reply.
> 
> How do I determine whether the guest kernel is a recent guest kernel?
> If I use a modified kernel or an old kernel, this method will cause
> problems
> when the vm runs business.
Yes, I acknowledge this detection would happen very late, when trapping
guest invals.
> 
> I didn't think of the problem of CMD_ATC_INV. In my opinion, it is the best
> choice to add processing of CMD_ATC_INV. And some function of smmu
> driver will issues a separate CMD_ATC_INV, for example
> arm_smmu_enable_ats(). 

Yes this would mean that in arm_smmu_cache_invalidate() I should not use
arm_smmu_tlb_inv_range_domain() and split the actual inv_range from the
atc_inv_domain. Or even at the moment, since the vIOMMU does not support
ATS I can do without.

Is it possible that Qemu needs to support the
> processing of CMD_ATC_INV in the future?

well that's technically feasible but we would need to make sure the
pIOMMU supports it.

Thanks

Eric
> 
> I also thought of another not-so-good idea. We can use the granule_size
> to give CMD_ATC_INV a corresponding maximum page size. This will
> result in poor performance.
> For example:
> granule_size          maximum_page_size
>       4K                               1G
> 
> Thanks,
> Kunkun Jiang
>> Thanks
>>
>> Eric
>>> Thanks,
>>> Kunkun Jiang
>>>> Thanks
>>>>
>>>> Eric
>>>>> Thanks,
>>>>> Kunkun Jiang
>>>>>> the support of RIL on guest side and QEMU device.
>>>>>>
>>>>>> What do I miss?
>>>>>>
>>>>>> Thanks
>>>>>>
>>>>>> Eric
>>>>>>> Thanks,
>>>>>>> Kunkun Jiang
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE,
>>>>>>>> &ustruct);
>>>>>>>> +    if (ret) {
>>>>>>>> +        error_report("%p: failed to invalidate CACHE (%d)",
>>>>>>>> container, ret);
>>>>>>>> +    }
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>      static void vfio_iommu_map_notify(IOMMUNotifier *n,
>>>>>>>> IOMMUTLBEntry
>>>>>>>> *iotlb)
>>>>>>>>      {
>>>>>>>>          VFIOGuestIOMMU *giommu = container_of(n,
>>>>>>>> VFIOGuestIOMMU, n);
>>>>>>>> @@ -776,6 +843,35 @@ static void
>>>>>>>> vfio_dma_unmap_ram_section(VFIOContainer *container,
>>>>>>>>          }
>>>>>>>>      }
>>>>>>>>      +static void vfio_prereg_listener_region_add(MemoryListener
>>>>>>>> *listener,
>>>>>>>> +                                            MemoryRegionSection
>>>>>>>> *section)
>>>>>>>> +{
>>>>>>>> +    VFIOContainer *container =
>>>>>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>>>>>> +    Error *err = NULL;
>>>>>>>> +
>>>>>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>>>>>> +        return;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    vfio_dma_map_ram_section(container, section, &err);
>>>>>>>> +    if (err) {
>>>>>>>> +        error_report_err(err);
>>>>>>>> +    }
>>>>>>>> +}
>>>>>>>> +static void vfio_prereg_listener_region_del(MemoryListener
>>>>>>>> *listener,
>>>>>>>> +                                     MemoryRegionSection *section)
>>>>>>>> +{
>>>>>>>> +    VFIOContainer *container =
>>>>>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>>>>>> +
>>>>>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>>>>>> +        return;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    vfio_dma_unmap_ram_section(container, section);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>      static void vfio_listener_region_add(MemoryListener *listener,
>>>>>>>>                                           MemoryRegionSection
>>>>>>>> *section)
>>>>>>>>      {
>>>>>>>> @@ -879,9 +975,10 @@ static void
>>>>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>>>>          memory_region_ref(section->mr);
>>>>>>>>            if (memory_region_is_iommu(section->mr)) {
>>>>>>>> +        IOMMUNotify notify;
>>>>>>>>              VFIOGuestIOMMU *giommu;
>>>>>>>>              IOMMUMemoryRegion *iommu_mr =
>>>>>>>> IOMMU_MEMORY_REGION(section->mr);
>>>>>>>> -        int iommu_idx;
>>>>>>>> +        int iommu_idx, flags;
>>>>>>>>                trace_vfio_listener_region_add_iommu(iova, end);
>>>>>>>>              /*
>>>>>>>> @@ -900,8 +997,18 @@ static void
>>>>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>>>>              llend = int128_sub(llend, int128_one());
>>>>>>>>              iommu_idx =
>>>>>>>> memory_region_iommu_attrs_to_index(iommu_mr,
>>>>>>>>                                                         
>>>>>>>> MEMTXATTRS_UNSPECIFIED);
>>>>>>>> -        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
>>>>>>>> -                            IOMMU_NOTIFIER_IOTLB_EVENTS,
>>>>>>>> +
>>>>>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>>>>> +            /* IOTLB unmap notifier to propagate guest IOTLB
>>>>>>>> invalidations */
>>>>>>>> +            flags = IOMMU_NOTIFIER_UNMAP;
>>>>>>>> +            notify = vfio_iommu_unmap_notify;
>>>>>>>> +        } else {
>>>>>>>> +            /* MAP/UNMAP IOTLB notifier */
>>>>>>>> +            flags = IOMMU_NOTIFIER_IOTLB_EVENTS;
>>>>>>>> +            notify = vfio_iommu_map_notify;
>>>>>>>> +        }
>>>>>>>> +
>>>>>>>> +        iommu_notifier_init(&giommu->n, notify, flags,
>>>>>>>>                                  section->offset_within_region,
>>>>>>>>                                  int128_get64(llend),
>>>>>>>>                                  iommu_idx);
>>>>>>>> @@ -921,7 +1028,9 @@ static void
>>>>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>>>>                  goto fail;
>>>>>>>>              }
>>>>>>>>              QLIST_INSERT_HEAD(&container->giommu_list, giommu,
>>>>>>>> giommu_next);
>>>>>>>> -        memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>>>>>> +        if (flags & IOMMU_NOTIFIER_MAP) {
>>>>>>>> +            memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>>>>>> +        }
>>>>>>>>                return;
>>>>>>>>          }
>>>>>>>> @@ -1205,10 +1314,16 @@ static const MemoryListener
>>>>>>>> vfio_memory_listener = {
>>>>>>>>          .log_sync = vfio_listener_log_sync,
>>>>>>>>      };
>>>>>>>>      +static MemoryListener vfio_memory_prereg_listener = {
>>>>>>>> +    .region_add = vfio_prereg_listener_region_add,
>>>>>>>> +    .region_del = vfio_prereg_listener_region_del,
>>>>>>>> +};
>>>>>>>> +
>>>>>>>>      static void vfio_listener_release(VFIOContainer *container)
>>>>>>>>      {
>>>>>>>>          memory_listener_unregister(&container->listener);
>>>>>>>> -    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>>>>>>>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
>>>>>>>> +        container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>>>>>             
>>>>>>>> memory_listener_unregister(&container->prereg_listener);
>>>>>>>>          }
>>>>>>>>      }
>>>>>>>> @@ -1858,6 +1973,20 @@ static int vfio_connect_container(VFIOGroup
>>>>>>>> *group, AddressSpace *as,
>>>>>>>>                  vfio_get_iommu_info_migration(container, info);
>>>>>>>>              }
>>>>>>>>              g_free(info);
>>>>>>>> +
>>>>>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>>>>> +            container->prereg_listener =
>>>>>>>> vfio_memory_prereg_listener;
>>>>>>>> +            memory_listener_register(&container->prereg_listener,
>>>>>>>> +                                     &address_space_memory);
>>>>>>>> +            if (container->error) {
>>>>>>>> +
>>>>>>>> memory_listener_unregister(&container->prereg_listener);
>>>>>>>> +                ret = -1;
>>>>>>>> +                error_propagate_prepend(errp, container->error,
>>>>>>>> +                                    "RAM memory listener
>>>>>>>> initialization failed "
>>>>>>>> +                                    "for container");
>>>>>>>> +                goto free_container_exit;
>>>>>>>> +            }
>>>>>>>> +        }
>>>>>>>>              break;
>>>>>>>>          }
>>>>>>>>          case VFIO_SPAPR_TCE_v2_IOMMU:
>>>>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>>>>> index 5c65aa0a98..cad7deec71 100644
>>>>>>>> --- a/hw/vfio/pci.c
>>>>>>>> +++ b/hw/vfio/pci.c
>>>>>>>> @@ -2773,6 +2773,25 @@ static void
>>>>>>>> vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>>>>>>>          vdev->req_enabled = false;
>>>>>>>>      }
>>>>>>>>      +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t
>>>>>>>> devfn,
>>>>>>>> +                                      IOMMUConfig *config)
>>>>>>>> +{
>>>>>>>> +    PCIDevice *pdev = bus->devices[devfn];
>>>>>>>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>>>>>> +    VFIOContainer *container = vdev->vbasedev.group->container;
>>>>>>>> +    struct vfio_iommu_type1_set_pasid_table info;
>>>>>>>> +
>>>>>>>> +    info.argsz = sizeof(info);
>>>>>>>> +    info.flags = VFIO_PASID_TABLE_FLAG_SET;
>>>>>>>> +    memcpy(&info.config, &config->pasid_cfg,
>>>>>>>> sizeof(config->pasid_cfg));
>>>>>>>> +
>>>>>>>> +    return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE,
>>>>>>>> &info);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static PCIPASIDOps vfio_pci_pasid_ops = {
>>>>>>>> +    .set_pasid_table = vfio_iommu_set_pasid_table,
>>>>>>>> +};
>>>>>>>> +
>>>>>>>>      static void vfio_realize(PCIDevice *pdev, Error **errp)
>>>>>>>>      {
>>>>>>>>          VFIOPCIDevice *vdev = VFIO_PCI(pdev);
>>>>>>>> @@ -3084,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev,
>>>>>>>> Error
>>>>>>>> **errp)
>>>>>>>>          vfio_register_req_notifier(vdev);
>>>>>>>>          vfio_setup_resetfn_quirk(vdev);
>>>>>>>>      +    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
>>>>>>>> +
>>>>>>>>          return;
>>>>>>>>        out_deregister:
>>>>>>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>>>>>>>> index 936d29d150..43696afc15 100644
>>>>>>>> --- a/hw/vfio/trace-events
>>>>>>>> +++ b/hw/vfio/trace-events
>>>>>>>> @@ -120,6 +120,8 @@ vfio_region_sparse_mmap_header(const char
>>>>>>>> *name,
>>>>>>>> int index, int nr_areas) "Devic
>>>>>>>>      vfio_region_sparse_mmap_entry(int i, unsigned long start,
>>>>>>>> unsigned
>>>>>>>> long end) "sparse entry %d [0x%lx - 0x%lx]"
>>>>>>>>      vfio_get_dev_region(const char *name, int index, uint32_t
>>>>>>>> type,
>>>>>>>> uint32_t subtype) "%s index %d, %08x/%0x8"
>>>>>>>>      vfio_dma_unmap_overflow_workaround(void) ""
>>>>>>>> +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size,
>>>>>>>> uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d,
>>>>>>>> addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64"
>>>>>>>> leaf=%d"
>>>>>>>> +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate
>>>>>>>> asid=%d"
>>>>>>>>        # platform.c
>>>>>>>>      vfio_platform_base_device_init(char *name, int groupid) "%s
>>>>>>>> belongs
>>>>>>>> to group #%d"
>>>>>> .
>>>> .
>>>
>> .
> 
> 
>
Eric Auger Oct. 7, 2021, 4:58 p.m. UTC | #11
Hi Kunkun,

On 4/14/21 3:45 AM, Kunkun Jiang wrote:
> On 2021/4/13 20:57, Auger Eric wrote:
>> Hi Kunkun,
>>
>> On 4/13/21 2:10 PM, Kunkun Jiang wrote:
>>> Hi Eric,
>>>
>>> On 2021/4/11 20:08, Eric Auger wrote:
>>>> In nested mode, legacy vfio_iommu_map_notify cannot be used as
>>>> there is no "caching" mode and we do not trap on map.
>>>>
>>>> On Intel, vfio_iommu_map_notify was used to DMA map the RAM
>>>> through the host single stage.
>>>>
>>>> With nested mode, we need to setup the stage 2 and the stage 1
>>>> separately. This patch introduces a prereg_listener to setup
>>>> the stage 2 mapping.
>>>>
>>>> The stage 1 mapping, owned by the guest, is passed to the host
>>>> when the guest invalidates the stage 1 configuration, through
>>>> a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
>>>> are cascaded downto the host through another IOMMU MR UNMAP
>>>> notifier.
>>>>
>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>
>>>> ---
>>>>
>>>> v7 -> v8:
>>>> - properly handle new IOMMUTLBEntry fields and especially
>>>>     propagate DOMAIN and PASID based invalidations
>>>>
>>>> v6 -> v7:
>>>> - remove PASID based invalidation
>>>>
>>>> v5 -> v6:
>>>> - add error_report_err()
>>>> - remove the abort in case of nested stage case
>>>>
>>>> v4 -> v5:
>>>> - use VFIO_IOMMU_SET_PASID_TABLE
>>>> - use PCIPASIDOps for config notification
>>>>
>>>> v3 -> v4:
>>>> - use iommu_inv_pasid_info for ASID invalidation
>>>>
>>>> v2 -> v3:
>>>> - use VFIO_IOMMU_ATTACH_PASID_TABLE
>>>> - new user API
>>>> - handle leaf
>>>>
>>>> v1 -> v2:
>>>> - adapt to uapi changes
>>>> - pass the asid
>>>> - pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier
>>>> ---
>>>>    hw/vfio/common.c     | 139
>>>> +++++++++++++++++++++++++++++++++++++++++--
>>>>    hw/vfio/pci.c        |  21 +++++++
>>>>    hw/vfio/trace-events |   2 +
>>>>    3 files changed, 157 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>> index 0cd7ef2139..e369d451e7 100644
>>>> --- a/hw/vfio/common.c
>>>> +++ b/hw/vfio/common.c
>>>> @@ -595,6 +595,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry
>>>> *iotlb, void **vaddr,
>>>>        return true;
>>>>    }
>>>>    +/* Propagate a guest IOTLB invalidation to the host (nested
>>>> mode) */
>>>> +static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>> *iotlb)
>>>> +{
>>>> +    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>> +    struct vfio_iommu_type1_cache_invalidate ustruct = {};
>>>> +    VFIOContainer *container = giommu->container;
>>>> +    int ret;
>>>> +
>>>> +    assert(iotlb->perm == IOMMU_NONE);
>>>> +
>>>> +    ustruct.argsz = sizeof(ustruct);
>>>> +    ustruct.flags = 0;
>>>> +    ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info);
>>>> +    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
>>>> +    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
>>>> +
>>>> +    switch (iotlb->granularity) {
>>>> +    case IOMMU_INV_GRAN_DOMAIN:
>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
>>>> +        break;
>>>> +    case IOMMU_INV_GRAN_PASID:
>>>> +    {
>>>> +        struct iommu_inv_pasid_info *pasid_info;
>>>> +        int archid = -1;
>>>> +
>>>> +        pasid_info = &ustruct.info.granu.pasid_info;
>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>> +            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>> +            archid = iotlb->arch_id;
>>>> +        }
>>>> +        pasid_info->archid = archid;
>>>> +        trace_vfio_iommu_asid_inv_iotlb(archid);
>>>> +        break;
>>>> +    }
>>>> +    case IOMMU_INV_GRAN_ADDR:
>>>> +    {
>>>> +        hwaddr start = iotlb->iova + giommu->iommu_offset;
>>>> +        struct iommu_inv_addr_info *addr_info;
>>>> +        size_t size = iotlb->addr_mask + 1;
>>>> +        int archid = -1;
>>>> +
>>>> +        addr_info = &ustruct.info.granu.addr_info;
>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
>>>> +        if (iotlb->leaf) {
>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
>>>> +        }
>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>> +            archid = iotlb->arch_id;
>>>> +        }
>>>> +        addr_info->archid = archid;
>>>> +        addr_info->addr = start;
>>>> +        addr_info->granule_size = size;
>>>> +        addr_info->nb_granules = 1;
>>>> +        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
>>>> +                                        1, iotlb->leaf);
>>>> +        break;
>>>> +    }
>>> Should we pass a size to  host kernel here, even if vSMMU doesn't
>>> support
>>> RIL or guest kernel doesn't use RIL?
>>>
>>> It will cause TLBI issue in  this scenario: Guest kernel issues a
>>> TLBI cmd
>>> without "range" (tg = 0) to invalidate a 2M huge page. Then qemu passed
>>> the iova and size (4K) to host kernel. Finally, host kernel issues a
>>> TLBI cmd
>>> with "range" (4K) which can not invalidate the TLB entry of 2M huge
>>> page.
>>> (pSMMU supports RIL)
>> In that case the guest will loop over all 4K images belonging to the 2M
>> huge page and invalidate each of them. This should turn into qemu
>> notifications for each 4kB page, no? This is totally inefficient, hence
> The guest will not loop over all 4K images belonging to the 2M huge page.
> The iommu_iotlb_gather->pgsize will be 2M, if a page is 2M huge page. The
> gather->pgsize will be passed to __arm_smmu_tlb_inv_range as "granule":
>
> iommu_iotlb_gather_add_page
>     iommu_iotlb_sync
>         domain->ops->iotlb_sync
>             arm_smmu_iotlb_sync
>                 arm_smmu_tlb_inv_range_domain
>                     __arm_smmu_tlb_inv_range
>
> In the above mentioned scenario, guest kernel will issue a TLBI cmd
> only with
> "iova" (tg = 0).

I am currently respinning the SMMU part (the rest depends on
/dev/iommu). While thinking more about this issue you reported a long
time ago (sorry), I think a guest not using RIL is problematic wrt vSMMU
integration. Your case would not work with vhost either because when you
notify vhost you need to pass an invalidation range. In this specific
case there is none passed by the guest as the INVAL CMD just contains an
iova and that's it. Hope I do not miss anything. And I cannot guess the
range size. I only have the granule size from the CD and the IOVA. So
how can I properly invalidate in the vhost case, besides upgrading the API
to pass info telling that the addr range is not set, and invalidating the
whole vhost cache in that case?

So I would be inclined to support only RIL-capable guests for nested mode
and globally with vhost. I can't ignore your argument that the detection of
non-interoperability comes late, with the first invalidation with TG=0. But I
think the SMMU spec without RIL is just *not* virtualization compatible,
and this was first reported in Aug 2017
(https://lkml.org/lkml/2017/8/11/428) - yeah, so much time spent on this
without outcome :-(. Given the nested feature timeframe, now with the
/dev/iommu redesign, can't you consider it acceptable to require a guest
kernel >= 5.7, which features RIL?

Thanks

Eric
>
> Thanks,
> Kunkun Jiang
>> the support of RIL on guest side and QEMU device.
>>
>> What do I miss?
>>
>> Thanks
>>
>> Eric
>>> Thanks,
>>> Kunkun Jiang
>>>> +    }
>>>> +
>>>> +    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE,
>>>> &ustruct);
>>>> +    if (ret) {
>>>> +        error_report("%p: failed to invalidate CACHE (%d)",
>>>> container, ret);
>>>> +    }
>>>> +}
>>>> +
>>>>    static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>> *iotlb)
>>>>    {
>>>>        VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>> @@ -776,6 +843,35 @@ static void
>>>> vfio_dma_unmap_ram_section(VFIOContainer *container,
>>>>        }
>>>>    }
>>>>    +static void vfio_prereg_listener_region_add(MemoryListener
>>>> *listener,
>>>> +                                            MemoryRegionSection
>>>> *section)
>>>> +{
>>>> +    VFIOContainer *container =
>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>> +    Error *err = NULL;
>>>> +
>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    vfio_dma_map_ram_section(container, section, &err);
>>>> +    if (err) {
>>>> +        error_report_err(err);
>>>> +    }
>>>> +}
>>>> +static void vfio_prereg_listener_region_del(MemoryListener *listener,
>>>> +                                     MemoryRegionSection *section)
>>>> +{
>>>> +    VFIOContainer *container =
>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>> +
>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    vfio_dma_unmap_ram_section(container, section);
>>>> +}
>>>> +
>>>>    static void vfio_listener_region_add(MemoryListener *listener,
>>>>                                         MemoryRegionSection *section)
>>>>    {
>>>> @@ -879,9 +975,10 @@ static void
>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>        memory_region_ref(section->mr);
>>>>          if (memory_region_is_iommu(section->mr)) {
>>>> +        IOMMUNotify notify;
>>>>            VFIOGuestIOMMU *giommu;
>>>>            IOMMUMemoryRegion *iommu_mr =
>>>> IOMMU_MEMORY_REGION(section->mr);
>>>> -        int iommu_idx;
>>>> +        int iommu_idx, flags;
>>>>              trace_vfio_listener_region_add_iommu(iova, end);
>>>>            /*
>>>> @@ -900,8 +997,18 @@ static void
>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>            llend = int128_sub(llend, int128_one());
>>>>            iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
>>>>                                                         
>>>> MEMTXATTRS_UNSPECIFIED);
>>>> -        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
>>>> -                            IOMMU_NOTIFIER_IOTLB_EVENTS,
>>>> +
>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>> +            /* IOTLB unmap notifier to propagate guest IOTLB
>>>> invalidations */
>>>> +            flags = IOMMU_NOTIFIER_UNMAP;
>>>> +            notify = vfio_iommu_unmap_notify;
>>>> +        } else {
>>>> +            /* MAP/UNMAP IOTLB notifier */
>>>> +            flags = IOMMU_NOTIFIER_IOTLB_EVENTS;
>>>> +            notify = vfio_iommu_map_notify;
>>>> +        }
>>>> +
>>>> +        iommu_notifier_init(&giommu->n, notify, flags,
>>>>                                section->offset_within_region,
>>>>                                int128_get64(llend),
>>>>                                iommu_idx);
>>>> @@ -921,7 +1028,9 @@ static void
>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>                goto fail;
>>>>            }
>>>>            QLIST_INSERT_HEAD(&container->giommu_list, giommu,
>>>> giommu_next);
>>>> -        memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>> +        if (flags & IOMMU_NOTIFIER_MAP) {
>>>> +            memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>> +        }
>>>>              return;
>>>>        }
>>>> @@ -1205,10 +1314,16 @@ static const MemoryListener
>>>> vfio_memory_listener = {
>>>>        .log_sync = vfio_listener_log_sync,
>>>>    };
>>>>    +static MemoryListener vfio_memory_prereg_listener = {
>>>> +    .region_add = vfio_prereg_listener_region_add,
>>>> +    .region_del = vfio_prereg_listener_region_del,
>>>> +};
>>>> +
>>>>    static void vfio_listener_release(VFIOContainer *container)
>>>>    {
>>>>        memory_listener_unregister(&container->listener);
>>>> -    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>>>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
>>>> +        container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>            memory_listener_unregister(&container->prereg_listener);
>>>>        }
>>>>    }
>>>> @@ -1858,6 +1973,20 @@ static int vfio_connect_container(VFIOGroup
>>>> *group, AddressSpace *as,
>>>>                vfio_get_iommu_info_migration(container, info);
>>>>            }
>>>>            g_free(info);
>>>> +
>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>> +            container->prereg_listener = vfio_memory_prereg_listener;
>>>> +            memory_listener_register(&container->prereg_listener,
>>>> +                                     &address_space_memory);
>>>> +            if (container->error) {
>>>> +               
>>>> memory_listener_unregister(&container->prereg_listener);
>>>> +                ret = -1;
>>>> +                error_propagate_prepend(errp, container->error,
>>>> +                                    "RAM memory listener
>>>> initialization failed "
>>>> +                                    "for container");
>>>> +                goto free_container_exit;
>>>> +            }
>>>> +        }
>>>>            break;
>>>>        }
>>>>        case VFIO_SPAPR_TCE_v2_IOMMU:
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index 5c65aa0a98..cad7deec71 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -2773,6 +2773,25 @@ static void
>>>> vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>>>        vdev->req_enabled = false;
>>>>    }
>>>>    +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn,
>>>> +                                      IOMMUConfig *config)
>>>> +{
>>>> +    PCIDevice *pdev = bus->devices[devfn];
>>>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>> +    VFIOContainer *container = vdev->vbasedev.group->container;
>>>> +    struct vfio_iommu_type1_set_pasid_table info;
>>>> +
>>>> +    info.argsz = sizeof(info);
>>>> +    info.flags = VFIO_PASID_TABLE_FLAG_SET;
>>>> +    memcpy(&info.config, &config->pasid_cfg,
>>>> sizeof(config->pasid_cfg));
>>>> +
>>>> +    return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info);
>>>> +}
>>>> +
>>>> +static PCIPASIDOps vfio_pci_pasid_ops = {
>>>> +    .set_pasid_table = vfio_iommu_set_pasid_table,
>>>> +};
>>>> +
>>>>    static void vfio_realize(PCIDevice *pdev, Error **errp)
>>>>    {
>>>>        VFIOPCIDevice *vdev = VFIO_PCI(pdev);
>>>> @@ -3084,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error
>>>> **errp)
>>>>        vfio_register_req_notifier(vdev);
>>>>        vfio_setup_resetfn_quirk(vdev);
>>>>    +    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
>>>> +
>>>>        return;
>>>>      out_deregister:
>>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>>>> index 936d29d150..43696afc15 100644
>>>> --- a/hw/vfio/trace-events
>>>> +++ b/hw/vfio/trace-events
>>>> @@ -120,6 +120,8 @@ vfio_region_sparse_mmap_header(const char *name,
>>>> int index, int nr_areas) "Devic
>>>>    vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned
>>>> long end) "sparse entry %d [0x%lx - 0x%lx]"
>>>>    vfio_get_dev_region(const char *name, int index, uint32_t type,
>>>> uint32_t subtype) "%s index %d, %08x/%0x8"
>>>>    vfio_dma_unmap_overflow_workaround(void) ""
>>>> +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size,
>>>> uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d,
>>>> addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64"
>>>> leaf=%d"
>>>> +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
>>>>      # platform.c
>>>>    vfio_platform_base_device_init(char *name, int groupid) "%s belongs
>>>> to group #%d"
>>>
>>>
>> .
>
>
Kunkun Jiang Oct. 8, 2021, 2:13 a.m. UTC | #12
Hi Eric,

On 2021/10/8 0:58, Eric Auger wrote:
> Hi Kunkun,
>
> On 4/14/21 3:45 AM, Kunkun Jiang wrote:
>> On 2021/4/13 20:57, Auger Eric wrote:
>>> Hi Kunkun,
>>>
>>> On 4/13/21 2:10 PM, Kunkun Jiang wrote:
>>>> Hi Eric,
>>>>
>>>> On 2021/4/11 20:08, Eric Auger wrote:
>>>>> In nested mode, legacy vfio_iommu_map_notify cannot be used as
>>>>> there is no "caching" mode and we do not trap on map.
>>>>>
>>>>> On Intel, vfio_iommu_map_notify was used to DMA map the RAM
>>>>> through the host single stage.
>>>>>
>>>>> With nested mode, we need to setup the stage 2 and the stage 1
>>>>> separately. This patch introduces a prereg_listener to setup
>>>>> the stage 2 mapping.
>>>>>
>>>>> The stage 1 mapping, owned by the guest, is passed to the host
>>>>> when the guest invalidates the stage 1 configuration, through
>>>>> a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
>>>>> are cascaded downto the host through another IOMMU MR UNMAP
>>>>> notifier.
>>>>>
>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>>
>>>>> ---
>>>>>
>>>>> v7 -> v8:
>>>>> - properly handle new IOMMUTLBEntry fields and especially
>>>>>      propagate DOMAIN and PASID based invalidations
>>>>>
>>>>> v6 -> v7:
>>>>> - remove PASID based invalidation
>>>>>
>>>>> v5 -> v6:
>>>>> - add error_report_err()
>>>>> - remove the abort in case of nested stage case
>>>>>
>>>>> v4 -> v5:
>>>>> - use VFIO_IOMMU_SET_PASID_TABLE
>>>>> - use PCIPASIDOps for config notification
>>>>>
>>>>> v3 -> v4:
>>>>> - use iommu_inv_pasid_info for ASID invalidation
>>>>>
>>>>> v2 -> v3:
>>>>> - use VFIO_IOMMU_ATTACH_PASID_TABLE
>>>>> - new user API
>>>>> - handle leaf
>>>>>
>>>>> v1 -> v2:
>>>>> - adapt to uapi changes
>>>>> - pass the asid
>>>>> - pass IOMMU_NOTIFIER_S1_CFG when initializing the config notifier
>>>>> ---
>>>>>     hw/vfio/common.c     | 139
>>>>> +++++++++++++++++++++++++++++++++++++++++--
>>>>>     hw/vfio/pci.c        |  21 +++++++
>>>>>     hw/vfio/trace-events |   2 +
>>>>>     3 files changed, 157 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>>> index 0cd7ef2139..e369d451e7 100644
>>>>> --- a/hw/vfio/common.c
>>>>> +++ b/hw/vfio/common.c
>>>>> @@ -595,6 +595,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry
>>>>> *iotlb, void **vaddr,
>>>>>         return true;
>>>>>     }
>>>>>     +/* Propagate a guest IOTLB invalidation to the host (nested
>>>>> mode) */
>>>>> +static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>>> *iotlb)
>>>>> +{
>>>>> +    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>>> +    struct vfio_iommu_type1_cache_invalidate ustruct = {};
>>>>> +    VFIOContainer *container = giommu->container;
>>>>> +    int ret;
>>>>> +
>>>>> +    assert(iotlb->perm == IOMMU_NONE);
>>>>> +
>>>>> +    ustruct.argsz = sizeof(ustruct);
>>>>> +    ustruct.flags = 0;
>>>>> +    ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info);
>>>>> +    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
>>>>> +    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
>>>>> +
>>>>> +    switch (iotlb->granularity) {
>>>>> +    case IOMMU_INV_GRAN_DOMAIN:
>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
>>>>> +        break;
>>>>> +    case IOMMU_INV_GRAN_PASID:
>>>>> +    {
>>>>> +        struct iommu_inv_pasid_info *pasid_info;
>>>>> +        int archid = -1;
>>>>> +
>>>>> +        pasid_info = &ustruct.info.granu.pasid_info;
>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
>>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>>> +            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>>> +            archid = iotlb->arch_id;
>>>>> +        }
>>>>> +        pasid_info->archid = archid;
>>>>> +        trace_vfio_iommu_asid_inv_iotlb(archid);
>>>>> +        break;
>>>>> +    }
>>>>> +    case IOMMU_INV_GRAN_ADDR:
>>>>> +    {
>>>>> +        hwaddr start = iotlb->iova + giommu->iommu_offset;
>>>>> +        struct iommu_inv_addr_info *addr_info;
>>>>> +        size_t size = iotlb->addr_mask + 1;
>>>>> +        int archid = -1;
>>>>> +
>>>>> +        addr_info = &ustruct.info.granu.addr_info;
>>>>> +        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
>>>>> +        if (iotlb->leaf) {
>>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
>>>>> +        }
>>>>> +        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
>>>>> +            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
>>>>> +            archid = iotlb->arch_id;
>>>>> +        }
>>>>> +        addr_info->archid = archid;
>>>>> +        addr_info->addr = start;
>>>>> +        addr_info->granule_size = size;
>>>>> +        addr_info->nb_granules = 1;
>>>>> +        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
>>>>> +                                        1, iotlb->leaf);
>>>>> +        break;
>>>>> +    }
>>>> Should we pass a size to  host kernel here, even if vSMMU doesn't
>>>> support
>>>> RIL or guest kernel doesn't use RIL?
>>>>
>>>> It will cause TLBI issue in  this scenario: Guest kernel issues a
>>>> TLBI cmd
>>>> without "range" (tg = 0) to invalidate a 2M huge page. Then qemu passed
>>>> the iova and size (4K) to host kernel. Finally, host kernel issues a
>>>> TLBI cmd
>>>> with "range" (4K) which can not invalidate the TLB entry of 2M huge
>>>> page.
>>>> (pSMMU supports RIL)
>>> In that case the guest will loop over all 4K images belonging to the 2M
>>> huge page and invalidate each of them. This should turn into qemu
>>> notifications for each 4kB page, no? This is totally inefficient, hence
>> The guest will not loop over all 4K images belonging to the 2M huge page.
>> The iommu_iotlb_gather->pgsize will be 2M, if a page is 2M huge page. The
>> gather->pgsize will be passed to __arm_smmu_tlb_inv_range as "granule":
>>
>> iommu_iotlb_gather_add_page
>>      iommu_iotlb_sync
>>          domain->ops->iotlb_sync
>>              arm_smmu_iotlb_sync
>>                  arm_smmu_tlb_inv_range_domain
>>                      __arm_smmu_tlb_inv_range
>>
>> In the above mentioned scenario, guest kernel will issue a TLBI cmd
>> only with
>> "iova" (tg = 0).
> I am currently respinning the SMMU part (the rest depends on
> /dev/iommu). While thinking more about this issue you reported a long
> time ago (sorry), I think a guest not using RIL is problematic wrt vSMMU
> integration. Your case would not work with vhost either because when you
> notify vhost you need to pass an invalidation range. In this specific
> case there is none passed by the guest as the INVAL CMD just contains an
> iova and that's it. Hope I do not miss anything. And I cannot guess the
> range size. I only have the granule size from the CD and the IOVA. So
> how can I properly invalidate in the vhost case beside upgrading the API
> and pass the info telling the addr range is not set and invalidate the
> whole vhost cache in that case.
>
> So I would be inclined to support a RIL only guest for nested and
> globally with vhost. I can't ignore your argument that the detection of
> non interoperability comes late, with the first inval with TG=0. But I
> think the SMMU spec without RIL just is *not* virtualization compatible
> and this was first reported in Aug 2017
> (https://lkml.org/lkml/2017/8/11/428), yeah so much time spent on this
> without outcome :-(. Given the nested feature timeframe, now with this
> /dev/iommu redesign, can't you consider that using a guest kernel >=
> 5.7, featuring RIL now is acceptable.
Ok, I see. Thanks for your reply.
Looking forward to your new patch set.

Thanks,
Kunkun Jiang
> Thanks
>
> Eric
>> Thanks,
>> Kunkun Jiang
>>> the support of RIL on guest side and QEMU device.
>>>
>>> What do I miss?
>>>
>>> Thanks
>>>
>>> Eric
>>>> Thanks,
>>>> Kunkun Jiang
>>>>> +    }
>>>>> +
>>>>> +    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE,
>>>>> &ustruct);
>>>>> +    if (ret) {
>>>>> +        error_report("%p: failed to invalidate CACHE (%d)",
>>>>> container, ret);
>>>>> +    }
>>>>> +}
>>>>> +
>>>>>     static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry
>>>>> *iotlb)
>>>>>     {
>>>>>         VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>>>>> @@ -776,6 +843,35 @@ static void
>>>>> vfio_dma_unmap_ram_section(VFIOContainer *container,
>>>>>         }
>>>>>     }
>>>>>     +static void vfio_prereg_listener_region_add(MemoryListener
>>>>> *listener,
>>>>> +                                            MemoryRegionSection
>>>>> *section)
>>>>> +{
>>>>> +    VFIOContainer *container =
>>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>>> +    Error *err = NULL;
>>>>> +
>>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    vfio_dma_map_ram_section(container, section, &err);
>>>>> +    if (err) {
>>>>> +        error_report_err(err);
>>>>> +    }
>>>>> +}
>>>>> +static void vfio_prereg_listener_region_del(MemoryListener *listener,
>>>>> +                                     MemoryRegionSection *section)
>>>>> +{
>>>>> +    VFIOContainer *container =
>>>>> +        container_of(listener, VFIOContainer, prereg_listener);
>>>>> +
>>>>> +    if (!memory_region_is_ram(section->mr)) {
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    vfio_dma_unmap_ram_section(container, section);
>>>>> +}
>>>>> +
>>>>>     static void vfio_listener_region_add(MemoryListener *listener,
>>>>>                                          MemoryRegionSection *section)
>>>>>     {
>>>>> @@ -879,9 +975,10 @@ static void
>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>         memory_region_ref(section->mr);
>>>>>           if (memory_region_is_iommu(section->mr)) {
>>>>> +        IOMMUNotify notify;
>>>>>             VFIOGuestIOMMU *giommu;
>>>>>             IOMMUMemoryRegion *iommu_mr =
>>>>> IOMMU_MEMORY_REGION(section->mr);
>>>>> -        int iommu_idx;
>>>>> +        int iommu_idx, flags;
>>>>>               trace_vfio_listener_region_add_iommu(iova, end);
>>>>>             /*
>>>>> @@ -900,8 +997,18 @@ static void
>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>             llend = int128_sub(llend, int128_one());
>>>>>             iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
>>>>>                                                          
>>>>> MEMTXATTRS_UNSPECIFIED);
>>>>> -        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
>>>>> -                            IOMMU_NOTIFIER_IOTLB_EVENTS,
>>>>> +
>>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>> +            /* IOTLB unmap notifier to propagate guest IOTLB
>>>>> invalidations */
>>>>> +            flags = IOMMU_NOTIFIER_UNMAP;
>>>>> +            notify = vfio_iommu_unmap_notify;
>>>>> +        } else {
>>>>> +            /* MAP/UNMAP IOTLB notifier */
>>>>> +            flags = IOMMU_NOTIFIER_IOTLB_EVENTS;
>>>>> +            notify = vfio_iommu_map_notify;
>>>>> +        }
>>>>> +
>>>>> +        iommu_notifier_init(&giommu->n, notify, flags,
>>>>>                                 section->offset_within_region,
>>>>>                                 int128_get64(llend),
>>>>>                                 iommu_idx);
>>>>> @@ -921,7 +1028,9 @@ static void
>>>>> vfio_listener_region_add(MemoryListener *listener,
>>>>>                 goto fail;
>>>>>             }
>>>>>             QLIST_INSERT_HEAD(&container->giommu_list, giommu,
>>>>> giommu_next);
>>>>> -        memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>>> +        if (flags & IOMMU_NOTIFIER_MAP) {
>>>>> +            memory_region_iommu_replay(giommu->iommu, &giommu->n);
>>>>> +        }
>>>>>               return;
>>>>>         }
>>>>> @@ -1205,10 +1314,16 @@ static const MemoryListener
>>>>> vfio_memory_listener = {
>>>>>         .log_sync = vfio_listener_log_sync,
>>>>>     };
>>>>>     +static MemoryListener vfio_memory_prereg_listener = {
>>>>> +    .region_add = vfio_prereg_listener_region_add,
>>>>> +    .region_del = vfio_prereg_listener_region_del,
>>>>> +};
>>>>> +
>>>>>     static void vfio_listener_release(VFIOContainer *container)
>>>>>     {
>>>>>         memory_listener_unregister(&container->listener);
>>>>> -    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
>>>>> +    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
>>>>> +        container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>>             memory_listener_unregister(&container->prereg_listener);
>>>>>         }
>>>>>     }
>>>>> @@ -1858,6 +1973,20 @@ static int vfio_connect_container(VFIOGroup
>>>>> *group, AddressSpace *as,
>>>>>                 vfio_get_iommu_info_migration(container, info);
>>>>>             }
>>>>>             g_free(info);
>>>>> +
>>>>> +        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
>>>>> +            container->prereg_listener = vfio_memory_prereg_listener;
>>>>> +            memory_listener_register(&container->prereg_listener,
>>>>> +                                     &address_space_memory);
>>>>> +            if (container->error) {
>>>>> +
>>>>> memory_listener_unregister(&container->prereg_listener);
>>>>> +                ret = -1;
>>>>> +                error_propagate_prepend(errp, container->error,
>>>>> +                                    "RAM memory listener
>>>>> initialization failed "
>>>>> +                                    "for container");
>>>>> +                goto free_container_exit;
>>>>> +            }
>>>>> +        }
>>>>>             break;
>>>>>         }
>>>>>         case VFIO_SPAPR_TCE_v2_IOMMU:
>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>> index 5c65aa0a98..cad7deec71 100644
>>>>> --- a/hw/vfio/pci.c
>>>>> +++ b/hw/vfio/pci.c
>>>>> @@ -2773,6 +2773,25 @@ static void
>>>>> vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>>>>         vdev->req_enabled = false;
>>>>>     }
>>>>>     +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn,
>>>>> +                                      IOMMUConfig *config)
>>>>> +{
>>>>> +    PCIDevice *pdev = bus->devices[devfn];
>>>>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>>> +    VFIOContainer *container = vdev->vbasedev.group->container;
>>>>> +    struct vfio_iommu_type1_set_pasid_table info;
>>>>> +
>>>>> +    info.argsz = sizeof(info);
>>>>> +    info.flags = VFIO_PASID_TABLE_FLAG_SET;
>>>>> +    memcpy(&info.config, &config->pasid_cfg,
>>>>> sizeof(config->pasid_cfg));
>>>>> +
>>>>> +    return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info);
>>>>> +}
>>>>> +
>>>>> +static PCIPASIDOps vfio_pci_pasid_ops = {
>>>>> +    .set_pasid_table = vfio_iommu_set_pasid_table,
>>>>> +};
>>>>> +
>>>>>     static void vfio_realize(PCIDevice *pdev, Error **errp)
>>>>>     {
>>>>>         VFIOPCIDevice *vdev = VFIO_PCI(pdev);
>>>>> @@ -3084,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error
>>>>> **errp)
>>>>>         vfio_register_req_notifier(vdev);
>>>>>         vfio_setup_resetfn_quirk(vdev);
>>>>>     +    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
>>>>> +
>>>>>         return;
>>>>>       out_deregister:
>>>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>>>>> index 936d29d150..43696afc15 100644
>>>>> --- a/hw/vfio/trace-events
>>>>> +++ b/hw/vfio/trace-events
>>>>> @@ -120,6 +120,8 @@ vfio_region_sparse_mmap_header(const char *name,
>>>>> int index, int nr_areas) "Devic
>>>>>     vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned
>>>>> long end) "sparse entry %d [0x%lx - 0x%lx]"
>>>>>     vfio_get_dev_region(const char *name, int index, uint32_t type,
>>>>> uint32_t subtype) "%s index %d, %08x/%0x8"
>>>>>     vfio_dma_unmap_overflow_workaround(void) ""
>>>>> +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size,
>>>>> uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d,
>>>>> addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64"
>>>>> leaf=%d"
>>>>> +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
>>>>>       # platform.c
>>>>>     vfio_platform_base_device_init(char *name, int groupid) "%s belongs
>>>>> to group #%d"
>>>>
>>> .
>>
> .
diff mbox series

Patch

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 0cd7ef2139..e369d451e7 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -595,6 +595,73 @@  static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
     return true;
 }
 
+/* Propagate a guest IOTLB invalidation to the host (nested mode) */
+static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+{
+    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
+    struct vfio_iommu_type1_cache_invalidate ustruct = {};
+    VFIOContainer *container = giommu->container;
+    int ret;
+
+    assert(iotlb->perm == IOMMU_NONE);
+
+    ustruct.argsz = sizeof(ustruct);
+    ustruct.flags = 0;
+    ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info);
+    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
+    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
+
+    switch (iotlb->granularity) {
+    case IOMMU_INV_GRAN_DOMAIN:
+        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
+        break;
+    case IOMMU_INV_GRAN_PASID:
+    {
+        struct iommu_inv_pasid_info *pasid_info;
+        int archid = -1;
+
+        pasid_info = &ustruct.info.granu.pasid_info;
+        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
+        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
+            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
+            archid = iotlb->arch_id;
+        }
+        pasid_info->archid = archid;
+        trace_vfio_iommu_asid_inv_iotlb(archid);
+        break;
+    }
+    case IOMMU_INV_GRAN_ADDR:
+    {
+        hwaddr start = iotlb->iova + giommu->iommu_offset;
+        struct iommu_inv_addr_info *addr_info;
+        size_t size = iotlb->addr_mask + 1;
+        int archid = -1;
+
+        addr_info = &ustruct.info.granu.addr_info;
+        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
+        if (iotlb->leaf) {
+            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
+        }
+        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
+            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
+            archid = iotlb->arch_id;
+        }
+        addr_info->archid = archid;
+        addr_info->addr = start;
+        addr_info->granule_size = size;
+        addr_info->nb_granules = 1;
+        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
+                                        1, iotlb->leaf);
+        break;
+    }
+    }
+
+    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, &ustruct);
+    if (ret) {
+        error_report("%p: failed to invalidate CACHE (%d)", container, ret);
+    }
+}
+
 static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 {
     VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
@@ -776,6 +843,35 @@  static void vfio_dma_unmap_ram_section(VFIOContainer *container,
     }
 }
 
+static void vfio_prereg_listener_region_add(MemoryListener *listener,
+                                            MemoryRegionSection *section)
+{
+    VFIOContainer *container =
+        container_of(listener, VFIOContainer, prereg_listener);
+    Error *err = NULL;
+
+    if (!memory_region_is_ram(section->mr)) {
+        return;
+    }
+
+    vfio_dma_map_ram_section(container, section, &err);
+    if (err) {
+        error_report_err(err);
+    }
+}
+static void vfio_prereg_listener_region_del(MemoryListener *listener,
+                                     MemoryRegionSection *section)
+{
+    VFIOContainer *container =
+        container_of(listener, VFIOContainer, prereg_listener);
+
+    if (!memory_region_is_ram(section->mr)) {
+        return;
+    }
+
+    vfio_dma_unmap_ram_section(container, section);
+}
+
 static void vfio_listener_region_add(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
@@ -879,9 +975,10 @@  static void vfio_listener_region_add(MemoryListener *listener,
     memory_region_ref(section->mr);
 
     if (memory_region_is_iommu(section->mr)) {
+        IOMMUNotify notify;
         VFIOGuestIOMMU *giommu;
         IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
-        int iommu_idx;
+        int iommu_idx, flags;
 
         trace_vfio_listener_region_add_iommu(iova, end);
         /*
@@ -900,8 +997,18 @@  static void vfio_listener_region_add(MemoryListener *listener,
         llend = int128_sub(llend, int128_one());
         iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                        MEMTXATTRS_UNSPECIFIED);
-        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
-                            IOMMU_NOTIFIER_IOTLB_EVENTS,
+
+        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
+            /* IOTLB unmap notifier to propagate guest IOTLB invalidations */
+            flags = IOMMU_NOTIFIER_UNMAP;
+            notify = vfio_iommu_unmap_notify;
+        } else {
+            /* MAP/UNMAP IOTLB notifier */
+            flags = IOMMU_NOTIFIER_IOTLB_EVENTS;
+            notify = vfio_iommu_map_notify;
+        }
+
+        iommu_notifier_init(&giommu->n, notify, flags,
                             section->offset_within_region,
                             int128_get64(llend),
                             iommu_idx);
@@ -921,7 +1028,9 @@  static void vfio_listener_region_add(MemoryListener *listener,
             goto fail;
         }
         QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
-        memory_region_iommu_replay(giommu->iommu, &giommu->n);
+        if (flags & IOMMU_NOTIFIER_MAP) {
+            memory_region_iommu_replay(giommu->iommu, &giommu->n);
+        }
 
         return;
     }
@@ -1205,10 +1314,16 @@  static const MemoryListener vfio_memory_listener = {
     .log_sync = vfio_listener_log_sync,
 };
 
+static MemoryListener vfio_memory_prereg_listener = {
+    .region_add = vfio_prereg_listener_region_add,
+    .region_del = vfio_prereg_listener_region_del,
+};
+
 static void vfio_listener_release(VFIOContainer *container)
 {
     memory_listener_unregister(&container->listener);
-    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
+    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
+        container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
         memory_listener_unregister(&container->prereg_listener);
     }
 }
@@ -1858,6 +1973,25 @@  static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
             vfio_get_iommu_info_migration(container, info);
         }
         g_free(info);
+
+        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
+            /*
+             * Set up the stage 2 (GPA -> HPA) RAM mappings through a
+             * dedicated prereg listener. Registration errors are
+             * reported through container->error.
+             */
+            container->prereg_listener = vfio_memory_prereg_listener;
+            memory_listener_register(&container->prereg_listener,
+                                     &address_space_memory);
+            if (container->error) {
+                memory_listener_unregister(&container->prereg_listener);
+                ret = -1;
+                error_propagate_prepend(errp, container->error,
+                    "RAM memory listener initialization failed "
+                    "for container: ");
+                goto free_container_exit;
+            }
+        }
         break;
     }
     case VFIO_SPAPR_TCE_v2_IOMMU:
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 5c65aa0a98..cad7deec71 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2773,6 +2773,32 @@  static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
     vdev->req_enabled = false;
 }
 
+/*
+ * Pass the guest stage 1 PASID table configuration to the host
+ * through the VFIO_IOMMU_SET_PASID_TABLE ioctl.
+ * Returns 0 on success, -1 on failure (errno set by the ioctl).
+ */
+static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn,
+                                      IOMMUConfig *config)
+{
+    PCIDevice *pdev = bus->devices[devfn];
+    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
+    VFIOContainer *container = vdev->vbasedev.group->container;
+    /* zero-initialize so no stack garbage is passed to the kernel */
+    struct vfio_iommu_type1_set_pasid_table info = {
+        .argsz = sizeof(info),
+        .flags = VFIO_PASID_TABLE_FLAG_SET,
+    };
+
+    memcpy(&info.config, &config->pasid_cfg, sizeof(config->pasid_cfg));
+
+    return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info);
+}
+
+static PCIPASIDOps vfio_pci_pasid_ops = {
+    .set_pasid_table = vfio_iommu_set_pasid_table,
+};
+
 static void vfio_realize(PCIDevice *pdev, Error **errp)
 {
     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
@@ -3084,6 +3103,9 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
     vfio_register_req_notifier(vdev);
     vfio_setup_resetfn_quirk(vdev);
 
+    /* Let the IOMMU propagate guest PASID table changes down to the host */
+    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
+
     return;
 
 out_deregister:
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 936d29d150..43696afc15 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -120,6 +120,8 @@  vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Devic
 vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
 vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8"
 vfio_dma_unmap_overflow_workaround(void) ""
+vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t granule_size, uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d"
+vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
 
 # platform.c
 vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"