diff mbox series

[v4,03/14] x86/iommu: convert VT-d code to use new page table allocator

Message ID 20200804134209.8717-4-paul@xen.org
State Superseded
Headers show
Series IOMMU cleanup | expand

Commit Message

Paul Durrant Aug. 4, 2020, 1:41 p.m. UTC
From: Paul Durrant <pdurrant@amazon.com>

This patch converts the VT-d code to use the new IOMMU page table allocator
function. This allows all the free-ing code to be removed (since it is now
handled by the general x86 code) which reduces TLB and cache thrashing as well
as shortening the code.

The scope of the mapping_lock in intel_iommu_quarantine_init() has also been
increased slightly; it should have always covered accesses to
'arch.vtd.pgd_maddr'.

NOTE: The common IOMMU needs a slight modification to avoid scheduling the
      cleanup tasklet if the free_page_table() method is not present (since
      the tasklet will unconditionally call it).

Signed-off-by: Paul Durrant <pdurrant@amazon.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
Cc: Kevin Tian <kevin.tian@intel.com>

v2:
 - New in v2 (split from "add common page-table allocator")
---
 xen/drivers/passthrough/iommu.c     |   6 +-
 xen/drivers/passthrough/vtd/iommu.c | 101 ++++++++++------------------
 2 files changed, 39 insertions(+), 68 deletions(-)

Comments

Tian, Kevin Aug. 14, 2020, 6:41 a.m. UTC | #1
> From: Paul Durrant <paul@xen.org>
> Sent: Tuesday, August 4, 2020 9:42 PM
> 
> From: Paul Durrant <pdurrant@amazon.com>
> 
> This patch converts the VT-d code to use the new IOMMU page table
> allocator
> function. This allows all the free-ing code to be removed (since it is now
> handled by the general x86 code) which reduces TLB and cache thrashing as
> well
> as shortening the code.
> 
> The scope of the mapping_lock in intel_iommu_quarantine_init() has also
> been
> increased slightly; it should have always covered accesses to
> 'arch.vtd.pgd_maddr'.
> 
> NOTE: The common IOMMU needs a slight modification to avoid scheduling
> the
>       cleanup tasklet if the free_page_table() method is not present (since
>       the tasklet will unconditionally call it).
> 
> Signed-off-by: Paul Durrant <pdurrant@amazon.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>
> ---
> Cc: Kevin Tian <kevin.tian@intel.com>
> 
> v2:
>  - New in v2 (split from "add common page-table allocator")
> ---
>  xen/drivers/passthrough/iommu.c     |   6 +-
>  xen/drivers/passthrough/vtd/iommu.c | 101 ++++++++++------------------
>  2 files changed, 39 insertions(+), 68 deletions(-)
> 
> diff --git a/xen/drivers/passthrough/iommu.c
> b/xen/drivers/passthrough/iommu.c
> index 1d644844ab..2b1db8022c 100644
> --- a/xen/drivers/passthrough/iommu.c
> +++ b/xen/drivers/passthrough/iommu.c
> @@ -225,8 +225,10 @@ static void iommu_teardown(struct domain *d)
>  {
>      struct domain_iommu *hd = dom_iommu(d);
> 
> -    hd->platform_ops->teardown(d);
> -    tasklet_schedule(&iommu_pt_cleanup_tasklet);
> +    iommu_vcall(hd->platform_ops, teardown, d);
> +
> +    if ( hd->platform_ops->free_page_table )
> +        tasklet_schedule(&iommu_pt_cleanup_tasklet);
>  }
> 
>  void iommu_domain_destroy(struct domain *d)
> diff --git a/xen/drivers/passthrough/vtd/iommu.c
> b/xen/drivers/passthrough/vtd/iommu.c
> index 94e0455a4d..607e8b5e65 100644
> --- a/xen/drivers/passthrough/vtd/iommu.c
> +++ b/xen/drivers/passthrough/vtd/iommu.c
> @@ -265,10 +265,15 @@ static u64 addr_to_dma_page_maddr(struct
> domain *domain, u64 addr, int alloc)
> 
>      addr &= (((u64)1) << addr_width) - 1;
>      ASSERT(spin_is_locked(&hd->arch.mapping_lock));
> -    if ( !hd->arch.vtd.pgd_maddr &&
> -         (!alloc ||
> -          ((hd->arch.vtd.pgd_maddr = alloc_pgtable_maddr(1, hd->node)) ==
> 0)) )
> -        goto out;
> +    if ( !hd->arch.vtd.pgd_maddr )
> +    {
> +        struct page_info *pg;
> +
> +        if ( !alloc || !(pg = iommu_alloc_pgtable(domain)) )
> +            goto out;
> +
> +        hd->arch.vtd.pgd_maddr = page_to_maddr(pg);
> +    }
> 
>      parent = (struct dma_pte *)map_vtd_domain_page(hd-
> >arch.vtd.pgd_maddr);
>      while ( level > 1 )
> @@ -279,13 +284,16 @@ static u64 addr_to_dma_page_maddr(struct
> domain *domain, u64 addr, int alloc)
>          pte_maddr = dma_pte_addr(*pte);
>          if ( !pte_maddr )
>          {
> +            struct page_info *pg;
> +
>              if ( !alloc )
>                  break;
> 
> -            pte_maddr = alloc_pgtable_maddr(1, hd->node);
> -            if ( !pte_maddr )
> +            pg = iommu_alloc_pgtable(domain);
> +            if ( !pg )
>                  break;
> 
> +            pte_maddr = page_to_maddr(pg);
>              dma_set_pte_addr(*pte, pte_maddr);
> 
>              /*
> @@ -675,45 +683,6 @@ static void dma_pte_clear_one(struct domain
> *domain, uint64_t addr,
>      unmap_vtd_domain_page(page);
>  }
> 
> -static void iommu_free_pagetable(u64 pt_maddr, int level)
> -{
> -    struct page_info *pg = maddr_to_page(pt_maddr);
> -
> -    if ( pt_maddr == 0 )
> -        return;
> -
> -    PFN_ORDER(pg) = level;
> -    spin_lock(&iommu_pt_cleanup_lock);
> -    page_list_add_tail(pg, &iommu_pt_cleanup_list);
> -    spin_unlock(&iommu_pt_cleanup_lock);
> -}
> -
> -static void iommu_free_page_table(struct page_info *pg)
> -{
> -    unsigned int i, next_level = PFN_ORDER(pg) - 1;
> -    u64 pt_maddr = page_to_maddr(pg);
> -    struct dma_pte *pt_vaddr, *pte;
> -
> -    PFN_ORDER(pg) = 0;
> -    pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
> -
> -    for ( i = 0; i < PTE_NUM; i++ )
> -    {
> -        pte = &pt_vaddr[i];
> -        if ( !dma_pte_present(*pte) )
> -            continue;
> -
> -        if ( next_level >= 1 )
> -            iommu_free_pagetable(dma_pte_addr(*pte), next_level);
> -
> -        dma_clear_pte(*pte);
> -        iommu_sync_cache(pte, sizeof(struct dma_pte));

I didn't see sync_cache in the new iommu_free_pgtables. Is it intended
(i.e. original flush is meaningless) or overlooked?

Thanks
Kevin

> -    }
> -
> -    unmap_vtd_domain_page(pt_vaddr);
> -    free_pgtable_maddr(pt_maddr);
> -}
> -
>  static int iommu_set_root_entry(struct vtd_iommu *iommu)
>  {
>      u32 sts;
> @@ -1748,16 +1717,7 @@ static void iommu_domain_teardown(struct
> domain *d)
>          xfree(mrmrr);
>      }
> 
> -    ASSERT(is_iommu_enabled(d));
> -
> -    if ( iommu_use_hap_pt(d) )
> -        return;
> -
> -    spin_lock(&hd->arch.mapping_lock);
> -    iommu_free_pagetable(hd->arch.vtd.pgd_maddr,
> -                         agaw_to_level(hd->arch.vtd.agaw));
>      hd->arch.vtd.pgd_maddr = 0;
> -    spin_unlock(&hd->arch.mapping_lock);
>  }
> 
>  static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn,
> @@ -2669,23 +2629,28 @@ static void vtd_dump_p2m_table(struct domain
> *d)
>  static int __init intel_iommu_quarantine_init(struct domain *d)
>  {
>      struct domain_iommu *hd = dom_iommu(d);
> +    struct page_info *pg;
>      struct dma_pte *parent;
>      unsigned int agaw =
> width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
>      unsigned int level = agaw_to_level(agaw);
> -    int rc;
> +    int rc = 0;
> +
> +    spin_lock(&hd->arch.mapping_lock);
> 
>      if ( hd->arch.vtd.pgd_maddr )
>      {
>          ASSERT_UNREACHABLE();
> -        return 0;
> +        goto out;
>      }
> 
> -    spin_lock(&hd->arch.mapping_lock);
> +    pg = iommu_alloc_pgtable(d);
> 
> -    hd->arch.vtd.pgd_maddr = alloc_pgtable_maddr(1, hd->node);
> -    if ( !hd->arch.vtd.pgd_maddr )
> +    rc = -ENOMEM;
> +    if ( !pg )
>          goto out;
> 
> +    hd->arch.vtd.pgd_maddr = page_to_maddr(pg);
> +
>      parent = map_vtd_domain_page(hd->arch.vtd.pgd_maddr);
>      while ( level )
>      {
> @@ -2697,10 +2662,12 @@ static int __init
> intel_iommu_quarantine_init(struct domain *d)
>           * page table pages, and the resulting allocations are always
>           * zeroed.
>           */
> -        maddr = alloc_pgtable_maddr(1, hd->node);
> -        if ( !maddr )
> -            break;
> +        pg = iommu_alloc_pgtable(d);
> +
> +        if ( !pg )
> +            goto out;
> 
> +        maddr = page_to_maddr(pg);
>          for ( offset = 0; offset < PTE_NUM; offset++ )
>          {
>              struct dma_pte *pte = &parent[offset];
> @@ -2716,13 +2683,16 @@ static int __init
> intel_iommu_quarantine_init(struct domain *d)
>      }
>      unmap_vtd_domain_page(parent);
> 
> +    rc = 0;
> +
>   out:
>      spin_unlock(&hd->arch.mapping_lock);
> 
> -    rc = iommu_flush_iotlb_all(d);
> +    if ( !rc )
> +        rc = iommu_flush_iotlb_all(d);
> 
> -    /* Pages leaked in failure case */
> -    return level ? -ENOMEM : rc;
> +    /* Pages may be leaked in failure case */
> +    return rc;
>  }
> 
>  static struct iommu_ops __initdata vtd_ops = {
> @@ -2737,7 +2707,6 @@ static struct iommu_ops __initdata vtd_ops = {
>      .map_page = intel_iommu_map_page,
>      .unmap_page = intel_iommu_unmap_page,
>      .lookup_page = intel_iommu_lookup_page,
> -    .free_page_table = iommu_free_page_table,
>      .reassign_device = reassign_device_ownership,
>      .get_device_group_id = intel_iommu_group_id,
>      .enable_x2apic = intel_iommu_enable_eim,
> --
> 2.20.1
Durrant, Paul Aug. 14, 2020, 7:16 a.m. UTC | #2
> -----Original Message-----
[snip]
> > -static void iommu_free_page_table(struct page_info *pg)
> > -{
> > -    unsigned int i, next_level = PFN_ORDER(pg) - 1;
> > -    u64 pt_maddr = page_to_maddr(pg);
> > -    struct dma_pte *pt_vaddr, *pte;
> > -
> > -    PFN_ORDER(pg) = 0;
> > -    pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
> > -
> > -    for ( i = 0; i < PTE_NUM; i++ )
> > -    {
> > -        pte = &pt_vaddr[i];
> > -        if ( !dma_pte_present(*pte) )
> > -            continue;
> > -
> > -        if ( next_level >= 1 )
> > -            iommu_free_pagetable(dma_pte_addr(*pte), next_level);
> > -
> > -        dma_clear_pte(*pte);
> > -        iommu_sync_cache(pte, sizeof(struct dma_pte));
> 
> I didn't see sync_cache in the new iommu_free_pgtables. Is it intended
> (i.e. original flush is meaningless) or overlooked?
> 

The original v1 combined patch had the comment:

NOTE: There is no need to clear and sync PTEs during teardown since the per-
      device root entries will have already been cleared (when devices were
      de-assigned) so the page tables can no longer be accessed by the IOMMU.

I should have included that note in this one. I'll fix in v5.

  Paul

> Thanks
> Kevin
diff mbox series

Patch

diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
index 1d644844ab..2b1db8022c 100644
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -225,8 +225,10 @@  static void iommu_teardown(struct domain *d)
 {
     struct domain_iommu *hd = dom_iommu(d);
 
-    hd->platform_ops->teardown(d);
-    tasklet_schedule(&iommu_pt_cleanup_tasklet);
+    iommu_vcall(hd->platform_ops, teardown, d);
+
+    if ( hd->platform_ops->free_page_table )
+        tasklet_schedule(&iommu_pt_cleanup_tasklet);
 }
 
 void iommu_domain_destroy(struct domain *d)
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index 94e0455a4d..607e8b5e65 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -265,10 +265,15 @@  static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
 
     addr &= (((u64)1) << addr_width) - 1;
     ASSERT(spin_is_locked(&hd->arch.mapping_lock));
-    if ( !hd->arch.vtd.pgd_maddr &&
-         (!alloc ||
-          ((hd->arch.vtd.pgd_maddr = alloc_pgtable_maddr(1, hd->node)) == 0)) )
-        goto out;
+    if ( !hd->arch.vtd.pgd_maddr )
+    {
+        struct page_info *pg;
+
+        if ( !alloc || !(pg = iommu_alloc_pgtable(domain)) )
+            goto out;
+
+        hd->arch.vtd.pgd_maddr = page_to_maddr(pg);
+    }
 
     parent = (struct dma_pte *)map_vtd_domain_page(hd->arch.vtd.pgd_maddr);
     while ( level > 1 )
@@ -279,13 +284,16 @@  static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
         pte_maddr = dma_pte_addr(*pte);
         if ( !pte_maddr )
         {
+            struct page_info *pg;
+
             if ( !alloc )
                 break;
 
-            pte_maddr = alloc_pgtable_maddr(1, hd->node);
-            if ( !pte_maddr )
+            pg = iommu_alloc_pgtable(domain);
+            if ( !pg )
                 break;
 
+            pte_maddr = page_to_maddr(pg);
             dma_set_pte_addr(*pte, pte_maddr);
 
             /*
@@ -675,45 +683,6 @@  static void dma_pte_clear_one(struct domain *domain, uint64_t addr,
     unmap_vtd_domain_page(page);
 }
 
-static void iommu_free_pagetable(u64 pt_maddr, int level)
-{
-    struct page_info *pg = maddr_to_page(pt_maddr);
-
-    if ( pt_maddr == 0 )
-        return;
-
-    PFN_ORDER(pg) = level;
-    spin_lock(&iommu_pt_cleanup_lock);
-    page_list_add_tail(pg, &iommu_pt_cleanup_list);
-    spin_unlock(&iommu_pt_cleanup_lock);
-}
-
-static void iommu_free_page_table(struct page_info *pg)
-{
-    unsigned int i, next_level = PFN_ORDER(pg) - 1;
-    u64 pt_maddr = page_to_maddr(pg);
-    struct dma_pte *pt_vaddr, *pte;
-
-    PFN_ORDER(pg) = 0;
-    pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
-
-    for ( i = 0; i < PTE_NUM; i++ )
-    {
-        pte = &pt_vaddr[i];
-        if ( !dma_pte_present(*pte) )
-            continue;
-
-        if ( next_level >= 1 )
-            iommu_free_pagetable(dma_pte_addr(*pte), next_level);
-
-        dma_clear_pte(*pte);
-        iommu_sync_cache(pte, sizeof(struct dma_pte));
-    }
-
-    unmap_vtd_domain_page(pt_vaddr);
-    free_pgtable_maddr(pt_maddr);
-}
-
 static int iommu_set_root_entry(struct vtd_iommu *iommu)
 {
     u32 sts;
@@ -1748,16 +1717,7 @@  static void iommu_domain_teardown(struct domain *d)
         xfree(mrmrr);
     }
 
-    ASSERT(is_iommu_enabled(d));
-
-    if ( iommu_use_hap_pt(d) )
-        return;
-
-    spin_lock(&hd->arch.mapping_lock);
-    iommu_free_pagetable(hd->arch.vtd.pgd_maddr,
-                         agaw_to_level(hd->arch.vtd.agaw));
     hd->arch.vtd.pgd_maddr = 0;
-    spin_unlock(&hd->arch.mapping_lock);
 }
 
 static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn,
@@ -2669,23 +2629,28 @@  static void vtd_dump_p2m_table(struct domain *d)
 static int __init intel_iommu_quarantine_init(struct domain *d)
 {
     struct domain_iommu *hd = dom_iommu(d);
+    struct page_info *pg;
     struct dma_pte *parent;
     unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
     unsigned int level = agaw_to_level(agaw);
-    int rc;
+    int rc = 0;
+
+    spin_lock(&hd->arch.mapping_lock);
 
     if ( hd->arch.vtd.pgd_maddr )
     {
         ASSERT_UNREACHABLE();
-        return 0;
+        goto out;
     }
 
-    spin_lock(&hd->arch.mapping_lock);
+    pg = iommu_alloc_pgtable(d);
 
-    hd->arch.vtd.pgd_maddr = alloc_pgtable_maddr(1, hd->node);
-    if ( !hd->arch.vtd.pgd_maddr )
+    rc = -ENOMEM;
+    if ( !pg )
         goto out;
 
+    hd->arch.vtd.pgd_maddr = page_to_maddr(pg);
+
     parent = map_vtd_domain_page(hd->arch.vtd.pgd_maddr);
     while ( level )
     {
@@ -2697,10 +2662,12 @@  static int __init intel_iommu_quarantine_init(struct domain *d)
          * page table pages, and the resulting allocations are always
          * zeroed.
          */
-        maddr = alloc_pgtable_maddr(1, hd->node);
-        if ( !maddr )
-            break;
+        pg = iommu_alloc_pgtable(d);
+
+        if ( !pg )
+            goto out;
 
+        maddr = page_to_maddr(pg);
         for ( offset = 0; offset < PTE_NUM; offset++ )
         {
             struct dma_pte *pte = &parent[offset];
@@ -2716,13 +2683,16 @@  static int __init intel_iommu_quarantine_init(struct domain *d)
     }
     unmap_vtd_domain_page(parent);
 
+    rc = 0;
+
  out:
     spin_unlock(&hd->arch.mapping_lock);
 
-    rc = iommu_flush_iotlb_all(d);
+    if ( !rc )
+        rc = iommu_flush_iotlb_all(d);
 
-    /* Pages leaked in failure case */
-    return level ? -ENOMEM : rc;
+    /* Pages may be leaked in failure case */
+    return rc;
 }
 
 static struct iommu_ops __initdata vtd_ops = {
@@ -2737,7 +2707,6 @@  static struct iommu_ops __initdata vtd_ops = {
     .map_page = intel_iommu_map_page,
     .unmap_page = intel_iommu_unmap_page,
     .lookup_page = intel_iommu_lookup_page,
-    .free_page_table = iommu_free_page_table,
     .reassign_device = reassign_device_ownership,
     .get_device_group_id = intel_iommu_group_id,
     .enable_x2apic = intel_iommu_enable_eim,