[v3,22/23] VT-d: replace all-contiguous page tables by superpage mappings

Message ID 58e8871c-0d5e-ec32-74ac-9137e8f2ce41@suse.com (mailing list archive)
State New, archived
Series IOMMU: superpage support when not sharing pagetables

Commit Message

Jan Beulich Jan. 10, 2022, 4:38 p.m. UTC
When a page table ends up with all contiguous entries (including all
identical attributes), it can be replaced by a superpage entry at the
next higher level. The page table itself can then be scheduled for
freeing.
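
For illustration, a standalone sketch of the condition this relies on (not
Xen's actual pt_update_contig_markers() logic; all names prefixed "sketch_"
are made up for the example): a table can be coalesced once every entry maps
the next physically contiguous frame, all attribute bits match, and the first
frame is aligned to the would-be superpage.

/*
 * Standalone sketch of the coalescing condition; types and names are
 * illustrative only and are not Xen's real interfaces.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PTE_NUM    512                     /* 9-bit stride */
#define SKETCH_ADDR_MASK  0x000ffffffffff000ULL   /* address bits 12..51 */
#define SKETCH_PAGE_SIZE  0x1000ULL

typedef struct { uint64_t val; } sketch_pte_t;

/*
 * A table at 'level' can be replaced by a single superpage entry one
 * level up iff every entry maps the next physically contiguous frame,
 * all non-address attribute bits are identical, and the base frame is
 * aligned to the would-be superpage size.
 */
static bool sketch_can_coalesce(const sketch_pte_t tbl[SKETCH_PTE_NUM],
                                unsigned int level)
{
    uint64_t step  = SKETCH_PAGE_SIZE << (9 * (level - 1));
    uint64_t attrs = tbl[0].val & ~SKETCH_ADDR_MASK;
    uint64_t base  = tbl[0].val & SKETCH_ADDR_MASK;

    if ( base & (step * SKETCH_PTE_NUM - 1) )
        return false;

    for ( unsigned int i = 1; i < SKETCH_PTE_NUM; ++i )
        if ( (tbl[i].val & ~SKETCH_ADDR_MASK) != attrs ||
             (tbl[i].val & SKETCH_ADDR_MASK) != base + i * step )
            return false;

    return true;
}

int main(void)
{
    static sketch_pte_t tbl[SKETCH_PTE_NUM];

    /* Level-1 table mapping 512 contiguous 4k pages, all with R/W set. */
    for ( unsigned int i = 0; i < SKETCH_PTE_NUM; ++i )
        tbl[i].val = (0x40000000ULL + i * SKETCH_PAGE_SIZE) | 3;

    printf("coalescible: %d\n", sketch_can_coalesce(tbl, 1)); /* 1 */

    tbl[7].val ^= 2;                       /* flip one attribute bit */
    printf("coalescible: %d\n", sketch_can_coalesce(tbl, 1)); /* 0 */

    return 0;
}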

The adjustment to LEVEL_MASK is merely to avoid leaving a latent trap
for whenever we (and obviously hardware) start supporting 512G mappings.
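
To make the latent trap concrete: with the old definition LEVEL_MASK is an
int, so once coalescing reaches the level feeding a 512G (level 4) entry the
mask gets shifted by level_to_offset_bits(3), i.e. 30 with the usual 4k base
page and 9-bit stride, and no longer fits 32-bit arithmetic. A standalone
sketch of the difference (not Xen code; the old variant is formally undefined
behaviour and is shown only for its typical effect):

/* Illustration of the LEVEL_MASK type trap; standalone, not Xen code. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define LEVEL_STRIDE     9
#define PTE_NUM          (1 << LEVEL_STRIDE)
#define OLD_LEVEL_MASK   ((1 << LEVEL_STRIDE) - 1)   /* int */
#define NEW_LEVEL_MASK   (PTE_NUM - 1UL)             /* unsigned long */

int main(void)
{
    /* A PTE-like value with address bits above bit 38 set. */
    uint64_t val = 0x0000f01234abc000ULL;
    unsigned int shift = 30;          /* level_to_offset_bits(3) */

    /*
     * int mask: the shift overflows 32-bit arithmetic (formally UB); on
     * typical compilers the truncated result makes ~(mask << 30) clear
     * everything from bit 30 upward, destroying the upper address bits
     * instead of just the nine index bits 30..38.
     */
    uint64_t old_result = val & ~(OLD_LEVEL_MASK << shift);

    /* unsigned long mask: stays 64-bit wide, clears only bits 30..38. */
    uint64_t new_result = val & ~(NEW_LEVEL_MASK << shift);

    printf("old: %#018" PRIx64 "\nnew: %#018" PRIx64 "\n",
           old_result, new_result);
    return 0;
}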

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Unlike the freeing of all-empty page tables, this causes quite a bit of
back and forth for PV domains, due to their mapping/unmapping of pages
when they get converted to/from being page tables. It may therefore be
worth considering delaying re-coalescing a little, to avoid doing so
when the superpage would otherwise get split again pretty soon. But I
think this is better left to a separate change anyway.

Of course this could also be helped by more "aware" kernel-side
behavior: kernels could avoid immediately mapping freed page tables
writable again, in anticipation of re-using the same page for another
page table elsewhere.
---
v3: New.

Comments

Tian, Kevin Feb. 18, 2022, 5:22 a.m. UTC | #1
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Tuesday, January 11, 2022 12:38 AM
> 
> When a page table ends up with all contiguous entries (including all
> identical attributes), it can be replaced by a superpage entry at the
> next higher level. The page table itself can then be scheduled for
> freeing.
> 
> The adjustment to LEVEL_MASK is merely to avoid leaving a latent trap
> for whenever we (and obviously hardware) start supporting 512G mappings.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> Unlike the freeing of all-empty page tables, this causes quite a bit of
> back and forth for PV domains, due to their mapping/unmapping of pages
> when they get converted to/from being page tables. It may therefore be
> worth considering delaying re-coalescing a little, to avoid doing so
> when the superpage would otherwise get split again pretty soon. But I
> think this is better left to a separate change anyway.
> 

Agree, thus:

	Reviewed-by: Kevin Tian <kevin.tian@intel.com>

Patch

--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2071,14 +2071,35 @@  static int __must_check intel_iommu_map_
      * While the (ab)use of PTE_kind_table here allows to save some work in
      * the function, the main motivation for it is that it avoids a so far
      * unexplained hang during boot (while preparing Dom0) on a Westmere
-     * based laptop.
+     * based laptop.  This also has the intended effect of terminating the
+     * loop when super pages aren't supported anymore at the next level.
      */
-    pt_update_contig_markers(&page->val,
-                             address_level_offset(dfn_to_daddr(dfn), level),
-                             level,
-                             (hd->platform_ops->page_sizes &
-                              (1UL << level_to_offset_bits(level + 1))
-                              ? PTE_kind_leaf : PTE_kind_table));
+    while ( pt_update_contig_markers(&page->val,
+                                     address_level_offset(dfn_to_daddr(dfn), level),
+                                     level,
+                                     (hd->platform_ops->page_sizes &
+                                      (1UL << level_to_offset_bits(level + 1))
+                                       ? PTE_kind_leaf : PTE_kind_table)) )
+    {
+        struct page_info *pg = maddr_to_page(pg_maddr);
+
+        unmap_vtd_domain_page(page);
+
+        new.val &= ~(LEVEL_MASK << level_to_offset_bits(level));
+        dma_set_pte_superpage(new);
+
+        pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), ++level,
+                                          flush_flags, false);
+        BUG_ON(pg_maddr < PAGE_SIZE);
+
+        page = map_vtd_domain_page(pg_maddr);
+        pte = &page[address_level_offset(dfn_to_daddr(dfn), level)];
+        *pte = new;
+        iommu_sync_cache(pte, sizeof(*pte));
+
+        *flush_flags |= IOMMU_FLUSHF_modified | IOMMU_FLUSHF_all;
+        iommu_queue_free_pgtable(d, pg);
+    }
 
     spin_unlock(&hd->arch.mapping_lock);
     unmap_vtd_domain_page(page);
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -229,7 +229,7 @@  struct context_entry {
 
 /* page table handling */
 #define LEVEL_STRIDE       (9)
-#define LEVEL_MASK         ((1 << LEVEL_STRIDE) - 1)
+#define LEVEL_MASK         (PTE_NUM - 1UL)
 #define PTE_NUM            (1 << LEVEL_STRIDE)
 #define level_to_agaw(val) ((val) - 2)
 #define agaw_to_level(val) ((val) + 2)