diff mbox series

[v3,02/23] VT-d: have callers specify the target level for page table walks

Message ID 5bfc3618-12ff-f673-e880-6ea13bcf8fa3@suse.com (mailing list archive)
State New, archived
Headers show
Series IOMMU: superpage support when not sharing pagetables | expand

Commit Message

Jan Beulich Jan. 10, 2022, 4:22 p.m. UTC
In order to be able to insert/remove super-pages we need to allow
callers of the walking function to specify at which point to stop the
walk.

For intel_iommu_lookup_page() integrate the last level access into
the main walking function.

dma_pte_clear_one() gets only partly adjusted for now: Error handling
and order parameter get put in place, but the order parameter remains
ignored (just like intel_iommu_map_page()'s order part of the flags).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
I was actually wondering whether it wouldn't make sense to integrate
dma_pte_clear_one() into its only caller intel_iommu_unmap_page(), for
better symmetry with intel_iommu_map_page().
---
v2: Fix build.

Comments

Tian, Kevin Jan. 30, 2022, 3:17 a.m. UTC | #1
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Tuesday, January 11, 2022 12:23 AM
> 
> In order to be able to insert/remove super-pages we need to allow
> callers of the walking function to specify at which point to stop the
> walk.
> 
> For intel_iommu_lookup_page() integrate the last level access into
> the main walking function.
> 
> dma_pte_clear_one() gets only partly adjusted for now: Error handling
> and order parameter get put in place, but the order parameter remains
> ignored (just like intel_iommu_map_page()'s order part of the flags).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> I was actually wondering whether it wouldn't make sense to integrate
> dma_pte_clear_one() into its only caller intel_iommu_unmap_page(), for
> better symmetry with intel_iommu_map_page().

I think it's the right thing to do. It was there due to multiple callers
when first introduced. But now, given there is only one caller, merging it
with the caller for symmetry makes sense.

with or without that change (given it's simple):

	Reviewed-by: Kevin Tian <kevin.tian@intel.com>

> ---
> v2: Fix build.
> 
> --- a/xen/drivers/passthrough/vtd/iommu.c
> +++ b/xen/drivers/passthrough/vtd/iommu.c
> @@ -347,63 +347,116 @@ static u64 bus_to_context_maddr(struct v
>      return maddr;
>  }
> 
> -static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int
> alloc)
> +/*
> + * This function walks (and if requested allocates) page tables to the
> + * designated target level. It returns
> + * - 0 when a non-present entry was encountered and no allocation was
> + *   requested,
> + * - a small positive value (the level, i.e. below PAGE_SIZE) upon allocation
> + *   failure,
> + * - for target > 0 the physical address of the page table holding the leaf
> + *   PTE for the requested address,
> + * - for target == 0 the full PTE.
> + */
> +static uint64_t addr_to_dma_page_maddr(struct domain *domain, daddr_t
> addr,
> +                                       unsigned int target,
> +                                       unsigned int *flush_flags, bool alloc)
>  {
>      struct domain_iommu *hd = dom_iommu(domain);
>      int addr_width = agaw_to_width(hd->arch.vtd.agaw);
>      struct dma_pte *parent, *pte = NULL;
> -    int level = agaw_to_level(hd->arch.vtd.agaw);
> -    int offset;
> +    unsigned int level = agaw_to_level(hd->arch.vtd.agaw), offset;
>      u64 pte_maddr = 0;
> 
>      addr &= (((u64)1) << addr_width) - 1;
>      ASSERT(spin_is_locked(&hd->arch.mapping_lock));
> +    ASSERT(target || !alloc);
> +
>      if ( !hd->arch.vtd.pgd_maddr )
>      {
>          struct page_info *pg;
> 
> -        if ( !alloc || !(pg = iommu_alloc_pgtable(domain)) )
> +        if ( !alloc )
> +            goto out;
> +
> +        pte_maddr = level;
> +        if ( !(pg = iommu_alloc_pgtable(domain)) )
>              goto out;
> 
>          hd->arch.vtd.pgd_maddr = page_to_maddr(pg);
>      }
> 
> -    parent = (struct dma_pte *)map_vtd_domain_page(hd-
> >arch.vtd.pgd_maddr);
> -    while ( level > 1 )
> +    pte_maddr = hd->arch.vtd.pgd_maddr;
> +    parent = map_vtd_domain_page(pte_maddr);
> +    while ( level > target )
>      {
>          offset = address_level_offset(addr, level);
>          pte = &parent[offset];
> 
>          pte_maddr = dma_pte_addr(*pte);
> -        if ( !pte_maddr )
> +        if ( !dma_pte_present(*pte) || (level > 1 &&
> dma_pte_superpage(*pte)) )
>          {
>              struct page_info *pg;
> +            /*
> +             * Higher level tables always set r/w, last level page table
> +             * controls read/write.
> +             */
> +            struct dma_pte new_pte = { DMA_PTE_PROT };
> 
>              if ( !alloc )
> -                break;
> +            {
> +                pte_maddr = 0;
> +                if ( !dma_pte_present(*pte) )
> +                    break;
> +
> +                /*
> +                 * When the leaf entry was requested, pass back the full PTE,
> +                 * with the address adjusted to account for the residual of
> +                 * the walk.
> +                 */
> +                pte_maddr = pte->val +
> +                    (addr & ((1UL << level_to_offset_bits(level)) - 1) &
> +                     PAGE_MASK);
> +                if ( !target )
> +                    break;
> +            }
> 
> +            pte_maddr = level - 1;
>              pg = iommu_alloc_pgtable(domain);
>              if ( !pg )
>                  break;
> 
>              pte_maddr = page_to_maddr(pg);
> -            dma_set_pte_addr(*pte, pte_maddr);
> +            dma_set_pte_addr(new_pte, pte_maddr);
> 
> -            /*
> -             * high level table always sets r/w, last level
> -             * page table control read/write
> -             */
> -            dma_set_pte_readable(*pte);
> -            dma_set_pte_writable(*pte);
> +            if ( dma_pte_present(*pte) )
> +            {
> +                struct dma_pte *split = map_vtd_domain_page(pte_maddr);
> +                unsigned long inc = 1UL << level_to_offset_bits(level - 1);
> +
> +                split[0].val = pte->val;
> +                if ( inc == PAGE_SIZE )
> +                    split[0].val &= ~DMA_PTE_SP;
> +
> +                for ( offset = 1; offset < PTE_NUM; ++offset )
> +                    split[offset].val = split[offset - 1].val + inc;
> +
> +                iommu_sync_cache(split, PAGE_SIZE);
> +                unmap_vtd_domain_page(split);
> +
> +                if ( flush_flags )
> +                    *flush_flags |= IOMMU_FLUSHF_modified;
> +            }
> +
> +            write_atomic(&pte->val, new_pte.val);
>              iommu_sync_cache(pte, sizeof(struct dma_pte));
>          }
> 
> -        if ( level == 2 )
> +        if ( --level == target )
>              break;
> 
>          unmap_vtd_domain_page(parent);
>          parent = map_vtd_domain_page(pte_maddr);
> -        level--;
>      }
> 
>      unmap_vtd_domain_page(parent);
> @@ -430,7 +483,7 @@ static uint64_t domain_pgd_maddr(struct
>          if ( !hd->arch.vtd.pgd_maddr )
>          {
>              /* Ensure we have pagetables allocated down to leaf PTE. */
> -            addr_to_dma_page_maddr(d, 0, 1);
> +            addr_to_dma_page_maddr(d, 0, 1, NULL, true);
> 
>              if ( !hd->arch.vtd.pgd_maddr )
>                  return 0;
> @@ -770,8 +823,9 @@ static int __must_check iommu_flush_iotl
>  }
> 
>  /* clear one page's page table */
> -static void dma_pte_clear_one(struct domain *domain, uint64_t addr,
> -                              unsigned int *flush_flags)
> +static int dma_pte_clear_one(struct domain *domain, daddr_t addr,
> +                             unsigned int order,
> +                             unsigned int *flush_flags)
>  {
>      struct domain_iommu *hd = dom_iommu(domain);
>      struct dma_pte *page = NULL, *pte = NULL;
> @@ -779,11 +833,11 @@ static void dma_pte_clear_one(struct dom
> 
>      spin_lock(&hd->arch.mapping_lock);
>      /* get last level pte */
> -    pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
> -    if ( pg_maddr == 0 )
> +    pg_maddr = addr_to_dma_page_maddr(domain, addr, 1, flush_flags,
> false);
> +    if ( pg_maddr < PAGE_SIZE )
>      {
>          spin_unlock(&hd->arch.mapping_lock);
> -        return;
> +        return pg_maddr ? -ENOMEM : 0;
>      }
> 
>      page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
> @@ -793,7 +847,7 @@ static void dma_pte_clear_one(struct dom
>      {
>          spin_unlock(&hd->arch.mapping_lock);
>          unmap_vtd_domain_page(page);
> -        return;
> +        return 0;
>      }
> 
>      dma_clear_pte(*pte);
> @@ -803,6 +857,8 @@ static void dma_pte_clear_one(struct dom
>      iommu_sync_cache(pte, sizeof(struct dma_pte));
> 
>      unmap_vtd_domain_page(page);
> +
> +    return 0;
>  }
> 
>  static int iommu_set_root_entry(struct vtd_iommu *iommu)
> @@ -1914,8 +1970,9 @@ static int __must_check intel_iommu_map_
>          return 0;
>      }
> 
> -    pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 1);
> -    if ( !pg_maddr )
> +    pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 1,
> flush_flags,
> +                                      true);
> +    if ( pg_maddr < PAGE_SIZE )
>      {
>          spin_unlock(&hd->arch.mapping_lock);
>          return -ENOMEM;
> @@ -1965,17 +2022,14 @@ static int __must_check intel_iommu_unma
>      if ( iommu_hwdom_passthrough && is_hardware_domain(d) )
>          return 0;
> 
> -    dma_pte_clear_one(d, dfn_to_daddr(dfn), flush_flags);
> -
> -    return 0;
> +    return dma_pte_clear_one(d, dfn_to_daddr(dfn), 0, flush_flags);
>  }
> 
>  static int intel_iommu_lookup_page(struct domain *d, dfn_t dfn, mfn_t
> *mfn,
>                                     unsigned int *flags)
>  {
>      struct domain_iommu *hd = dom_iommu(d);
> -    struct dma_pte *page, val;
> -    u64 pg_maddr;
> +    uint64_t val;
> 
>      /*
>       * If VT-d shares EPT page table or if the domain is the hardware
> @@ -1987,25 +2041,16 @@ static int intel_iommu_lookup_page(struc
> 
>      spin_lock(&hd->arch.mapping_lock);
> 
> -    pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 0);
> -    if ( !pg_maddr )
> -    {
> -        spin_unlock(&hd->arch.mapping_lock);
> -        return -ENOENT;
> -    }
> -
> -    page = map_vtd_domain_page(pg_maddr);
> -    val = page[dfn_x(dfn) & LEVEL_MASK];
> +    val = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 0, NULL, false);
> 
> -    unmap_vtd_domain_page(page);
>      spin_unlock(&hd->arch.mapping_lock);
> 
> -    if ( !dma_pte_present(val) )
> +    if ( val < PAGE_SIZE )
>          return -ENOENT;
> 
> -    *mfn = maddr_to_mfn(dma_pte_addr(val));
> -    *flags = dma_pte_read(val) ? IOMMUF_readable : 0;
> -    *flags |= dma_pte_write(val) ? IOMMUF_writable : 0;
> +    *mfn = maddr_to_mfn(val);
> +    *flags = val & DMA_PTE_READ ? IOMMUF_readable : 0;
> +    *flags |= val & DMA_PTE_WRITE ? IOMMUF_writable : 0;
> 
>      return 0;
>  }
Jan Beulich Jan. 31, 2022, 10:04 a.m. UTC | #2
On 30.01.2022 04:17, Tian, Kevin wrote:
>> From: Jan Beulich <jbeulich@suse.com>
>> Sent: Tuesday, January 11, 2022 12:23 AM
>>
>> In order to be able to insert/remove super-pages we need to allow
>> callers of the walking function to specify at which point to stop the
>> walk.
>>
>> For intel_iommu_lookup_page() integrate the last level access into
>> the main walking function.
>>
>> dma_pte_clear_one() gets only partly adjusted for now: Error handling
>> and order parameter get put in place, but the order parameter remains
>> ignored (just like intel_iommu_map_page()'s order part of the flags).
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> I was actually wondering whether it wouldn't make sense to integrate
>> dma_pte_clear_one() into its only caller intel_iommu_unmap_page(), for
>> better symmetry with intel_iommu_map_page().
> 
> I think it's the right thing to do. It was there due to multiple callers
> when first introduced. But now, given there is only one caller, merging it
> with the caller for symmetry makes sense.

I guess I'll make this a separate change towards the end of this series
now, to save me from some rebasing of other patches.

> with or without that change (given it's simple):
> 
> 	Reviewed-by: Kevin Tian <kevin.tian@intel.com>

Thanks.

Jan
diff mbox series

Patch

--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -347,63 +347,116 @@  static u64 bus_to_context_maddr(struct v
     return maddr;
 }
 
-static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
+/*
+ * This function walks (and if requested allocates) page tables to the
+ * designated target level. It returns
+ * - 0 when a non-present entry was encountered and no allocation was
+ *   requested,
+ * - a small positive value (the level, i.e. below PAGE_SIZE) upon allocation
+ *   failure,
+ * - for target > 0 the physical address of the page table holding the leaf
+ *   PTE for the requested address,
+ * - for target == 0 the full PTE.
+ */
+static uint64_t addr_to_dma_page_maddr(struct domain *domain, daddr_t addr,
+                                       unsigned int target,
+                                       unsigned int *flush_flags, bool alloc)
 {
     struct domain_iommu *hd = dom_iommu(domain);
     int addr_width = agaw_to_width(hd->arch.vtd.agaw);
     struct dma_pte *parent, *pte = NULL;
-    int level = agaw_to_level(hd->arch.vtd.agaw);
-    int offset;
+    unsigned int level = agaw_to_level(hd->arch.vtd.agaw), offset;
     u64 pte_maddr = 0;
 
     addr &= (((u64)1) << addr_width) - 1;
     ASSERT(spin_is_locked(&hd->arch.mapping_lock));
+    ASSERT(target || !alloc);
+
     if ( !hd->arch.vtd.pgd_maddr )
     {
         struct page_info *pg;
 
-        if ( !alloc || !(pg = iommu_alloc_pgtable(domain)) )
+        if ( !alloc )
+            goto out;
+
+        pte_maddr = level;
+        if ( !(pg = iommu_alloc_pgtable(domain)) )
             goto out;
 
         hd->arch.vtd.pgd_maddr = page_to_maddr(pg);
     }
 
-    parent = (struct dma_pte *)map_vtd_domain_page(hd->arch.vtd.pgd_maddr);
-    while ( level > 1 )
+    pte_maddr = hd->arch.vtd.pgd_maddr;
+    parent = map_vtd_domain_page(pte_maddr);
+    while ( level > target )
     {
         offset = address_level_offset(addr, level);
         pte = &parent[offset];
 
         pte_maddr = dma_pte_addr(*pte);
-        if ( !pte_maddr )
+        if ( !dma_pte_present(*pte) || (level > 1 && dma_pte_superpage(*pte)) )
         {
             struct page_info *pg;
+            /*
+             * Higher level tables always set r/w, last level page table
+             * controls read/write.
+             */
+            struct dma_pte new_pte = { DMA_PTE_PROT };
 
             if ( !alloc )
-                break;
+            {
+                pte_maddr = 0;
+                if ( !dma_pte_present(*pte) )
+                    break;
+
+                /*
+                 * When the leaf entry was requested, pass back the full PTE,
+                 * with the address adjusted to account for the residual of
+                 * the walk.
+                 */
+                pte_maddr = pte->val +
+                    (addr & ((1UL << level_to_offset_bits(level)) - 1) &
+                     PAGE_MASK);
+                if ( !target )
+                    break;
+            }
 
+            pte_maddr = level - 1;
             pg = iommu_alloc_pgtable(domain);
             if ( !pg )
                 break;
 
             pte_maddr = page_to_maddr(pg);
-            dma_set_pte_addr(*pte, pte_maddr);
+            dma_set_pte_addr(new_pte, pte_maddr);
 
-            /*
-             * high level table always sets r/w, last level
-             * page table control read/write
-             */
-            dma_set_pte_readable(*pte);
-            dma_set_pte_writable(*pte);
+            if ( dma_pte_present(*pte) )
+            {
+                struct dma_pte *split = map_vtd_domain_page(pte_maddr);
+                unsigned long inc = 1UL << level_to_offset_bits(level - 1);
+
+                split[0].val = pte->val;
+                if ( inc == PAGE_SIZE )
+                    split[0].val &= ~DMA_PTE_SP;
+
+                for ( offset = 1; offset < PTE_NUM; ++offset )
+                    split[offset].val = split[offset - 1].val + inc;
+
+                iommu_sync_cache(split, PAGE_SIZE);
+                unmap_vtd_domain_page(split);
+
+                if ( flush_flags )
+                    *flush_flags |= IOMMU_FLUSHF_modified;
+            }
+
+            write_atomic(&pte->val, new_pte.val);
             iommu_sync_cache(pte, sizeof(struct dma_pte));
         }
 
-        if ( level == 2 )
+        if ( --level == target )
             break;
 
         unmap_vtd_domain_page(parent);
         parent = map_vtd_domain_page(pte_maddr);
-        level--;
     }
 
     unmap_vtd_domain_page(parent);
@@ -430,7 +483,7 @@  static uint64_t domain_pgd_maddr(struct
         if ( !hd->arch.vtd.pgd_maddr )
         {
             /* Ensure we have pagetables allocated down to leaf PTE. */
-            addr_to_dma_page_maddr(d, 0, 1);
+            addr_to_dma_page_maddr(d, 0, 1, NULL, true);
 
             if ( !hd->arch.vtd.pgd_maddr )
                 return 0;
@@ -770,8 +823,9 @@  static int __must_check iommu_flush_iotl
 }
 
 /* clear one page's page table */
-static void dma_pte_clear_one(struct domain *domain, uint64_t addr,
-                              unsigned int *flush_flags)
+static int dma_pte_clear_one(struct domain *domain, daddr_t addr,
+                             unsigned int order,
+                             unsigned int *flush_flags)
 {
     struct domain_iommu *hd = dom_iommu(domain);
     struct dma_pte *page = NULL, *pte = NULL;
@@ -779,11 +833,11 @@  static void dma_pte_clear_one(struct dom
 
     spin_lock(&hd->arch.mapping_lock);
     /* get last level pte */
-    pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
-    if ( pg_maddr == 0 )
+    pg_maddr = addr_to_dma_page_maddr(domain, addr, 1, flush_flags, false);
+    if ( pg_maddr < PAGE_SIZE )
     {
         spin_unlock(&hd->arch.mapping_lock);
-        return;
+        return pg_maddr ? -ENOMEM : 0;
     }
 
     page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
@@ -793,7 +847,7 @@  static void dma_pte_clear_one(struct dom
     {
         spin_unlock(&hd->arch.mapping_lock);
         unmap_vtd_domain_page(page);
-        return;
+        return 0;
     }
 
     dma_clear_pte(*pte);
@@ -803,6 +857,8 @@  static void dma_pte_clear_one(struct dom
     iommu_sync_cache(pte, sizeof(struct dma_pte));
 
     unmap_vtd_domain_page(page);
+
+    return 0;
 }
 
 static int iommu_set_root_entry(struct vtd_iommu *iommu)
@@ -1914,8 +1970,9 @@  static int __must_check intel_iommu_map_
         return 0;
     }
 
-    pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 1);
-    if ( !pg_maddr )
+    pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 1, flush_flags,
+                                      true);
+    if ( pg_maddr < PAGE_SIZE )
     {
         spin_unlock(&hd->arch.mapping_lock);
         return -ENOMEM;
@@ -1965,17 +2022,14 @@  static int __must_check intel_iommu_unma
     if ( iommu_hwdom_passthrough && is_hardware_domain(d) )
         return 0;
 
-    dma_pte_clear_one(d, dfn_to_daddr(dfn), flush_flags);
-
-    return 0;
+    return dma_pte_clear_one(d, dfn_to_daddr(dfn), 0, flush_flags);
 }
 
 static int intel_iommu_lookup_page(struct domain *d, dfn_t dfn, mfn_t *mfn,
                                    unsigned int *flags)
 {
     struct domain_iommu *hd = dom_iommu(d);
-    struct dma_pte *page, val;
-    u64 pg_maddr;
+    uint64_t val;
 
     /*
      * If VT-d shares EPT page table or if the domain is the hardware
@@ -1987,25 +2041,16 @@  static int intel_iommu_lookup_page(struc
 
     spin_lock(&hd->arch.mapping_lock);
 
-    pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 0);
-    if ( !pg_maddr )
-    {
-        spin_unlock(&hd->arch.mapping_lock);
-        return -ENOENT;
-    }
-
-    page = map_vtd_domain_page(pg_maddr);
-    val = page[dfn_x(dfn) & LEVEL_MASK];
+    val = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 0, NULL, false);
 
-    unmap_vtd_domain_page(page);
     spin_unlock(&hd->arch.mapping_lock);
 
-    if ( !dma_pte_present(val) )
+    if ( val < PAGE_SIZE )
         return -ENOENT;
 
-    *mfn = maddr_to_mfn(dma_pte_addr(val));
-    *flags = dma_pte_read(val) ? IOMMUF_readable : 0;
-    *flags |= dma_pte_write(val) ? IOMMUF_writable : 0;
+    *mfn = maddr_to_mfn(val);
+    *flags = val & DMA_PTE_READ ? IOMMUF_readable : 0;
+    *flags |= val & DMA_PTE_WRITE ? IOMMUF_writable : 0;
 
     return 0;
 }