[v4,15/21] AMD/IOMMU: free all-empty page tables

Message ID 16f5b398-56f6-a70c-9ce0-0ad72eab5058@suse.com
State Superseded
Series IOMMU: superpage support when not sharing pagetables

Commit Message

Jan Beulich April 25, 2022, 8:42 a.m. UTC
When a page table ends up with no present entries left, it can be
replaced by a non-present entry at the next higher level. The page table
itself can then be scheduled for freeing.

Note that, while its output isn't used there yet,
pt_update_contig_markers() already needs to be called in all places
where entries get updated, not just the one where entries get cleared.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Re-base over changes earlier in the series.
v3: Re-base over changes earlier in the series.
v2: New.
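
For context, the helper this patch builds on is introduced earlier in
the series. Its contract is reconstructed below from how this patch
uses it; the exact prototype is approximate, not quoted from
<asm/pt-contig-markers.h>:

    /* Which kind of entry the caller just wrote at index 'idx'. */
    enum PTE_kind {
        PTE_kind_null,   /* entry was cleared (non-present) */
        PTE_kind_leaf,   /* entry now maps a page */
        PTE_kind_table,  /* entry now points at a lower-level table */
    };

    /*
     * Maintain the contiguity markers of page table 'pt' at 'level'
     * after the entry at 'idx' was rewritten as 'kind'.  For the
     * PTE_kind_null case used here, a true return means the table has
     * no present entries left, i.e. the caller may replace it by a
     * non-present entry one level up and schedule the table itself
     * for freeing.
     */
    bool pt_update_contig_markers(uint64_t *pt, unsigned int idx,
                                  unsigned int level, enum PTE_kind kind);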

Comments

Roger Pau Monné May 10, 2022, 1:30 p.m. UTC | #1
On Mon, Apr 25, 2022 at 10:42:19AM +0200, Jan Beulich wrote:
> When a page table ends up with no present entries left, it can be
> replaced by a non-present entry at the next higher level. The page table
> itself can then be scheduled for freeing.
> 
> Note that, while its output isn't used there yet,
> pt_update_contig_markers() already needs to be called in all places
> where entries get updated, not just the one where entries get cleared.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Some comments below.

> ---
> v4: Re-base over changes earlier in the series.
> v3: Re-base over changes earlier in the series.
> v2: New.
> 
> --- a/xen/drivers/passthrough/amd/iommu_map.c
> +++ b/xen/drivers/passthrough/amd/iommu_map.c
> @@ -21,6 +21,9 @@
>  
>  #include "iommu.h"
>  
> +#define CONTIG_MASK IOMMU_PTE_CONTIG_MASK
> +#include <asm/pt-contig-markers.h>
> +
>  /* Given pfn and page table level, return pde index */
>  static unsigned int pfn_to_pde_idx(unsigned long pfn, unsigned int level)
>  {
> @@ -33,16 +36,20 @@ static unsigned int pfn_to_pde_idx(unsig
>  
>  static union amd_iommu_pte clear_iommu_pte_present(unsigned long l1_mfn,
>                                                     unsigned long dfn,
> -                                                   unsigned int level)
> +                                                   unsigned int level,
> +                                                   bool *free)
>  {
>      union amd_iommu_pte *table, *pte, old;
> +    unsigned int idx = pfn_to_pde_idx(dfn, level);
>  
>      table = map_domain_page(_mfn(l1_mfn));
> -    pte = &table[pfn_to_pde_idx(dfn, level)];
> +    pte = &table[idx];
>      old = *pte;
>  
>      write_atomic(&pte->raw, 0);
>  
> +    *free = pt_update_contig_markers(&table->raw, idx, level, PTE_kind_null);
> +
>      unmap_domain_page(table);
>  
>      return old;
> @@ -85,7 +92,11 @@ static union amd_iommu_pte set_iommu_pte
>      if ( !old.pr || old.next_level ||
>           old.mfn != next_mfn ||
>           old.iw != iw || old.ir != ir )
> +    {
>          set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
> +        pt_update_contig_markers(&table->raw, pfn_to_pde_idx(dfn, level),
> +                                 level, PTE_kind_leaf);

It would be better to call pt_update_contig_markers() from inside
set_iommu_pde_present(), but that would imply changing the parameters
passed to that function.  It's cumbersome (and error-prone) to have to
pair every call to set_iommu_pde_present() with one to
pt_update_contig_markers().
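
For illustration only (the wrapper name and parameter list here are
made up, not part of the patch), folding the two calls into one helper
might look like:

    static void set_iommu_pde_present_marked(union amd_iommu_pte *table,
                                             unsigned long dfn,
                                             unsigned int level,
                                             unsigned long next_mfn,
                                             unsigned int next_level,
                                             bool iw, bool ir)
    {
        unsigned int idx = pfn_to_pde_idx(dfn, level);

        /* Keep the PDE write and its marker update inseparable. */
        set_iommu_pde_present(&table[idx], next_mfn, next_level, iw, ir);
        pt_update_contig_markers(&table->raw, idx, level,
                                 next_level ? PTE_kind_table
                                            : PTE_kind_leaf);
    }

That keeps the pairing in one place, at the cost of the wider
parameter list Jan objects to in his reply below.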

> +    }
>      else
>          old.pr = false; /* signal "no change" to the caller */
>  
> @@ -322,6 +333,9 @@ static int iommu_pde_from_dfn(struct dom
>              smp_wmb();
>              set_iommu_pde_present(pde, next_table_mfn, next_level, true,
>                                    true);
> +            pt_update_contig_markers(&next_table_vaddr->raw,
> +                                     pfn_to_pde_idx(dfn, level),
> +                                     level, PTE_kind_table);
>  
>              *flush_flags |= IOMMU_FLUSHF_modified;
>          }
> @@ -347,6 +361,9 @@ static int iommu_pde_from_dfn(struct dom
>                  next_table_mfn = mfn_x(page_to_mfn(table));
>                  set_iommu_pde_present(pde, next_table_mfn, next_level, true,
>                                        true);
> +                pt_update_contig_markers(&next_table_vaddr->raw,
> +                                         pfn_to_pde_idx(dfn, level),
> +                                         level, PTE_kind_table);
>              }
>              else /* should never reach here */
>              {
> @@ -474,8 +491,24 @@ int cf_check amd_iommu_unmap_page(
>  
>      if ( pt_mfn )
>      {
> +        bool free;
> +
>          /* Mark PTE as 'page not present'. */
> -        old = clear_iommu_pte_present(pt_mfn, dfn_x(dfn), level);
> +        old = clear_iommu_pte_present(pt_mfn, dfn_x(dfn), level, &free);
> +
> +        while ( unlikely(free) && ++level < hd->arch.amd.paging_mode )
> +        {
> +            struct page_info *pg = mfn_to_page(_mfn(pt_mfn));
> +
> +            if ( iommu_pde_from_dfn(d, dfn_x(dfn), level, &pt_mfn,
> +                                    flush_flags, false) )
> +                BUG();
> +            BUG_ON(!pt_mfn);
> +
> +            clear_iommu_pte_present(pt_mfn, dfn_x(dfn), level, &free);

I'm not sure whether it's worth initializing 'free' to false (at its
definition and before each call to clear_iommu_pte_present()), just in
case a future change lets clear_iommu_pte_present() return early
without having updated 'free'.

Thanks, Roger.
Jan Beulich May 18, 2022, 10:18 a.m. UTC | #2
On 10.05.2022 15:30, Roger Pau Monné wrote:
> On Mon, Apr 25, 2022 at 10:42:19AM +0200, Jan Beulich wrote:
>> When a page table ends up with no present entries left, it can be
>> replaced by a non-present entry at the next higher level. The page table
>> itself can then be scheduled for freeing.
>>
>> Note that, while its output isn't used there yet,
>> pt_update_contig_markers() already needs to be called in all places
>> where entries get updated, not just the one where entries get cleared.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks.

>> @@ -85,7 +92,11 @@ static union amd_iommu_pte set_iommu_pte
>>      if ( !old.pr || old.next_level ||
>>           old.mfn != next_mfn ||
>>           old.iw != iw || old.ir != ir )
>> +    {
>>          set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
>> +        pt_update_contig_markers(&table->raw, pfn_to_pde_idx(dfn, level),
>> +                                 level, PTE_kind_leaf);
> 
> It would be better to call pt_update_contig_markers() from inside
> set_iommu_pde_present(), but that would imply changing the parameters
> passed to that function.  It's cumbersome (and error-prone) to have to
> pair every call to set_iommu_pde_present() with one to
> pt_update_contig_markers().

Right, but then already the sheer number of parameters would become
excessive (imo).

>> @@ -474,8 +491,24 @@ int cf_check amd_iommu_unmap_page(
>>  
>>      if ( pt_mfn )
>>      {
>> +        bool free;
>> +
>>          /* Mark PTE as 'page not present'. */
>> -        old = clear_iommu_pte_present(pt_mfn, dfn_x(dfn), level);
>> +        old = clear_iommu_pte_present(pt_mfn, dfn_x(dfn), level, &free);
>> +
>> +        while ( unlikely(free) && ++level < hd->arch.amd.paging_mode )
>> +        {
>> +            struct page_info *pg = mfn_to_page(_mfn(pt_mfn));
>> +
>> +            if ( iommu_pde_from_dfn(d, dfn_x(dfn), level, &pt_mfn,
>> +                                    flush_flags, false) )
>> +                BUG();
>> +            BUG_ON(!pt_mfn);
>> +
>> +            clear_iommu_pte_present(pt_mfn, dfn_x(dfn), level, &free);
> 
> I'm not sure whether it's worth initializing 'free' to false (at its
> definition and before each call to clear_iommu_pte_present()), just in
> case a future change lets clear_iommu_pte_present() return early
> without having updated 'free'.

There's no such path right now, so initializing 'free' would be dead
code. If necessary, a patch introducing such an early exit would need
to deal with this. But even then I'd rather see it dealt with right
in clear_iommu_pte_present().

Jan
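
To make that last point concrete (purely hypothetical, as no such
early exit exists in the patch): the contract would stay local to the
function by initializing the output before any future early return,
e.g.:

    static union amd_iommu_pte clear_iommu_pte_present(unsigned long l1_mfn,
                                                       unsigned long dfn,
                                                       unsigned int level,
                                                       bool *free)
    {
        union amd_iommu_pte *table, *pte, old;
        unsigned int idx = pfn_to_pde_idx(dfn, level);

        /* Uphold the contract even on a hypothetical early-exit path. */
        *free = false;

        table = map_domain_page(_mfn(l1_mfn));
        pte = &table[idx];
        old = *pte;

        write_atomic(&pte->raw, 0);

        *free = pt_update_contig_markers(&table->raw, idx, level,
                                         PTE_kind_null);

        unmap_domain_page(table);

        return old;
    }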

Patch

--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -21,6 +21,9 @@ 
 
 #include "iommu.h"
 
+#define CONTIG_MASK IOMMU_PTE_CONTIG_MASK
+#include <asm/pt-contig-markers.h>
+
 /* Given pfn and page table level, return pde index */
 static unsigned int pfn_to_pde_idx(unsigned long pfn, unsigned int level)
 {
@@ -33,16 +36,20 @@  static unsigned int pfn_to_pde_idx(unsig
 
 static union amd_iommu_pte clear_iommu_pte_present(unsigned long l1_mfn,
                                                    unsigned long dfn,
-                                                   unsigned int level)
+                                                   unsigned int level,
+                                                   bool *free)
 {
     union amd_iommu_pte *table, *pte, old;
+    unsigned int idx = pfn_to_pde_idx(dfn, level);
 
     table = map_domain_page(_mfn(l1_mfn));
-    pte = &table[pfn_to_pde_idx(dfn, level)];
+    pte = &table[idx];
     old = *pte;
 
     write_atomic(&pte->raw, 0);
 
+    *free = pt_update_contig_markers(&table->raw, idx, level, PTE_kind_null);
+
     unmap_domain_page(table);
 
     return old;
@@ -85,7 +92,11 @@  static union amd_iommu_pte set_iommu_pte
     if ( !old.pr || old.next_level ||
          old.mfn != next_mfn ||
          old.iw != iw || old.ir != ir )
+    {
         set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
+        pt_update_contig_markers(&table->raw, pfn_to_pde_idx(dfn, level),
+                                 level, PTE_kind_leaf);
+    }
     else
         old.pr = false; /* signal "no change" to the caller */
 
@@ -322,6 +333,9 @@  static int iommu_pde_from_dfn(struct dom
             smp_wmb();
             set_iommu_pde_present(pde, next_table_mfn, next_level, true,
                                   true);
+            pt_update_contig_markers(&next_table_vaddr->raw,
+                                     pfn_to_pde_idx(dfn, level),
+                                     level, PTE_kind_table);
 
             *flush_flags |= IOMMU_FLUSHF_modified;
         }
@@ -347,6 +361,9 @@  static int iommu_pde_from_dfn(struct dom
                 next_table_mfn = mfn_x(page_to_mfn(table));
                 set_iommu_pde_present(pde, next_table_mfn, next_level, true,
                                       true);
+                pt_update_contig_markers(&next_table_vaddr->raw,
+                                         pfn_to_pde_idx(dfn, level),
+                                         level, PTE_kind_table);
             }
             else /* should never reach here */
             {
@@ -474,8 +491,24 @@  int cf_check amd_iommu_unmap_page(
 
     if ( pt_mfn )
     {
+        bool free;
+
         /* Mark PTE as 'page not present'. */
-        old = clear_iommu_pte_present(pt_mfn, dfn_x(dfn), level);
+        old = clear_iommu_pte_present(pt_mfn, dfn_x(dfn), level, &free);
+
+        while ( unlikely(free) && ++level < hd->arch.amd.paging_mode )
+        {
+            struct page_info *pg = mfn_to_page(_mfn(pt_mfn));
+
+            if ( iommu_pde_from_dfn(d, dfn_x(dfn), level, &pt_mfn,
+                                    flush_flags, false) )
+                BUG();
+            BUG_ON(!pt_mfn);
+
+            clear_iommu_pte_present(pt_mfn, dfn_x(dfn), level, &free);
+            *flush_flags |= IOMMU_FLUSHF_all;
+            iommu_queue_free_pgtable(hd, pg);
+        }
     }
 
     spin_unlock(&hd->arch.mapping_lock);