iommu/amd: atomically update IRTE if supported

Message ID 20250121095704.18769-1-roger.pau@citrix.com (mailing list archive)
State New
Series iommu/amd: atomically update IRTE if supported

Commit Message

Roger Pau Monné Jan. 21, 2025, 9:57 a.m. UTC
When using a 32bit Interrupt Remapping Entry, or a 128bit one on a CPU
that supports 128bit cmpxchg, don't disable the entry by setting
RemapEn = 0 ahead of updating it.  As a consequence of not toggling
RemapEn ahead of the update, the Interrupt Remapping Table needs to be
flushed after the entry update.

This avoids a window where the IRTE has RemapEn = 0, which can lead to
IO_PAGE_FAULT if the underlying interrupt source is not masked.

There's no guidance in the AMD-Vi specification about how an IRTE update
should be performed, unlike DTE updates, for which there is specific
guidance.  However, the DTE guidance states that reads will always be at
least 128 bits in size, and hence for the purposes here assume that reads
and caching of IRTE entries, in either 32 or 128 bit format, are done
atomically by the IOMMU.

Note that as part of introducing a new raw128 field in the IRTE union, the
current raw field is renamed to raw64, so that the access width is explicit
in the field name.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
 xen/drivers/passthrough/amd/iommu_intr.c | 68 ++++++++++++++++++------
 1 file changed, 53 insertions(+), 15 deletions(-)
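
As a minimal sketch of the flow described in the commit message, the helper
below shows an atomic 128bit IRTE update followed by a flush, assuming the
cmpxchg16b() and amd_iommu_flush_intremap() helpers used by the patch and the
raw128 field it introduces.  The helper name set_irte_atomic() is made up for
illustration only; locking, the 32bit and non-cx16 fallbacks, the fresh-entry
optimisation and the alias requestor ID handling of the real code are all
omitted.

/*
 * Sketch only: replace the whole live IRTE with a single 128bit store and
 * invalidate the IOMMU's cached copy, instead of clearing RemapEn,
 * rewriting the entry and setting RemapEn again.
 */
static void set_irte_atomic(struct amd_iommu *iommu, union irte128 *ptr,
                            union irte128 new, unsigned int req_id)
{
    __uint128_t old = ptr->raw128;

    /* Hardware never modifies the IRTE, so the compare is expected to match. */
    if ( cmpxchg16b(&ptr->raw128, &old, &new.raw128) != old )
        BUG();

    /* The entry stayed live during the update: flush any cached copy. */
    amd_iommu_flush_intremap(iommu, req_id);
}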

Comments

Andrew Cooper Jan. 21, 2025, 10:13 a.m. UTC | #1
On 21/01/2025 9:57 am, Roger Pau Monne wrote:
> diff --git a/xen/drivers/passthrough/amd/iommu_intr.c b/xen/drivers/passthrough/amd/iommu_intr.c
> index 7fc796dec25b..efa9ddc62458 100644
> --- a/xen/drivers/passthrough/amd/iommu_intr.c
> +++ b/xen/drivers/passthrough/amd/iommu_intr.c
> @@ -223,14 +224,36 @@ static void update_intremap_entry(const struct amd_iommu *iommu,
>              },
>          };
>  
> -        ASSERT(!entry.ptr128->full.remap_en);
> -        entry.ptr128->raw[1] = irte.raw[1];
> -        /*
> -         * High half needs to be set before low one (containing RemapEn).  See
> -         * comment in free_intremap_entry() regarding the choice of barrier.
> -         */
> -        smp_wmb();
> -        ACCESS_ONCE(entry.ptr128->raw[0]) = irte.raw[0];
> +        if ( cpu_has_cx16 )

cx16 is always available.  Teddy even submitted patches, but it looks
like they fell by the wayside.

We need to take Teddy's patches, then ~half this one.

As proved by this patch alone, it's already hard enough getting this
right, without introducing dead logic paths into the mix.

~Andrew
Andrew Cooper Jan. 21, 2025, 10:32 a.m. UTC | #2
On 21/01/2025 9:57 am, Roger Pau Monne wrote:
> When using a 32bit Interrupt Remapping Entry, or a 128bit one on a CPU
> that supports 128bit cmpxchg, don't disable the entry by setting
> RemapEn = 0 ahead of updating it.  As a consequence of not toggling
> RemapEn ahead of the update, the Interrupt Remapping Table needs to be
> flushed after the entry update.
>
> This avoids a window where the IRTE has RemapEn = 0, which can lead to
> IO_PAGE_FAULT if the underlying interrupt source is not masked.

It's probably worth saying that this race condition was identified in
the field, rather than being a theoretical issue.

~Andrew

Patch

diff --git a/xen/drivers/passthrough/amd/iommu_intr.c b/xen/drivers/passthrough/amd/iommu_intr.c
index 7fc796dec25b..efa9ddc62458 100644
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -39,7 +39,8 @@  union irte32 {
 };
 
 union irte128 {
-    uint64_t raw[2];
+    uint64_t raw64[2];
+    __uint128_t raw128;
     struct {
         bool remap_en:1;
         bool sup_io_pf:1;
@@ -187,7 +188,7 @@  static void free_intremap_entry(const struct amd_iommu *iommu,
 
     if ( iommu->ctrl.ga_en )
     {
-        ACCESS_ONCE(entry.ptr128->raw[0]) = 0;
+        ACCESS_ONCE(entry.ptr128->raw64[0]) = 0;
         /*
          * Low half (containing RemapEn) needs to be cleared first.  Note that
          * strictly speaking smp_wmb() isn't enough, as conceptually it expands
@@ -197,7 +198,7 @@  static void free_intremap_entry(const struct amd_iommu *iommu,
          * variant will do.
          */
         smp_wmb();
-        entry.ptr128->raw[1] = 0;
+        entry.ptr128->raw64[1] = 0;
     }
     else
         ACCESS_ONCE(entry.ptr32->raw) = 0;
@@ -223,14 +224,36 @@  static void update_intremap_entry(const struct amd_iommu *iommu,
             },
         };
 
-        ASSERT(!entry.ptr128->full.remap_en);
-        entry.ptr128->raw[1] = irte.raw[1];
-        /*
-         * High half needs to be set before low one (containing RemapEn).  See
-         * comment in free_intremap_entry() regarding the choice of barrier.
-         */
-        smp_wmb();
-        ACCESS_ONCE(entry.ptr128->raw[0]) = irte.raw[0];
+        if ( cpu_has_cx16 )
+        {
+            __uint128_t old = entry.ptr128->raw128;
+            __uint128_t res = cmpxchg16b(&entry.ptr128->raw128, &old,
+                                         &irte.raw128);
+
+            /*
+             * Hardware does not update the IRTE behind our backs, so the
+             * return value should match "old".
+             */
+            if ( res != old )
+            {
+                printk(XENLOG_ERR
+                       "unexpected IRTE %016lx_%016lx (expected %016lx_%016lx)\n",
+                       (uint64_t)(res >> 64), (uint64_t)res,
+                       (uint64_t)(old >> 64), (uint64_t)old);
+                BUG();
+            }
+        }
+        else
+        {
+            ASSERT(!entry.ptr128->full.remap_en);
+            entry.ptr128->raw64[1] = irte.raw64[1];
+            /*
+             * High half needs to be set before low one (containing RemapEn).  See
+             * comment in free_intremap_entry() regarding the choice of barrier.
+             */
+            smp_wmb();
+            ACCESS_ONCE(entry.ptr128->raw64[0]) = irte.raw64[0];
+        }
     }
     else
     {
@@ -300,7 +323,8 @@  static int update_intremap_entry_from_ioapic(
     entry = get_intremap_entry(iommu, req_id, offset);
 
     /* The RemapEn fields match for all formats. */
-    while ( iommu->enabled && entry.ptr32->flds.remap_en )
+    while ( iommu->enabled && entry.ptr32->flds.remap_en &&
+            iommu->ctrl.ga_en && !cpu_has_cx16 )
     {
         entry.ptr32->flds.remap_en = false;
         spin_unlock(lock);
@@ -314,6 +338,9 @@  static int update_intremap_entry_from_ioapic(
 
     spin_unlock_irqrestore(lock, flags);
 
+    if ( iommu->enabled && !fresh && (!iommu->ctrl.ga_en || cpu_has_cx16) )
+        amd_iommu_flush_intremap(iommu, req_id);
+
     set_rte_index(rte, offset);
 
     return 0;
@@ -425,6 +452,7 @@  static int update_intremap_entry_from_msi_msg(
     uint8_t delivery_mode, vector, dest_mode;
     spinlock_t *lock;
     unsigned int dest, offset, i;
+    bool fresh = false;
 
     req_id = get_dma_requestor_id(iommu->seg, bdf);
     alias_id = get_intremap_requestor_id(iommu->seg, bdf);
@@ -468,12 +496,14 @@  static int update_intremap_entry_from_msi_msg(
             return -ENOSPC;
         }
         *remap_index = offset;
+        fresh = true;
     }
 
     entry = get_intremap_entry(iommu, req_id, offset);
 
     /* The RemapEn fields match for all formats. */
-    while ( iommu->enabled && entry.ptr32->flds.remap_en )
+    while ( iommu->enabled && entry.ptr32->flds.remap_en &&
+            iommu->ctrl.ga_en && !cpu_has_cx16 )
     {
         entry.ptr32->flds.remap_en = false;
         spin_unlock(lock);
@@ -488,6 +518,13 @@  static int update_intremap_entry_from_msi_msg(
     update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest);
     spin_unlock_irqrestore(lock, flags);
 
+    if ( iommu->enabled && !fresh && (!iommu->ctrl.ga_en || cpu_has_cx16) )
+    {
+        amd_iommu_flush_intremap(iommu, req_id);
+        if ( alias_id != req_id )
+            amd_iommu_flush_intremap(iommu, alias_id);
+    }
+
     *data = (msg->data & ~(INTREMAP_MAX_ENTRIES - 1)) | offset;
 
     /*
@@ -722,7 +759,7 @@  static void dump_intremap_table(const struct amd_iommu *iommu,
     for ( count = 0; count < nr; count++ )
     {
         if ( iommu->ctrl.ga_en
-             ? !tbl.ptr128[count].raw[0] && !tbl.ptr128[count].raw[1]
+             ? !tbl.ptr128[count].raw64[0] && !tbl.ptr128[count].raw64[1]
              : !tbl.ptr32[count].raw )
                 continue;
 
@@ -735,7 +772,8 @@  static void dump_intremap_table(const struct amd_iommu *iommu,
 
         if ( iommu->ctrl.ga_en )
             printk("    IRTE[%03x] %016lx_%016lx\n",
-                   count, tbl.ptr128[count].raw[1], tbl.ptr128[count].raw[0]);
+                   count, tbl.ptr128[count].raw64[1],
+                   tbl.ptr128[count].raw64[0]);
         else
             printk("    IRTE[%03x] %08x\n", count, tbl.ptr32[count].raw);
     }
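
Following the review comment above that cx16 can be assumed to always be
available, a hedged sketch of what the ga_en branch of update_intremap_entry()
might collapse to once the non-cx16 fallback is dropped (an illustration of
the suggestion only, not part of the posted series):

        /* ga_en: update the 128bit IRTE with a single cmpxchg16b(). */
        __uint128_t old = entry.ptr128->raw128;
        __uint128_t res = cmpxchg16b(&entry.ptr128->raw128, &old, &irte.raw128);

        /*
         * Hardware does not update the IRTE behind our backs, so the
         * return value should match "old".
         */
        if ( res != old )
        {
            printk(XENLOG_ERR
                   "unexpected IRTE %016lx_%016lx (expected %016lx_%016lx)\n",
                   (uint64_t)(res >> 64), (uint64_t)res,
                   (uint64_t)(old >> 64), (uint64_t)old);
            BUG();
        }

The !cpu_has_cx16 checks added to the RemapEn poll loops and to the flush
conditions in update_intremap_entry_from_ioapic() and
update_intremap_entry_from_msi_msg() would disappear in the same way.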