[v4,05/12] AMD/IOMMU: introduce 128-bit IRTE non-guest-APIC IRTE format

Message ID 193f3db9-5731-6841-4723-fa547f89db07@suse.com (mailing list archive)
State New, archived
Series [v4,01/12] AMD/IOMMU: use bit field for extended feature register

Commit Message

Jan Beulich July 25, 2019, 1:31 p.m. UTC
This is in preparation for actually enabling x2APIC mode, which requires
this wider IRTE format to be used.

A specific remark regarding the first hunk changing
amd_iommu_ioapic_update_ire(): This bypass was introduced for XSA-36,
i.e. by 94d4a1119d ("AMD,IOMMU: Clean up old entries in remapping
tables when creating new one"). Other code introduced by that change has
meanwhile disappeared or further changed, and I wonder if - rather than
adding an x2apic_enabled check to the conditional - the bypass couldn't
be deleted altogether. For now the goal is to affect the non-x2APIC
paths as little as possible.

Take the liberty and use the new "fresh" flag to suppress an unneeded
flush in update_intremap_entry_from_ioapic().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Re-base. Do away with standalone struct irte_full. Use smp_wmb().
v3: Avoid unrelated type changes in update_intremap_entry_from_ioapic().
     Drop irte_mode enum and variable. Convert INTREMAP_TABLE_ORDER into
     a static helper. Comment barrier() uses. Switch boolean bitfields to
     bool.
v2: Add cast in get_full_dest(). Re-base over changes earlier in the
     series. Don't use cmpxchg16b. Use barrier() instead of wmb().
---
Note that AMD's doc says Lowest Priority ("Arbitrated" by their naming)
mode is unavailable in x2APIC mode, but they've confirmed this to be a
mistake on their part.
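
As an aside (an editor's sketch, not part of the patch): the reason the
destination needs splitting is that an x2APIC destination is a full
32-bit value, which no longer fits the 8-bit dest field of union irte32.
The 128-bit format stores the low 24 bits in dest_lo and the high 8 bits
in dest_hi; get_full_dest() in the patch recombines them:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t dest = 0x12abcdef;          /* 32-bit x2APIC destination */
        uint32_t dest_lo = dest & 0xffffff;  /* low 24 bits -> dest_lo    */
        uint8_t dest_hi = dest >> 24;        /* high 8 bits -> dest_hi    */

        /* Recombination, mirroring get_full_dest() in the patch: */
        assert((dest_lo | ((uint32_t)dest_hi << 24)) == dest);
        return 0;
    }

For scale: with INTREMAP_LENGTH being 0xB there are 2048 entries, so the
table grows from 8 KiB (order 1, assuming 4 KiB pages) with 4-byte
irte32 entries to 32 KiB (order 3) with 16-byte irte128 entries; this is
what the new intremap_table_order() helper computes.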

Comments

Andrew Cooper July 30, 2019, 10:13 a.m. UTC | #1
On 25/07/2019 14:31, Jan Beulich wrote:
> This is in preparation for actually enabling x2APIC mode, which requires
> this wider IRTE format to be used.
>
> A specific remark regarding the first hunk changing
> amd_iommu_ioapic_update_ire(): This bypass was introduced for XSA-36,
> i.e. by 94d4a1119d ("AMD,IOMMU: Clean up old entries in remapping
> tables when creating new one"). Other code introduced by that change has
> meanwhile disappeared or further changed, and I wonder if - rather than
> adding an x2apic_enabled check to the conditional - the bypass couldn't
> be deleted altogether. For now the goal is to affect the non-x2APIC
> paths as little as possible.
>
> Take the liberty and use the new "fresh" flag to suppress an unneeded
> flush in update_intremap_entry_from_ioapic().
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Woods, Brian July 30, 2019, 4:36 p.m. UTC | #2
On Thu, Jul 25, 2019 at 01:31:02PM +0000, Jan Beulich wrote:
> This is in preparation for actually enabling x2APIC mode, which requires
> this wider IRTE format to be used.
> 
> A specific remark regarding the first hunk changing
> amd_iommu_ioapic_update_ire(): This bypass was introduced for XSA-36,
> i.e. by 94d4a1119d ("AMD,IOMMU: Clean up old entries in remapping
> tables when creating new one"). Other code introduced by that change has
> meanwhile disappeared or further changed, and I wonder if - rather than
> adding an x2apic_enabled check to the conditional - the bypass couldn't
> be deleted altogether. For now the goal is to affect the non-x2APIC
> paths as little as possible.
> 
> Take the liberty and use the new "fresh" flag to suppress an unneeded
> flush in update_intremap_entry_from_ioapic().
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Brian Woods <brian.woods@amd.com>

Patch

--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -39,12 +39,36 @@  union irte32 {
      } flds;
  };
  
+union irte128 {
+    uint64_t raw[2];
+    struct {
+        bool remap_en:1;
+        bool sup_io_pf:1;
+        unsigned int int_type:3;
+        bool rq_eoi:1;
+        bool dm:1;
+        bool guest_mode:1; /* MBZ */
+        unsigned int dest_lo:24;
+        unsigned int :32;
+        unsigned int vector:8;
+        unsigned int :24;
+        unsigned int :24;
+        unsigned int dest_hi:8;
+    } full;
+};
+
  union irte_ptr {
      void *ptr;
      union irte32 *ptr32;
+    union irte128 *ptr128;
  };
  
-#define INTREMAP_TABLE_ORDER    1
+union irte_cptr {
+    const void *ptr;
+    const union irte32 *ptr32;
+    const union irte128 *ptr128;
+} __transparent__;
+
  #define INTREMAP_LENGTH 0xB
  #define INTREMAP_ENTRIES (1 << INTREMAP_LENGTH)
  
@@ -57,6 +81,13 @@  unsigned int nr_ioapic_sbdf;
  
  static void dump_intremap_tables(unsigned char key);
  
+static unsigned int __init intremap_table_order(const struct amd_iommu *iommu)
+{
+    return iommu->ctrl.ga_en
+           ? get_order_from_bytes(INTREMAP_ENTRIES * sizeof(union irte128))
+           : get_order_from_bytes(INTREMAP_ENTRIES * sizeof(union irte32));
+}
+
  unsigned int ioapic_id_to_index(unsigned int apic_id)
  {
      unsigned int idx;
@@ -131,7 +162,10 @@  static union irte_ptr get_intremap_entry
  
      ASSERT(table.ptr && (index < INTREMAP_ENTRIES));
  
-    table.ptr32 += index;
+    if ( iommu->ctrl.ga_en )
+        table.ptr128 += index;
+    else
+        table.ptr32 += index;
  
      return table;
  }
@@ -141,7 +175,22 @@  static void free_intremap_entry(const st
  {
      union irte_ptr entry = get_intremap_entry(iommu, bdf, index);
  
-    ACCESS_ONCE(entry.ptr32->raw) = 0;
+    if ( iommu->ctrl.ga_en )
+    {
+        ACCESS_ONCE(entry.ptr128->raw[0]) = 0;
+        /*
+         * Low half (containing RemapEn) needs to be cleared first.  Note that
+         * strictly speaking smp_wmb() isn't enough, as conceptually it expands
+         * to just barrier() when !CONFIG_SMP.  But wmb() would be more than we
+         * need, since the IOMMU is a cache-coherent entity on the bus.  And
+         * given that we don't allow CONFIG_SMP to be turned off, the SMP
+         * variant will do.
+         */
+        smp_wmb();
+        entry.ptr128->raw[1] = 0;
+    }
+    else
+        ACCESS_ONCE(entry.ptr32->raw) = 0;
  
      __clear_bit(index, get_ivrs_mappings(iommu->seg)[bdf].intremap_inuse);
  }
@@ -151,17 +200,44 @@  static void update_intremap_entry(const
                                    unsigned int vector, unsigned int int_type,
                                    unsigned int dest_mode, unsigned int dest)
  {
-    union irte32 irte = {
-        .flds = {
-            .remap_en = true,
-            .int_type = int_type,
-            .dm = dest_mode,
-            .dest = dest,
-            .vector = vector,
-        },
-    };
+    if ( iommu->ctrl.ga_en )
+    {
+        union irte128 irte = {
+            .full = {
+                .remap_en = true,
+                .int_type = int_type,
+                .dm = dest_mode,
+                .dest_lo = dest,
+                .dest_hi = dest >> 24,
+                .vector = vector,
+            },
+        };
+
+        ACCESS_ONCE(entry.ptr128->raw[0]) = 0;
+        /*
+         * Low half, in particular RemapEn, needs to be cleared first.  See
+         * comment in free_intremap_entry() regarding the choice of barrier.
+         */
+        smp_wmb();
+        entry.ptr128->raw[1] = irte.raw[1];
+        /* High half needs to be set before low one (containing RemapEn). */
+        smp_wmb();
+        ACCESS_ONCE(entry.ptr128->raw[0]) = irte.raw[0];
+    }
+    else
+    {
+        union irte32 irte = {
+            .flds = {
+                .remap_en = true,
+                .int_type = int_type,
+                .dm = dest_mode,
+                .dest = dest,
+                .vector = vector,
+            },
+        };
  
-    ACCESS_ONCE(entry.ptr32->raw) = irte.raw;
+        ACCESS_ONCE(entry.ptr32->raw) = irte.raw;
+    }
  }
  
  static inline int get_rte_index(const struct IO_APIC_route_entry *rte)
@@ -175,6 +251,11 @@  static inline void set_rte_index(struct
      rte->delivery_mode = offset >> 8;
  }
  
+static inline unsigned int get_full_dest(const union irte128 *entry)
+{
+    return entry->full.dest_lo | ((unsigned int)entry->full.dest_hi << 24);
+}
+
  static int update_intremap_entry_from_ioapic(
      int bdf,
      struct amd_iommu *iommu,
@@ -184,10 +265,11 @@  static int update_intremap_entry_from_io
  {
      unsigned long flags;
      union irte_ptr entry;
-    u8 delivery_mode, dest, vector, dest_mode;
+    uint8_t delivery_mode, vector, dest_mode;
      int req_id;
      spinlock_t *lock;
-    unsigned int offset;
+    unsigned int dest, offset;
+    bool fresh = false;
  
      req_id = get_intremap_requestor_id(iommu->seg, bdf);
      lock = get_intremap_lock(iommu->seg, req_id);
@@ -195,7 +277,7 @@  static int update_intremap_entry_from_io
      delivery_mode = rte->delivery_mode;
      vector = rte->vector;
      dest_mode = rte->dest_mode;
-    dest = rte->dest.logical.logical_dest;
+    dest = x2apic_enabled ? rte->dest.dest32 : rte->dest.logical.logical_dest;
  
      spin_lock_irqsave(lock, flags);
  
@@ -210,25 +292,40 @@  static int update_intremap_entry_from_io
              return -ENOSPC;
          }
          *index = offset;
-        lo_update = 1;
+        fresh = true;
      }
  
      entry = get_intremap_entry(iommu, req_id, offset);
-    if ( !lo_update )
+    if ( fresh )
+        /* nothing */;
+    else if ( !lo_update )
      {
          /*
           * Low half of incoming RTE is already in remapped format,
           * so need to recover vector and delivery mode from IRTE.
           */
          ASSERT(get_rte_index(rte) == offset);
-        vector = entry.ptr32->flds.vector;
+        if ( iommu->ctrl.ga_en )
+            vector = entry.ptr128->full.vector;
+        else
+            vector = entry.ptr32->flds.vector;
+        /* The IntType fields match for both formats. */
          delivery_mode = entry.ptr32->flds.int_type;
      }
+    else if ( x2apic_enabled )
+    {
+        /*
+         * High half of incoming RTE was read from the I/O APIC and hence may
+         * not hold the full destination, so need to recover full destination
+         * from IRTE.
+         */
+        dest = get_full_dest(entry.ptr128);
+    }
      update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest);
  
      spin_unlock_irqrestore(lock, flags);
  
-    if ( iommu->enabled )
+    if ( iommu->enabled && !fresh )
      {
          spin_lock_irqsave(&iommu->lock, flags);
          amd_iommu_flush_intremap(iommu, req_id);
@@ -286,6 +383,18 @@  int __init amd_iommu_setup_ioapic_remapp
              dest_mode = rte.dest_mode;
              dest = rte.dest.logical.logical_dest;
  
+            if ( iommu->ctrl.xt_en )
+            {
+                /*
+                 * In x2APIC mode we have no way of discovering the high 24
+                 * bits of the destination of an already enabled interrupt.
+                 * We come here earlier than for xAPIC mode, so no interrupts
+                 * should have been set up before.
+                 */
+                AMD_IOMMU_DEBUG("Unmasked IO-APIC#%u entry %u in x2APIC mode\n",
+                                IO_APIC_ID(apic), pin);
+            }
+
              spin_lock_irqsave(lock, flags);
              offset = alloc_intremap_entry(seg, req_id, 1);
              BUG_ON(offset >= INTREMAP_ENTRIES);
@@ -320,7 +429,8 @@  void amd_iommu_ioapic_update_ire(
      struct IO_APIC_route_entry new_rte = { 0 };
      unsigned int rte_lo = (reg & 1) ? reg - 1 : reg;
      unsigned int pin = (reg - 0x10) / 2;
-    int saved_mask, seg, bdf, rc;
+    int seg, bdf, rc;
+    bool saved_mask, fresh = false;
      struct amd_iommu *iommu;
      unsigned int idx;
  
@@ -362,12 +472,22 @@  void amd_iommu_ioapic_update_ire(
          *(((u32 *)&new_rte) + 1) = value;
      }
  
-    if ( new_rte.mask &&
-         ioapic_sbdf[idx].pin_2_idx[pin] >= INTREMAP_ENTRIES )
+    if ( ioapic_sbdf[idx].pin_2_idx[pin] >= INTREMAP_ENTRIES )
      {
          ASSERT(saved_mask);
-        __io_apic_write(apic, reg, value);
-        return;
+
+        /*
+         * There's nowhere except the IRTE to store a full 32-bit destination,
+         * so we may not bypass entry allocation and updating of the low RTE
+         * half in the (usual) case of the high RTE half getting written first.
+         */
+        if ( new_rte.mask && !x2apic_enabled )
+        {
+            __io_apic_write(apic, reg, value);
+            return;
+        }
+
+        fresh = true;
      }
  
      /* mask the interrupt while we change the intremap table */
@@ -396,8 +516,12 @@  void amd_iommu_ioapic_update_ire(
      if ( reg == rte_lo )
          return;
  
-    /* unmask the interrupt after we have updated the intremap table */
-    if ( !saved_mask )
+    /*
+     * Unmask the interrupt after we have updated the intremap table. Also
+     * write the low half if a fresh entry was allocated for a high half
+     * update in x2APIC mode.
+     */
+    if ( !saved_mask || (x2apic_enabled && fresh) )
      {
          old_rte.mask = saved_mask;
          __io_apic_write(apic, rte_lo, *((u32 *)&old_rte));
@@ -411,31 +535,40 @@  unsigned int amd_iommu_read_ioapic_from_
      unsigned int offset;
      unsigned int val = __io_apic_read(apic, reg);
      unsigned int pin = (reg - 0x10) / 2;
+    uint16_t seg, bdf, req_id;
+    const struct amd_iommu *iommu;
+    union irte_ptr entry;
  
      idx = ioapic_id_to_index(IO_APIC_ID(apic));
      if ( idx == MAX_IO_APICS )
          return val;
  
      offset = ioapic_sbdf[idx].pin_2_idx[pin];
+    if ( offset >= INTREMAP_ENTRIES )
+        return val;
  
-    if ( !(reg & 1) && offset < INTREMAP_ENTRIES )
-    {
-        u16 bdf = ioapic_sbdf[idx].bdf;
-        u16 seg = ioapic_sbdf[idx].seg;
-        u16 req_id = get_intremap_requestor_id(seg, bdf);
-        const struct amd_iommu *iommu = find_iommu_for_device(seg, bdf);
-        union irte_ptr entry;
+    seg = ioapic_sbdf[idx].seg;
+    bdf = ioapic_sbdf[idx].bdf;
+    iommu = find_iommu_for_device(seg, bdf);
+    if ( !iommu )
+        return val;
+    req_id = get_intremap_requestor_id(seg, bdf);
+    entry = get_intremap_entry(iommu, req_id, offset);
  
-        if ( !iommu )
-            return val;
+    if ( !(reg & 1) )
+    {
          ASSERT(offset == (val & (INTREMAP_ENTRIES - 1)));
-        entry = get_intremap_entry(iommu, req_id, offset);
          val &= ~(INTREMAP_ENTRIES - 1);
+        /* The IntType fields match for both formats. */
          val |= MASK_INSR(entry.ptr32->flds.int_type,
                           IO_APIC_REDIR_DELIV_MODE_MASK);
-        val |= MASK_INSR(entry.ptr32->flds.vector,
+        val |= MASK_INSR(iommu->ctrl.ga_en
+                         ? entry.ptr128->full.vector
+                         : entry.ptr32->flds.vector,
                           IO_APIC_REDIR_VECTOR_MASK);
      }
+    else if ( x2apic_enabled )
+        val = get_full_dest(entry.ptr128);
  
      return val;
  }
@@ -447,9 +580,9 @@  static int update_intremap_entry_from_ms
      unsigned long flags;
      union irte_ptr entry;
      u16 req_id, alias_id;
-    u8 delivery_mode, dest, vector, dest_mode;
+    uint8_t delivery_mode, vector, dest_mode;
      spinlock_t *lock;
-    unsigned int offset, i;
+    unsigned int dest, offset, i;
  
      req_id = get_dma_requestor_id(iommu->seg, bdf);
      alias_id = get_intremap_requestor_id(iommu->seg, bdf);
@@ -470,7 +603,12 @@  static int update_intremap_entry_from_ms
      dest_mode = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1;
      delivery_mode = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1;
      vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK;
-    dest = (msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff;
+
+    if ( x2apic_enabled )
+        dest = msg->dest32;
+    else
+        dest = MASK_EXTR(msg->address_lo, MSI_ADDR_DEST_ID_MASK);
+
      offset = *remap_index;
      if ( offset >= INTREMAP_ENTRIES )
      {
@@ -616,10 +754,21 @@  void amd_iommu_read_msi_from_ire(
      }
  
      msg->data &= ~(INTREMAP_ENTRIES - 1);
+    /* The IntType fields match for both formats. */
      msg->data |= MASK_INSR(entry.ptr32->flds.int_type,
                             MSI_DATA_DELIVERY_MODE_MASK);
-    msg->data |= MASK_INSR(entry.ptr32->flds.vector,
-                           MSI_DATA_VECTOR_MASK);
+    if ( iommu->ctrl.ga_en )
+    {
+        msg->data |= MASK_INSR(entry.ptr128->full.vector,
+                               MSI_DATA_VECTOR_MASK);
+        msg->dest32 = get_full_dest(entry.ptr128);
+    }
+    else
+    {
+        msg->data |= MASK_INSR(entry.ptr32->flds.vector,
+                               MSI_DATA_VECTOR_MASK);
+        msg->dest32 = entry.ptr32->flds.dest;
+    }
  }
  
  int __init amd_iommu_free_intremap_table(
@@ -631,7 +780,7 @@  int __init amd_iommu_free_intremap_table
  
      if ( tb )
      {
-        __free_amd_iommu_tables(tb, INTREMAP_TABLE_ORDER);
+        __free_amd_iommu_tables(tb, intremap_table_order(iommu));
          ivrs_mapping->intremap_table = NULL;
      }
  
@@ -641,10 +790,10 @@  int __init amd_iommu_free_intremap_table
  void *__init amd_iommu_alloc_intremap_table(
      const struct amd_iommu *iommu, unsigned long **inuse_map)
  {
-    void *tb;
-    tb = __alloc_amd_iommu_tables(INTREMAP_TABLE_ORDER);
+    void *tb = __alloc_amd_iommu_tables(intremap_table_order(iommu));
+
      BUG_ON(tb == NULL);
-    memset(tb, 0, PAGE_SIZE * (1UL << INTREMAP_TABLE_ORDER));
+    memset(tb, 0, PAGE_SIZE << intremap_table_order(iommu));
      *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(INTREMAP_ENTRIES));
      BUG_ON(*inuse_map == NULL);
      return tb;
@@ -685,18 +834,29 @@  int __init amd_setup_hpet_msi(struct msi
      return rc;
  }
  
-static void dump_intremap_table(const u32 *table)
+static void dump_intremap_table(const struct amd_iommu *iommu,
+                                union irte_cptr tbl)
  {
-    u32 count;
+    unsigned int count;
  
-    if ( !table )
+    if ( !tbl.ptr )
          return;
  
      for ( count = 0; count < INTREMAP_ENTRIES; count++ )
      {
-        if ( !table[count] )
-            continue;
-        printk("    IRTE[%03x] %08x\n", count, table[count]);
+        if ( iommu->ctrl.ga_en )
+        {
+            if ( !tbl.ptr128[count].raw[0] && !tbl.ptr128[count].raw[1] )
+                continue;
+            printk("    IRTE[%03x] %016lx_%016lx\n",
+                   count, tbl.ptr128[count].raw[1], tbl.ptr128[count].raw[0]);
+        }
+        else
+        {
+            if ( !tbl.ptr32[count].raw )
+                continue;
+            printk("    IRTE[%03x] %08x\n", count, tbl.ptr32[count].raw);
+        }
      }
  }
  
@@ -714,7 +874,7 @@  static int dump_intremap_mapping(const s
             PCI_FUNC(ivrs_mapping->dte_requestor_id));
  
      spin_lock_irqsave(&(ivrs_mapping->intremap_lock), flags);
-    dump_intremap_table(ivrs_mapping->intremap_table);
+    dump_intremap_table(iommu, ivrs_mapping->intremap_table);
      spin_unlock_irqrestore(&(ivrs_mapping->intremap_lock), flags);
  
      process_pending_softirqs();
@@ -733,6 +893,8 @@  static void dump_intremap_tables(unsigne
      printk("--- Dumping Shared IOMMU Interrupt Remapping Table ---\n");
  
      spin_lock_irqsave(&shared_intremap_lock, flags);
-    dump_intremap_table(shared_intremap_table);
+    dump_intremap_table(list_first_entry(&amd_iommu_head, struct amd_iommu,
+                                         list),
+                        shared_intremap_table);
      spin_unlock_irqrestore(&shared_intremap_lock, flags);
  }
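
Editor's note: the barrier discipline the patch applies when rewriting a
live 128-bit IRTE condenses to the sequence below (with "e" standing in
for an entry.ptr128 pointer and "irte" for the new value, both
hypothetical names); the invariant is that the IOMMU can never observe
RemapEn set on a half-written entry:

    ACCESS_ONCE(e->raw[0]) = 0;           /* 1. clear low half: RemapEn off   */
    smp_wmb();                            /* 2. clearing lands before step 3  */
    e->raw[1] = irte.raw[1];              /* 3. fill high half                */
    smp_wmb();                            /* 4. high half lands before step 5 */
    ACCESS_ONCE(e->raw[0]) = irte.raw[0]; /* 5. set low half: RemapEn on      */

free_intremap_entry() stops after step 3 with an all-zero value, since a
zeroed entry has RemapEn clear anyway.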