
[v5,00/10] AMD IOMMU: further improvements

Message ID 5a4d4a61-a543-c657-8458-cbc6b5a8a633@suse.com (mailing list archive)
State New, archived

Commit Message

Jan Beulich Aug. 6, 2019, 1:05 p.m. UTC
Only the first patch here is left from v4, everything else is new,
yet still related. The main goal is to reduce the huge memory
overhead that we've noticed. On the way there a number of other
things were once again noticed. Unfortunately, before I was able to
also test the last two patches, my Rome box broke again.
Hence these two patches have been tested on a (less affected)
Fam15 system only.

01: miscellaneous DTE handling adjustments
02: drop stray "else"
03: don't free shared IRT multiple times
04: introduce a "valid" flag for IVRS mappings
05: let callers of amd_iommu_alloc_intremap_table() handle errors
06: don't blindly allocate interrupt remapping tables
07: make phantom functions share interrupt remapping tables
08: x86/PCI: read MSI-X table entry count early
09: replace INTREMAP_ENTRIES
10: restrict interrupt remapping table sizes

Full set of patches once again attached here due to still unresolved
email issues over here.

Jan
AMD/IOMMU: miscellaneous DTE handling adjustments

First and foremost switch boolean fields to bool. Adjust a few related
function parameters as well. Then
- in amd_iommu_set_intremap_table() don't use literal numbers,
- in iommu_dte_add_device_entry() use a compound literal instead of many
  assignments (see the sketch after this list),
- in amd_iommu_setup_domain_device()
  - eliminate a pointless local variable,
  - use || instead of && when deciding whether to clear an entry,
  - clear the I field without any checking of ATS / IOTLB state,
- leave reserved fields unnamed.
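
As an illustration of the compound-literal point above, a minimal
self-contained sketch (the struct and field names here are simplified
stand-ins, not the real struct amd_iommu_dte layout):

#include <stdbool.h>

struct dte {                /* simplified stand-in for struct amd_iommu_dte */
    bool v;
    unsigned int sys_mgt:2;
    bool iv;
};

static void init_dte(struct dte *dte, bool valid, unsigned int sys_mgt)
{
    /*
     * One compound literal instead of a sequence of field assignments;
     * members not named in the initializer are implicitly zeroed.
     */
    *dte = (struct dte){
        .v = valid,
        .sys_mgt = sys_mgt,
    };
}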

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v5: IOMMU_INTREMAP_LENGTH -> IOMMU_INTREMAP_ORDER. Adjust comment.
v4: New.
AMD/IOMMU: drop stray "else"

The blank line between it and the prior if() clearly indicates that this
was meant to be a standalone if().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -166,8 +166,8 @@ static int __init iov_detect(void)
     if ( !iommu_enable && !iommu_intremap )
         return 0;
 
-    else if ( (init_done ? amd_iommu_init_interrupt()
-                         : amd_iommu_init(false)) != 0 )
+    if ( (init_done ? amd_iommu_init_interrupt()
+                    : amd_iommu_init(false)) != 0 )
     {
         printk("AMD-Vi: Error initialization\n");
         return -ENODEV;
AMD/IOMMU: don't free shared IRT multiple times

Calling amd_iommu_free_intremap_table() for every IVRS entry is correct
only in per-device-IRT mode. Use a NULL 2nd argument to indicate that
the shared table should be freed, and call the function exactly once in
shared mode.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1106,6 +1106,15 @@ static void __init amd_iommu_init_cleanu
 {
     struct amd_iommu *iommu, *next;
 
+    /* free interrupt remapping table */
+    if ( amd_iommu_perdev_intremap )
+        iterate_ivrs_entries(amd_iommu_free_intremap_table);
+    else if ( shared_intremap_table )
+        amd_iommu_free_intremap_table(list_first_entry(&amd_iommu_head,
+                                                       struct amd_iommu,
+                                                       list),
+                                      NULL);
+
     /* free amd iommu list */
     list_for_each_entry_safe ( iommu, next, &amd_iommu_head, list )
     {
@@ -1128,9 +1137,6 @@ static void __init amd_iommu_init_cleanu
         xfree(iommu);
     }
 
-    /* free interrupt remapping table */
-    iterate_ivrs_entries(amd_iommu_free_intremap_table);
-
     /* free device table */
     deallocate_device_table(&device_table);
 
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -792,14 +792,23 @@ void amd_iommu_read_msi_from_ire(
 int __init amd_iommu_free_intremap_table(
     const struct amd_iommu *iommu, struct ivrs_mappings *ivrs_mapping)
 {
-    void *tb = ivrs_mapping->intremap_table;
+    void **tblp;
 
-    XFREE(ivrs_mapping->intremap_inuse);
+    if ( ivrs_mapping )
+    {
+        XFREE(ivrs_mapping->intremap_inuse);
+        tblp = &ivrs_mapping->intremap_table;
+    }
+    else
+    {
+        XFREE(shared_intremap_inuse);
+        tblp = &shared_intremap_table;
+    }
 
-    if ( tb )
+    if ( *tblp )
     {
-        __free_amd_iommu_tables(tb, intremap_table_order(iommu));
-        ivrs_mapping->intremap_table = NULL;
+        __free_amd_iommu_tables(*tblp, intremap_table_order(iommu));
+        *tblp = NULL;
     }
 
     return 0;
AMD/IOMMU: introduce a "valid" flag for IVRS mappings

For us to no longer blindly allocate interrupt remapping tables for
everything the ACPI tables name, struct ivrs_mappings' intremap_table
field can't double as the "this entry is valid" indicator anymore. Add
a separate boolean field instead.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -88,6 +88,8 @@ static void __init add_ivrs_mapping_entr
          }
     }
 
+    ivrs_mappings[alias_id].valid = true;
+
     /* Assign IOMMU hardware. */
     ivrs_mappings[bdf].iommu = iommu;
 }
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1244,7 +1244,6 @@ static int __init amd_iommu_setup_device
     u16 seg, struct ivrs_mappings *ivrs_mappings)
 {
     unsigned int bdf;
-    void *intr_tb, *dte;
 
     BUG_ON( (ivrs_bdf_entries == 0) );
 
@@ -1264,16 +1263,17 @@ static int __init amd_iommu_setup_device
     /* Add device table entries */
     for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ )
     {
-        intr_tb = ivrs_mappings[bdf].intremap_table;
-
-        if ( intr_tb )
+        if ( ivrs_mappings[bdf].valid )
         {
+            void *dte;
+
             /* add device table entry */
             dte = device_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE);
             iommu_dte_add_device_entry(dte, &ivrs_mappings[bdf]);
 
             amd_iommu_set_intremap_table(
-                dte, (u64)virt_to_maddr(intr_tb), iommu_intremap);
+                dte, virt_to_maddr(ivrs_mappings[bdf].intremap_table),
+                iommu_intremap);
         }
     }
 
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -69,8 +69,8 @@ struct amd_iommu *find_iommu_for_device(
  * table and I/O page table respectively. Such devices will have
  * both alias entry and select entry in IVRS structure.
  *
- * Return original device id, if device has valid interrupt remapping
- * table setup for both select entry and alias entry.
+ * Return original device id if both the specific entry and the alias entry
+ * have been marked valid.
  */
 int get_dma_requestor_id(uint16_t seg, uint16_t bdf)
 {
@@ -79,8 +79,7 @@ int get_dma_requestor_id(uint16_t seg, u
 
     BUG_ON ( bdf >= ivrs_bdf_entries );
     req_id = ivrs_mappings[bdf].dte_requestor_id;
-    if ( (ivrs_mappings[bdf].intremap_table != NULL) &&
-         (ivrs_mappings[req_id].intremap_table != NULL) )
+    if ( ivrs_mappings[bdf].valid && ivrs_mappings[req_id].valid )
         req_id = bdf;
 
     return req_id;
--- a/xen/include/asm-x86/amd-iommu.h
+++ b/xen/include/asm-x86/amd-iommu.h
@@ -111,6 +111,7 @@ struct ivrs_mappings {
     u8 unity_map_enable;
     u8 write_permission;
     u8 read_permission;
+    bool valid;
     unsigned long addr_range_start;
     unsigned long addr_range_length;
     struct amd_iommu *iommu;
AMD/IOMMU: let callers of amd_iommu_alloc_intremap_table() handle errors

Additional users of the function will want to handle errors more
gracefully. Remove the BUG_ON()s and make the current caller panic()
instead.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -86,6 +86,10 @@ static void __init add_ivrs_mapping_entr
              ivrs_mappings[alias_id].intremap_table = shared_intremap_table;
              ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse;
          }
+
+         if ( !ivrs_mappings[alias_id].intremap_table )
+             panic("No memory for %04x:%02x:%02x.%u's IRT\n", iommu->seg,
+                   PCI_BUS(alias_id), PCI_SLOT(alias_id), PCI_FUNC(alias_id));
     }
 
     ivrs_mappings[alias_id].valid = true;
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -817,12 +817,22 @@ int __init amd_iommu_free_intremap_table
 void *__init amd_iommu_alloc_intremap_table(
     const struct amd_iommu *iommu, unsigned long **inuse_map)
 {
-    void *tb = __alloc_amd_iommu_tables(intremap_table_order(iommu));
+    unsigned int order = intremap_table_order(iommu);
+    void *tb = __alloc_amd_iommu_tables(order);
+
+    if ( tb )
+    {
+        *inuse_map = xzalloc_array(unsigned long,
+                                   BITS_TO_LONGS(INTREMAP_ENTRIES));
+        if ( *inuse_map )
+            memset(tb, 0, PAGE_SIZE << order);
+        else
+        {
+            __free_amd_iommu_tables(tb, order);
+            tb = NULL;
+        }
+    }
 
-    BUG_ON(tb == NULL);
-    memset(tb, 0, PAGE_SIZE << intremap_table_order(iommu));
-    *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(INTREMAP_ENTRIES));
-    BUG_ON(*inuse_map == NULL);
     return tb;
 }
AMD/IOMMU: don't blindly allocate interrupt remapping tables

ACPI tables are free to list far more device coordinates than there are
actual devices. By delaying the table allocations for most cases, and
doing them only when an actual device is known to be present at a given
position, overall memory used for the tables goes down from over 500k
pages to just over 1k ones.
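
For a rough sense of where these numbers come from, a back-of-the-envelope
calculation (assuming maximum-size tables of 2048 16-byte IRTEs, i.e. GA
mode, and IVRS entries covering the full 64k BDF space of a segment; the
device count below is only an illustrative assumption):

  2048 entries * 16 bytes        =  32 KiB = 8 pages per table
  65536 possible BDFs * 8 pages  = 524288 pages ("over 500k")
  ~128 present devices * 8 pages = ~1024 pages ("just over 1k")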

Tables continue to get allocated right away for special entries
(IO-APIC, HPET) as well as for alias IDs. While in the former case
that's simply because there may not be any device at a given position,
in the latter case this is to avoid having to introduce ref-counting of
table usage.

The change involves invoking
iterate_ivrs_mappings(amd_iommu_setup_device_table) a second time,
because the function now wants to be able to find PCI devices, which
isn't possible yet when IOMMU setup happens very early during x2APIC
mode setup. In this context amd_iommu_init_interrupt() gets renamed as
well.

The logic adjusting a DTE's interrupt remapping attributes also gets
changed, such that the lack of an IRT would result in target aborted
rather than not remapped interrupts (should any occur).

Note that for now phantom functions get separate IRTs allocated, as was
the case before.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.
---
TBD: This retains prior (but suspicious) behavior of not calling
     amd_iommu_set_intremap_table() for "invalid" IVRS mapping entries.
     Since DTE.IV=0 means un-remapped interrupts, I wonder if this needs
     changing.

Backporting note: This depends on b5fbe81196 "iommu / x86: move call to
scan_pci_devices() out of vendor code"!

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -53,7 +53,8 @@ union acpi_ivhd_device {
 };
 
 static void __init add_ivrs_mapping_entry(
-    u16 bdf, u16 alias_id, u8 flags, struct amd_iommu *iommu)
+    uint16_t bdf, uint16_t alias_id, uint8_t flags, bool alloc_irt,
+    struct amd_iommu *iommu)
 {
     struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(iommu->seg);
 
@@ -69,27 +70,32 @@ static void __init add_ivrs_mapping_entr
     if ( iommu->bdf == bdf )
         return;
 
-    if ( !ivrs_mappings[alias_id].intremap_table )
+    /* Allocate interrupt remapping table if needed. */
+    if ( iommu_intremap && !ivrs_mappings[alias_id].intremap_table )
     {
-         /* allocate per-device interrupt remapping table */
-         if ( amd_iommu_perdev_intremap )
-             ivrs_mappings[alias_id].intremap_table =
-                 amd_iommu_alloc_intremap_table(
-                     iommu,
-                     &ivrs_mappings[alias_id].intremap_inuse);
-         else
-         {
-             if ( shared_intremap_table == NULL  )
-                 shared_intremap_table = amd_iommu_alloc_intremap_table(
-                     iommu,
-                     &shared_intremap_inuse);
-             ivrs_mappings[alias_id].intremap_table = shared_intremap_table;
-             ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse;
-         }
-
-         if ( !ivrs_mappings[alias_id].intremap_table )
-             panic("No memory for %04x:%02x:%02x.%u's IRT\n", iommu->seg,
-                   PCI_BUS(alias_id), PCI_SLOT(alias_id), PCI_FUNC(alias_id));
+        if ( !amd_iommu_perdev_intremap )
+        {
+            if ( !shared_intremap_table )
+                shared_intremap_table = amd_iommu_alloc_intremap_table(
+                    iommu, &shared_intremap_inuse);
+
+            if ( !shared_intremap_table )
+                panic("No memory for shared IRT\n");
+
+            ivrs_mappings[alias_id].intremap_table = shared_intremap_table;
+            ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse;
+        }
+        else if ( alloc_irt )
+        {
+            ivrs_mappings[alias_id].intremap_table =
+                amd_iommu_alloc_intremap_table(
+                    iommu, &ivrs_mappings[alias_id].intremap_inuse);
+
+            if ( !ivrs_mappings[alias_id].intremap_table )
+                panic("No memory for %04x:%02x:%02x.%u's IRT\n",
+                      iommu->seg, PCI_BUS(alias_id), PCI_SLOT(alias_id),
+                      PCI_FUNC(alias_id));
+        }
     }
 
     ivrs_mappings[alias_id].valid = true;
@@ -433,7 +439,8 @@ static u16 __init parse_ivhd_device_sele
         return 0;
     }
 
-    add_ivrs_mapping_entry(bdf, bdf, select->header.data_setting, iommu);
+    add_ivrs_mapping_entry(bdf, bdf, select->header.data_setting, false,
+                           iommu);
 
     return sizeof(*select);
 }
@@ -479,7 +486,7 @@ static u16 __init parse_ivhd_device_rang
 
     for ( bdf = first_bdf; bdf <= last_bdf; bdf++ )
         add_ivrs_mapping_entry(bdf, bdf, range->start.header.data_setting,
-                               iommu);
+                               false, iommu);
 
     return dev_length;
 }
@@ -513,7 +520,8 @@ static u16 __init parse_ivhd_device_alia
 
     AMD_IOMMU_DEBUG(" Dev_Id Alias: %#x\n", alias_id);
 
-    add_ivrs_mapping_entry(bdf, alias_id, alias->header.data_setting, iommu);
+    add_ivrs_mapping_entry(bdf, alias_id, alias->header.data_setting, true,
+                           iommu);
 
     return dev_length;
 }
@@ -568,7 +576,7 @@ static u16 __init parse_ivhd_device_alia
 
     for ( bdf = first_bdf; bdf <= last_bdf; bdf++ )
         add_ivrs_mapping_entry(bdf, alias_id, range->alias.header.data_setting,
-                               iommu);
+                               true, iommu);
 
     return dev_length;
 }
@@ -593,7 +601,7 @@ static u16 __init parse_ivhd_device_exte
         return 0;
     }
 
-    add_ivrs_mapping_entry(bdf, bdf, ext->header.data_setting, iommu);
+    add_ivrs_mapping_entry(bdf, bdf, ext->header.data_setting, false, iommu);
 
     return dev_length;
 }
@@ -640,7 +648,7 @@ static u16 __init parse_ivhd_device_exte
 
     for ( bdf = first_bdf; bdf <= last_bdf; bdf++ )
         add_ivrs_mapping_entry(bdf, bdf, range->extended.header.data_setting,
-                               iommu);
+                               false, iommu);
 
     return dev_length;
 }
@@ -733,7 +741,8 @@ static u16 __init parse_ivhd_device_spec
     AMD_IOMMU_DEBUG("IVHD Special: %04x:%02x:%02x.%u variety %#x handle %#x\n",
                     seg, PCI_BUS(bdf), PCI_SLOT(bdf), PCI_FUNC(bdf),
                     special->variety, special->handle);
-    add_ivrs_mapping_entry(bdf, bdf, special->header.data_setting, iommu);
+    add_ivrs_mapping_entry(bdf, bdf, special->header.data_setting, true,
+                           iommu);
 
     switch ( special->variety )
     {
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -30,6 +30,7 @@
 #include <xen/delay.h>
 
 static int __initdata nr_amd_iommus;
+static bool __initdata pci_init;
 
 static void do_amd_iommu_irq(unsigned long data);
 static DECLARE_SOFTIRQ_TASKLET(amd_iommu_irq_tasklet, do_amd_iommu_irq, 0);
@@ -1247,17 +1248,20 @@ static int __init amd_iommu_setup_device
 
     BUG_ON( (ivrs_bdf_entries == 0) );
 
-    /* allocate 'device table' on a 4K boundary */
-    device_table.alloc_size = PAGE_SIZE <<
-                              get_order_from_bytes(
-                              PAGE_ALIGN(ivrs_bdf_entries *
-                              IOMMU_DEV_TABLE_ENTRY_SIZE));
-    device_table.entries = device_table.alloc_size /
-                           IOMMU_DEV_TABLE_ENTRY_SIZE;
-
-    device_table.buffer = allocate_buffer(device_table.alloc_size,
-                                          "Device Table");
-    if  ( device_table.buffer == NULL )
+    if ( !device_table.buffer )
+    {
+        /* allocate 'device table' on a 4K boundary */
+        device_table.alloc_size = PAGE_SIZE <<
+                                  get_order_from_bytes(
+                                  PAGE_ALIGN(ivrs_bdf_entries *
+                                  IOMMU_DEV_TABLE_ENTRY_SIZE));
+        device_table.entries = device_table.alloc_size /
+                               IOMMU_DEV_TABLE_ENTRY_SIZE;
+
+        device_table.buffer = allocate_buffer(device_table.alloc_size,
+                                              "Device Table");
+    }
+    if ( !device_table.buffer )
         return -ENOMEM;
 
     /* Add device table entries */
@@ -1266,13 +1270,46 @@ static int __init amd_iommu_setup_device
         if ( ivrs_mappings[bdf].valid )
         {
             void *dte;
+            const struct pci_dev *pdev = NULL;
 
             /* add device table entry */
             dte = device_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE);
             iommu_dte_add_device_entry(dte, &ivrs_mappings[bdf]);
 
+            if ( iommu_intremap &&
+                 ivrs_mappings[bdf].dte_requestor_id == bdf &&
+                 !ivrs_mappings[bdf].intremap_table )
+            {
+                if ( !pci_init )
+                    continue;
+                pcidevs_lock();
+                pdev = pci_get_pdev(seg, PCI_BUS(bdf), PCI_DEVFN2(bdf));
+                pcidevs_unlock();
+            }
+
+            if ( pdev )
+            {
+                unsigned int req_id = bdf;
+
+                do {
+                    ivrs_mappings[req_id].intremap_table =
+                        amd_iommu_alloc_intremap_table(
+                            ivrs_mappings[bdf].iommu,
+                            &ivrs_mappings[req_id].intremap_inuse);
+                    if ( !ivrs_mappings[req_id].intremap_table )
+                        return -ENOMEM;
+
+                    if ( !pdev->phantom_stride )
+                        break;
+                    req_id += pdev->phantom_stride;
+                } while ( PCI_SLOT(req_id) == pdev->sbdf.dev );
+            }
+
             amd_iommu_set_intremap_table(
-                dte, virt_to_maddr(ivrs_mappings[bdf].intremap_table),
+                dte,
+                ivrs_mappings[bdf].intremap_table
+                ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
+                : 0,
                 iommu_intremap);
         }
     }
@@ -1405,7 +1442,8 @@ int __init amd_iommu_init(bool xt)
     if ( rc )
         goto error_out;
 
-    /* allocate and initialize a global device table shared by all iommus */
+    /* Allocate and initialize device table(s). */
+    pci_init = !xt;
     rc = iterate_ivrs_mappings(amd_iommu_setup_device_table);
     if ( rc )
         goto error_out;
@@ -1424,7 +1462,7 @@ int __init amd_iommu_init(bool xt)
         /*
          * Setting up of the IOMMU interrupts cannot occur yet at the (very
          * early) time we get here when enabling x2APIC mode. Suppress it
-         * here, and do it explicitly in amd_iommu_init_interrupt().
+         * here, and do it explicitly in amd_iommu_init_late().
          */
         rc = amd_iommu_init_one(iommu, !xt);
         if ( rc )
@@ -1438,11 +1476,16 @@ error_out:
     return rc;
 }
 
-int __init amd_iommu_init_interrupt(void)
+int __init amd_iommu_init_late(void)
 {
     struct amd_iommu *iommu;
     int rc = 0;
 
+    /* Further initialize the device table(s). */
+    pci_init = true;
+    if ( iommu_intremap )
+        rc = iterate_ivrs_mappings(amd_iommu_setup_device_table);
+
     for_each_amd_iommu ( iommu )
     {
         struct irq_desc *desc;
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -789,7 +789,7 @@ void amd_iommu_read_msi_from_ire(
     }
 }
 
-int __init amd_iommu_free_intremap_table(
+int amd_iommu_free_intremap_table(
     const struct amd_iommu *iommu, struct ivrs_mappings *ivrs_mapping)
 {
     void **tblp;
@@ -814,7 +814,7 @@ int __init amd_iommu_free_intremap_table
     return 0;
 }
 
-void *__init amd_iommu_alloc_intremap_table(
+void *amd_iommu_alloc_intremap_table(
     const struct amd_iommu *iommu, unsigned long **inuse_map)
 {
     unsigned int order = intremap_table_order(iommu);
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -116,8 +116,9 @@ void __init amd_iommu_set_intremap_table
     struct amd_iommu_dte *dte, uint64_t intremap_ptr, bool valid)
 {
     dte->it_root = intremap_ptr >> 6;
-    dte->int_tab_len = IOMMU_INTREMAP_ORDER;
-    dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED;
+    dte->int_tab_len = intremap_ptr ? IOMMU_INTREMAP_ORDER : 0;
+    dte->int_ctl = intremap_ptr ? IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED
+                                : IOMMU_DEV_TABLE_INT_CONTROL_ABORTED;
     dte->ig = false; /* unmapped interrupts result in i/o page faults */
     dte->iv = valid;
 }
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -165,7 +165,7 @@ static int __init iov_detect(void)
     if ( !iommu_enable && !iommu_intremap )
         return 0;
 
-    if ( (init_done ? amd_iommu_init_interrupt()
+    if ( (init_done ? amd_iommu_init_late()
                     : amd_iommu_init(false)) != 0 )
     {
         printk("AMD-Vi: Error initialization\n");
@@ -429,6 +429,7 @@ static int amd_iommu_add_device(u8 devfn
 {
     struct amd_iommu *iommu;
     u16 bdf;
+    struct ivrs_mappings *ivrs_mappings;
 
     if ( !pdev->domain )
         return -EINVAL;
@@ -458,6 +459,32 @@ static int amd_iommu_add_device(u8 devfn
         return -ENODEV;
     }
 
+    ivrs_mappings = get_ivrs_mappings(pdev->seg);
+    bdf = PCI_BDF2(pdev->bus, devfn);
+    if ( !ivrs_mappings ||
+         !ivrs_mappings[ivrs_mappings[bdf].dte_requestor_id].valid )
+        return -EPERM;
+
+    if ( iommu_intremap &&
+         ivrs_mappings[bdf].dte_requestor_id == bdf &&
+         !ivrs_mappings[bdf].intremap_table )
+    {
+        ivrs_mappings[bdf].intremap_table =
+            amd_iommu_alloc_intremap_table(
+                iommu, &ivrs_mappings[bdf].intremap_inuse);
+        if ( !ivrs_mappings[bdf].intremap_table )
+            return -ENOMEM;
+
+        amd_iommu_set_intremap_table(
+            iommu->dev_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE),
+            ivrs_mappings[bdf].intremap_table
+            ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
+            : 0,
+            iommu_intremap);
+
+        amd_iommu_flush_device(iommu, bdf);
+    }
+
     amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
     return 0;
 }
@@ -466,6 +493,8 @@ static int amd_iommu_remove_device(u8 de
 {
     struct amd_iommu *iommu;
     u16 bdf;
+    struct ivrs_mappings *ivrs_mappings;
+
     if ( !pdev->domain )
         return -EINVAL;
 
@@ -481,6 +510,14 @@ static int amd_iommu_remove_device(u8 de
     }
 
     amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev);
+
+    ivrs_mappings = get_ivrs_mappings(pdev->seg);
+    bdf = PCI_BDF2(pdev->bus, devfn);
+    if ( amd_iommu_perdev_intremap &&
+         ivrs_mappings[bdf].dte_requestor_id == bdf &&
+         ivrs_mappings[bdf].intremap_table )
+        amd_iommu_free_intremap_table(iommu, &ivrs_mappings[bdf]);
+
     return 0;
 }
 
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -50,7 +50,7 @@ void get_iommu_features(struct amd_iommu
 /* amd-iommu-init functions */
 int amd_iommu_prepare(bool xt);
 int amd_iommu_init(bool xt);
-int amd_iommu_init_interrupt(void);
+int amd_iommu_init_late(void);
 int amd_iommu_update_ivrs_mapping_acpi(void);
 int iov_adjust_irq_affinities(void);
AMD/IOMMU: make phantom functions share interrupt remapping tables

Rather than duplicating entries in amd_iommu_msi_msg_update_ire(), share
the tables. This mainly requires some care while freeing them, to avoid
freeing memory blocks twice.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1114,7 +1114,7 @@ static void __init amd_iommu_init_cleanu
         amd_iommu_free_intremap_table(list_first_entry(&amd_iommu_head,
                                                        struct amd_iommu,
                                                        list),
-                                      NULL);
+                                      NULL, 0);
 
     /* free amd iommu list */
     list_for_each_entry_safe ( iommu, next, &amd_iommu_head, list )
@@ -1179,7 +1179,7 @@ int iterate_ivrs_mappings(int (*handler)
 }
 
 int iterate_ivrs_entries(int (*handler)(const struct amd_iommu *,
-                                        struct ivrs_mappings *))
+                                        struct ivrs_mappings *, uint16_t bdf))
 {
     u16 seg = 0;
     int rc = 0;
@@ -1196,7 +1196,7 @@ int iterate_ivrs_entries(int (*handler)(
             const struct amd_iommu *iommu = map[bdf].iommu;
 
             if ( iommu && map[bdf].dte_requestor_id == bdf )
-                rc = handler(iommu, &map[bdf]);
+                rc = handler(iommu, &map[bdf], bdf);
         }
     } while ( !rc && ++seg );
 
@@ -1289,20 +1289,29 @@ static int __init amd_iommu_setup_device
 
             if ( pdev )
             {
-                unsigned int req_id = bdf;
-
-                do {
-                    ivrs_mappings[req_id].intremap_table =
-                        amd_iommu_alloc_intremap_table(
-                            ivrs_mappings[bdf].iommu,
-                            &ivrs_mappings[req_id].intremap_inuse);
-                    if ( !ivrs_mappings[req_id].intremap_table )
-                        return -ENOMEM;
-
-                    if ( !pdev->phantom_stride )
-                        break;
-                    req_id += pdev->phantom_stride;
-                } while ( PCI_SLOT(req_id) == pdev->sbdf.dev );
+                ivrs_mappings[bdf].intremap_table =
+                    amd_iommu_alloc_intremap_table(
+                        ivrs_mappings[bdf].iommu,
+                        &ivrs_mappings[bdf].intremap_inuse);
+                if ( !ivrs_mappings[bdf].intremap_table )
+                    return -ENOMEM;
+
+                if ( pdev->phantom_stride )
+                {
+                    unsigned int req_id = bdf;
+
+                    for ( ; ; )
+                    {
+                        req_id += pdev->phantom_stride;
+                        if ( PCI_SLOT(req_id) != pdev->sbdf.dev )
+                            break;
+
+                        ivrs_mappings[req_id].intremap_table =
+                            ivrs_mappings[bdf].intremap_table;
+                        ivrs_mappings[req_id].intremap_inuse =
+                            ivrs_mappings[bdf].intremap_inuse;
+                    }
+                }
             }
 
             amd_iommu_set_intremap_table(
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -711,33 +711,20 @@ int amd_iommu_msi_msg_update_ire(
 
     if ( msi_desc->remap_index >= 0 && !msg )
     {
-        do {
-            update_intremap_entry_from_msi_msg(iommu, bdf, nr,
-                                               &msi_desc->remap_index,
-                                               NULL, NULL);
-            if ( !pdev || !pdev->phantom_stride )
-                break;
-            bdf += pdev->phantom_stride;
-        } while ( PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) );
+        update_intremap_entry_from_msi_msg(iommu, bdf, nr,
+                                           &msi_desc->remap_index,
+                                           NULL, NULL);
 
         for ( i = 0; i < nr; ++i )
             msi_desc[i].remap_index = -1;
-        if ( pdev )
-            bdf = PCI_BDF2(pdev->bus, pdev->devfn);
     }
 
     if ( !msg )
         return 0;
 
-    do {
-        rc = update_intremap_entry_from_msi_msg(iommu, bdf, nr,
-                                                &msi_desc->remap_index,
-                                                msg, &data);
-        if ( rc || !pdev || !pdev->phantom_stride )
-            break;
-        bdf += pdev->phantom_stride;
-    } while ( PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) );
-
+    rc = update_intremap_entry_from_msi_msg(iommu, bdf, nr,
+                                            &msi_desc->remap_index,
+                                            msg, &data);
     if ( !rc )
     {
         for ( i = 1; i < nr; ++i )
@@ -790,12 +777,27 @@ void amd_iommu_read_msi_from_ire(
 }
 
 int amd_iommu_free_intremap_table(
-    const struct amd_iommu *iommu, struct ivrs_mappings *ivrs_mapping)
+    const struct amd_iommu *iommu, struct ivrs_mappings *ivrs_mapping,
+    uint16_t bdf)
 {
     void **tblp;
 
     if ( ivrs_mapping )
     {
+        unsigned int i;
+
+        /*
+         * PCI device phantom functions use the same tables as their "base"
+         * function: Look ahead to zap the pointers.
+         */
+        for ( i = 1; PCI_FUNC(bdf + i) && bdf + i < ivrs_bdf_entries; ++i )
+            if ( ivrs_mapping[i].intremap_table ==
+                 ivrs_mapping->intremap_table )
+            {
+                ivrs_mapping[i].intremap_table = NULL;
+                ivrs_mapping[i].intremap_inuse = NULL;
+            }
+
         XFREE(ivrs_mapping->intremap_inuse);
         tblp = &ivrs_mapping->intremap_table;
     }
@@ -934,7 +936,8 @@ static void dump_intremap_table(const st
 }
 
 static int dump_intremap_mapping(const struct amd_iommu *iommu,
-                                 struct ivrs_mappings *ivrs_mapping)
+                                 struct ivrs_mappings *ivrs_mapping,
+                                 uint16_t unused)
 {
     unsigned long flags;
 
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -516,7 +516,7 @@ static int amd_iommu_remove_device(u8 de
     if ( amd_iommu_perdev_intremap &&
          ivrs_mappings[bdf].dte_requestor_id == bdf &&
          ivrs_mappings[bdf].intremap_table )
-        amd_iommu_free_intremap_table(iommu, &ivrs_mappings[bdf]);
+        amd_iommu_free_intremap_table(iommu, &ivrs_mappings[bdf], bdf);
 
     return 0;
 }
--- a/xen/include/asm-x86/amd-iommu.h
+++ b/xen/include/asm-x86/amd-iommu.h
@@ -131,7 +131,7 @@ extern u8 ivhd_type;
 struct ivrs_mappings *get_ivrs_mappings(u16 seg);
 int iterate_ivrs_mappings(int (*)(u16 seg, struct ivrs_mappings *));
 int iterate_ivrs_entries(int (*)(const struct amd_iommu *,
-                                 struct ivrs_mappings *));
+                                 struct ivrs_mappings *, uint16_t));
 
 /* iommu tables in guest space */
 struct mmio_reg {
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -101,7 +101,7 @@ int amd_iommu_setup_ioapic_remapping(voi
 void *amd_iommu_alloc_intremap_table(
     const struct amd_iommu *, unsigned long **);
 int amd_iommu_free_intremap_table(
-    const struct amd_iommu *, struct ivrs_mappings *);
+    const struct amd_iommu *, struct ivrs_mappings *, uint16_t);
 void amd_iommu_ioapic_update_ire(
     unsigned int apic, unsigned int reg, unsigned int value);
 unsigned int amd_iommu_read_ioapic_from_ire(
x86/PCI: read MSI-X table entry count early

Rather than doing this every time we set up interrupts for a device
anew (and then in two distinct places), fill this invariant field
right after allocating struct arch_msix.

While at it also obtain the MSI-X capability structure position just
once, in msix_capability_init(), rather than in each caller.

Furthermore take the opportunity and eliminate the multi_msix_capable()
alias of msix_table_size().
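
A minimal sketch of the invariant being cached (self-contained; the real
code uses pci_conf_read16() and the msix_table_size() macro): the MSI-X
Message Control register encodes the table size as N - 1, so the entry
count can be computed once when struct arch_msix is allocated.

#include <stdint.h>

#define PCI_MSIX_FLAGS_QSIZE  0x07ff   /* Table Size field, encoded as N - 1 */

/*
 * msg_ctrl is the MSI-X Message Control register, read from config space
 * at the capability offset + 2.  The result is invariant for a device, so
 * it can be stored as the entry count at allocation time instead of being
 * re-read on every interrupt setup.
 */
static inline unsigned int msix_entry_count(uint16_t msg_ctrl)
{
    return (msg_ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
}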

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -821,10 +821,8 @@ static u64 read_pci_mem_bar(u16 seg, u8
  * requested MSI-X entries with allocated irqs or non-zero for otherwise.
  **/
 static int msix_capability_init(struct pci_dev *dev,
-                                unsigned int pos,
                                 struct msi_info *msi,
-                                struct msi_desc **desc,
-                                unsigned int nr_entries)
+                                struct msi_desc **desc)
 {
     struct arch_msix *msix = dev->msix;
     struct msi_desc *entry = NULL;
@@ -838,6 +836,11 @@ static int msix_capability_init(struct p
     u8 slot = PCI_SLOT(dev->devfn);
     u8 func = PCI_FUNC(dev->devfn);
     bool maskall = msix->host_maskall;
+    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
+                                           PCI_CAP_ID_MSIX);
+
+    if ( !pos )
+        return -ENODEV;
 
     ASSERT(pcidevs_locked());
 
@@ -912,10 +915,9 @@ static int msix_capability_init(struct p
         u64 pba_paddr;
         u32 pba_offset;
 
-        msix->nr_entries = nr_entries;
         msix->table.first = PFN_DOWN(table_paddr);
         msix->table.last = PFN_DOWN(table_paddr +
-                                    nr_entries * PCI_MSIX_ENTRY_SIZE - 1);
+                                    msix->nr_entries * PCI_MSIX_ENTRY_SIZE - 1);
         WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->table.first,
                                         msix->table.last));
 
@@ -927,7 +929,7 @@ static int msix_capability_init(struct p
 
         msix->pba.first = PFN_DOWN(pba_paddr);
         msix->pba.last = PFN_DOWN(pba_paddr +
-                                  BITS_TO_LONGS(nr_entries) - 1);
+                                  BITS_TO_LONGS(msix->nr_entries) - 1);
         WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->pba.first,
                                         msix->pba.last));
     }
@@ -999,7 +1001,6 @@ static int msix_capability_init(struct p
             /* XXX How to deal with existing mappings? */
         }
     }
-    WARN_ON(msix->nr_entries != nr_entries);
     WARN_ON(msix->table.first != (table_paddr >> PAGE_SHIFT));
     ++msix->used_entries;
 
@@ -1093,22 +1094,17 @@ static void __pci_disable_msi(struct msi
  **/
 static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
 {
-    int pos, nr_entries;
     struct pci_dev *pdev;
-    u16 control;
     u8 slot = PCI_SLOT(msi->devfn);
     u8 func = PCI_FUNC(msi->devfn);
     struct msi_desc *old_desc;
 
     ASSERT(pcidevs_locked());
     pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
-    pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
-    if ( !pdev || !pos )
+    if ( !pdev || !pdev->msix )
         return -ENODEV;
 
-    control = pci_conf_read16(pdev->sbdf, msix_control_reg(pos));
-    nr_entries = multi_msix_capable(control);
-    if ( msi->entry_nr >= nr_entries )
+    if ( msi->entry_nr >= pdev->msix->nr_entries )
         return -EINVAL;
 
     old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX);
@@ -1127,7 +1123,7 @@ static int __pci_enable_msix(struct msi_
         __pci_disable_msi(old_desc);
     }
 
-    return msix_capability_init(pdev, pos, msi, desc, nr_entries);
+    return msix_capability_init(pdev, msi, desc);
 }
 
 static void _pci_cleanup_msix(struct arch_msix *msix)
@@ -1187,16 +1183,10 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
 {
     int rc;
     struct pci_dev *pdev;
-    u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
-    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
-                                           PCI_CAP_ID_MSIX);
 
     if ( !use_msi )
         return 0;
 
-    if ( !pos )
-        return -ENODEV;
-
     pcidevs_lock();
     pdev = pci_get_pdev(seg, bus, devfn);
     if ( !pdev )
@@ -1209,13 +1199,7 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
         rc = 0;
     }
     else
-    {
-        uint16_t control = pci_conf_read16(PCI_SBDF3(seg, bus, devfn),
-                                           msix_control_reg(pos));
-
-        rc = msix_capability_init(pdev, pos, NULL, NULL,
-                                  multi_msix_capable(control));
-    }
+        rc = msix_capability_init(pdev, NULL, NULL);
     pcidevs_unlock();
 
     return rc;
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -324,6 +324,7 @@ static void apply_quirks(struct pci_dev
 static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
 {
     struct pci_dev *pdev;
+    unsigned int pos;
 
     list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
         if ( pdev->bus == bus && pdev->devfn == devfn )
@@ -339,10 +340,12 @@ static struct pci_dev *alloc_pdev(struct
     pdev->domain = NULL;
     INIT_LIST_HEAD(&pdev->msi_list);
 
-    if ( pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
-                             PCI_CAP_ID_MSIX) )
+    pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                              PCI_CAP_ID_MSIX);
+    if ( pos )
     {
         struct arch_msix *msix = xzalloc(struct arch_msix);
+        uint16_t ctrl;
 
         if ( !msix )
         {
@@ -350,6 +353,10 @@ static struct pci_dev *alloc_pdev(struct
             return NULL;
         }
         spin_lock_init(&msix->table_lock);
+
+        ctrl = pci_conf_read16(pdev->sbdf, msix_control_reg(pos));
+        msix->nr_entries = msix_table_size(ctrl);
+
         pdev->msix = msix;
     }
 
@@ -358,7 +365,6 @@ static struct pci_dev *alloc_pdev(struct
     /* update bus2bridge */
     switch ( pdev->type = pdev_type(pseg->nr, bus, devfn) )
     {
-        int pos;
         u16 cap;
         u8 sec_bus, sub_bus;
 
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -171,7 +171,6 @@ int msi_free_irq(struct msi_desc *entry)
 #define msix_enable(control)	 	control |= PCI_MSIX_FLAGS_ENABLE
 #define msix_disable(control)	 	control &= ~PCI_MSIX_FLAGS_ENABLE
 #define msix_table_size(control) 	((control & PCI_MSIX_FLAGS_QSIZE)+1)
-#define multi_msix_capable		msix_table_size
 #define msix_unmask(address)	 	(address & ~PCI_MSIX_VECTOR_BITMASK)
 #define msix_mask(address)		(address | PCI_MSIX_VECTOR_BITMASK)
AMD/IOMMU: replace INTREMAP_ENTRIES

Prepare for the number of entries to not be the maximum possible, by
separating checks against maximum size from ones against actual size.
For caller-side simplicity have alloc_intremap_entry() return the
maximum possible value upon allocation failure, rather than the first
just out-of-bounds one: callers keep checking indexes against the
maximum, so with smaller tables a failed allocation can't be mistaken
for a valid slot.

Have the involved functions take all the subsequently needed arguments
here already, to reduce code churn in the patch actually making the
allocation size dynamic.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -69,7 +69,7 @@ union irte_cptr {
     const union irte128 *ptr128;
 } __transparent__;
 
-#define INTREMAP_ENTRIES (1 << IOMMU_INTREMAP_ORDER)
+#define INTREMAP_MAX_ENTRIES (1 << IOMMU_INTREMAP_ORDER)
 
 struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS];
 struct hpet_sbdf hpet_sbdf;
@@ -83,8 +83,20 @@ static void dump_intremap_tables(unsigne
 static unsigned int __init intremap_table_order(const struct amd_iommu *iommu)
 {
     return iommu->ctrl.ga_en
-           ? get_order_from_bytes(INTREMAP_ENTRIES * sizeof(union irte128))
-           : get_order_from_bytes(INTREMAP_ENTRIES * sizeof(union irte32));
+           ? get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union irte128))
+           : get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union irte32));
+}
+
+unsigned int amd_iommu_intremap_table_order(
+    const void *irt, const struct amd_iommu *iommu)
+{
+    return IOMMU_INTREMAP_ORDER;
+}
+
+static unsigned int intremap_table_entries(
+    const void *irt, const struct amd_iommu *iommu)
+{
+    return 1u << amd_iommu_intremap_table_order(irt, iommu);
 }
 
 unsigned int ioapic_id_to_index(unsigned int apic_id)
@@ -122,20 +134,24 @@ static int get_intremap_requestor_id(int
     return get_ivrs_mappings(seg)[bdf].dte_requestor_id;
 }
 
-static unsigned int alloc_intremap_entry(int seg, int bdf, unsigned int nr)
+static unsigned int alloc_intremap_entry(const struct amd_iommu *iommu,
+                                         unsigned int bdf, unsigned int nr)
 {
-    unsigned long *inuse = get_ivrs_mappings(seg)[bdf].intremap_inuse;
-    unsigned int slot = find_first_zero_bit(inuse, INTREMAP_ENTRIES);
+    const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(iommu->seg);
+    unsigned long *inuse = ivrs_mappings[bdf].intremap_inuse;
+    unsigned int nr_ents =
+        intremap_table_entries(ivrs_mappings[bdf].intremap_table, iommu);
+    unsigned int slot = find_first_zero_bit(inuse, nr_ents);
 
     for ( ; ; )
     {
         unsigned int end;
 
-        if ( slot >= INTREMAP_ENTRIES )
+        if ( slot >= nr_ents )
             break;
-        end = find_next_bit(inuse, INTREMAP_ENTRIES, slot + 1);
-        if ( end > INTREMAP_ENTRIES )
-            end = INTREMAP_ENTRIES;
+        end = find_next_bit(inuse, nr_ents, slot + 1);
+        if ( end > nr_ents )
+            end = nr_ents;
         slot = (slot + nr - 1) & ~(nr - 1);
         if ( slot + nr <= end )
         {
@@ -144,12 +160,12 @@ static unsigned int alloc_intremap_entry
             break;
         }
         slot = (end + nr) & ~(nr - 1);
-        if ( slot >= INTREMAP_ENTRIES )
+        if ( slot >= nr_ents )
             break;
-        slot = find_next_zero_bit(inuse, INTREMAP_ENTRIES, slot);
+        slot = find_next_zero_bit(inuse, nr_ents, slot);
     }
 
-    return slot;
+    return slot < nr_ents ? slot : INTREMAP_MAX_ENTRIES;
 }
 
 static union irte_ptr get_intremap_entry(const struct amd_iommu *iommu,
@@ -159,7 +175,7 @@ static union irte_ptr get_intremap_entry
         .ptr = get_ivrs_mappings(iommu->seg)[bdf].intremap_table
     };
 
-    ASSERT(table.ptr && (index < INTREMAP_ENTRIES));
+    ASSERT(table.ptr && (index < intremap_table_entries(table.ptr, iommu)));
 
     if ( iommu->ctrl.ga_en )
         table.ptr128 += index;
@@ -279,10 +295,10 @@ static int update_intremap_entry_from_io
     spin_lock_irqsave(lock, flags);
 
     offset = *index;
-    if ( offset >= INTREMAP_ENTRIES )
+    if ( offset >= INTREMAP_MAX_ENTRIES )
     {
-        offset = alloc_intremap_entry(iommu->seg, req_id, 1);
-        if ( offset >= INTREMAP_ENTRIES )
+        offset = alloc_intremap_entry(iommu, req_id, 1);
+        if ( offset >= INTREMAP_MAX_ENTRIES )
         {
             spin_unlock_irqrestore(lock, flags);
             rte->mask = 1;
@@ -400,8 +416,8 @@ int __init amd_iommu_setup_ioapic_remapp
             }
 
             spin_lock_irqsave(lock, flags);
-            offset = alloc_intremap_entry(seg, req_id, 1);
-            BUG_ON(offset >= INTREMAP_ENTRIES);
+            offset = alloc_intremap_entry(iommu, req_id, 1);
+            BUG_ON(offset >= INTREMAP_MAX_ENTRIES);
             entry = get_intremap_entry(iommu, req_id, offset);
             update_intremap_entry(iommu, entry, vector,
                                   delivery_mode, dest_mode, dest);
@@ -476,7 +492,7 @@ void amd_iommu_ioapic_update_ire(
         *(((u32 *)&new_rte) + 1) = value;
     }
 
-    if ( ioapic_sbdf[idx].pin_2_idx[pin] >= INTREMAP_ENTRIES )
+    if ( ioapic_sbdf[idx].pin_2_idx[pin] >= INTREMAP_MAX_ENTRIES )
     {
         ASSERT(saved_mask);
 
@@ -548,7 +564,7 @@ unsigned int amd_iommu_read_ioapic_from_
         return val;
 
     offset = ioapic_sbdf[idx].pin_2_idx[pin];
-    if ( offset >= INTREMAP_ENTRIES )
+    if ( offset >= INTREMAP_MAX_ENTRIES )
         return val;
 
     seg = ioapic_sbdf[idx].seg;
@@ -561,8 +577,8 @@ unsigned int amd_iommu_read_ioapic_from_
 
     if ( !(reg & 1) )
     {
-        ASSERT(offset == (val & (INTREMAP_ENTRIES - 1)));
-        val &= ~(INTREMAP_ENTRIES - 1);
+        ASSERT(offset == (val & (INTREMAP_MAX_ENTRIES - 1)));
+        val &= ~(INTREMAP_MAX_ENTRIES - 1);
         /* The IntType fields match for both formats. */
         val |= MASK_INSR(entry.ptr32->flds.int_type,
                          IO_APIC_REDIR_DELIV_MODE_MASK);
@@ -622,11 +638,11 @@ static int update_intremap_entry_from_ms
         dest = MASK_EXTR(msg->address_lo, MSI_ADDR_DEST_ID_MASK);
 
     offset = *remap_index;
-    if ( offset >= INTREMAP_ENTRIES )
+    if ( offset >= INTREMAP_MAX_ENTRIES )
     {
         ASSERT(nr);
-        offset = alloc_intremap_entry(iommu->seg, bdf, nr);
-        if ( offset >= INTREMAP_ENTRIES )
+        offset = alloc_intremap_entry(iommu, bdf, nr);
+        if ( offset >= INTREMAP_MAX_ENTRIES )
         {
             spin_unlock_irqrestore(lock, flags);
             return -ENOSPC;
@@ -654,7 +670,7 @@ static int update_intremap_entry_from_ms
     update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest);
     spin_unlock_irqrestore(lock, flags);
 
-    *data = (msg->data & ~(INTREMAP_ENTRIES - 1)) | offset;
+    *data = (msg->data & ~(INTREMAP_MAX_ENTRIES - 1)) | offset;
 
     /*
      * In some special cases, a pci-e device(e.g SATA controller in IDE mode)
@@ -738,7 +754,7 @@ int amd_iommu_msi_msg_update_ire(
 void amd_iommu_read_msi_from_ire(
     struct msi_desc *msi_desc, struct msi_msg *msg)
 {
-    unsigned int offset = msg->data & (INTREMAP_ENTRIES - 1);
+    unsigned int offset = msg->data & (INTREMAP_MAX_ENTRIES - 1);
     const struct pci_dev *pdev = msi_desc->dev;
     u16 bdf = pdev ? PCI_BDF2(pdev->bus, pdev->devfn) : hpet_sbdf.bdf;
     u16 seg = pdev ? pdev->seg : hpet_sbdf.seg;
@@ -758,7 +774,7 @@ void amd_iommu_read_msi_from_ire(
         offset |= nr;
     }
 
-    msg->data &= ~(INTREMAP_ENTRIES - 1);
+    msg->data &= ~(INTREMAP_MAX_ENTRIES - 1);
     /* The IntType fields match for both formats. */
     msg->data |= MASK_INSR(entry.ptr32->flds.int_type,
                            MSI_DATA_DELIVERY_MODE_MASK);
@@ -824,8 +840,9 @@ void *amd_iommu_alloc_intremap_table(
 
     if ( tb )
     {
-        *inuse_map = xzalloc_array(unsigned long,
-                                   BITS_TO_LONGS(INTREMAP_ENTRIES));
+        unsigned int nr = intremap_table_entries(tb, iommu);
+
+        *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(nr));
         if ( *inuse_map )
             memset(tb, 0, PAGE_SIZE << order);
         else
@@ -869,6 +886,7 @@ bool __init iov_supports_xt(void)
 
 int __init amd_setup_hpet_msi(struct msi_desc *msi_desc)
 {
+    const struct amd_iommu *iommu;
     spinlock_t *lock;
     unsigned long flags;
     int rc = 0;
@@ -886,12 +904,15 @@ int __init amd_setup_hpet_msi(struct msi
         return -ENODEV;
     }
 
+    iommu = find_iommu_for_device(hpet_sbdf.seg, hpet_sbdf.bdf);
+    if ( !iommu )
+        return -ENXIO;
+
     lock = get_intremap_lock(hpet_sbdf.seg, hpet_sbdf.bdf);
     spin_lock_irqsave(lock, flags);
 
-    msi_desc->remap_index = alloc_intremap_entry(hpet_sbdf.seg,
-                                                 hpet_sbdf.bdf, 1);
-    if ( msi_desc->remap_index >= INTREMAP_ENTRIES )
+    msi_desc->remap_index = alloc_intremap_entry(iommu, hpet_sbdf.bdf, 1);
+    if ( msi_desc->remap_index >= INTREMAP_MAX_ENTRIES )
     {
         msi_desc->remap_index = -1;
         rc = -ENXIO;
@@ -906,12 +927,12 @@ static void dump_intremap_table(const st
                                 union irte_cptr tbl,
                                 const struct ivrs_mappings *ivrs_mapping)
 {
-    unsigned int count;
+    unsigned int count, nr = intremap_table_entries(tbl.ptr, iommu);
 
     if ( !tbl.ptr )
         return;
 
-    for ( count = 0; count < INTREMAP_ENTRIES; count++ )
+    for ( count = 0; count < nr; count++ )
     {
         if ( iommu->ctrl.ga_en
              ? !tbl.ptr128[count].raw[0] && !tbl.ptr128[count].raw[1]
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -102,6 +102,8 @@ void *amd_iommu_alloc_intremap_table(
     const struct amd_iommu *, unsigned long **);
 int amd_iommu_free_intremap_table(
     const struct amd_iommu *, struct ivrs_mappings *, uint16_t);
+unsigned int amd_iommu_intremap_table_order(
+    const void *irt, const struct amd_iommu *iommu);
 void amd_iommu_ioapic_update_ire(
     unsigned int apic, unsigned int reg, unsigned int value);
 unsigned int amd_iommu_read_ioapic_from_ire(
AMD/IOMMU: restrict interrupt remapping table sizes

There's no point setting up tables with more space than a PCI device can
use. For both MSI and MSI-X we can determine how many interrupts could
be set up at most. Tables allocated during ACPI table parsing, however,
will (for now at least) continue to be set up to have maximum size.

Note that until we want to use sub-page allocations here, there's no
point checking whether MSI is supported by a device: 1 or up to 32 (or
actually 128, due to the change effectively using a reserved encoding)
IRTEs always mean an order-0 allocation anyway.
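
Rough size math behind that note (assuming 16-byte IRTEs, i.e. GA mode;
4-byte IRTEs only shrink the numbers further):

  32 MSI IRTEs  * 16 bytes =  512 bytes -> order 0
  128 IRTEs     * 16 bytes = 2048 bytes -> still order 0
  2048 IRTEs    * 16 bytes =  32 KiB    -> order 3 (the previous fixed size)

The DTE's IntTabLen field holds log2(number of entries), which is why the
new amd_iommu_intremap_table_order() below computes it as the allocation's
page order plus PAGE_SHIFT (log2 of the table size in bytes) minus log2 of
the IRTE size (4 with GA mode enabled, 2 without).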

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -77,7 +77,7 @@ static void __init add_ivrs_mapping_entr
         {
             if ( !shared_intremap_table )
                 shared_intremap_table = amd_iommu_alloc_intremap_table(
-                    iommu, &shared_intremap_inuse);
+                    iommu, &shared_intremap_inuse, 0);
 
             if ( !shared_intremap_table )
                 panic("No memory for shared IRT\n");
@@ -89,7 +89,7 @@ static void __init add_ivrs_mapping_entr
         {
             ivrs_mappings[alias_id].intremap_table =
                 amd_iommu_alloc_intremap_table(
-                    iommu, &ivrs_mappings[alias_id].intremap_inuse);
+                    iommu, &ivrs_mappings[alias_id].intremap_inuse, 0);
 
             if ( !ivrs_mappings[alias_id].intremap_table )
                 panic("No memory for %04x:%02x:%02x.%u's IRT\n",
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1292,7 +1292,9 @@ static int __init amd_iommu_setup_device
                 ivrs_mappings[bdf].intremap_table =
                     amd_iommu_alloc_intremap_table(
                         ivrs_mappings[bdf].iommu,
-                        &ivrs_mappings[bdf].intremap_inuse);
+                        &ivrs_mappings[bdf].intremap_inuse,
+                        pdev->msix ? pdev->msix->nr_entries
+                                   : multi_msi_capable(~0u));
                 if ( !ivrs_mappings[bdf].intremap_table )
                     return -ENOMEM;
 
@@ -1315,11 +1317,8 @@ static int __init amd_iommu_setup_device
             }
 
             amd_iommu_set_intremap_table(
-                dte,
-                ivrs_mappings[bdf].intremap_table
-                ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
-                : 0,
-                iommu_intremap);
+                dte, ivrs_mappings[bdf].intremap_table,
+                ivrs_mappings[bdf].iommu, iommu_intremap);
         }
     }
 
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -69,7 +69,8 @@ union irte_cptr {
     const union irte128 *ptr128;
 } __transparent__;
 
-#define INTREMAP_MAX_ENTRIES (1 << IOMMU_INTREMAP_ORDER)
+#define INTREMAP_MAX_ORDER   0xB
+#define INTREMAP_MAX_ENTRIES (1 << INTREMAP_MAX_ORDER)
 
 struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS];
 struct hpet_sbdf hpet_sbdf;
@@ -80,17 +81,13 @@ unsigned int nr_ioapic_sbdf;
 
 static void dump_intremap_tables(unsigned char key);
 
-static unsigned int __init intremap_table_order(const struct amd_iommu *iommu)
-{
-    return iommu->ctrl.ga_en
-           ? get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union irte128))
-           : get_order_from_bytes(INTREMAP_MAX_ENTRIES * sizeof(union irte32));
-}
+#define intremap_page_order(irt) PFN_ORDER(virt_to_page(irt))
 
 unsigned int amd_iommu_intremap_table_order(
     const void *irt, const struct amd_iommu *iommu)
 {
-    return IOMMU_INTREMAP_ORDER;
+    return intremap_page_order(irt) + PAGE_SHIFT -
+           (iommu->ctrl.ga_en ? 4 : 2);
 }
 
 static unsigned int intremap_table_entries(
@@ -825,7 +822,10 @@ int amd_iommu_free_intremap_table(
 
     if ( *tblp )
     {
-        __free_amd_iommu_tables(*tblp, intremap_table_order(iommu));
+        unsigned int order = intremap_page_order(*tblp);
+
+        intremap_page_order(*tblp) = 0;
+        __free_amd_iommu_tables(*tblp, order);
         *tblp = NULL;
     }
 
@@ -833,15 +833,23 @@ int amd_iommu_free_intremap_table(
 }
 
 void *amd_iommu_alloc_intremap_table(
-    const struct amd_iommu *iommu, unsigned long **inuse_map)
+    const struct amd_iommu *iommu, unsigned long **inuse_map, unsigned int nr)
 {
-    unsigned int order = intremap_table_order(iommu);
-    void *tb = __alloc_amd_iommu_tables(order);
+    unsigned int order;
+    void *tb;
 
+    if ( !nr )
+        nr = INTREMAP_MAX_ENTRIES;
+
+    order = iommu->ctrl.ga_en
+            ? get_order_from_bytes(nr * sizeof(union irte128))
+            : get_order_from_bytes(nr * sizeof(union irte32));
+
+    tb = __alloc_amd_iommu_tables(order);
     if ( tb )
     {
-        unsigned int nr = intremap_table_entries(tb, iommu);
-
+        intremap_page_order(tb) = order;
+        nr = intremap_table_entries(tb, iommu);
         *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(nr));
         if ( *inuse_map )
             memset(tb, 0, PAGE_SIZE << order);
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -113,12 +113,22 @@ void amd_iommu_set_root_page_table(struc
 }
 
 void __init amd_iommu_set_intremap_table(
-    struct amd_iommu_dte *dte, uint64_t intremap_ptr, bool valid)
+    struct amd_iommu_dte *dte, const void *ptr,
+    const struct amd_iommu *iommu, bool valid)
 {
-    dte->it_root = intremap_ptr >> 6;
-    dte->int_tab_len = intremap_ptr ? IOMMU_INTREMAP_ORDER : 0;
-    dte->int_ctl = intremap_ptr ? IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED
-                                : IOMMU_DEV_TABLE_INT_CONTROL_ABORTED;
+    if ( ptr )
+    {
+        dte->it_root = virt_to_maddr(ptr) >> 6;
+        dte->int_tab_len = amd_iommu_intremap_table_order(ptr, iommu);
+        dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED;
+    }
+    else
+    {
+        dte->it_root = 0;
+        dte->int_tab_len = 0;
+        dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_ABORTED;
+    }
+
     dte->ig = false; /* unmapped interrupts result in i/o page faults */
     dte->iv = valid;
 }
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -471,16 +471,15 @@ static int amd_iommu_add_device(u8 devfn
     {
         ivrs_mappings[bdf].intremap_table =
             amd_iommu_alloc_intremap_table(
-                iommu, &ivrs_mappings[bdf].intremap_inuse);
+                iommu, &ivrs_mappings[bdf].intremap_inuse,
+                pdev->msix ? pdev->msix->nr_entries
+                           : multi_msi_capable(~0u));
         if ( !ivrs_mappings[bdf].intremap_table )
             return -ENOMEM;
 
         amd_iommu_set_intremap_table(
             iommu->dev_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE),
-            ivrs_mappings[bdf].intremap_table
-            ? virt_to_maddr(ivrs_mappings[bdf].intremap_table)
-            : 0,
-            iommu_intremap);
+            ivrs_mappings[bdf].intremap_table, iommu, iommu_intremap);
 
         amd_iommu_flush_device(iommu, bdf);
     }
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
@@ -107,9 +107,6 @@
 #define IOMMU_DEV_TABLE_INT_CONTROL_FORWARDED	0x1
 #define IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED	0x2
 
-/* For now, we always allocate the maximum: 2048 entries. */
-#define IOMMU_INTREMAP_ORDER			0xB
-
 struct amd_iommu_dte {
     /* 0 - 63 */
     bool v:1;
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -72,7 +72,8 @@ int __must_check amd_iommu_flush_iotlb_a
 /* device table functions */
 int get_dma_requestor_id(uint16_t seg, uint16_t bdf);
 void amd_iommu_set_intremap_table(struct amd_iommu_dte *dte,
-                                  uint64_t intremap_ptr,
+                                  const void *ptr,
+                                  const struct amd_iommu *iommu,
                                   bool valid);
 void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
 				   uint64_t root_ptr, uint16_t domain_id,
@@ -99,7 +100,7 @@ struct amd_iommu *find_iommu_for_device(
 bool iov_supports_xt(void);
 int amd_iommu_setup_ioapic_remapping(void);
 void *amd_iommu_alloc_intremap_table(
-    const struct amd_iommu *, unsigned long **);
+    const struct amd_iommu *, unsigned long **, unsigned int nr);
 int amd_iommu_free_intremap_table(
     const struct amd_iommu *, struct ivrs_mappings *, uint16_t);
 unsigned int amd_iommu_intremap_table_order(

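For reference, the value the reworked amd_iommu_intremap_table_order() yields is exactly what the DTE's 4-bit int_tab_len field encodes: log2 of the number of IRTEs, with the previously fixed 0xB standing for 2048 entries. A minimal standalone sketch of that relationship (illustrative only, not part of the patch; the PAGE_SHIFT value is assumed here):

    #include <stdbool.h>

    #ifndef PAGE_SHIFT
    # define PAGE_SHIFT 12                  /* x86 uses 4k pages */
    #endif

    /*
     * log2 of the IRTE count, recovered from an allocation's page order:
     * GA mode uses 16-byte entries (union irte128, log2 = 4), legacy mode
     * 4-byte ones (union irte32, log2 = 2).  A single page therefore holds
     * 256 entries with GA enabled and 1024 without.
     */
    static unsigned int intremap_entries_log2(unsigned int page_order, bool ga_en)
    {
        return page_order + PAGE_SHIFT - (ga_en ? 4 : 2);
    }
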
Comments

Woods, Brian Aug. 15, 2019, 10:30 p.m. UTC | #1
On Tue, Aug 06, 2019 at 03:05:36PM +0200, Jan Beulich wrote:

I don't think I have enough time here left to review these, but I've
tested them via PCI device passthrough on an AMD Rome system.

Patch

--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -69,8 +69,7 @@  union irte_cptr {
     const union irte128 *ptr128;
 } __transparent__;
 
-#define INTREMAP_LENGTH 0xB
-#define INTREMAP_ENTRIES (1 << INTREMAP_LENGTH)
+#define INTREMAP_ENTRIES (1 << IOMMU_INTREMAP_ORDER)
 
 struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS];
 struct hpet_sbdf hpet_sbdf;
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -101,51 +101,52 @@  static unsigned int set_iommu_pte_presen
 
 void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
                                    uint64_t root_ptr, uint16_t domain_id,
-                                   uint8_t paging_mode, uint8_t valid)
+                                   uint8_t paging_mode, bool valid)
 {
     dte->domain_id = domain_id;
     dte->pt_root = paddr_to_pfn(root_ptr);
-    dte->iw = 1;
-    dte->ir = 1;
+    dte->iw = true;
+    dte->ir = true;
     dte->paging_mode = paging_mode;
-    dte->tv = 1;
+    dte->tv = true;
     dte->v = valid;
 }
 
 void __init amd_iommu_set_intremap_table(
-    struct amd_iommu_dte *dte, uint64_t intremap_ptr, uint8_t int_valid)
+    struct amd_iommu_dte *dte, uint64_t intremap_ptr, bool valid)
 {
     dte->it_root = intremap_ptr >> 6;
-    dte->int_tab_len = 0xb; /* 2048 entries */
-    dte->int_ctl = 2; /* fixed and arbitrated interrupts remapped */
-    dte->ig = 0; /* unmapped interrupt results io page faults */
-    dte->iv = int_valid;
+    dte->int_tab_len = IOMMU_INTREMAP_ORDER;
+    dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED;
+    dte->ig = false; /* unmapped interrupts result in i/o page faults */
+    dte->iv = valid;
 }
 
 void __init iommu_dte_add_device_entry(struct amd_iommu_dte *dte,
-                                       struct ivrs_mappings *ivrs_dev)
+                                       const struct ivrs_mappings *ivrs_dev)
 {
     uint8_t flags = ivrs_dev->device_flags;
 
-    memset(dte, 0, sizeof(*dte));
-
-    dte->init_pass = MASK_EXTR(flags, ACPI_IVHD_INIT_PASS);
-    dte->ext_int_pass = MASK_EXTR(flags, ACPI_IVHD_EINT_PASS);
-    dte->nmi_pass = MASK_EXTR(flags, ACPI_IVHD_NMI_PASS);
-    dte->lint0_pass = MASK_EXTR(flags, ACPI_IVHD_LINT0_PASS);
-    dte->lint1_pass = MASK_EXTR(flags, ACPI_IVHD_LINT1_PASS);
-    dte->sys_mgt = MASK_EXTR(flags, ACPI_IVHD_SYSTEM_MGMT);
-    dte->ex = ivrs_dev->dte_allow_exclusion;
+    *dte = (struct amd_iommu_dte){
+        .init_pass = flags & ACPI_IVHD_INIT_PASS,
+        .ext_int_pass = flags & ACPI_IVHD_EINT_PASS,
+        .nmi_pass = flags & ACPI_IVHD_NMI_PASS,
+        .lint0_pass = flags & ACPI_IVHD_LINT0_PASS,
+        .lint1_pass = flags & ACPI_IVHD_LINT1_PASS,
+        .ioctl = IOMMU_DEV_TABLE_IO_CONTROL_ABORTED,
+        .sys_mgt = MASK_EXTR(flags, ACPI_IVHD_SYSTEM_MGMT),
+        .ex = ivrs_dev->dte_allow_exclusion,
+    };
 }
 
 void iommu_dte_set_guest_cr3(struct amd_iommu_dte *dte, uint16_t dom_id,
-                             uint64_t gcr3_mfn, uint8_t gv, uint8_t glx)
+                             uint64_t gcr3_mfn, bool gv, uint8_t glx)
 {
 #define GCR3_MASK(hi, lo) (((1ul << ((hi) + 1)) - 1) & ~((1ul << (lo)) - 1))
 #define GCR3_SHIFT(lo) ((lo) - PAGE_SHIFT)
 
     /* I bit must be set when gcr3 is enabled */
-    dte->i = 1;
+    dte->i = true;
 
     dte->gcr3_trp_14_12 = (gcr3_mfn & GCR3_MASK(14, 12)) >> GCR3_SHIFT(12);
     dte->gcr3_trp_30_15 = (gcr3_mfn & GCR3_MASK(30, 15)) >> GCR3_SHIFT(15);
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -93,7 +93,6 @@  static void amd_iommu_setup_domain_devic
     struct amd_iommu_dte *table, *dte;
     unsigned long flags;
     int req_id, valid = 1;
-    int dte_i = 0;
     u8 bus = pdev->bus;
     const struct domain_iommu *hd = dom_iommu(domain);
 
@@ -103,9 +102,6 @@  static void amd_iommu_setup_domain_devic
     if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
         valid = 0;
 
-    if ( ats_enabled )
-        dte_i = 1;
-
     /* get device-table entry */
     req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
     table = iommu->dev_table.buffer;
@@ -122,7 +118,7 @@  static void amd_iommu_setup_domain_devic
 
         if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
              iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
-            dte->i = dte_i;
+            dte->i = ats_enabled;
 
         amd_iommu_flush_device(iommu, req_id);
 
@@ -288,14 +284,11 @@  void amd_iommu_disable_domain_device(str
     dte = &table[req_id];
 
     spin_lock_irqsave(&iommu->lock, flags);
-    if ( dte->tv && dte->v )
+    if ( dte->tv || dte->v )
     {
-        dte->tv = 0;
-        dte->v = 0;
-
-        if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
-             iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
-            dte->i = 0;
+        dte->tv = false;
+        dte->v = false;
+        dte->i = false;
 
         amd_iommu_flush_device(iommu, req_id);
 
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
@@ -107,57 +107,60 @@ 
 #define IOMMU_DEV_TABLE_INT_CONTROL_FORWARDED	0x1
 #define IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED	0x2
 
+/* For now, we always allocate the maximum: 2048 entries. */
+#define IOMMU_INTREMAP_ORDER			0xB
+
 struct amd_iommu_dte {
     /* 0 - 63 */
-    uint64_t v:1;
-    uint64_t tv:1;
-    uint64_t reserved0:5;
-    uint64_t had:2;
-    uint64_t paging_mode:3;
+    bool v:1;
+    bool tv:1;
+    unsigned int :5;
+    unsigned int had:2;
+    unsigned int paging_mode:3;
     uint64_t pt_root:40;
-    uint64_t ppr:1;
-    uint64_t gprp:1;
-    uint64_t giov:1;
-    uint64_t gv:1;
-    uint64_t glx:2;
-    uint64_t gcr3_trp_14_12:3;
-    uint64_t ir:1;
-    uint64_t iw:1;
-    uint64_t reserved1:1;
+    bool ppr:1;
+    bool gprp:1;
+    bool giov:1;
+    bool gv:1;
+    unsigned int glx:2;
+    unsigned int gcr3_trp_14_12:3;
+    bool ir:1;
+    bool iw:1;
+    unsigned int :1;
 
     /* 64 - 127 */
-    uint64_t domain_id:16;
-    uint64_t gcr3_trp_30_15:16;
-    uint64_t i:1;
-    uint64_t se:1;
-    uint64_t sa:1;
-    uint64_t ioctl:2;
-    uint64_t cache:1;
-    uint64_t sd:1;
-    uint64_t ex:1;
-    uint64_t sys_mgt:2;
-    uint64_t reserved2:1;
-    uint64_t gcr3_trp_51_31:21;
+    unsigned int domain_id:16;
+    unsigned int gcr3_trp_30_15:16;
+    bool i:1;
+    bool se:1;
+    bool sa:1;
+    unsigned int ioctl:2;
+    bool cache:1;
+    bool sd:1;
+    bool ex:1;
+    unsigned int sys_mgt:2;
+    unsigned int :1;
+    unsigned int gcr3_trp_51_31:21;
 
     /* 128 - 191 */
-    uint64_t iv:1;
-    uint64_t int_tab_len:4;
-    uint64_t ig:1;
+    bool iv:1;
+    unsigned int int_tab_len:4;
+    bool ig:1;
     uint64_t it_root:46;
-    uint64_t reserved3:4;
-    uint64_t init_pass:1;
-    uint64_t ext_int_pass:1;
-    uint64_t nmi_pass:1;
-    uint64_t reserved4:1;
-    uint64_t int_ctl:2;
-    uint64_t lint0_pass:1;
-    uint64_t lint1_pass:1;
+    unsigned int :4;
+    bool init_pass:1;
+    bool ext_int_pass:1;
+    bool nmi_pass:1;
+    unsigned int :1;
+    unsigned int int_ctl:2;
+    bool lint0_pass:1;
+    bool lint1_pass:1;
 
     /* 192 - 255 */
-    uint64_t reserved5:54;
-    uint64_t attr_v:1;
-    uint64_t mode0_fc:1;
-    uint64_t snoop_attr:8;
+    uint64_t :54;
+    bool attr_v:1;
+    bool mode0_fc:1;
+    unsigned int snoop_attr:8;
 };
 
 /* Command Buffer */
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -73,14 +73,14 @@  int __must_check amd_iommu_flush_iotlb_a
 int get_dma_requestor_id(uint16_t seg, uint16_t bdf);
 void amd_iommu_set_intremap_table(struct amd_iommu_dte *dte,
                                   uint64_t intremap_ptr,
-                                  uint8_t int_valid);
+                                  bool valid);
 void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
 				   uint64_t root_ptr, uint16_t domain_id,
-				   uint8_t paging_mode, uint8_t valid);
+				   uint8_t paging_mode, bool valid);
 void iommu_dte_add_device_entry(struct amd_iommu_dte *dte,
-                                struct ivrs_mappings *ivrs_dev);
+                                const struct ivrs_mappings *ivrs_dev);
 void iommu_dte_set_guest_cr3(struct amd_iommu_dte *dte, uint16_t dom_id,
-                             uint64_t gcr3_mfn, uint8_t gv, uint8_t glx);
+                             uint64_t gcr3_mfn, bool gv, uint8_t glx);
 
 /* send cmd to iommu */
 void amd_iommu_flush_all_pages(struct domain *d);
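
Since the bool / unsigned int conversion relies on the compiler packing adjacent bit-fields the same way the former all-uint64_t layout did, a compile-time size check is a cheap safety net. A minimal sketch (not part of the patch; in Xen proper BUILD_BUG_ON() would be the natural vehicle):

    /* A DTE spans bits 0..255, i.e. exactly 32 bytes. */
    _Static_assert(sizeof(struct amd_iommu_dte) == 32,
                   "struct amd_iommu_dte must remain 256 bits");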