[3/3] x86/iommu: use a rangeset for hwdom setup

Message ID 20231117094749.81091-4-roger.pau@citrix.com (mailing list archive)
State Superseded
Series x86/iommu: improve setup time of hwdom IOMMU

Commit Message

Roger Pau Monne Nov. 17, 2023, 9:47 a.m. UTC
The current loop that iterates from 0 to the maximum RAM address in order to
set up the IOMMU mappings is highly inefficient, and it will get worse as the
amount of RAM increases.  It also doesn't account for any reserved regions
past the last RAM address.

Instead of iterating over memory addresses, iterate over the memory map regions
and use a rangeset in order to keep track of which ranges need to be identity
mapped in the hardware domain physical address space.
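
As a rough sketch of the resulting flow (simplified: the inclusive/reserved/
strict policy checks and the error handling of the actual patch are omitted):

    struct rangeset *map = rangeset_new(NULL, NULL, 0);

    /* Accumulate the regions to identity map from the e820 memory map. */
    for ( i = 0; i < e820.nr_map; i++ )
        rangeset_add_range(map, PFN_DOWN(e820.map[i].addr),
                           PFN_DOWN(e820.map[i].addr + e820.map[i].size - 1));

    /* Punch holes for ranges that must not be mapped (Xen text/data etc). */
    remove_xen_ranges(map);

    /* Walk the resulting ranges and establish the identity mappings. */
    rangeset_report_ranges(map, 0, ~0UL, identity_map, &map_data);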

On an AMD EPYC 7452 with 512GiB of RAM, the time to execute
arch_iommu_hwdom_init() in nanoseconds is:

x old
+ new
    N           Min           Max        Median           Avg        Stddev
x   5 2.2364154e+10  2.338244e+10 2.2474685e+10 2.2622409e+10 4.2949869e+08
+   5       1025012       1033036       1026188     1028276.2     3623.1194
Difference at 95.0% confidence
	-2.26214e+10 +/- 4.42931e+08
	-99.9955% +/- 9.05152e-05%
	(Student's t, pooled s = 3.03701e+08)

Execution time of arch_iommu_hwdom_init() goes down from ~22s to ~0.001s.

Note there's a change for HVM domains (i.e. PVH dom0), which are switched to
create the p2m mappings using map_mmio_regions() instead of
p2m_add_identity_entry(), so that whole ranges can be mapped with a single
function call where possible.  Note that the interface of map_mmio_regions()
doesn't allow creating read-only mappings, but so far no such mappings are
created for PVH dom0 in arch_iommu_hwdom_init().

No change intended in the resulting mappings that a hardware domain gets.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
 xen/arch/x86/hvm/io.c               |  15 +-
 xen/arch/x86/include/asm/hvm/io.h   |   4 +-
 xen/drivers/passthrough/x86/iommu.c | 355 +++++++++++++++++-----------
 3 files changed, 231 insertions(+), 143 deletions(-)

Comments

Roger Pau Monne Nov. 17, 2023, 12:04 p.m. UTC | #1
On Fri, Nov 17, 2023 at 10:47:49AM +0100, Roger Pau Monne wrote:
> The current loop that iterates from 0 to the maximum RAM address in order to
> set up the IOMMU mappings is highly inefficient, and it will get worse as the
> amount of RAM increases.  It also doesn't account for any reserved regions
> past the last RAM address.
> 
> Instead of iterating over memory addresses, iterate over the memory map regions
> and use a rangeset in order to keep track of which ranges need to be identity
> mapped in the hardware domain physical address space.
> 
> On an AMD EPYC 7452 with 512GiB of RAM, the time to execute
> arch_iommu_hwdom_init() in nanoseconds is:
> 
> x old
> + new
>     N           Min           Max        Median           Avg        Stddev
> x   5 2.2364154e+10  2.338244e+10 2.2474685e+10 2.2622409e+10 4.2949869e+08
> +   5       1025012       1033036       1026188     1028276.2     3623.1194
> Difference at 95.0% confidence
> 	-2.26214e+10 +/- 4.42931e+08
> 	-99.9955% +/- 9.05152e-05%
> 	(Student's t, pooled s = 3.03701e+08)
> 
> Execution time of arch_iommu_hwdom_init() goes down from ~22s to ~0.001s.
> 
> Note there's a change for HVM domains (i.e. PVH dom0), which are switched to
> create the p2m mappings using map_mmio_regions() instead of
> p2m_add_identity_entry(), so that whole ranges can be mapped with a single
> function call where possible.  Note that the interface of map_mmio_regions()
> doesn't allow creating read-only mappings, but so far no such mappings are
> created for PVH dom0 in arch_iommu_hwdom_init().
> 
> No change intended in the resulting mappings that a hardware domain gets.
> 
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> ---
>  xen/arch/x86/hvm/io.c               |  15 +-
>  xen/arch/x86/include/asm/hvm/io.h   |   4 +-
>  xen/drivers/passthrough/x86/iommu.c | 355 +++++++++++++++++-----------
>  3 files changed, 231 insertions(+), 143 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c
> index d75af83ad01f..7c4b7317b13a 100644
> --- a/xen/arch/x86/hvm/io.c
> +++ b/xen/arch/x86/hvm/io.c
> @@ -364,9 +364,20 @@ static const struct hvm_mmcfg *vpci_mmcfg_find(const struct domain *d,
>      return NULL;
>  }
>  
> -bool vpci_is_mmcfg_address(const struct domain *d, paddr_t addr)
> +int vpci_subtract_mmcfg(const struct domain *d, struct rangeset *r)
>  {
> -    return vpci_mmcfg_find(d, addr);
> +    const struct hvm_mmcfg *mmcfg;
> +
> +    list_for_each_entry ( mmcfg, &d->arch.hvm.mmcfg_regions, next )
> +    {
> +        int rc = rangeset_remove_range(r, PFN_DOWN(mmcfg->addr),
> +                                       PFN_DOWN(mmcfg->addr + mmcfg->size - 1));
> +
> +        if ( rc )
> +            return rc;
> +    }
> +
> +    return 0;
>  }
>  
>  static unsigned int vpci_mmcfg_decode_addr(const struct hvm_mmcfg *mmcfg,
> diff --git a/xen/arch/x86/include/asm/hvm/io.h b/xen/arch/x86/include/asm/hvm/io.h
> index e5225e75ef26..c9d058fd5695 100644
> --- a/xen/arch/x86/include/asm/hvm/io.h
> +++ b/xen/arch/x86/include/asm/hvm/io.h
> @@ -153,8 +153,8 @@ int register_vpci_mmcfg_handler(struct domain *d, paddr_t addr,
>  /* Destroy tracked MMCFG areas. */
>  void destroy_vpci_mmcfg(struct domain *d);
>  
> -/* Check if an address is between a MMCFG region for a domain. */
> -bool vpci_is_mmcfg_address(const struct domain *d, paddr_t addr);
> +/* Remove MMCFG regions from a given rangeset. */
> +int vpci_subtract_mmcfg(const struct domain *d, struct rangeset *r);
>  
>  #endif /* __ASM_X86_HVM_IO_H__ */
>  
> diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c
> index d70cee9fea77..be2c909f61d8 100644
> --- a/xen/drivers/passthrough/x86/iommu.c
> +++ b/xen/drivers/passthrough/x86/iommu.c
> @@ -301,129 +301,133 @@ void iommu_identity_map_teardown(struct domain *d)
>      }
>  }
>  
> -static int __hwdom_init xen_in_range(unsigned long mfn)
> +static int __hwdom_init remove_xen_ranges(struct rangeset *r)
>  {
>      paddr_t start, end;
> -    int i;
> -
> -    enum { region_s3, region_ro, region_rw, region_bss, nr_regions };
> -    static struct {
> -        paddr_t s, e;
> -    } xen_regions[nr_regions] __hwdom_initdata;
> +    int rc;
>  
> -    /* initialize first time */
> -    if ( !xen_regions[0].s )
> -    {
> -        /* S3 resume code (and other real mode trampoline code) */
> -        xen_regions[region_s3].s = bootsym_phys(trampoline_start);
> -        xen_regions[region_s3].e = bootsym_phys(trampoline_end);
> +    /* S3 resume code (and other real mode trampoline code) */
> +    rc = rangeset_remove_range(r, PFN_DOWN(bootsym_phys(trampoline_start)),
> +                               PFN_DOWN(bootsym_phys(trampoline_end)));
> +    if ( rc )
> +        return rc;
>  
> -        /*
> -         * This needs to remain in sync with the uses of the same symbols in
> -         * - __start_xen()
> -         * - is_xen_fixed_mfn()
> -         * - tboot_shutdown()
> -         */
> +    /*
> +     * This needs to remain in sync with the uses of the same symbols in
> +     * - __start_xen()
> +     * - is_xen_fixed_mfn()
> +     * - tboot_shutdown()
> +     */
> +    /* hypervisor .text + .rodata */
> +    rc = rangeset_remove_range(r, PFN_DOWN(__pa(&_stext)),
> +                               PFN_DOWN(__pa(&__2M_rodata_end)));
> +    if ( rc )
> +        return rc;
>  
> -        /* hypervisor .text + .rodata */
> -        xen_regions[region_ro].s = __pa(&_stext);
> -        xen_regions[region_ro].e = __pa(&__2M_rodata_end);
> -        /* hypervisor .data + .bss */
> -        xen_regions[region_rw].s = __pa(&__2M_rwdata_start);
> -        xen_regions[region_rw].e = __pa(&__2M_rwdata_end);
> -        if ( efi_boot_mem_unused(&start, &end) )
> -        {
> -            ASSERT(__pa(start) >= xen_regions[region_rw].s);
> -            ASSERT(__pa(end) <= xen_regions[region_rw].e);
> -            xen_regions[region_rw].e = __pa(start);
> -            xen_regions[region_bss].s = __pa(end);
> -            xen_regions[region_bss].e = __pa(&__2M_rwdata_end);
> -        }
> +    /* hypervisor .data + .bss */
> +    if ( efi_boot_mem_unused(&start, &end) )
> +    {
> +        ASSERT(__pa(start) >= __pa(&__2M_rwdata_start));
> +        rc = rangeset_remove_range(r, PFN_DOWN(__pa(&__2M_rwdata_start)),
> +                                   PFN_DOWN(__pa(start)));
> +        if ( rc )
> +            return rc;
> +        ASSERT(__pa(end) <= __pa(&__2M_rwdata_end));
> +        rc = rangeset_remove_range(r, PFN_DOWN(__pa(end)),
> +                                   PFN_DOWN(__pa(&__2M_rwdata_end)));
> +        if ( rc )
> +            return rc;
> +    }
> +    else
> +    {
> +        rc = rangeset_remove_range(r, PFN_DOWN(__pa(&__2M_rwdata_start)),
> +                                   PFN_DOWN(__pa(&__2M_rwdata_end)));
> +        if ( rc )
> +            return rc;
>      }
> -
> -    start = (paddr_t)mfn << PAGE_SHIFT;
> -    end = start + PAGE_SIZE;
> -    for ( i = 0; i < nr_regions; i++ )
> -        if ( (start < xen_regions[i].e) && (end > xen_regions[i].s) )
> -            return 1;
>  
>      return 0;
>  }
>  
> -static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d,
> -                                                 unsigned long pfn,
> -                                                 unsigned long max_pfn)
> +static int __hwdom_init map_subtract(unsigned long s, unsigned long e,

Bah, this (and others below) are missing the cf_check attribute.

Will fix in v2.
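
For reference, a minimal sketch of the intended fix, assuming Xen's usual
cf_check annotation for functions called through a function pointer (as
rangeset_report_ranges() does with these callbacks):

    static int __hwdom_init cf_check map_subtract(unsigned long s,
                                                  unsigned long e, void *data)
    {
        struct rangeset *map = data;

        return rangeset_remove_range(map, s, e);
    }

identity_map() would gain the same annotation.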

Roger.
Andrew Cooper Nov. 17, 2023, 2:34 p.m. UTC | #2
On 17/11/2023 9:47 am, Roger Pau Monne wrote:
> The current loop that iterates from 0 to the maximum RAM address in order to
> set up the IOMMU mappings is highly inefficient, and it will get worse as the
> amount of RAM increases.  It also doesn't account for any reserved regions
> past the last RAM address.
>
> Instead of iterating over memory addresses, iterate over the memory map regions
> and use a rangeset in order to keep track of which ranges need to be identity
> mapped in the hardware domain physical address space.
>
> On an AMD EPYC 7452 with 512GiB of RAM, the time to execute
> arch_iommu_hwdom_init() in nanoseconds is:
>
> x old
> + new
>     N           Min           Max        Median           Avg        Stddev
> x   5 2.2364154e+10  2.338244e+10 2.2474685e+10 2.2622409e+10 4.2949869e+08
> +   5       1025012       1033036       1026188     1028276.2     3623.1194
> Difference at 95.0% confidence
> 	-2.26214e+10 +/- 4.42931e+08
> 	-99.9955% +/- 9.05152e-05%
> 	(Student's t, pooled s = 3.03701e+08)
>
> Execution time of arch_iommu_hwdom_init() goes down from ~22s to ~0.001s.
>
> Note there's a change for HVM domains (i.e. PVH dom0), which are switched to
> create the p2m mappings using map_mmio_regions() instead of
> p2m_add_identity_entry(), so that whole ranges can be mapped with a single
> function call where possible.  Note that the interface of map_mmio_regions()
> doesn't allow creating read-only mappings, but so far no such mappings are
> created for PVH dom0 in arch_iommu_hwdom_init().
>
> No change intended in the resulting mappings that a hardware domain gets.
>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>

Very nice numbers.  And yes - straight-line performance like this (good
or bad) is all about the innermost loop.

Sadly, the patch diff is horrible to read.  Patch 2 remaining in common
code will improve this a little, but probably not very much.

If there are no better ideas, it's probably best to split into 3
patches, being:

1) Introduce new rangeset forms of existing operations
2) Rewrite arch_iommu_hwdom_init() to use rangesets
3) Delete old mfn forms

That at least means that the new and the old forms aren't expressed as a
delta against each other.

~Andrew
Jan Beulich Nov. 20, 2023, 10:30 a.m. UTC | #3
On 17.11.2023 10:47, Roger Pau Monne wrote:
> --- a/xen/arch/x86/hvm/io.c
> +++ b/xen/arch/x86/hvm/io.c
> @@ -364,9 +364,20 @@ static const struct hvm_mmcfg *vpci_mmcfg_find(const struct domain *d,
>      return NULL;
>  }
>  
> -bool vpci_is_mmcfg_address(const struct domain *d, paddr_t addr)
> +int vpci_subtract_mmcfg(const struct domain *d, struct rangeset *r)

While there, also add __hwdom_init?
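
That is, presumably:

    int __hwdom_init vpci_subtract_mmcfg(const struct domain *d,
                                         struct rangeset *r)

so the function can be discarded together with the rest of the code that's
only needed for hardware domain construction.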

> @@ -447,62 +451,135 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain *d)
>      if ( iommu_hwdom_passthrough )
>          return;
>  
> -    max_pfn = (GB(4) >> PAGE_SHIFT) - 1;
> -    top = max(max_pdx, pfn_to_pdx(max_pfn) + 1);
> +    map = rangeset_new(NULL, NULL, 0);
> +    if ( !map )
> +        panic("IOMMU initialization failed unable to allocate rangeset\n");

This reads a little odd, and could probably do with shortening anyway
(e.g. "IOMMU init: unable to allocate rangeset\n").

> +    if ( iommu_hwdom_inclusive )
> +    {
> +        /* Add the whole range below 4GB, UNUSABLE regions will be removed. */
> +        rc = rangeset_add_range(map, 0, max_pfn);
> +        if ( rc )
> +            panic("IOMMU inclusive mappings can't be added: %d\n",
> +                  rc);
> +    }
>  
> -    for ( i = 0, start = 0, count = 0; i < top; )
> +    for ( i = 0; i < e820.nr_map; i++ )
>      {
> -        unsigned long pfn = pdx_to_pfn(i);
> -        unsigned int perms = hwdom_iommu_map(d, pfn, max_pfn);
> +        struct e820entry entry = e820.map[i];
>  
> -        if ( !perms )
> -            /* nothing */;
> -        else if ( paging_mode_translate(d) )
> +        switch ( entry.type )
>          {
> -            int rc;
> +        case E820_UNUSABLE:
> +            if ( !iommu_hwdom_inclusive || PFN_DOWN(entry.addr) > max_pfn )
> +                continue;

It's > here, but ...

> -            rc = p2m_add_identity_entry(d, pfn,
> -                                        perms & IOMMUF_writable ? p2m_access_rw
> -                                                                : p2m_access_r,
> -                                        0);
> +            rc = rangeset_remove_range(map, PFN_DOWN(entry.addr),
> +                                       PFN_DOWN(entry.addr + entry.size - 1));
>              if ( rc )
> -                printk(XENLOG_WARNING
> -                       "%pd: identity mapping of %lx failed: %d\n",
> -                       d, pfn, rc);
> +                panic("IOMMU failed to remove unusable memory: %d\n",
> +                      rc);
> +            continue;
> +
> +        case E820_RESERVED:
> +            if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved )
> +                continue;
> +            break;
> +
> +        case E820_RAM:
> +            if ( iommu_hwdom_strict )
> +                continue;
> +            break;
> +
> +        default:
> +            if ( !iommu_hwdom_inclusive || PFN_DOWN(entry.addr) >= max_pfn )
> +                continue;

... >= here?

> +            entry.size = pfn_to_paddr(max_pfn) - 1 - entry.addr;

Why the subtraction of 1 when you're calculating size? Don't you actually
need to add 1 to max_pfn before converting to paddr?
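
As a quick worked check, taking max_pfn = PFN_DOWN(GB(4)) - 1 = 0xfffff as
set at the top of the function:

    pfn_to_paddr(max_pfn) - 1 - entry.addr   = 0xfffff000 - 1 - entry.addr
    pfn_to_paddr(max_pfn + 1) - entry.addr   = 0x100000000 - entry.addr

The former makes the clamped entry stop short of the 4GB boundary, while the
latter covers everything below it.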

While overall things look plausible elsewhere, I'm hoping that - as asked
for by Andrew - it'll be possible to split this some, to make it a little
more reasonable to actually look at the changes done.

Jan
Roger Pau Monne Nov. 20, 2023, 10:43 a.m. UTC | #4
On Mon, Nov 20, 2023 at 11:30:16AM +0100, Jan Beulich wrote:
> On 17.11.2023 10:47, Roger Pau Monne wrote:
> > --- a/xen/arch/x86/hvm/io.c
> > +++ b/xen/arch/x86/hvm/io.c
> > @@ -364,9 +364,20 @@ static const struct hvm_mmcfg *vpci_mmcfg_find(const struct domain *d,
> >      return NULL;
> >  }
> >  
> > -bool vpci_is_mmcfg_address(const struct domain *d, paddr_t addr)
> > +int vpci_subtract_mmcfg(const struct domain *d, struct rangeset *r)
> 
> While there, also add __hwdom_init?
> 
> > @@ -447,62 +451,135 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain *d)
> >      if ( iommu_hwdom_passthrough )
> >          return;
> >  
> > -    max_pfn = (GB(4) >> PAGE_SHIFT) - 1;
> > -    top = max(max_pdx, pfn_to_pdx(max_pfn) + 1);
> > +    map = rangeset_new(NULL, NULL, 0);
> > +    if ( !map )
> > +        panic("IOMMU initialization failed unable to allocate rangeset\n");
> 
> This reads a little odd, and could probably do with shortening anyway
> (e.g. "IOMMU init: unable to allocate rangeset\n").
> 
> > +    if ( iommu_hwdom_inclusive )
> > +    {
> > +        /* Add the whole range below 4GB, UNUSABLE regions will be removed. */
> > +        rc = rangeset_add_range(map, 0, max_pfn);
> > +        if ( rc )
> > +            panic("IOMMU inclusive mappings can't be added: %d\n",
> > +                  rc);
> > +    }
> >  
> > -    for ( i = 0, start = 0, count = 0; i < top; )
> > +    for ( i = 0; i < e820.nr_map; i++ )
> >      {
> > -        unsigned long pfn = pdx_to_pfn(i);
> > -        unsigned int perms = hwdom_iommu_map(d, pfn, max_pfn);
> > +        struct e820entry entry = e820.map[i];
> >  
> > -        if ( !perms )
> > -            /* nothing */;
> > -        else if ( paging_mode_translate(d) )
> > +        switch ( entry.type )
> >          {
> > -            int rc;
> > +        case E820_UNUSABLE:
> > +            if ( !iommu_hwdom_inclusive || PFN_DOWN(entry.addr) > max_pfn )
> > +                continue;
> 
> It's > here, but ...
> 
> > -            rc = p2m_add_identity_entry(d, pfn,
> > -                                        perms & IOMMUF_writable ? p2m_access_rw
> > -                                                                : p2m_access_r,
> > -                                        0);
> > +            rc = rangeset_remove_range(map, PFN_DOWN(entry.addr),
> > +                                       PFN_DOWN(entry.addr + entry.size - 1));
> >              if ( rc )
> > -                printk(XENLOG_WARNING
> > -                       "%pd: identity mapping of %lx failed: %d\n",
> > -                       d, pfn, rc);
> > +                panic("IOMMU failed to remove unusable memory: %d\n",
> > +                      rc);
> > +            continue;
> > +
> > +        case E820_RESERVED:
> > +            if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved )
> > +                continue;
> > +            break;
> > +
> > +        case E820_RAM:
> > +            if ( iommu_hwdom_strict )
> > +                continue;
> > +            break;
> > +
> > +        default:
> > +            if ( !iommu_hwdom_inclusive || PFN_DOWN(entry.addr) >= max_pfn )
> > +                continue;
> 
> ... >= here?

Oh, this was a leftover from a previous, slightly different approach;
there's no need for such a default case anymore, as the whole range
below 4GB is already added to the rangeset in the inclusive case, and
holes are poked for the unusable regions.
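
Presumably the default case then simply becomes:

    default:
        /*
         * Nothing to do: in inclusive mode everything below 4GB is already
         * in the rangeset (with holes poked for UNUSABLE regions), and such
         * entries aren't mapped otherwise.
         */
        continue;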

> > +            entry.size = pfn_to_paddr(max_pfn) - 1 - entry.addr;
> 
> Why the subtraction of 1 when you're calculating size? Don't you actually
> need to add 1 to max_pfn before converting to paddr?

The previous code didn't have the - 1 in the max_pfn definition, but
again, the default case is no longer needed.

> 
> While overall things look plausible elsewhere, I'm hoping that - as asked
> for by Andrew - it'll be possible to split this some, to make it a little
> more reasonable to actually look at the changes done.

Will try to split it somehow.

Thanks, Roger.

Patch

diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c
index d75af83ad01f..7c4b7317b13a 100644
--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -364,9 +364,20 @@  static const struct hvm_mmcfg *vpci_mmcfg_find(const struct domain *d,
     return NULL;
 }
 
-bool vpci_is_mmcfg_address(const struct domain *d, paddr_t addr)
+int vpci_subtract_mmcfg(const struct domain *d, struct rangeset *r)
 {
-    return vpci_mmcfg_find(d, addr);
+    const struct hvm_mmcfg *mmcfg;
+
+    list_for_each_entry ( mmcfg, &d->arch.hvm.mmcfg_regions, next )
+    {
+        int rc = rangeset_remove_range(r, PFN_DOWN(mmcfg->addr),
+                                       PFN_DOWN(mmcfg->addr + mmcfg->size - 1));
+
+        if ( rc )
+            return rc;
+    }
+
+    return 0;
 }
 
 static unsigned int vpci_mmcfg_decode_addr(const struct hvm_mmcfg *mmcfg,
diff --git a/xen/arch/x86/include/asm/hvm/io.h b/xen/arch/x86/include/asm/hvm/io.h
index e5225e75ef26..c9d058fd5695 100644
--- a/xen/arch/x86/include/asm/hvm/io.h
+++ b/xen/arch/x86/include/asm/hvm/io.h
@@ -153,8 +153,8 @@  int register_vpci_mmcfg_handler(struct domain *d, paddr_t addr,
 /* Destroy tracked MMCFG areas. */
 void destroy_vpci_mmcfg(struct domain *d);
 
-/* Check if an address is between a MMCFG region for a domain. */
-bool vpci_is_mmcfg_address(const struct domain *d, paddr_t addr);
+/* Remove MMCFG regions from a given rangeset. */
+int vpci_subtract_mmcfg(const struct domain *d, struct rangeset *r);
 
 #endif /* __ASM_X86_HVM_IO_H__ */
 
diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c
index d70cee9fea77..be2c909f61d8 100644
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -301,129 +301,133 @@  void iommu_identity_map_teardown(struct domain *d)
     }
 }
 
-static int __hwdom_init xen_in_range(unsigned long mfn)
+static int __hwdom_init remove_xen_ranges(struct rangeset *r)
 {
     paddr_t start, end;
-    int i;
-
-    enum { region_s3, region_ro, region_rw, region_bss, nr_regions };
-    static struct {
-        paddr_t s, e;
-    } xen_regions[nr_regions] __hwdom_initdata;
+    int rc;
 
-    /* initialize first time */
-    if ( !xen_regions[0].s )
-    {
-        /* S3 resume code (and other real mode trampoline code) */
-        xen_regions[region_s3].s = bootsym_phys(trampoline_start);
-        xen_regions[region_s3].e = bootsym_phys(trampoline_end);
+    /* S3 resume code (and other real mode trampoline code) */
+    rc = rangeset_remove_range(r, PFN_DOWN(bootsym_phys(trampoline_start)),
+                               PFN_DOWN(bootsym_phys(trampoline_end)));
+    if ( rc )
+        return rc;
 
-        /*
-         * This needs to remain in sync with the uses of the same symbols in
-         * - __start_xen()
-         * - is_xen_fixed_mfn()
-         * - tboot_shutdown()
-         */
+    /*
+     * This needs to remain in sync with the uses of the same symbols in
+     * - __start_xen()
+     * - is_xen_fixed_mfn()
+     * - tboot_shutdown()
+     */
+    /* hypervisor .text + .rodata */
+    rc = rangeset_remove_range(r, PFN_DOWN(__pa(&_stext)),
+                               PFN_DOWN(__pa(&__2M_rodata_end)));
+    if ( rc )
+        return rc;
 
-        /* hypervisor .text + .rodata */
-        xen_regions[region_ro].s = __pa(&_stext);
-        xen_regions[region_ro].e = __pa(&__2M_rodata_end);
-        /* hypervisor .data + .bss */
-        xen_regions[region_rw].s = __pa(&__2M_rwdata_start);
-        xen_regions[region_rw].e = __pa(&__2M_rwdata_end);
-        if ( efi_boot_mem_unused(&start, &end) )
-        {
-            ASSERT(__pa(start) >= xen_regions[region_rw].s);
-            ASSERT(__pa(end) <= xen_regions[region_rw].e);
-            xen_regions[region_rw].e = __pa(start);
-            xen_regions[region_bss].s = __pa(end);
-            xen_regions[region_bss].e = __pa(&__2M_rwdata_end);
-        }
+    /* hypervisor .data + .bss */
+    if ( efi_boot_mem_unused(&start, &end) )
+    {
+        ASSERT(__pa(start) >= __pa(&__2M_rwdata_start));
+        rc = rangeset_remove_range(r, PFN_DOWN(__pa(&__2M_rwdata_start)),
+                                   PFN_DOWN(__pa(start)));
+        if ( rc )
+            return rc;
+        ASSERT(__pa(end) <= __pa(&__2M_rwdata_end));
+        rc = rangeset_remove_range(r, PFN_DOWN(__pa(end)),
+                                   PFN_DOWN(__pa(&__2M_rwdata_end)));
+        if ( rc )
+            return rc;
+    }
+    else
+    {
+        rc = rangeset_remove_range(r, PFN_DOWN(__pa(&__2M_rwdata_start)),
+                                   PFN_DOWN(__pa(&__2M_rwdata_end)));
+        if ( rc )
+            return rc;
     }
-
-    start = (paddr_t)mfn << PAGE_SHIFT;
-    end = start + PAGE_SIZE;
-    for ( i = 0; i < nr_regions; i++ )
-        if ( (start < xen_regions[i].e) && (end > xen_regions[i].s) )
-            return 1;
 
     return 0;
 }
 
-static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d,
-                                                 unsigned long pfn,
-                                                 unsigned long max_pfn)
+static int __hwdom_init map_subtract(unsigned long s, unsigned long e,
+                                     void *data)
 {
-    mfn_t mfn = _mfn(pfn);
-    unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable;
+    struct rangeset *map = data;
 
-    /*
-     * Set up 1:1 mapping for dom0. Default to include only conventional RAM
-     * areas and let RMRRs include needed reserved regions. When set, the
-     * inclusive mapping additionally maps in every pfn up to 4GB except those
-     * that fall in unusable ranges for PV Dom0.
-     */
-    if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) )
-        return 0;
+    return rangeset_remove_range(map, s, e);
+}
 
-    switch ( type = page_get_ram_type(mfn) )
-    {
-    case RAM_TYPE_UNUSABLE:
-        return 0;
+struct map_data {
+    struct domain *d;
+    unsigned int flush_flags;
+    bool ro;
+};
 
-    case RAM_TYPE_CONVENTIONAL:
-        if ( iommu_hwdom_strict )
-            return 0;
-        break;
+static int __hwdom_init identity_map(unsigned long s, unsigned long e,
+                                     void *data)
+{
+    struct map_data *info = data;
+    struct domain *d = info->d;
+    long rc;
 
-    default:
-        if ( type & RAM_TYPE_RESERVED )
-        {
-            if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved )
-                perms = 0;
-        }
-        else if ( is_hvm_domain(d) )
-            return 0;
-        else if ( !iommu_hwdom_inclusive || pfn > max_pfn )
-            perms = 0;
-    }
+    if ( iommu_verbose )
+        printk(XENLOG_INFO " [%010lx, %010lx] R%c\n",
+               s, e, info->ro ? 'O' : 'W');
 
-    /* Check that it doesn't overlap with the Interrupt Address Range. */
-    if ( pfn >= 0xfee00 && pfn <= 0xfeeff )
-        return 0;
-    /* ... or the IO-APIC */
-    if ( has_vioapic(d) )
+    if ( paging_mode_translate(d) )
     {
-        for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ )
-            if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) )
-                return 0;
+        if ( info->ro )
+        {
+            ASSERT_UNREACHABLE();
+            return 0;
+        }
+        while ( (rc = map_mmio_regions(d, _gfn(s), e - s + 1, _mfn(s))) > 0 )
+        {
+            s += rc;
+            process_pending_softirqs();
+        }
     }
-    else if ( is_pv_domain(d) )
+    else
     {
-        /*
-         * Be consistent with CPU mappings: Dom0 is permitted to establish r/o
-         * ones there (also for e.g. HPET in certain cases), so it should also
-         * have such established for IOMMUs.
-         */
-        if ( iomem_access_permitted(d, pfn, pfn) &&
-             rangeset_contains_singleton(mmio_ro_ranges, pfn) )
-            perms = IOMMUF_readable;
+        const unsigned int perms = IOMMUF_readable | IOMMUF_preempt |
+                                   (info->ro ? 0 : IOMMUF_writable);
+
+        if ( info->ro && !iomem_access_permitted(d, s, e) )
+        {
+            /*
+             * Should be more fine grained in order to not map the forbidden
+             * frame instead of rejecting the region as a whole, but it's only
+             * for read-only MMIO regions, which are very limited.
+             */
+            printk(XENLOG_DEBUG
+                   "IOMMU read-only mapping of region [%lx, %lx] forbidden\n",
+                   s, e);
+            return 0;
+        }
+        while ( (rc = iommu_map(d, _dfn(s), _mfn(s), e - s + 1,
+                                perms, &info->flush_flags)) > 0 )
+        {
+            s += rc;
+            process_pending_softirqs();
+        }
     }
-    /*
-     * ... or the PCIe MCFG regions.
-     * TODO: runtime added MMCFG regions are not checked to make sure they
-     * don't overlap with already mapped regions, thus preventing trapping.
-     */
-    if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) )
-        return 0;
+    ASSERT(rc <= 0);
+    if ( rc )
+        printk(XENLOG_WARNING
+               "IOMMU identity mapping of [%lx, %lx] failed: %ld\n",
+               s, e, rc);
 
-    return perms;
+    /* Ignore errors and attempt to map the remaining regions. */
+    return 0;
 }
 
 void __hwdom_init arch_iommu_hwdom_init(struct domain *d)
 {
-    unsigned long i, top, max_pfn, start, count;
-    unsigned int flush_flags = 0, start_perms = 0;
+    const unsigned long max_pfn = PFN_DOWN(GB(4)) - 1;
+    unsigned int i;
+    struct rangeset *map;
+    struct map_data map_data = { .d = d };
+    int rc;
 
     BUG_ON(!is_hardware_domain(d));
 
@@ -447,62 +451,135 @@  void __hwdom_init arch_iommu_hwdom_init(struct domain *d)
     if ( iommu_hwdom_passthrough )
         return;
 
-    max_pfn = (GB(4) >> PAGE_SHIFT) - 1;
-    top = max(max_pdx, pfn_to_pdx(max_pfn) + 1);
+    map = rangeset_new(NULL, NULL, 0);
+    if ( !map )
+        panic("IOMMU initialization failed unable to allocate rangeset\n");
+
+    if ( iommu_hwdom_inclusive )
+    {
+        /* Add the whole range below 4GB, UNUSABLE regions will be removed. */
+        rc = rangeset_add_range(map, 0, max_pfn);
+        if ( rc )
+            panic("IOMMU inclusive mappings can't be added: %d\n",
+                  rc);
+    }
 
-    for ( i = 0, start = 0, count = 0; i < top; )
+    for ( i = 0; i < e820.nr_map; i++ )
     {
-        unsigned long pfn = pdx_to_pfn(i);
-        unsigned int perms = hwdom_iommu_map(d, pfn, max_pfn);
+        struct e820entry entry = e820.map[i];
 
-        if ( !perms )
-            /* nothing */;
-        else if ( paging_mode_translate(d) )
+        switch ( entry.type )
         {
-            int rc;
+        case E820_UNUSABLE:
+            if ( !iommu_hwdom_inclusive || PFN_DOWN(entry.addr) > max_pfn )
+                continue;
 
-            rc = p2m_add_identity_entry(d, pfn,
-                                        perms & IOMMUF_writable ? p2m_access_rw
-                                                                : p2m_access_r,
-                                        0);
+            rc = rangeset_remove_range(map, PFN_DOWN(entry.addr),
+                                       PFN_DOWN(entry.addr + entry.size - 1));
             if ( rc )
-                printk(XENLOG_WARNING
-                       "%pd: identity mapping of %lx failed: %d\n",
-                       d, pfn, rc);
+                panic("IOMMU failed to remove unusable memory: %d\n",
+                      rc);
+            continue;
+
+        case E820_RESERVED:
+            if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved )
+                continue;
+            break;
+
+        case E820_RAM:
+            if ( iommu_hwdom_strict )
+                continue;
+            break;
+
+        default:
+            if ( !iommu_hwdom_inclusive || PFN_DOWN(entry.addr) >= max_pfn )
+                continue;
+            entry.size = pfn_to_paddr(max_pfn) - 1 - entry.addr;
+            break;
         }
-        else if ( pfn != start + count || perms != start_perms )
-        {
-            long rc;
 
-        commit:
-            while ( (rc = iommu_map(d, _dfn(start), _mfn(start), count,
-                                    start_perms | IOMMUF_preempt,
-                                    &flush_flags)) > 0 )
-            {
-                start += rc;
-                count -= rc;
-                process_pending_softirqs();
-            }
+        if ( iommu_hwdom_inclusive &&
+             PFN_DOWN(entry.addr + entry.size - 1) <= max_pfn )
+            /*
+             * Any range below 4GB is already in the rangeset if using inclusive
+             * mode.
+             */
+            continue;
+
+        rc = rangeset_add_range(map, PFN_DOWN(entry.addr),
+                                PFN_DOWN(entry.addr + entry.size - 1));
+        if ( rc )
+            panic("IOMMU failed to add identity range: %d\n", rc);
+    }
+
+    /* Remove any areas in-use by Xen. */
+    rc = remove_xen_ranges(map);
+    if ( rc )
+        panic("IOMMU failed to remove Xen ranges: %d\n", rc);
+
+    /* Remove any overlap with the Interrupt Address Range. */
+    rc = rangeset_remove_range(map, 0xfee00, 0xfeeff);
+    if ( rc )
+        panic("IOMMU failed to remove Interrupt Address Range: %d\n",
+              rc);
+
+    /* If emulating IO-APIC(s) make sure the base address is unmapped. */
+    if ( has_vioapic(d) )
+    {
+        for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ )
+        {
+            rc = rangeset_remove_singleton(map,
+                PFN_DOWN(domain_vioapic(d, i)->base_address));
             if ( rc )
-                printk(XENLOG_WARNING
-                       "%pd: IOMMU identity mapping of [%lx,%lx) failed: %ld\n",
-                       d, start, start + count, rc);
-            start = pfn;
-            count = 1;
-            start_perms = perms;
+                panic("IOMMU failed to remove IO-APIC: %d\n",
+                      rc);
         }
-        else
-            ++count;
+    }
 
-        if ( !(++i & 0xfffff) )
-            process_pending_softirqs();
+    if ( is_pv_domain(d) )
+    {
+        /*
+         * Be consistent with CPU mappings: Dom0 is permitted to establish r/o
+         * ones there (also for e.g. HPET in certain cases), so it should also
+         * have such established for IOMMUs.
+         */
+        rc = rangeset_report_ranges(mmio_ro_ranges, 0, ~0UL, map_subtract, map);
+        if ( rc )
+            panic("IOMMU failed to remove read-only regions: %d\n",
+                  rc);
+    }
+
+    if ( has_vpci(d) )
+    {
+        /*
+         * TODO: runtime added MMCFG regions are not checked to make sure they
+         * don't overlap with already mapped regions, thus preventing trapping.
+         */
+        rc = vpci_subtract_mmcfg(d, map);
+        if ( rc )
+            panic("IOMMU unable to remove MMCFG areas: %d\n", rc);
+    }
 
-        if ( i == top && count )
-            goto commit;
+    if ( iommu_verbose )
+        printk(XENLOG_INFO "d%u: identity mappings for IOMMU:\n",
+               d->domain_id);
+
+    rc = rangeset_report_ranges(map, 0, ~0UL, identity_map, &map_data);
+    if ( rc )
+        panic("IOMMU unable to create mappings: %d\n", rc);
+    if ( is_pv_domain(d) )
+    {
+        map_data.ro = true;
+        rc = rangeset_report_ranges(mmio_ro_ranges, 0, ~0UL, identity_map,
+                                    &map_data);
+        if ( rc )
+            panic("IOMMU unable to create read-only mappings: %d\n", rc);
     }
 
+    rangeset_destroy(map);
+
     /* Use if to avoid compiler warning */
-    if ( iommu_iotlb_flush_all(d, flush_flags) )
+    if ( iommu_iotlb_flush_all(d, map_data.flush_flags) )
         return;
 }