
[v8,07/13] vpci/header: handle p2m range sets per BAR

Message ID 20230720003205.1828537-8-volodymyr_babchuk@epam.com (mailing list archive)
State Superseded
Series PCI devices passthrough on Arm, part 3

Commit Message

Volodymyr Babchuk July 20, 2023, 12:32 a.m. UTC
From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>

Instead of handling a single range set that contains all the memory
regions of all the BARs and ROM, have one range set per BAR.
As the range sets are now created when a PCI device is added and destroyed
when it is removed, make them named and accounted.

Note that rangesets were chosen here despite there being only up to
3 separate ranges in each set (typically just 1): a rangeset per BAR
keeps the implementation simple and allows existing code to be reused.

This is in preparation for making non-identity p2m mappings for the MMIOs.

Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>

---
Since v6:
- update according to the new locking scheme
- remove odd fail label in modify_bars
Since v5:
- fix comments
- move rangeset allocation to init_bars and only allocate
  for MAPPABLE BARs
- check for overlap with the already setup BAR ranges
Since v4:
- use named range sets for BARs (Jan)
- changes required by the new locking scheme
- updated commit message (Jan)
Since v3:
- re-work vpci_cancel_pending according to the per-BAR handling
- s/num_mem_ranges/map_pending and s/uint8_t/bool
- ASSERT(bar->mem) in modify_bars
- create and destroy the rangesets on add/remove
---
 xen/drivers/vpci/header.c | 235 ++++++++++++++++++++++++++++----------
 xen/drivers/vpci/vpci.c   |   6 +
 xen/include/xen/vpci.h    |   3 +-
 3 files changed, 181 insertions(+), 63 deletions(-)

Comments

Roger Pau Monné July 21, 2023, 11:49 a.m. UTC | #1
On Thu, Jul 20, 2023 at 12:32:32AM +0000, Volodymyr Babchuk wrote:
> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> 
> Instead of handling a single range set, that contains all the memory
> regions of all the BARs and ROM, have them per BAR.
> As the range sets are now created when a PCI device is added and destroyed
> when it is removed so make them named and accounted.
> 
> Note that rangesets were chosen here despite there being only up to
> 3 separate ranges in each set (typically just 1). But rangeset per BAR
> was chosen for the ease of implementation and existing code re-usability.
> 
> This is in preparation of making non-identity mappings in p2m for the MMIOs.
> 
> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> 
> ---
> Since v6:
> - update according to the new locking scheme
> - remove odd fail label in modify_bars
> Since v5:
> - fix comments
> - move rangeset allocation to init_bars and only allocate
>   for MAPPABLE BARs
> - check for overlap with the already setup BAR ranges
> Since v4:
> - use named range sets for BARs (Jan)
> - changes required by the new locking scheme
> - updated commit message (Jan)
> Since v3:
> - re-work vpci_cancel_pending accordingly to the per-BAR handling
> - s/num_mem_ranges/map_pending and s/uint8_t/bool
> - ASSERT(bar->mem) in modify_bars
> - create and destroy the rangesets on add/remove
> ---
>  xen/drivers/vpci/header.c | 235 ++++++++++++++++++++++++++++----------
>  xen/drivers/vpci/vpci.c   |   6 +
>  xen/include/xen/vpci.h    |   3 +-
>  3 files changed, 181 insertions(+), 63 deletions(-)
> 
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index 5dc9b5338b..eb07fa0bb2 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -141,63 +141,106 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>  
>  bool vpci_process_pending(struct vcpu *v)
>  {
> -    if ( v->vpci.mem )
> +    struct pci_dev *pdev = v->vpci.pdev;
> +    if ( !pdev )
> +        return false;

I think this check is kind of inverted: you should check for
vpci.map_pending first, and then check that the rest of the fields are
also set (or complain otherwise, as something has clearly gone wrong)?
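For illustration, a minimal sketch of that ordering (hypothetical shape,
based only on the fields used in this hunk):

    bool vpci_process_pending(struct vcpu *v)
    {
        struct pci_dev *pdev = v->vpci.pdev;

        if ( !v->vpci.map_pending )
            return false;

        /* A pending map without an associated device indicates a logic bug. */
        if ( !pdev )
        {
            ASSERT_UNREACHABLE();
            v->vpci.map_pending = false;
            return false;
        }

        /* ... proceed with the per-BAR mapping loop ... */
    }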

> +
> +    if ( v->vpci.map_pending )
>      {
>          struct map_data data = {
>              .d = v->domain,
>              .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>          };
> -        int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
> -
> -        if ( rc == -ERESTART )
> -            return true;
> +        struct vpci_header *header = &pdev->vpci->header;

You need to hold the per-domain rwlock in order to access
pdev->vpci.

> +        unsigned int i;
>  
>          write_lock(&v->domain->pci_lock);

Holding the lock in write mode for the duration of the mapping is
quite aggressive, as the mapping operation could be a long-running
one.

Is this only locked in exclusive mode in order to have the right
locking for the vpci_remove_device() call below?

If so, we might consider using a different error-handling approach in
order to avoid taking the lock in exclusive mode.

> -        spin_lock(&v->vpci.pdev->vpci->lock);
> -        /* Disable memory decoding unconditionally on failure. */
> -        modify_decoding(v->vpci.pdev,
> -                        rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
> -                        !rc && v->vpci.rom_only);
> -        spin_unlock(&v->vpci.pdev->vpci->lock);
> -
> -        rangeset_destroy(v->vpci.mem);
> -        v->vpci.mem = NULL;
> -        if ( rc )
> -            /*
> -             * FIXME: in case of failure remove the device from the domain.
> -             * Note that there might still be leftover mappings. While this is
> -             * safe for Dom0, for DomUs the domain will likely need to be
> -             * killed in order to avoid leaking stale p2m mappings on
> -             * failure.
> -             */
> -            vpci_remove_device(v->vpci.pdev);
> +
> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +        {
> +            struct vpci_bar *bar = &header->bars[i];
> +            int rc;
> +
> +            if ( rangeset_is_empty(bar->mem) )
> +                continue;
> +
> +            rc = rangeset_consume_ranges(bar->mem, map_range, &data);
> +
> +            if ( rc == -ERESTART )
> +            {
> +                write_unlock(&v->domain->pci_lock);
> +                return true;
> +            }
> +
> +            spin_lock(&pdev->vpci->lock);
> +            /* Disable memory decoding unconditionally on failure. */
> +            modify_decoding(pdev, rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY :
> +                                       v->vpci.cmd, !rc && v->vpci.rom_only);

This needs to also be moved out of the loop, or else you would be
toggling the memory decoding bit every time a BAR is mapped or
unmapped.  It must be done once, after all BARs are {un,}mapped (so
outside of the for loop).

You will likely need to keep a call here that disables memory decoding
only if the mapping has failed.
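A rough sketch of that restructuring (illustrative only; names as in the
patch, with rc hoisted out of the loop so decoding is adjusted once):

    int rc = 0;

    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
    {
        struct vpci_bar *bar = &header->bars[i];

        if ( rangeset_is_empty(bar->mem) )
            continue;

        rc = rangeset_consume_ranges(bar->mem, map_range, &data);
        if ( rc == -ERESTART )
        {
            write_unlock(&v->domain->pci_lock);
            return true;
        }
        if ( rc )
            break;                  /* fall through to the failure path */
    }

    spin_lock(&pdev->vpci->lock);
    /* Adjust memory decoding once, after all BARs have been processed. */
    modify_decoding(pdev, rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
                    !rc && v->vpci.rom_only);
    spin_unlock(&pdev->vpci->lock);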

> +            spin_unlock(&pdev->vpci->lock);
> +
> +            if ( rc )
> +            {
> +                /*
> +                 * FIXME: in case of failure remove the device from the domain.
> +                 * Note that there might still be leftover mappings. While this
> +                 * is safe for Dom0, for DomUs the domain needs to be killed in
> +                 * order to avoid leaking stale p2m mappings on failure.
> +                 */

You are already handling the domU case, so the comment needs to be
adjusted, as it's no longer a FIXME.  We might consider just removing
the comment altogether.

> +                v->vpci.map_pending = false;
> +
> +                if ( is_hardware_domain(v->domain) )
> +                {
> +                    vpci_remove_device(pdev);
> +                    write_unlock(&v->domain->pci_lock);
> +                }
> +                else
> +                {
> +                    write_unlock(&v->domain->pci_lock);
> +                    domain_crash(v->domain);
> +                }
> +                return false;
> +            }
> +        }
>          write_unlock(&v->domain->pci_lock);
> +
> +        v->vpci.map_pending = false;
>      }
>  
> +
>      return false;
>  }
>  
>  static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> -                            struct rangeset *mem, uint16_t cmd)
> +                            uint16_t cmd)
>  {
>      struct map_data data = { .d = d, .map = true };
> -    int rc;
> +    struct vpci_header *header = &pdev->vpci->header;
> +    int rc = 0;
> +    unsigned int i;
>  
>      ASSERT(rw_is_locked(&d->pci_lock));
>  
> -    while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>      {
> -        /*
> -         * It's safe to drop and reacquire the lock in this context
> -         * without risking pdev disappearing because devices cannot be
> -         * removed until the initial domain has been started.
> -         */
> -        read_unlock(&d->pci_lock);
> -        process_pending_softirqs();
> -        read_lock(&d->pci_lock);
> -    }
> +        struct vpci_bar *bar = &header->bars[i];
>  
> -    rangeset_destroy(mem);
> +        if ( rangeset_is_empty(bar->mem) )
> +            continue;
> +
> +        while ( (rc = rangeset_consume_ranges(bar->mem, map_range,
> +                                              &data)) == -ERESTART )
> +        {
> +            /*
> +             * It's safe to drop and reacquire the lock in this context
> +             * without risking pdev disappearing because devices cannot be
> +             * removed until the initial domain has been started.
> +             */
> +            write_unlock(&d->pci_lock);
> +            process_pending_softirqs();
> +            write_lock(&d->pci_lock);
> +        }
> +    }
>      if ( !rc )
>          modify_decoding(pdev, cmd, false);
>  
> @@ -205,10 +248,12 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>  }
>  
>  static void defer_map(struct domain *d, struct pci_dev *pdev,
> -                      struct rangeset *mem, uint16_t cmd, bool rom_only)
> +                      uint16_t cmd, bool rom_only)
>  {
>      struct vcpu *curr = current;
>  
> +    ASSERT(!!rw_is_write_locked(&pdev->domain->pci_lock));

No need for the double !!.
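I.e. (sketch):

    ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));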

> +
>      /*
>       * FIXME: when deferring the {un}map the state of the device should not
>       * be trusted. For example the enable bit is toggled after the device
> @@ -216,7 +261,7 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
>       * started for the same device if the domain is not well-behaved.
>       */
>      curr->vpci.pdev = pdev;
> -    curr->vpci.mem = mem;
> +    curr->vpci.map_pending = true;
>      curr->vpci.cmd = cmd;
>      curr->vpci.rom_only = rom_only;
>      /*
> @@ -230,33 +275,34 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
>  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>  {
>      struct vpci_header *header = &pdev->vpci->header;
> -    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
>      struct pci_dev *tmp, *dev = NULL;
>      const struct domain *d;
>      const struct vpci_msix *msix = pdev->vpci->msix;
> -    unsigned int i;
> +    unsigned int i, j;
>      int rc;
> +    bool map_pending;
>  
>      ASSERT(rw_is_locked(&pdev->domain->pci_lock));
>  
> -    if ( !mem )
> -        return -ENOMEM;
> -
>      /*
> -     * Create a rangeset that represents the current device BARs memory region
> -     * and compare it against all the currently active BAR memory regions. If
> -     * an overlap is found, subtract it from the region to be mapped/unmapped.
> +     * Create a rangeset per BAR that represents the current device memory
> +     * region and compare it against all the currently active BAR memory
> +     * regions. If an overlap is found, subtract it from the region to be
> +     * mapped/unmapped.
>       *
> -     * First fill the rangeset with all the BARs of this device or with the ROM
> +     * First fill the rangesets with the BARs of this device or with the ROM

I think you need to drop the 's' from BARs also.

>       * BAR only, depending on whether the guest is toggling the memory decode
>       * bit of the command register, or the enable bit of the ROM BAR register.
>       */
>      for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>      {
> -        const struct vpci_bar *bar = &header->bars[i];
> +        struct vpci_bar *bar = &header->bars[i];
>          unsigned long start = PFN_DOWN(bar->addr);
>          unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>  
> +        if ( !bar->mem )
> +            continue;
> +
>          if ( !MAPPABLE_BAR(bar) ||
>               (rom_only ? bar->type != VPCI_BAR_ROM
>                         : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) ||
> @@ -272,14 +318,31 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>              continue;
>          }
>  
> -        rc = rangeset_add_range(mem, start, end);
> +        rc = rangeset_add_range(bar->mem, start, end);
>          if ( rc )
>          {
>              printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
>                     start, end, rc);
> -            rangeset_destroy(mem);
>              return rc;
>          }
> +
> +        /* Check for overlap with the already setup BAR ranges. */
> +        for ( j = 0; j < i; j++ )
> +        {
> +            struct vpci_bar *bar = &header->bars[j];

This is kind of confusing, as you are defining an inner 'bar' variable
that shadows the outer one.  It might be better to name it prev_bar
or some such, to avoid the shadowing.
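A sketch of the suggested rename (illustrative; same logic as the hunk
above):

    /* Check for overlap with the already setup BAR ranges. */
    for ( j = 0; j < i; j++ )
    {
        struct vpci_bar *prev_bar = &header->bars[j];

        if ( rangeset_is_empty(prev_bar->mem) )
            continue;

        rc = rangeset_remove_range(prev_bar->mem, start, end);
        if ( rc )
            return rc;
    }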

> +
> +            if ( rangeset_is_empty(bar->mem) )
> +                continue;
> +
> +            rc = rangeset_remove_range(bar->mem, start, end);
> +            if ( rc )
> +            {
> +                printk(XENLOG_G_WARNING
> +                       "Failed to remove overlapping range [%lx, %lx]: %d\n",
> +                       start, end, rc);

Might as well print the SBDF of the device while at it (same below).

You could also consider using gprintk instead of plain printk, and
avoid the _G_ tag in the log level.
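For example (sketch; the %pp format mirrors its use elsewhere in the
patch):

    gprintk(XENLOG_WARNING,
            "%pp: failed to remove overlapping range [%lx, %lx]: %d\n",
            &pdev->sbdf, start, end, rc);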

> +                return rc;
> +            }
> +        }
>      }
>  
>      /* Remove any MSIX regions if present. */
> @@ -289,14 +352,21 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>          unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
>                                       vmsix_table_size(pdev->vpci, i) - 1);
>  
> -        rc = rangeset_remove_range(mem, start, end);
> -        if ( rc )
> +        for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
>          {
> -            printk(XENLOG_G_WARNING
> -                   "Failed to remove MSIX table [%lx, %lx]: %d\n",
> -                   start, end, rc);
> -            rangeset_destroy(mem);
> -            return rc;
> +            const struct vpci_bar *bar = &header->bars[j];
> +
> +            if ( rangeset_is_empty(bar->mem) )
> +                continue;
> +
> +            rc = rangeset_remove_range(bar->mem, start, end);
> +            if ( rc )
> +            {
> +                printk(XENLOG_G_WARNING
> +                       "Failed to remove MSIX table [%lx, %lx]: %d\n",
> +                       start, end, rc);
> +                return rc;
> +            }
>          }
>      }
>  
> @@ -341,7 +411,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                  unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>  
>                  if ( !bar->enabled ||
> -                     !rangeset_overlaps_range(mem, start, end) ||
> +                     !rangeset_overlaps_range(bar->mem, start, end) ||
>                       /*
>                        * If only the ROM enable bit is toggled check against
>                        * other BARs in the same device for overlaps, but not
> @@ -350,12 +420,11 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                       (rom_only && tmp == pdev && bar->type == VPCI_BAR_ROM) )
>                      continue;
>  
> -                rc = rangeset_remove_range(mem, start, end);
> +                rc = rangeset_remove_range(bar->mem, start, end);

Urg, isn't 'bar' here pointing to the remote device BAR, not the BARs
that we want to map?

You need an inner loop that iterates over header->bars, much like you
do to handle the MSI-X table overlaps.
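Something along these lines (sketch; 'this_bar' is the local device's BAR
whose rangeset gets trimmed, and 'k' is a hypothetical extra index to be
declared alongside the others):

    for ( k = 0; k < ARRAY_SIZE(header->bars); k++ )
    {
        const struct vpci_bar *this_bar = &header->bars[k];

        if ( rangeset_is_empty(this_bar->mem) )
            continue;

        /* Subtract the remote device's enabled BAR range [start, end]. */
        rc = rangeset_remove_range(this_bar->mem, start, end);
        if ( rc )
        {
            printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
                   start, end, rc);
            return rc;
        }
    }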

>                  if ( rc )
>                  {
>                      printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
>                             start, end, rc);
> -                    rangeset_destroy(mem);
>                      return rc;
>                  }
>              }
> @@ -380,10 +449,23 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>           * will always be to establish mappings and process all the BARs.
>           */
>          ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
> -        return apply_map(pdev->domain, pdev, mem, cmd);
> +        return apply_map(pdev->domain, pdev, cmd);
>      }
>  
> -    defer_map(dev->domain, dev, mem, cmd, rom_only);
> +    /* Find out how many memory ranges has left after MSI and overlaps. */
                                          ^ are (I think).
> +    map_pending = false;
> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +        if ( !rangeset_is_empty(header->bars[i].mem) )
> +        {
> +            map_pending = true;
> +            break;
> +        }
> +
> +    /* If there's no mapping work write the command register now. */
> +    if ( !map_pending )
> +        pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
> +    else
> +        defer_map(dev->domain, dev, cmd, rom_only);

This is not strictly required, and differs from the current approach,
where defer_map() gets called regardless of whether the rangesets are
all empty.

It could be moved to a separate commit.

>  
>      return 0;
>  }
> @@ -574,6 +656,19 @@ static void cf_check rom_write(
>          rom->addr = val & PCI_ROM_ADDRESS_MASK;
>  }
>  
> +static int bar_add_rangeset(struct pci_dev *pdev, struct vpci_bar *bar, int i)

pci_dev should be const, and i unsigned.
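I.e. (sketch of the adjusted prototype; %u to match the now-unsigned
index):

    static int bar_add_rangeset(const struct pci_dev *pdev,
                                struct vpci_bar *bar, unsigned int i)
    {
        char str[32];

        snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);

        bar->mem = rangeset_new(pdev->domain, str, RANGESETF_no_print);
        if ( !bar->mem )
            return -ENOMEM;

        return 0;
    }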

> +{
> +    char str[32];
> +
> +    snprintf(str, sizeof(str), "%pp:BAR%d", &pdev->sbdf, i);
> +
> +    bar->mem = rangeset_new(pdev->domain, str, RANGESETF_no_print);
> +    if ( !bar->mem )
> +        return -ENOMEM;
> +
> +    return 0;
> +}
> +
>  static int cf_check init_bars(struct pci_dev *pdev)
>  {
>      uint16_t cmd;
> @@ -657,6 +752,13 @@ static int cf_check init_bars(struct pci_dev *pdev)
>          else
>              bars[i].type = VPCI_BAR_MEM32;
>  
> +        rc = bar_add_rangeset(pdev, &bars[i], i);
> +        if ( rc )
> +        {
> +            bars[i].type = VPCI_BAR_EMPTY;
> +            return rc;
> +        }
> +
>          rc = pci_size_mem_bar(pdev->sbdf, reg, &addr, &size,
>                                (i == num_bars - 1) ? PCI_BAR_LAST : 0);
>          if ( rc < 0 )
> @@ -707,6 +809,15 @@ static int cf_check init_bars(struct pci_dev *pdev)
>                                     rom_reg, 4, rom);
>              if ( rc )
>                  rom->type = VPCI_BAR_EMPTY;
> +            else
> +            {
> +                rc = bar_add_rangeset(pdev, rom, i);
> +                if ( rc )
> +                {
> +                    rom->type = VPCI_BAR_EMPTY;
> +                    return rc;
> +                }

For both of the above: I don't think you need to set the BAR to EMPTY
if you are already returning an error, as the whole vPCI handling will
fail initialization.  Setting to empty only makes sense if we can try
to continue with normal operations.
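I.e., on those error paths a plain return would do (sketch):

    rc = bar_add_rangeset(pdev, &bars[i], i);
    if ( rc )
        return rc; /* init fails as a whole; no need to mark the BAR EMPTY */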

> +            }
>          }
>      }
>      else
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index a97710a806..ca3505ecb7 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -38,6 +38,8 @@ extern vpci_register_init_t *const __end_vpci_array[];
>  
>  void vpci_remove_device(struct pci_dev *pdev)
>  {
> +    unsigned int i;
> +
>      ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
>  
>      if ( !has_vpci(pdev->domain) || !pdev->vpci )
> @@ -63,6 +65,10 @@ void vpci_remove_device(struct pci_dev *pdev)
>              if ( pdev->vpci->msix->table[i] )
>                  iounmap(pdev->vpci->msix->table[i]);
>      }
> +
> +    for ( i = 0; i < ARRAY_SIZE(pdev->vpci->header.bars); i++ )
> +        rangeset_destroy(pdev->vpci->header.bars[i].mem);
> +
>      xfree(pdev->vpci->msix);
>      xfree(pdev->vpci->msi);
>      xfree(pdev->vpci);
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index 486a655e8d..b78dd6512b 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -72,6 +72,7 @@ struct vpci {
>              /* Guest view of the BAR: address and lower bits. */
>              uint64_t guest_reg;
>              uint64_t size;
> +            struct rangeset *mem;
>              enum {
>                  VPCI_BAR_EMPTY,
>                  VPCI_BAR_IO,
> @@ -156,9 +157,9 @@ struct vpci {
>  
>  struct vpci_vcpu {
>      /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
> -    struct rangeset *mem;
>      struct pci_dev *pdev;
>      uint16_t cmd;
> +    bool map_pending : 1;

I do wonder whether we really need the map_pending boolean field;
couldn't pdev != NULL be used as a way to signal a pending mapping
operation?
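A sketch of that alternative (hypothetical; assumes v->vpci.pdev is
cleared once the deferred work completes or fails):

    bool vpci_process_pending(struct vcpu *v)
    {
        struct pci_dev *pdev = v->vpci.pdev;

        if ( !pdev )
            return false;           /* nothing pending */

        /* ... per-BAR mapping loop ... */

        v->vpci.pdev = NULL;        /* mark the deferred work as done */
        return false;
    }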

Thanks, Roger.

Patch

diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 5dc9b5338b..eb07fa0bb2 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -141,63 +141,106 @@  static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
 
 bool vpci_process_pending(struct vcpu *v)
 {
-    if ( v->vpci.mem )
+    struct pci_dev *pdev = v->vpci.pdev;
+
+    if ( !pdev )
+        return false;
+
+    if ( v->vpci.map_pending )
     {
         struct map_data data = {
             .d = v->domain,
             .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
         };
-        int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
-
-        if ( rc == -ERESTART )
-            return true;
+        struct vpci_header *header = &pdev->vpci->header;
+        unsigned int i;
 
         write_lock(&v->domain->pci_lock);
-        spin_lock(&v->vpci.pdev->vpci->lock);
-        /* Disable memory decoding unconditionally on failure. */
-        modify_decoding(v->vpci.pdev,
-                        rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
-                        !rc && v->vpci.rom_only);
-        spin_unlock(&v->vpci.pdev->vpci->lock);
-
-        rangeset_destroy(v->vpci.mem);
-        v->vpci.mem = NULL;
-        if ( rc )
-            /*
-             * FIXME: in case of failure remove the device from the domain.
-             * Note that there might still be leftover mappings. While this is
-             * safe for Dom0, for DomUs the domain will likely need to be
-             * killed in order to avoid leaking stale p2m mappings on
-             * failure.
-             */
-            vpci_remove_device(v->vpci.pdev);
+
+        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+        {
+            struct vpci_bar *bar = &header->bars[i];
+            int rc;
+
+            if ( rangeset_is_empty(bar->mem) )
+                continue;
+
+            rc = rangeset_consume_ranges(bar->mem, map_range, &data);
+
+            if ( rc == -ERESTART )
+            {
+                write_unlock(&v->domain->pci_lock);
+                return true;
+            }
+
+            spin_lock(&pdev->vpci->lock);
+            /* Disable memory decoding unconditionally on failure. */
+            modify_decoding(pdev, rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY :
+                                       v->vpci.cmd, !rc && v->vpci.rom_only);
+            spin_unlock(&pdev->vpci->lock);
+
+            if ( rc )
+            {
+                /*
+                 * FIXME: in case of failure remove the device from the domain.
+                 * Note that there might still be leftover mappings. While this
+                 * is safe for Dom0, for DomUs the domain needs to be killed in
+                 * order to avoid leaking stale p2m mappings on failure.
+                 */
+                v->vpci.map_pending = false;
+
+                if ( is_hardware_domain(v->domain) )
+                {
+                    vpci_remove_device(pdev);
+                    write_unlock(&v->domain->pci_lock);
+                }
+                else
+                {
+                    write_unlock(&v->domain->pci_lock);
+                    domain_crash(v->domain);
+                }
+                return false;
+            }
+        }
         write_unlock(&v->domain->pci_lock);
+
+        v->vpci.map_pending = false;
     }
 
+
     return false;
 }
 
 static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
-                            struct rangeset *mem, uint16_t cmd)
+                            uint16_t cmd)
 {
     struct map_data data = { .d = d, .map = true };
-    int rc;
+    struct vpci_header *header = &pdev->vpci->header;
+    int rc = 0;
+    unsigned int i;
 
     ASSERT(rw_is_locked(&d->pci_lock));
 
-    while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
+    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
     {
-        /*
-         * It's safe to drop and reacquire the lock in this context
-         * without risking pdev disappearing because devices cannot be
-         * removed until the initial domain has been started.
-         */
-        read_unlock(&d->pci_lock);
-        process_pending_softirqs();
-        read_lock(&d->pci_lock);
-    }
+        struct vpci_bar *bar = &header->bars[i];
 
-    rangeset_destroy(mem);
+        if ( rangeset_is_empty(bar->mem) )
+            continue;
+
+        while ( (rc = rangeset_consume_ranges(bar->mem, map_range,
+                                              &data)) == -ERESTART )
+        {
+            /*
+             * It's safe to drop and reacquire the lock in this context
+             * without risking pdev disappearing because devices cannot be
+             * removed until the initial domain has been started.
+             */
+            write_unlock(&d->pci_lock);
+            process_pending_softirqs();
+            write_lock(&d->pci_lock);
+        }
+    }
     if ( !rc )
         modify_decoding(pdev, cmd, false);
 
@@ -205,10 +248,12 @@  static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
 }
 
 static void defer_map(struct domain *d, struct pci_dev *pdev,
-                      struct rangeset *mem, uint16_t cmd, bool rom_only)
+                      uint16_t cmd, bool rom_only)
 {
     struct vcpu *curr = current;
 
+    ASSERT(!!rw_is_write_locked(&pdev->domain->pci_lock));
+
     /*
      * FIXME: when deferring the {un}map the state of the device should not
      * be trusted. For example the enable bit is toggled after the device
@@ -216,7 +261,7 @@  static void defer_map(struct domain *d, struct pci_dev *pdev,
      * started for the same device if the domain is not well-behaved.
      */
     curr->vpci.pdev = pdev;
-    curr->vpci.mem = mem;
+    curr->vpci.map_pending = true;
     curr->vpci.cmd = cmd;
     curr->vpci.rom_only = rom_only;
     /*
@@ -230,33 +275,34 @@  static void defer_map(struct domain *d, struct pci_dev *pdev,
 static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
 {
     struct vpci_header *header = &pdev->vpci->header;
-    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
     struct pci_dev *tmp, *dev = NULL;
     const struct domain *d;
     const struct vpci_msix *msix = pdev->vpci->msix;
-    unsigned int i;
+    unsigned int i, j;
     int rc;
+    bool map_pending;
 
     ASSERT(rw_is_locked(&pdev->domain->pci_lock));
 
-    if ( !mem )
-        return -ENOMEM;
-
     /*
-     * Create a rangeset that represents the current device BARs memory region
-     * and compare it against all the currently active BAR memory regions. If
-     * an overlap is found, subtract it from the region to be mapped/unmapped.
+     * Create a rangeset per BAR that represents the current device memory
+     * region and compare it against all the currently active BAR memory
+     * regions. If an overlap is found, subtract it from the region to be
+     * mapped/unmapped.
      *
-     * First fill the rangeset with all the BARs of this device or with the ROM
+     * First fill the rangesets with the BARs of this device or with the ROM
      * BAR only, depending on whether the guest is toggling the memory decode
      * bit of the command register, or the enable bit of the ROM BAR register.
      */
     for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
     {
-        const struct vpci_bar *bar = &header->bars[i];
+        struct vpci_bar *bar = &header->bars[i];
         unsigned long start = PFN_DOWN(bar->addr);
         unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
 
+        if ( !bar->mem )
+            continue;
+
         if ( !MAPPABLE_BAR(bar) ||
              (rom_only ? bar->type != VPCI_BAR_ROM
                        : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) ||
@@ -272,14 +318,31 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
             continue;
         }
 
-        rc = rangeset_add_range(mem, start, end);
+        rc = rangeset_add_range(bar->mem, start, end);
         if ( rc )
         {
             printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
                    start, end, rc);
-            rangeset_destroy(mem);
             return rc;
         }
+
+        /* Check for overlap with the already setup BAR ranges. */
+        for ( j = 0; j < i; j++ )
+        {
+            struct vpci_bar *bar = &header->bars[j];
+
+            if ( rangeset_is_empty(bar->mem) )
+                continue;
+
+            rc = rangeset_remove_range(bar->mem, start, end);
+            if ( rc )
+            {
+                printk(XENLOG_G_WARNING
+                       "Failed to remove overlapping range [%lx, %lx]: %d\n",
+                       start, end, rc);
+                return rc;
+            }
+        }
     }
 
     /* Remove any MSIX regions if present. */
@@ -289,14 +352,21 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
         unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
                                      vmsix_table_size(pdev->vpci, i) - 1);
 
-        rc = rangeset_remove_range(mem, start, end);
-        if ( rc )
+        for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
         {
-            printk(XENLOG_G_WARNING
-                   "Failed to remove MSIX table [%lx, %lx]: %d\n",
-                   start, end, rc);
-            rangeset_destroy(mem);
-            return rc;
+            const struct vpci_bar *bar = &header->bars[j];
+
+            if ( rangeset_is_empty(bar->mem) )
+                continue;
+
+            rc = rangeset_remove_range(bar->mem, start, end);
+            if ( rc )
+            {
+                printk(XENLOG_G_WARNING
+                       "Failed to remove MSIX table [%lx, %lx]: %d\n",
+                       start, end, rc);
+                return rc;
+            }
         }
     }
 
@@ -341,7 +411,7 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
                 unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
 
                 if ( !bar->enabled ||
-                     !rangeset_overlaps_range(mem, start, end) ||
+                     !rangeset_overlaps_range(bar->mem, start, end) ||
                      /*
                       * If only the ROM enable bit is toggled check against
                       * other BARs in the same device for overlaps, but not
@@ -350,12 +420,11 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
                      (rom_only && tmp == pdev && bar->type == VPCI_BAR_ROM) )
                     continue;
 
-                rc = rangeset_remove_range(mem, start, end);
+                rc = rangeset_remove_range(bar->mem, start, end);
                 if ( rc )
                 {
                     printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
                            start, end, rc);
-                    rangeset_destroy(mem);
                     return rc;
                 }
             }
@@ -380,10 +449,23 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
          * will always be to establish mappings and process all the BARs.
          */
         ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
-        return apply_map(pdev->domain, pdev, mem, cmd);
+        return apply_map(pdev->domain, pdev, cmd);
     }
 
-    defer_map(dev->domain, dev, mem, cmd, rom_only);
+    /* Find out how many memory ranges has left after MSI and overlaps. */
+    map_pending = false;
+    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+        if ( !rangeset_is_empty(header->bars[i].mem) )
+        {
+            map_pending = true;
+            break;
+        }
+
+    /* If there's no mapping work write the command register now. */
+    if ( !map_pending )
+        pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
+    else
+        defer_map(dev->domain, dev, cmd, rom_only);
 
     return 0;
 }
@@ -574,6 +656,19 @@  static void cf_check rom_write(
         rom->addr = val & PCI_ROM_ADDRESS_MASK;
 }
 
+static int bar_add_rangeset(struct pci_dev *pdev, struct vpci_bar *bar, int i)
+{
+    char str[32];
+
+    snprintf(str, sizeof(str), "%pp:BAR%d", &pdev->sbdf, i);
+
+    bar->mem = rangeset_new(pdev->domain, str, RANGESETF_no_print);
+    if ( !bar->mem )
+        return -ENOMEM;
+
+    return 0;
+}
+
 static int cf_check init_bars(struct pci_dev *pdev)
 {
     uint16_t cmd;
@@ -657,6 +752,13 @@  static int cf_check init_bars(struct pci_dev *pdev)
         else
             bars[i].type = VPCI_BAR_MEM32;
 
+        rc = bar_add_rangeset(pdev, &bars[i], i);
+        if ( rc )
+        {
+            bars[i].type = VPCI_BAR_EMPTY;
+            return rc;
+        }
+
         rc = pci_size_mem_bar(pdev->sbdf, reg, &addr, &size,
                               (i == num_bars - 1) ? PCI_BAR_LAST : 0);
         if ( rc < 0 )
@@ -707,6 +809,15 @@  static int cf_check init_bars(struct pci_dev *pdev)
                                    rom_reg, 4, rom);
             if ( rc )
                 rom->type = VPCI_BAR_EMPTY;
+            else
+            {
+                rc = bar_add_rangeset(pdev, rom, i);
+                if ( rc )
+                {
+                    rom->type = VPCI_BAR_EMPTY;
+                    return rc;
+                }
+            }
         }
     }
     else
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index a97710a806..ca3505ecb7 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -38,6 +38,8 @@  extern vpci_register_init_t *const __end_vpci_array[];
 
 void vpci_remove_device(struct pci_dev *pdev)
 {
+    unsigned int i;
+
     ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
 
     if ( !has_vpci(pdev->domain) || !pdev->vpci )
@@ -63,6 +65,10 @@  void vpci_remove_device(struct pci_dev *pdev)
             if ( pdev->vpci->msix->table[i] )
                 iounmap(pdev->vpci->msix->table[i]);
     }
+
+    for ( i = 0; i < ARRAY_SIZE(pdev->vpci->header.bars); i++ )
+        rangeset_destroy(pdev->vpci->header.bars[i].mem);
+
     xfree(pdev->vpci->msix);
     xfree(pdev->vpci->msi);
     xfree(pdev->vpci);
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 486a655e8d..b78dd6512b 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -72,6 +72,7 @@  struct vpci {
             /* Guest view of the BAR: address and lower bits. */
             uint64_t guest_reg;
             uint64_t size;
+            struct rangeset *mem;
             enum {
                 VPCI_BAR_EMPTY,
                 VPCI_BAR_IO,
@@ -156,9 +157,9 @@  struct vpci {
 
 struct vpci_vcpu {
     /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
-    struct rangeset *mem;
     struct pci_dev *pdev;
     uint16_t cmd;
+    bool map_pending : 1;
     bool rom_only : 1;
 };