diff mbox series

[v3,06/11] vpci/header: Handle p2m range sets per BAR

Message ID 20210930075223.860329-7-andr2000@gmail.com (mailing list archive)
State Superseded
Headers show
Series PCI devices passthrough on Arm, part 3 | expand

Commit Message

Oleksandr Andrushchenko Sept. 30, 2021, 7:52 a.m. UTC
From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>

Instead of handling a single range set, that contains all the memory
regions of all the BARs and ROM, have them per BAR.

This is in preparation of making non-identity mappings in p2m for the
MMIOs/ROM.

Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
---
 xen/drivers/vpci/header.c | 172 ++++++++++++++++++++++++++------------
 xen/include/xen/vpci.h    |   3 +-
 2 files changed, 122 insertions(+), 53 deletions(-)

Comments

Oleksandr Andrushchenko Oct. 25, 2021, 11:51 a.m. UTC | #1
Hi, Roger!
Could you please take a look at the below?
Jan was questioning the per BAR range set approach, so it
is crucial for the maintainer (you) to answer here.

Thank you in advance,
Oleksandr

On 30.09.21 10:52, Oleksandr Andrushchenko wrote:
> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>
> Instead of handling a single range set, that contains all the memory
> regions of all the BARs and ROM, have them per BAR.
>
> This is in preparation of making non-identity mappings in p2m for the
> MMIOs/ROM.
>
> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> ---
>   xen/drivers/vpci/header.c | 172 ++++++++++++++++++++++++++------------
>   xen/include/xen/vpci.h    |   3 +-
>   2 files changed, 122 insertions(+), 53 deletions(-)
>
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index ec4d215f36ff..9c603d26d302 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -131,49 +131,75 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>   
>   bool vpci_process_pending(struct vcpu *v)
>   {
> -    if ( v->vpci.mem )
> +    if ( v->vpci.num_mem_ranges )
>       {
>           struct map_data data = {
>               .d = v->domain,
>               .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>           };
> -        int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
> +        struct pci_dev *pdev = v->vpci.pdev;
> +        struct vpci_header *header = &pdev->vpci->header;
> +        unsigned int i;
>   
> -        if ( rc == -ERESTART )
> -            return true;
> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +        {
> +            struct vpci_bar *bar = &header->bars[i];
> +            int rc;
>   
> -        spin_lock(&v->vpci.pdev->vpci->lock);
> -        /* Disable memory decoding unconditionally on failure. */
> -        modify_decoding(v->vpci.pdev,
> -                        rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
> -                        !rc && v->vpci.rom_only);
> -        spin_unlock(&v->vpci.pdev->vpci->lock);
> +            if ( !bar->mem )
> +                continue;
>   
> -        rangeset_destroy(v->vpci.mem);
> -        v->vpci.mem = NULL;
> -        if ( rc )
> -            /*
> -             * FIXME: in case of failure remove the device from the domain.
> -             * Note that there might still be leftover mappings. While this is
> -             * safe for Dom0, for DomUs the domain will likely need to be
> -             * killed in order to avoid leaking stale p2m mappings on
> -             * failure.
> -             */
> -            vpci_remove_device(v->vpci.pdev);
> +            rc = rangeset_consume_ranges(bar->mem, map_range, &data);
> +
> +            if ( rc == -ERESTART )
> +                return true;
> +
> +            spin_lock(&pdev->vpci->lock);
> +            /* Disable memory decoding unconditionally on failure. */
> +            modify_decoding(pdev,
> +                            rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
> +                            !rc && v->vpci.rom_only);
> +            spin_unlock(&pdev->vpci->lock);
> +
> +            rangeset_destroy(bar->mem);
> +            bar->mem = NULL;
> +            v->vpci.num_mem_ranges--;
> +            if ( rc )
> +                /*
> +                 * FIXME: in case of failure remove the device from the domain.
> +                 * Note that there might still be leftover mappings. While this is
> +                 * safe for Dom0, for DomUs the domain will likely need to be
> +                 * killed in order to avoid leaking stale p2m mappings on
> +                 * failure.
> +                 */
> +                vpci_remove_device(pdev);
> +        }
>       }
>   
>       return false;
>   }
>   
>   static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> -                            struct rangeset *mem, uint16_t cmd)
> +                            uint16_t cmd)
>   {
>       struct map_data data = { .d = d, .map = true };
> -    int rc;
> +    struct vpci_header *header = &pdev->vpci->header;
> +    int rc = 0;
> +    unsigned int i;
> +
> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    {
> +        struct vpci_bar *bar = &header->bars[i];
>   
> -    while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
> -        process_pending_softirqs();
> -    rangeset_destroy(mem);
> +        if ( !bar->mem )
> +            continue;
> +
> +        while ( (rc = rangeset_consume_ranges(bar->mem, map_range,
> +                                              &data)) == -ERESTART )
> +            process_pending_softirqs();
> +        rangeset_destroy(bar->mem);
> +        bar->mem = NULL;
> +    }
>       if ( !rc )
>           modify_decoding(pdev, cmd, false);
>   
> @@ -181,7 +207,7 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>   }
>   
>   static void defer_map(struct domain *d, struct pci_dev *pdev,
> -                      struct rangeset *mem, uint16_t cmd, bool rom_only)
> +                      uint16_t cmd, bool rom_only, uint8_t num_mem_ranges)
>   {
>       struct vcpu *curr = current;
>   
> @@ -192,9 +218,9 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
>        * started for the same device if the domain is not well-behaved.
>        */
>       curr->vpci.pdev = pdev;
> -    curr->vpci.mem = mem;
>       curr->vpci.cmd = cmd;
>       curr->vpci.rom_only = rom_only;
> +    curr->vpci.num_mem_ranges = num_mem_ranges;
>       /*
>        * Raise a scheduler softirq in order to prevent the guest from resuming
>        * execution with pending mapping operations, to trigger the invocation
> @@ -206,42 +232,47 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
>   static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>   {
>       struct vpci_header *header = &pdev->vpci->header;
> -    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
>       struct pci_dev *tmp, *dev = NULL;
>       const struct vpci_msix *msix = pdev->vpci->msix;
> -    unsigned int i;
> +    unsigned int i, j;
>       int rc;
> -
> -    if ( !mem )
> -        return -ENOMEM;
> +    uint8_t num_mem_ranges;
>   
>       /*
> -     * Create a rangeset that represents the current device BARs memory region
> +     * Create a rangeset per BAR that represents the current device memory region
>        * and compare it against all the currently active BAR memory regions. If
>        * an overlap is found, subtract it from the region to be mapped/unmapped.
>        *
> -     * First fill the rangeset with all the BARs of this device or with the ROM
> +     * First fill the rangesets with all the BARs of this device or with the ROM
>        * BAR only, depending on whether the guest is toggling the memory decode
>        * bit of the command register, or the enable bit of the ROM BAR register.
>        */
>       for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>       {
> -        const struct vpci_bar *bar = &header->bars[i];
> +        struct vpci_bar *bar = &header->bars[i];
>           unsigned long start = PFN_DOWN(bar->addr);
>           unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>   
> +        bar->mem = NULL;
> +
>           if ( !MAPPABLE_BAR(bar) ||
>                (rom_only ? bar->type != VPCI_BAR_ROM
>                          : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
>               continue;
>   
> -        rc = rangeset_add_range(mem, start, end);
> +        bar->mem = rangeset_new(NULL, NULL, 0);
> +        if ( !bar->mem )
> +        {
> +            rc = -ENOMEM;
> +            goto fail;
> +        }
> +
> +        rc = rangeset_add_range(bar->mem, start, end);
>           if ( rc )
>           {
>               printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
>                      start, end, rc);
> -            rangeset_destroy(mem);
> -            return rc;
> +            goto fail;
>           }
>       }
>   
> @@ -252,14 +283,21 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>           unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
>                                        vmsix_table_size(pdev->vpci, i) - 1);
>   
> -        rc = rangeset_remove_range(mem, start, end);
> -        if ( rc )
> +        for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
>           {
> -            printk(XENLOG_G_WARNING
> -                   "Failed to remove MSIX table [%lx, %lx]: %d\n",
> -                   start, end, rc);
> -            rangeset_destroy(mem);
> -            return rc;
> +            const struct vpci_bar *bar = &header->bars[j];
> +
> +            if ( !bar->mem )
> +                continue;
> +
> +            rc = rangeset_remove_range(bar->mem, start, end);
> +            if ( rc )
> +            {
> +                printk(XENLOG_G_WARNING
> +                       "Failed to remove MSIX table [%lx, %lx]: %d\n",
> +                       start, end, rc);
> +                goto fail;
> +            }
>           }
>       }
>   
> @@ -291,7 +329,8 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>               unsigned long start = PFN_DOWN(bar->addr);
>               unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>   
> -            if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) ||
> +            if ( !bar->enabled ||
> +                 !rangeset_overlaps_range(bar->mem, start, end) ||
>                    /*
>                     * If only the ROM enable bit is toggled check against other
>                     * BARs in the same device for overlaps, but not against the
> @@ -300,13 +339,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                    (rom_only && tmp == pdev && bar->type == VPCI_BAR_ROM) )
>                   continue;
>   
> -            rc = rangeset_remove_range(mem, start, end);
> +            rc = rangeset_remove_range(bar->mem, start, end);
>               if ( rc )
>               {
>                   printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
>                          start, end, rc);
> -                rangeset_destroy(mem);
> -                return rc;
> +                goto fail;
>               }
>           }
>       }
> @@ -324,12 +362,42 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>            * will always be to establish mappings and process all the BARs.
>            */
>           ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
> -        return apply_map(pdev->domain, pdev, mem, cmd);
> +        return apply_map(pdev->domain, pdev, cmd);
>       }
>   
> -    defer_map(dev->domain, dev, mem, cmd, rom_only);
> +    /* Find out how many memory ranges has left after MSI and overlaps. */
> +    num_mem_ranges = 0;
> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    {
> +        struct vpci_bar *bar = &header->bars[i];
> +
> +        if ( !rangeset_is_empty(bar->mem) )
> +            num_mem_ranges++;
> +    }
> +
> +    /*
> +     * There are cases when PCI device, root port for example, has neither
> +     * memory space nor IO. In this case PCI command register write is
> +     * missed resulting in the underlying PCI device not functional, so:
> +     *   - if there are no regions write the command register now
> +     *   - if there are regions then defer work and write later on
> +     */
> +    if ( !num_mem_ranges )
> +        pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
> +    else
> +        defer_map(dev->domain, dev, cmd, rom_only, num_mem_ranges);
>   
>       return 0;
> +
> +fail:
> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    {
> +        struct vpci_bar *bar = &header->bars[i];
> +
> +        rangeset_destroy(bar->mem);
> +        bar->mem = NULL;
> +    }
> +    return rc;
>   }
>   
>   static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index a0320b22cb36..352e02d0106d 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -80,6 +80,7 @@ struct vpci {
>               /* Guest view of the BAR. */
>               uint64_t guest_addr;
>               uint64_t size;
> +            struct rangeset *mem;
>               enum {
>                   VPCI_BAR_EMPTY,
>                   VPCI_BAR_IO,
> @@ -154,9 +155,9 @@ struct vpci {
>   
>   struct vpci_vcpu {
>       /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
> -    struct rangeset *mem;
>       struct pci_dev *pdev;
>       uint16_t cmd;
> +    uint8_t num_mem_ranges;
>       bool rom_only : 1;
>   };
>
Roger Pau Monné Oct. 26, 2021, 9:08 a.m. UTC | #2
On Thu, Sep 30, 2021 at 10:52:18AM +0300, Oleksandr Andrushchenko wrote:
> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> 
> Instead of handling a single range set, that contains all the memory
> regions of all the BARs and ROM, have them per BAR.
> 
> This is in preparation of making non-identity mappings in p2m for the
> MMIOs/ROM.
> 
> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> ---
>  xen/drivers/vpci/header.c | 172 ++++++++++++++++++++++++++------------
>  xen/include/xen/vpci.h    |   3 +-
>  2 files changed, 122 insertions(+), 53 deletions(-)
> 
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index ec4d215f36ff..9c603d26d302 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -131,49 +131,75 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>  
>  bool vpci_process_pending(struct vcpu *v)
>  {
> -    if ( v->vpci.mem )
> +    if ( v->vpci.num_mem_ranges )
>      {
>          struct map_data data = {
>              .d = v->domain,
>              .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>          };
> -        int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
> +        struct pci_dev *pdev = v->vpci.pdev;
> +        struct vpci_header *header = &pdev->vpci->header;
> +        unsigned int i;
>  
> -        if ( rc == -ERESTART )
> -            return true;
> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +        {
> +            struct vpci_bar *bar = &header->bars[i];
> +            int rc;
>  
> -        spin_lock(&v->vpci.pdev->vpci->lock);
> -        /* Disable memory decoding unconditionally on failure. */
> -        modify_decoding(v->vpci.pdev,
> -                        rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
> -                        !rc && v->vpci.rom_only);
> -        spin_unlock(&v->vpci.pdev->vpci->lock);
> +            if ( !bar->mem )
> +                continue;
>  
> -        rangeset_destroy(v->vpci.mem);
> -        v->vpci.mem = NULL;
> -        if ( rc )
> -            /*
> -             * FIXME: in case of failure remove the device from the domain.
> -             * Note that there might still be leftover mappings. While this is
> -             * safe for Dom0, for DomUs the domain will likely need to be
> -             * killed in order to avoid leaking stale p2m mappings on
> -             * failure.
> -             */
> -            vpci_remove_device(v->vpci.pdev);
> +            rc = rangeset_consume_ranges(bar->mem, map_range, &data);
> +
> +            if ( rc == -ERESTART )
> +                return true;
> +
> +            spin_lock(&pdev->vpci->lock);
> +            /* Disable memory decoding unconditionally on failure. */
> +            modify_decoding(pdev,
> +                            rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
> +                            !rc && v->vpci.rom_only);
> +            spin_unlock(&pdev->vpci->lock);
> +
> +            rangeset_destroy(bar->mem);

Now that the rangesets are per-BAR we might have to consider
allocating them at initialization time and not destroying them when
empty. We could replace the NULL checks with rangeset_is_empty
instead. Not that you have to do this on this patch, but I think it's
worth mentioning.

> +            bar->mem = NULL;
> +            v->vpci.num_mem_ranges--;
> +            if ( rc )
> +                /*
> +                 * FIXME: in case of failure remove the device from the domain.
> +                 * Note that there might still be leftover mappings. While this is
> +                 * safe for Dom0, for DomUs the domain will likely need to be
> +                 * killed in order to avoid leaking stale p2m mappings on
> +                 * failure.
> +                 */
> +                vpci_remove_device(pdev);
> +        }
>      }
>  
>      return false;
>  }
>  
>  static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> -                            struct rangeset *mem, uint16_t cmd)
> +                            uint16_t cmd)
>  {
>      struct map_data data = { .d = d, .map = true };
> -    int rc;
> +    struct vpci_header *header = &pdev->vpci->header;
> +    int rc = 0;
> +    unsigned int i;
> +
> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    {
> +        struct vpci_bar *bar = &header->bars[i];
>  
> -    while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
> -        process_pending_softirqs();
> -    rangeset_destroy(mem);
> +        if ( !bar->mem )
> +            continue;
> +
> +        while ( (rc = rangeset_consume_ranges(bar->mem, map_range,
> +                                              &data)) == -ERESTART )
> +            process_pending_softirqs();
> +        rangeset_destroy(bar->mem);
> +        bar->mem = NULL;
> +    }
>      if ( !rc )
>          modify_decoding(pdev, cmd, false);
>  
> @@ -181,7 +207,7 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>  }
>  
>  static void defer_map(struct domain *d, struct pci_dev *pdev,
> -                      struct rangeset *mem, uint16_t cmd, bool rom_only)
> +                      uint16_t cmd, bool rom_only, uint8_t num_mem_ranges)

Like mentioned below, I don't think you need to pass the number of
BARs that need mapping changes. Iff that's strictly needed, it should
be an unsigned int.

>  {
>      struct vcpu *curr = current;
>  
> @@ -192,9 +218,9 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
>       * started for the same device if the domain is not well-behaved.
>       */
>      curr->vpci.pdev = pdev;
> -    curr->vpci.mem = mem;
>      curr->vpci.cmd = cmd;
>      curr->vpci.rom_only = rom_only;
> +    curr->vpci.num_mem_ranges = num_mem_ranges;
>      /*
>       * Raise a scheduler softirq in order to prevent the guest from resuming
>       * execution with pending mapping operations, to trigger the invocation
> @@ -206,42 +232,47 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
>  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>  {
>      struct vpci_header *header = &pdev->vpci->header;
> -    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
>      struct pci_dev *tmp, *dev = NULL;
>      const struct vpci_msix *msix = pdev->vpci->msix;
> -    unsigned int i;
> +    unsigned int i, j;
>      int rc;
> -
> -    if ( !mem )
> -        return -ENOMEM;
> +    uint8_t num_mem_ranges;
>  
>      /*
> -     * Create a rangeset that represents the current device BARs memory region
> +     * Create a rangeset per BAR that represents the current device memory region
>       * and compare it against all the currently active BAR memory regions. If
>       * an overlap is found, subtract it from the region to be mapped/unmapped.
>       *
> -     * First fill the rangeset with all the BARs of this device or with the ROM
> +     * First fill the rangesets with all the BARs of this device or with the ROM
>       * BAR only, depending on whether the guest is toggling the memory decode
>       * bit of the command register, or the enable bit of the ROM BAR register.
>       */
>      for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>      {
> -        const struct vpci_bar *bar = &header->bars[i];
> +        struct vpci_bar *bar = &header->bars[i];
>          unsigned long start = PFN_DOWN(bar->addr);
>          unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>  
> +        bar->mem = NULL;

Why do you need to set mem to NULL here? I think we should instead
assert that bar->mem == NULL here.

> +
>          if ( !MAPPABLE_BAR(bar) ||
>               (rom_only ? bar->type != VPCI_BAR_ROM
>                         : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
>              continue;
>  
> -        rc = rangeset_add_range(mem, start, end);
> +        bar->mem = rangeset_new(NULL, NULL, 0);
> +        if ( !bar->mem )
> +        {
> +            rc = -ENOMEM;
> +            goto fail;
> +        }
> +
> +        rc = rangeset_add_range(bar->mem, start, end);
>          if ( rc )
>          {
>              printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
>                     start, end, rc);
> -            rangeset_destroy(mem);
> -            return rc;
> +            goto fail;
>          }
>      }
>  
> @@ -252,14 +283,21 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>          unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
>                                       vmsix_table_size(pdev->vpci, i) - 1);
>  
> -        rc = rangeset_remove_range(mem, start, end);
> -        if ( rc )
> +        for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
>          {
> -            printk(XENLOG_G_WARNING
> -                   "Failed to remove MSIX table [%lx, %lx]: %d\n",
> -                   start, end, rc);
> -            rangeset_destroy(mem);
> -            return rc;
> +            const struct vpci_bar *bar = &header->bars[j];
> +
> +            if ( !bar->mem )
> +                continue;
> +
> +            rc = rangeset_remove_range(bar->mem, start, end);
> +            if ( rc )
> +            {
> +                printk(XENLOG_G_WARNING
> +                       "Failed to remove MSIX table [%lx, %lx]: %d\n",
> +                       start, end, rc);
> +                goto fail;
> +            }
>          }
>      }
>  
> @@ -291,7 +329,8 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>              unsigned long start = PFN_DOWN(bar->addr);
>              unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>  
> -            if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) ||
> +            if ( !bar->enabled ||
> +                 !rangeset_overlaps_range(bar->mem, start, end) ||
>                   /*
>                    * If only the ROM enable bit is toggled check against other
>                    * BARs in the same device for overlaps, but not against the
> @@ -300,13 +339,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                   (rom_only && tmp == pdev && bar->type == VPCI_BAR_ROM) )
>                  continue;
>  
> -            rc = rangeset_remove_range(mem, start, end);
> +            rc = rangeset_remove_range(bar->mem, start, end);
>              if ( rc )
>              {
>                  printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
>                         start, end, rc);
> -                rangeset_destroy(mem);
> -                return rc;
> +                goto fail;
>              }
>          }
>      }
> @@ -324,12 +362,42 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>           * will always be to establish mappings and process all the BARs.
>           */
>          ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
> -        return apply_map(pdev->domain, pdev, mem, cmd);
> +        return apply_map(pdev->domain, pdev, cmd);
>      }
>  
> -    defer_map(dev->domain, dev, mem, cmd, rom_only);
> +    /* Find out how many memory ranges has left after MSI and overlaps. */
> +    num_mem_ranges = 0;
> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    {
> +        struct vpci_bar *bar = &header->bars[i];

There's no need to declare this local variable AFAICT, just use
header->bars[i].mem. In any case this is likely to go away if you
follow my recommendation below to just call defer_map unconditionally
like it's currently done.

> +
> +        if ( !rangeset_is_empty(bar->mem) )
> +            num_mem_ranges++;
> +    }
> +
> +    /*
> +     * There are cases when PCI device, root port for example, has neither
> +     * memory space nor IO. In this case PCI command register write is
> +     * missed resulting in the underlying PCI device not functional, so:
> +     *   - if there are no regions write the command register now
> +     *   - if there are regions then defer work and write later on
> +     */
> +    if ( !num_mem_ranges )
> +        pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);

I think this is wrong, as not calling defer_map will prevent the
rangesets (bar[i]->mem) from being destroyed, so we are effectively
leaking memory.

You need to take a path similar to the failure one in case there are
no mappings pending, or even better just call defer_map anyway and let
it do it's thing, it should be capable of handling empty rangesets
just fine. That's how it's currently done.

> +    else
> +        defer_map(dev->domain, dev, cmd, rom_only, num_mem_ranges);
>  
>      return 0;
> +
> +fail:

We usually ask labels to be indented with one space.

> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    {
> +        struct vpci_bar *bar = &header->bars[i];
> +
> +        rangeset_destroy(bar->mem);
> +        bar->mem = NULL;
> +    }
> +    return rc;
>  }
>  
>  static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index a0320b22cb36..352e02d0106d 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -80,6 +80,7 @@ struct vpci {
>              /* Guest view of the BAR. */
>              uint64_t guest_addr;
>              uint64_t size;
> +            struct rangeset *mem;
>              enum {
>                  VPCI_BAR_EMPTY,
>                  VPCI_BAR_IO,
> @@ -154,9 +155,9 @@ struct vpci {
>  
>  struct vpci_vcpu {
>      /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
> -    struct rangeset *mem;
>      struct pci_dev *pdev;
>      uint16_t cmd;
> +    uint8_t num_mem_ranges;

AFAICT This could be a simple bool:

bool map_pending : 1;

As there's no strict need to know how many BARs have pending mappings.

Thanks, Roger.
Roger Pau Monné Oct. 26, 2021, 9:40 a.m. UTC | #3
On Mon, Oct 25, 2021 at 11:51:57AM +0000, Oleksandr Andrushchenko wrote:
> Hi, Roger!
> Could you please take a look at the below?
> Jan was questioning the per BAR range set approach, so it
> is crucial for the maintainer (you) to answer here.

I'm open to suggestions to using something different than a rangeset
per BAR, but lacking any concrete proposal I think using rangesets is
fine.

One possible way might be to extend rangesets so that private data
could be stored for each rangeset range, but that would then make
merging operations impossible, likewise splitting ranges would be
troublesome.

We could then store the physical BAR address in that private data and
use the rangeset addresses as guest physical address space. It's
unclear however that this approach would be any better than just using
a rangeset per BAR.

Thanks, Roger.
Oleksandr Andrushchenko Nov. 2, 2021, 10:34 a.m. UTC | #4
Hi, Roger!

On 26.10.21 12:08, Roger Pau Monné wrote:
> On Thu, Sep 30, 2021 at 10:52:18AM +0300, Oleksandr Andrushchenko wrote:
>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>
>> Instead of handling a single range set, that contains all the memory
>> regions of all the BARs and ROM, have them per BAR.
>>
>> This is in preparation of making non-identity mappings in p2m for the
>> MMIOs/ROM.
>>
>> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>> ---
>>   xen/drivers/vpci/header.c | 172 ++++++++++++++++++++++++++------------
>>   xen/include/xen/vpci.h    |   3 +-
>>   2 files changed, 122 insertions(+), 53 deletions(-)
>>
>> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
>> index ec4d215f36ff..9c603d26d302 100644
>> --- a/xen/drivers/vpci/header.c
>> +++ b/xen/drivers/vpci/header.c
>> @@ -131,49 +131,75 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>>   
>>   bool vpci_process_pending(struct vcpu *v)
>>   {
>> -    if ( v->vpci.mem )
>> +    if ( v->vpci.num_mem_ranges )
>>       {
>>           struct map_data data = {
>>               .d = v->domain,
>>               .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>>           };
>> -        int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
>> +        struct pci_dev *pdev = v->vpci.pdev;
>> +        struct vpci_header *header = &pdev->vpci->header;
>> +        unsigned int i;
>>   
>> -        if ( rc == -ERESTART )
>> -            return true;
>> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>> +        {
>> +            struct vpci_bar *bar = &header->bars[i];
>> +            int rc;
>>   
>> -        spin_lock(&v->vpci.pdev->vpci->lock);
>> -        /* Disable memory decoding unconditionally on failure. */
>> -        modify_decoding(v->vpci.pdev,
>> -                        rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
>> -                        !rc && v->vpci.rom_only);
>> -        spin_unlock(&v->vpci.pdev->vpci->lock);
>> +            if ( !bar->mem )
>> +                continue;
>>   
>> -        rangeset_destroy(v->vpci.mem);
>> -        v->vpci.mem = NULL;
>> -        if ( rc )
>> -            /*
>> -             * FIXME: in case of failure remove the device from the domain.
>> -             * Note that there might still be leftover mappings. While this is
>> -             * safe for Dom0, for DomUs the domain will likely need to be
>> -             * killed in order to avoid leaking stale p2m mappings on
>> -             * failure.
>> -             */
>> -            vpci_remove_device(v->vpci.pdev);
>> +            rc = rangeset_consume_ranges(bar->mem, map_range, &data);
>> +
>> +            if ( rc == -ERESTART )
>> +                return true;
>> +
>> +            spin_lock(&pdev->vpci->lock);
>> +            /* Disable memory decoding unconditionally on failure. */
>> +            modify_decoding(pdev,
>> +                            rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
>> +                            !rc && v->vpci.rom_only);
>> +            spin_unlock(&pdev->vpci->lock);
>> +
>> +            rangeset_destroy(bar->mem);
> Now that the rangesets are per-BAR we might have to consider
> allocating them at initialization time and not destroying them when
> empty. We could replace the NULL checks with rangeset_is_empty
> instead. Not that you have to do this on this patch, but I think it's
> worth mentioning.
Yes, this is a good idea. I will re-work the patch to create/destroy
the rangesets once in add/remove
>
>> +            bar->mem = NULL;
>> +            v->vpci.num_mem_ranges--;
>> +            if ( rc )
>> +                /*
>> +                 * FIXME: in case of failure remove the device from the domain.
>> +                 * Note that there might still be leftover mappings. While this is
>> +                 * safe for Dom0, for DomUs the domain will likely need to be
>> +                 * killed in order to avoid leaking stale p2m mappings on
>> +                 * failure.
>> +                 */
>> +                vpci_remove_device(pdev);
>> +        }
>>       }
>>   
>>       return false;
>>   }
>>   
>>   static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>> -                            struct rangeset *mem, uint16_t cmd)
>> +                            uint16_t cmd)
>>   {
>>       struct map_data data = { .d = d, .map = true };
>> -    int rc;
>> +    struct vpci_header *header = &pdev->vpci->header;
>> +    int rc = 0;
>> +    unsigned int i;
>> +
>> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>> +    {
>> +        struct vpci_bar *bar = &header->bars[i];
>>   
>> -    while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
>> -        process_pending_softirqs();
>> -    rangeset_destroy(mem);
>> +        if ( !bar->mem )
>> +            continue;
>> +
>> +        while ( (rc = rangeset_consume_ranges(bar->mem, map_range,
>> +                                              &data)) == -ERESTART )
>> +            process_pending_softirqs();
>> +        rangeset_destroy(bar->mem);
>> +        bar->mem = NULL;
>> +    }
>>       if ( !rc )
>>           modify_decoding(pdev, cmd, false);
>>   
>> @@ -181,7 +207,7 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>>   }
>>   
>>   static void defer_map(struct domain *d, struct pci_dev *pdev,
>> -                      struct rangeset *mem, uint16_t cmd, bool rom_only)
>> +                      uint16_t cmd, bool rom_only, uint8_t num_mem_ranges)
> Like mentioned below, I don't think you need to pass the number of
> BARs that need mapping changes. Iff that's strictly needed, it should
> be an unsigned int.
bool map_pending :1 works great
>
>>   {
>>       struct vcpu *curr = current;
>>   
>> @@ -192,9 +218,9 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
>>        * started for the same device if the domain is not well-behaved.
>>        */
>>       curr->vpci.pdev = pdev;
>> -    curr->vpci.mem = mem;
>>       curr->vpci.cmd = cmd;
>>       curr->vpci.rom_only = rom_only;
>> +    curr->vpci.num_mem_ranges = num_mem_ranges;
>>       /*
>>        * Raise a scheduler softirq in order to prevent the guest from resuming
>>        * execution with pending mapping operations, to trigger the invocation
>> @@ -206,42 +232,47 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
>>   static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>   {
>>       struct vpci_header *header = &pdev->vpci->header;
>> -    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
>>       struct pci_dev *tmp, *dev = NULL;
>>       const struct vpci_msix *msix = pdev->vpci->msix;
>> -    unsigned int i;
>> +    unsigned int i, j;
>>       int rc;
>> -
>> -    if ( !mem )
>> -        return -ENOMEM;
>> +    uint8_t num_mem_ranges;
>>   
>>       /*
>> -     * Create a rangeset that represents the current device BARs memory region
>> +     * Create a rangeset per BAR that represents the current device memory region
>>        * and compare it against all the currently active BAR memory regions. If
>>        * an overlap is found, subtract it from the region to be mapped/unmapped.
>>        *
>> -     * First fill the rangeset with all the BARs of this device or with the ROM
>> +     * First fill the rangesets with all the BARs of this device or with the ROM
>>        * BAR only, depending on whether the guest is toggling the memory decode
>>        * bit of the command register, or the enable bit of the ROM BAR register.
>>        */
>>       for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>       {
>> -        const struct vpci_bar *bar = &header->bars[i];
>> +        struct vpci_bar *bar = &header->bars[i];
>>           unsigned long start = PFN_DOWN(bar->addr);
>>           unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>>   
>> +        bar->mem = NULL;
> Why do you need to set mem to NULL here? I think we should instead
> assert that bar->mem == NULL here.
I will put an ASSERT here
>
>> +
>>           if ( !MAPPABLE_BAR(bar) ||
>>                (rom_only ? bar->type != VPCI_BAR_ROM
>>                          : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
>>               continue;
>>   
>> -        rc = rangeset_add_range(mem, start, end);
>> +        bar->mem = rangeset_new(NULL, NULL, 0);
>> +        if ( !bar->mem )
>> +        {
>> +            rc = -ENOMEM;
>> +            goto fail;
>> +        }
>> +
>> +        rc = rangeset_add_range(bar->mem, start, end);
>>           if ( rc )
>>           {
>>               printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
>>                      start, end, rc);
>> -            rangeset_destroy(mem);
>> -            return rc;
>> +            goto fail;
>>           }
>>       }
>>   
>> @@ -252,14 +283,21 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>           unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
>>                                        vmsix_table_size(pdev->vpci, i) - 1);
>>   
>> -        rc = rangeset_remove_range(mem, start, end);
>> -        if ( rc )
>> +        for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
>>           {
>> -            printk(XENLOG_G_WARNING
>> -                   "Failed to remove MSIX table [%lx, %lx]: %d\n",
>> -                   start, end, rc);
>> -            rangeset_destroy(mem);
>> -            return rc;
>> +            const struct vpci_bar *bar = &header->bars[j];
>> +
>> +            if ( !bar->mem )
>> +                continue;
>> +
>> +            rc = rangeset_remove_range(bar->mem, start, end);
>> +            if ( rc )
>> +            {
>> +                printk(XENLOG_G_WARNING
>> +                       "Failed to remove MSIX table [%lx, %lx]: %d\n",
>> +                       start, end, rc);
>> +                goto fail;
>> +            }
>>           }
>>       }
>>   
>> @@ -291,7 +329,8 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>               unsigned long start = PFN_DOWN(bar->addr);
>>               unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>>   
>> -            if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) ||
>> +            if ( !bar->enabled ||
>> +                 !rangeset_overlaps_range(bar->mem, start, end) ||
>>                    /*
>>                     * If only the ROM enable bit is toggled check against other
>>                     * BARs in the same device for overlaps, but not against the
>> @@ -300,13 +339,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>                    (rom_only && tmp == pdev && bar->type == VPCI_BAR_ROM) )
>>                   continue;
>>   
>> -            rc = rangeset_remove_range(mem, start, end);
>> +            rc = rangeset_remove_range(bar->mem, start, end);
>>               if ( rc )
>>               {
>>                   printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
>>                          start, end, rc);
>> -                rangeset_destroy(mem);
>> -                return rc;
>> +                goto fail;
>>               }
>>           }
>>       }
>> @@ -324,12 +362,42 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>            * will always be to establish mappings and process all the BARs.
>>            */
>>           ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
>> -        return apply_map(pdev->domain, pdev, mem, cmd);
>> +        return apply_map(pdev->domain, pdev, cmd);
>>       }
>>   
>> -    defer_map(dev->domain, dev, mem, cmd, rom_only);
>> +    /* Find out how many memory ranges has left after MSI and overlaps. */
>> +    num_mem_ranges = 0;
>> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>> +    {
>> +        struct vpci_bar *bar = &header->bars[i];
> There's no need to declare this local variable AFAICT, just use
> header->bars[i].mem.
Ok
>   In any case this is likely to go away if you
> follow my recommendation below to just call defer_map unconditionally
> like it's currently done.
Please see below
>> +
>> +        if ( !rangeset_is_empty(bar->mem) )
>> +            num_mem_ranges++;
>> +    }
>> +
>> +    /*
>> +     * There are cases when PCI device, root port for example, has neither
>> +     * memory space nor IO. In this case PCI command register write is
>> +     * missed resulting in the underlying PCI device not functional, so:
>> +     *   - if there are no regions write the command register now
>> +     *   - if there are regions then defer work and write later on
>> +     */
>> +    if ( !num_mem_ranges )
>> +        pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
> I think this is wrong, as not calling defer_map will prevent the
> rangesets (bar[i]->mem) from being destroyed, so we are effectively
> leaking memory.
Not really. As in case of num_mem_ranges == 0 there are no rangesets
to free as none was allocated
>
> You need to take a path similar to the failure one in case there are
> no mappings pending, or even better just call defer_map anyway and let
> it do it's thing, it should be capable of handling empty rangesets
> just fine. That's how it's currently done.
So, I think this is still valid to break early and do not go with defer_map
>
>> +    else
>> +        defer_map(dev->domain, dev, cmd, rom_only, num_mem_ranges);
>>   
>>       return 0;
>> +
>> +fail:
> We usually ask labels to be indented with one space.
Sure. I am confused a bit: there is no word for that in the coding
style and the sources use labels with and without the space.
>
>> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>> +    {
>> +        struct vpci_bar *bar = &header->bars[i];
>> +
>> +        rangeset_destroy(bar->mem);
>> +        bar->mem = NULL;
>> +    }
>> +    return rc;
>>   }
>>   
>>   static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
>> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
>> index a0320b22cb36..352e02d0106d 100644
>> --- a/xen/include/xen/vpci.h
>> +++ b/xen/include/xen/vpci.h
>> @@ -80,6 +80,7 @@ struct vpci {
>>               /* Guest view of the BAR. */
>>               uint64_t guest_addr;
>>               uint64_t size;
>> +            struct rangeset *mem;
>>               enum {
>>                   VPCI_BAR_EMPTY,
>>                   VPCI_BAR_IO,
>> @@ -154,9 +155,9 @@ struct vpci {
>>   
>>   struct vpci_vcpu {
>>       /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
>> -    struct rangeset *mem;
>>       struct pci_dev *pdev;
>>       uint16_t cmd;
>> +    uint8_t num_mem_ranges;
> AFAICT This could be a simple bool:
>
> bool map_pending : 1;
>
> As there's no strict need to know how many BARs have pending mappings.
This is true
>
> Thanks, Roger.
Thank you,
Oleksandr
Jan Beulich Nov. 2, 2021, 11:13 a.m. UTC | #5
On 26.10.2021 11:40, Roger Pau Monné wrote:
> On Mon, Oct 25, 2021 at 11:51:57AM +0000, Oleksandr Andrushchenko wrote:
>> Hi, Roger!
>> Could you please take a look at the below?
>> Jan was questioning the per BAR range set approach, so it
>> is crucial for the maintainer (you) to answer here.
> 
> I'm open to suggestions to using something different than a rangeset
> per BAR, but lacking any concrete proposal I think using rangesets is
> fine.

The main reason for my objection is that for the average BAR the
rangeset will hold exactly one range. That's not an efficient way
to express a single range.

> One possible way might be to extend rangesets so that private data
> could be stored for each rangeset range, but that would then make
> merging operations impossible, likewise splitting ranges would be
> troublesome.

Indeed, so I don't view this as an option.

Jan
diff mbox series

Patch

diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index ec4d215f36ff..9c603d26d302 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -131,49 +131,75 @@  static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
 
 bool vpci_process_pending(struct vcpu *v)
 {
-    if ( v->vpci.mem )
+    if ( v->vpci.num_mem_ranges )
     {
         struct map_data data = {
             .d = v->domain,
             .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
         };
-        int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
+        struct pci_dev *pdev = v->vpci.pdev;
+        struct vpci_header *header = &pdev->vpci->header;
+        unsigned int i;
 
-        if ( rc == -ERESTART )
-            return true;
+        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+        {
+            struct vpci_bar *bar = &header->bars[i];
+            int rc;
 
-        spin_lock(&v->vpci.pdev->vpci->lock);
-        /* Disable memory decoding unconditionally on failure. */
-        modify_decoding(v->vpci.pdev,
-                        rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
-                        !rc && v->vpci.rom_only);
-        spin_unlock(&v->vpci.pdev->vpci->lock);
+            if ( !bar->mem )
+                continue;
 
-        rangeset_destroy(v->vpci.mem);
-        v->vpci.mem = NULL;
-        if ( rc )
-            /*
-             * FIXME: in case of failure remove the device from the domain.
-             * Note that there might still be leftover mappings. While this is
-             * safe for Dom0, for DomUs the domain will likely need to be
-             * killed in order to avoid leaking stale p2m mappings on
-             * failure.
-             */
-            vpci_remove_device(v->vpci.pdev);
+            rc = rangeset_consume_ranges(bar->mem, map_range, &data);
+
+            if ( rc == -ERESTART )
+                return true;
+
+            spin_lock(&pdev->vpci->lock);
+            /* Disable memory decoding unconditionally on failure. */
+            modify_decoding(pdev,
+                            rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
+                            !rc && v->vpci.rom_only);
+            spin_unlock(&pdev->vpci->lock);
+
+            rangeset_destroy(bar->mem);
+            bar->mem = NULL;
+            v->vpci.num_mem_ranges--;
+            if ( rc )
+                /*
+                 * FIXME: in case of failure remove the device from the domain.
+                 * Note that there might still be leftover mappings. While this is
+                 * safe for Dom0, for DomUs the domain will likely need to be
+                 * killed in order to avoid leaking stale p2m mappings on
+                 * failure.
+                 */
+                vpci_remove_device(pdev);
+        }
     }
 
     return false;
 }
 
 static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
-                            struct rangeset *mem, uint16_t cmd)
+                            uint16_t cmd)
 {
     struct map_data data = { .d = d, .map = true };
-    int rc;
+    struct vpci_header *header = &pdev->vpci->header;
+    int rc = 0;
+    unsigned int i;
+
+    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+    {
+        struct vpci_bar *bar = &header->bars[i];
 
-    while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
-        process_pending_softirqs();
-    rangeset_destroy(mem);
+        if ( !bar->mem )
+            continue;
+
+        while ( (rc = rangeset_consume_ranges(bar->mem, map_range,
+                                              &data)) == -ERESTART )
+            process_pending_softirqs();
+        rangeset_destroy(bar->mem);
+        bar->mem = NULL;
+    }
     if ( !rc )
         modify_decoding(pdev, cmd, false);
 
@@ -181,7 +207,7 @@  static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
 }
 
 static void defer_map(struct domain *d, struct pci_dev *pdev,
-                      struct rangeset *mem, uint16_t cmd, bool rom_only)
+                      uint16_t cmd, bool rom_only, uint8_t num_mem_ranges)
 {
     struct vcpu *curr = current;
 
@@ -192,9 +218,9 @@  static void defer_map(struct domain *d, struct pci_dev *pdev,
      * started for the same device if the domain is not well-behaved.
      */
     curr->vpci.pdev = pdev;
-    curr->vpci.mem = mem;
     curr->vpci.cmd = cmd;
     curr->vpci.rom_only = rom_only;
+    curr->vpci.num_mem_ranges = num_mem_ranges;
     /*
      * Raise a scheduler softirq in order to prevent the guest from resuming
      * execution with pending mapping operations, to trigger the invocation
@@ -206,42 +232,47 @@  static void defer_map(struct domain *d, struct pci_dev *pdev,
 static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
 {
     struct vpci_header *header = &pdev->vpci->header;
-    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
     struct pci_dev *tmp, *dev = NULL;
     const struct vpci_msix *msix = pdev->vpci->msix;
-    unsigned int i;
+    unsigned int i, j;
     int rc;
-
-    if ( !mem )
-        return -ENOMEM;
+    uint8_t num_mem_ranges;
 
     /*
-     * Create a rangeset that represents the current device BARs memory region
+     * Create a rangeset per BAR that represents the current device memory region
      * and compare it against all the currently active BAR memory regions. If
      * an overlap is found, subtract it from the region to be mapped/unmapped.
      *
-     * First fill the rangeset with all the BARs of this device or with the ROM
+     * First fill the rangesets with all the BARs of this device or with the ROM
      * BAR only, depending on whether the guest is toggling the memory decode
      * bit of the command register, or the enable bit of the ROM BAR register.
      */
     for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
     {
-        const struct vpci_bar *bar = &header->bars[i];
+        struct vpci_bar *bar = &header->bars[i];
         unsigned long start = PFN_DOWN(bar->addr);
         unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
 
+        bar->mem = NULL;
+
         if ( !MAPPABLE_BAR(bar) ||
              (rom_only ? bar->type != VPCI_BAR_ROM
                        : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
             continue;
 
-        rc = rangeset_add_range(mem, start, end);
+        bar->mem = rangeset_new(NULL, NULL, 0);
+        if ( !bar->mem )
+        {
+            rc = -ENOMEM;
+            goto fail;
+        }
+
+        rc = rangeset_add_range(bar->mem, start, end);
         if ( rc )
         {
             printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
                    start, end, rc);
-            rangeset_destroy(mem);
-            return rc;
+            goto fail;
         }
     }
 
@@ -252,14 +283,21 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
         unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
                                      vmsix_table_size(pdev->vpci, i) - 1);
 
-        rc = rangeset_remove_range(mem, start, end);
-        if ( rc )
+        for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
         {
-            printk(XENLOG_G_WARNING
-                   "Failed to remove MSIX table [%lx, %lx]: %d\n",
-                   start, end, rc);
-            rangeset_destroy(mem);
-            return rc;
+            const struct vpci_bar *bar = &header->bars[j];
+
+            if ( !bar->mem )
+                continue;
+
+            rc = rangeset_remove_range(bar->mem, start, end);
+            if ( rc )
+            {
+                printk(XENLOG_G_WARNING
+                       "Failed to remove MSIX table [%lx, %lx]: %d\n",
+                       start, end, rc);
+                goto fail;
+            }
         }
     }
 
@@ -291,7 +329,8 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
             unsigned long start = PFN_DOWN(bar->addr);
             unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
 
-            if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) ||
+            if ( !bar->enabled ||
+                 !rangeset_overlaps_range(bar->mem, start, end) ||
                  /*
                   * If only the ROM enable bit is toggled check against other
                   * BARs in the same device for overlaps, but not against the
@@ -300,13 +339,12 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
                  (rom_only && tmp == pdev && bar->type == VPCI_BAR_ROM) )
                 continue;
 
-            rc = rangeset_remove_range(mem, start, end);
+            rc = rangeset_remove_range(bar->mem, start, end);
             if ( rc )
             {
                 printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
                        start, end, rc);
-                rangeset_destroy(mem);
-                return rc;
+                goto fail;
             }
         }
     }
@@ -324,12 +362,42 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
          * will always be to establish mappings and process all the BARs.
          */
         ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
-        return apply_map(pdev->domain, pdev, mem, cmd);
+        return apply_map(pdev->domain, pdev, cmd);
     }
 
-    defer_map(dev->domain, dev, mem, cmd, rom_only);
+    /* Find out how many memory ranges has left after MSI and overlaps. */
+    num_mem_ranges = 0;
+    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+    {
+        struct vpci_bar *bar = &header->bars[i];
+
+        if ( !rangeset_is_empty(bar->mem) )
+            num_mem_ranges++;
+    }
+
+    /*
+     * There are cases when PCI device, root port for example, has neither
+     * memory space nor IO. In this case PCI command register write is
+     * missed resulting in the underlying PCI device not functional, so:
+     *   - if there are no regions write the command register now
+     *   - if there are regions then defer work and write later on
+     */
+    if ( !num_mem_ranges )
+        pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
+    else
+        defer_map(dev->domain, dev, cmd, rom_only, num_mem_ranges);
 
     return 0;
+
+fail:
+    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+    {
+        struct vpci_bar *bar = &header->bars[i];
+
+        rangeset_destroy(bar->mem);
+        bar->mem = NULL;
+    }
+    return rc;
 }
 
 static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index a0320b22cb36..352e02d0106d 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -80,6 +80,7 @@  struct vpci {
             /* Guest view of the BAR. */
             uint64_t guest_addr;
             uint64_t size;
+            struct rangeset *mem;
             enum {
                 VPCI_BAR_EMPTY,
                 VPCI_BAR_IO,
@@ -154,9 +155,9 @@  struct vpci {
 
 struct vpci_vcpu {
     /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
-    struct rangeset *mem;
     struct pci_dev *pdev;
     uint16_t cmd;
+    uint8_t num_mem_ranges;
     bool rom_only : 1;
 };