
[06/10] vpci: Make every domain handle its own BARs

Message ID 20201109125031.26409-7-andr2000@gmail.com (mailing list archive)
Series ARM PCI passthrough configuration and vPCI

Commit Message

Oleksandr Andrushchenko Nov. 9, 2020, 12:50 p.m. UTC
From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>

At the moment there is an identity mapping between how a guest sees its
BARs and how they are programmed into the guest domain's p2m. This is not
going to work as guest domains have their own view of the BARs.
Extend the existing vPCI BAR handling to allow every domain to have its own
view of the BARs: only the hardware domain sees physical memory addresses in
this case, and for the rest they are emulated, including the logic required
for the guests to detect memory sizes and properties.

While emulating BAR accesses for the guests, create a link between the
virtual BAR address and the physical one: use the full memory address while
creating the range sets used to map/unmap the corresponding address spaces and
exploit the fact that a PCI BAR value doesn't use the 8 lower bits of the
memory address. Use those bits to pass the physical BAR's index, so we can
build/remove the proper p2m mappings.

Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
---
 xen/drivers/vpci/header.c | 276 ++++++++++++++++++++++++++++++++++----
 xen/drivers/vpci/vpci.c   |   1 +
 xen/include/xen/vpci.h    |  24 ++--
 3 files changed, 265 insertions(+), 36 deletions(-)

Comments

Roger Pau Monne Nov. 12, 2020, 9:40 a.m. UTC | #1
On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> 
> At the moment there is an identity mapping between how a guest sees its
> BARs and how they are programmed into guest domain's p2m. This is not
> going to work as guest domains have their own view on the BARs.
> Extend existing vPCI BAR handling to allow every domain to have its own
> view of the BARs: only hardware domain sees physical memory addresses in
> this case and for the rest those are emulated, including logic required
> for the guests to detect memory sizes and properties.
> 
> While emulating BAR access for the guests create a link between the
> virtual BAR address and physical one: use full memory address while
> creating range sets used to map/unmap corresponding address spaces and
> exploit the fact that PCI BAR value doesn't use 8 lower bits of the

I think you mean the low 4bits rather than the low 8bits?
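
For context on why it's 4 bits: in a 32-bit memory BAR only the low nibble is
not part of the address. For reference only, the relevant pci_regs.h values
(the same constants this code already uses) are:

/* bit 0: memory (0) vs I/O (1) space */
#define PCI_BASE_ADDRESS_SPACE_MEMORY  0x00
/* bits 1-2: 32-bit vs 64-bit memory BAR */
#define PCI_BASE_ADDRESS_MEM_TYPE_64   0x04
/* bit 3: prefetchable */
#define PCI_BASE_ADDRESS_MEM_PREFETCH  0x08
/* the address itself starts at bit 4 */
#define PCI_BASE_ADDRESS_MEM_MASK      (~0x0fUL)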

> memory address. Use those bits to pass physical BAR's index, so we can
> build/remove proper p2m mappings.

I find this quite hard to review, given it's a fairly big and
complicated patch. Do you think you could split into smaller chunks?

Maybe you could split into smaller patches that add bits towards the
end goal but still keep the identity mappings?

I've tried to do some review below, but I would appreciate if you
could split this.

> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> ---
>  xen/drivers/vpci/header.c | 276 ++++++++++++++++++++++++++++++++++----
>  xen/drivers/vpci/vpci.c   |   1 +
>  xen/include/xen/vpci.h    |  24 ++--
>  3 files changed, 265 insertions(+), 36 deletions(-)
> 
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index f74f728884c0..7dc7c70e24f2 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -31,14 +31,87 @@
>  struct map_data {
>      struct domain *d;
>      bool map;
> +    struct pci_dev *pdev;

If the field is required please place it after the domain one.

>  };
>  
> +static struct vpci_header *get_vpci_header(struct domain *d,
> +                                           const struct pci_dev *pdev);
> +
> +static struct vpci_header *get_hwdom_vpci_header(const struct pci_dev *pdev)
> +{
> +    if ( unlikely(list_empty(&pdev->vpci->headers)) )
> +        return get_vpci_header(hardware_domain, pdev);

I'm not sure I understand why you need a list here: each device can
only be owned by a single guest, and thus there shouldn't be multiple
views of the BARs (or the header).

> +
> +    /* hwdom's header is always the very first entry. */
> +    return list_first_entry(&pdev->vpci->headers, struct vpci_header, node);
> +}
> +
> +static struct vpci_header *get_vpci_header(struct domain *d,
> +                                           const struct pci_dev *pdev)
> +{
> +    struct list_head *prev;
> +    struct vpci_header *header;
> +    struct vpci *vpci = pdev->vpci;
> +
> +    list_for_each( prev, &vpci->headers )
> +    {
> +        struct vpci_header *this = list_entry(prev, struct vpci_header, node);
> +
> +        if ( this->domain_id == d->domain_id )
> +            return this;
> +    }
> +    printk(XENLOG_DEBUG "--------------------------------------" \
> +           "Adding new vPCI BAR headers for domain %d: " PRI_pci" \n",
> +           d->domain_id, pdev->sbdf.seg, pdev->sbdf.bus,
> +           pdev->sbdf.dev, pdev->sbdf.fn);
> +    header = xzalloc(struct vpci_header);
> +    if ( !header )
> +    {
> +        printk(XENLOG_ERR
> +               "Failed to add new vPCI BAR headers for domain %d: " PRI_pci" \n",
> +               d->domain_id, pdev->sbdf.seg, pdev->sbdf.bus,
> +               pdev->sbdf.dev, pdev->sbdf.fn);
> +        return NULL;
> +    }
> +
> +    if ( !is_hardware_domain(d) )
> +    {
> +        struct vpci_header *hwdom_header = get_hwdom_vpci_header(pdev);
> +
> +        /* Make a copy of the hwdom's BARs as the initial state for vBARs. */
> +        memcpy(header, hwdom_header, sizeof(*header));
> +    }
> +
> +    header->domain_id = d->domain_id;
> +    list_add_tail(&header->node, &vpci->headers);

Same here, I think you want a single header, and then some fields
would be read-only for domUs (like the position of the BARs on the
physmap).
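
For illustration only (the guest_addr field name below is made up, not
something in this patch), keeping a single header could mean storing both
views in each BAR and treating the host one as read-only for domUs:

struct vpci_bar {
    uint64_t addr;        /* host physmap position: fixed, read-only for a domU */
    uint64_t guest_addr;  /* address the guest has written into its view */
    uint64_t size;
    bool prefetchable;
    /* remaining fields (type, enabled, ...) as in the current struct */
};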

> +    return header;
> +}
> +
> +static struct vpci_bar *get_vpci_bar(struct domain *d,
> +                                     const struct pci_dev *pdev,
> +                                     int bar_idx)

unsigned

> +{
> +    struct vpci_header *vheader;
> +
> +    vheader = get_vpci_header(d, pdev);
> +    if ( !vheader )
> +        return NULL;
> +
> +    return &vheader->bars[bar_idx];
> +}
> +
>  static int map_range(unsigned long s, unsigned long e, void *data,
>                       unsigned long *c)
>  {
>      const struct map_data *map = data;
> -    int rc;
> -
> +    unsigned long mfn;
> +    int rc, bar_idx;
> +    struct vpci_header *header = get_hwdom_vpci_header(map->pdev);
> +
> +    bar_idx = s & ~PCI_BASE_ADDRESS_MEM_MASK;

I'm not sure it's fine to stash the BAR index in the low bits of the
address, what about a device having concatenated BARs?

The rangeset would normally join them into a single range, and then
you won't be able to notice whether a region in the rangeset belongs
to one BAR or another.

IMO it might be easier to just have a rangeset for each BAR and
structure the pending work as a linked list of BARs that will contain
the physical addresses of each BAR (the real physmap one and the guest
physmap view) plus the rangeset of memory regions to map.
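
A rough sketch of that shape (names hypothetical, types as already used by
vPCI and the rangeset code):

struct bar_map_entry {
    struct list_head node;   /* pending-work list hanging off the vCPU */
    paddr_t hw_addr;         /* BAR position in the host memory map */
    paddr_t guest_addr;      /* BAR position in the guest physmap */
    struct rangeset *mem;    /* regions of this BAR still to be {un}mapped */
};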

> +    s = PFN_DOWN(s);
> +    e = PFN_DOWN(e);

Changing the rangeset to store memory addresses instead of frames
could for example be split into a separate patch.

I think you are doing the calculation of the end pfn wrong here, you
should use PFN_UP instead in case the address is not aligned.

> +    mfn = _mfn(PFN_DOWN(header->bars[bar_idx].addr));
>      for ( ; ; )
>      {
>          unsigned long size = e - s + 1;
> @@ -52,11 +125,15 @@ static int map_range(unsigned long s, unsigned long e, void *data,
>           * - {un}map_mmio_regions doesn't support preemption.
>           */
>  
> -        rc = map->map ? map_mmio_regions(map->d, _gfn(s), size, _mfn(s))
> -                      : unmap_mmio_regions(map->d, _gfn(s), size, _mfn(s));
> +        rc = map->map ? map_mmio_regions(map->d, _gfn(s), size, mfn)
> +                      : unmap_mmio_regions(map->d, _gfn(s), size, mfn);
>          if ( rc == 0 )
>          {
> -            *c += size;
> +            /*
> +             * Range set is not expressed in frame numbers and the size
> +             * is the number of frames, so update accordingly.
> +             */
> +            *c += size << PAGE_SHIFT;
>              break;
>          }
>          if ( rc < 0 )
> @@ -67,8 +144,9 @@ static int map_range(unsigned long s, unsigned long e, void *data,
>              break;
>          }
>          ASSERT(rc < size);
> -        *c += rc;
> +        *c += rc << PAGE_SHIFT;
>          s += rc;
> +        mfn += rc;
>          if ( general_preempt_check() )
>                  return -ERESTART;
>      }
> @@ -84,7 +162,7 @@ static int map_range(unsigned long s, unsigned long e, void *data,
>  static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>                              bool rom_only)
>  {
> -    struct vpci_header *header = &pdev->vpci->header;
> +    struct vpci_header *header = get_hwdom_vpci_header(pdev);
>      bool map = cmd & PCI_COMMAND_MEMORY;
>      unsigned int i;
>  
> @@ -136,6 +214,7 @@ bool vpci_process_pending(struct vcpu *v)
>          struct map_data data = {
>              .d = v->domain,
>              .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
> +            .pdev = v->vpci.pdev,
>          };
>          int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
>  
> @@ -168,7 +247,8 @@ bool vpci_process_pending(struct vcpu *v)
>  static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>                              struct rangeset *mem, uint16_t cmd)
>  {
> -    struct map_data data = { .d = d, .map = true };
> +    struct map_data data = { .d = d, .map = true,
> +        .pdev = (struct pci_dev *)pdev };

Dropping the const here is not fine. It either needs to be dropped
from apply_map and further up, or this needs to also be made const.

>      int rc;
>  
>      while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
> @@ -205,7 +285,7 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
>  
>  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>  {
> -    struct vpci_header *header = &pdev->vpci->header;
> +    struct vpci_header *header;
>      struct rangeset *mem = rangeset_new(NULL, NULL, 0);
>      struct pci_dev *tmp, *dev = NULL;
>  #ifdef CONFIG_X86
> @@ -217,6 +297,11 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>      if ( !mem )
>          return -ENOMEM;
>  
> +    if ( is_hardware_domain(current->domain) )
> +        header = get_hwdom_vpci_header(pdev);
> +    else
> +        header = get_vpci_header(current->domain, pdev);
> +
>      /*
>       * Create a rangeset that represents the current device BARs memory region
>       * and compare it against all the currently active BAR memory regions. If
> @@ -225,12 +310,15 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>       * First fill the rangeset with all the BARs of this device or with the ROM
>       * BAR only, depending on whether the guest is toggling the memory decode
>       * bit of the command register, or the enable bit of the ROM BAR register.
> +     *
> +     * Use the PCI reserved bits of the BAR to pass BAR's index.
>       */
>      for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>      {
>          const struct vpci_bar *bar = &header->bars[i];
> -        unsigned long start = PFN_DOWN(bar->addr);
> -        unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
> +        unsigned long start = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) | i;
> +        unsigned long end = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) +
> +            bar->size - 1;

Will this work fine on Arm 32bits with LPAE? It's my understanding
that in that case unsigned long is 32bits, but the physical address
space is 44bits, in which case this won't work.

I think you need to keep the usage of frame numbers here.
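
(With 4K pages a 44-bit physical address needs only 44 - 12 = 32 bits once
expressed as a frame number, so PFNs still fit in a 32-bit unsigned long while
full byte addresses do not.)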

>  
>          if ( !MAPPABLE_BAR(bar) ||
>               (rom_only ? bar->type != VPCI_BAR_ROM
> @@ -251,9 +339,11 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>      /* Remove any MSIX regions if present. */
>      for ( i = 0; msix && i < ARRAY_SIZE(msix->tables); i++ )
>      {
> -        unsigned long start = PFN_DOWN(vmsix_table_addr(pdev->vpci, i));
> -        unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
> -                                     vmsix_table_size(pdev->vpci, i) - 1);
> +        unsigned long start = (vmsix_table_addr(pdev->vpci, i) &
> +                               PCI_BASE_ADDRESS_MEM_MASK) | i;
> +        unsigned long end = (vmsix_table_addr(pdev->vpci, i) &
> +                             PCI_BASE_ADDRESS_MEM_MASK ) +
> +                             vmsix_table_size(pdev->vpci, i) - 1;
>  
>          rc = rangeset_remove_range(mem, start, end);
>          if ( rc )
> @@ -273,6 +363,8 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>       */
>      for_each_pdev ( pdev->domain, tmp )
>      {
> +        struct vpci_header *header;
> +
>          if ( tmp == pdev )
>          {
>              /*
> @@ -289,11 +381,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                  continue;
>          }
>  
> -        for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ )
> +        header = get_vpci_header(tmp->domain, pdev);
> +
> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>          {
> -            const struct vpci_bar *bar = &tmp->vpci->header.bars[i];
> -            unsigned long start = PFN_DOWN(bar->addr);
> -            unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
> +            const struct vpci_bar *bar = &header->bars[i];
> +            unsigned long start = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) | i;
> +            unsigned long end = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK)
> +                + bar->size - 1;
>  
>              if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) ||
>                   /*
> @@ -357,7 +452,7 @@ static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
>          pci_conf_write16(pdev->sbdf, reg, cmd);
>  }
>  
> -static void bar_write(const struct pci_dev *pdev, unsigned int reg,
> +static void bar_write_hwdom(const struct pci_dev *pdev, unsigned int reg,
>                        uint32_t val, void *data)
>  {
>      struct vpci_bar *bar = data;
> @@ -377,14 +472,17 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>      {
>          /* If the value written is the current one avoid printing a warning. */
>          if ( val != (uint32_t)(bar->addr >> (hi ? 32 : 0)) )
> +        {
> +            struct vpci_header *header = get_hwdom_vpci_header(pdev);
> +
>              gprintk(XENLOG_WARNING,
>                      "%04x:%02x:%02x.%u: ignored BAR %lu write with memory decoding enabled\n",
>                      pdev->seg, pdev->bus, slot, func,
> -                    bar - pdev->vpci->header.bars + hi);
> +                    bar - header->bars + hi);
> +        }
>          return;
>      }
>  
> -
>      /*
>       * Update the cached address, so that when memory decoding is enabled
>       * Xen can map the BAR into the guest p2m.
> @@ -403,10 +501,89 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>      pci_conf_write32(pdev->sbdf, reg, val);
>  }
>  
> +static uint32_t bar_read_hwdom(const struct pci_dev *pdev, unsigned int reg,
> +                               void *data)
> +{
> +    return vpci_hw_read32(pdev, reg, data);
> +}
> +
> +static void bar_write_guest(const struct pci_dev *pdev, unsigned int reg,
> +                            uint32_t val, void *data)
> +{
> +    struct vpci_bar *vbar = data;
> +    bool hi = false;
> +
> +    if ( vbar->type == VPCI_BAR_MEM64_HI )
> +    {
> +        ASSERT(reg > PCI_BASE_ADDRESS_0);
> +        vbar--;
> +        hi = true;
> +    }
> +    vbar->addr &= ~(0xffffffffull << (hi ? 32 : 0));
> +    vbar->addr |= (uint64_t)val << (hi ? 32 : 0);
> +}
> +
> +static uint32_t bar_read_guest(const struct pci_dev *pdev, unsigned int reg,
> +                               void *data)
> +{
> +    struct vpci_bar *vbar = data;
> +    uint32_t val;
> +    bool hi = false;
> +
> +    if ( vbar->type == VPCI_BAR_MEM64_HI )
> +    {
> +        ASSERT(reg > PCI_BASE_ADDRESS_0);
> +        vbar--;
> +        hi = true;
> +    }
> +
> +    if ( vbar->type == VPCI_BAR_MEM64_LO || vbar->type == VPCI_BAR_MEM64_HI )

I think this would be clearer using a switch statement.
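
Something along these lines perhaps (just a sketch of the control flow reusing
the patch's fields, with the size-probe handling elided):

switch ( vbar->type )
{
case VPCI_BAR_MEM64_LO:
case VPCI_BAR_MEM64_HI:
    val = hi ? vbar->addr >> 32 : (uint32_t)vbar->addr;
    /* 64-bit size-probe handling here */
    break;

case VPCI_BAR_MEM32:
    val = vbar->addr;
    /* 32-bit size-probe handling here */
    break;

default:
    val = vbar->addr;
    break;
}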

> +    {
> +        if ( hi )
> +            val = vbar->addr >> 32;
> +        else
> +            val = vbar->addr & 0xffffffff;
> +        if ( val == ~0 )

Strictly speaking I think you are not forced to write 1s to the
reserved 4 bits in the low register (and in the 32bit case).
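
I.e. the probe detection for the low register (and the 32-bit case) could mask
the read-only bits before comparing, something like (sketch only, using the
PCI_BASE_ADDRESS_MEM_MASK constant the patch already relies on):

if ( (val & PCI_BASE_ADDRESS_MEM_MASK) ==
     (0xffffffff & PCI_BASE_ADDRESS_MEM_MASK) )
{
    /* Treat as a size probe: return the size mask plus the flag bits. */
}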

> +        {
> +            /* Guests detects BAR's properties and sizes. */
> +            if ( !hi )
> +            {
> +                val = 0xffffffff & ~(vbar->size - 1);
> +                val |= vbar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
> +                                                    : PCI_BASE_ADDRESS_MEM_TYPE_64;
> +                val |= vbar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
> +            }
> +            else
> +                val = vbar->size >> 32;
> +            vbar->addr &= ~(0xffffffffull << (hi ? 32 : 0));
> +            vbar->addr |= (uint64_t)val << (hi ? 32 : 0);
> +        }
> +    }
> +    else if ( vbar->type == VPCI_BAR_MEM32 )
> +    {
> +        val = vbar->addr;
> +        if ( val == ~0 )
> +        {
> +            if ( !hi )

There's no way hi can be true at this point AFAICT.

> +            {
> +                val = 0xffffffff & ~(vbar->size - 1);
> +                val |= vbar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
> +                                                    : PCI_BASE_ADDRESS_MEM_TYPE_64;
> +                val |= vbar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
> +            }
> +        }
> +    }
> +    else
> +    {
> +        val = vbar->addr;
> +    }
> +    return val;
> +}
> +
>  static void rom_write(const struct pci_dev *pdev, unsigned int reg,
>                        uint32_t val, void *data)
>  {
> -    struct vpci_header *header = &pdev->vpci->header;
> +    struct vpci_header *header = get_hwdom_vpci_header(pdev);
>      struct vpci_bar *rom = data;
>      uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
>      uint16_t cmd = pci_conf_read16(pdev->sbdf, PCI_COMMAND);
> @@ -452,15 +629,56 @@ static void rom_write(const struct pci_dev *pdev, unsigned int reg,
>          rom->addr = val & PCI_ROM_ADDRESS_MASK;
>  }

Don't you need to also protect a domU from writing to the ROM BAR
register?

>  
> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
> +                                  void *data)
> +{
> +    struct vpci_bar *vbar, *bar = data;
> +
> +    if ( is_hardware_domain(current->domain) )
> +        return bar_read_hwdom(pdev, reg, data);
> +
> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
> +    if ( !vbar )
> +        return ~0;
> +
> +    return bar_read_guest(pdev, reg, vbar);
> +}
> +
> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
> +                               uint32_t val, void *data)
> +{
> +    struct vpci_bar *bar = data;
> +
> +    if ( is_hardware_domain(current->domain) )
> +        bar_write_hwdom(pdev, reg, val, data);
> +    else
> +    {
> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
> +
> +        if ( !vbar )
> +            return;
> +        bar_write_guest(pdev, reg, val, vbar);
> +    }
> +}

You should assign different handlers based on whether the domain that
has the device assigned is a domU or the hardware domain, rather than
doing the selection here.
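
For example (sketch only; this assumes the vpci_read_t/vpci_write_t handler
types that vpci_add_register() already takes):

vpci_read_t *read = is_hardware_domain(pdev->domain) ? bar_read_hwdom
                                                     : bar_read_guest;
vpci_write_t *write = is_hardware_domain(pdev->domain) ? bar_write_hwdom
                                                       : bar_write_guest;

rc = vpci_add_register(pdev->vpci, read, write, reg, 4, &bars[i]);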

> +
> +/*
> + * FIXME: This is called early while adding vPCI handlers which is done
> + * by and for hwdom.
> + */
>  static int init_bars(struct pci_dev *pdev)
>  {
>      uint16_t cmd;
>      uint64_t addr, size;
>      unsigned int i, num_bars, rom_reg;
> -    struct vpci_header *header = &pdev->vpci->header;
> -    struct vpci_bar *bars = header->bars;
> +    struct vpci_header *header;
> +    struct vpci_bar *bars;
>      int rc;
>  
> +    header = get_hwdom_vpci_header(pdev);
> +    if ( !header )
> +        return -ENOMEM;
> +    bars = header->bars;
> +
>      switch ( pci_conf_read8(pdev->sbdf, PCI_HEADER_TYPE) & 0x7f )
>      {
>      case PCI_HEADER_TYPE_NORMAL:
> @@ -496,11 +714,12 @@ static int init_bars(struct pci_dev *pdev)
>          uint8_t reg = PCI_BASE_ADDRESS_0 + i * 4;
>          uint32_t val;
>  
> +        bars[i].index = i;
>          if ( i && bars[i - 1].type == VPCI_BAR_MEM64_LO )
>          {
>              bars[i].type = VPCI_BAR_MEM64_HI;
> -            rc = vpci_add_register(pdev->vpci, vpci_hw_read32, bar_write, reg,
> -                                   4, &bars[i]);
> +            rc = vpci_add_register(pdev->vpci, bar_read_dispatch,
> +                                   bar_write_dispatch, reg, 4, &bars[i]);
>              if ( rc )
>              {
>                  pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
> @@ -540,8 +759,8 @@ static int init_bars(struct pci_dev *pdev)
>          bars[i].size = size;
>          bars[i].prefetchable = val & PCI_BASE_ADDRESS_MEM_PREFETCH;
>  
> -        rc = vpci_add_register(pdev->vpci, vpci_hw_read32, bar_write, reg, 4,
> -                               &bars[i]);
> +        rc = vpci_add_register(pdev->vpci, bar_read_dispatch,
> +                               bar_write_dispatch, reg, 4, &bars[i]);
>          if ( rc )
>          {
>              pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
> @@ -558,6 +777,7 @@ static int init_bars(struct pci_dev *pdev)
>          rom->type = VPCI_BAR_ROM;
>          rom->size = size;
>          rom->addr = addr;
> +        rom->index = num_bars;
>          header->rom_enabled = pci_conf_read32(pdev->sbdf, rom_reg) &
>                                PCI_ROM_ADDRESS_ENABLE;
>  
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index a5293521a36a..728029da3e9c 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -69,6 +69,7 @@ int __hwdom_init vpci_add_handlers(struct pci_dev *pdev)
>          return -ENOMEM;
>  
>      INIT_LIST_HEAD(&pdev->vpci->handlers);
> +    INIT_LIST_HEAD(&pdev->vpci->headers);
>      spin_lock_init(&pdev->vpci->lock);
>  
>      for ( i = 0; i < NUM_VPCI_INIT; i++ )
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index c3501e9ec010..54423bc6556d 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -55,16 +55,14 @@ uint32_t vpci_hw_read32(const struct pci_dev *pdev, unsigned int reg,
>   */
>  bool __must_check vpci_process_pending(struct vcpu *v);
>  
> -struct vpci {
> -    /* List of vPCI handlers for a device. */
> -    struct list_head handlers;
> -    spinlock_t lock;
> -
>  #ifdef __XEN__
> -    /* Hide the rest of the vpci struct from the user-space test harness. */
>      struct vpci_header {
> +    struct list_head node;
> +    /* Domain that owns this view of the BARs. */
> +    domid_t domain_id;

Indentation seems screwed here.

>          /* Information about the PCI BARs of this device. */
>          struct vpci_bar {
> +            int index;

unsigned

>              uint64_t addr;
>              uint64_t size;
>              enum {
> @@ -88,8 +86,18 @@ struct vpci {
>           * is mapped into guest p2m) if there's a ROM BAR on the device.
>           */
>          bool rom_enabled      : 1;
> -        /* FIXME: currently there's no support for SR-IOV. */

Unless you are also adding support for SR-IOV, I would keep the
comment.

Thanks, Roger.
Oleksandr Andrushchenko Nov. 12, 2020, 1:16 p.m. UTC | #2
On 11/12/20 11:40 AM, Roger Pau Monné wrote:
> On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>
>> At the moment there is an identity mapping between how a guest sees its
>> BARs and how they are programmed into guest domain's p2m. This is not
>> going to work as guest domains have their own view on the BARs.
>> Extend existing vPCI BAR handling to allow every domain to have its own
>> view of the BARs: only hardware domain sees physical memory addresses in
>> this case and for the rest those are emulated, including logic required
>> for the guests to detect memory sizes and properties.
>>
>> While emulating BAR access for the guests create a link between the
>> virtual BAR address and physical one: use full memory address while
>> creating range sets used to map/unmap corresponding address spaces and
>> exploit the fact that PCI BAR value doesn't use 8 lower bits of the
> I think you mean the low 4bits rather than the low 8bits?
Yes, you are right. Will fix that
>
>> memory address. Use those bits to pass physical BAR's index, so we can
>> build/remove proper p2m mappings.
> I find this quite hard to review, given it's a fairly big and
> complicated patch. Do you think you could split into smaller chunks?

I'll try. But at the moment this code isn't meant to be production quality yet,
as what I'd like to achieve is to collect the community's view on the subject.
Once all questions are resolved I'll start working on the agreed solution,
which I expect to be production quality then.

>
> Maybe you could split into smaller patches that add bits towards the
> end goal but still keep the identity mappings?
>
> I've tried to do some review below, but I would appreciate if you
> could split this.
Thank you so much for doing so!
>
>> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>> ---
>>   xen/drivers/vpci/header.c | 276 ++++++++++++++++++++++++++++++++++----
>>   xen/drivers/vpci/vpci.c   |   1 +
>>   xen/include/xen/vpci.h    |  24 ++--
>>   3 files changed, 265 insertions(+), 36 deletions(-)
>>
>> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
>> index f74f728884c0..7dc7c70e24f2 100644
>> --- a/xen/drivers/vpci/header.c
>> +++ b/xen/drivers/vpci/header.c
>> @@ -31,14 +31,87 @@
>>   struct map_data {
>>       struct domain *d;
>>       bool map;
>> +    struct pci_dev *pdev;
> If the field is required please place it after the domain one.
I will, but may I ask why?
>
>>   };
>>   
>> +static struct vpci_header *get_vpci_header(struct domain *d,
>> +                                           const struct pci_dev *pdev);
>> +
>> +static struct vpci_header *get_hwdom_vpci_header(const struct pci_dev *pdev)
>> +{
>> +    if ( unlikely(list_empty(&pdev->vpci->headers)) )
>> +        return get_vpci_header(hardware_domain, pdev);
> I'm not sure I understand why you need a list here: each device can
> only be owned by a single guest, and thus there shouldn't be multiple
> views of the BARs (or the header).

That is because of the over-engineering happening here: you are 100% right,
and this all must be made way simpler, without the lists and all that. I just
blindly thought that we could have multiple guests, but I had overlooked the
simple fact you mentioned: physical BARs are for hwdom and virtual ones are
for *a single* guest, as we can't pass the same device through to multiple
guests at a time.

>
>> +
>> +    /* hwdom's header is always the very first entry. */
>> +    return list_first_entry(&pdev->vpci->headers, struct vpci_header, node);
>> +}
>> +
>> +static struct vpci_header *get_vpci_header(struct domain *d,
>> +                                           const struct pci_dev *pdev)
>> +{
>> +    struct list_head *prev;
>> +    struct vpci_header *header;
>> +    struct vpci *vpci = pdev->vpci;
>> +
>> +    list_for_each( prev, &vpci->headers )
>> +    {
>> +        struct vpci_header *this = list_entry(prev, struct vpci_header, node);
>> +
>> +        if ( this->domain_id == d->domain_id )
>> +            return this;
>> +    }
>> +    printk(XENLOG_DEBUG "--------------------------------------" \
>> +           "Adding new vPCI BAR headers for domain %d: " PRI_pci" \n",
>> +           d->domain_id, pdev->sbdf.seg, pdev->sbdf.bus,
>> +           pdev->sbdf.dev, pdev->sbdf.fn);
>> +    header = xzalloc(struct vpci_header);
>> +    if ( !header )
>> +    {
>> +        printk(XENLOG_ERR
>> +               "Failed to add new vPCI BAR headers for domain %d: " PRI_pci" \n",
>> +               d->domain_id, pdev->sbdf.seg, pdev->sbdf.bus,
>> +               pdev->sbdf.dev, pdev->sbdf.fn);
>> +        return NULL;
>> +    }
>> +
>> +    if ( !is_hardware_domain(d) )
>> +    {
>> +        struct vpci_header *hwdom_header = get_hwdom_vpci_header(pdev);
>> +
>> +        /* Make a copy of the hwdom's BARs as the initial state for vBARs. */
>> +        memcpy(header, hwdom_header, sizeof(*header));
>> +    }
>> +
>> +    header->domain_id = d->domain_id;
>> +    list_add_tail(&header->node, &vpci->headers);
> Same here, I think you want a single header, and then some fields
> would be read-only for domUs (like the position of the BARs on the
> physmap).
ditto, will remove the list
>
>> +    return header;
>> +}
>> +
>> +static struct vpci_bar *get_vpci_bar(struct domain *d,
>> +                                     const struct pci_dev *pdev,
>> +                                     int bar_idx)
> unsigned
ok
>
>> +{
>> +    struct vpci_header *vheader;
>> +
>> +    vheader = get_vpci_header(d, pdev);
>> +    if ( !vheader )
>> +        return NULL;
>> +
>> +    return &vheader->bars[bar_idx];
>> +}
>> +
>>   static int map_range(unsigned long s, unsigned long e, void *data,
>>                        unsigned long *c)
>>   {
>>       const struct map_data *map = data;
>> -    int rc;
>> -
>> +    unsigned long mfn;
>> +    int rc, bar_idx;
>> +    struct vpci_header *header = get_hwdom_vpci_header(map->pdev);
>> +
>> +    bar_idx = s & ~PCI_BASE_ADDRESS_MEM_MASK;
> I'm not sure it's fine to stash the BAR index in the low bits of the
> address, what about a device having concatenated BARs?

Hm, I am not an expert in PCI, so I didn't think about that. Probably nothing
stops a PCI device from splitting the same memory region into multiple ones...

>
> The rangeset would normally join them into a single range, and then
> you won't be able to notice whether a region in the rangeset belongs
> to one BAR or another.
Yes, I see. Very good catch, thank you
>
> IMO it might be easier to just have a rangeset for each BAR and
> structure the pending work as a linked list of BARs, that will contain
> the physical addresses of each BAR (the real phymap one and the guest
> physmap view) plus the rangeset of memory regions to map.
I'll try to think how to do that, thank you
>
>> +    s = PFN_DOWN(s);
>> +    e = PFN_DOWN(e);
> Changing the rangeset to store memory addresses instead of frames
> could for example be split into a separate patch.
Ok
>
> I think you are doing the calculation of the end pfn wrong here, you
> should use PFN_UP instead in case the address is not aligned.

PFN_DOWN for the start seems to be ok if the address is not aligned, which is
the case if I pass bar_index in the lower bits: PCI memory has PAGE_SIZE
granularity, so besides the fact that I use bar_index the address must be
page aligned.

The end address is expressed in (size - 1) form, again page aligned, so to get
the last page to be mapped PFN_DOWN also seems to be appropriate.

Do I miss something here?

>
>> +    mfn = _mfn(PFN_DOWN(header->bars[bar_idx].addr));
>>       for ( ; ; )
>>       {
>>           unsigned long size = e - s + 1;
>> @@ -52,11 +125,15 @@ static int map_range(unsigned long s, unsigned long e, void *data,
>>            * - {un}map_mmio_regions doesn't support preemption.
>>            */
>>   
>> -        rc = map->map ? map_mmio_regions(map->d, _gfn(s), size, _mfn(s))
>> -                      : unmap_mmio_regions(map->d, _gfn(s), size, _mfn(s));
>> +        rc = map->map ? map_mmio_regions(map->d, _gfn(s), size, mfn)
>> +                      : unmap_mmio_regions(map->d, _gfn(s), size, mfn);
>>           if ( rc == 0 )
>>           {
>> -            *c += size;
>> +            /*
>> +             * Range set is not expressed in frame numbers and the size
>> +             * is the number of frames, so update accordingly.
>> +             */
>> +            *c += size << PAGE_SHIFT;
>>               break;
>>           }
>>           if ( rc < 0 )
>> @@ -67,8 +144,9 @@ static int map_range(unsigned long s, unsigned long e, void *data,
>>               break;
>>           }
>>           ASSERT(rc < size);
>> -        *c += rc;
>> +        *c += rc << PAGE_SHIFT;
>>           s += rc;
>> +        mfn += rc;
>>           if ( general_preempt_check() )
>>                   return -ERESTART;
>>       }
>> @@ -84,7 +162,7 @@ static int map_range(unsigned long s, unsigned long e, void *data,
>>   static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>>                               bool rom_only)
>>   {
>> -    struct vpci_header *header = &pdev->vpci->header;
>> +    struct vpci_header *header = get_hwdom_vpci_header(pdev);
>>       bool map = cmd & PCI_COMMAND_MEMORY;
>>       unsigned int i;
>>   
>> @@ -136,6 +214,7 @@ bool vpci_process_pending(struct vcpu *v)
>>           struct map_data data = {
>>               .d = v->domain,
>>               .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>> +            .pdev = v->vpci.pdev,
>>           };
>>           int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
>>   
>> @@ -168,7 +247,8 @@ bool vpci_process_pending(struct vcpu *v)
>>   static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>>                               struct rangeset *mem, uint16_t cmd)
>>   {
>> -    struct map_data data = { .d = d, .map = true };
>> +    struct map_data data = { .d = d, .map = true,
>> +        .pdev = (struct pci_dev *)pdev };
> Dropping the const here is not fine. IT either needs to be dropped
> from apply_map and further up, or this needs to also be made const.
Ok, I'll try to keep it const
>
>>       int rc;
>>   
>>       while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
>> @@ -205,7 +285,7 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
>>   
>>   static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>   {
>> -    struct vpci_header *header = &pdev->vpci->header;
>> +    struct vpci_header *header;
>>       struct rangeset *mem = rangeset_new(NULL, NULL, 0);
>>       struct pci_dev *tmp, *dev = NULL;
>>   #ifdef CONFIG_X86
>> @@ -217,6 +297,11 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>       if ( !mem )
>>           return -ENOMEM;
>>   
>> +    if ( is_hardware_domain(current->domain) )
>> +        header = get_hwdom_vpci_header(pdev);
>> +    else
>> +        header = get_vpci_header(current->domain, pdev);
>> +
>>       /*
>>        * Create a rangeset that represents the current device BARs memory region
>>        * and compare it against all the currently active BAR memory regions. If
>> @@ -225,12 +310,15 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>        * First fill the rangeset with all the BARs of this device or with the ROM
>>        * BAR only, depending on whether the guest is toggling the memory decode
>>        * bit of the command register, or the enable bit of the ROM BAR register.
>> +     *
>> +     * Use the PCI reserved bits of the BAR to pass BAR's index.
>>        */
>>       for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>       {
>>           const struct vpci_bar *bar = &header->bars[i];
>> -        unsigned long start = PFN_DOWN(bar->addr);
>> -        unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>> +        unsigned long start = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) | i;
>> +        unsigned long end = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) +
>> +            bar->size - 1;
> Will this work fine on Arm 32bits with LPAE? It's my understanding
> that in that case unsigned long is 32bits, but the physical address
> space is 44bits, in which case this won't work.
Hm, good question
>
> I think you need to keep the usage of frame numbers here.
If I re-work the gfn <-> mfn mapping then yes, I can use frame numbers here and elsewhere
>
>>   
>>           if ( !MAPPABLE_BAR(bar) ||
>>                (rom_only ? bar->type != VPCI_BAR_ROM
>> @@ -251,9 +339,11 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>       /* Remove any MSIX regions if present. */
>>       for ( i = 0; msix && i < ARRAY_SIZE(msix->tables); i++ )
>>       {
>> -        unsigned long start = PFN_DOWN(vmsix_table_addr(pdev->vpci, i));
>> -        unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
>> -                                     vmsix_table_size(pdev->vpci, i) - 1);
>> +        unsigned long start = (vmsix_table_addr(pdev->vpci, i) &
>> +                               PCI_BASE_ADDRESS_MEM_MASK) | i;
>> +        unsigned long end = (vmsix_table_addr(pdev->vpci, i) &
>> +                             PCI_BASE_ADDRESS_MEM_MASK ) +
>> +                             vmsix_table_size(pdev->vpci, i) - 1;
>>   
>>           rc = rangeset_remove_range(mem, start, end);
>>           if ( rc )
>> @@ -273,6 +363,8 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>        */
>>       for_each_pdev ( pdev->domain, tmp )
>>       {
>> +        struct vpci_header *header;
>> +
>>           if ( tmp == pdev )
>>           {
>>               /*
>> @@ -289,11 +381,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>                   continue;
>>           }
>>   
>> -        for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ )
>> +        header = get_vpci_header(tmp->domain, pdev);
>> +
>> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>           {
>> -            const struct vpci_bar *bar = &tmp->vpci->header.bars[i];
>> -            unsigned long start = PFN_DOWN(bar->addr);
>> -            unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>> +            const struct vpci_bar *bar = &header->bars[i];
>> +            unsigned long start = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) | i;
>> +            unsigned long end = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK)
>> +                + bar->size - 1;
>>   
>>               if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) ||
>>                    /*
>> @@ -357,7 +452,7 @@ static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
>>           pci_conf_write16(pdev->sbdf, reg, cmd);
>>   }
>>   
>> -static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>> +static void bar_write_hwdom(const struct pci_dev *pdev, unsigned int reg,
>>                         uint32_t val, void *data)
>>   {
>>       struct vpci_bar *bar = data;
>> @@ -377,14 +472,17 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>       {
>>           /* If the value written is the current one avoid printing a warning. */
>>           if ( val != (uint32_t)(bar->addr >> (hi ? 32 : 0)) )
>> +        {
>> +            struct vpci_header *header = get_hwdom_vpci_header(pdev);
>> +
>>               gprintk(XENLOG_WARNING,
>>                       "%04x:%02x:%02x.%u: ignored BAR %lu write with memory decoding enabled\n",
>>                       pdev->seg, pdev->bus, slot, func,
>> -                    bar - pdev->vpci->header.bars + hi);
>> +                    bar - header->bars + hi);
>> +        }
>>           return;
>>       }
>>   
>> -
>>       /*
>>        * Update the cached address, so that when memory decoding is enabled
>>        * Xen can map the BAR into the guest p2m.
>> @@ -403,10 +501,89 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>       pci_conf_write32(pdev->sbdf, reg, val);
>>   }
>>   
>> +static uint32_t bar_read_hwdom(const struct pci_dev *pdev, unsigned int reg,
>> +                               void *data)
>> +{
>> +    return vpci_hw_read32(pdev, reg, data);
>> +}
>> +
>> +static void bar_write_guest(const struct pci_dev *pdev, unsigned int reg,
>> +                            uint32_t val, void *data)
>> +{
>> +    struct vpci_bar *vbar = data;
>> +    bool hi = false;
>> +
>> +    if ( vbar->type == VPCI_BAR_MEM64_HI )
>> +    {
>> +        ASSERT(reg > PCI_BASE_ADDRESS_0);
>> +        vbar--;
>> +        hi = true;
>> +    }
>> +    vbar->addr &= ~(0xffffffffull << (hi ? 32 : 0));
>> +    vbar->addr |= (uint64_t)val << (hi ? 32 : 0);
>> +}
>> +
>> +static uint32_t bar_read_guest(const struct pci_dev *pdev, unsigned int reg,
>> +                               void *data)
>> +{
>> +    struct vpci_bar *vbar = data;
>> +    uint32_t val;
>> +    bool hi = false;
>> +
>> +    if ( vbar->type == VPCI_BAR_MEM64_HI )
>> +    {
>> +        ASSERT(reg > PCI_BASE_ADDRESS_0);
>> +        vbar--;
>> +        hi = true;
>> +    }
>> +
>> +    if ( vbar->type == VPCI_BAR_MEM64_LO || vbar->type == VPCI_BAR_MEM64_HI )
> I think this would be clearer using a switch statement.
I'll think about
>
>> +    {
>> +        if ( hi )
>> +            val = vbar->addr >> 32;
>> +        else
>> +            val = vbar->addr & 0xffffffff;
>> +        if ( val == ~0 )
> Strictly speaking I think you are not forced to write 1s to the
> reserved 4 bits in the low register (and in the 32bit case).

Ah, so the Linux kernel, for instance, could have written 0xfffffff0 while I
expect 0xffffffff?

>
>> +        {
>> +            /* Guests detects BAR's properties and sizes. */
>> +            if ( !hi )
>> +            {
>> +                val = 0xffffffff & ~(vbar->size - 1);
>> +                val |= vbar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
>> +                                                    : PCI_BASE_ADDRESS_MEM_TYPE_64;
>> +                val |= vbar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
>> +            }
>> +            else
>> +                val = vbar->size >> 32;
>> +            vbar->addr &= ~(0xffffffffull << (hi ? 32 : 0));
>> +            vbar->addr |= (uint64_t)val << (hi ? 32 : 0);
>> +        }
>> +    }
>> +    else if ( vbar->type == VPCI_BAR_MEM32 )
>> +    {
>> +        val = vbar->addr;
>> +        if ( val == ~0 )
>> +        {
>> +            if ( !hi )
> There's no way hi can be true at this point AFAICT.
Sure, thank you
>
>> +            {
>> +                val = 0xffffffff & ~(vbar->size - 1);
>> +                val |= vbar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
>> +                                                    : PCI_BASE_ADDRESS_MEM_TYPE_64;
>> +                val |= vbar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
>> +            }
>> +        }
>> +    }
>> +    else
>> +    {
>> +        val = vbar->addr;
>> +    }
>> +    return val;
>> +}
>> +
>>   static void rom_write(const struct pci_dev *pdev, unsigned int reg,
>>                         uint32_t val, void *data)
>>   {
>> -    struct vpci_header *header = &pdev->vpci->header;
>> +    struct vpci_header *header = get_hwdom_vpci_header(pdev);
>>       struct vpci_bar *rom = data;
>>       uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
>>       uint16_t cmd = pci_conf_read16(pdev->sbdf, PCI_COMMAND);
>> @@ -452,15 +629,56 @@ static void rom_write(const struct pci_dev *pdev, unsigned int reg,
>>           rom->addr = val & PCI_ROM_ADDRESS_MASK;
>>   }
> Don't you need to also protect a domU from writing to the ROM BAR
> register?

The ROM was not a target of this RFC as I have no HW to test that, but the
final code must handle the ROM as well, you are right.

>
>>   
>> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
>> +                                  void *data)
>> +{
>> +    struct vpci_bar *vbar, *bar = data;
>> +
>> +    if ( is_hardware_domain(current->domain) )
>> +        return bar_read_hwdom(pdev, reg, data);
>> +
>> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
>> +    if ( !vbar )
>> +        return ~0;
>> +
>> +    return bar_read_guest(pdev, reg, vbar);
>> +}
>> +
>> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
>> +                               uint32_t val, void *data)
>> +{
>> +    struct vpci_bar *bar = data;
>> +
>> +    if ( is_hardware_domain(current->domain) )
>> +        bar_write_hwdom(pdev, reg, val, data);
>> +    else
>> +    {
>> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
>> +
>> +        if ( !vbar )
>> +            return;
>> +        bar_write_guest(pdev, reg, val, vbar);
>> +    }
>> +}
> You should assign different handlers based on whether the domain that
> has the device assigned is a domU or the hardware domain, rather than
> doing the selection here.

Hm, handlers are assigned once in init_bars and this function is only called
for hwdom, so there is no way I can do that for the guests. Hence the
dispatcher.

>
>> +
>> +/*
>> + * FIXME: This is called early while adding vPCI handlers which is done
>> + * by and for hwdom.
>> + */
>>   static int init_bars(struct pci_dev *pdev)
>>   {
>>       uint16_t cmd;
>>       uint64_t addr, size;
>>       unsigned int i, num_bars, rom_reg;
>> -    struct vpci_header *header = &pdev->vpci->header;
>> -    struct vpci_bar *bars = header->bars;
>> +    struct vpci_header *header;
>> +    struct vpci_bar *bars;
>>       int rc;
>>   
>> +    header = get_hwdom_vpci_header(pdev);
>> +    if ( !header )
>> +        return -ENOMEM;
>> +    bars = header->bars;
>> +
>>       switch ( pci_conf_read8(pdev->sbdf, PCI_HEADER_TYPE) & 0x7f )
>>       {
>>       case PCI_HEADER_TYPE_NORMAL:
>> @@ -496,11 +714,12 @@ static int init_bars(struct pci_dev *pdev)
>>           uint8_t reg = PCI_BASE_ADDRESS_0 + i * 4;
>>           uint32_t val;
>>   
>> +        bars[i].index = i;
>>           if ( i && bars[i - 1].type == VPCI_BAR_MEM64_LO )
>>           {
>>               bars[i].type = VPCI_BAR_MEM64_HI;
>> -            rc = vpci_add_register(pdev->vpci, vpci_hw_read32, bar_write, reg,
>> -                                   4, &bars[i]);
>> +            rc = vpci_add_register(pdev->vpci, bar_read_dispatch,
>> +                                   bar_write_dispatch, reg, 4, &bars[i]);
>>               if ( rc )
>>               {
>>                   pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
>> @@ -540,8 +759,8 @@ static int init_bars(struct pci_dev *pdev)
>>           bars[i].size = size;
>>           bars[i].prefetchable = val & PCI_BASE_ADDRESS_MEM_PREFETCH;
>>   
>> -        rc = vpci_add_register(pdev->vpci, vpci_hw_read32, bar_write, reg, 4,
>> -                               &bars[i]);
>> +        rc = vpci_add_register(pdev->vpci, bar_read_dispatch,
>> +                               bar_write_dispatch, reg, 4, &bars[i]);
>>           if ( rc )
>>           {
>>               pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
>> @@ -558,6 +777,7 @@ static int init_bars(struct pci_dev *pdev)
>>           rom->type = VPCI_BAR_ROM;
>>           rom->size = size;
>>           rom->addr = addr;
>> +        rom->index = num_bars;
>>           header->rom_enabled = pci_conf_read32(pdev->sbdf, rom_reg) &
>>                                 PCI_ROM_ADDRESS_ENABLE;
>>   
>> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
>> index a5293521a36a..728029da3e9c 100644
>> --- a/xen/drivers/vpci/vpci.c
>> +++ b/xen/drivers/vpci/vpci.c
>> @@ -69,6 +69,7 @@ int __hwdom_init vpci_add_handlers(struct pci_dev *pdev)
>>           return -ENOMEM;
>>   
>>       INIT_LIST_HEAD(&pdev->vpci->handlers);
>> +    INIT_LIST_HEAD(&pdev->vpci->headers);
>>       spin_lock_init(&pdev->vpci->lock);
>>   
>>       for ( i = 0; i < NUM_VPCI_INIT; i++ )
>> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
>> index c3501e9ec010..54423bc6556d 100644
>> --- a/xen/include/xen/vpci.h
>> +++ b/xen/include/xen/vpci.h
>> @@ -55,16 +55,14 @@ uint32_t vpci_hw_read32(const struct pci_dev *pdev, unsigned int reg,
>>    */
>>   bool __must_check vpci_process_pending(struct vcpu *v);
>>   
>> -struct vpci {
>> -    /* List of vPCI handlers for a device. */
>> -    struct list_head handlers;
>> -    spinlock_t lock;
>> -
>>   #ifdef __XEN__
>> -    /* Hide the rest of the vpci struct from the user-space test harness. */
>>       struct vpci_header {
>> +    struct list_head node;
>> +    /* Domain that owns this view of the BARs. */
>> +    domid_t domain_id;
> Indentation seems screwed here.
It did ;)
>
>>           /* Information about the PCI BARs of this device. */
>>           struct vpci_bar {
>> +            int index;
> unsigned
ok
>
>>               uint64_t addr;
>>               uint64_t size;
>>               enum {
>> @@ -88,8 +86,18 @@ struct vpci {
>>            * is mapped into guest p2m) if there's a ROM BAR on the device.
>>            */
>>           bool rom_enabled      : 1;
>> -        /* FIXME: currently there's no support for SR-IOV. */
> Unless you are also adding support for SR-IOV, I would keep the
> comment.

WRT SR-IOV I do need your series [1] ;) SR-IOV is one of our targets.

> Thanks, Roger.

Thank you so much for reviewing this,
Oleksandr

[1] https://lists.xenproject.org/archives/html/xen-devel/2018-07/msg01494.html
Roger Pau Monne Nov. 12, 2020, 2:46 p.m. UTC | #3
On Thu, Nov 12, 2020 at 01:16:10PM +0000, Oleksandr Andrushchenko wrote:
> 
> On 11/12/20 11:40 AM, Roger Pau Monné wrote:
> > On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
> >> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> >> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> >> index f74f728884c0..7dc7c70e24f2 100644
> >> --- a/xen/drivers/vpci/header.c
> >> +++ b/xen/drivers/vpci/header.c
> >> @@ -31,14 +31,87 @@
> >>   struct map_data {
> >>       struct domain *d;
> >>       bool map;
> >> +    struct pci_dev *pdev;
> > If the field is required please place it after the domain one.
> I will, but may I ask why?

So that if we add further boolean fields we can place them at the end of
the struct for layout reasons. If we do:

struct map_data {
    struct domain *d;
    bool map;
    struct pci_dev *pdev;
    bool foo;
}

We will end up with a bunch of padding that could be avoided by doing:

struct map_data {
    struct domain *d;
    struct pci_dev *pdev;
    bool map;
    bool foo;
}

> >> +    s = PFN_DOWN(s);
> >> +    e = PFN_DOWN(e);
> > Changing the rangeset to store memory addresses instead of frames
> > could for example be split into a separate patch.
> Ok
> >
> > I think you are doing the calculation of the end pfn wrong here, you
> > should use PFN_UP instead in case the address is not aligned.
> 
> PFN_DOWN for the start seems to be ok if the address is not aligned
> 
> which is the case if I pass bar_index in the lower bits: PCI memory has
> 
> PAGE_SIZE granularity, so besides the fact that I use bar_index the address

No, BARs don't need to be aligned to page boundaries, you can even
have different BARs inside the same physical page.

The spec recommends that the minimum size of a BAR should be 4KB, but that's
not a strict requirement, in which case a BAR can be as small as 16 bytes,
and then you can have multiple ones inside the same page.

> must be page aligned.
> 
> The end address is expressed in (size - 1) form, again page aligned,
> 
> so to get the last page to be mapped PFN_DOWN also seems to be appropriate.
> 
> Do I miss something here?

I'm not aware of any of those addresses or sizes being guaranteed to
be page aligned, so I think you need to account for that.

Some of the code here uses PFN_DOWN to calculate the end address
because the rangesets are used in an inclusive fashion, so the end
frame also gets mapped.
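
So for an arbitrary, possibly unaligned, BAR something like the pre-patch
frame-number calculation still covers the partial pages at both ends (sketch):

unsigned long start = PFN_DOWN(bar->addr);
unsigned long end = PFN_DOWN(bar->addr + bar->size - 1); /* inclusive last frame */

rc = rangeset_add_range(mem, start, end);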

> >
> >> +    mfn = _mfn(PFN_DOWN(header->bars[bar_idx].addr));
> >>       for ( ; ; )
> >>       {
> >>           unsigned long size = e - s + 1;
> >> @@ -52,11 +125,15 @@ static int map_range(unsigned long s, unsigned long e, void *data,
> >>            * - {un}map_mmio_regions doesn't support preemption.
> >>            */
> >>   
> >> -        rc = map->map ? map_mmio_regions(map->d, _gfn(s), size, _mfn(s))
> >> -                      : unmap_mmio_regions(map->d, _gfn(s), size, _mfn(s));
> >> +        rc = map->map ? map_mmio_regions(map->d, _gfn(s), size, mfn)
> >> +                      : unmap_mmio_regions(map->d, _gfn(s), size, mfn);
> >>           if ( rc == 0 )
> >>           {
> >> -            *c += size;
> >> +            /*
> >> +             * Range set is not expressed in frame numbers and the size
> >> +             * is the number of frames, so update accordingly.
> >> +             */
> >> +            *c += size << PAGE_SHIFT;
> >>               break;
> >>           }
> >>           if ( rc < 0 )
> >> @@ -67,8 +144,9 @@ static int map_range(unsigned long s, unsigned long e, void *data,
> >>               break;
> >>           }
> >>           ASSERT(rc < size);
> >> -        *c += rc;
> >> +        *c += rc << PAGE_SHIFT;
> >>           s += rc;
> >> +        mfn += rc;
> >>           if ( general_preempt_check() )
> >>                   return -ERESTART;
> >>       }
> >> @@ -84,7 +162,7 @@ static int map_range(unsigned long s, unsigned long e, void *data,
> >>   static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
> >>                               bool rom_only)
> >>   {
> >> -    struct vpci_header *header = &pdev->vpci->header;
> >> +    struct vpci_header *header = get_hwdom_vpci_header(pdev);
> >>       bool map = cmd & PCI_COMMAND_MEMORY;
> >>       unsigned int i;
> >>   
> >> @@ -136,6 +214,7 @@ bool vpci_process_pending(struct vcpu *v)
> >>           struct map_data data = {
> >>               .d = v->domain,
> >>               .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
> >> +            .pdev = v->vpci.pdev,
> >>           };
> >>           int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
> >>   
> >> @@ -168,7 +247,8 @@ bool vpci_process_pending(struct vcpu *v)
> >>   static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> >>                               struct rangeset *mem, uint16_t cmd)
> >>   {
> >> -    struct map_data data = { .d = d, .map = true };
> >> +    struct map_data data = { .d = d, .map = true,
> >> +        .pdev = (struct pci_dev *)pdev };
> > Dropping the const here is not fine. IT either needs to be dropped
> > from apply_map and further up, or this needs to also be made const.
> Ok, I'll try to keep it const
> >
> >>       int rc;
> >>   
> >>       while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
> >> @@ -205,7 +285,7 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
> >>   
> >>   static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> >>   {
> >> -    struct vpci_header *header = &pdev->vpci->header;
> >> +    struct vpci_header *header;
> >>       struct rangeset *mem = rangeset_new(NULL, NULL, 0);
> >>       struct pci_dev *tmp, *dev = NULL;
> >>   #ifdef CONFIG_X86
> >> @@ -217,6 +297,11 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> >>       if ( !mem )
> >>           return -ENOMEM;
> >>   
> >> +    if ( is_hardware_domain(current->domain) )
> >> +        header = get_hwdom_vpci_header(pdev);
> >> +    else
> >> +        header = get_vpci_header(current->domain, pdev);
> >> +
> >>       /*
> >>        * Create a rangeset that represents the current device BARs memory region
> >>        * and compare it against all the currently active BAR memory regions. If
> >> @@ -225,12 +310,15 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> >>        * First fill the rangeset with all the BARs of this device or with the ROM
> >>        * BAR only, depending on whether the guest is toggling the memory decode
> >>        * bit of the command register, or the enable bit of the ROM BAR register.
> >> +     *
> >> +     * Use the PCI reserved bits of the BAR to pass BAR's index.
> >>        */
> >>       for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> >>       {
> >>           const struct vpci_bar *bar = &header->bars[i];
> >> -        unsigned long start = PFN_DOWN(bar->addr);
> >> -        unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
> >> +        unsigned long start = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) | i;
> >> +        unsigned long end = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) +
> >> +            bar->size - 1;
> > Will this work fine on Arm 32bits with LPAE? It's my understanding
> > that in that case unsigned long is 32bits, but the physical address
> > space is 44bits, in which case this won't work.
> Hm, good question
> >
> > I think you need to keep the usage of frame numbers here.
> If I re-work the gfn <-> mfn mapping then yes, I can use frame numbers here and elsewhere
> >
> >>   
> >>           if ( !MAPPABLE_BAR(bar) ||
> >>                (rom_only ? bar->type != VPCI_BAR_ROM
> >> @@ -251,9 +339,11 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> >>       /* Remove any MSIX regions if present. */
> >>       for ( i = 0; msix && i < ARRAY_SIZE(msix->tables); i++ )
> >>       {
> >> -        unsigned long start = PFN_DOWN(vmsix_table_addr(pdev->vpci, i));
> >> -        unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
> >> -                                     vmsix_table_size(pdev->vpci, i) - 1);
> >> +        unsigned long start = (vmsix_table_addr(pdev->vpci, i) &
> >> +                               PCI_BASE_ADDRESS_MEM_MASK) | i;
> >> +        unsigned long end = (vmsix_table_addr(pdev->vpci, i) &
> >> +                             PCI_BASE_ADDRESS_MEM_MASK ) +
> >> +                             vmsix_table_size(pdev->vpci, i) - 1;
> >>   
> >>           rc = rangeset_remove_range(mem, start, end);
> >>           if ( rc )
> >> @@ -273,6 +363,8 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> >>        */
> >>       for_each_pdev ( pdev->domain, tmp )
> >>       {
> >> +        struct vpci_header *header;
> >> +
> >>           if ( tmp == pdev )
> >>           {
> >>               /*
> >> @@ -289,11 +381,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> >>                   continue;
> >>           }
> >>   
> >> -        for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ )
> >> +        header = get_vpci_header(tmp->domain, pdev);
> >> +
> >> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> >>           {
> >> -            const struct vpci_bar *bar = &tmp->vpci->header.bars[i];
> >> -            unsigned long start = PFN_DOWN(bar->addr);
> >> -            unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
> >> +            const struct vpci_bar *bar = &header->bars[i];
> >> +            unsigned long start = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) | i;
> >> +            unsigned long end = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK)
> >> +                + bar->size - 1;
> >>   
> >>               if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) ||
> >>                    /*
> >> @@ -357,7 +452,7 @@ static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
> >>           pci_conf_write16(pdev->sbdf, reg, cmd);
> >>   }
> >>   
> >> -static void bar_write(const struct pci_dev *pdev, unsigned int reg,
> >> +static void bar_write_hwdom(const struct pci_dev *pdev, unsigned int reg,
> >>                         uint32_t val, void *data)
> >>   {
> >>       struct vpci_bar *bar = data;
> >> @@ -377,14 +472,17 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
> >>       {
> >>           /* If the value written is the current one avoid printing a warning. */
> >>           if ( val != (uint32_t)(bar->addr >> (hi ? 32 : 0)) )
> >> +        {
> >> +            struct vpci_header *header = get_hwdom_vpci_header(pdev);
> >> +
> >>               gprintk(XENLOG_WARNING,
> >>                       "%04x:%02x:%02x.%u: ignored BAR %lu write with memory decoding enabled\n",
> >>                       pdev->seg, pdev->bus, slot, func,
> >> -                    bar - pdev->vpci->header.bars + hi);
> >> +                    bar - header->bars + hi);
> >> +        }
> >>           return;
> >>       }
> >>   
> >> -
> >>       /*
> >>        * Update the cached address, so that when memory decoding is enabled
> >>        * Xen can map the BAR into the guest p2m.
> >> @@ -403,10 +501,89 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
> >>       pci_conf_write32(pdev->sbdf, reg, val);
> >>   }
> >>   
> >> +static uint32_t bar_read_hwdom(const struct pci_dev *pdev, unsigned int reg,
> >> +                               void *data)
> >> +{
> >> +    return vpci_hw_read32(pdev, reg, data);
> >> +}
> >> +
> >> +static void bar_write_guest(const struct pci_dev *pdev, unsigned int reg,
> >> +                            uint32_t val, void *data)
> >> +{
> >> +    struct vpci_bar *vbar = data;
> >> +    bool hi = false;
> >> +
> >> +    if ( vbar->type == VPCI_BAR_MEM64_HI )
> >> +    {
> >> +        ASSERT(reg > PCI_BASE_ADDRESS_0);
> >> +        vbar--;
> >> +        hi = true;
> >> +    }
> >> +    vbar->addr &= ~(0xffffffffull << (hi ? 32 : 0));
> >> +    vbar->addr |= (uint64_t)val << (hi ? 32 : 0);
> >> +}
> >> +
> >> +static uint32_t bar_read_guest(const struct pci_dev *pdev, unsigned int reg,
> >> +                               void *data)
> >> +{
> >> +    struct vpci_bar *vbar = data;
> >> +    uint32_t val;
> >> +    bool hi = false;
> >> +
> >> +    if ( vbar->type == VPCI_BAR_MEM64_HI )
> >> +    {
> >> +        ASSERT(reg > PCI_BASE_ADDRESS_0);
> >> +        vbar--;
> >> +        hi = true;
> >> +    }
> >> +
> >> +    if ( vbar->type == VPCI_BAR_MEM64_LO || vbar->type == VPCI_BAR_MEM64_HI )
> > I think this would be clearer using a switch statement.
> I'll think about it
> >
> >> +    {
> >> +        if ( hi )
> >> +            val = vbar->addr >> 32;
> >> +        else
> >> +            val = vbar->addr & 0xffffffff;
> >> +        if ( val == ~0 )
> > Strictly speaking I think you are not forced to write 1s to the
> > reserved 4 bits in the low register (and in the 32bit case).
> 
> Ah, so the Linux kernel, for instance, could have written 0xfffffff0 while
> I expect 0xffffffff?

I think real hardware would return the size when written 1s to all
bits except the reserved ones.
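
For reference, the sizing sequence a guest (or firmware) performs looks roughly
like the below. This is just a sketch of the protocol, using Xen's config
accessors purely for illustration; for a 64-bit BAR both halves would have to
be combined before computing the size:

    uint32_t orig, probe, size;

    orig = pci_conf_read32(pdev->sbdf, reg);
    /* Write all 1s: the reserved/flag bits in the low nibble are ignored. */
    pci_conf_write32(pdev->sbdf, reg, ~0u);
    probe = pci_conf_read32(pdev->sbdf, reg);
    /* Restore the original value before anything else touches the device. */
    pci_conf_write32(pdev->sbdf, reg, orig);

    /* Mask the flag bits and take two's complement to obtain the size. */
    size = ~(probe & PCI_BASE_ADDRESS_MEM_MASK) + 1;
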

> 
> >
> >> +        {
> >> +            /* Guests detects BAR's properties and sizes. */
> >> +            if ( !hi )
> >> +            {
> >> +                val = 0xffffffff & ~(vbar->size - 1);
> >> +                val |= vbar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
> >> +                                                    : PCI_BASE_ADDRESS_MEM_TYPE_64;
> >> +                val |= vbar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
> >> +            }
> >> +            else
> >> +                val = vbar->size >> 32;
> >> +            vbar->addr &= ~(0xffffffffull << (hi ? 32 : 0));
> >> +            vbar->addr |= (uint64_t)val << (hi ? 32 : 0);
> >> +        }
> >> +    }
> >> +    else if ( vbar->type == VPCI_BAR_MEM32 )
> >> +    {
> >> +        val = vbar->addr;
> >> +        if ( val == ~0 )
> >> +        {
> >> +            if ( !hi )
> > There's no way hi can be true at this point AFAICT.
> Sure, thank you
> >
> >> +            {
> >> +                val = 0xffffffff & ~(vbar->size - 1);
> >> +                val |= vbar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
> >> +                                                    : PCI_BASE_ADDRESS_MEM_TYPE_64;
> >> +                val |= vbar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
> >> +            }
> >> +        }
> >> +    }
> >> +    else
> >> +    {
> >> +        val = vbar->addr;
> >> +    }
> >> +    return val;
> >> +}
> >> +
> >>   static void rom_write(const struct pci_dev *pdev, unsigned int reg,
> >>                         uint32_t val, void *data)
> >>   {
> >> -    struct vpci_header *header = &pdev->vpci->header;
> >> +    struct vpci_header *header = get_hwdom_vpci_header(pdev);
> >>       struct vpci_bar *rom = data;
> >>       uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> >>       uint16_t cmd = pci_conf_read16(pdev->sbdf, PCI_COMMAND);
> >> @@ -452,15 +629,56 @@ static void rom_write(const struct pci_dev *pdev, unsigned int reg,
> >>           rom->addr = val & PCI_ROM_ADDRESS_MASK;
> >>   }
> > Don't you need to also protect a domU from writing to the ROM BAR
> > register?
> 
> ROM was not a target of this RFC as I have no HW to test it, but the final
> code must handle the ROM BAR as well, you are right.
> 
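
Right. Until the ROM BAR is handled properly, the safest option for a domU is
probably to simply drop such writes. A minimal placeholder could look like the
below (the handler name is made up, it's not part of this patch):

    static void rom_write_guest(const struct pci_dev *pdev, unsigned int reg,
                                uint32_t val, void *data)
    {
        /*
         * Ignore domU writes for now: the guest must not be able to move or
         * enable the physical ROM BAR behind Xen's back.
         */
    }
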
> >
> >>   
> >> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
> >> +                                  void *data)
> >> +{
> >> +    struct vpci_bar *vbar, *bar = data;
> >> +
> >> +    if ( is_hardware_domain(current->domain) )
> >> +        return bar_read_hwdom(pdev, reg, data);
> >> +
> >> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
> >> +    if ( !vbar )
> >> +        return ~0;
> >> +
> >> +    return bar_read_guest(pdev, reg, vbar);
> >> +}
> >> +
> >> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
> >> +                               uint32_t val, void *data)
> >> +{
> >> +    struct vpci_bar *bar = data;
> >> +
> >> +    if ( is_hardware_domain(current->domain) )
> >> +        bar_write_hwdom(pdev, reg, val, data);
> >> +    else
> >> +    {
> >> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
> >> +
> >> +        if ( !vbar )
> >> +            return;
> >> +        bar_write_guest(pdev, reg, val, vbar);
> >> +    }
> >> +}
> > You should assign different handlers based on whether the domain that
> > has the device assigned is a domU or the hardware domain, rather than
> > doing the selection here.
> 
> Hm, handlers are assigned once in init_bars and this function is only called
> for hwdom, so there is no way I can do that for the guests. Hence, the dispatcher.

I think we might want to reset the vPCI handlers when a device gets
assigned and deassigned. In order to do passthrough to domUs safely
we will have to add more handlers than what's required for dom0, and
having is_hardware_domain sprinkled in all of them is not a suitable
solution.
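
What I have in mind is that, once the handlers are (re-)installed at device
assignment time, init_bars() could pick the right variant up front instead of
dispatching on every access. A rough sketch of the inner loop, reusing the
handlers introduced by this patch and assuming pdev->domain is already the
final owner at that point:

    bool hwdom = is_hardware_domain(pdev->domain);
    vpci_read_t *read = hwdom ? bar_read_hwdom : bar_read_guest;
    vpci_write_t *write = hwdom ? bar_write_hwdom : bar_write_guest;

    rc = vpci_add_register(pdev->vpci, read, write, reg, 4, &header->bars[i]);
    if ( rc )
        return rc;
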

Roger.
Oleksandr Andrushchenko Nov. 13, 2020, 6:32 a.m. UTC | #4
On 11/12/20 4:46 PM, Roger Pau Monné wrote:
> On Thu, Nov 12, 2020 at 01:16:10PM +0000, Oleksandr Andrushchenko wrote:
>> On 11/12/20 11:40 AM, Roger Pau Monné wrote:
>>> On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>>> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
>>>> index f74f728884c0..7dc7c70e24f2 100644
>>>> --- a/xen/drivers/vpci/header.c
>>>> +++ b/xen/drivers/vpci/header.c
>>>> @@ -31,14 +31,87 @@
>>>>    struct map_data {
>>>>        struct domain *d;
>>>>        bool map;
>>>> +    struct pci_dev *pdev;
>>> If the field is required please place it after the domain one.
>> I will, but may I ask why?
> So that if we add further boolean fields we can do at the end of the
> struct for layout reasons. If we do:
>
> struct map_data {
>      struct domain *d;
>      bool map;
>      struct pci_dev *pdev;
>      bool foo;
> }
>
> We will end up with a bunch of padding that could be avoided by doing:
>
> struct map_data {
>      struct domain *d;
>      struct pci_dev *pdev;
>      bool map;
>      bool foo;
> }
Ah, so this is about padding. Got it.
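
Indeed, a quick standalone check with stand-in types shows it (on a typical
LP64 build this prints 32 vs 24):

    #include <stdbool.h>
    #include <stdio.h>

    struct dom;  struct pdev;  /* stand-ins for the real Xen types */

    struct before { struct dom *d; bool map; struct pdev *pdev; bool foo; };
    struct after  { struct dom *d; struct pdev *pdev; bool map; bool foo; };

    int main(void)
    {
        printf("%zu %zu\n", sizeof(struct before), sizeof(struct after));
        return 0;
    }
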
>
>>>> +    s = PFN_DOWN(s);
>>>> +    e = PFN_DOWN(e);
>>> Changing the rangeset to store memory addresses instead of frames
>>> could for example be split into a separate patch.
>> Ok
>>> I think you are doing the calculation of the end pfn wrong here, you
>>> should use PFN_UP instead in case the address is not aligned.
>> PFN_DOWN for the start seems to be ok if the address is not aligned
>>
>> which is the case if I pass bar_index in the lower bits: PCI memory has
>>
>> PAGE_SIZE granularity, so besides the fact that I use bar_index the address
> No, BARs don't need to be aligned to page boundaries, you can even
> have different BARs inside the same physical page.
>
> The spec recommends that the minimum size of a BAR should be 4KB, but
> that's not a strict requirement in which case a BAR can be as small as
> 16bytes, and then you can have multiple ones inside the same page.
Ok, will account for that
>
>> must be page aligned.
>>
>> The end address is expressed in (size - 1) form, again page aligned,
>>
>> so to get the last page to be mapped PFN_DOWN also seems to be appropriate.
>>
>> Do I miss something here?
> I'm not aware of any  of those addresses or sizes being guaranteed to
> be page aligned, so I think you need to account for that.
>
> Some of the code here uses PFN_DOWN to calculate the end address
> because the rangesets are used in an inclusive fashion, so the end
> frame also gets mapped.
Ok
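
So, accounting for BARs that are not page aligned, the inclusive frame range
fed to the rangeset would be computed along these lines (a sketch of what I
intend to switch back to, not the final code):

    /*
     * The rangeset is used in an inclusive fashion, so the frame of the last
     * byte is still PFN_DOWN() of the end address, even for a BAR that is
     * neither page aligned nor a multiple of PAGE_SIZE in size.
     */
    unsigned long start = PFN_DOWN(bar->addr);
    unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);

    rc = rangeset_add_range(mem, start, end);
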
>
>>>> +    mfn = _mfn(PFN_DOWN(header->bars[bar_idx].addr));
>>>>        for ( ; ; )
>>>>        {
>>>>            unsigned long size = e - s + 1;
>>>> @@ -52,11 +125,15 @@ static int map_range(unsigned long s, unsigned long e, void *data,
>>>>             * - {un}map_mmio_regions doesn't support preemption.
>>>>             */
>>>>    
>>>> -        rc = map->map ? map_mmio_regions(map->d, _gfn(s), size, _mfn(s))
>>>> -                      : unmap_mmio_regions(map->d, _gfn(s), size, _mfn(s));
>>>> +        rc = map->map ? map_mmio_regions(map->d, _gfn(s), size, mfn)
>>>> +                      : unmap_mmio_regions(map->d, _gfn(s), size, mfn);
>>>>            if ( rc == 0 )
>>>>            {
>>>> -            *c += size;
>>>> +            /*
>>>> +             * Range set is not expressed in frame numbers and the size
>>>> +             * is the number of frames, so update accordingly.
>>>> +             */
>>>> +            *c += size << PAGE_SHIFT;
>>>>                break;
>>>>            }
>>>>            if ( rc < 0 )
>>>> @@ -67,8 +144,9 @@ static int map_range(unsigned long s, unsigned long e, void *data,
>>>>                break;
>>>>            }
>>>>            ASSERT(rc < size);
>>>> -        *c += rc;
>>>> +        *c += rc << PAGE_SHIFT;
>>>>            s += rc;
>>>> +        mfn += rc;
>>>>            if ( general_preempt_check() )
>>>>                    return -ERESTART;
>>>>        }
>>>> @@ -84,7 +162,7 @@ static int map_range(unsigned long s, unsigned long e, void *data,
>>>>    static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>>>>                                bool rom_only)
>>>>    {
>>>> -    struct vpci_header *header = &pdev->vpci->header;
>>>> +    struct vpci_header *header = get_hwdom_vpci_header(pdev);
>>>>        bool map = cmd & PCI_COMMAND_MEMORY;
>>>>        unsigned int i;
>>>>    
>>>> @@ -136,6 +214,7 @@ bool vpci_process_pending(struct vcpu *v)
>>>>            struct map_data data = {
>>>>                .d = v->domain,
>>>>                .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>>>> +            .pdev = v->vpci.pdev,
>>>>            };
>>>>            int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
>>>>    
>>>> @@ -168,7 +247,8 @@ bool vpci_process_pending(struct vcpu *v)
>>>>    static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>>>>                                struct rangeset *mem, uint16_t cmd)
>>>>    {
>>>> -    struct map_data data = { .d = d, .map = true };
>>>> +    struct map_data data = { .d = d, .map = true,
>>>> +        .pdev = (struct pci_dev *)pdev };
>>> Dropping the const here is not fine. It either needs to be dropped
>>> from apply_map and further up, or this needs to also be made const.
>> Ok, I'll try to keep it const
>>>>        int rc;
>>>>    
>>>>        while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
>>>> @@ -205,7 +285,7 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
>>>>    
>>>>    static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>    {
>>>> -    struct vpci_header *header = &pdev->vpci->header;
>>>> +    struct vpci_header *header;
>>>>        struct rangeset *mem = rangeset_new(NULL, NULL, 0);
>>>>        struct pci_dev *tmp, *dev = NULL;
>>>>    #ifdef CONFIG_X86
>>>> @@ -217,6 +297,11 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>        if ( !mem )
>>>>            return -ENOMEM;
>>>>    
>>>> +    if ( is_hardware_domain(current->domain) )
>>>> +        header = get_hwdom_vpci_header(pdev);
>>>> +    else
>>>> +        header = get_vpci_header(current->domain, pdev);
>>>> +
>>>>        /*
>>>>         * Create a rangeset that represents the current device BARs memory region
>>>>         * and compare it against all the currently active BAR memory regions. If
>>>> @@ -225,12 +310,15 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>         * First fill the rangeset with all the BARs of this device or with the ROM
>>>>         * BAR only, depending on whether the guest is toggling the memory decode
>>>>         * bit of the command register, or the enable bit of the ROM BAR register.
>>>> +     *
>>>> +     * Use the PCI reserved bits of the BAR to pass BAR's index.
>>>>         */
>>>>        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>        {
>>>>            const struct vpci_bar *bar = &header->bars[i];
>>>> -        unsigned long start = PFN_DOWN(bar->addr);
>>>> -        unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>>>> +        unsigned long start = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) | i;
>>>> +        unsigned long end = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) +
>>>> +            bar->size - 1;
>>> Will this work fine on Arm 32bits with LPAE? It's my understanding
>>> that in that case unsigned long is 32bits, but the physical address
>>> space is 44bits, in which case this won't work.
>> Hm, good question
>>> I think you need to keep the usage of frame numbers here.
>> If I re-work the gfn <-> mfn mapping then yes, I can use frame numbers here and elsewhere
>>>>    
>>>>            if ( !MAPPABLE_BAR(bar) ||
>>>>                 (rom_only ? bar->type != VPCI_BAR_ROM
>>>> @@ -251,9 +339,11 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>        /* Remove any MSIX regions if present. */
>>>>        for ( i = 0; msix && i < ARRAY_SIZE(msix->tables); i++ )
>>>>        {
>>>> -        unsigned long start = PFN_DOWN(vmsix_table_addr(pdev->vpci, i));
>>>> -        unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
>>>> -                                     vmsix_table_size(pdev->vpci, i) - 1);
>>>> +        unsigned long start = (vmsix_table_addr(pdev->vpci, i) &
>>>> +                               PCI_BASE_ADDRESS_MEM_MASK) | i;
>>>> +        unsigned long end = (vmsix_table_addr(pdev->vpci, i) &
>>>> +                             PCI_BASE_ADDRESS_MEM_MASK ) +
>>>> +                             vmsix_table_size(pdev->vpci, i) - 1;
>>>>    
>>>>            rc = rangeset_remove_range(mem, start, end);
>>>>            if ( rc )
>>>> @@ -273,6 +363,8 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>         */
>>>>        for_each_pdev ( pdev->domain, tmp )
>>>>        {
>>>> +        struct vpci_header *header;
>>>> +
>>>>            if ( tmp == pdev )
>>>>            {
>>>>                /*
>>>> @@ -289,11 +381,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>                    continue;
>>>>            }
>>>>    
>>>> -        for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ )
>>>> +        header = get_vpci_header(tmp->domain, pdev);
>>>> +
>>>> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>            {
>>>> -            const struct vpci_bar *bar = &tmp->vpci->header.bars[i];
>>>> -            unsigned long start = PFN_DOWN(bar->addr);
>>>> -            unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>>>> +            const struct vpci_bar *bar = &header->bars[i];
>>>> +            unsigned long start = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) | i;
>>>> +            unsigned long end = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK)
>>>> +                + bar->size - 1;
>>>>    
>>>>                if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) ||
>>>>                     /*
>>>> @@ -357,7 +452,7 @@ static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
>>>>            pci_conf_write16(pdev->sbdf, reg, cmd);
>>>>    }
>>>>    
>>>> -static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>> +static void bar_write_hwdom(const struct pci_dev *pdev, unsigned int reg,
>>>>                          uint32_t val, void *data)
>>>>    {
>>>>        struct vpci_bar *bar = data;
>>>> @@ -377,14 +472,17 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>        {
>>>>            /* If the value written is the current one avoid printing a warning. */
>>>>            if ( val != (uint32_t)(bar->addr >> (hi ? 32 : 0)) )
>>>> +        {
>>>> +            struct vpci_header *header = get_hwdom_vpci_header(pdev);
>>>> +
>>>>                gprintk(XENLOG_WARNING,
>>>>                        "%04x:%02x:%02x.%u: ignored BAR %lu write with memory decoding enabled\n",
>>>>                        pdev->seg, pdev->bus, slot, func,
>>>> -                    bar - pdev->vpci->header.bars + hi);
>>>> +                    bar - header->bars + hi);
>>>> +        }
>>>>            return;
>>>>        }
>>>>    
>>>> -
>>>>        /*
>>>>         * Update the cached address, so that when memory decoding is enabled
>>>>         * Xen can map the BAR into the guest p2m.
>>>> @@ -403,10 +501,89 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>        pci_conf_write32(pdev->sbdf, reg, val);
>>>>    }
>>>>    
>>>> +static uint32_t bar_read_hwdom(const struct pci_dev *pdev, unsigned int reg,
>>>> +                               void *data)
>>>> +{
>>>> +    return vpci_hw_read32(pdev, reg, data);
>>>> +}
>>>> +
>>>> +static void bar_write_guest(const struct pci_dev *pdev, unsigned int reg,
>>>> +                            uint32_t val, void *data)
>>>> +{
>>>> +    struct vpci_bar *vbar = data;
>>>> +    bool hi = false;
>>>> +
>>>> +    if ( vbar->type == VPCI_BAR_MEM64_HI )
>>>> +    {
>>>> +        ASSERT(reg > PCI_BASE_ADDRESS_0);
>>>> +        vbar--;
>>>> +        hi = true;
>>>> +    }
>>>> +    vbar->addr &= ~(0xffffffffull << (hi ? 32 : 0));
>>>> +    vbar->addr |= (uint64_t)val << (hi ? 32 : 0);
>>>> +}
>>>> +
>>>> +static uint32_t bar_read_guest(const struct pci_dev *pdev, unsigned int reg,
>>>> +                               void *data)
>>>> +{
>>>> +    struct vpci_bar *vbar = data;
>>>> +    uint32_t val;
>>>> +    bool hi = false;
>>>> +
>>>> +    if ( vbar->type == VPCI_BAR_MEM64_HI )
>>>> +    {
>>>> +        ASSERT(reg > PCI_BASE_ADDRESS_0);
>>>> +        vbar--;
>>>> +        hi = true;
>>>> +    }
>>>> +
>>>> +    if ( vbar->type == VPCI_BAR_MEM64_LO || vbar->type == VPCI_BAR_MEM64_HI )
>>> I think this would be clearer using a switch statement.
>> I'll think about
>>>> +    {
>>>> +        if ( hi )
>>>> +            val = vbar->addr >> 32;
>>>> +        else
>>>> +            val = vbar->addr & 0xffffffff;
>>>> +        if ( val == ~0 )
>>> Strictly speaking I think you are not forced to write 1s to the
>>> reserved 4 bits in the low register (and in the 32bit case).
>> Ah, so Linux kernel, for instance, could have written 0xffffff0 while
>>
>> I expect 0xffffffff?
> I think real hardware would return the size when written 1s to all
> bits except the reserved ones.
>
>>>> +        {
>>>> +            /* Guests detects BAR's properties and sizes. */
>>>> +            if ( !hi )
>>>> +            {
>>>> +                val = 0xffffffff & ~(vbar->size - 1);
>>>> +                val |= vbar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
>>>> +                                                    : PCI_BASE_ADDRESS_MEM_TYPE_64;
>>>> +                val |= vbar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
>>>> +            }
>>>> +            else
>>>> +                val = vbar->size >> 32;
>>>> +            vbar->addr &= ~(0xffffffffull << (hi ? 32 : 0));
>>>> +            vbar->addr |= (uint64_t)val << (hi ? 32 : 0);
>>>> +        }
>>>> +    }
>>>> +    else if ( vbar->type == VPCI_BAR_MEM32 )
>>>> +    {
>>>> +        val = vbar->addr;
>>>> +        if ( val == ~0 )
>>>> +        {
>>>> +            if ( !hi )
>>> There's no way hi can be true at this point AFAICT.
>> Sure, thank you
>>>> +            {
>>>> +                val = 0xffffffff & ~(vbar->size - 1);
>>>> +                val |= vbar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
>>>> +                                                    : PCI_BASE_ADDRESS_MEM_TYPE_64;
>>>> +                val |= vbar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +    else
>>>> +    {
>>>> +        val = vbar->addr;
>>>> +    }
>>>> +    return val;
>>>> +}
>>>> +
>>>>    static void rom_write(const struct pci_dev *pdev, unsigned int reg,
>>>>                          uint32_t val, void *data)
>>>>    {
>>>> -    struct vpci_header *header = &pdev->vpci->header;
>>>> +    struct vpci_header *header = get_hwdom_vpci_header(pdev);
>>>>        struct vpci_bar *rom = data;
>>>>        uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
>>>>        uint16_t cmd = pci_conf_read16(pdev->sbdf, PCI_COMMAND);
>>>> @@ -452,15 +629,56 @@ static void rom_write(const struct pci_dev *pdev, unsigned int reg,
>>>>            rom->addr = val & PCI_ROM_ADDRESS_MASK;
>>>>    }
>>> Don't you need to also protect a domU from writing to the ROM BAR
>>> register?
>> ROM was not a target of this RFC as I have no HW to test that, but final code must
>>
>> also handle ROM as well, you are right
>>
>>>>    
>>>> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>> +                                  void *data)
>>>> +{
>>>> +    struct vpci_bar *vbar, *bar = data;
>>>> +
>>>> +    if ( is_hardware_domain(current->domain) )
>>>> +        return bar_read_hwdom(pdev, reg, data);
>>>> +
>>>> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>> +    if ( !vbar )
>>>> +        return ~0;
>>>> +
>>>> +    return bar_read_guest(pdev, reg, vbar);
>>>> +}
>>>> +
>>>> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>> +                               uint32_t val, void *data)
>>>> +{
>>>> +    struct vpci_bar *bar = data;
>>>> +
>>>> +    if ( is_hardware_domain(current->domain) )
>>>> +        bar_write_hwdom(pdev, reg, val, data);
>>>> +    else
>>>> +    {
>>>> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>> +
>>>> +        if ( !vbar )
>>>> +            return;
>>>> +        bar_write_guest(pdev, reg, val, vbar);
>>>> +    }
>>>> +}
>>> You should assign different handlers based on whether the domain that
>>> has the device assigned is a domU or the hardware domain, rather than
>>> doing the selection here.
>> Hm, handlers are assigned once in init_bars and this function is only called
>>
>> for hwdom, so there is no way I can do that for the guests. Hence, the dispatcher
> I think we might want to reset the vPCI handlers when a device gets
> assigned and deassigned.

In the ARM case init_bars is called too early: PCI device assignment is
currently initiated by Domain-0's kernel and is done *before* PCI devices are
given memory ranges and have their BARs assigned:

[    0.429514] pci_bus 0000:00: root bus resource [bus 00-ff]
[    0.429532] pci_bus 0000:00: root bus resource [io 0x0000-0xfffff]
[    0.429555] pci_bus 0000:00: root bus resource [mem 0xfe200000-0xfe3fffff]
[    0.429575] pci_bus 0000:00: root bus resource [mem 0x30000000-0x37ffffff]
[    0.429604] pci_bus 0000:00: root bus resource [mem 0x38000000-0x3fffffff pref]
[    0.429670] pci 0000:00:00.0: enabling Extended Tags
[    0.453764] pci 0000:00:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE

< init_bars >

[    0.453793] pci 0000:00:00.0: -- IRQ 0
[    0.458825] pci 0000:00:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
[    0.471790] pci 0000:01:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE

< init_bars >

[    0.471821] pci 0000:01:00.0: -- IRQ 255
[    0.476809] pci 0000:01:00.0: Failed to add - passthrough or MSI/MSI-X might fail!

< BAR assignments below >

[    0.488233] pci 0000:00:00.0: BAR 14: assigned [mem 0xfe200000-0xfe2fffff]
[    0.488265] pci 0000:00:00.0: BAR 15: assigned [mem 0x38000000-0x380fffff pref]

In the case of x86 this is pretty much OK as the BARs are already in place, but
for ARM we need to take care and re-set up the vPCI BARs for hwdom. Things get
even more complicated if the host PCI bridge is not ECAM-like, so you cannot
set mmio_handlers and trap hwdom's accesses to the config space to update BARs
etc. This is why I have that ugly hack for rcar_gen3 to read the actual BARs
for hwdom.


If we go further and take a look at SR-IOV, then when the kernel assigns the
device (BUS_NOTIFY_ADD_DEVICE) it already has BARs assigned for the virtual
functions (I need to double-check that).

>   In order to do passthrough to domUs safely
> we will have to add more handlers than what's required for dom0,
Can you please tell me what you are thinking about? What are the missing handlers?
>   and
> having is_hardware_domain sprinkled in all of them is not a suitable
> solution.

I'll try to replace is_hardware_domain with something like:

+/*
+ * Detect whether physical PCI devices in this segment belong
+ * to the domain given, e.g. on x86 all PCI devices live in hwdom,
+ * but in case of ARM this might not be the case: those may also
+ * live in driver domains or even Xen itself.
+ */
+bool pci_is_hardware_domain(struct domain *d, u16 seg)
+{
+#ifdef CONFIG_X86
+    return is_hardware_domain(d);
+#elif CONFIG_ARM
+    return pci_is_owner_domain(d, seg);
+#else
+#error "Unsupported architecture"
+#endif
+}
+
+/*
+ * Get domain which owns this segment: for x86 this is always hardware
+ * domain and for ARM this can be different.
+ */
+struct domain *pci_get_hardware_domain(u16 seg)
+{
+#ifdef CONFIG_X86
+    return hardware_domain;
+#elif CONFIG_ARM
+    return pci_get_owner_domain(seg);
+#else
+#error "Unsupported architecture"
+#endif
+}

This is what I use to properly detect the domain that really owns the physical host bridge.
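
With that in place the dispatchers (or whatever they get replaced with) would
check the segment owner rather than plain is_hardware_domain(), e.g. the sketch
below on top of this RFC's bar_read_dispatch():

    static uint32_t bar_read_dispatch(const struct pci_dev *pdev,
                                      unsigned int reg, void *data)
    {
        struct vpci_bar *vbar, *bar = data;

        if ( pci_is_hardware_domain(current->domain, pdev->seg) )
            return bar_read_hwdom(pdev, reg, data);

        vbar = get_vpci_bar(current->domain, pdev, bar->index);
        if ( !vbar )
            return ~0;

        return bar_read_guest(pdev, reg, vbar);
    }
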

>
> Roger.

Thank you,

Oleksandr
Oleksandr Andrushchenko Nov. 13, 2020, 6:48 a.m. UTC | #5
On 11/13/20 8:32 AM, Oleksandr Andrushchenko wrote:
>
> On 11/12/20 4:46 PM, Roger Pau Monné wrote:
>> On Thu, Nov 12, 2020 at 01:16:10PM +0000, Oleksandr Andrushchenko wrote:
>>> On 11/12/20 11:40 AM, Roger Pau Monné wrote:
>>>> On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
>>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>>>> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
>>>>> index f74f728884c0..7dc7c70e24f2 100644
>>>>> --- a/xen/drivers/vpci/header.c
>>>>> +++ b/xen/drivers/vpci/header.c
>>>>> @@ -31,14 +31,87 @@
>>>>>    struct map_data {
>>>>>        struct domain *d;
>>>>>        bool map;
>>>>> +    struct pci_dev *pdev;
>>>> If the field is required please place it after the domain one.
>>> I will, but may I ask why?
>> So that if we add further boolean fields we can do at the end of the
>> struct for layout reasons. If we do:
>>
>> struct map_data {
>>      struct domain *d;
>>      bool map;
>>      struct pci_dev *pdev;
>>      bool foo;
>> }
>>
>> We will end up with a bunch of padding that could be avoided by doing:
>>
>> struct map_data {
>>      struct domain *d;
>>      struct pci_dev *pdev;
>>      bool map;
>>      bool foo;
>> }
> Ah, so this is about padding. Got it
>>
>>>>> +    s = PFN_DOWN(s);
>>>>> +    e = PFN_DOWN(e);
>>>> Changing the rangeset to store memory addresses instead of frames
>>>> could for example be split into a separate patch.
>>> Ok
>>>> I think you are doing the calculation of the end pfn wrong here, you
>>>> should use PFN_UP instead in case the address is not aligned.
>>> PFN_DOWN for the start seems to be ok if the address is not aligned
>>>
>>> which is the case if I pass bar_index in the lower bits: PCI memory has
>>>
>>> PAGE_SIZE granularity, so besides the fact that I use bar_index the address
>> No, BARs don't need to be aligned to page boundaries, you can even
>> have different BARs inside the same physical page.
>>
>> The spec recommends that the minimum size of a BAR should be 4KB, but
>> that's not a strict requirement in which case a BAR can be as small as
>> 16bytes, and then you can have multiple ones inside the same page.
> Ok, will account on that
>>
>>> must be page aligned.
>>>
>>> The end address is expressed in (size - 1) form, again page aligned,
>>>
>>> so to get the last page to be mapped PFN_DOWN also seems to be appropriate.
>>>
>>> Do I miss something here?
>> I'm not aware of any  of those addresses or sizes being guaranteed to
>> be page aligned, so I think you need to account for that.
>>
>> Some of the code here uses PFN_DOWN to calculate the end address
>> because the rangesets are used in an inclusive fashion, so the end
>> frame also gets mapped.
> Ok
>>
>>>>> +    mfn = _mfn(PFN_DOWN(header->bars[bar_idx].addr));
>>>>>        for ( ; ; )
>>>>>        {
>>>>>            unsigned long size = e - s + 1;
>>>>> @@ -52,11 +125,15 @@ static int map_range(unsigned long s, unsigned long e, void *data,
>>>>>             * - {un}map_mmio_regions doesn't support preemption.
>>>>>             */
>>>>>    -        rc = map->map ? map_mmio_regions(map->d, _gfn(s), size, _mfn(s))
>>>>> -                      : unmap_mmio_regions(map->d, _gfn(s), size, _mfn(s));
>>>>> +        rc = map->map ? map_mmio_regions(map->d, _gfn(s), size, mfn)
>>>>> +                      : unmap_mmio_regions(map->d, _gfn(s), size, mfn);
>>>>>            if ( rc == 0 )
>>>>>            {
>>>>> -            *c += size;
>>>>> +            /*
>>>>> +             * Range set is not expressed in frame numbers and the size
>>>>> +             * is the number of frames, so update accordingly.
>>>>> +             */
>>>>> +            *c += size << PAGE_SHIFT;
>>>>>                break;
>>>>>            }
>>>>>            if ( rc < 0 )
>>>>> @@ -67,8 +144,9 @@ static int map_range(unsigned long s, unsigned long e, void *data,
>>>>>                break;
>>>>>            }
>>>>>            ASSERT(rc < size);
>>>>> -        *c += rc;
>>>>> +        *c += rc << PAGE_SHIFT;
>>>>>            s += rc;
>>>>> +        mfn += rc;
>>>>>            if ( general_preempt_check() )
>>>>>                    return -ERESTART;
>>>>>        }
>>>>> @@ -84,7 +162,7 @@ static int map_range(unsigned long s, unsigned long e, void *data,
>>>>>    static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>>>>>                                bool rom_only)
>>>>>    {
>>>>> -    struct vpci_header *header = &pdev->vpci->header;
>>>>> +    struct vpci_header *header = get_hwdom_vpci_header(pdev);
>>>>>        bool map = cmd & PCI_COMMAND_MEMORY;
>>>>>        unsigned int i;
>>>>>    @@ -136,6 +214,7 @@ bool vpci_process_pending(struct vcpu *v)
>>>>>            struct map_data data = {
>>>>>                .d = v->domain,
>>>>>                .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>>>>> +            .pdev = v->vpci.pdev,
>>>>>            };
>>>>>            int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
>>>>>    @@ -168,7 +247,8 @@ bool vpci_process_pending(struct vcpu *v)
>>>>>    static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>>>>>                                struct rangeset *mem, uint16_t cmd)
>>>>>    {
>>>>> -    struct map_data data = { .d = d, .map = true };
>>>>> +    struct map_data data = { .d = d, .map = true,
>>>>> +        .pdev = (struct pci_dev *)pdev };
>>>> Dropping the const here is not fine. It either needs to be dropped
>>>> from apply_map and further up, or this needs to also be made const.
>>> Ok, I'll try to keep it const
>>>>>        int rc;
>>>>>           while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
>>>>> @@ -205,7 +285,7 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
>>>>>       static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>>    {
>>>>> -    struct vpci_header *header = &pdev->vpci->header;
>>>>> +    struct vpci_header *header;
>>>>>        struct rangeset *mem = rangeset_new(NULL, NULL, 0);
>>>>>        struct pci_dev *tmp, *dev = NULL;
>>>>>    #ifdef CONFIG_X86
>>>>> @@ -217,6 +297,11 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>>        if ( !mem )
>>>>>            return -ENOMEM;
>>>>>    +    if ( is_hardware_domain(current->domain) )
>>>>> +        header = get_hwdom_vpci_header(pdev);
>>>>> +    else
>>>>> +        header = get_vpci_header(current->domain, pdev);
>>>>> +
>>>>>        /*
>>>>>         * Create a rangeset that represents the current device BARs memory region
>>>>>         * and compare it against all the currently active BAR memory regions. If
>>>>> @@ -225,12 +310,15 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>>         * First fill the rangeset with all the BARs of this device or with the ROM
>>>>>         * BAR only, depending on whether the guest is toggling the memory decode
>>>>>         * bit of the command register, or the enable bit of the ROM BAR register.
>>>>> +     *
>>>>> +     * Use the PCI reserved bits of the BAR to pass BAR's index.
>>>>>         */
>>>>>        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>>        {
>>>>>            const struct vpci_bar *bar = &header->bars[i];
>>>>> -        unsigned long start = PFN_DOWN(bar->addr);
>>>>> -        unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>>>>> +        unsigned long start = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) | i;
>>>>> +        unsigned long end = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) +
>>>>> +            bar->size - 1;
>>>> Will this work fine on Arm 32bits with LPAE? It's my understanding
>>>> that in that case unsigned long is 32bits, but the physical address
>>>> space is 44bits, in which case this won't work.
>>> Hm, good question
>>>> I think you need to keep the usage of frame numbers here.
>>> If I re-work the gfn <-> mfn mapping then yes, I can use frame numbers here and elsewhere
>>>>>               if ( !MAPPABLE_BAR(bar) ||
>>>>>                 (rom_only ? bar->type != VPCI_BAR_ROM
>>>>> @@ -251,9 +339,11 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>>        /* Remove any MSIX regions if present. */
>>>>>        for ( i = 0; msix && i < ARRAY_SIZE(msix->tables); i++ )
>>>>>        {
>>>>> -        unsigned long start = PFN_DOWN(vmsix_table_addr(pdev->vpci, i));
>>>>> -        unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
>>>>> - vmsix_table_size(pdev->vpci, i) - 1);
>>>>> +        unsigned long start = (vmsix_table_addr(pdev->vpci, i) &
>>>>> +                               PCI_BASE_ADDRESS_MEM_MASK) | i;
>>>>> +        unsigned long end = (vmsix_table_addr(pdev->vpci, i) &
>>>>> +                             PCI_BASE_ADDRESS_MEM_MASK ) +
>>>>> + vmsix_table_size(pdev->vpci, i) - 1;
>>>>>               rc = rangeset_remove_range(mem, start, end);
>>>>>            if ( rc )
>>>>> @@ -273,6 +363,8 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>>         */
>>>>>        for_each_pdev ( pdev->domain, tmp )
>>>>>        {
>>>>> +        struct vpci_header *header;
>>>>> +
>>>>>            if ( tmp == pdev )
>>>>>            {
>>>>>                /*
>>>>> @@ -289,11 +381,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>>                    continue;
>>>>>            }
>>>>>    -        for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ )
>>>>> +        header = get_vpci_header(tmp->domain, pdev);
>>>>> +
>>>>> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>>            {
>>>>> -            const struct vpci_bar *bar = &tmp->vpci->header.bars[i];
>>>>> -            unsigned long start = PFN_DOWN(bar->addr);
>>>>> -            unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>>>>> +            const struct vpci_bar *bar = &header->bars[i];
>>>>> +            unsigned long start = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) | i;
>>>>> +            unsigned long end = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK)
>>>>> +                + bar->size - 1;
>>>>>                   if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) ||
>>>>>                     /*
>>>>> @@ -357,7 +452,7 @@ static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>            pci_conf_write16(pdev->sbdf, reg, cmd);
>>>>>    }
>>>>>    -static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>> +static void bar_write_hwdom(const struct pci_dev *pdev, unsigned int reg,
>>>>>                          uint32_t val, void *data)
>>>>>    {
>>>>>        struct vpci_bar *bar = data;
>>>>> @@ -377,14 +472,17 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>        {
>>>>>            /* If the value written is the current one avoid printing a warning. */
>>>>>            if ( val != (uint32_t)(bar->addr >> (hi ? 32 : 0)) )
>>>>> +        {
>>>>> +            struct vpci_header *header = get_hwdom_vpci_header(pdev);
>>>>> +
>>>>>                gprintk(XENLOG_WARNING,
>>>>>                        "%04x:%02x:%02x.%u: ignored BAR %lu write with memory decoding enabled\n",
>>>>>                        pdev->seg, pdev->bus, slot, func,
>>>>> -                    bar - pdev->vpci->header.bars + hi);
>>>>> +                    bar - header->bars + hi);
>>>>> +        }
>>>>>            return;
>>>>>        }
>>>>>    -
>>>>>        /*
>>>>>         * Update the cached address, so that when memory decoding is enabled
>>>>>         * Xen can map the BAR into the guest p2m.
>>>>> @@ -403,10 +501,89 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>        pci_conf_write32(pdev->sbdf, reg, val);
>>>>>    }
>>>>>    +static uint32_t bar_read_hwdom(const struct pci_dev *pdev, unsigned int reg,
>>>>> +                               void *data)
>>>>> +{
>>>>> +    return vpci_hw_read32(pdev, reg, data);
>>>>> +}
>>>>> +
>>>>> +static void bar_write_guest(const struct pci_dev *pdev, unsigned int reg,
>>>>> +                            uint32_t val, void *data)
>>>>> +{
>>>>> +    struct vpci_bar *vbar = data;
>>>>> +    bool hi = false;
>>>>> +
>>>>> +    if ( vbar->type == VPCI_BAR_MEM64_HI )
>>>>> +    {
>>>>> +        ASSERT(reg > PCI_BASE_ADDRESS_0);
>>>>> +        vbar--;
>>>>> +        hi = true;
>>>>> +    }
>>>>> +    vbar->addr &= ~(0xffffffffull << (hi ? 32 : 0));
>>>>> +    vbar->addr |= (uint64_t)val << (hi ? 32 : 0);
>>>>> +}
>>>>> +
>>>>> +static uint32_t bar_read_guest(const struct pci_dev *pdev, unsigned int reg,
>>>>> +                               void *data)
>>>>> +{
>>>>> +    struct vpci_bar *vbar = data;
>>>>> +    uint32_t val;
>>>>> +    bool hi = false;
>>>>> +
>>>>> +    if ( vbar->type == VPCI_BAR_MEM64_HI )
>>>>> +    {
>>>>> +        ASSERT(reg > PCI_BASE_ADDRESS_0);
>>>>> +        vbar--;
>>>>> +        hi = true;
>>>>> +    }
>>>>> +
>>>>> +    if ( vbar->type == VPCI_BAR_MEM64_LO || vbar->type == VPCI_BAR_MEM64_HI )
>>>> I think this would be clearer using a switch statement.
>>> I'll think about
>>>>> +    {
>>>>> +        if ( hi )
>>>>> +            val = vbar->addr >> 32;
>>>>> +        else
>>>>> +            val = vbar->addr & 0xffffffff;
>>>>> +        if ( val == ~0 )
>>>> Strictly speaking I think you are not forced to write 1s to the
>>>> reserved 4 bits in the low register (and in the 32bit case).
>>> Ah, so Linux kernel, for instance, could have written 0xffffff0 while
>>>
>>> I expect 0xffffffff?
>> I think real hardware would return the size when written 1s to all
>> bits except the reserved ones.
>>
>>>>> +        {
>>>>> +            /* Guests detects BAR's properties and sizes. */
>>>>> +            if ( !hi )
>>>>> +            {
>>>>> +                val = 0xffffffff & ~(vbar->size - 1);
>>>>> +                val |= vbar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
>>>>> +                                                    : PCI_BASE_ADDRESS_MEM_TYPE_64;
>>>>> +                val |= vbar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
>>>>> +            }
>>>>> +            else
>>>>> +                val = vbar->size >> 32;
>>>>> +            vbar->addr &= ~(0xffffffffull << (hi ? 32 : 0));
>>>>> +            vbar->addr |= (uint64_t)val << (hi ? 32 : 0);
>>>>> +        }
>>>>> +    }
>>>>> +    else if ( vbar->type == VPCI_BAR_MEM32 )
>>>>> +    {
>>>>> +        val = vbar->addr;
>>>>> +        if ( val == ~0 )
>>>>> +        {
>>>>> +            if ( !hi )
>>>> There's no way hi can be true at this point AFAICT.
>>> Sure, thank you
>>>>> +            {
>>>>> +                val = 0xffffffff & ~(vbar->size - 1);
>>>>> +                val |= vbar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
>>>>> +                                                    : PCI_BASE_ADDRESS_MEM_TYPE_64;
>>>>> +                val |= vbar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
>>>>> +            }
>>>>> +        }
>>>>> +    }
>>>>> +    else
>>>>> +    {
>>>>> +        val = vbar->addr;
>>>>> +    }
>>>>> +    return val;
>>>>> +}
>>>>> +
>>>>>    static void rom_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>                          uint32_t val, void *data)
>>>>>    {
>>>>> -    struct vpci_header *header = &pdev->vpci->header;
>>>>> +    struct vpci_header *header = get_hwdom_vpci_header(pdev);
>>>>>        struct vpci_bar *rom = data;
>>>>>        uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
>>>>>        uint16_t cmd = pci_conf_read16(pdev->sbdf, PCI_COMMAND);
>>>>> @@ -452,15 +629,56 @@ static void rom_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>            rom->addr = val & PCI_ROM_ADDRESS_MASK;
>>>>>    }
>>>> Don't you need to also protect a domU from writing to the ROM BAR
>>>> register?
>>> ROM was not a target of this RFC as I have no HW to test that, but final code must
>>>
>>> also handle ROM as well, you are right
>>>
>>>>>    +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>> +                                  void *data)
>>>>> +{
>>>>> +    struct vpci_bar *vbar, *bar = data;
>>>>> +
>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>> +        return bar_read_hwdom(pdev, reg, data);
>>>>> +
>>>>> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>> +    if ( !vbar )
>>>>> +        return ~0;
>>>>> +
>>>>> +    return bar_read_guest(pdev, reg, vbar);
>>>>> +}
>>>>> +
>>>>> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>> +                               uint32_t val, void *data)
>>>>> +{
>>>>> +    struct vpci_bar *bar = data;
>>>>> +
>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>> +        bar_write_hwdom(pdev, reg, val, data);
>>>>> +    else
>>>>> +    {
>>>>> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>> +
>>>>> +        if ( !vbar )
>>>>> +            return;
>>>>> +        bar_write_guest(pdev, reg, val, vbar);
>>>>> +    }
>>>>> +}
>>>> You should assign different handlers based on whether the domain that
>>>> has the device assigned is a domU or the hardware domain, rather than
>>>> doing the selection here.
>>> Hm, handlers are assigned once in init_bars and this function is only called
>>>
>>> for hwdom, so there is no way I can do that for the guests. Hence, the dispatcher
>> I think we might want to reset the vPCI handlers when a device gets
>> assigned and deassigned.
>
> In ARM case init_bars is called too early: PCI device assignment is currently
>
> initiated by Domain-0' kernel and is done *before* PCI devices are given memory
>
> ranges and BARs assigned:
>
> [    0.429514] pci_bus 0000:00: root bus resource [bus 00-ff]
> [    0.429532] pci_bus 0000:00: root bus resource [io 0x0000-0xfffff]
> [    0.429555] pci_bus 0000:00: root bus resource [mem 0xfe200000-0xfe3fffff]
> [    0.429575] pci_bus 0000:00: root bus resource [mem 0x30000000-0x37ffffff]
> [    0.429604] pci_bus 0000:00: root bus resource [mem 0x38000000-0x3fffffff pref]
> [    0.429670] pci 0000:00:00.0: enabling Extended Tags
> [    0.453764] pci 0000:00:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>
> < init_bars >
>
> [    0.453793] pci 0000:00:00.0: -- IRQ 0
> [    0.458825] pci 0000:00:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
> [    0.471790] pci 0000:01:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>
> < init_bars >
>
> [    0.471821] pci 0000:01:00.0: -- IRQ 255
> [    0.476809] pci 0000:01:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>
> < BAR assignments below >
>
> [    0.488233] pci 0000:00:00.0: BAR 14: assigned [mem 0xfe200000-0xfe2fffff]
> [    0.488265] pci 0000:00:00.0: BAR 15: assigned [mem 0x38000000-0x380fffff pref]
>
> In case of x86 this is pretty much ok as BARs are already in place, but for ARM we
>
> need to take care and re-setup vPCI BARs for hwdom. Things are getting even more
>
> complicated if the host PCI bridge is not ECAM like, so you cannot set mmio_handlers
>
> and trap hwdom's access to the config space to update BARs etc. This is why I have that
>
> ugly hack for rcar_gen3 to read actual BARs for hwdom.
>
>
> If we go further and take a look at SR-IOV then when the kernel assigns the device
>
> (BUS_NOTIFY_ADD_DEVICE) then it already has BARs assigned for virtual functions
>
> (need to double-check that).

Hm, indeed. We just need to move init_bars from being called on PCI *device
add* to *device assign*. This way it won't (?) break x86 and will allow ARM to
properly initialize vPCI's BARs...
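
i.e. roughly something like the below from the assignment path (purely
hypothetical, the hook and its placement still need to be agreed on; it just
relies on the existing vpci_remove_device()/vpci_add_handlers() pair):

    /* Hypothetical: called once the device gets assigned to its final owner. */
    static int vpci_reassign_device(struct domain *d, struct pci_dev *pdev)
    {
        /* Drop whatever was set up at "device add" time... */
        vpci_remove_device(pdev);

        pdev->domain = d;

        /* ...and re-run init_bars() & co now that BARs and owner are valid. */
        return vpci_add_handlers(pdev);
    }
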

>
>>   In order to do passthrough to domUs safely
>> we will have to add more handlers than what's required for dom0,
> Can you please tell me what you are thinking about? What are the missing handlers?
>>   and
>> having is_hardware_domain sprinkled in all of them is not a suitable
>> solution.
>
> I'll try to replace is_hardware_domain with something like:
>
> +/*
> + * Detect whether physical PCI devices in this segment belong
> + * to the domain given, e.g. on x86 all PCI devices live in hwdom,
> + * but in case of ARM this might not be the case: those may also
> + * live in driver domains or even Xen itself.
> + */
> +bool pci_is_hardware_domain(struct domain *d, u16 seg)
> +{
> +#ifdef CONFIG_X86
> +    return is_hardware_domain(d);
> +#elif CONFIG_ARM
> +    return pci_is_owner_domain(d, seg);
> +#else
> +#error "Unsupported architecture"
> +#endif
> +}
> +
> +/*
> + * Get domain which owns this segment: for x86 this is always hardware
> + * domain and for ARM this can be different.
> + */
> +struct domain *pci_get_hardware_domain(u16 seg)
> +{
> +#ifdef CONFIG_X86
> +    return hardware_domain;
> +#elif CONFIG_ARM
> +    return pci_get_owner_domain(seg);
> +#else
> +#error "Unsupported architecture"
> +#endif
> +}
>
> This is what I use to properly detect the domain that really owns physical host bridge
>
>>
>> Roger.
>
> Thank you,
>
> Oleksandr
>
Jan Beulich Nov. 13, 2020, 10:25 a.m. UTC | #6
On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
> On 11/12/20 4:46 PM, Roger Pau Monné wrote:
>> On Thu, Nov 12, 2020 at 01:16:10PM +0000, Oleksandr Andrushchenko wrote:
>>> On 11/12/20 11:40 AM, Roger Pau Monné wrote:
>>>> On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
>>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>>>> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>> +                                  void *data)
>>>>> +{
>>>>> +    struct vpci_bar *vbar, *bar = data;
>>>>> +
>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>> +        return bar_read_hwdom(pdev, reg, data);
>>>>> +
>>>>> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>> +    if ( !vbar )
>>>>> +        return ~0;
>>>>> +
>>>>> +    return bar_read_guest(pdev, reg, vbar);
>>>>> +}
>>>>> +
>>>>> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>> +                               uint32_t val, void *data)
>>>>> +{
>>>>> +    struct vpci_bar *bar = data;
>>>>> +
>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>> +        bar_write_hwdom(pdev, reg, val, data);
>>>>> +    else
>>>>> +    {
>>>>> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>> +
>>>>> +        if ( !vbar )
>>>>> +            return;
>>>>> +        bar_write_guest(pdev, reg, val, vbar);
>>>>> +    }
>>>>> +}
>>>> You should assign different handlers based on whether the domain that
>>>> has the device assigned is a domU or the hardware domain, rather than
>>>> doing the selection here.
>>> Hm, handlers are assigned once in init_bars and this function is only called
>>>
>>> for hwdom, so there is no way I can do that for the guests. Hence, the dispatcher
>> I think we might want to reset the vPCI handlers when a device gets
>> assigned and deassigned.
> 
> In ARM case init_bars is called too early: PCI device assignment is currently
> 
> initiated by Domain-0' kernel and is done *before* PCI devices are given memory
> 
> ranges and BARs assigned:
> 
> [    0.429514] pci_bus 0000:00: root bus resource [bus 00-ff]
> [    0.429532] pci_bus 0000:00: root bus resource [io 0x0000-0xfffff]
> [    0.429555] pci_bus 0000:00: root bus resource [mem 0xfe200000-0xfe3fffff]
> [    0.429575] pci_bus 0000:00: root bus resource [mem 0x30000000-0x37ffffff]
> [    0.429604] pci_bus 0000:00: root bus resource [mem 0x38000000-0x3fffffff pref]
> [    0.429670] pci 0000:00:00.0: enabling Extended Tags
> [    0.453764] pci 0000:00:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
> 
> < init_bars >
> 
> [    0.453793] pci 0000:00:00.0: -- IRQ 0
> [    0.458825] pci 0000:00:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
> [    0.471790] pci 0000:01:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
> 
> < init_bars >
> 
> [    0.471821] pci 0000:01:00.0: -- IRQ 255
> [    0.476809] pci 0000:01:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
> 
> < BAR assignments below >
> 
> [    0.488233] pci 0000:00:00.0: BAR 14: assigned [mem 0xfe200000-0xfe2fffff]
> [    0.488265] pci 0000:00:00.0: BAR 15: assigned [mem 0x38000000-0x380fffff pref]
> 
> In case of x86 this is pretty much ok as BARs are already in place, but for ARM we
> 
> need to take care and re-setup vPCI BARs for hwdom.

Even on x86 there's no guarantee that all devices have their BARs set
up by firmware.

In a subsequent reply you've suggested to move init_bars from "add" to
"assign", but I'm having trouble seeing what this would change: It's
not Dom0 controlling assignment (to itself), but Xen assigns the device
towards the end of pci_add_device().

> Things are getting even more
> 
> complicated if the host PCI bridge is not ECAM like, so you cannot set mmio_handlers
> 
> and trap hwdom's access to the config space to update BARs etc. This is why I have that
> 
> ugly hack for rcar_gen3 to read actual BARs for hwdom.

How do config space accesses work there? At the latest for MSI/MSI-X it'll
be imperative that Xen be able to intercept config space writes.

>>   In order to do passthrough to domUs safely
>> we will have to add more handlers than what's required for dom0,
> Can you please tell me what you are thinking about? What are the missing handlers?
>>   and
>> having is_hardware_domain sprinkled in all of them is not a suitable
>> solution.
> 
> I'll try to replace is_hardware_domain with something like:
> 
> +/*
> + * Detect whether physical PCI devices in this segment belong
> + * to the domain given, e.g. on x86 all PCI devices live in hwdom,
> + * but in case of ARM this might not be the case: those may also
> + * live in driver domains or even Xen itself.
> + */
> +bool pci_is_hardware_domain(struct domain *d, u16 seg)
> +{
> +#ifdef CONFIG_X86
> +    return is_hardware_domain(d);
> +#elif CONFIG_ARM
> +    return pci_is_owner_domain(d, seg);
> +#else
> +#error "Unsupported architecture"
> +#endif
> +}
> +
> +/*
> + * Get domain which owns this segment: for x86 this is always hardware
> + * domain and for ARM this can be different.
> + */
> +struct domain *pci_get_hardware_domain(u16 seg)
> +{
> +#ifdef CONFIG_X86
> +    return hardware_domain;
> +#elif CONFIG_ARM
> +    return pci_get_owner_domain(seg);
> +#else
> +#error "Unsupported architecture"
> +#endif
> +}
> 
> This is what I use to properly detect the domain that really owns physical host bridge

I consider this problematic. We should try to not let Arm's and x86'es
PCI implementations diverge too much, i.e. at least the underlying basic
model would better be similar. For example, if entire segments can be
assigned to a driver domain on Arm, why should the same not be possible
on x86?

Furthermore I'm suspicious about segments being the right granularity
here. On ia64 multiple host bridges could (and typically would) live
on segment 0. Iirc I had once seen output from an x86 system which was
apparently laid out similarly. Therefore, just like for MCFG, I think
the granularity wants to be bus ranges within a segment.
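
Just to illustrate the kind of granularity I mean - this roughly mirrors
the shape of an MCFG allocation entry from the PCI Firmware spec, and is
only an illustration, not something to add to the tree:

#include <stdint.h>

/* One ECAM window is described per segment *and* per bus range, not
 * per segment alone. */
struct mcfg_like_entry {
    uint64_t base_address;   /* ECAM base for this bus range */
    uint16_t segment;        /* PCI segment group number */
    uint8_t  start_bus;      /* first decoded bus number */
    uint8_t  end_bus;        /* last decoded bus number */
    /* 4 reserved bytes in the actual table */
};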

Jan
Julien Grall Nov. 13, 2020, 10:36 a.m. UTC | #7
On 13/11/2020 10:25, Jan Beulich wrote:
> On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
>> On 11/12/20 4:46 PM, Roger Pau Monné wrote:
>>> On Thu, Nov 12, 2020 at 01:16:10PM +0000, Oleksandr Andrushchenko wrote:
>>>> On 11/12/20 11:40 AM, Roger Pau Monné wrote:
>>>>> On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
>>>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>>>>> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>> +                                  void *data)
>>>>>> +{
>>>>>> +    struct vpci_bar *vbar, *bar = data;
>>>>>> +
>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>> +        return bar_read_hwdom(pdev, reg, data);
>>>>>> +
>>>>>> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>> +    if ( !vbar )
>>>>>> +        return ~0;
>>>>>> +
>>>>>> +    return bar_read_guest(pdev, reg, vbar);
>>>>>> +}
>>>>>> +
>>>>>> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>> +                               uint32_t val, void *data)
>>>>>> +{
>>>>>> +    struct vpci_bar *bar = data;
>>>>>> +
>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>> +        bar_write_hwdom(pdev, reg, val, data);
>>>>>> +    else
>>>>>> +    {
>>>>>> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>> +
>>>>>> +        if ( !vbar )
>>>>>> +            return;
>>>>>> +        bar_write_guest(pdev, reg, val, vbar);
>>>>>> +    }
>>>>>> +}
>>>>> You should assign different handlers based on whether the domain that
>>>>> has the device assigned is a domU or the hardware domain, rather than
>>>>> doing the selection here.
>>>> Hm, handlers are assigned once in init_bars and this function is only called
>>>>
>>>> for hwdom, so there is no way I can do that for the guests. Hence, the dispatcher
>>> I think we might want to reset the vPCI handlers when a devices gets
>>> assigned and deassigned.
>>
>> In ARM case init_bars is called too early: PCI device assignment is currently
>>
>> initiated by Domain-0' kernel and is done *before* PCI devices are given memory
>>
>> ranges and BARs assigned:
>>
>> [    0.429514] pci_bus 0000:00: root bus resource [bus 00-ff]
>> [    0.429532] pci_bus 0000:00: root bus resource [io 0x0000-0xfffff]
>> [    0.429555] pci_bus 0000:00: root bus resource [mem 0xfe200000-0xfe3fffff]
>> [    0.429575] pci_bus 0000:00: root bus resource [mem 0x30000000-0x37ffffff]
>> [    0.429604] pci_bus 0000:00: root bus resource [mem 0x38000000-0x3fffffff pref]
>> [    0.429670] pci 0000:00:00.0: enabling Extended Tags
>> [    0.453764] pci 0000:00:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>
>> < init_bars >
>>
>> [    0.453793] pci 0000:00:00.0: -- IRQ 0
>> [    0.458825] pci 0000:00:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>> [    0.471790] pci 0000:01:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>
>> < init_bars >
>>
>> [    0.471821] pci 0000:01:00.0: -- IRQ 255
>> [    0.476809] pci 0000:01:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>
>> < BAR assignments below >
>>
>> [    0.488233] pci 0000:00:00.0: BAR 14: assigned [mem 0xfe200000-0xfe2fffff]
>> [    0.488265] pci 0000:00:00.0: BAR 15: assigned [mem 0x38000000-0x380fffff pref]
>>
>> In case of x86 this is pretty much ok as BARs are already in place, but for ARM we
>>
>> need to take care and re-setup vPCI BARs for hwdom.
> 
> Even on x86 there's no guarantee that all devices have their BARs set
> up by firmware.
> 
> In a subsequent reply you've suggested to move init_bars from "add" to
> "assign", but I'm having trouble seeing what this would change: It's
> not Dom0 controlling assignment (to itself), but Xen assigns the device
> towards the end of pci_add_device().
> 
>> Things are getting even more
>>
>> complicated if the host PCI bridge is not ECAM like, so you cannot set mmio_handlers
>>
>> and trap hwdom's access to the config space to update BARs etc. This is why I have that
>>
>> ugly hack for rcar_gen3 to read actual BARs for hwdom.
> 
> How to config space accesses work there? The latest for MSI/MSI-X it'll
> be imperative that Xen be able to intercept config space writes.

I am not sure I understand your last sentence. Are you saying that we 
always need to trap accesses to the MSI/MSI-X message in order to 
sanitize it?

If one is using the GICv3 ITS (I haven't investigated other MSI 
controllers), then I don't believe you need to sanitize the MSI/MSI-X 
message in most situations.

Cheers,
Oleksandr Andrushchenko Nov. 13, 2020, 10:46 a.m. UTC | #8
On 11/13/20 12:25 PM, Jan Beulich wrote:
> On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
>> On 11/12/20 4:46 PM, Roger Pau Monné wrote:
>>> On Thu, Nov 12, 2020 at 01:16:10PM +0000, Oleksandr Andrushchenko wrote:
>>>> On 11/12/20 11:40 AM, Roger Pau Monné wrote:
>>>>> On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
>>>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>>>>> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>> +                                  void *data)
>>>>>> +{
>>>>>> +    struct vpci_bar *vbar, *bar = data;
>>>>>> +
>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>> +        return bar_read_hwdom(pdev, reg, data);
>>>>>> +
>>>>>> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>> +    if ( !vbar )
>>>>>> +        return ~0;
>>>>>> +
>>>>>> +    return bar_read_guest(pdev, reg, vbar);
>>>>>> +}
>>>>>> +
>>>>>> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>> +                               uint32_t val, void *data)
>>>>>> +{
>>>>>> +    struct vpci_bar *bar = data;
>>>>>> +
>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>> +        bar_write_hwdom(pdev, reg, val, data);
>>>>>> +    else
>>>>>> +    {
>>>>>> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>> +
>>>>>> +        if ( !vbar )
>>>>>> +            return;
>>>>>> +        bar_write_guest(pdev, reg, val, vbar);
>>>>>> +    }
>>>>>> +}
>>>>> You should assign different handlers based on whether the domain that
>>>>> has the device assigned is a domU or the hardware domain, rather than
>>>>> doing the selection here.
>>>> Hm, handlers are assigned once in init_bars and this function is only called
>>>>
>>>> for hwdom, so there is no way I can do that for the guests. Hence, the dispatcher
>>> I think we might want to reset the vPCI handlers when a devices gets
>>> assigned and deassigned.
>> In ARM case init_bars is called too early: PCI device assignment is currently
>>
>> initiated by Domain-0' kernel and is done *before* PCI devices are given memory
>>
>> ranges and BARs assigned:
>>
>> [    0.429514] pci_bus 0000:00: root bus resource [bus 00-ff]
>> [    0.429532] pci_bus 0000:00: root bus resource [io 0x0000-0xfffff]
>> [    0.429555] pci_bus 0000:00: root bus resource [mem 0xfe200000-0xfe3fffff]
>> [    0.429575] pci_bus 0000:00: root bus resource [mem 0x30000000-0x37ffffff]
>> [    0.429604] pci_bus 0000:00: root bus resource [mem 0x38000000-0x3fffffff pref]
>> [    0.429670] pci 0000:00:00.0: enabling Extended Tags
>> [    0.453764] pci 0000:00:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>
>> < init_bars >
>>
>> [    0.453793] pci 0000:00:00.0: -- IRQ 0
>> [    0.458825] pci 0000:00:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>> [    0.471790] pci 0000:01:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>
>> < init_bars >
>>
>> [    0.471821] pci 0000:01:00.0: -- IRQ 255
>> [    0.476809] pci 0000:01:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>
>> < BAR assignments below >
>>
>> [    0.488233] pci 0000:00:00.0: BAR 14: assigned [mem 0xfe200000-0xfe2fffff]
>> [    0.488265] pci 0000:00:00.0: BAR 15: assigned [mem 0x38000000-0x380fffff pref]
>>
>> In case of x86 this is pretty much ok as BARs are already in place, but for ARM we
>>
>> need to take care and re-setup vPCI BARs for hwdom.
> Even on x86 there's no guarantee that all devices have their BARs set
> up by firmware.

This is true. But there you could have config space trapped in the
"x86 generic way"; please correct me if I'm wrong here.

>
> In a subsequent reply you've suggested to move init_bars from "add" to
> "assign", but I'm having trouble seeing what this would change: It's
> not Dom0 controlling assignment (to itself), but Xen assigns the device
> towards the end of pci_add_device().

PHYSDEVOP_pci_device_add vs XEN_DOMCTL_assign_device

Currently we initialize BARs during PHYSDEVOP_pci_device_add; if we did
that during XEN_DOMCTL_assign_device instead, things would change.
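
To illustrate the idea (just a sketch: vpci_add_handlers() is the
existing entry point that runs the REGISTER_VPCI_INIT hooks, init_bars
among them, while the helper below and where exactly it would be hooked
into the assign path are made up):

/* Hypothetical: run the vPCI initialization from the
 * XEN_DOMCTL_assign_device path rather than from the
 * PHYSDEVOP_pci_device_add handling, i.e. at a point where the BAR
 * values can be expected to be final. */
static int vpci_init_on_assign(struct pci_dev *pdev)
{
    if ( pdev->vpci )          /* already set up earlier */
        return 0;

    return vpci_add_handlers(pdev);
}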

>
>> Things are getting even more
>>
>> complicated if the host PCI bridge is not ECAM like, so you cannot set mmio_handlers
>>
>> and trap hwdom's access to the config space to update BARs etc. This is why I have that
>>
>> ugly hack for rcar_gen3 to read actual BARs for hwdom.
> How to config space accesses work there? The latest for MSI/MSI-X it'll
> be imperative that Xen be able to intercept config space writes.
>
>>>    In order to do passthrough to domUs safely
>>> we will have to add more handlers than what's required for dom0,
>> Can you please tell what are thinking about? What are the missing handlers?
>>>    and
>>> having is_hardware_domain sprinkled in all of them is not a suitable
>>> solution.
>> I'll try to replace is_hardware_domain with something like:
>>
>> +/*
>> + * Detect whether physical PCI devices in this segment belong
>> + * to the domain given, e.g. on x86 all PCI devices live in hwdom,
>> + * but in case of ARM this might not be the case: those may also
>> + * live in driver domains or even Xen itself.
>> + */
>> +bool pci_is_hardware_domain(struct domain *d, u16 seg)
>> +{
>> +#ifdef CONFIG_X86
>> +    return is_hardware_domain(d);
>> +#elif CONFIG_ARM
>> +    return pci_is_owner_domain(d, seg);
>> +#else
>> +#error "Unsupported architecture"
>> +#endif
>> +}
>> +
>> +/*
>> + * Get domain which owns this segment: for x86 this is always hardware
>> + * domain and for ARM this can be different.
>> + */
>> +struct domain *pci_get_hardware_domain(u16 seg)
>> +{
>> +#ifdef CONFIG_X86
>> +    return hardware_domain;
>> +#elif CONFIG_ARM
>> +    return pci_get_owner_domain(seg);
>> +#else
>> +#error "Unsupported architecture"
>> +#endif
>> +}
>>
>> This is what I use to properly detect the domain that really owns physical host bridge
> I consider this problematic. We should try to not let Arm's and x86'es
> PCI implementations diverge too much, i.e. at least the underlying basic
> model would better be similar. For example, if entire segments can be
> assigned to a driver domain on Arm, why should the same not be possible
> on x86?

Good question, probably in this case x86 == ARM and I can use
pci_is_owner_domain for both architectures instead of using
is_hardware_domain for x86.

>
> Furthermore I'm suspicious about segments being the right granularity
> here. On ia64 multiple host bridges could (and typically would) live
> on segment 0. Iirc I had once seen output from an x86 system which was
> apparently laid out similarly. Therefore, just like for MCFG, I think
> the granularity wants to be bus ranges within a segment.
Can you please suggest something we can use as a hint for such a detection logic?
>
> Jan

Thank you,

Oleksandr
Jan Beulich Nov. 13, 2020, 10:50 a.m. UTC | #9
On 13.11.2020 11:46, Oleksandr Andrushchenko wrote:
> On 11/13/20 12:25 PM, Jan Beulich wrote:
>> On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
>>> On 11/12/20 4:46 PM, Roger Pau Monné wrote:
>>>> On Thu, Nov 12, 2020 at 01:16:10PM +0000, Oleksandr Andrushchenko wrote:
>>>>> On 11/12/20 11:40 AM, Roger Pau Monné wrote:
>>>>>> On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
>>>>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>>>>>> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>> +                                  void *data)
>>>>>>> +{
>>>>>>> +    struct vpci_bar *vbar, *bar = data;
>>>>>>> +
>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>> +        return bar_read_hwdom(pdev, reg, data);
>>>>>>> +
>>>>>>> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>> +    if ( !vbar )
>>>>>>> +        return ~0;
>>>>>>> +
>>>>>>> +    return bar_read_guest(pdev, reg, vbar);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>> +                               uint32_t val, void *data)
>>>>>>> +{
>>>>>>> +    struct vpci_bar *bar = data;
>>>>>>> +
>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>> +        bar_write_hwdom(pdev, reg, val, data);
>>>>>>> +    else
>>>>>>> +    {
>>>>>>> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>> +
>>>>>>> +        if ( !vbar )
>>>>>>> +            return;
>>>>>>> +        bar_write_guest(pdev, reg, val, vbar);
>>>>>>> +    }
>>>>>>> +}
>>>>>> You should assign different handlers based on whether the domain that
>>>>>> has the device assigned is a domU or the hardware domain, rather than
>>>>>> doing the selection here.
>>>>> Hm, handlers are assigned once in init_bars and this function is only called
>>>>>
>>>>> for hwdom, so there is no way I can do that for the guests. Hence, the dispatcher
>>>> I think we might want to reset the vPCI handlers when a devices gets
>>>> assigned and deassigned.
>>> In ARM case init_bars is called too early: PCI device assignment is currently
>>>
>>> initiated by Domain-0' kernel and is done *before* PCI devices are given memory
>>>
>>> ranges and BARs assigned:
>>>
>>> [    0.429514] pci_bus 0000:00: root bus resource [bus 00-ff]
>>> [    0.429532] pci_bus 0000:00: root bus resource [io 0x0000-0xfffff]
>>> [    0.429555] pci_bus 0000:00: root bus resource [mem 0xfe200000-0xfe3fffff]
>>> [    0.429575] pci_bus 0000:00: root bus resource [mem 0x30000000-0x37ffffff]
>>> [    0.429604] pci_bus 0000:00: root bus resource [mem 0x38000000-0x3fffffff pref]
>>> [    0.429670] pci 0000:00:00.0: enabling Extended Tags
>>> [    0.453764] pci 0000:00:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>
>>> < init_bars >
>>>
>>> [    0.453793] pci 0000:00:00.0: -- IRQ 0
>>> [    0.458825] pci 0000:00:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>> [    0.471790] pci 0000:01:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>
>>> < init_bars >
>>>
>>> [    0.471821] pci 0000:01:00.0: -- IRQ 255
>>> [    0.476809] pci 0000:01:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>>
>>> < BAR assignments below >
>>>
>>> [    0.488233] pci 0000:00:00.0: BAR 14: assigned [mem 0xfe200000-0xfe2fffff]
>>> [    0.488265] pci 0000:00:00.0: BAR 15: assigned [mem 0x38000000-0x380fffff pref]
>>>
>>> In case of x86 this is pretty much ok as BARs are already in place, but for ARM we
>>>
>>> need to take care and re-setup vPCI BARs for hwdom.
>> Even on x86 there's no guarantee that all devices have their BARs set
>> up by firmware.
> 
> This is true. But there you could have config space trapped in "x86 generic way",
> 
> please correct me if I'm wrong here
> 
>>
>> In a subsequent reply you've suggested to move init_bars from "add" to
>> "assign", but I'm having trouble seeing what this would change: It's
>> not Dom0 controlling assignment (to itself), but Xen assigns the device
>> towards the end of pci_add_device().
> 
> PHYSDEVOP_pci_device_add vs XEN_DOMCTL_assign_device
> 
> Currently we initialize BARs during PHYSDEVOP_pci_device_add and
> 
> if we do that during XEN_DOMCTL_assign_device things seem to change

But there can't possibly be any XEN_DOMCTL_assign_device involved in
booting of Dom0. 

>>>>    In order to do passthrough to domUs safely
>>>> we will have to add more handlers than what's required for dom0,
>>> Can you please tell what are thinking about? What are the missing handlers?
>>>>    and
>>>> having is_hardware_domain sprinkled in all of them is not a suitable
>>>> solution.
>>> I'll try to replace is_hardware_domain with something like:
>>>
>>> +/*
>>> + * Detect whether physical PCI devices in this segment belong
>>> + * to the domain given, e.g. on x86 all PCI devices live in hwdom,
>>> + * but in case of ARM this might not be the case: those may also
>>> + * live in driver domains or even Xen itself.
>>> + */
>>> +bool pci_is_hardware_domain(struct domain *d, u16 seg)
>>> +{
>>> +#ifdef CONFIG_X86
>>> +    return is_hardware_domain(d);
>>> +#elif CONFIG_ARM
>>> +    return pci_is_owner_domain(d, seg);
>>> +#else
>>> +#error "Unsupported architecture"
>>> +#endif
>>> +}
>>> +
>>> +/*
>>> + * Get domain which owns this segment: for x86 this is always hardware
>>> + * domain and for ARM this can be different.
>>> + */
>>> +struct domain *pci_get_hardware_domain(u16 seg)
>>> +{
>>> +#ifdef CONFIG_X86
>>> +    return hardware_domain;
>>> +#elif CONFIG_ARM
>>> +    return pci_get_owner_domain(seg);
>>> +#else
>>> +#error "Unsupported architecture"
>>> +#endif
>>> +}
>>>
>>> This is what I use to properly detect the domain that really owns physical host bridge
>> I consider this problematic. We should try to not let Arm's and x86'es
>> PCI implementations diverge too much, i.e. at least the underlying basic
>> model would better be similar. For example, if entire segments can be
>> assigned to a driver domain on Arm, why should the same not be possible
>> on x86?
> 
> Good question, probably in this case x86 == ARM and I can use
> 
> pci_is_owner_domain for both architectures instead of using is_hardware_domain for x86
> 
>>
>> Furthermore I'm suspicious about segments being the right granularity
>> here. On ia64 multiple host bridges could (and typically would) live
>> on segment 0. Iirc I had once seen output from an x86 system which was
>> apparently laid out similarly. Therefore, just like for MCFG, I think
>> the granularity wants to be bus ranges within a segment.
> Can you please suggest something we can use as a hint for such a detection logic?

The underlying information comes from ACPI tables, iirc. I don't
recall the details, though - sorry.

Jan
Jan Beulich Nov. 13, 2020, 10:53 a.m. UTC | #10
On 13.11.2020 11:36, Julien Grall wrote:
> On 13/11/2020 10:25, Jan Beulich wrote:
>> On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
>>> On 11/12/20 4:46 PM, Roger Pau Monné wrote:
>>>> On Thu, Nov 12, 2020 at 01:16:10PM +0000, Oleksandr Andrushchenko wrote:
>>>>> On 11/12/20 11:40 AM, Roger Pau Monné wrote:
>>>>>> On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
>>>>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>>>>>> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>> +                                  void *data)
>>>>>>> +{
>>>>>>> +    struct vpci_bar *vbar, *bar = data;
>>>>>>> +
>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>> +        return bar_read_hwdom(pdev, reg, data);
>>>>>>> +
>>>>>>> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>> +    if ( !vbar )
>>>>>>> +        return ~0;
>>>>>>> +
>>>>>>> +    return bar_read_guest(pdev, reg, vbar);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>> +                               uint32_t val, void *data)
>>>>>>> +{
>>>>>>> +    struct vpci_bar *bar = data;
>>>>>>> +
>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>> +        bar_write_hwdom(pdev, reg, val, data);
>>>>>>> +    else
>>>>>>> +    {
>>>>>>> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>> +
>>>>>>> +        if ( !vbar )
>>>>>>> +            return;
>>>>>>> +        bar_write_guest(pdev, reg, val, vbar);
>>>>>>> +    }
>>>>>>> +}
>>>>>> You should assign different handlers based on whether the domain that
>>>>>> has the device assigned is a domU or the hardware domain, rather than
>>>>>> doing the selection here.
>>>>> Hm, handlers are assigned once in init_bars and this function is only called
>>>>>
>>>>> for hwdom, so there is no way I can do that for the guests. Hence, the dispatcher
>>>> I think we might want to reset the vPCI handlers when a devices gets
>>>> assigned and deassigned.
>>>
>>> In ARM case init_bars is called too early: PCI device assignment is currently
>>>
>>> initiated by Domain-0' kernel and is done *before* PCI devices are given memory
>>>
>>> ranges and BARs assigned:
>>>
>>> [    0.429514] pci_bus 0000:00: root bus resource [bus 00-ff]
>>> [    0.429532] pci_bus 0000:00: root bus resource [io 0x0000-0xfffff]
>>> [    0.429555] pci_bus 0000:00: root bus resource [mem 0xfe200000-0xfe3fffff]
>>> [    0.429575] pci_bus 0000:00: root bus resource [mem 0x30000000-0x37ffffff]
>>> [    0.429604] pci_bus 0000:00: root bus resource [mem 0x38000000-0x3fffffff pref]
>>> [    0.429670] pci 0000:00:00.0: enabling Extended Tags
>>> [    0.453764] pci 0000:00:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>
>>> < init_bars >
>>>
>>> [    0.453793] pci 0000:00:00.0: -- IRQ 0
>>> [    0.458825] pci 0000:00:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>> [    0.471790] pci 0000:01:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>
>>> < init_bars >
>>>
>>> [    0.471821] pci 0000:01:00.0: -- IRQ 255
>>> [    0.476809] pci 0000:01:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>>
>>> < BAR assignments below >
>>>
>>> [    0.488233] pci 0000:00:00.0: BAR 14: assigned [mem 0xfe200000-0xfe2fffff]
>>> [    0.488265] pci 0000:00:00.0: BAR 15: assigned [mem 0x38000000-0x380fffff pref]
>>>
>>> In case of x86 this is pretty much ok as BARs are already in place, but for ARM we
>>>
>>> need to take care and re-setup vPCI BARs for hwdom.
>>
>> Even on x86 there's no guarantee that all devices have their BARs set
>> up by firmware.
>>
>> In a subsequent reply you've suggested to move init_bars from "add" to
>> "assign", but I'm having trouble seeing what this would change: It's
>> not Dom0 controlling assignment (to itself), but Xen assigns the device
>> towards the end of pci_add_device().
>>
>>> Things are getting even more
>>>
>>> complicated if the host PCI bridge is not ECAM like, so you cannot set mmio_handlers
>>>
>>> and trap hwdom's access to the config space to update BARs etc. This is why I have that
>>>
>>> ugly hack for rcar_gen3 to read actual BARs for hwdom.
>>
>> How to config space accesses work there? The latest for MSI/MSI-X it'll
>> be imperative that Xen be able to intercept config space writes.
> 
> I am not sure to understand your last sentence. Are you saying that we 
> always need to trap access to MSI/MSI-X message in order to sanitize it?
> 
> If one is using the GICv3 ITS (I haven't investigated other MSI 
> controller), then I don't believe you need to sanitize the MSI/MSI-X 
> message in most of the situation.

Well, if it's fine for the guest to write arbitrary values to message
address and message data, _and_ to arbitrarily enable/disable MSI / MSI-X,
then yes, no interception would be needed.

Jan
Oleksandr Andrushchenko Nov. 13, 2020, 11:02 a.m. UTC | #11
On 11/13/20 12:50 PM, Jan Beulich wrote:
> On 13.11.2020 11:46, Oleksandr Andrushchenko wrote:
>> On 11/13/20 12:25 PM, Jan Beulich wrote:
>>> On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
>>>> On 11/12/20 4:46 PM, Roger Pau Monné wrote:
>>>>> On Thu, Nov 12, 2020 at 01:16:10PM +0000, Oleksandr Andrushchenko wrote:
>>>>>> On 11/12/20 11:40 AM, Roger Pau Monné wrote:
>>>>>>> On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
>>>>>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>>>>>>> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>> +                                  void *data)
>>>>>>>> +{
>>>>>>>> +    struct vpci_bar *vbar, *bar = data;
>>>>>>>> +
>>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>>> +        return bar_read_hwdom(pdev, reg, data);
>>>>>>>> +
>>>>>>>> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>>> +    if ( !vbar )
>>>>>>>> +        return ~0;
>>>>>>>> +
>>>>>>>> +    return bar_read_guest(pdev, reg, vbar);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>> +                               uint32_t val, void *data)
>>>>>>>> +{
>>>>>>>> +    struct vpci_bar *bar = data;
>>>>>>>> +
>>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>>> +        bar_write_hwdom(pdev, reg, val, data);
>>>>>>>> +    else
>>>>>>>> +    {
>>>>>>>> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>>> +
>>>>>>>> +        if ( !vbar )
>>>>>>>> +            return;
>>>>>>>> +        bar_write_guest(pdev, reg, val, vbar);
>>>>>>>> +    }
>>>>>>>> +}
>>>>>>> You should assign different handlers based on whether the domain that
>>>>>>> has the device assigned is a domU or the hardware domain, rather than
>>>>>>> doing the selection here.
>>>>>> Hm, handlers are assigned once in init_bars and this function is only called
>>>>>>
>>>>>> for hwdom, so there is no way I can do that for the guests. Hence, the dispatcher
>>>>> I think we might want to reset the vPCI handlers when a devices gets
>>>>> assigned and deassigned.
>>>> In ARM case init_bars is called too early: PCI device assignment is currently
>>>>
>>>> initiated by Domain-0' kernel and is done *before* PCI devices are given memory
>>>>
>>>> ranges and BARs assigned:
>>>>
>>>> [    0.429514] pci_bus 0000:00: root bus resource [bus 00-ff]
>>>> [    0.429532] pci_bus 0000:00: root bus resource [io 0x0000-0xfffff]
>>>> [    0.429555] pci_bus 0000:00: root bus resource [mem 0xfe200000-0xfe3fffff]
>>>> [    0.429575] pci_bus 0000:00: root bus resource [mem 0x30000000-0x37ffffff]
>>>> [    0.429604] pci_bus 0000:00: root bus resource [mem 0x38000000-0x3fffffff pref]
>>>> [    0.429670] pci 0000:00:00.0: enabling Extended Tags
>>>> [    0.453764] pci 0000:00:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>>
>>>> < init_bars >
>>>>
>>>> [    0.453793] pci 0000:00:00.0: -- IRQ 0
>>>> [    0.458825] pci 0000:00:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>>> [    0.471790] pci 0000:01:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>>
>>>> < init_bars >
>>>>
>>>> [    0.471821] pci 0000:01:00.0: -- IRQ 255
>>>> [    0.476809] pci 0000:01:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>>>
>>>> < BAR assignments below >
>>>>
>>>> [    0.488233] pci 0000:00:00.0: BAR 14: assigned [mem 0xfe200000-0xfe2fffff]
>>>> [    0.488265] pci 0000:00:00.0: BAR 15: assigned [mem 0x38000000-0x380fffff pref]
>>>>
>>>> In case of x86 this is pretty much ok as BARs are already in place, but for ARM we
>>>>
>>>> need to take care and re-setup vPCI BARs for hwdom.
>>> Even on x86 there's no guarantee that all devices have their BARs set
>>> up by firmware.
>> This is true. But there you could have config space trapped in "x86 generic way",
>>
>> please correct me if I'm wrong here
>>
>>> In a subsequent reply you've suggested to move init_bars from "add" to
>>> "assign", but I'm having trouble seeing what this would change: It's
>>> not Dom0 controlling assignment (to itself), but Xen assigns the device
>>> towards the end of pci_add_device().
>> PHYSDEVOP_pci_device_add vs XEN_DOMCTL_assign_device
>>
>> Currently we initialize BARs during PHYSDEVOP_pci_device_add and
>>
>> if we do that during XEN_DOMCTL_assign_device things seem to change
> But there can't possibly be any XEN_DOMCTL_assign_device involved in
> booting of Dom0.

Indeed. So, do you have an idea when we should call init_bars that
would suit both ARM and x86?

Another question: what would go wrong if neither x86 nor ARM called
init_bars until the moment we really assign a PCI device to the first
guest?

>
>>>>>     In order to do passthrough to domUs safely
>>>>> we will have to add more handlers than what's required for dom0,
>>>> Can you please tell what are thinking about? What are the missing handlers?
>>>>>     and
>>>>> having is_hardware_domain sprinkled in all of them is not a suitable
>>>>> solution.
>>>> I'll try to replace is_hardware_domain with something like:
>>>>
>>>> +/*
>>>> + * Detect whether physical PCI devices in this segment belong
>>>> + * to the domain given, e.g. on x86 all PCI devices live in hwdom,
>>>> + * but in case of ARM this might not be the case: those may also
>>>> + * live in driver domains or even Xen itself.
>>>> + */
>>>> +bool pci_is_hardware_domain(struct domain *d, u16 seg)
>>>> +{
>>>> +#ifdef CONFIG_X86
>>>> +    return is_hardware_domain(d);
>>>> +#elif CONFIG_ARM
>>>> +    return pci_is_owner_domain(d, seg);
>>>> +#else
>>>> +#error "Unsupported architecture"
>>>> +#endif
>>>> +}
>>>> +
>>>> +/*
>>>> + * Get domain which owns this segment: for x86 this is always hardware
>>>> + * domain and for ARM this can be different.
>>>> + */
>>>> +struct domain *pci_get_hardware_domain(u16 seg)
>>>> +{
>>>> +#ifdef CONFIG_X86
>>>> +    return hardware_domain;
>>>> +#elif CONFIG_ARM
>>>> +    return pci_get_owner_domain(seg);
>>>> +#else
>>>> +#error "Unsupported architecture"
>>>> +#endif
>>>> +}
>>>>
>>>> This is what I use to properly detect the domain that really owns physical host bridge
>>> I consider this problematic. We should try to not let Arm's and x86'es
>>> PCI implementations diverge too much, i.e. at least the underlying basic
>>> model would better be similar. For example, if entire segments can be
>>> assigned to a driver domain on Arm, why should the same not be possible
>>> on x86?
>> Good question, probably in this case x86 == ARM and I can use
>>
>> pci_is_owner_domain for both architectures instead of using is_hardware_domain for x86
>>
>>> Furthermore I'm suspicious about segments being the right granularity
>>> here. On ia64 multiple host bridges could (and typically would) live
>>> on segment 0. Iirc I had once seen output from an x86 system which was
>>> apparently laid out similarly. Therefore, just like for MCFG, I think
>>> the granularity wants to be bus ranges within a segment.
>> Can you please suggest something we can use as a hint for such a detection logic?
> The underlying information comes from ACPI tables, iirc. I don't
> recall the details, though - sorry.

Ok, so seg + bus should be enough for both ARM and x86 then, right?

pci_get_hardware_domain(u16 seg, u8 bus)
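
Something along these lines, perhaps (a sketch only: the range
bookkeeping and everything besides the (seg, bus) parameters is made up
for illustration, and how the list would get populated - ACPI, device
tree - is left out):

#include <stdint.h>

struct domain;                          /* opaque here */

/* Hypothetical bookkeeping: which domain owns which bus range within a
 * segment. */
struct pci_host_range {
    uint16_t seg;
    uint8_t bus_start, bus_end;
    struct domain *owner;               /* hwdom on x86, may differ on Arm */
};

static struct pci_host_range ranges[8];
static unsigned int nr_ranges;

struct domain *pci_get_hardware_domain(uint16_t seg, uint8_t bus)
{
    unsigned int i;

    for ( i = 0; i < nr_ranges; i++ )
        if ( ranges[i].seg == seg &&
             bus >= ranges[i].bus_start && bus <= ranges[i].bus_end )
            return ranges[i].owner;

    return NULL;                        /* no host bridge claims (seg, bus) */
}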

>
> Jan

Thank you,

Oleksandr
Julien Grall Nov. 13, 2020, 11:06 a.m. UTC | #12
Hi Jan,

On 13/11/2020 10:53, Jan Beulich wrote:
> On 13.11.2020 11:36, Julien Grall wrote:
>> On 13/11/2020 10:25, Jan Beulich wrote:
>>> On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
>>>> On 11/12/20 4:46 PM, Roger Pau Monné wrote:
>>>>> On Thu, Nov 12, 2020 at 01:16:10PM +0000, Oleksandr Andrushchenko wrote:
>>>>>> On 11/12/20 11:40 AM, Roger Pau Monné wrote:
>>>>>>> On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
>>>>>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>>>>>>> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>> +                                  void *data)
>>>>>>>> +{
>>>>>>>> +    struct vpci_bar *vbar, *bar = data;
>>>>>>>> +
>>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>>> +        return bar_read_hwdom(pdev, reg, data);
>>>>>>>> +
>>>>>>>> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>>> +    if ( !vbar )
>>>>>>>> +        return ~0;
>>>>>>>> +
>>>>>>>> +    return bar_read_guest(pdev, reg, vbar);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>> +                               uint32_t val, void *data)
>>>>>>>> +{
>>>>>>>> +    struct vpci_bar *bar = data;
>>>>>>>> +
>>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>>> +        bar_write_hwdom(pdev, reg, val, data);
>>>>>>>> +    else
>>>>>>>> +    {
>>>>>>>> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>>> +
>>>>>>>> +        if ( !vbar )
>>>>>>>> +            return;
>>>>>>>> +        bar_write_guest(pdev, reg, val, vbar);
>>>>>>>> +    }
>>>>>>>> +}
>>>>>>> You should assign different handlers based on whether the domain that
>>>>>>> has the device assigned is a domU or the hardware domain, rather than
>>>>>>> doing the selection here.
>>>>>> Hm, handlers are assigned once in init_bars and this function is only called
>>>>>>
>>>>>> for hwdom, so there is no way I can do that for the guests. Hence, the dispatcher
>>>>> I think we might want to reset the vPCI handlers when a devices gets
>>>>> assigned and deassigned.
>>>>
>>>> In ARM case init_bars is called too early: PCI device assignment is currently
>>>>
>>>> initiated by Domain-0' kernel and is done *before* PCI devices are given memory
>>>>
>>>> ranges and BARs assigned:
>>>>
>>>> [    0.429514] pci_bus 0000:00: root bus resource [bus 00-ff]
>>>> [    0.429532] pci_bus 0000:00: root bus resource [io 0x0000-0xfffff]
>>>> [    0.429555] pci_bus 0000:00: root bus resource [mem 0xfe200000-0xfe3fffff]
>>>> [    0.429575] pci_bus 0000:00: root bus resource [mem 0x30000000-0x37ffffff]
>>>> [    0.429604] pci_bus 0000:00: root bus resource [mem 0x38000000-0x3fffffff pref]
>>>> [    0.429670] pci 0000:00:00.0: enabling Extended Tags
>>>> [    0.453764] pci 0000:00:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>>
>>>> < init_bars >
>>>>
>>>> [    0.453793] pci 0000:00:00.0: -- IRQ 0
>>>> [    0.458825] pci 0000:00:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>>> [    0.471790] pci 0000:01:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>>
>>>> < init_bars >
>>>>
>>>> [    0.471821] pci 0000:01:00.0: -- IRQ 255
>>>> [    0.476809] pci 0000:01:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>>>
>>>> < BAR assignments below >
>>>>
>>>> [    0.488233] pci 0000:00:00.0: BAR 14: assigned [mem 0xfe200000-0xfe2fffff]
>>>> [    0.488265] pci 0000:00:00.0: BAR 15: assigned [mem 0x38000000-0x380fffff pref]
>>>>
>>>> In case of x86 this is pretty much ok as BARs are already in place, but for ARM we
>>>>
>>>> need to take care and re-setup vPCI BARs for hwdom.
>>>
>>> Even on x86 there's no guarantee that all devices have their BARs set
>>> up by firmware.
>>>
>>> In a subsequent reply you've suggested to move init_bars from "add" to
>>> "assign", but I'm having trouble seeing what this would change: It's
>>> not Dom0 controlling assignment (to itself), but Xen assigns the device
>>> towards the end of pci_add_device().
>>>
>>>> Things are getting even more
>>>>
>>>> complicated if the host PCI bridge is not ECAM like, so you cannot set mmio_handlers
>>>>
>>>> and trap hwdom's access to the config space to update BARs etc. This is why I have that
>>>>
>>>> ugly hack for rcar_gen3 to read actual BARs for hwdom.
>>>
>>> How to config space accesses work there? The latest for MSI/MSI-X it'll
>>> be imperative that Xen be able to intercept config space writes.
>>
>> I am not sure to understand your last sentence. Are you saying that we
>> always need to trap access to MSI/MSI-X message in order to sanitize it?
>>
>> If one is using the GICv3 ITS (I haven't investigated other MSI
>> controller), then I don't believe you need to sanitize the MSI/MSI-X
>> message in most of the situation.
> 
> Well, if it's fine for the guest to write arbitrary values to message
> address and message data,

The message address would be the doorbell of the ITS, which usually goes 
through the IOMMU page-tables. However, I am aware of a couple of 
platforms where the doorbell access (among other address ranges 
including P2P transactions) bypasses the IOMMU. In this situation, we 
would need a lot more work than just trapping the access.

Regarding the message data, for the ITS this is an event ID. The HW will 
then tag each message with the device ID (this prevents spoofing). The 
tuple (device ID, event ID) is used by the ITS to decide where to 
inject the event.

Whether other MSI controllers (e.g. GICv2m) have a similar isolation 
feature will have to be considered on a case-by-case basis.
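
To make the isolation point concrete, conceptually the translation looks
something like this (grossly simplified, not actual GIC/Xen code):

#include <stddef.h>
#include <stdint.h>

/* The device ID is tagged by the hardware based on which device issued
 * the write, so a guest scribbling on the MSI data (the event ID) can
 * only ever select entries belonging to its own device. */
struct its_route {
    uint32_t device_id;   /* supplied by the interconnect, not guest data */
    uint32_t event_id;    /* what the MSI "message data" carries */
    uint32_t vlpi;        /* virtual LPI to inject into the guest */
};

static const struct its_route *its_translate(const struct its_route *tbl,
                                             size_t n, uint32_t device_id,
                                             uint32_t event_id)
{
    size_t i;

    for ( i = 0; i < n; i++ )
        if ( tbl[i].device_id == device_id && tbl[i].event_id == event_id )
            return &tbl[i];

    return NULL;          /* unknown (device, event): nothing is injected */
}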

> _and_ to arbitrarily enable/disable MSI / MSI-X,
> then yes, no interception would be needed.
The device would be owned by the guest, so I am not sure I understand 
the exact problem with letting it enable/disable MSI/MSI-X. Do you 
mind expanding on your thoughts?

Furthermore, you can also control which event is enabled/disabled at the 
ITS level.

Cheers,
Jan Beulich Nov. 13, 2020, 11:26 a.m. UTC | #13
On 13.11.2020 12:06, Julien Grall wrote:
> Hi Jan,
> 
> On 13/11/2020 10:53, Jan Beulich wrote:
>> On 13.11.2020 11:36, Julien Grall wrote:
>>> On 13/11/2020 10:25, Jan Beulich wrote:
>>>> On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
>>>>> On 11/12/20 4:46 PM, Roger Pau Monné wrote:
>>>>>> On Thu, Nov 12, 2020 at 01:16:10PM +0000, Oleksandr Andrushchenko wrote:
>>>>>>> On 11/12/20 11:40 AM, Roger Pau Monné wrote:
>>>>>>>> On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
>>>>>>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>>>>>>>> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>>> +                                  void *data)
>>>>>>>>> +{
>>>>>>>>> +    struct vpci_bar *vbar, *bar = data;
>>>>>>>>> +
>>>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>>>> +        return bar_read_hwdom(pdev, reg, data);
>>>>>>>>> +
>>>>>>>>> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>>>> +    if ( !vbar )
>>>>>>>>> +        return ~0;
>>>>>>>>> +
>>>>>>>>> +    return bar_read_guest(pdev, reg, vbar);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>>> +                               uint32_t val, void *data)
>>>>>>>>> +{
>>>>>>>>> +    struct vpci_bar *bar = data;
>>>>>>>>> +
>>>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>>>> +        bar_write_hwdom(pdev, reg, val, data);
>>>>>>>>> +    else
>>>>>>>>> +    {
>>>>>>>>> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>>>> +
>>>>>>>>> +        if ( !vbar )
>>>>>>>>> +            return;
>>>>>>>>> +        bar_write_guest(pdev, reg, val, vbar);
>>>>>>>>> +    }
>>>>>>>>> +}
>>>>>>>> You should assign different handlers based on whether the domain that
>>>>>>>> has the device assigned is a domU or the hardware domain, rather than
>>>>>>>> doing the selection here.
>>>>>>> Hm, handlers are assigned once in init_bars and this function is only called
>>>>>>>
>>>>>>> for hwdom, so there is no way I can do that for the guests. Hence, the dispatcher
>>>>>> I think we might want to reset the vPCI handlers when a devices gets
>>>>>> assigned and deassigned.
>>>>>
>>>>> In ARM case init_bars is called too early: PCI device assignment is currently
>>>>>
>>>>> initiated by Domain-0' kernel and is done *before* PCI devices are given memory
>>>>>
>>>>> ranges and BARs assigned:
>>>>>
>>>>> [    0.429514] pci_bus 0000:00: root bus resource [bus 00-ff]
>>>>> [    0.429532] pci_bus 0000:00: root bus resource [io 0x0000-0xfffff]
>>>>> [    0.429555] pci_bus 0000:00: root bus resource [mem 0xfe200000-0xfe3fffff]
>>>>> [    0.429575] pci_bus 0000:00: root bus resource [mem 0x30000000-0x37ffffff]
>>>>> [    0.429604] pci_bus 0000:00: root bus resource [mem 0x38000000-0x3fffffff pref]
>>>>> [    0.429670] pci 0000:00:00.0: enabling Extended Tags
>>>>> [    0.453764] pci 0000:00:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>>>
>>>>> < init_bars >
>>>>>
>>>>> [    0.453793] pci 0000:00:00.0: -- IRQ 0
>>>>> [    0.458825] pci 0000:00:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>>>> [    0.471790] pci 0000:01:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>>>
>>>>> < init_bars >
>>>>>
>>>>> [    0.471821] pci 0000:01:00.0: -- IRQ 255
>>>>> [    0.476809] pci 0000:01:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>>>>
>>>>> < BAR assignments below >
>>>>>
>>>>> [    0.488233] pci 0000:00:00.0: BAR 14: assigned [mem 0xfe200000-0xfe2fffff]
>>>>> [    0.488265] pci 0000:00:00.0: BAR 15: assigned [mem 0x38000000-0x380fffff pref]
>>>>>
>>>>> In case of x86 this is pretty much ok as BARs are already in place, but for ARM we
>>>>>
>>>>> need to take care and re-setup vPCI BARs for hwdom.
>>>>
>>>> Even on x86 there's no guarantee that all devices have their BARs set
>>>> up by firmware.
>>>>
>>>> In a subsequent reply you've suggested to move init_bars from "add" to
>>>> "assign", but I'm having trouble seeing what this would change: It's
>>>> not Dom0 controlling assignment (to itself), but Xen assigns the device
>>>> towards the end of pci_add_device().
>>>>
>>>>> Things are getting even more
>>>>>
>>>>> complicated if the host PCI bridge is not ECAM like, so you cannot set mmio_handlers
>>>>>
>>>>> and trap hwdom's access to the config space to update BARs etc. This is why I have that
>>>>>
>>>>> ugly hack for rcar_gen3 to read actual BARs for hwdom.
>>>>
>>>> How to config space accesses work there? The latest for MSI/MSI-X it'll
>>>> be imperative that Xen be able to intercept config space writes.
>>>
>>> I am not sure to understand your last sentence. Are you saying that we
>>> always need to trap access to MSI/MSI-X message in order to sanitize it?
>>>
>>> If one is using the GICv3 ITS (I haven't investigated other MSI
>>> controller), then I don't believe you need to sanitize the MSI/MSI-X
>>> message in most of the situation.
>>
>> Well, if it's fine for the guest to write arbitrary values to message
>> address and message data,
> 
> The message address would be the doorbell of the ITS that is usually 
> going through the IOMMU page-tables. Although, I am aware of a couple of 
> platforms where the doorbell access (among other address ranges 
> including P2P transaction) bypass the IOMMU. In this situation, we would 
> need a lot more work than just trapping the access.

When you say "The message address would be the doorbell of the ITS" am
I right in understanding that's the designated address to be put there?
What if the guest puts some random different address there?

> Regarding the message data, for the ITS this is an event ID. The HW will 
> then tag each message with the device ID (this prevents spoofing). The 
> tupple (device ID, event ID) is used by the ITS to decide where to 
> inject the event.
> 
> Whether other MSI controllers (e.g. GICv2m) have similar isolation 
> feature will be on the case by case basis.
> 
>> _and_ to arbitrarily enable/disable MSI / MSI-X,
>> then yes, no interception would be needed.
> The device would be owned by the guest, so I am not sure to understand 
> the exact problem of letting it enabling/disabling MSI/MSI-X. Do you 
> mind expanding your thoughts?

Question is - is Xen involved in any way in the handling of interrupts
from such a device? If not, then I guess full control can indeed be
left with the guest.

> Furthermore, you can also control which event is enabled/disabled at the 
> ITS level.

And that's something Xen controls? On x86 we don't have a 2nd level
of controls, so we need to merge Xen's and the guest's intentions in
software to know what to store in hardware.
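
I.e. conceptually something like this (not code from anywhere, just the
merge rule spelled out):

#include <stdbool.h>

/* What gets stored in the physical control/mask bits is a merge of both
 * parties' wishes; the guest is only ever shown its own view on reads. */
static bool hw_entry_masked(bool xen_masked, bool guest_masked)
{
    return xen_masked || guest_masked;   /* masked if either side wants it */
}

static bool hw_msix_enabled(bool xen_allows, bool guest_enabled)
{
    return xen_allows && guest_enabled;  /* enabled only if both agree */
}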

Jan
Jan Beulich Nov. 13, 2020, 11:35 a.m. UTC | #14
On 13.11.2020 12:02, Oleksandr Andrushchenko wrote:
> 
> On 11/13/20 12:50 PM, Jan Beulich wrote:
>> On 13.11.2020 11:46, Oleksandr Andrushchenko wrote:
>>> On 11/13/20 12:25 PM, Jan Beulich wrote:
>>>> On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
>>>>> On 11/12/20 4:46 PM, Roger Pau Monné wrote:
>>>>>> On Thu, Nov 12, 2020 at 01:16:10PM +0000, Oleksandr Andrushchenko wrote:
>>>>>>> On 11/12/20 11:40 AM, Roger Pau Monné wrote:
>>>>>>>> On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
>>>>>>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>>>>>>>> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>>> +                                  void *data)
>>>>>>>>> +{
>>>>>>>>> +    struct vpci_bar *vbar, *bar = data;
>>>>>>>>> +
>>>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>>>> +        return bar_read_hwdom(pdev, reg, data);
>>>>>>>>> +
>>>>>>>>> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>>>> +    if ( !vbar )
>>>>>>>>> +        return ~0;
>>>>>>>>> +
>>>>>>>>> +    return bar_read_guest(pdev, reg, vbar);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>>> +                               uint32_t val, void *data)
>>>>>>>>> +{
>>>>>>>>> +    struct vpci_bar *bar = data;
>>>>>>>>> +
>>>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>>>> +        bar_write_hwdom(pdev, reg, val, data);
>>>>>>>>> +    else
>>>>>>>>> +    {
>>>>>>>>> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>>>> +
>>>>>>>>> +        if ( !vbar )
>>>>>>>>> +            return;
>>>>>>>>> +        bar_write_guest(pdev, reg, val, vbar);
>>>>>>>>> +    }
>>>>>>>>> +}
>>>>>>>> You should assign different handlers based on whether the domain that
>>>>>>>> has the device assigned is a domU or the hardware domain, rather than
>>>>>>>> doing the selection here.
>>>>>>> Hm, handlers are assigned once in init_bars and this function is only called
>>>>>>>
>>>>>>> for hwdom, so there is no way I can do that for the guests. Hence, the dispatcher
>>>>>> I think we might want to reset the vPCI handlers when a devices gets
>>>>>> assigned and deassigned.
>>>>> In ARM case init_bars is called too early: PCI device assignment is currently
>>>>>
>>>>> initiated by Domain-0' kernel and is done *before* PCI devices are given memory
>>>>>
>>>>> ranges and BARs assigned:
>>>>>
>>>>> [    0.429514] pci_bus 0000:00: root bus resource [bus 00-ff]
>>>>> [    0.429532] pci_bus 0000:00: root bus resource [io 0x0000-0xfffff]
>>>>> [    0.429555] pci_bus 0000:00: root bus resource [mem 0xfe200000-0xfe3fffff]
>>>>> [    0.429575] pci_bus 0000:00: root bus resource [mem 0x30000000-0x37ffffff]
>>>>> [    0.429604] pci_bus 0000:00: root bus resource [mem 0x38000000-0x3fffffff pref]
>>>>> [    0.429670] pci 0000:00:00.0: enabling Extended Tags
>>>>> [    0.453764] pci 0000:00:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>>>
>>>>> < init_bars >
>>>>>
>>>>> [    0.453793] pci 0000:00:00.0: -- IRQ 0
>>>>> [    0.458825] pci 0000:00:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>>>> [    0.471790] pci 0000:01:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>>>
>>>>> < init_bars >
>>>>>
>>>>> [    0.471821] pci 0000:01:00.0: -- IRQ 255
>>>>> [    0.476809] pci 0000:01:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>>>>
>>>>> < BAR assignments below >
>>>>>
>>>>> [    0.488233] pci 0000:00:00.0: BAR 14: assigned [mem 0xfe200000-0xfe2fffff]
>>>>> [    0.488265] pci 0000:00:00.0: BAR 15: assigned [mem 0x38000000-0x380fffff pref]
>>>>>
>>>>> In case of x86 this is pretty much ok as BARs are already in place, but for ARM we
>>>>>
>>>>> need to take care and re-setup vPCI BARs for hwdom.
>>>> Even on x86 there's no guarantee that all devices have their BARs set
>>>> up by firmware.
>>> This is true. But there you could have config space trapped in "x86 generic way",
>>>
>>> please correct me if I'm wrong here
>>>
>>>> In a subsequent reply you've suggested to move init_bars from "add" to
>>>> "assign", but I'm having trouble seeing what this would change: It's
>>>> not Dom0 controlling assignment (to itself), but Xen assigns the device
>>>> towards the end of pci_add_device().
>>> PHYSDEVOP_pci_device_add vs XEN_DOMCTL_assign_device
>>>
>>> Currently we initialize BARs during PHYSDEVOP_pci_device_add and
>>>
>>> if we do that during XEN_DOMCTL_assign_device things seem to change
>> But there can't possibly be any XEN_DOMCTL_assign_device involved in
>> booting of Dom0.
> 
> Indeed. So, do you have an idea when we should call init_bars suitable
> 
> for both ARM and x86?
> 
> Another question is: what happens bad if x86 and ARM won't call init_bars
> 
> until the moment we really assign a PCI device to the first guest?

I'd like to answer the latter question first: How would Dom0 use
the device prior to such an assignment? As an implication of the
presumed answer here, I guess init_bars could be deferred until
the first time Dom0 (or more generally the possessing domain)
accesses any of them. Similarly, devices used by Xen itself could
have this done immediately before first use. This may require
tracking on a per-device basis whether initialization was done.
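
Something along these lines is what I have in mind (purely illustrative;
all names below are made up and merely stand in for whatever the real
per-device vPCI state and config-space trap path end up being):

#include <stdbool.h>

struct pdev_stub;                        /* stands in for struct pci_dev */

struct vpci_stub {
    bool bars_initialized;               /* per-device "done" tracking */
    struct pdev_stub *pdev;
    /* ... BAR, MSI, MSI-X state ... */
};

/* Stand-in for init_bars(): would size the BARs and set up the p2m
 * mappings, now that their values can be expected to be final. */
static void init_bars_stub(struct pdev_stub *pdev)
{
    (void)pdev;
}

/* To be called from the config-space trap handler before servicing an
 * access, so the possessing domain's first touch triggers the init. */
static void vpci_lazy_init(struct vpci_stub *v)
{
    if ( v->bars_initialized )
        return;

    init_bars_stub(v->pdev);
    v->bars_initialized = true;
}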

>>>>>>     In order to do passthrough to domUs safely
>>>>>> we will have to add more handlers than what's required for dom0,
>>>>> Can you please tell what are thinking about? What are the missing handlers?
>>>>>>     and
>>>>>> having is_hardware_domain sprinkled in all of them is not a suitable
>>>>>> solution.
>>>>> I'll try to replace is_hardware_domain with something like:
>>>>>
>>>>> +/*
>>>>> + * Detect whether physical PCI devices in this segment belong
>>>>> + * to the domain given, e.g. on x86 all PCI devices live in hwdom,
>>>>> + * but in case of ARM this might not be the case: those may also
>>>>> + * live in driver domains or even Xen itself.
>>>>> + */
>>>>> +bool pci_is_hardware_domain(struct domain *d, u16 seg)
>>>>> +{
>>>>> +#ifdef CONFIG_X86
>>>>> +    return is_hardware_domain(d);
>>>>> +#elif CONFIG_ARM
>>>>> +    return pci_is_owner_domain(d, seg);
>>>>> +#else
>>>>> +#error "Unsupported architecture"
>>>>> +#endif
>>>>> +}
>>>>> +
>>>>> +/*
>>>>> + * Get domain which owns this segment: for x86 this is always hardware
>>>>> + * domain and for ARM this can be different.
>>>>> + */
>>>>> +struct domain *pci_get_hardware_domain(u16 seg)
>>>>> +{
>>>>> +#ifdef CONFIG_X86
>>>>> +    return hardware_domain;
>>>>> +#elif CONFIG_ARM
>>>>> +    return pci_get_owner_domain(seg);
>>>>> +#else
>>>>> +#error "Unsupported architecture"
>>>>> +#endif
>>>>> +}
>>>>>
>>>>> This is what I use to properly detect the domain that really owns physical host bridge
>>>> I consider this problematic. We should try to not let Arm's and x86'es
>>>> PCI implementations diverge too much, i.e. at least the underlying basic
>>>> model would better be similar. For example, if entire segments can be
>>>> assigned to a driver domain on Arm, why should the same not be possible
>>>> on x86?
>>> Good question, probably in this case x86 == ARM and I can use
>>>
>>> pci_is_owner_domain for both architectures instead of using is_hardware_domain for x86
>>>
>>>> Furthermore I'm suspicious about segments being the right granularity
>>>> here. On ia64 multiple host bridges could (and typically would) live
>>>> on segment 0. Iirc I had once seen output from an x86 system which was
>>>> apparently laid out similarly. Therefore, just like for MCFG, I think
>>>> the granularity wants to be bus ranges within a segment.
>>> Can you please suggest something we can use as a hint for such a detection logic?
>> The underlying information comes from ACPI tables, iirc. I don't
>> recall the details, though - sorry.
> 
> Ok, so seg + bus should be enough for both ARM and Xen then, right?
> 
> pci_get_hardware_domain(u16 seg, u8 bus)

Whether an individual bus number can suitably express things I can't
tell; I did say bus range, but if you care about just individual
devices, then a single bus number will of course do.

Jan
Julien Grall Nov. 13, 2020, 11:53 a.m. UTC | #15
On 13/11/2020 11:26, Jan Beulich wrote:
> On 13.11.2020 12:06, Julien Grall wrote:
>> Hi Jan,
>>
>> On 13/11/2020 10:53, Jan Beulich wrote:
>>> On 13.11.2020 11:36, Julien Grall wrote:
>>>> On 13/11/2020 10:25, Jan Beulich wrote:
>>>>> On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
>>>>>> On 11/12/20 4:46 PM, Roger Pau Monné wrote:
>>>>>>> On Thu, Nov 12, 2020 at 01:16:10PM +0000, Oleksandr Andrushchenko wrote:
>>>>>>>> On 11/12/20 11:40 AM, Roger Pau Monné wrote:
>>>>>>>>> On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
>>>>>>>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>>>>>>>>> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>>>> +                                  void *data)
>>>>>>>>>> +{
>>>>>>>>>> +    struct vpci_bar *vbar, *bar = data;
>>>>>>>>>> +
>>>>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>>>>> +        return bar_read_hwdom(pdev, reg, data);
>>>>>>>>>> +
>>>>>>>>>> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>>>>> +    if ( !vbar )
>>>>>>>>>> +        return ~0;
>>>>>>>>>> +
>>>>>>>>>> +    return bar_read_guest(pdev, reg, vbar);
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>>>> +                               uint32_t val, void *data)
>>>>>>>>>> +{
>>>>>>>>>> +    struct vpci_bar *bar = data;
>>>>>>>>>> +
>>>>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>>>>> +        bar_write_hwdom(pdev, reg, val, data);
>>>>>>>>>> +    else
>>>>>>>>>> +    {
>>>>>>>>>> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>>>>> +
>>>>>>>>>> +        if ( !vbar )
>>>>>>>>>> +            return;
>>>>>>>>>> +        bar_write_guest(pdev, reg, val, vbar);
>>>>>>>>>> +    }
>>>>>>>>>> +}
>>>>>>>>> You should assign different handlers based on whether the domain that
>>>>>>>>> has the device assigned is a domU or the hardware domain, rather than
>>>>>>>>> doing the selection here.
>>>>>>>> Hm, handlers are assigned once in init_bars and this function is only called
>>>>>>>>
>>>>>>>> for hwdom, so there is no way I can do that for the guests. Hence, the dispatcher
>>>>>>> I think we might want to reset the vPCI handlers when a devices gets
>>>>>>> assigned and deassigned.
>>>>>>
>>>>>> In ARM case init_bars is called too early: PCI device assignment is currently
>>>>>>
>>>>>> initiated by Domain-0' kernel and is done *before* PCI devices are given memory
>>>>>>
>>>>>> ranges and BARs assigned:
>>>>>>
>>>>>> [    0.429514] pci_bus 0000:00: root bus resource [bus 00-ff]
>>>>>> [    0.429532] pci_bus 0000:00: root bus resource [io 0x0000-0xfffff]
>>>>>> [    0.429555] pci_bus 0000:00: root bus resource [mem 0xfe200000-0xfe3fffff]
>>>>>> [    0.429575] pci_bus 0000:00: root bus resource [mem 0x30000000-0x37ffffff]
>>>>>> [    0.429604] pci_bus 0000:00: root bus resource [mem 0x38000000-0x3fffffff pref]
>>>>>> [    0.429670] pci 0000:00:00.0: enabling Extended Tags
>>>>>> [    0.453764] pci 0000:00:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>>>>
>>>>>> < init_bars >
>>>>>>
>>>>>> [    0.453793] pci 0000:00:00.0: -- IRQ 0
>>>>>> [    0.458825] pci 0000:00:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>>>>> [    0.471790] pci 0000:01:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>>>>
>>>>>> < init_bars >
>>>>>>
>>>>>> [    0.471821] pci 0000:01:00.0: -- IRQ 255
>>>>>> [    0.476809] pci 0000:01:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>>>>>
>>>>>> < BAR assignments below >
>>>>>>
>>>>>> [    0.488233] pci 0000:00:00.0: BAR 14: assigned [mem 0xfe200000-0xfe2fffff]
>>>>>> [    0.488265] pci 0000:00:00.0: BAR 15: assigned [mem 0x38000000-0x380fffff pref]
>>>>>>
>>>>>> In case of x86 this is pretty much ok as BARs are already in place, but for ARM we
>>>>>>
>>>>>> need to take care and re-setup vPCI BARs for hwdom.
>>>>>
>>>>> Even on x86 there's no guarantee that all devices have their BARs set
>>>>> up by firmware.
>>>>>
>>>>> In a subsequent reply you've suggested to move init_bars from "add" to
>>>>> "assign", but I'm having trouble seeing what this would change: It's
>>>>> not Dom0 controlling assignment (to itself), but Xen assigns the device
>>>>> towards the end of pci_add_device().
>>>>>
>>>>>> Things are getting even more
>>>>>>
>>>>>> complicated if the host PCI bridge is not ECAM like, so you cannot set mmio_handlers
>>>>>>
>>>>>> and trap hwdom's access to the config space to update BARs etc. This is why I have that
>>>>>>
>>>>>> ugly hack for rcar_gen3 to read actual BARs for hwdom.
>>>>>
>>>>> How to config space accesses work there? The latest for MSI/MSI-X it'll
>>>>> be imperative that Xen be able to intercept config space writes.
>>>>
>>>> I am not sure to understand your last sentence. Are you saying that we
>>>> always need to trap access to MSI/MSI-X message in order to sanitize it?
>>>>
>>>> If one is using the GICv3 ITS (I haven't investigated other MSI
>>>> controller), then I don't believe you need to sanitize the MSI/MSI-X
>>>> message in most of the situation.
>>>
>>> Well, if it's fine for the guest to write arbitrary values to message
>>> address and message data,
>>
>> The message address would be the doorbell of the ITS that is usually
>> going through the IOMMU page-tables. Although, I am aware of a couple of
>> platforms where the doorbell access (among other address ranges
>> including P2P transaction) bypass the IOMMU. In this situation, we would
>> need a lot more work than just trapping the access.
> 
> When you say "The message address would be the doorbell of the ITS" am
> I right in understanding that's the designated address to be put there?
> What if the guest puts some random different address there?

My point was that all the accesses from a PCI device should go through
the IOMMU, although I know this may not be true for all platforms.

In that case, sanitizing the MSI message address is not going to help,
because a PCI device can DMA into memory ranges that bypass the IOMMU.

> 
>> Regarding the message data, for the ITS this is an event ID. The HW will
>> then tag each message with the device ID (this prevents spoofing). The
>> tupple (device ID, event ID) is used by the ITS to decide where to
>> inject the event.
>>
>> Whether other MSI controllers (e.g. GICv2m) have similar isolation
>> feature will be on the case by case basis.
>>
>>> _and_ to arbitrarily enable/disable MSI / MSI-X,
>>> then yes, no interception would be needed.
>> The device would be owned by the guest, so I am not sure to understand
>> the exact problem of letting it enabling/disabling MSI/MSI-X. Do you
>> mind expanding your thoughts?
> 
> Question is - is Xen involved in any way in the handling of interrupts
> from such a device? If not, then I guess full control can indeed be
> left with the guest.

Xen will only forward the interrupts to the guest. This is not very 
different to how other interrupts (e.g. SPIs) are dealt with.

So I don't see the problem of giving full control to the guest.

> 
>> Furthermore, you can also control which event is enabled/disabled at the
>> ITS level.
> 
> And that's something Xen controls?

Yes. We only expose a virtual ITS to the guest.

Cheers,
Oleksandr Andrushchenko Nov. 13, 2020, 12:41 p.m. UTC | #16
On 11/13/20 1:35 PM, Jan Beulich wrote:
> On 13.11.2020 12:02, Oleksandr Andrushchenko wrote:
>> On 11/13/20 12:50 PM, Jan Beulich wrote:
>>> On 13.11.2020 11:46, Oleksandr Andrushchenko wrote:
>>>> On 11/13/20 12:25 PM, Jan Beulich wrote:
>>>>> On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
>>>>>> On 11/12/20 4:46 PM, Roger Pau Monné wrote:
>>>>>>> On Thu, Nov 12, 2020 at 01:16:10PM +0000, Oleksandr Andrushchenko wrote:
>>>>>>>> On 11/12/20 11:40 AM, Roger Pau Monné wrote:
>>>>>>>>> On Mon, Nov 09, 2020 at 02:50:27PM +0200, Oleksandr Andrushchenko wrote:
>>>>>>>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>>>>>>>>> +static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>>>> +                                  void *data)
>>>>>>>>>> +{
>>>>>>>>>> +    struct vpci_bar *vbar, *bar = data;
>>>>>>>>>> +
>>>>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>>>>> +        return bar_read_hwdom(pdev, reg, data);
>>>>>>>>>> +
>>>>>>>>>> +    vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>>>>> +    if ( !vbar )
>>>>>>>>>> +        return ~0;
>>>>>>>>>> +
>>>>>>>>>> +    return bar_read_guest(pdev, reg, vbar);
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>>>> +                               uint32_t val, void *data)
>>>>>>>>>> +{
>>>>>>>>>> +    struct vpci_bar *bar = data;
>>>>>>>>>> +
>>>>>>>>>> +    if ( is_hardware_domain(current->domain) )
>>>>>>>>>> +        bar_write_hwdom(pdev, reg, val, data);
>>>>>>>>>> +    else
>>>>>>>>>> +    {
>>>>>>>>>> +        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
>>>>>>>>>> +
>>>>>>>>>> +        if ( !vbar )
>>>>>>>>>> +            return;
>>>>>>>>>> +        bar_write_guest(pdev, reg, val, vbar);
>>>>>>>>>> +    }
>>>>>>>>>> +}
>>>>>>>>> You should assign different handlers based on whether the domain that
>>>>>>>>> has the device assigned is a domU or the hardware domain, rather than
>>>>>>>>> doing the selection here.
>>>>>>>> Hm, handlers are assigned once in init_bars and this function is only called
>>>>>>>>
>>>>>>>> for hwdom, so there is no way I can do that for the guests. Hence, the dispatcher
>>>>>>> I think we might want to reset the vPCI handlers when a devices gets
>>>>>>> assigned and deassigned.
>>>>>> In ARM case init_bars is called too early: PCI device assignment is currently
>>>>>>
>>>>>> initiated by Domain-0' kernel and is done *before* PCI devices are given memory
>>>>>>
>>>>>> ranges and BARs assigned:
>>>>>>
>>>>>> [    0.429514] pci_bus 0000:00: root bus resource [bus 00-ff]
>>>>>> [    0.429532] pci_bus 0000:00: root bus resource [io 0x0000-0xfffff]
>>>>>> [    0.429555] pci_bus 0000:00: root bus resource [mem 0xfe200000-0xfe3fffff]
>>>>>> [    0.429575] pci_bus 0000:00: root bus resource [mem 0x30000000-0x37ffffff]
>>>>>> [    0.429604] pci_bus 0000:00: root bus resource [mem 0x38000000-0x3fffffff pref]
>>>>>> [    0.429670] pci 0000:00:00.0: enabling Extended Tags
>>>>>> [    0.453764] pci 0000:00:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>>>>
>>>>>> < init_bars >
>>>>>>
>>>>>> [    0.453793] pci 0000:00:00.0: -- IRQ 0
>>>>>> [    0.458825] pci 0000:00:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>>>>> [    0.471790] pci 0000:01:00.0: -------------------- BUS_NOTIFY_ADD_DEVICE
>>>>>>
>>>>>> < init_bars >
>>>>>>
>>>>>> [    0.471821] pci 0000:01:00.0: -- IRQ 255
>>>>>> [    0.476809] pci 0000:01:00.0: Failed to add - passthrough or MSI/MSI-X might fail!
>>>>>>
>>>>>> < BAR assignments below >
>>>>>>
>>>>>> [    0.488233] pci 0000:00:00.0: BAR 14: assigned [mem 0xfe200000-0xfe2fffff]
>>>>>> [    0.488265] pci 0000:00:00.0: BAR 15: assigned [mem 0x38000000-0x380fffff pref]
>>>>>>
>>>>>> In case of x86 this is pretty much ok as BARs are already in place, but for ARM we
>>>>>>
>>>>>> need to take care and re-setup vPCI BARs for hwdom.
>>>>> Even on x86 there's no guarantee that all devices have their BARs set
>>>>> up by firmware.
>>>> This is true. But there you could have config space trapped in "x86 generic way",
>>>>
>>>> please correct me if I'm wrong here
>>>>
>>>>> In a subsequent reply you've suggested to move init_bars from "add" to
>>>>> "assign", but I'm having trouble seeing what this would change: It's
>>>>> not Dom0 controlling assignment (to itself), but Xen assigns the device
>>>>> towards the end of pci_add_device().
>>>> PHYSDEVOP_pci_device_add vs XEN_DOMCTL_assign_device
>>>>
>>>> Currently we initialize BARs during PHYSDEVOP_pci_device_add and
>>>>
>>>> if we do that during XEN_DOMCTL_assign_device things seem to change
>>> But there can't possibly be any XEN_DOMCTL_assign_device involved in
>>> booting of Dom0.
>> Indeed. So, do you have an idea when we should call init_bars suitable
>>
>> for both ARM and x86?
>>
>> Another question is: what happens bad if x86 and ARM won't call init_bars
>>
>> until the moment we really assign a PCI device to the first guest?
> I'd like to answer the latter question first: How would Dom0 use
> the device prior to such an assignment? As an implication to the
> presumed answer here, I guess init_bars could be deferred up until
> the first time Dom0 (or more generally the possessing domain)
> accesses any of them. Similarly, devices used by Xen itself could
> have this done immediately before first use. This may require
> tracking on a per-device basis whether initialization was done.
Ok, I'll try to look into it
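
Something like the minimal sketch below is what I understand you mean (the
bars_initialized flag is an assumed new field, and init_bars() is static in
xen/drivers/vpci/header.c today, so this is illustration only):

/*
 * Sketch only: run BAR initialization on the first config space access by
 * the possessing domain instead of at PHYSDEVOP_pci_device_add time.
 * vpci->bars_initialized is an assumed new field; init_bars() would need
 * to be reachable from here.
 */
static void ensure_bars_initialized(struct pci_dev *pdev)
{
    if ( pdev->vpci->bars_initialized )
        return;

    if ( !init_bars(pdev) )
        pdev->vpci->bars_initialized = true;
}
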
>
>>>>>>>      In order to do passthrough to domUs safely
>>>>>>> we will have to add more handlers than what's required for dom0,
>>>>>> Can you please tell what are thinking about? What are the missing handlers?
>>>>>>>      and
>>>>>>> having is_hardware_domain sprinkled in all of them is not a suitable
>>>>>>> solution.
>>>>>> I'll try to replace is_hardware_domain with something like:
>>>>>>
>>>>>> +/*
>>>>>> + * Detect whether physical PCI devices in this segment belong
>>>>>> + * to the domain given, e.g. on x86 all PCI devices live in hwdom,
>>>>>> + * but in case of ARM this might not be the case: those may also
>>>>>> + * live in driver domains or even Xen itself.
>>>>>> + */
>>>>>> +bool pci_is_hardware_domain(struct domain *d, u16 seg)
>>>>>> +{
>>>>>> +#ifdef CONFIG_X86
>>>>>> +    return is_hardware_domain(d);
>>>>>> +#elif CONFIG_ARM
>>>>>> +    return pci_is_owner_domain(d, seg);
>>>>>> +#else
>>>>>> +#error "Unsupported architecture"
>>>>>> +#endif
>>>>>> +}
>>>>>> +
>>>>>> +/*
>>>>>> + * Get domain which owns this segment: for x86 this is always hardware
>>>>>> + * domain and for ARM this can be different.
>>>>>> + */
>>>>>> +struct domain *pci_get_hardware_domain(u16 seg)
>>>>>> +{
>>>>>> +#ifdef CONFIG_X86
>>>>>> +    return hardware_domain;
>>>>>> +#elif CONFIG_ARM
>>>>>> +    return pci_get_owner_domain(seg);
>>>>>> +#else
>>>>>> +#error "Unsupported architecture"
>>>>>> +#endif
>>>>>> +}
>>>>>>
>>>>>> This is what I use to properly detect the domain that really owns physical host bridge
>>>>> I consider this problematic. We should try to not let Arm's and x86'es
>>>>> PCI implementations diverge too much, i.e. at least the underlying basic
>>>>> model would better be similar. For example, if entire segments can be
>>>>> assigned to a driver domain on Arm, why should the same not be possible
>>>>> on x86?
>>>> Good question, probably in this case x86 == ARM and I can use
>>>>
>>>> pci_is_owner_domain for both architectures instead of using is_hardware_domain for x86
>>>>
>>>>> Furthermore I'm suspicious about segments being the right granularity
>>>>> here. On ia64 multiple host bridges could (and typically would) live
>>>>> on segment 0. Iirc I had once seen output from an x86 system which was
>>>>> apparently laid out similarly. Therefore, just like for MCFG, I think
>>>>> the granularity wants to be bus ranges within a segment.
>>>> Can you please suggest something we can use as a hint for such a detection logic?
>>> The underlying information comes from ACPI tables, iirc. I don't
>>> recall the details, though - sorry.
>> Ok, so seg + bus should be enough for both ARM and Xen then, right?
>>
>> pci_get_hardware_domain(u16 seg, u8 bus)
> Whether an individual bus number can suitable express things I can't
> tell; I did say bus range, but of course if you care about just
> individual devices, then a single bus number will of course do.

I can implement the lookup of whether a PCI host bridge is owned by a
particular domain with something like:

struct pci_host_bridge *bridge = pci_find_host_bridge(seg, bus);

return bridge->dt_node->used_by == d->domain_id;
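
Or, wrapped up as a helper with a NULL check - a sketch only;
pci_find_host_bridge() and the dt_node->used_by field come from the ARM
parts of this series, not from existing common code:

/*
 * Sketch only: check whether the host bridge serving seg:bus is owned by d.
 * pci_find_host_bridge() and dt_node->used_by are assumptions taken from
 * the ARM patches of this series.
 */
static bool pci_bridge_owned_by(uint16_t seg, uint8_t bus,
                                const struct domain *d)
{
    const struct pci_host_bridge *bridge = pci_find_host_bridge(seg, bus);

    return bridge && bridge->dt_node->used_by == d->domain_id;
}
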

Could you please give me a hint how this can be done on x86?

>
> Jan

Thank you,

Oleksandr
Jan Beulich Nov. 13, 2020, 2:23 p.m. UTC | #17
On 13.11.2020 13:41, Oleksandr Andrushchenko wrote:
> 
> On 11/13/20 1:35 PM, Jan Beulich wrote:
>> On 13.11.2020 12:02, Oleksandr Andrushchenko wrote:
>>> On 11/13/20 12:50 PM, Jan Beulich wrote:
>>>> On 13.11.2020 11:46, Oleksandr Andrushchenko wrote:
>>>>> On 11/13/20 12:25 PM, Jan Beulich wrote:
>>>>>> On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
>>>>>>> I'll try to replace is_hardware_domain with something like:
>>>>>>>
>>>>>>> +/*
>>>>>>> + * Detect whether physical PCI devices in this segment belong
>>>>>>> + * to the domain given, e.g. on x86 all PCI devices live in hwdom,
>>>>>>> + * but in case of ARM this might not be the case: those may also
>>>>>>> + * live in driver domains or even Xen itself.
>>>>>>> + */
>>>>>>> +bool pci_is_hardware_domain(struct domain *d, u16 seg)
>>>>>>> +{
>>>>>>> +#ifdef CONFIG_X86
>>>>>>> +    return is_hardware_domain(d);
>>>>>>> +#elif CONFIG_ARM
>>>>>>> +    return pci_is_owner_domain(d, seg);
>>>>>>> +#else
>>>>>>> +#error "Unsupported architecture"
>>>>>>> +#endif
>>>>>>> +}
>>>>>>> +
>>>>>>> +/*
>>>>>>> + * Get domain which owns this segment: for x86 this is always hardware
>>>>>>> + * domain and for ARM this can be different.
>>>>>>> + */
>>>>>>> +struct domain *pci_get_hardware_domain(u16 seg)
>>>>>>> +{
>>>>>>> +#ifdef CONFIG_X86
>>>>>>> +    return hardware_domain;
>>>>>>> +#elif CONFIG_ARM
>>>>>>> +    return pci_get_owner_domain(seg);
>>>>>>> +#else
>>>>>>> +#error "Unsupported architecture"
>>>>>>> +#endif
>>>>>>> +}
>>>>>>>
>>>>>>> This is what I use to properly detect the domain that really owns physical host bridge
>>>>>> I consider this problematic. We should try to not let Arm's and x86'es
>>>>>> PCI implementations diverge too much, i.e. at least the underlying basic
>>>>>> model would better be similar. For example, if entire segments can be
>>>>>> assigned to a driver domain on Arm, why should the same not be possible
>>>>>> on x86?
>>>>> Good question, probably in this case x86 == ARM and I can use
>>>>>
>>>>> pci_is_owner_domain for both architectures instead of using is_hardware_domain for x86
>>>>>
>>>>>> Furthermore I'm suspicious about segments being the right granularity
>>>>>> here. On ia64 multiple host bridges could (and typically would) live
>>>>>> on segment 0. Iirc I had once seen output from an x86 system which was
>>>>>> apparently laid out similarly. Therefore, just like for MCFG, I think
>>>>>> the granularity wants to be bus ranges within a segment.
>>>>> Can you please suggest something we can use as a hint for such a detection logic?
>>>> The underlying information comes from ACPI tables, iirc. I don't
>>>> recall the details, though - sorry.
>>> Ok, so seg + bus should be enough for both ARM and Xen then, right?
>>>
>>> pci_get_hardware_domain(u16 seg, u8 bus)
>> Whether an individual bus number can suitable express things I can't
>> tell; I did say bus range, but of course if you care about just
>> individual devices, then a single bus number will of course do.
> 
> I can implement the lookup whether a PCI host bridge owned by a particular
> 
> domain with something like:
> 
> struct pci_host_bridge *bridge = pci_find_host_bridge(seg, bus);
> 
> return bridge->dt_node->used_by == d->domain_id;
> 
> Could you please give me a hint how this can be done on x86?

Bridges can't be assigned to other than the hardware domain right
now. Earlier on I didn't say you should get this to work, only
that I think the general logic around what you add shouldn't make
things more arch specific than they really should be. That said,
something similar to the above should still be doable on x86,
utilizing struct pci_seg's bus2bridge[]. There ought to be
DEV_TYPE_PCI_HOST_BRIDGE entries there, albeit a number of them
(provided by the CPUs themselves rather than the chipset) aren't
really host bridges for the purposes you're after.

Jan
Oleksandr Andrushchenko Nov. 13, 2020, 2:32 p.m. UTC | #18
On 11/13/20 4:23 PM, Jan Beulich wrote:
> On 13.11.2020 13:41, Oleksandr Andrushchenko wrote:
>> On 11/13/20 1:35 PM, Jan Beulich wrote:
>>> On 13.11.2020 12:02, Oleksandr Andrushchenko wrote:
>>>> On 11/13/20 12:50 PM, Jan Beulich wrote:
>>>>> On 13.11.2020 11:46, Oleksandr Andrushchenko wrote:
>>>>>> On 11/13/20 12:25 PM, Jan Beulich wrote:
>>>>>>> On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
>>>>>>>> I'll try to replace is_hardware_domain with something like:
>>>>>>>>
>>>>>>>> +/*
>>>>>>>> + * Detect whether physical PCI devices in this segment belong
>>>>>>>> + * to the domain given, e.g. on x86 all PCI devices live in hwdom,
>>>>>>>> + * but in case of ARM this might not be the case: those may also
>>>>>>>> + * live in driver domains or even Xen itself.
>>>>>>>> + */
>>>>>>>> +bool pci_is_hardware_domain(struct domain *d, u16 seg)
>>>>>>>> +{
>>>>>>>> +#ifdef CONFIG_X86
>>>>>>>> +    return is_hardware_domain(d);
>>>>>>>> +#elif CONFIG_ARM
>>>>>>>> +    return pci_is_owner_domain(d, seg);
>>>>>>>> +#else
>>>>>>>> +#error "Unsupported architecture"
>>>>>>>> +#endif
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +/*
>>>>>>>> + * Get domain which owns this segment: for x86 this is always hardware
>>>>>>>> + * domain and for ARM this can be different.
>>>>>>>> + */
>>>>>>>> +struct domain *pci_get_hardware_domain(u16 seg)
>>>>>>>> +{
>>>>>>>> +#ifdef CONFIG_X86
>>>>>>>> +    return hardware_domain;
>>>>>>>> +#elif CONFIG_ARM
>>>>>>>> +    return pci_get_owner_domain(seg);
>>>>>>>> +#else
>>>>>>>> +#error "Unsupported architecture"
>>>>>>>> +#endif
>>>>>>>> +}
>>>>>>>>
>>>>>>>> This is what I use to properly detect the domain that really owns physical host bridge
>>>>>>> I consider this problematic. We should try to not let Arm's and x86'es
>>>>>>> PCI implementations diverge too much, i.e. at least the underlying basic
>>>>>>> model would better be similar. For example, if entire segments can be
>>>>>>> assigned to a driver domain on Arm, why should the same not be possible
>>>>>>> on x86?
>>>>>> Good question, probably in this case x86 == ARM and I can use
>>>>>>
>>>>>> pci_is_owner_domain for both architectures instead of using is_hardware_domain for x86
>>>>>>
>>>>>>> Furthermore I'm suspicious about segments being the right granularity
>>>>>>> here. On ia64 multiple host bridges could (and typically would) live
>>>>>>> on segment 0. Iirc I had once seen output from an x86 system which was
>>>>>>> apparently laid out similarly. Therefore, just like for MCFG, I think
>>>>>>> the granularity wants to be bus ranges within a segment.
>>>>>> Can you please suggest something we can use as a hint for such a detection logic?
>>>>> The underlying information comes from ACPI tables, iirc. I don't
>>>>> recall the details, though - sorry.
>>>> Ok, so seg + bus should be enough for both ARM and Xen then, right?
>>>>
>>>> pci_get_hardware_domain(u16 seg, u8 bus)
>>> Whether an individual bus number can suitable express things I can't
>>> tell; I did say bus range, but of course if you care about just
>>> individual devices, then a single bus number will of course do.
>> I can implement the lookup whether a PCI host bridge owned by a particular
>>
>> domain with something like:
>>
>> struct pci_host_bridge *bridge = pci_find_host_bridge(seg, bus);
>>
>> return bridge->dt_node->used_by == d->domain_id;
>>
>> Could you please give me a hint how this can be done on x86?
> Bridges can't be assigned to other than the hardware domain right
> now.

So, I can probably then have pci_get_hardware_domain implemented by both ARM
and x86 in their arch-specific code. And for x86, for now, it can simply be a
wrapper for is_hardware_domain?

>   Earlier on I didn't say you should get this to work, only
> that I think the general logic around what you add shouldn't make
> things more arch specific than they really should be. That said,
> something similar to the above should still be doable on x86,
> utilizing struct pci_seg's bus2bridge[]. There ought to be
> DEV_TYPE_PCI_HOST_BRIDGE entries there, albeit a number of them
> (provided by the CPUs themselves rather than the chipset) aren't
> really host bridges for the purposes you're after.

You mean I can still use DEV_TYPE_PCI_HOST_BRIDGE as a marker while trying to
detect what I need?

>
> Jan

Thank you,

Oleksandr
Jan Beulich Nov. 13, 2020, 2:38 p.m. UTC | #19
On 13.11.2020 15:32, Oleksandr Andrushchenko wrote:
> 
> On 11/13/20 4:23 PM, Jan Beulich wrote:
>> On 13.11.2020 13:41, Oleksandr Andrushchenko wrote:
>>> On 11/13/20 1:35 PM, Jan Beulich wrote:
>>>> On 13.11.2020 12:02, Oleksandr Andrushchenko wrote:
>>>>> On 11/13/20 12:50 PM, Jan Beulich wrote:
>>>>>> On 13.11.2020 11:46, Oleksandr Andrushchenko wrote:
>>>>>>> On 11/13/20 12:25 PM, Jan Beulich wrote:
>>>>>>>> On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
>>>>>>>>> I'll try to replace is_hardware_domain with something like:
>>>>>>>>>
>>>>>>>>> +/*
>>>>>>>>> + * Detect whether physical PCI devices in this segment belong
>>>>>>>>> + * to the domain given, e.g. on x86 all PCI devices live in hwdom,
>>>>>>>>> + * but in case of ARM this might not be the case: those may also
>>>>>>>>> + * live in driver domains or even Xen itself.
>>>>>>>>> + */
>>>>>>>>> +bool pci_is_hardware_domain(struct domain *d, u16 seg)
>>>>>>>>> +{
>>>>>>>>> +#ifdef CONFIG_X86
>>>>>>>>> +    return is_hardware_domain(d);
>>>>>>>>> +#elif CONFIG_ARM
>>>>>>>>> +    return pci_is_owner_domain(d, seg);
>>>>>>>>> +#else
>>>>>>>>> +#error "Unsupported architecture"
>>>>>>>>> +#endif
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +/*
>>>>>>>>> + * Get domain which owns this segment: for x86 this is always hardware
>>>>>>>>> + * domain and for ARM this can be different.
>>>>>>>>> + */
>>>>>>>>> +struct domain *pci_get_hardware_domain(u16 seg)
>>>>>>>>> +{
>>>>>>>>> +#ifdef CONFIG_X86
>>>>>>>>> +    return hardware_domain;
>>>>>>>>> +#elif CONFIG_ARM
>>>>>>>>> +    return pci_get_owner_domain(seg);
>>>>>>>>> +#else
>>>>>>>>> +#error "Unsupported architecture"
>>>>>>>>> +#endif
>>>>>>>>> +}
>>>>>>>>>
>>>>>>>>> This is what I use to properly detect the domain that really owns physical host bridge
>>>>>>>> I consider this problematic. We should try to not let Arm's and x86'es
>>>>>>>> PCI implementations diverge too much, i.e. at least the underlying basic
>>>>>>>> model would better be similar. For example, if entire segments can be
>>>>>>>> assigned to a driver domain on Arm, why should the same not be possible
>>>>>>>> on x86?
>>>>>>> Good question, probably in this case x86 == ARM and I can use
>>>>>>>
>>>>>>> pci_is_owner_domain for both architectures instead of using is_hardware_domain for x86
>>>>>>>
>>>>>>>> Furthermore I'm suspicious about segments being the right granularity
>>>>>>>> here. On ia64 multiple host bridges could (and typically would) live
>>>>>>>> on segment 0. Iirc I had once seen output from an x86 system which was
>>>>>>>> apparently laid out similarly. Therefore, just like for MCFG, I think
>>>>>>>> the granularity wants to be bus ranges within a segment.
>>>>>>> Can you please suggest something we can use as a hint for such a detection logic?
>>>>>> The underlying information comes from ACPI tables, iirc. I don't
>>>>>> recall the details, though - sorry.
>>>>> Ok, so seg + bus should be enough for both ARM and Xen then, right?
>>>>>
>>>>> pci_get_hardware_domain(u16 seg, u8 bus)
>>>> Whether an individual bus number can suitable express things I can't
>>>> tell; I did say bus range, but of course if you care about just
>>>> individual devices, then a single bus number will of course do.
>>> I can implement the lookup whether a PCI host bridge owned by a particular
>>>
>>> domain with something like:
>>>
>>> struct pci_host_bridge *bridge = pci_find_host_bridge(seg, bus);
>>>
>>> return bridge->dt_node->used_by == d->domain_id;
>>>
>>> Could you please give me a hint how this can be done on x86?
>> Bridges can't be assigned to other than the hardware domain right
>> now.
> 
> So, I can probably then have pci_get_hardware_domain implemented
> 
> by both ARM and x86 in their arch specific code. And for x86 for now
> 
> it can simply be a wrapper for is_hardware_domain?

"get" can't be a wrapper for "is", but I think I get what you're
saying. But no, preferably I would not see you do this (hence my
earlier comment). I still think you could use the owner of the
respective device; I may be mistaken, but iirc we do assign
bridges to Dom0, so deriving from that would be better than
hardwiring to is_hardware_domain().

>>   Earlier on I didn't say you should get this to work, only
>> that I think the general logic around what you add shouldn't make
>> things more arch specific than they really should be. That said,
>> something similar to the above should still be doable on x86,
>> utilizing struct pci_seg's bus2bridge[]. There ought to be
>> DEV_TYPE_PCI_HOST_BRIDGE entries there, albeit a number of them
>> (provided by the CPUs themselves rather than the chipset) aren't
>> really host bridges for the purposes you're after.
> 
> You mean I can still use DEV_TYPE_PCI_HOST_BRIDGE as a marker
> 
> while trying to detect what I need?

I'm afraid I don't understand what marker you're thinking about
here.

Jan
Oleksandr Andrushchenko Nov. 13, 2020, 2:44 p.m. UTC | #20
On 11/13/20 4:38 PM, Jan Beulich wrote:
> On 13.11.2020 15:32, Oleksandr Andrushchenko wrote:
>> On 11/13/20 4:23 PM, Jan Beulich wrote:
>>> On 13.11.2020 13:41, Oleksandr Andrushchenko wrote:
>>>> On 11/13/20 1:35 PM, Jan Beulich wrote:
>>>>> On 13.11.2020 12:02, Oleksandr Andrushchenko wrote:
>>>>>> On 11/13/20 12:50 PM, Jan Beulich wrote:
>>>>>>> On 13.11.2020 11:46, Oleksandr Andrushchenko wrote:
>>>>>>>> On 11/13/20 12:25 PM, Jan Beulich wrote:
>>>>>>>>> On 13.11.2020 07:32, Oleksandr Andrushchenko wrote:
>>>>>>>>>> I'll try to replace is_hardware_domain with something like:
>>>>>>>>>>
>>>>>>>>>> +/*
>>>>>>>>>> + * Detect whether physical PCI devices in this segment belong
>>>>>>>>>> + * to the domain given, e.g. on x86 all PCI devices live in hwdom,
>>>>>>>>>> + * but in case of ARM this might not be the case: those may also
>>>>>>>>>> + * live in driver domains or even Xen itself.
>>>>>>>>>> + */
>>>>>>>>>> +bool pci_is_hardware_domain(struct domain *d, u16 seg)
>>>>>>>>>> +{
>>>>>>>>>> +#ifdef CONFIG_X86
>>>>>>>>>> +    return is_hardware_domain(d);
>>>>>>>>>> +#elif CONFIG_ARM
>>>>>>>>>> +    return pci_is_owner_domain(d, seg);
>>>>>>>>>> +#else
>>>>>>>>>> +#error "Unsupported architecture"
>>>>>>>>>> +#endif
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +/*
>>>>>>>>>> + * Get domain which owns this segment: for x86 this is always hardware
>>>>>>>>>> + * domain and for ARM this can be different.
>>>>>>>>>> + */
>>>>>>>>>> +struct domain *pci_get_hardware_domain(u16 seg)
>>>>>>>>>> +{
>>>>>>>>>> +#ifdef CONFIG_X86
>>>>>>>>>> +    return hardware_domain;
>>>>>>>>>> +#elif CONFIG_ARM
>>>>>>>>>> +    return pci_get_owner_domain(seg);
>>>>>>>>>> +#else
>>>>>>>>>> +#error "Unsupported architecture"
>>>>>>>>>> +#endif
>>>>>>>>>> +}
>>>>>>>>>>
>>>>>>>>>> This is what I use to properly detect the domain that really owns physical host bridge
>>>>>>>>> I consider this problematic. We should try to not let Arm's and x86'es
>>>>>>>>> PCI implementations diverge too much, i.e. at least the underlying basic
>>>>>>>>> model would better be similar. For example, if entire segments can be
>>>>>>>>> assigned to a driver domain on Arm, why should the same not be possible
>>>>>>>>> on x86?
>>>>>>>> Good question, probably in this case x86 == ARM and I can use
>>>>>>>>
>>>>>>>> pci_is_owner_domain for both architectures instead of using is_hardware_domain for x86
>>>>>>>>
>>>>>>>>> Furthermore I'm suspicious about segments being the right granularity
>>>>>>>>> here. On ia64 multiple host bridges could (and typically would) live
>>>>>>>>> on segment 0. Iirc I had once seen output from an x86 system which was
>>>>>>>>> apparently laid out similarly. Therefore, just like for MCFG, I think
>>>>>>>>> the granularity wants to be bus ranges within a segment.
>>>>>>>> Can you please suggest something we can use as a hint for such a detection logic?
>>>>>>> The underlying information comes from ACPI tables, iirc. I don't
>>>>>>> recall the details, though - sorry.
>>>>>> Ok, so seg + bus should be enough for both ARM and Xen then, right?
>>>>>>
>>>>>> pci_get_hardware_domain(u16 seg, u8 bus)
>>>>> Whether an individual bus number can suitable express things I can't
>>>>> tell; I did say bus range, but of course if you care about just
>>>>> individual devices, then a single bus number will of course do.
>>>> I can implement the lookup whether a PCI host bridge owned by a particular
>>>>
>>>> domain with something like:
>>>>
>>>> struct pci_host_bridge *bridge = pci_find_host_bridge(seg, bus);
>>>>
>>>> return bridge->dt_node->used_by == d->domain_id;
>>>>
>>>> Could you please give me a hint how this can be done on x86?
>>> Bridges can't be assigned to other than the hardware domain right
>>> now.
>> So, I can probably then have pci_get_hardware_domain implemented
>>
>> by both ARM and x86 in their arch specific code. And for x86 for now
>>
>> it can simply be a wrapper for is_hardware_domain?
> "get" can't be a wrapper for "is"
Sure, s/get/is
> , but I think I get what you're
> saying. But no, preferably I would not see you do this (hence my
> earlier comment). I still think you could use the owner of the
> respective device; I may be mistaken, but iirc we do assign
> bridges to Dom0, so deriving from that would be better than
> hardwiring to is_hardware_domain().
Ok, I'll try to figure out how to do that
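
Something along these lines, I guess - a sketch only, deriving the owner from
the bridge's struct pci_dev as you suggest; how the bridge's SBDF is found is
left out, and pcidevs_lock() is assumed to be held by the caller:

/*
 * Sketch only: use the domain owning the bridge's struct pci_dev instead of
 * hardwiring is_hardware_domain(). Assumes the bridge's SBDF is already
 * known to the caller.
 */
static const struct domain *bridge_owner(uint16_t seg, uint8_t bus,
                                         uint8_t devfn)
{
    const struct pci_dev *bridge = pci_get_pdev(seg, bus, devfn);

    return bridge ? bridge->domain : NULL;
}
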
>
>>>    Earlier on I didn't say you should get this to work, only
>>> that I think the general logic around what you add shouldn't make
>>> things more arch specific than they really should be. That said,
>>> something similar to the above should still be doable on x86,
>>> utilizing struct pci_seg's bus2bridge[]. There ought to be
>>> DEV_TYPE_PCI_HOST_BRIDGE entries there, albeit a number of them
>>> (provided by the CPUs themselves rather than the chipset) aren't
>>> really host bridges for the purposes you're after.
>> You mean I can still use DEV_TYPE_PCI_HOST_BRIDGE as a marker
>>
>> while trying to detect what I need?
> I'm afraid I don't understand what marker you're thinking about
> here.

I mean that when I go over bus2bridge entries, should I only work with those
that have DEV_TYPE_PCI_HOST_BRIDGE?

>
> Jan
Jan Beulich Nov. 13, 2020, 2:51 p.m. UTC | #21
On 13.11.2020 15:44, Oleksandr Andrushchenko wrote:
> 
> On 11/13/20 4:38 PM, Jan Beulich wrote:
>> On 13.11.2020 15:32, Oleksandr Andrushchenko wrote:
>>> On 11/13/20 4:23 PM, Jan Beulich wrote:
>>>>    Earlier on I didn't say you should get this to work, only
>>>> that I think the general logic around what you add shouldn't make
>>>> things more arch specific than they really should be. That said,
>>>> something similar to the above should still be doable on x86,
>>>> utilizing struct pci_seg's bus2bridge[]. There ought to be
>>>> DEV_TYPE_PCI_HOST_BRIDGE entries there, albeit a number of them
>>>> (provided by the CPUs themselves rather than the chipset) aren't
>>>> really host bridges for the purposes you're after.
>>> You mean I can still use DEV_TYPE_PCI_HOST_BRIDGE as a marker
>>>
>>> while trying to detect what I need?
>> I'm afraid I don't understand what marker you're thinking about
>> here.
> 
> I mean that when I go over bus2bridge entries, should I only work with
> 
> those who have DEV_TYPE_PCI_HOST_BRIDGE?

Well, if you're after host bridges - yes.

Jan
Oleksandr Andrushchenko Nov. 13, 2020, 2:52 p.m. UTC | #22
On 11/13/20 4:51 PM, Jan Beulich wrote:
> On 13.11.2020 15:44, Oleksandr Andrushchenko wrote:
>> On 11/13/20 4:38 PM, Jan Beulich wrote:
>>> On 13.11.2020 15:32, Oleksandr Andrushchenko wrote:
>>>> On 11/13/20 4:23 PM, Jan Beulich wrote:
>>>>>     Earlier on I didn't say you should get this to work, only
>>>>> that I think the general logic around what you add shouldn't make
>>>>> things more arch specific than they really should be. That said,
>>>>> something similar to the above should still be doable on x86,
>>>>> utilizing struct pci_seg's bus2bridge[]. There ought to be
>>>>> DEV_TYPE_PCI_HOST_BRIDGE entries there, albeit a number of them
>>>>> (provided by the CPUs themselves rather than the chipset) aren't
>>>>> really host bridges for the purposes you're after.
>>>> You mean I can still use DEV_TYPE_PCI_HOST_BRIDGE as a marker
>>>>
>>>> while trying to detect what I need?
>>> I'm afraid I don't understand what marker you're thinking about
>>> here.
>> I mean that when I go over bus2bridge entries, should I only work with
>>
>> those who have DEV_TYPE_PCI_HOST_BRIDGE?
> Well, if you're after host bridges - yes.
Ok, I'll try to see what I can do about it.
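
Roughly what I have in mind - a sketch only; the bus2bridge[] field names are
as in struct pci_seg in xen/drivers/passthrough/pci.c, get_pseg() is static
there (so this would have to live in that file), and locking is ignored for
brevity:

/*
 * Sketch only: look up the upstream bridge recorded for seg:bus in
 * bus2bridge[] and only accept DEV_TYPE_PCI_HOST_BRIDGE entries. This
 * relies on alloc_pdev() recording host bridges there in the first place.
 */
static const struct pci_dev *try_find_host_bridge(uint16_t seg, uint8_t bus)
{
    const struct pci_seg *pseg = get_pseg(seg);
    const struct pci_dev *bridge;

    if ( !pseg || !pseg->bus2bridge[bus].map )
        return NULL;

    bridge = pci_get_pdev(seg, pseg->bus2bridge[bus].bus,
                          pseg->bus2bridge[bus].devfn);

    return (bridge && bridge->type == DEV_TYPE_PCI_HOST_BRIDGE) ? bridge : NULL;
}
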
>
> Jan

Thank you,

Oleksandr
Oleksandr Andrushchenko Dec. 4, 2020, 2:38 p.m. UTC | #23
Hi, Jan!

On 11/13/20 4:51 PM, Jan Beulich wrote:
> On 13.11.2020 15:44, Oleksandr Andrushchenko wrote:
>> On 11/13/20 4:38 PM, Jan Beulich wrote:
>>> On 13.11.2020 15:32, Oleksandr Andrushchenko wrote:
>>>> On 11/13/20 4:23 PM, Jan Beulich wrote:
>>>>>     Earlier on I didn't say you should get this to work, only
>>>>> that I think the general logic around what you add shouldn't make
>>>>> things more arch specific than they really should be. That said,
>>>>> something similar to the above should still be doable on x86,
>>>>> utilizing struct pci_seg's bus2bridge[]. There ought to be
>>>>> DEV_TYPE_PCI_HOST_BRIDGE entries there, albeit a number of them
>>>>> (provided by the CPUs themselves rather than the chipset) aren't
>>>>> really host bridges for the purposes you're after.
>>>> You mean I can still use DEV_TYPE_PCI_HOST_BRIDGE as a marker
>>>>
>>>> while trying to detect what I need?
>>> I'm afraid I don't understand what marker you're thinking about
>>> here.
>> I mean that when I go over bus2bridge entries, should I only work with
>>
>> those who have DEV_TYPE_PCI_HOST_BRIDGE?
> Well, if you're after host bridges - yes.
>
> Jan

So, I started looking at bus2bridge and whether it can be used for both x86
(and possibly ARM), and I have the impression that its original purpose was to
identify the devices the x86 IOMMU should cover: e.g. looking at the
find_upstream_bridge users, they are the x86 IOMMU and the VGA driver.

I tried to play with this on ARM, and for the HW platform I have and for QEMU
I got 0 entries in bus2bridge... This is because of how
xen/drivers/passthrough/pci.c:alloc_pdev is implemented (which lives in the
common code, but seems to be x86-specific): it skips setting up bus2bridge
entries for the bridges I have, which are DEV_TYPE_PCIe_BRIDGE and
DEV_TYPE_PCI_HOST_BRIDGE. So the assumption I've made before, that I can go
over bus2bridge and filter out the "owner" or parent bridge for a given
seg:bus, doesn't seem to hold now.

Even if I find the parent bridge with
xen/drivers/passthrough/pci.c:find_upstream_bridge, I am not sure I can get
any further in detecting which x86 domain owns this bridge: the whole idea in
the x86 code is that Domain-0 is the only possible one here (you mentioned
that). So, I doubt if we can still live with is_hardware_domain for now for
x86?

Thank you in advance,

Oleksandr
Jan Beulich Dec. 7, 2020, 8:48 a.m. UTC | #24
On 04.12.2020 15:38, Oleksandr Andrushchenko wrote:
> On 11/13/20 4:51 PM, Jan Beulich wrote:
>> On 13.11.2020 15:44, Oleksandr Andrushchenko wrote:
>>> On 11/13/20 4:38 PM, Jan Beulich wrote:
>>>> On 13.11.2020 15:32, Oleksandr Andrushchenko wrote:
>>>>> On 11/13/20 4:23 PM, Jan Beulich wrote:
>>>>>>     Earlier on I didn't say you should get this to work, only
>>>>>> that I think the general logic around what you add shouldn't make
>>>>>> things more arch specific than they really should be. That said,
>>>>>> something similar to the above should still be doable on x86,
>>>>>> utilizing struct pci_seg's bus2bridge[]. There ought to be
>>>>>> DEV_TYPE_PCI_HOST_BRIDGE entries there, albeit a number of them
>>>>>> (provided by the CPUs themselves rather than the chipset) aren't
>>>>>> really host bridges for the purposes you're after.
>>>>> You mean I can still use DEV_TYPE_PCI_HOST_BRIDGE as a marker
>>>>>
>>>>> while trying to detect what I need?
>>>> I'm afraid I don't understand what marker you're thinking about
>>>> here.
>>> I mean that when I go over bus2bridge entries, should I only work with
>>>
>>> those who have DEV_TYPE_PCI_HOST_BRIDGE?
>> Well, if you're after host bridges - yes.
>>
>> Jan
> 
> So, I started looking at the bus2bridge and if it can be used for both x86 (and possible ARM) and I have an
> 
> impression that the original purpose of this was to identify the devices which x86 IOMMU should
> 
> cover: e.g. I am looking at the find_upstream_bridge users and they are x86 IOMMU and VGA driver.
> 
> I tried to play with this on ARM, and for the HW platform I have and QEMU I got 0 entries in bus2bridge...
> 
> This is because of how xen/drivers/passthrough/pci.c:alloc_pdev is implemented (which lives in the
> 
> common code BTW, but seems to be x86 specific): so, it skips setting up bus2bridge entries for the bridges I have.

I'm curious to learn what's x86-specific here. I also can't deduce
why bus2bridge setup would be skipped.

> These are DEV_TYPE_PCIe_BRIDGE and DEV_TYPE_PCI_HOST_BRIDGE. So, the assumption I've made before
> 
> that I can go over bus2bridge and filter out the "owner" or parent bridge for a given seg:bus doesn't
> 
> seem to be possible now.
> 
> Even if I find the parent bridge with xen/drivers/passthrough/pci.c:find_upstream_bridge I am not sure
> 
> I can get any further in detecting which x86 domain owns this bridge: the whole idea in the x86 code is
> 
> that Domain-0 is the only possible one here (you mentioned that).

Right - your attempt to find the owner should _right now_ result in
getting back Dom0, on x86. But there shouldn't be any such assumption
in the new consumer of this data that you mean to introduce. I.e. I
only did suggest this to avoid ...

> So, I doubt if we can still live with
> 
> is_hardware_domain for now for x86?

... hard-coding information which can be properly established /
retrieved.

Jan
Oleksandr Andrushchenko Dec. 7, 2020, 9:11 a.m. UTC | #25
On 12/7/20 10:48 AM, Jan Beulich wrote:
> On 04.12.2020 15:38, Oleksandr Andrushchenko wrote:
>> On 11/13/20 4:51 PM, Jan Beulich wrote:
>>> On 13.11.2020 15:44, Oleksandr Andrushchenko wrote:
>>>> On 11/13/20 4:38 PM, Jan Beulich wrote:
>>>>> On 13.11.2020 15:32, Oleksandr Andrushchenko wrote:
>>>>>> On 11/13/20 4:23 PM, Jan Beulich wrote:
>>>>>>>      Earlier on I didn't say you should get this to work, only
>>>>>>> that I think the general logic around what you add shouldn't make
>>>>>>> things more arch specific than they really should be. That said,
>>>>>>> something similar to the above should still be doable on x86,
>>>>>>> utilizing struct pci_seg's bus2bridge[]. There ought to be
>>>>>>> DEV_TYPE_PCI_HOST_BRIDGE entries there, albeit a number of them
>>>>>>> (provided by the CPUs themselves rather than the chipset) aren't
>>>>>>> really host bridges for the purposes you're after.
>>>>>> You mean I can still use DEV_TYPE_PCI_HOST_BRIDGE as a marker
>>>>>>
>>>>>> while trying to detect what I need?
>>>>> I'm afraid I don't understand what marker you're thinking about
>>>>> here.
>>>> I mean that when I go over bus2bridge entries, should I only work with
>>>>
>>>> those who have DEV_TYPE_PCI_HOST_BRIDGE?
>>> Well, if you're after host bridges - yes.
>>>
>>> Jan
>> So, I started looking at the bus2bridge and if it can be used for both x86 (and possible ARM) and I have an
>>
>> impression that the original purpose of this was to identify the devices which x86 IOMMU should
>>
>> cover: e.g. I am looking at the find_upstream_bridge users and they are x86 IOMMU and VGA driver.
>>
>> I tried to play with this on ARM, and for the HW platform I have and QEMU I got 0 entries in bus2bridge...
>>
>> This is because of how xen/drivers/passthrough/pci.c:alloc_pdev is implemented (which lives in the
>>
>> common code BTW, but seems to be x86 specific): so, it skips setting up bus2bridge entries for the bridges I have.
> I'm curious to learn what's x86-specific here. I also can't deduce
> why bus2bridge setup would be skipped.

So, for example:

commit 0af438757d455f8eb6b5a6ae9a990ae245f230fd
Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date:   Fri Sep 27 10:11:49 2013 +0200

     AMD IOMMU: fix Dom0 device setup failure for host bridges

     The host bridge device (i.e. 0x18 for AMD) does not require IOMMU, and
     therefore is not included in the IVRS. The current logic tries to map
     all PCI devices to an IOMMU. In this case, "xl dmesg" shows the
     following message on AMD sytem.

     (XEN) setup 0000:00:18.0 for d0 failed (-19)
     (XEN) setup 0000:00:18.1 for d0 failed (-19)
     (XEN) setup 0000:00:18.2 for d0 failed (-19)
     (XEN) setup 0000:00:18.3 for d0 failed (-19)
     (XEN) setup 0000:00:18.4 for d0 failed (-19)
     (XEN) setup 0000:00:18.5 for d0 failed (-19)

     This patch adds a new device type (i.e. DEV_TYPE_PCI_HOST_BRIDGE) which
     corresponds to PCI class code 0x06 and sub-class 0x00. Then, it uses
     this new type to filter when trying to map device to IOMMU.

One of my test systems has DEV_TYPE_PCI_HOST_BRIDGE, so bus2bridge setup is ignored

>
>> These are DEV_TYPE_PCIe_BRIDGE and DEV_TYPE_PCI_HOST_BRIDGE. So, the assumption I've made before
>>
>> that I can go over bus2bridge and filter out the "owner" or parent bridge for a given seg:bus doesn't
>>
>> seem to be possible now.
>>
>> Even if I find the parent bridge with xen/drivers/passthrough/pci.c:find_upstream_bridge I am not sure
>>
>> I can get any further in detecting which x86 domain owns this bridge: the whole idea in the x86 code is
>>
>> that Domain-0 is the only possible one here (you mentioned that).
> Right - your attempt to find the owner should _right now_ result in
> getting back Dom0, on x86. But there shouldn't be any such assumption
> in the new consumer of this data that you mean to introduce. I.e. I
> only did suggest this to avoid ...
>
>> So, I doubt if we can still live with
>>
>> is_hardware_domain for now for x86?
> ... hard-coding information which can be properly established /
> retrieved.
>
> Jan
Jan Beulich Dec. 7, 2020, 9:28 a.m. UTC | #26
On 07.12.2020 10:11, Oleksandr Andrushchenko wrote:
> On 12/7/20 10:48 AM, Jan Beulich wrote:
>> On 04.12.2020 15:38, Oleksandr Andrushchenko wrote:
>>> On 11/13/20 4:51 PM, Jan Beulich wrote:
>>>> On 13.11.2020 15:44, Oleksandr Andrushchenko wrote:
>>>>> On 11/13/20 4:38 PM, Jan Beulich wrote:
>>>>>> On 13.11.2020 15:32, Oleksandr Andrushchenko wrote:
>>>>>>> On 11/13/20 4:23 PM, Jan Beulich wrote:
>>>>>>>>      Earlier on I didn't say you should get this to work, only
>>>>>>>> that I think the general logic around what you add shouldn't make
>>>>>>>> things more arch specific than they really should be. That said,
>>>>>>>> something similar to the above should still be doable on x86,
>>>>>>>> utilizing struct pci_seg's bus2bridge[]. There ought to be
>>>>>>>> DEV_TYPE_PCI_HOST_BRIDGE entries there, albeit a number of them
>>>>>>>> (provided by the CPUs themselves rather than the chipset) aren't
>>>>>>>> really host bridges for the purposes you're after.
>>>>>>> You mean I can still use DEV_TYPE_PCI_HOST_BRIDGE as a marker
>>>>>>>
>>>>>>> while trying to detect what I need?
>>>>>> I'm afraid I don't understand what marker you're thinking about
>>>>>> here.
>>>>> I mean that when I go over bus2bridge entries, should I only work with
>>>>>
>>>>> those who have DEV_TYPE_PCI_HOST_BRIDGE?
>>>> Well, if you're after host bridges - yes.
>>>>
>>>> Jan
>>> So, I started looking at the bus2bridge and if it can be used for both x86 (and possible ARM) and I have an
>>>
>>> impression that the original purpose of this was to identify the devices which x86 IOMMU should
>>>
>>> cover: e.g. I am looking at the find_upstream_bridge users and they are x86 IOMMU and VGA driver.
>>>
>>> I tried to play with this on ARM, and for the HW platform I have and QEMU I got 0 entries in bus2bridge...
>>>
>>> This is because of how xen/drivers/passthrough/pci.c:alloc_pdev is implemented (which lives in the
>>>
>>> common code BTW, but seems to be x86 specific): so, it skips setting up bus2bridge entries for the bridges I have.
>> I'm curious to learn what's x86-specific here. I also can't deduce
>> why bus2bridge setup would be skipped.
> 
> So, for example:
> 
> commit 0af438757d455f8eb6b5a6ae9a990ae245f230fd
> Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
> Date:   Fri Sep 27 10:11:49 2013 +0200
> 
>      AMD IOMMU: fix Dom0 device setup failure for host bridges
> 
>      The host bridge device (i.e. 0x18 for AMD) does not require IOMMU, and
>      therefore is not included in the IVRS. The current logic tries to map
>      all PCI devices to an IOMMU. In this case, "xl dmesg" shows the
>      following message on AMD sytem.
> 
>      (XEN) setup 0000:00:18.0 for d0 failed (-19)
>      (XEN) setup 0000:00:18.1 for d0 failed (-19)
>      (XEN) setup 0000:00:18.2 for d0 failed (-19)
>      (XEN) setup 0000:00:18.3 for d0 failed (-19)
>      (XEN) setup 0000:00:18.4 for d0 failed (-19)
>      (XEN) setup 0000:00:18.5 for d0 failed (-19)
> 
>      This patch adds a new device type (i.e. DEV_TYPE_PCI_HOST_BRIDGE) which
>      corresponds to PCI class code 0x06 and sub-class 0x00. Then, it uses
>      this new type to filter when trying to map device to IOMMU.
> 
> One of my test systems has DEV_TYPE_PCI_HOST_BRIDGE, so bus2bridge setup is ignored

If there's data to be sensibly recorded for host bridges, I don't
see why the function couldn't be updated. I don't view this as
x86-specific; it may just be that on x86 we have no present use
for such data. It may in turn be the case that then x86-specific
call sites consuming this data need updating to not be misled by
the change in what data gets recorded. But that's still all within
the scope of bringing intended-to-be-arch-independent code closer
to actually being arch-independent.

Jan
Oleksandr Andrushchenko Dec. 7, 2020, 9:37 a.m. UTC | #27
On 12/7/20 11:28 AM, Jan Beulich wrote:
> On 07.12.2020 10:11, Oleksandr Andrushchenko wrote:
>> On 12/7/20 10:48 AM, Jan Beulich wrote:
>>> On 04.12.2020 15:38, Oleksandr Andrushchenko wrote:
>>>> On 11/13/20 4:51 PM, Jan Beulich wrote:
>>>>> On 13.11.2020 15:44, Oleksandr Andrushchenko wrote:
>>>>>> On 11/13/20 4:38 PM, Jan Beulich wrote:
>>>>>>> On 13.11.2020 15:32, Oleksandr Andrushchenko wrote:
>>>>>>>> On 11/13/20 4:23 PM, Jan Beulich wrote:
>>>>>>>>>       Earlier on I didn't say you should get this to work, only
>>>>>>>>> that I think the general logic around what you add shouldn't make
>>>>>>>>> things more arch specific than they really should be. That said,
>>>>>>>>> something similar to the above should still be doable on x86,
>>>>>>>>> utilizing struct pci_seg's bus2bridge[]. There ought to be
>>>>>>>>> DEV_TYPE_PCI_HOST_BRIDGE entries there, albeit a number of them
>>>>>>>>> (provided by the CPUs themselves rather than the chipset) aren't
>>>>>>>>> really host bridges for the purposes you're after.
>>>>>>>> You mean I can still use DEV_TYPE_PCI_HOST_BRIDGE as a marker
>>>>>>>>
>>>>>>>> while trying to detect what I need?
>>>>>>> I'm afraid I don't understand what marker you're thinking about
>>>>>>> here.
>>>>>> I mean that when I go over bus2bridge entries, should I only work with
>>>>>>
>>>>>> those who have DEV_TYPE_PCI_HOST_BRIDGE?
>>>>> Well, if you're after host bridges - yes.
>>>>>
>>>>> Jan
>>>> So, I started looking at the bus2bridge and if it can be used for both x86 (and possible ARM) and I have an
>>>>
>>>> impression that the original purpose of this was to identify the devices which x86 IOMMU should
>>>>
>>>> cover: e.g. I am looking at the find_upstream_bridge users and they are x86 IOMMU and VGA driver.
>>>>
>>>> I tried to play with this on ARM, and for the HW platform I have and QEMU I got 0 entries in bus2bridge...
>>>>
>>>> This is because of how xen/drivers/passthrough/pci.c:alloc_pdev is implemented (which lives in the
>>>>
>>>> common code BTW, but seems to be x86 specific): so, it skips setting up bus2bridge entries for the bridges I have.
>>> I'm curious to learn what's x86-specific here. I also can't deduce
>>> why bus2bridge setup would be skipped.
>> So, for example:
>>
>> commit 0af438757d455f8eb6b5a6ae9a990ae245f230fd
>> Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
>> Date:   Fri Sep 27 10:11:49 2013 +0200
>>
>>       AMD IOMMU: fix Dom0 device setup failure for host bridges
>>
>>       The host bridge device (i.e. 0x18 for AMD) does not require IOMMU, and
>>       therefore is not included in the IVRS. The current logic tries to map
>>       all PCI devices to an IOMMU. In this case, "xl dmesg" shows the
>>       following message on AMD sytem.
>>
>>       (XEN) setup 0000:00:18.0 for d0 failed (-19)
>>       (XEN) setup 0000:00:18.1 for d0 failed (-19)
>>       (XEN) setup 0000:00:18.2 for d0 failed (-19)
>>       (XEN) setup 0000:00:18.3 for d0 failed (-19)
>>       (XEN) setup 0000:00:18.4 for d0 failed (-19)
>>       (XEN) setup 0000:00:18.5 for d0 failed (-19)
>>
>>       This patch adds a new device type (i.e. DEV_TYPE_PCI_HOST_BRIDGE) which
>>       corresponds to PCI class code 0x06 and sub-class 0x00. Then, it uses
>>       this new type to filter when trying to map device to IOMMU.
>>
>> One of my test systems has DEV_TYPE_PCI_HOST_BRIDGE, so bus2bridge setup is ignored
> If there's data to be sensibly recorded for host bridges, I don't
> see why the function couldn't be updated. I don't view this as
> x86-specific; it may just be that on x86 we have no present use
> for such data. It may in turn be the case that then x86-specific
> call sites consuming this data need updating to not be misled by
> the change in what data gets recorded. But that's still all within
> the scope of bringing intended-to-be-arch-independent code closer
> to actually being arch-independent.

Well, the patch itself made me think that this is a workaround for x86, which
made DEV_TYPE_PCI_HOST_BRIDGE a special case and now relies on that.

So, please correct me if I'm wrong here, but in order to make it really
generic I would need to introduce some x86-specific knowledge about such a
device and make the IOMMU code rely on that instead of bus2bridge.

>
> Jan
Jan Beulich Dec. 7, 2020, 9:50 a.m. UTC | #28
On 07.12.2020 10:37, Oleksandr Andrushchenko wrote:
> On 12/7/20 11:28 AM, Jan Beulich wrote:
>> On 07.12.2020 10:11, Oleksandr Andrushchenko wrote:
>>> On 12/7/20 10:48 AM, Jan Beulich wrote:
>>>> On 04.12.2020 15:38, Oleksandr Andrushchenko wrote:
>>>>> So, I started looking at the bus2bridge and if it can be used for both x86 (and possible ARM) and I have an
>>>>>
>>>>> impression that the original purpose of this was to identify the devices which x86 IOMMU should
>>>>>
>>>>> cover: e.g. I am looking at the find_upstream_bridge users and they are x86 IOMMU and VGA driver.
>>>>>
>>>>> I tried to play with this on ARM, and for the HW platform I have and QEMU I got 0 entries in bus2bridge...
>>>>>
>>>>> This is because of how xen/drivers/passthrough/pci.c:alloc_pdev is implemented (which lives in the
>>>>>
>>>>> common code BTW, but seems to be x86 specific): so, it skips setting up bus2bridge entries for the bridges I have.
>>>> I'm curious to learn what's x86-specific here. I also can't deduce
>>>> why bus2bridge setup would be skipped.
>>> So, for example:
>>>
>>> commit 0af438757d455f8eb6b5a6ae9a990ae245f230fd
>>> Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
>>> Date:   Fri Sep 27 10:11:49 2013 +0200
>>>
>>>       AMD IOMMU: fix Dom0 device setup failure for host bridges
>>>
>>>       The host bridge device (i.e. 0x18 for AMD) does not require IOMMU, and
>>>       therefore is not included in the IVRS. The current logic tries to map
>>>       all PCI devices to an IOMMU. In this case, "xl dmesg" shows the
>>>       following message on AMD sytem.
>>>
>>>       (XEN) setup 0000:00:18.0 for d0 failed (-19)
>>>       (XEN) setup 0000:00:18.1 for d0 failed (-19)
>>>       (XEN) setup 0000:00:18.2 for d0 failed (-19)
>>>       (XEN) setup 0000:00:18.3 for d0 failed (-19)
>>>       (XEN) setup 0000:00:18.4 for d0 failed (-19)
>>>       (XEN) setup 0000:00:18.5 for d0 failed (-19)
>>>
>>>       This patch adds a new device type (i.e. DEV_TYPE_PCI_HOST_BRIDGE) which
>>>       corresponds to PCI class code 0x06 and sub-class 0x00. Then, it uses
>>>       this new type to filter when trying to map device to IOMMU.
>>>
>>> One of my test systems has DEV_TYPE_PCI_HOST_BRIDGE, so bus2bridge setup is ignored
>> If there's data to be sensibly recorded for host bridges, I don't
>> see why the function couldn't be updated. I don't view this as
>> x86-specific; it may just be that on x86 we have no present use
>> for such data. It may in turn be the case that then x86-specific
>> call sites consuming this data need updating to not be misled by
>> the change in what data gets recorded. But that's still all within
>> the scope of bringing intended-to-be-arch-independent code closer
>> to actually being arch-independent.
> 
> Well, the patch itself made me think that this is a workaround for x86
> 
> which made DEV_TYPE_PCI_HOST_BRIDGE a special case and it relies on that.
> 
> So, please correct me if I'm wrong here, but in order to make it really generic
> 
> I would need to introduce some specific knowledge for x86 about such a device
> 
> and make the IOMMU code rely on that instead of bus2bridge.

I'm afraid this is too vague for me to respond with a clear "yes" or
"no". In particular I don't see the special casing of that type, not
least because it's not clear to me what data you would record for it
(or more precisely, from where you'd take the to-be-recorded data -
the device's config space doesn't tell you the bus range covered by
the bridge afaict).

Jan
diff mbox series

Patch

diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index f74f728884c0..7dc7c70e24f2 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -31,14 +31,87 @@ 
 struct map_data {
     struct domain *d;
     bool map;
+    struct pci_dev *pdev;
 };
 
+static struct vpci_header *get_vpci_header(struct domain *d,
+                                           const struct pci_dev *pdev);
+
+static struct vpci_header *get_hwdom_vpci_header(const struct pci_dev *pdev)
+{
+    if ( unlikely(list_empty(&pdev->vpci->headers)) )
+        return get_vpci_header(hardware_domain, pdev);
+
+    /* hwdom's header is always the very first entry. */
+    return list_first_entry(&pdev->vpci->headers, struct vpci_header, node);
+}
+
+static struct vpci_header *get_vpci_header(struct domain *d,
+                                           const struct pci_dev *pdev)
+{
+    struct list_head *prev;
+    struct vpci_header *header;
+    struct vpci *vpci = pdev->vpci;
+
+    list_for_each( prev, &vpci->headers )
+    {
+        struct vpci_header *this = list_entry(prev, struct vpci_header, node);
+
+        if ( this->domain_id == d->domain_id )
+            return this;
+    }
+    printk(XENLOG_DEBUG "--------------------------------------" \
+           "Adding new vPCI BAR headers for domain %d: " PRI_pci" \n",
+           d->domain_id, pdev->sbdf.seg, pdev->sbdf.bus,
+           pdev->sbdf.dev, pdev->sbdf.fn);
+    header = xzalloc(struct vpci_header);
+    if ( !header )
+    {
+        printk(XENLOG_ERR
+               "Failed to add new vPCI BAR headers for domain %d: " PRI_pci" \n",
+               d->domain_id, pdev->sbdf.seg, pdev->sbdf.bus,
+               pdev->sbdf.dev, pdev->sbdf.fn);
+        return NULL;
+    }
+
+    if ( !is_hardware_domain(d) )
+    {
+        struct vpci_header *hwdom_header = get_hwdom_vpci_header(pdev);
+
+        /* Make a copy of the hwdom's BARs as the initial state for vBARs. */
+        memcpy(header, hwdom_header, sizeof(*header));
+    }
+
+    header->domain_id = d->domain_id;
+    list_add_tail(&header->node, &vpci->headers);
+    return header;
+}
+
+static struct vpci_bar *get_vpci_bar(struct domain *d,
+                                     const struct pci_dev *pdev,
+                                     int bar_idx)
+{
+    struct vpci_header *vheader;
+
+    vheader = get_vpci_header(d, pdev);
+    if ( !vheader )
+        return NULL;
+
+    return &vheader->bars[bar_idx];
+}
+
 static int map_range(unsigned long s, unsigned long e, void *data,
                      unsigned long *c)
 {
     const struct map_data *map = data;
-    int rc;
-
+    unsigned long mfn;
+    int rc, bar_idx;
+    struct vpci_header *header = get_hwdom_vpci_header(map->pdev);
+
+    bar_idx = s & ~PCI_BASE_ADDRESS_MEM_MASK;
+    s = PFN_DOWN(s);
+    e = PFN_DOWN(e);
+    mfn = _mfn(PFN_DOWN(header->bars[bar_idx].addr));
     for ( ; ; )
     {
         unsigned long size = e - s + 1;
@@ -52,11 +125,15 @@  static int map_range(unsigned long s, unsigned long e, void *data,
          * - {un}map_mmio_regions doesn't support preemption.
          */
 
-        rc = map->map ? map_mmio_regions(map->d, _gfn(s), size, _mfn(s))
-                      : unmap_mmio_regions(map->d, _gfn(s), size, _mfn(s));
+        rc = map->map ? map_mmio_regions(map->d, _gfn(s), size, mfn)
+                      : unmap_mmio_regions(map->d, _gfn(s), size, mfn);
         if ( rc == 0 )
         {
-            *c += size;
+            /*
+             * Range set is not expressed in frame numbers and the size
+             * is the number of frames, so update accordingly.
+             */
+            *c += size << PAGE_SHIFT;
             break;
         }
         if ( rc < 0 )
@@ -67,8 +144,9 @@  static int map_range(unsigned long s, unsigned long e, void *data,
             break;
         }
         ASSERT(rc < size);
-        *c += rc;
+        *c += rc << PAGE_SHIFT;
         s += rc;
+        mfn += rc;
         if ( general_preempt_check() )
                 return -ERESTART;
     }
@@ -84,7 +162,7 @@  static int map_range(unsigned long s, unsigned long e, void *data,
 static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
                             bool rom_only)
 {
-    struct vpci_header *header = &pdev->vpci->header;
+    struct vpci_header *header = get_hwdom_vpci_header(pdev);
     bool map = cmd & PCI_COMMAND_MEMORY;
     unsigned int i;
 
@@ -136,6 +214,7 @@  bool vpci_process_pending(struct vcpu *v)
         struct map_data data = {
             .d = v->domain,
             .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
+            .pdev = v->vpci.pdev,
         };
         int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
 
@@ -168,7 +247,8 @@  bool vpci_process_pending(struct vcpu *v)
 static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
                             struct rangeset *mem, uint16_t cmd)
 {
-    struct map_data data = { .d = d, .map = true };
+    struct map_data data = { .d = d, .map = true,
+        .pdev = (struct pci_dev *)pdev };
     int rc;
 
     while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
@@ -205,7 +285,7 @@  static void defer_map(struct domain *d, struct pci_dev *pdev,
 
 static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
 {
-    struct vpci_header *header = &pdev->vpci->header;
+    struct vpci_header *header;
     struct rangeset *mem = rangeset_new(NULL, NULL, 0);
     struct pci_dev *tmp, *dev = NULL;
 #ifdef CONFIG_X86
@@ -217,6 +297,11 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
     if ( !mem )
         return -ENOMEM;
 
+    if ( is_hardware_domain(current->domain) )
+        header = get_hwdom_vpci_header(pdev);
+    else
+        header = get_vpci_header(current->domain, pdev);
+
     /*
      * Create a rangeset that represents the current device BARs memory region
      * and compare it against all the currently active BAR memory regions. If
@@ -225,12 +310,15 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
      * First fill the rangeset with all the BARs of this device or with the ROM
      * BAR only, depending on whether the guest is toggling the memory decode
      * bit of the command register, or the enable bit of the ROM BAR register.
+     *
+     * Use the PCI reserved bits of the BAR to pass BAR's index.
      */
     for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
     {
         const struct vpci_bar *bar = &header->bars[i];
-        unsigned long start = PFN_DOWN(bar->addr);
-        unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
+        unsigned long start = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) | i;
+        unsigned long end = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) +
+            bar->size - 1;
 
         if ( !MAPPABLE_BAR(bar) ||
              (rom_only ? bar->type != VPCI_BAR_ROM
@@ -251,9 +339,11 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
     /* Remove any MSIX regions if present. */
     for ( i = 0; msix && i < ARRAY_SIZE(msix->tables); i++ )
     {
-        unsigned long start = PFN_DOWN(vmsix_table_addr(pdev->vpci, i));
-        unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
-                                     vmsix_table_size(pdev->vpci, i) - 1);
+        unsigned long start = (vmsix_table_addr(pdev->vpci, i) &
+                               PCI_BASE_ADDRESS_MEM_MASK) | i;
+        unsigned long end = (vmsix_table_addr(pdev->vpci, i) &
+                             PCI_BASE_ADDRESS_MEM_MASK ) +
+                             vmsix_table_size(pdev->vpci, i) - 1;
 
         rc = rangeset_remove_range(mem, start, end);
         if ( rc )
@@ -273,6 +363,8 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
      */
     for_each_pdev ( pdev->domain, tmp )
     {
+        struct vpci_header *header;
+
         if ( tmp == pdev )
         {
             /*
@@ -289,11 +381,14 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
                 continue;
         }
 
-        for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ )
+        header = get_vpci_header(tmp->domain, pdev);
+
+        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
         {
-            const struct vpci_bar *bar = &tmp->vpci->header.bars[i];
-            unsigned long start = PFN_DOWN(bar->addr);
-            unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
+            const struct vpci_bar *bar = &header->bars[i];
+            unsigned long start = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK) | i;
+            unsigned long end = (bar->addr & PCI_BASE_ADDRESS_MEM_MASK)
+                + bar->size - 1;
 
             if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) ||
                  /*
@@ -357,7 +452,7 @@  static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
         pci_conf_write16(pdev->sbdf, reg, cmd);
 }
 
-static void bar_write(const struct pci_dev *pdev, unsigned int reg,
+static void bar_write_hwdom(const struct pci_dev *pdev, unsigned int reg,
                       uint32_t val, void *data)
 {
     struct vpci_bar *bar = data;
@@ -377,14 +472,17 @@  static void bar_write(const struct pci_dev *pdev, unsigned int reg,
     {
         /* If the value written is the current one avoid printing a warning. */
         if ( val != (uint32_t)(bar->addr >> (hi ? 32 : 0)) )
+        {
+            struct vpci_header *header = get_hwdom_vpci_header(pdev);
+
             gprintk(XENLOG_WARNING,
                     "%04x:%02x:%02x.%u: ignored BAR %lu write with memory decoding enabled\n",
                     pdev->seg, pdev->bus, slot, func,
-                    bar - pdev->vpci->header.bars + hi);
+                    bar - header->bars + hi);
+        }
         return;
     }
 
-
     /*
      * Update the cached address, so that when memory decoding is enabled
      * Xen can map the BAR into the guest p2m.
@@ -403,10 +501,89 @@  static void bar_write(const struct pci_dev *pdev, unsigned int reg,
     pci_conf_write32(pdev->sbdf, reg, val);
 }
 
+static uint32_t bar_read_hwdom(const struct pci_dev *pdev, unsigned int reg,
+                               void *data)
+{
+    return vpci_hw_read32(pdev, reg, data);
+}
+
+static void bar_write_guest(const struct pci_dev *pdev, unsigned int reg,
+                            uint32_t val, void *data)
+{
+    struct vpci_bar *vbar = data;
+    bool hi = false;
+
+    if ( vbar->type == VPCI_BAR_MEM64_HI )
+    {
+        ASSERT(reg > PCI_BASE_ADDRESS_0);
+        vbar--;
+        hi = true;
+    }
+    vbar->addr &= ~(0xffffffffull << (hi ? 32 : 0));
+    vbar->addr |= (uint64_t)val << (hi ? 32 : 0);
+}
+
+static uint32_t bar_read_guest(const struct pci_dev *pdev, unsigned int reg,
+                               void *data)
+{
+    struct vpci_bar *vbar = data;
+    uint32_t val;
+    bool hi = false;
+
+    if ( vbar->type == VPCI_BAR_MEM64_HI )
+    {
+        ASSERT(reg > PCI_BASE_ADDRESS_0);
+        vbar--;
+        hi = true;
+    }
+
+    if ( vbar->type == VPCI_BAR_MEM64_LO || vbar->type == VPCI_BAR_MEM64_HI )
+    {
+        if ( hi )
+            val = vbar->addr >> 32;
+        else
+            val = vbar->addr & 0xffffffff;
+        if ( val == ~0 )
+        {
+            /* Guests detects BAR's properties and sizes. */
+            if ( !hi )
+            {
+                val = 0xffffffff & ~(vbar->size - 1);
+                val |= vbar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
+                                                    : PCI_BASE_ADDRESS_MEM_TYPE_64;
+                val |= vbar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
+            }
+            else
+                val = vbar->size >> 32;
+            vbar->addr &= ~(0xffffffffull << (hi ? 32 : 0));
+            vbar->addr |= (uint64_t)val << (hi ? 32 : 0);
+        }
+    }
+    else if ( vbar->type == VPCI_BAR_MEM32 )
+    {
+        val = vbar->addr;
+        if ( val == ~0 )
+        {
+            if ( !hi )
+            {
+                val = 0xffffffff & ~(vbar->size - 1);
+                val |= vbar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
+                                                    : PCI_BASE_ADDRESS_MEM_TYPE_64;
+                val |= vbar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
+            }
+        }
+    }
+    else
+    {
+        val = vbar->addr;
+    }
+    return val;
+}
+
 static void rom_write(const struct pci_dev *pdev, unsigned int reg,
                       uint32_t val, void *data)
 {
-    struct vpci_header *header = &pdev->vpci->header;
+    struct vpci_header *header = get_hwdom_vpci_header(pdev);
     struct vpci_bar *rom = data;
     uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
     uint16_t cmd = pci_conf_read16(pdev->sbdf, PCI_COMMAND);
@@ -452,15 +629,56 @@  static void rom_write(const struct pci_dev *pdev, unsigned int reg,
         rom->addr = val & PCI_ROM_ADDRESS_MASK;
 }
 
+static uint32_t bar_read_dispatch(const struct pci_dev *pdev, unsigned int reg,
+                                  void *data)
+{
+    struct vpci_bar *vbar, *bar = data;
+
+    if ( is_hardware_domain(current->domain) )
+        return bar_read_hwdom(pdev, reg, data);
+
+    vbar = get_vpci_bar(current->domain, pdev, bar->index);
+    if ( !vbar )
+        return ~0;
+
+    return bar_read_guest(pdev, reg, vbar);
+}
+
+static void bar_write_dispatch(const struct pci_dev *pdev, unsigned int reg,
+                               uint32_t val, void *data)
+{
+    struct vpci_bar *bar = data;
+
+    if ( is_hardware_domain(current->domain) )
+        bar_write_hwdom(pdev, reg, val, data);
+    else
+    {
+        struct vpci_bar *vbar = get_vpci_bar(current->domain, pdev, bar->index);
+
+        if ( !vbar )
+            return;
+        bar_write_guest(pdev, reg, val, vbar);
+    }
+}
+
+/*
+ * FIXME: This is called early while adding vPCI handlers which is done
+ * by and for hwdom.
+ */
 static int init_bars(struct pci_dev *pdev)
 {
     uint16_t cmd;
     uint64_t addr, size;
     unsigned int i, num_bars, rom_reg;
-    struct vpci_header *header = &pdev->vpci->header;
-    struct vpci_bar *bars = header->bars;
+    struct vpci_header *header;
+    struct vpci_bar *bars;
     int rc;
 
+    header = get_hwdom_vpci_header(pdev);
+    if ( !header )
+        return -ENOMEM;
+    bars = header->bars;
+
     switch ( pci_conf_read8(pdev->sbdf, PCI_HEADER_TYPE) & 0x7f )
     {
     case PCI_HEADER_TYPE_NORMAL:
@@ -496,11 +714,12 @@  static int init_bars(struct pci_dev *pdev)
         uint8_t reg = PCI_BASE_ADDRESS_0 + i * 4;
         uint32_t val;
 
+        bars[i].index = i;
         if ( i && bars[i - 1].type == VPCI_BAR_MEM64_LO )
         {
             bars[i].type = VPCI_BAR_MEM64_HI;
-            rc = vpci_add_register(pdev->vpci, vpci_hw_read32, bar_write, reg,
-                                   4, &bars[i]);
+            rc = vpci_add_register(pdev->vpci, bar_read_dispatch,
+                                   bar_write_dispatch, reg, 4, &bars[i]);
             if ( rc )
             {
                 pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
@@ -540,8 +759,8 @@  static int init_bars(struct pci_dev *pdev)
         bars[i].size = size;
         bars[i].prefetchable = val & PCI_BASE_ADDRESS_MEM_PREFETCH;
 
-        rc = vpci_add_register(pdev->vpci, vpci_hw_read32, bar_write, reg, 4,
-                               &bars[i]);
+        rc = vpci_add_register(pdev->vpci, bar_read_dispatch,
+                               bar_write_dispatch, reg, 4, &bars[i]);
         if ( rc )
         {
             pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
@@ -558,6 +777,7 @@  static int init_bars(struct pci_dev *pdev)
         rom->type = VPCI_BAR_ROM;
         rom->size = size;
         rom->addr = addr;
+        rom->index = num_bars;
         header->rom_enabled = pci_conf_read32(pdev->sbdf, rom_reg) &
                               PCI_ROM_ADDRESS_ENABLE;
 
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index a5293521a36a..728029da3e9c 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -69,6 +69,7 @@  int __hwdom_init vpci_add_handlers(struct pci_dev *pdev)
         return -ENOMEM;
 
     INIT_LIST_HEAD(&pdev->vpci->handlers);
+    INIT_LIST_HEAD(&pdev->vpci->headers);
     spin_lock_init(&pdev->vpci->lock);
 
     for ( i = 0; i < NUM_VPCI_INIT; i++ )
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index c3501e9ec010..54423bc6556d 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -55,16 +55,14 @@  uint32_t vpci_hw_read32(const struct pci_dev *pdev, unsigned int reg,
  */
 bool __must_check vpci_process_pending(struct vcpu *v);
 
-struct vpci {
-    /* List of vPCI handlers for a device. */
-    struct list_head handlers;
-    spinlock_t lock;
-
 #ifdef __XEN__
-    /* Hide the rest of the vpci struct from the user-space test harness. */
     struct vpci_header {
+    struct list_head node;
+    /* Domain that owns this view of the BARs. */
+    domid_t domain_id;
         /* Information about the PCI BARs of this device. */
         struct vpci_bar {
+            int index;
             uint64_t addr;
             uint64_t size;
             enum {
@@ -88,8 +86,18 @@  struct vpci {
          * is mapped into guest p2m) if there's a ROM BAR on the device.
          */
         bool rom_enabled      : 1;
-        /* FIXME: currently there's no support for SR-IOV. */
-    } header;
+};
+#endif
+
+struct vpci {
+    /* List of vPCI handlers for a device. */
+    struct list_head handlers;
+    spinlock_t lock;
+
+#ifdef __XEN__
+    /* Hide the rest of the vpci struct from the user-space test harness. */
+    /* List of vPCI headers for all domains. */
+    struct list_head headers;
 
 #ifdef CONFIG_X86
     /* MSI data. */