diff mbox

[v3,9/9] vpci/msix: add MSI-X handlers

Message ID 20170427143546.14662-10-roger.pau@citrix.com (mailing list archive)
State New, archived
Headers show

Commit Message

Roger Pau Monné April 27, 2017, 2:35 p.m. UTC
Add handlers for accesses to the MSI-X message control field on the PCI
configuration space, and traps for accesses to the memory region that contains
the MSI-X table. This traps detect attempts from the guest to configure MSI-X
interrupts and properly sets them up.

Note that accesses to the Table Offset, Table BIR, PBA Offset, PBA BIR and the
PBA memory region itself are not trapped by Xen at the moment.

Whether Xen is going to provide this functionality to Dom0 (MSI-X emulation) is
controlled by the "msix" option in the dom0 field. When disabling this option
Xen will hide the MSI-X capability structure from Dom0.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
---
Changes since v2:
 - Split out arch-specific code.

This patch has been tested with devices using both a single MSI-X entry and
multiple ones.
---
 docs/misc/xen-command-line.markdown |   7 +
 xen/arch/x86/dom0_build.c           |   4 +
 xen/arch/x86/hvm/hvm.c              |   1 +
 xen/arch/x86/hvm/vmsi.c             | 114 ++++++++-
 xen/drivers/vpci/Makefile           |   2 +-
 xen/drivers/vpci/capabilities.c     |  16 +-
 xen/drivers/vpci/header.c           |  41 ++-
 xen/drivers/vpci/msix.c             | 487 ++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/hvm/domain.h    |   3 +
 xen/include/asm-x86/hvm/io.h        |  18 ++
 xen/include/xen/vpci.h              |  27 ++
 11 files changed, 696 insertions(+), 24 deletions(-)
 create mode 100644 xen/drivers/vpci/msix.c

Comments

Jan Beulich May 29, 2017, 1:29 p.m. UTC | #1
>>> On 27.04.17 at 16:35, <roger.pau@citrix.com> wrote:
> +static int vpci_msix_control_write(struct pci_dev *pdev, unsigned int reg,
> +                                   union vpci_val val, void *data)
> +{
> +    uint8_t seg = pdev->seg, bus = pdev->bus;
> +    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> +    paddr_t table_base = pdev->vpci->header.bars[pdev->vpci->msix->bir].paddr;
> +    struct vpci_msix *msix = data;

Wouldn't you better use this also to obtain the array index one line
earlier?

> +    bool new_masked, new_enabled;
> +    unsigned int i;
> +    uint32_t data32;
> +    int rc;
> +
> +    new_masked = val.word & PCI_MSIX_FLAGS_MASKALL;
> +    new_enabled = val.word & PCI_MSIX_FLAGS_ENABLE;
> +
> +    if ( new_enabled != msix->enabled && new_enabled )

    if ( !msix->enabled && new_enabled )

would likely be easier to read (similar for the "else if" below).

> +    {
> +        /* MSI-X enabled. */
> +        for ( i = 0; i < msix->max_entries; i++ )
> +        {
> +            if ( msix->entries[i].masked )
> +                continue;
> +
> +            rc = vpci_msix_enable(&msix->entries[i].arch, pdev,
> +                                  msix->entries[i].addr, msix->entries[i].data,
> +                                  msix->entries[i].nr, table_base);
> +            if ( rc )
> +            {
> +                gdprintk(XENLOG_ERR,
> +                         "%04x:%02x:%02x.%u: unable to update entry %u: %d\n",
> +                         seg, bus, slot, func, i, rc);
> +                return rc;
> +            }
> +
> +            vpci_msix_mask(&msix->entries[i].arch, false);
> +        }
> +    }
> +    else if ( new_enabled != msix->enabled && !new_enabled )
> +    {
> +        /* MSI-X disabled. */
> +        for ( i = 0; i < msix->max_entries; i++ )
> +        {
> +            rc = vpci_msix_disable(&msix->entries[i].arch);
> +            if ( rc )
> +            {
> +                gdprintk(XENLOG_ERR,
> +                         "%04x:%02x:%02x.%u: unable to disable entry %u: %d\n",
> +                         seg, bus, slot, func, i, rc);
> +                return rc;
> +            }
> +        }
> +    }
> +
> +    data32 = val.word;
> +    if ( (new_enabled != msix->enabled || new_masked != msix->masked) &&
> +         pci_msi_conf_write_intercept(pdev, reg, 2, &data32) >= 0 )
> +        pci_conf_write16(seg, bus, slot, func, reg, data32);

What's the intermediate variable "data32" good for here? Afaict you
could use val.word in its stead.

> +static struct vpci_msix *vpci_msix_find(struct domain *d, unsigned long addr)
> +{
> +    struct vpci_msix *msix;
> +
> +    ASSERT(vpci_locked(d));
> +    list_for_each_entry ( msix,  &d->arch.hvm_domain.msix_tables, next )
> +        if ( msix->pdev->vpci->header.command & PCI_COMMAND_MEMORY &&

Please parenthesize & within &&.

> +             addr >= msix->addr &&
> +             addr < msix->addr + msix->max_entries * PCI_MSIX_ENTRY_SIZE )
> +            return msix;
> +
> +    return NULL;
> +}

Looking ahead I'm getting the impression that you only allow
accesses to the MSI-X table entries, yet in vpci_modify_bars() you
(correctly) prevent mapping entire pages. While most other
registers are disallowed from sharing a page with the table, the PBA
is specifically named as an exception. Hence you need to support
at least reads from the entire range.

> +static int vpci_msix_table_accept(struct vcpu *v, unsigned long addr)
> +{
> +    int found;
> +
> +    vpci_lock(v->domain);
> +    found = !!vpci_msix_find(v->domain, addr);

At the risk of repeating a comment I gave on an earlier patch: Using
"bool" for "found" allows you to avoid the !! .

> +static int vpci_msix_access_check(struct pci_dev *pdev, unsigned long addr,
> +                                  unsigned int len)
> +{
> +    uint8_t seg = pdev->seg, bus = pdev->bus;
> +    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> +
> +

No double blank lines please.

> +    /* Only allow 32/64b accesses. */
> +    if ( len != 4 && len != 8 )
> +    {
> +        gdprintk(XENLOG_ERR,
> +                 "%04x:%02x:%02x.%u: invalid MSI-X table access size: %u\n",
> +                 seg, bus, slot, func, len);
> +        return -EINVAL;
> +    }
> +
> +    /* Do no allow accesses that span across multiple entries. */
> +    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) + len > PCI_MSIX_ENTRY_SIZE )
> +    {
> +        gdprintk(XENLOG_ERR,
> +                 "%04x:%02x:%02x.%u: MSI-X access crosses entry boundary\n",
> +                 seg, bus, slot, func);
> +        return -EINVAL;
> +    }
> +
> +    /*
> +     * Only allow 64b accesses to the low message address field.
> +     *
> +     * NB: this is more restrictive than the specification, that allows 64b
> +     * accesses to other fields under certain circumstances, so this check and
> +     * the code will have to be fixed in order to fully comply with the
> +     * specification.
> +     */
> +    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) != 0 && len != 4 )
> +    {
> +        gdprintk(XENLOG_ERR,
> +                 "%04x:%02x:%02x.%u: 64bit MSI-X table access to 32bit field"
> +                 " (offset: %#lx len: %u)\n", seg, bus, slot, func,
> +                 addr & (PCI_MSIX_ENTRY_SIZE - 1), len);
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}

So you allow mis-aligned accesses, but you disallow 8-byte ones
to the upper half of an entry? I think both aspects need to be got
right from the beginning, the more that you BUG() in the switch()es
further down in such cases.

> +static struct vpci_msix_entry *vpci_msix_get_entry(struct vpci_msix *msix,
> +                                                   unsigned long addr)
> +{
> +    return &msix->entries[(addr - msix->addr) / PCI_MSIX_ENTRY_SIZE];
> +}
> +
> +static int vpci_msix_table_read(struct vcpu *v, unsigned long addr,
> +                                unsigned int len, unsigned long *data)
> +{
> +    struct vpci_msix *msix;
> +    struct vpci_msix_entry *entry;
> +    unsigned int offset;
> +
> +    vpci_lock(v->domain);
> +    msix = vpci_msix_find(v->domain, addr);
> +    if ( !msix )
> +    {
> +        vpci_unlock(v->domain);
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> +
> +    if ( vpci_msix_access_check(msix->pdev, addr, len) )
> +    {
> +        vpci_unlock(v->domain);
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> +
> +    /* Get the table entry and offset. */
> +    entry = vpci_msix_get_entry(msix, addr);
> +    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
> +
> +    switch ( offset )
> +    {
> +    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
> +        *data = entry->addr;

You're not clipping off the upper 32 bits here - is that reliably
happening elsewhere?

> +        break;
> +    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
> +        *data = entry->addr >> 32;
> +        break;
> +    case PCI_MSIX_ENTRY_DATA_OFFSET:
> +        *data = entry->data;
> +        break;
> +    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
> +        *data = entry->masked ? PCI_MSIX_VECTOR_BITMASK : 0;

What about the other 31 bits?

> +static int vpci_msix_table_write(struct vcpu *v, unsigned long addr,
> +                                 unsigned int len, unsigned long data)
> +{
> +    struct vpci_msix *msix;
> +    struct vpci_msix_entry *entry;
> +    unsigned int offset;
> +
> +    vpci_lock(v->domain);
> +    msix = vpci_msix_find(v->domain, addr);
> +    if ( !msix )
> +    {
> +        vpci_unlock(v->domain);
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> +
> +    if ( vpci_msix_access_check(msix->pdev, addr, len) )
> +    {
> +        vpci_unlock(v->domain);
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> +
> +    /* Get the table entry and offset. */
> +    entry = vpci_msix_get_entry(msix, addr);
> +    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
> +
> +    switch ( offset )
> +    {
> +    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
> +        if ( len == 8 )
> +        {
> +            entry->addr = data;
> +            break;
> +        }
> +        entry->addr &= ~GENMASK(31, 0);
> +        entry->addr |= data;
> +        break;
> +    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
> +        entry->addr &= ~GENMASK(63, 32);
> +        entry->addr |= data << 32;
> +        break;
> +    case PCI_MSIX_ENTRY_DATA_OFFSET:
> +        entry->data = data;
> +        break;
> +    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
> +    {
> +        bool new_masked = data & PCI_MSIX_VECTOR_BITMASK;
> +        struct pci_dev *pdev = msix->pdev;
> +        paddr_t table_base =
> +            pdev->vpci->header.bars[pdev->vpci->msix->bir].paddr;

Again simply "msix->bir"?

> +        int rc;
> +
> +        if ( !msix->enabled )
> +        {
> +            entry->masked = new_masked;
> +            break;
> +        }
> +
> +        if ( new_masked != entry->masked && !new_masked )
> +        {
> +            /* Unmasking an entry, update it. */
> +            rc = vpci_msix_enable(&entry->arch, msix->pdev, entry->addr,

And simply "pdev" here?

> +static int vpci_init_msix(struct pci_dev *pdev)
> +{
> +    struct domain *d = pdev->domain;
> +    uint8_t seg = pdev->seg, bus = pdev->bus;
> +    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> +    struct vpci_msix *msix;
> +    unsigned int msix_offset, i, max_entries;
> +    paddr_t msix_paddr;
> +    uint16_t control;
> +    int rc;
> +
> +    msix_offset = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
> +    if ( !msix_offset )
> +        return 0;
> +
> +    if ( !vpci_msix_enabled(pdev->domain) )

This is a non-__init function, so it can't use dom0_msix (I'm saying
this just in case there really is a need to retain those command line
options).

> +    {
> +        xen_vpci_mask_capability(pdev, PCI_CAP_ID_MSIX);
> +        return 0;
> +    }
> +
> +    control = pci_conf_read16(seg, bus, slot, func,
> +                              msix_control_reg(msix_offset));
> +
> +    /* Get the maximum number of vectors the device supports. */
> +    max_entries = msix_table_size(control);
> +    if ( !max_entries )
> +        return 0;

This if() is never going to be true.

> +    msix = xzalloc_bytes(MSIX_SIZE(max_entries));
> +    if ( !msix )
> +        return -ENOMEM;
> +
> +    msix->max_entries = max_entries;
> +    msix->pdev = pdev;
> +
> +    /* Find the MSI-X table address. */
> +    msix->offset = pci_conf_read32(seg, bus, slot, func,
> +                                   msix_table_offset_reg(msix_offset));
> +    msix->bir = msix->offset & PCI_MSIX_BIRMASK;
> +    msix->offset &= ~PCI_MSIX_BIRMASK;
> +
> +    ASSERT(pdev->vpci->header.bars[msix->bir].type == VPCI_BAR_MEM ||
> +           pdev->vpci->header.bars[msix->bir].type == VPCI_BAR_MEM64_LO);
> +    msix->addr = pdev->vpci->header.bars[msix->bir].mapped_addr + msix->offset;
> +    msix_paddr = pdev->vpci->header.bars[msix->bir].paddr + msix->offset;

I can't seem to find where these addresses get updated in case
the BARs are being relocated by the Dom0 kernel.

> +    for ( i = 0; i < msix->max_entries; i++)
> +    {
> +        msix->entries[i].masked = true;
> +        msix->entries[i].nr = i;
> +        vpci_msix_arch_init(&msix->entries[i].arch);
> +    }
> +
> +    if ( list_empty(&d->arch.hvm_domain.msix_tables) )
> +        register_mmio_handler(d, &vpci_msix_table_ops);
> +
> +    list_add(&msix->next, &d->arch.hvm_domain.msix_tables);
> +
> +    rc = xen_vpci_add_register(pdev, vpci_msix_control_read,
> +                               vpci_msix_control_write,
> +                               msix_control_reg(msix_offset), 2, msix);
> +    if ( rc )
> +    {
> +        dprintk(XENLOG_ERR,
> +                "%04x:%02x:%02x.%u: failed to add handler for MSI-X control: %d\n",
> +                seg, bus, slot, func, rc);
> +        goto error;
> +    }
> +
> +    if ( pdev->vpci->header.command & PCI_COMMAND_MEMORY )
> +    {
> +        /* Unmap this memory from the guest. */
> +        rc = modify_mmio(pdev->domain, _gfn(PFN_DOWN(msix->addr)),
> +                         _mfn(PFN_DOWN(msix_paddr)),
> +                         PFN_UP(msix->max_entries * PCI_MSIX_ENTRY_SIZE),
> +                         false);
> +        if ( rc )
> +        {
> +            dprintk(XENLOG_ERR,
> +                    "%04x:%02x:%02x.%u: unable to unmap MSI-X BAR region: %d\n",
> +                    seg, bus, slot, func, rc);
> +            goto error;
> +        }
> +    }

Why is this unmapping conditional upon PCI_COMMAND_MEMORY?

> +static void vpci_dump_msix(unsigned char key)
> +{
> +    struct domain *d;
> +    struct pci_dev *pdev;
> +
> +    printk("Guest MSI-X information:\n");
> +
> +    for_each_domain ( d )
> +    {
> +        if ( !has_vpci(d) )
> +            continue;
> +
> +        vpci_lock(d);

Dump handlers, even if there are existing examples to the contrary,
should only try-lock any locks they mean to hold (and not dump
anything if they can't get hold of the lock).

> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -112,6 +112,33 @@ struct vpci {
>          /* Arch-specific data. */
>          struct vpci_arch_msi arch;
>      } *msi;
> +
> +    /* MSI-X data. */
> +    struct vpci_msix {
> +        struct pci_dev *pdev;
> +        /* Maximum number of vectors supported by the device. */
> +        unsigned int max_entries;
> +        /* MSI-X table offset. */
> +        unsigned int offset;
> +        /* MSI-X table BIR. */
> +        unsigned int bir;
> +        /* Table addr. */
> +        paddr_t addr;
> +        /* MSI-X enabled? */
> +        bool enabled;
> +        /* Masked? */
> +        bool masked;
> +        /* List link. */
> +        struct list_head next;
> +        /* Entries. */
> +        struct vpci_msix_entry {
> +                unsigned int nr;
> +                uint64_t addr;
> +                uint32_t data;
> +                bool masked;
> +                struct vpci_arch_msix_entry arch;

Indentation.

Jan
Roger Pau Monné June 28, 2017, 3:29 p.m. UTC | #2
On Mon, May 29, 2017 at 07:29:29AM -0600, Jan Beulich wrote:
> >>> On 27.04.17 at 16:35, <roger.pau@citrix.com> wrote:
> > +static int vpci_msix_control_write(struct pci_dev *pdev, unsigned int reg,
> > +                                   union vpci_val val, void *data)
> > +{
> > +    uint8_t seg = pdev->seg, bus = pdev->bus;
> > +    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> > +    paddr_t table_base = pdev->vpci->header.bars[pdev->vpci->msix->bir].paddr;
> > +    struct vpci_msix *msix = data;
> 
> Wouldn't you better use this also to obtain the array index one line
> earlier?

Yes.

> > +    bool new_masked, new_enabled;
> > +    unsigned int i;
> > +    uint32_t data32;
> > +    int rc;
> > +
> > +    new_masked = val.word & PCI_MSIX_FLAGS_MASKALL;
> > +    new_enabled = val.word & PCI_MSIX_FLAGS_ENABLE;
> > +
> > +    if ( new_enabled != msix->enabled && new_enabled )
> 
>     if ( !msix->enabled && new_enabled )
> 
> would likely be easier to read (similar for the "else if" below).

Ack.

> > +    {
> > +        /* MSI-X enabled. */
> > +        for ( i = 0; i < msix->max_entries; i++ )
> > +        {
> > +            if ( msix->entries[i].masked )
> > +                continue;
> > +
> > +            rc = vpci_msix_enable(&msix->entries[i].arch, pdev,
> > +                                  msix->entries[i].addr, msix->entries[i].data,
> > +                                  msix->entries[i].nr, table_base);
> > +            if ( rc )
> > +            {
> > +                gdprintk(XENLOG_ERR,
> > +                         "%04x:%02x:%02x.%u: unable to update entry %u: %d\n",
> > +                         seg, bus, slot, func, i, rc);
> > +                return rc;
> > +            }
> > +
> > +            vpci_msix_mask(&msix->entries[i].arch, false);
> > +        }
> > +    }
> > +    else if ( new_enabled != msix->enabled && !new_enabled )
> > +    {
> > +        /* MSI-X disabled. */
> > +        for ( i = 0; i < msix->max_entries; i++ )
> > +        {
> > +            rc = vpci_msix_disable(&msix->entries[i].arch);
> > +            if ( rc )
> > +            {
> > +                gdprintk(XENLOG_ERR,
> > +                         "%04x:%02x:%02x.%u: unable to disable entry %u: %d\n",
> > +                         seg, bus, slot, func, i, rc);
> > +                return rc;
> > +            }
> > +        }
> > +    }
> > +
> > +    data32 = val.word;
> > +    if ( (new_enabled != msix->enabled || new_masked != msix->masked) &&
> > +         pci_msi_conf_write_intercept(pdev, reg, 2, &data32) >= 0 )
> > +        pci_conf_write16(seg, bus, slot, func, reg, data32);
> 
> What's the intermediate variable "data32" good for here? Afaict you
> could use val.word in its stead.

Yes, that's seems better.

> > +static struct vpci_msix *vpci_msix_find(struct domain *d, unsigned long addr)
> > +{
> > +    struct vpci_msix *msix;
> > +
> > +    ASSERT(vpci_locked(d));
> > +    list_for_each_entry ( msix,  &d->arch.hvm_domain.msix_tables, next )
> > +        if ( msix->pdev->vpci->header.command & PCI_COMMAND_MEMORY &&
> 
> Please parenthesize & within &&.

Done.

> > +             addr >= msix->addr &&
> > +             addr < msix->addr + msix->max_entries * PCI_MSIX_ENTRY_SIZE )
> > +            return msix;
> > +
> > +    return NULL;
> > +}
> 
> Looking ahead I'm getting the impression that you only allow
> accesses to the MSI-X table entries, yet in vpci_modify_bars() you
> (correctly) prevent mapping entire pages. While most other
> registers are disallowed from sharing a page with the table, the PBA
> is specifically named as an exception. Hence you need to support
> at least reads from the entire range.

That's right, I've added support for handling the PBA also as a direct
read to the real PBA in hardware. To simplify this, in the new version
I'm always trapping accesses to the PBA (I don't think it's worth
checking whether it shares a page with the vector table or not).

> > +static int vpci_msix_table_accept(struct vcpu *v, unsigned long addr)
> > +{
> > +    int found;
> > +
> > +    vpci_lock(v->domain);
> > +    found = !!vpci_msix_find(v->domain, addr);
> 
> At the risk of repeating a comment I gave on an earlier patch: Using
> "bool" for "found" allows you to avoid the !! .

Done.

> > +static int vpci_msix_access_check(struct pci_dev *pdev, unsigned long addr,
> > +                                  unsigned int len)
> > +{
> > +    uint8_t seg = pdev->seg, bus = pdev->bus;
> > +    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> > +
> > +
> 
> No double blank lines please.

Done.

> > +    /* Only allow 32/64b accesses. */
> > +    if ( len != 4 && len != 8 )
> > +    {
> > +        gdprintk(XENLOG_ERR,
> > +                 "%04x:%02x:%02x.%u: invalid MSI-X table access size: %u\n",
> > +                 seg, bus, slot, func, len);
> > +        return -EINVAL;
> > +    }
> > +
> > +    /* Do no allow accesses that span across multiple entries. */
> > +    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) + len > PCI_MSIX_ENTRY_SIZE )
> > +    {
> > +        gdprintk(XENLOG_ERR,
> > +                 "%04x:%02x:%02x.%u: MSI-X access crosses entry boundary\n",
> > +                 seg, bus, slot, func);
> > +        return -EINVAL;
> > +    }
> > +
> > +    /*
> > +     * Only allow 64b accesses to the low message address field.
> > +     *
> > +     * NB: this is more restrictive than the specification, that allows 64b
> > +     * accesses to other fields under certain circumstances, so this check and
> > +     * the code will have to be fixed in order to fully comply with the
> > +     * specification.
> > +     */
> > +    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) != 0 && len != 4 )
> > +    {
> > +        gdprintk(XENLOG_ERR,
> > +                 "%04x:%02x:%02x.%u: 64bit MSI-X table access to 32bit field"
> > +                 " (offset: %#lx len: %u)\n", seg, bus, slot, func,
> > +                 addr & (PCI_MSIX_ENTRY_SIZE - 1), len);
> > +        return -EINVAL;
> > +    }
> > +
> > +    return 0;
> > +}
> 
> So you allow mis-aligned accesses, but you disallow 8-byte ones
> to the upper half of an entry? I think both aspects need to be got
> right from the beginning, the more that you BUG() in the switch()es
> further down in such cases.

I've now fixed this to comply with the spec, that only allows aligned
32/64b accesses.

I'm allowing 32/64b read accesses to any fields, and write accesses to
any fields except for 64b write accesses to the message data and
vector control fields, unless the vector is masked (or MSI-X is not
globally enabled). That matches the restrictions detailed in the PCI
3.0 spec AFAICT.

> > +static struct vpci_msix_entry *vpci_msix_get_entry(struct vpci_msix *msix,
> > +                                                   unsigned long addr)
> > +{
> > +    return &msix->entries[(addr - msix->addr) / PCI_MSIX_ENTRY_SIZE];
> > +}
> > +
> > +static int vpci_msix_table_read(struct vcpu *v, unsigned long addr,
> > +                                unsigned int len, unsigned long *data)
> > +{
> > +    struct vpci_msix *msix;
> > +    struct vpci_msix_entry *entry;
> > +    unsigned int offset;
> > +
> > +    vpci_lock(v->domain);
> > +    msix = vpci_msix_find(v->domain, addr);
> > +    if ( !msix )
> > +    {
> > +        vpci_unlock(v->domain);
> > +        return X86EMUL_UNHANDLEABLE;
> > +    }
> > +
> > +    if ( vpci_msix_access_check(msix->pdev, addr, len) )
> > +    {
> > +        vpci_unlock(v->domain);
> > +        return X86EMUL_UNHANDLEABLE;
> > +    }
> > +
> > +    /* Get the table entry and offset. */
> > +    entry = vpci_msix_get_entry(msix, addr);
> > +    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
> > +
> > +    switch ( offset )
> > +    {
> > +    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
> > +        *data = entry->addr;
> 
> You're not clipping off the upper 32 bits here - is that reliably
> happening elsewhere?

This could be a 64b access, so I though there was no need to
differentiate between 32/64b in this case, since the underlying
handlers will already clip it when needed. I could switch it to:

if ( len == 8 )
    *data = entry->addr;
else
    *data = (uint32_t)entry->addr;

I don't think it's happening elsewhere, but I will try to check. Is
that really an issue?

> > +        break;
> > +    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
> > +        *data = entry->addr >> 32;
> > +        break;
> > +    case PCI_MSIX_ENTRY_DATA_OFFSET:
> > +        *data = entry->data;
> > +        break;
> > +    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
> > +        *data = entry->masked ? PCI_MSIX_VECTOR_BITMASK : 0;
> 
> What about the other 31 bits?

They are all marked as "reserved" in my copy of the PCI spec.

> > +static int vpci_msix_table_write(struct vcpu *v, unsigned long addr,
> > +                                 unsigned int len, unsigned long data)
> > +{
> > +    struct vpci_msix *msix;
> > +    struct vpci_msix_entry *entry;
> > +    unsigned int offset;
> > +
> > +    vpci_lock(v->domain);
> > +    msix = vpci_msix_find(v->domain, addr);
> > +    if ( !msix )
> > +    {
> > +        vpci_unlock(v->domain);
> > +        return X86EMUL_UNHANDLEABLE;
> > +    }
> > +
> > +    if ( vpci_msix_access_check(msix->pdev, addr, len) )
> > +    {
> > +        vpci_unlock(v->domain);
> > +        return X86EMUL_UNHANDLEABLE;
> > +    }
> > +
> > +    /* Get the table entry and offset. */
> > +    entry = vpci_msix_get_entry(msix, addr);
> > +    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
> > +
> > +    switch ( offset )
> > +    {
> > +    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
> > +        if ( len == 8 )
> > +        {
> > +            entry->addr = data;
> > +            break;
> > +        }
> > +        entry->addr &= ~GENMASK(31, 0);
> > +        entry->addr |= data;
> > +        break;
> > +    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
> > +        entry->addr &= ~GENMASK(63, 32);
> > +        entry->addr |= data << 32;
> > +        break;
> > +    case PCI_MSIX_ENTRY_DATA_OFFSET:
> > +        entry->data = data;
> > +        break;
> > +    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
> > +    {
> > +        bool new_masked = data & PCI_MSIX_VECTOR_BITMASK;
> > +        struct pci_dev *pdev = msix->pdev;
> > +        paddr_t table_base =
> > +            pdev->vpci->header.bars[pdev->vpci->msix->bir].paddr;
> 
> Again simply "msix->bir"?

Yes.

> > +        int rc;
> > +
> > +        if ( !msix->enabled )
> > +        {
> > +            entry->masked = new_masked;
> > +            break;
> > +        }
> > +
> > +        if ( new_masked != entry->masked && !new_masked )
> > +        {
> > +            /* Unmasking an entry, update it. */
> > +            rc = vpci_msix_enable(&entry->arch, msix->pdev, entry->addr,
> 
> And simply "pdev" here?

Done.

> > +static int vpci_init_msix(struct pci_dev *pdev)
> > +{
> > +    struct domain *d = pdev->domain;
> > +    uint8_t seg = pdev->seg, bus = pdev->bus;
> > +    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> > +    struct vpci_msix *msix;
> > +    unsigned int msix_offset, i, max_entries;
> > +    paddr_t msix_paddr;
> > +    uint16_t control;
> > +    int rc;
> > +
> > +    msix_offset = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
> > +    if ( !msix_offset )
> > +        return 0;
> > +
> > +    if ( !vpci_msix_enabled(pdev->domain) )
> 
> This is a non-__init function, so it can't use dom0_msix (I'm saying
> this just in case there really is a need to retain those command line
> options).

No, I've just removed them, and obviously this call is now gone.

> > +    {
> > +        xen_vpci_mask_capability(pdev, PCI_CAP_ID_MSIX);
> > +        return 0;
> > +    }
> > +
> > +    control = pci_conf_read16(seg, bus, slot, func,
> > +                              msix_control_reg(msix_offset));
> > +
> > +    /* Get the maximum number of vectors the device supports. */
> > +    max_entries = msix_table_size(control);
> > +    if ( !max_entries )
> > +        return 0;
> 
> This if() is never going to be true.

Right. 0 means 1 vector.

> > +    msix = xzalloc_bytes(MSIX_SIZE(max_entries));
> > +    if ( !msix )
> > +        return -ENOMEM;
> > +
> > +    msix->max_entries = max_entries;
> > +    msix->pdev = pdev;
> > +
> > +    /* Find the MSI-X table address. */
> > +    msix->offset = pci_conf_read32(seg, bus, slot, func,
> > +                                   msix_table_offset_reg(msix_offset));
> > +    msix->bir = msix->offset & PCI_MSIX_BIRMASK;
> > +    msix->offset &= ~PCI_MSIX_BIRMASK;
> > +
> > +    ASSERT(pdev->vpci->header.bars[msix->bir].type == VPCI_BAR_MEM ||
> > +           pdev->vpci->header.bars[msix->bir].type == VPCI_BAR_MEM64_LO);
> > +    msix->addr = pdev->vpci->header.bars[msix->bir].mapped_addr + msix->offset;
> > +    msix_paddr = pdev->vpci->header.bars[msix->bir].paddr + msix->offset;
> 
> I can't seem to find where these addresses get updated in case
> the BARs are being relocated by the Dom0 kernel.

Is that something that Xen should support? I got the impression that
the current MSI-X code in Xen didn't support relocating the BARs that
contain the MSI-X table (but maybe I got it wrong).

> > +    for ( i = 0; i < msix->max_entries; i++)
> > +    {
> > +        msix->entries[i].masked = true;
> > +        msix->entries[i].nr = i;
> > +        vpci_msix_arch_init(&msix->entries[i].arch);
> > +    }
> > +
> > +    if ( list_empty(&d->arch.hvm_domain.msix_tables) )
> > +        register_mmio_handler(d, &vpci_msix_table_ops);
> > +
> > +    list_add(&msix->next, &d->arch.hvm_domain.msix_tables);
> > +
> > +    rc = xen_vpci_add_register(pdev, vpci_msix_control_read,
> > +                               vpci_msix_control_write,
> > +                               msix_control_reg(msix_offset), 2, msix);
> > +    if ( rc )
> > +    {
> > +        dprintk(XENLOG_ERR,
> > +                "%04x:%02x:%02x.%u: failed to add handler for MSI-X control: %d\n",
> > +                seg, bus, slot, func, rc);
> > +        goto error;
> > +    }
> > +
> > +    if ( pdev->vpci->header.command & PCI_COMMAND_MEMORY )
> > +    {
> > +        /* Unmap this memory from the guest. */
> > +        rc = modify_mmio(pdev->domain, _gfn(PFN_DOWN(msix->addr)),
> > +                         _mfn(PFN_DOWN(msix_paddr)),
> > +                         PFN_UP(msix->max_entries * PCI_MSIX_ENTRY_SIZE),
> > +                         false);
> > +        if ( rc )
> > +        {
> > +            dprintk(XENLOG_ERR,
> > +                    "%04x:%02x:%02x.%u: unable to unmap MSI-X BAR region: %d\n",
> > +                    seg, bus, slot, func, rc);
> > +            goto error;
> > +        }
> > +    }
> 
> Why is this unmapping conditional upon PCI_COMMAND_MEMORY?

Well, if memory decoding is not enabled the BAR is not mapped in the
first place. I've now reworked this a little bit in the newer version,
so it's handled in the header code instead.

> > +static void vpci_dump_msix(unsigned char key)
> > +{
> > +    struct domain *d;
> > +    struct pci_dev *pdev;
> > +
> > +    printk("Guest MSI-X information:\n");
> > +
> > +    for_each_domain ( d )
> > +    {
> > +        if ( !has_vpci(d) )
> > +            continue;
> > +
> > +        vpci_lock(d);
> 
> Dump handlers, even if there are existing examples to the contrary,
> should only try-lock any locks they mean to hold (and not dump
> anything if they can't get hold of the lock).

OK, will have to change the MSI dump also. Since you commented on the
MSI dump handler, I guess this output should also be appended to the
'M' key?

> > --- a/xen/include/xen/vpci.h
> > +++ b/xen/include/xen/vpci.h
> > @@ -112,6 +112,33 @@ struct vpci {
> >          /* Arch-specific data. */
> >          struct vpci_arch_msi arch;
> >      } *msi;
> > +
> > +    /* MSI-X data. */
> > +    struct vpci_msix {
> > +        struct pci_dev *pdev;
> > +        /* Maximum number of vectors supported by the device. */
> > +        unsigned int max_entries;
> > +        /* MSI-X table offset. */
> > +        unsigned int offset;
> > +        /* MSI-X table BIR. */
> > +        unsigned int bir;
> > +        /* Table addr. */
> > +        paddr_t addr;
> > +        /* MSI-X enabled? */
> > +        bool enabled;
> > +        /* Masked? */
> > +        bool masked;
> > +        /* List link. */
> > +        struct list_head next;
> > +        /* Entries. */
> > +        struct vpci_msix_entry {
> > +                unsigned int nr;
> > +                uint64_t addr;
> > +                uint32_t data;
> > +                bool masked;
> > +                struct vpci_arch_msix_entry arch;
> 
> Indentation.

Fixed.

As usual, thank you very much for the detailed review.

Roger.
Jan Beulich June 29, 2017, 6:19 a.m. UTC | #3
>>> Roger Pau Monne <roger.pau@citrix.com> 06/28/17 5:37 PM >>>
>On Mon, May 29, 2017 at 07:29:29AM -0600, Jan Beulich wrote:
>> >>> On 27.04.17 at 16:35, <roger.pau@citrix.com> wrote:
>> > +    switch ( offset )
>> > +    {
>> > +    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
>> > +        *data = entry->addr;
>> 
>> You're not clipping off the upper 32 bits here - is that reliably
>> happening elsewhere?
>
>This could be a 64b access, so I though there was no need to
>differentiate between 32/64b in this case, since the underlying
>handlers will already clip it when needed. I could switch it to:
>
>if ( len == 8 )
    >*data = entry->addr;
>else
    >*data = (uint32_t)entry->addr;
>
>I don't think it's happening elsewhere, but I will try to check. Is
>that really an issue?

I would hope it isn't, but I'm not 100% certain, hence my request to at
least check. I agree that it would be nice to not have to do it here. All other
similar read routines I've looked at appear to leave the upper half as zero,
albeit that's always a result of the way the code is written instead of an
explicit truncation as would be required here.

>> > +        break;
>> > +    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
>> > +        *data = entry->addr >> 32;
>> > +        break;
>> > +    case PCI_MSIX_ENTRY_DATA_OFFSET:
>> > +        *data = entry->data;
>> > +        break;
>> > +    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
>> > +        *data = entry->masked ? PCI_MSIX_VECTOR_BITMASK : 0;
>> 
>> What about the other 31 bits?
>
>They are all marked as "reserved" in my copy of the PCI spec.

Indeed, but it's at least worth considering to pass through the values (as it's
Dom0 we're talking about here), and having a comment giving a brief explanation
for the choice.

>> > +    /* Find the MSI-X table address. */
>> > +    msix->offset = pci_conf_read32(seg, bus, slot, func,
>> > +                                   msix_table_offset_reg(msix_offset));
>> > +    msix->bir = msix->offset & PCI_MSIX_BIRMASK;
>> > +    msix->offset &= ~PCI_MSIX_BIRMASK;
>> > +
>> > +    ASSERT(pdev->vpci->header.bars[msix->bir].type == VPCI_BAR_MEM ||
>> > +           pdev->vpci->header.bars[msix->bir].type == VPCI_BAR_MEM64_LO);
>> > +    msix->addr = pdev->vpci->header.bars[msix->bir].mapped_addr + msix->offset;
>> > +    msix_paddr = pdev->vpci->header.bars[msix->bir].paddr + msix->offset;
>> 
>> I can't seem to find where these addresses get updated in case
>> the BARs are being relocated by the Dom0 kernel.
>
>Is that something that Xen should support? I got the impression that
>the current MSI-X code in Xen didn't support relocating the BARs that
>contain the MSI-X table (but maybe I got it wrong).

Well, the current expectation is that Dom0 would do BAR relocation prior to any MSI-X
setup. But since you aim at maximum transparency from PVH Dom0's perspective, I'm
not certain latching the addresses here once and for all is sufficient.

>> > +static void vpci_dump_msix(unsigned char key)
>> > +{
>> > +    struct domain *d;
>> > +    struct pci_dev *pdev;
>> > +
>> > +    printk("Guest MSI-X information:\n");
>> > +
>> > +    for_each_domain ( d )
>> > +    {
>> > +        if ( !has_vpci(d) )
>> > +            continue;
>> > +
>> > +        vpci_lock(d);
>> 
>> Dump handlers, even if there are existing examples to the contrary,
>> should only try-lock any locks they mean to hold (and not dump
>> anything if they can't get hold of the lock).
>
>OK, will have to change the MSI dump also. Since you commented on the
>MSI dump handler, I guess this output should also be appended to the
>'M' key?

Yes, that indeed was the implication.

Jan
Roger Pau Monné June 29, 2017, 8:25 a.m. UTC | #4
On Thu, Jun 29, 2017 at 12:19:39AM -0600, Jan Beulich wrote:
> >>> Roger Pau Monne <roger.pau@citrix.com> 06/28/17 5:37 PM >>>
> >On Mon, May 29, 2017 at 07:29:29AM -0600, Jan Beulich wrote:
> >> >>> On 27.04.17 at 16:35, <roger.pau@citrix.com> wrote:
> >> > +    switch ( offset )
> >> > +    {
> >> > +    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
> >> > +        *data = entry->addr;
> >> 
> >> You're not clipping off the upper 32 bits here - is that reliably
> >> happening elsewhere?
> >
> >This could be a 64b access, so I though there was no need to
> >differentiate between 32/64b in this case, since the underlying
> >handlers will already clip it when needed. I could switch it to:
> >
> >if ( len == 8 )
>     >*data = entry->addr;
> >else
>     >*data = (uint32_t)entry->addr;
> >
> >I don't think it's happening elsewhere, but I will try to check. Is
> >that really an issue?
> 
> I would hope it isn't, but I'm not 100% certain, hence my request to at
> least check. I agree that it would be nice to not have to do it here. All other
> similar read routines I've looked at appear to leave the upper half as zero,
> albeit that's always a result of the way the code is written instead of an
> explicit truncation as would be required here.

Ack, I will add a comment to clarify this.

> >> > +        break;
> >> > +    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
> >> > +        *data = entry->addr >> 32;
> >> > +        break;
> >> > +    case PCI_MSIX_ENTRY_DATA_OFFSET:
> >> > +        *data = entry->data;
> >> > +        break;
> >> > +    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
> >> > +        *data = entry->masked ? PCI_MSIX_VECTOR_BITMASK : 0;
> >> 
> >> What about the other 31 bits?
> >
> >They are all marked as "reserved" in my copy of the PCI spec.
> 
> Indeed, but it's at least worth considering to pass through the values (as it's
> Dom0 we're talking about here), and having a comment giving a brief explanation
> for the choice.

I can certainly do that, but I don't think we should passthrough them
in the write handler. I'm worried that then Dom0 would see an
incoherent value if it attempts to modify some of the bits marked as
reserved.

IMHO it seems better to simply hide them until Xen knows how to deal
with them.

> >> > +    /* Find the MSI-X table address. */
> >> > +    msix->offset = pci_conf_read32(seg, bus, slot, func,
> >> > +                                   msix_table_offset_reg(msix_offset));
> >> > +    msix->bir = msix->offset & PCI_MSIX_BIRMASK;
> >> > +    msix->offset &= ~PCI_MSIX_BIRMASK;
> >> > +
> >> > +    ASSERT(pdev->vpci->header.bars[msix->bir].type == VPCI_BAR_MEM ||
> >> > +           pdev->vpci->header.bars[msix->bir].type == VPCI_BAR_MEM64_LO);
> >> > +    msix->addr = pdev->vpci->header.bars[msix->bir].mapped_addr + msix->offset;
> >> > +    msix_paddr = pdev->vpci->header.bars[msix->bir].paddr + msix->offset;
> >> 
> >> I can't seem to find where these addresses get updated in case
> >> the BARs are being relocated by the Dom0 kernel.
> >
> >Is that something that Xen should support? I got the impression that
> >the current MSI-X code in Xen didn't support relocating the BARs that
> >contain the MSI-X table (but maybe I got it wrong).
> 
> Well, the current expectation is that Dom0 would do BAR relocation prior to any MSI-X
> setup. But since you aim at maximum transparency from PVH Dom0's perspective, I'm
> not certain latching the addresses here once and for all is sufficient.

OK, I can implement something a little bit more flexible.

Thanks, Roger.
diff mbox

Patch

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 73013aea14..7c1684511c 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -684,6 +684,13 @@  enabled.
 Enable or disable (using the `no-` prefix) the MSI emulation inside of
 Xen for a PVH Dom0. Note that this option has no effect on a PV Dom0.
 
+> `msix`
+
+> Default: `true`
+
+Enable or disable (using the `no-` prefix) the MSI-X emulation inside of
+Xen for a PVH Dom0. Note that this option has no effect on a PV Dom0.
+
 ### dtuart (ARM)
 > `= path [:options]`
 
diff --git a/xen/arch/x86/dom0_build.c b/xen/arch/x86/dom0_build.c
index 01afcf6215..3996d9dd12 100644
--- a/xen/arch/x86/dom0_build.c
+++ b/xen/arch/x86/dom0_build.c
@@ -177,6 +177,7 @@  bool __initdata opt_dom0_shadow;
 #endif
 bool __initdata dom0_pvh;
 bool __initdata dom0_msi = true;
+bool __initdata dom0_msix = true;
 
 /*
  * List of parameters that affect Dom0 creation:
@@ -184,6 +185,7 @@  bool __initdata dom0_msi = true;
  *  - pvh               Create a PVHv2 Dom0.
  *  - shadow            Use shadow paging for Dom0.
  *  - msi               MSI functionality.
+ *  - msix              MSI-X functionality.
  */
 static void __init parse_dom0_param(char *s)
 {
@@ -207,6 +209,8 @@  static void __init parse_dom0_param(char *s)
 #endif
         else if ( !strcmp(s, "msi") )
             dom0_msi = enabled;
+        else if ( !strcmp(s, "msix") )
+            dom0_msix = enabled;
 
         s = ss + 1;
     } while ( ss );
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index ef3ad2a615..3a3296ffe7 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -614,6 +614,7 @@  int hvm_domain_initialise(struct domain *d)
     INIT_LIST_HEAD(&d->arch.hvm_domain.write_map.list);
     INIT_LIST_HEAD(&d->arch.hvm_domain.g2m_ioport_list);
     INIT_LIST_HEAD(&d->arch.hvm_domain.ecam_regions);
+    INIT_LIST_HEAD(&d->arch.hvm_domain.msix_tables);
 
     hvm_init_cacheattr_region_list(d);
 
diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index f23b2f43d6..00a15b862f 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -643,15 +643,15 @@  static unsigned int msi_flags(uint16_t data, uint64_t addr)
            (trig_mode << GFLAGS_SHIFT_TRG_MODE);
 }
 
-void vpci_msi_mask(struct vpci_arch_msi *arch, unsigned int entry, bool mask)
+static void vpci_mask_pirq(int pirq, bool mask)
 {
     struct pirq *pinfo;
     struct irq_desc *desc;
     unsigned long flags;
     int irq;
 
-    ASSERT(arch->pirq != -1);
-    pinfo = pirq_info(current->domain, arch->pirq + entry);
+    ASSERT(pirq != -1);
+    pinfo = pirq_info(current->domain, pirq);
     ASSERT(pinfo);
 
     irq = pinfo->arch.irq;
@@ -665,6 +665,11 @@  void vpci_msi_mask(struct vpci_arch_msi *arch, unsigned int entry, bool mask)
     spin_unlock_irqrestore(&desc->lock, flags);
 }
 
+void vpci_msi_mask(struct vpci_arch_msi *arch, unsigned int entry, bool mask)
+{
+    vpci_mask_pirq(arch->pirq + entry, mask);
+}
+
 int vpci_msi_enable(struct vpci_arch_msi *arch, struct pci_dev *pdev,
                     uint64_t address, uint32_t data, unsigned int vectors)
 {
@@ -763,3 +768,106 @@  void vpci_msi_arch_print(struct vpci_arch_msi *arch)
     printk("PIRQ: %d\n", arch->pirq);
 }
 
+void vpci_msix_mask(struct vpci_arch_msix_entry *arch, bool mask)
+{
+    vpci_mask_pirq(arch->pirq, mask);
+}
+
+int vpci_msix_enable(struct vpci_arch_msix_entry *arch, struct pci_dev *pdev,
+                     uint64_t address, uint32_t data, unsigned int entry_nr,
+                     paddr_t table_base)
+{
+    struct domain *d = pdev->domain;
+    xen_domctl_bind_pt_irq_t bind = {
+        .hvm_domid = DOMID_SELF,
+        .irq_type = PT_IRQ_TYPE_MSI,
+        .u.msi.gvec = msi_vector(data),
+        .u.msi.gflags = msi_flags(data, address),
+    };
+    int rc;
+
+    if ( arch->pirq == -1 )
+    {
+        struct msi_info msi_info = {
+            .seg = pdev->seg,
+            .bus = pdev->bus,
+            .devfn = pdev->devfn,
+            .table_base = table_base,
+            .entry_nr = entry_nr,
+        };
+        int index = -1;
+
+        /* Map PIRQ. */
+        rc = allocate_and_map_msi_pirq(pdev->domain, &index, &arch->pirq,
+                                       &msi_info);
+        if ( rc )
+        {
+            gdprintk(XENLOG_ERR,
+                     "%04x:%02x:%02x.%u: unable to map MSI-X PIRQ entry %u: %d\n",
+                     pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                     PCI_FUNC(pdev->devfn), entry_nr, rc);
+            return rc;
+        }
+    }
+
+    bind.machine_irq = arch->pirq;
+    pcidevs_lock();
+    rc = pt_irq_create_bind(d, &bind);
+    if ( rc )
+    {
+        gdprintk(XENLOG_ERR,
+                 "%04x:%02x:%02x.%u: unable to create MSI-X bind %u: %d\n",
+                 pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                 PCI_FUNC(pdev->devfn), entry_nr, rc);
+        spin_lock(&pdev->domain->event_lock);
+        unmap_domain_pirq(pdev->domain, arch->pirq);
+        spin_unlock(&pdev->domain->event_lock);
+        arch->pirq = -1;
+    }
+    pcidevs_unlock();
+
+    return 0;
+}
+
+int vpci_msix_disable(struct vpci_arch_msix_entry *arch)
+{
+    xen_domctl_bind_pt_irq_t bind = {
+        .hvm_domid = DOMID_SELF,
+        .irq_type = PT_IRQ_TYPE_MSI,
+        .machine_irq = arch->pirq,
+    };
+    int rc;
+
+    if ( arch->pirq == -1 )
+        return 0;
+
+    pcidevs_lock();
+    rc = pt_irq_destroy_bind(current->domain, &bind);
+    if ( rc )
+    {
+        pcidevs_unlock();
+        return rc;
+    }
+
+    spin_lock(&current->domain->event_lock);
+    unmap_domain_pirq(current->domain, arch->pirq);
+    spin_unlock(&current->domain->event_lock);
+    pcidevs_unlock();
+
+    arch->pirq = -1;
+
+    return 0;
+}
+
+int vpci_msix_arch_init(struct vpci_arch_msix_entry *arch)
+{
+    arch->pirq = -1;
+    return 0;
+}
+
+void vpci_msix_arch_print(struct vpci_arch_msix_entry *arch)
+{
+    /* No newline, it will be added by the generic debug handler. */
+    printk("pirq: %d", arch->pirq);
+}
+
diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
index ef4fc6caf3..55398d4428 100644
--- a/xen/drivers/vpci/Makefile
+++ b/xen/drivers/vpci/Makefile
@@ -1 +1 @@ 
-obj-y += vpci.o header.o capabilities.o msi.o
+obj-y += vpci.o header.o capabilities.o msi.o msix.o
diff --git a/xen/drivers/vpci/capabilities.c b/xen/drivers/vpci/capabilities.c
index ad9f45c2e1..7166ccb502 100644
--- a/xen/drivers/vpci/capabilities.c
+++ b/xen/drivers/vpci/capabilities.c
@@ -130,21 +130,7 @@  void xen_vpci_mask_capability(struct pci_dev *pdev, uint8_t cap_id)
     }
 }
 
-static int vpci_capabilities_init(struct pci_dev *pdev)
-{
-    int rc;
-
-    rc = vpci_index_capabilities(pdev);
-    if ( rc )
-        return rc;
-
-    /* Mask MSI-X capability until Xen handles it. */
-    xen_vpci_mask_capability(pdev, PCI_CAP_ID_MSIX);
-
-    return 0;
-}
-
-REGISTER_VPCI_INIT(vpci_capabilities_init, true);
+REGISTER_VPCI_INIT(vpci_index_capabilities, true);
 
 /*
  * Local variables:
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 3deec53efd..c03dcdd708 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -32,16 +32,47 @@  static int vpci_modify_bars(struct pci_dev *pdev, const bool map)
         paddr_t gaddr = map ? header->bars[i].gaddr
                             : header->bars[i].mapped_addr;
         paddr_t paddr = header->bars[i].paddr;
+        size_t size = header->bars[i].size;
 
         if ( header->bars[i].type != VPCI_BAR_MEM &&
              header->bars[i].type != VPCI_BAR_MEM64_LO )
             continue;
 
-        rc = modify_mmio(pdev->domain, _gfn(PFN_DOWN(gaddr)),
-                         _mfn(PFN_DOWN(paddr)), PFN_UP(header->bars[i].size),
-                         map);
-        if ( rc )
-            break;
+        if ( pdev->vpci->msix != NULL && pdev->vpci->msix->bir == i )
+        {
+            /* There's an MSI-X table inside of this BAR. */
+            paddr_t msix_gaddr = gaddr + pdev->vpci->msix->offset;
+            paddr_t msix_paddr = paddr + pdev->vpci->msix->offset;
+            size_t msix_size = pdev->vpci->msix->max_entries *
+                               PCI_MSIX_ENTRY_SIZE;
+
+            ASSERT(IS_ALIGNED(msix_gaddr, PAGE_SIZE) &&
+                   IS_ALIGNED(msix_paddr, PAGE_SIZE));
+
+            rc = modify_mmio(pdev->domain, _gfn(PFN_DOWN(gaddr)),
+                             _mfn(PFN_DOWN(paddr)),
+                             PFN_DOWN(msix_paddr - paddr), map);
+            if ( rc )
+                break;
+
+            rc = modify_mmio(pdev->domain,
+                             _gfn(PFN_UP(msix_gaddr + msix_size)),
+                             _mfn(PFN_UP(msix_paddr + msix_size)),
+                             PFN_UP(paddr + size -
+                                    round_pgup(msix_paddr + msix_size)), map);
+            if ( rc )
+                break;
+
+            if ( map )
+                pdev->vpci->msix->addr = msix_gaddr;
+        }
+        else
+        {
+            rc = modify_mmio(pdev->domain, _gfn(PFN_DOWN(gaddr)),
+                             _mfn(PFN_DOWN(paddr)), PFN_UP(size), map);
+            if ( rc )
+                break;
+        }
 
         header->bars[i].mapped_addr = map ? gaddr : 0;
     }
diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c
new file mode 100644
index 0000000000..df45dca917
--- /dev/null
+++ b/xen/drivers/vpci/msix.c
@@ -0,0 +1,487 @@ 
+/*
+ * Handlers for accesses to the MSI-X capability structure and the memory
+ * region.
+ *
+ * Copyright (C) 2017 Citrix Systems R&D
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/sched.h>
+#include <xen/vpci.h>
+#include <asm/msi.h>
+#include <xen/p2m-common.h>
+#include <xen/keyhandler.h>
+
+#define MSIX_SIZE(num) (offsetof(struct vpci_msix, entries[num]))
+
+static int vpci_msix_control_read(struct pci_dev *pdev, unsigned int reg,
+                                  union vpci_val *val, void *data)
+{
+    struct vpci_msix *msix = data;
+
+    val->word = (msix->max_entries - 1) & PCI_MSIX_FLAGS_QSIZE;
+    val->word |= msix->enabled ? PCI_MSIX_FLAGS_ENABLE : 0;
+    val->word |= msix->masked ? PCI_MSIX_FLAGS_MASKALL : 0;
+
+    return 0;
+}
+
+static int vpci_msix_control_write(struct pci_dev *pdev, unsigned int reg,
+                                   union vpci_val val, void *data)
+{
+    uint8_t seg = pdev->seg, bus = pdev->bus;
+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+    paddr_t table_base = pdev->vpci->header.bars[pdev->vpci->msix->bir].paddr;
+    struct vpci_msix *msix = data;
+    bool new_masked, new_enabled;
+    unsigned int i;
+    uint32_t data32;
+    int rc;
+
+    new_masked = val.word & PCI_MSIX_FLAGS_MASKALL;
+    new_enabled = val.word & PCI_MSIX_FLAGS_ENABLE;
+
+    if ( new_enabled != msix->enabled && new_enabled )
+    {
+        /* MSI-X enabled. */
+        for ( i = 0; i < msix->max_entries; i++ )
+        {
+            if ( msix->entries[i].masked )
+                continue;
+
+            rc = vpci_msix_enable(&msix->entries[i].arch, pdev,
+                                  msix->entries[i].addr, msix->entries[i].data,
+                                  msix->entries[i].nr, table_base);
+            if ( rc )
+            {
+                gdprintk(XENLOG_ERR,
+                         "%04x:%02x:%02x.%u: unable to update entry %u: %d\n",
+                         seg, bus, slot, func, i, rc);
+                return rc;
+            }
+
+            vpci_msix_mask(&msix->entries[i].arch, false);
+        }
+    }
+    else if ( new_enabled != msix->enabled && !new_enabled )
+    {
+        /* MSI-X disabled. */
+        for ( i = 0; i < msix->max_entries; i++ )
+        {
+            rc = vpci_msix_disable(&msix->entries[i].arch);
+            if ( rc )
+            {
+                gdprintk(XENLOG_ERR,
+                         "%04x:%02x:%02x.%u: unable to disable entry %u: %d\n",
+                         seg, bus, slot, func, i, rc);
+                return rc;
+            }
+        }
+    }
+
+    data32 = val.word;
+    if ( (new_enabled != msix->enabled || new_masked != msix->masked) &&
+         pci_msi_conf_write_intercept(pdev, reg, 2, &data32) >= 0 )
+        pci_conf_write16(seg, bus, slot, func, reg, data32);
+
+    msix->masked = new_masked;
+    msix->enabled = new_enabled;
+
+    return 0;
+}
+
+static struct vpci_msix *vpci_msix_find(struct domain *d, unsigned long addr)
+{
+    struct vpci_msix *msix;
+
+    ASSERT(vpci_locked(d));
+    list_for_each_entry ( msix,  &d->arch.hvm_domain.msix_tables, next )
+        if ( msix->pdev->vpci->header.command & PCI_COMMAND_MEMORY &&
+             addr >= msix->addr &&
+             addr < msix->addr + msix->max_entries * PCI_MSIX_ENTRY_SIZE )
+            return msix;
+
+    return NULL;
+}
+
+static int vpci_msix_table_accept(struct vcpu *v, unsigned long addr)
+{
+    int found;
+
+    vpci_lock(v->domain);
+    found = !!vpci_msix_find(v->domain, addr);
+    vpci_unlock(v->domain);
+
+    return found;
+}
+
+static int vpci_msix_access_check(struct pci_dev *pdev, unsigned long addr,
+                                  unsigned int len)
+{
+    uint8_t seg = pdev->seg, bus = pdev->bus;
+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+
+
+    /* Only allow 32/64b accesses. */
+    if ( len != 4 && len != 8 )
+    {
+        gdprintk(XENLOG_ERR,
+                 "%04x:%02x:%02x.%u: invalid MSI-X table access size: %u\n",
+                 seg, bus, slot, func, len);
+        return -EINVAL;
+    }
+
+    /* Do no allow accesses that span across multiple entries. */
+    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) + len > PCI_MSIX_ENTRY_SIZE )
+    {
+        gdprintk(XENLOG_ERR,
+                 "%04x:%02x:%02x.%u: MSI-X access crosses entry boundary\n",
+                 seg, bus, slot, func);
+        return -EINVAL;
+    }
+
+    /*
+     * Only allow 64b accesses to the low message address field.
+     *
+     * NB: this is more restrictive than the specification, that allows 64b
+     * accesses to other fields under certain circumstances, so this check and
+     * the code will have to be fixed in order to fully comply with the
+     * specification.
+     */
+    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) != 0 && len != 4 )
+    {
+        gdprintk(XENLOG_ERR,
+                 "%04x:%02x:%02x.%u: 64bit MSI-X table access to 32bit field"
+                 " (offset: %#lx len: %u)\n", seg, bus, slot, func,
+                 addr & (PCI_MSIX_ENTRY_SIZE - 1), len);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static struct vpci_msix_entry *vpci_msix_get_entry(struct vpci_msix *msix,
+                                                   unsigned long addr)
+{
+    return &msix->entries[(addr - msix->addr) / PCI_MSIX_ENTRY_SIZE];
+}
+
+static int vpci_msix_table_read(struct vcpu *v, unsigned long addr,
+                                unsigned int len, unsigned long *data)
+{
+    struct vpci_msix *msix;
+    struct vpci_msix_entry *entry;
+    unsigned int offset;
+
+    vpci_lock(v->domain);
+    msix = vpci_msix_find(v->domain, addr);
+    if ( !msix )
+    {
+        vpci_unlock(v->domain);
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    if ( vpci_msix_access_check(msix->pdev, addr, len) )
+    {
+        vpci_unlock(v->domain);
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    /* Get the table entry and offset. */
+    entry = vpci_msix_get_entry(msix, addr);
+    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
+
+    switch ( offset )
+    {
+    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
+        *data = entry->addr;
+        break;
+    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
+        *data = entry->addr >> 32;
+        break;
+    case PCI_MSIX_ENTRY_DATA_OFFSET:
+        *data = entry->data;
+        break;
+    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
+        *data = entry->masked ? PCI_MSIX_VECTOR_BITMASK : 0;
+        break;
+    default:
+        BUG();
+    }
+    vpci_unlock(v->domain);
+
+    return X86EMUL_OKAY;
+}
+
+static int vpci_msix_table_write(struct vcpu *v, unsigned long addr,
+                                 unsigned int len, unsigned long data)
+{
+    struct vpci_msix *msix;
+    struct vpci_msix_entry *entry;
+    unsigned int offset;
+
+    vpci_lock(v->domain);
+    msix = vpci_msix_find(v->domain, addr);
+    if ( !msix )
+    {
+        vpci_unlock(v->domain);
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    if ( vpci_msix_access_check(msix->pdev, addr, len) )
+    {
+        vpci_unlock(v->domain);
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    /* Get the table entry and offset. */
+    entry = vpci_msix_get_entry(msix, addr);
+    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
+
+    switch ( offset )
+    {
+    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
+        if ( len == 8 )
+        {
+            entry->addr = data;
+            break;
+        }
+        entry->addr &= ~GENMASK(31, 0);
+        entry->addr |= data;
+        break;
+    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
+        entry->addr &= ~GENMASK(63, 32);
+        entry->addr |= data << 32;
+        break;
+    case PCI_MSIX_ENTRY_DATA_OFFSET:
+        entry->data = data;
+        break;
+    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
+    {
+        bool new_masked = data & PCI_MSIX_VECTOR_BITMASK;
+        struct pci_dev *pdev = msix->pdev;
+        paddr_t table_base =
+            pdev->vpci->header.bars[pdev->vpci->msix->bir].paddr;
+        int rc;
+
+        if ( !msix->enabled )
+        {
+            entry->masked = new_masked;
+            break;
+        }
+
+        if ( new_masked != entry->masked && !new_masked )
+        {
+            /* Unmasking an entry, update it. */
+            rc = vpci_msix_enable(&entry->arch, msix->pdev, entry->addr,
+                                  entry->data, entry->nr, table_base);
+            if ( rc )
+            {
+                vpci_unlock(v->domain);
+                gdprintk(XENLOG_ERR,
+                         "%04x:%02x:%02x.%u: unable to update entry %u: %d\n",
+                         pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                         PCI_FUNC(pdev->devfn), entry->nr, rc);
+                return X86EMUL_UNHANDLEABLE;
+            }
+        }
+
+        vpci_msix_mask(&entry->arch, new_masked);
+        entry->masked = new_masked;
+
+        break;
+    }
+    default:
+        BUG();
+    }
+    vpci_unlock(v->domain);
+
+    return X86EMUL_OKAY;
+}
+
+static const struct hvm_mmio_ops vpci_msix_table_ops = {
+    .check = vpci_msix_table_accept,
+    .read = vpci_msix_table_read,
+    .write = vpci_msix_table_write,
+};
+
+static int vpci_init_msix(struct pci_dev *pdev)
+{
+    struct domain *d = pdev->domain;
+    uint8_t seg = pdev->seg, bus = pdev->bus;
+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+    struct vpci_msix *msix;
+    unsigned int msix_offset, i, max_entries;
+    paddr_t msix_paddr;
+    uint16_t control;
+    int rc;
+
+    msix_offset = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
+    if ( !msix_offset )
+        return 0;
+
+    if ( !vpci_msix_enabled(pdev->domain) )
+    {
+        xen_vpci_mask_capability(pdev, PCI_CAP_ID_MSIX);
+        return 0;
+    }
+
+    control = pci_conf_read16(seg, bus, slot, func,
+                              msix_control_reg(msix_offset));
+
+    /* Get the maximum number of vectors the device supports. */
+    max_entries = msix_table_size(control);
+    if ( !max_entries )
+        return 0;
+
+    msix = xzalloc_bytes(MSIX_SIZE(max_entries));
+    if ( !msix )
+        return -ENOMEM;
+
+    msix->max_entries = max_entries;
+    msix->pdev = pdev;
+
+    /* Find the MSI-X table address. */
+    msix->offset = pci_conf_read32(seg, bus, slot, func,
+                                   msix_table_offset_reg(msix_offset));
+    msix->bir = msix->offset & PCI_MSIX_BIRMASK;
+    msix->offset &= ~PCI_MSIX_BIRMASK;
+
+    ASSERT(pdev->vpci->header.bars[msix->bir].type == VPCI_BAR_MEM ||
+           pdev->vpci->header.bars[msix->bir].type == VPCI_BAR_MEM64_LO);
+    msix->addr = pdev->vpci->header.bars[msix->bir].mapped_addr + msix->offset;
+    msix_paddr = pdev->vpci->header.bars[msix->bir].paddr + msix->offset;
+
+    for ( i = 0; i < msix->max_entries; i++)
+    {
+        msix->entries[i].masked = true;
+        msix->entries[i].nr = i;
+        vpci_msix_arch_init(&msix->entries[i].arch);
+    }
+
+    if ( list_empty(&d->arch.hvm_domain.msix_tables) )
+        register_mmio_handler(d, &vpci_msix_table_ops);
+
+    list_add(&msix->next, &d->arch.hvm_domain.msix_tables);
+
+    rc = xen_vpci_add_register(pdev, vpci_msix_control_read,
+                               vpci_msix_control_write,
+                               msix_control_reg(msix_offset), 2, msix);
+    if ( rc )
+    {
+        dprintk(XENLOG_ERR,
+                "%04x:%02x:%02x.%u: failed to add handler for MSI-X control: %d\n",
+                seg, bus, slot, func, rc);
+        goto error;
+    }
+
+    if ( pdev->vpci->header.command & PCI_COMMAND_MEMORY )
+    {
+        /* Unmap this memory from the guest. */
+        rc = modify_mmio(pdev->domain, _gfn(PFN_DOWN(msix->addr)),
+                         _mfn(PFN_DOWN(msix_paddr)),
+                         PFN_UP(msix->max_entries * PCI_MSIX_ENTRY_SIZE),
+                         false);
+        if ( rc )
+        {
+            dprintk(XENLOG_ERR,
+                    "%04x:%02x:%02x.%u: unable to unmap MSI-X BAR region: %d\n",
+                    seg, bus, slot, func, rc);
+            goto error;
+        }
+    }
+
+    pdev->vpci->msix = msix;
+
+    return 0;
+
+ error:
+    ASSERT(rc);
+    xfree(msix);
+    return rc;
+}
+
+REGISTER_VPCI_INIT(vpci_init_msix, false);
+
+static void vpci_dump_msix(unsigned char key)
+{
+    struct domain *d;
+    struct pci_dev *pdev;
+
+    printk("Guest MSI-X information:\n");
+
+    for_each_domain ( d )
+    {
+        if ( !has_vpci(d) )
+            continue;
+
+        vpci_lock(d);
+        list_for_each_entry ( pdev, &d->arch.pdev_list, domain_list)
+        {
+            uint8_t seg = pdev->seg, bus = pdev->bus;
+            uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+            struct vpci_msix *msix = pdev->vpci->msix;
+            unsigned int i;
+
+            if ( !msix )
+                continue;
+
+            printk("Device %04x:%02x:%02x.%u\n", seg, bus, slot, func);
+
+            printk("Max entries: %u maskall: %u enabled: %u\n",
+                   msix->max_entries, msix->masked, msix->enabled);
+
+            printk("Guest entries:\n");
+            for ( i = 0; i < msix->max_entries; i++ )
+            {
+                struct vpci_msix_entry *entry = &msix->entries[i];
+                uint32_t data = entry->data;
+                uint64_t addr = entry->addr;
+
+                printk("%4u vec=%#02x%7s%6s%3sassert%5s%7s dest_id=%lu mask=%u ",
+                       i,
+                       (data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT,
+                       data & MSI_DATA_DELIVERY_LOWPRI ? "lowest" : "fixed",
+                       data & MSI_DATA_TRIGGER_LEVEL ? "level" : "edge",
+                       data & MSI_DATA_LEVEL_ASSERT ? "" : "de",
+                       addr & MSI_ADDR_DESTMODE_LOGIC ? "log" : "phys",
+                       addr & MSI_ADDR_REDIRECTION_LOWPRI ? "lowest" : "cpu",
+                       (addr & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT,
+                       entry->masked);
+                vpci_msix_arch_print(&entry->arch);
+                printk("\n");
+            }
+            printk("\n");
+        }
+        vpci_unlock(d);
+    }
+}
+
+static int __init vpci_msix_setup_keyhandler(void)
+{
+    register_keyhandler('X', vpci_dump_msix, "dump guest MSI-X state", 1);
+    return 0;
+}
+__initcall(vpci_msix_setup_keyhandler);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index b69c33df3c..2bef170cc0 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -198,6 +198,9 @@  struct hvm_domain {
     /* List of ECAM (MMCFG) regions trapped by Xen. */
     struct list_head ecam_regions;
 
+    /* List of MSI-X tables. */
+    struct list_head msix_tables;
+
     /* List of permanently write-mapped pages. */
     struct {
         spinlock_t lock;
diff --git a/xen/include/asm-x86/hvm/io.h b/xen/include/asm-x86/hvm/io.h
index ae3af43749..d7ecc2adbf 100644
--- a/xen/include/asm-x86/hvm/io.h
+++ b/xen/include/asm-x86/hvm/io.h
@@ -131,6 +131,10 @@  void msixtbl_init(struct domain *d);
 extern bool dom0_msi;
 #define vpci_msi_enabled(d) (!is_hardware_domain((d)) || dom0_msi)
 
+/* Is emulated MSI enabled? */
+extern bool dom0_msix;
+#define vpci_msix_enabled(d) (!is_hardware_domain((d)) || dom0_msix)
+
 /* Arch-specific MSI data for vPCI. */
 struct vpci_arch_msi {
     int pirq;
@@ -145,6 +149,20 @@  int vpci_msi_disable(struct vpci_arch_msi *arch, struct pci_dev *pdev,
 int vpci_msi_arch_init(struct vpci_arch_msi *arch);
 void vpci_msi_arch_print(struct vpci_arch_msi *arch);
 
+/* Arch-specific MSI-X entry data for vPCI. */
+struct vpci_arch_msix_entry {
+    int pirq;
+};
+
+/* Arch-specific vPCI MSI-X helpers. */
+void vpci_msix_mask(struct vpci_arch_msix_entry *arch, bool mask);
+int vpci_msix_enable(struct vpci_arch_msix_entry *arch, struct pci_dev *pdev,
+                     uint64_t address, uint32_t data, unsigned int entry_nr,
+                     paddr_t table_base);
+int vpci_msix_disable(struct vpci_arch_msix_entry *arch);
+int vpci_msix_arch_init(struct vpci_arch_msix_entry *arch);
+void vpci_msix_arch_print(struct vpci_arch_msix_entry *arch);
+
 enum stdvga_cache_state {
     STDVGA_CACHE_UNINITIALIZED,
     STDVGA_CACHE_ENABLED,
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 373b8d6505..8c3e56c1e4 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -112,6 +112,33 @@  struct vpci {
         /* Arch-specific data. */
         struct vpci_arch_msi arch;
     } *msi;
+
+    /* MSI-X data. */
+    struct vpci_msix {
+        struct pci_dev *pdev;
+        /* Maximum number of vectors supported by the device. */
+        unsigned int max_entries;
+        /* MSI-X table offset. */
+        unsigned int offset;
+        /* MSI-X table BIR. */
+        unsigned int bir;
+        /* Table addr. */
+        paddr_t addr;
+        /* MSI-X enabled? */
+        bool enabled;
+        /* Masked? */
+        bool masked;
+        /* List link. */
+        struct list_head next;
+        /* Entries. */
+        struct vpci_msix_entry {
+                unsigned int nr;
+                uint64_t addr;
+                uint32_t data;
+                bool masked;
+                struct vpci_arch_msix_entry arch;
+          } entries[];
+    } *msix;
 #endif
 };