
[v4,9/9] vpci/msix: add MSI-X handlers

Message ID 20170630150117.88489-10-roger.pau@citrix.com (mailing list archive)
State New, archived

Commit Message

Roger Pau Monné June 30, 2017, 3:01 p.m. UTC
Add handlers for accesses to the MSI-X message control field in the
PCI configuration space, and traps for accesses to the memory region
that contains the MSI-X table and PBA. These traps detect attempts from
the guest to configure MSI-X interrupts and properly set them up.

Note that accesses to the Table Offset, Table BIR, PBA Offset and PBA
BIR are not trapped by Xen at the moment.

Finally, turn the panic in the Dom0 PVH builder into a warning.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
---
Changes since v3:
 - Propagate changes from previous versions: remove xen_ prefix, use
   the new fields in vpci_val and remove the return value from
   handlers.
 - Remove the usage of GENMASK.
 - Move the arch-specific parts of the dump routine to the
   x86/hvm/vmsi.c dump handler.
 - Chain the MSI-X dump handler to the 'M' debug key.
 - Fix the header BAR mappings so that the MSI-X regions inside of
   BARs are unmapped from the domain p2m in order for the handlers to
   work properly.
 - Unconditionally trap and forward accesses to the PBA MSI-X area.
 - Simplify the conditionals in vpci_msix_control_write.
 - Fix vpci_msix_accept to use a bool type.
 - Allow all supported accesses as described in the spec to the MSI-X
   table.
 - Truncate the returned address when the access is a 32b read.
 - Always return X86EMUL_OKAY from the handlers, returning ~0 in the
   read case if the access is not supported, or ignoring writes.
 - Do not check that max_entries is != 0 in the init handler.
 - Use trylock in the dump handler.

Changes since v2:
 - Split out arch-specific code.

This patch has been tested with devices using both a single MSI-X
entry and multiple ones.
---
 xen/arch/x86/hvm/dom0_build.c    |   2 +-
 xen/arch/x86/hvm/hvm.c           |   1 +
 xen/arch/x86/hvm/vmsi.c          | 128 +++++++++-
 xen/arch/x86/msi.c               |   1 +
 xen/drivers/vpci/Makefile        |   2 +-
 xen/drivers/vpci/header.c        |  85 ++++++-
 xen/drivers/vpci/msix.c          | 503 +++++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/hvm/domain.h |   3 +
 xen/include/asm-x86/hvm/io.h     |  18 ++
 xen/include/xen/vpci.h           |  39 +++
 10 files changed, 774 insertions(+), 8 deletions(-)
 create mode 100644 xen/drivers/vpci/msix.c

Comments

Jan Beulich Aug. 2, 2017, 3:07 p.m. UTC | #1
>>> Roger Pau Monne <roger.pau@citrix.com> 06/30/17 5:01 PM >>>
>Note that accesses to the Table Offset, Table BIR, PBA Offset and PBA
>BIR are not trapped by Xen at the moment.

They're mandated r/o by the spec anyway.

>@@ -113,6 +148,35 @@ static int vpci_modify_bar(struct domain *d, const struct vpci_bar *bar,
>if ( IS_ERR(mem) )
>return -PTR_ERR(mem);
 >
>+    /*
>+     * Make sure the MSI-X regions of the BAR are not mapped into the domain
>+     * p2m, or else the MSI-X handlers are useless. Only do this when mapping,
>+     * since that's when the memory decoding on the device is enabled.
>+     */
>+    for ( i = 0; map && i < ARRAY_SIZE(bar->msix); i++ )
>+    {
>+        struct vpci_msix_mem *msix = bar->msix[i];
>+
>+        if ( !msix || msix->addr == INVALID_PADDR )
>+            continue;
>+
>+        rc = vpci_unmap_msix(d, msix);

Why do you need this, instead of being able to simply rely on the rangeset
based (un)mapping?

>@@ -405,7 +475,20 @@ static int vpci_init_bars(struct pci_dev *pdev)
>continue;
>}
 >
>-        bars[i].addr = (cmd & PCI_COMMAND_MEMORY) ? addr : INVALID_PADDR;
>+        if ( cmd & PCI_COMMAND_MEMORY )
>+        {
>+            unsigned int j;
>+
>+            bars[i].addr = addr;
>+
>+            for ( j = 0; j < ARRAY_SIZE(bars[i].msix); j++ )
>+                if ( bars[i].msix[j] )
>+                    bars[i].msix[j]->addr = bars[i].addr +
>+                                            bars[i].msix[j]->offset;
>+        }
>+        else
>+            bars[i].addr = INVALID_PADDR;

As (I think) mentioned elsewhere already, this init-time special case looks
dangerous (and unnecessary) to me (or else I'd expect you to also zap
the field when the memory decode bit is being cleared).

>--- /dev/null
>+++ b/xen/drivers/vpci/msix.c
>@@ -0,0 +1,503 @@
>+/*
>+ * Handlers for accesses to the MSI-X capability structure and the memory
>+ * region.
>+ *
>+ * Copyright (C) 2017 Citrix Systems R&D
>+ *
>+ * This program is free software; you can redistribute it and/or
>+ * modify it under the terms and conditions of the GNU General Public
>+ * License, version 2, as published by the Free Software Foundation.
>+ *
>+ * This program is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>+ * General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU General Public
>+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
>+ */
>+
>+#include <xen/sched.h>
>+#include <xen/vpci.h>
>+#include <asm/msi.h>
>+#include <xen/p2m-common.h>
>+#include <xen/keyhandler.h>
>+
>+#define MSIX_SIZE(num) (offsetof(struct vpci_msix, entries[num]))

The outermost parens are pointless here.

>+static void vpci_msix_control_write(struct pci_dev *pdev, unsigned int reg,
>+                                    union vpci_val val, void *data)
>+{
>+    uint8_t seg = pdev->seg, bus = pdev->bus;
>+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
>+    struct vpci_msix *msix = data;
>+    paddr_t table_base = pdev->vpci->header.bars[msix->table.bir].addr;
>+    bool new_masked, new_enabled;
>+    unsigned int i;
>+    int rc;
>+
>+    new_masked = val.u16 & PCI_MSIX_FLAGS_MASKALL;
>+    new_enabled = val.u16 & PCI_MSIX_FLAGS_ENABLE;
>+
>+    if ( !msix->enabled && new_enabled )
>+    {
>+        /* MSI-X enabled. */

Insert "being"?

>+        for ( i = 0; i < msix->max_entries; i++ )
>+        {
>+            if ( msix->entries[i].masked )
>+                continue;

Why is the mask bit relevant here, but not the mask-all one?

>+            rc = vpci_msix_arch_enable(&msix->entries[i].arch, pdev,
>+                                       msix->entries[i].addr,
>+                                       msix->entries[i].data,
>+                                       msix->entries[i].nr, table_base);
>+            if ( rc )
>+            {
>+                gdprintk(XENLOG_ERR,
>+                         "%04x:%02x:%02x.%u: unable to update entry %u: %d\n",
>+                         seg, bus, slot, func, i, rc);
>+                return;
>+            }
>+
>+            vpci_msix_arch_mask(&msix->entries[i].arch, pdev, false);

Same question here.

>+        }
>+    }
>+    else if ( msix->enabled && !new_enabled )
>+    {
>+        /* MSI-X disabled. */
>+        for ( i = 0; i < msix->max_entries; i++ )
>+        {
>+            rc = vpci_msix_arch_disable(&msix->entries[i].arch, pdev);
>+            if ( rc )
>+            {
>+                gdprintk(XENLOG_ERR,
>+                         "%04x:%02x:%02x.%u: unable to disable entry %u: %d\n",
>+                         seg, bus, slot, func, i, rc);
>+                return;
>+            }
>+        }
>+    }
>+
>+    if ( (new_enabled != msix->enabled || new_masked != msix->masked) &&
>+         pci_msi_conf_write_intercept(pdev, reg, 2, &val.u32) >= 0 )
>+        pci_conf_write16(seg, bus, slot, func, reg, val.u32);

DYM val.u16 here?

>+static struct vpci_msix *vpci_msix_find(struct domain *d, unsigned long addr)
>+{
>+    struct vpci_msix *msix;
>+
>+    ASSERT(vpci_locked(d));
>+    list_for_each_entry ( msix,  &d->arch.hvm_domain.msix_tables, next )
>+    {
>+        uint8_t seg = msix->pdev->seg, bus = msix->pdev->bus;
>+        uint8_t slot = PCI_SLOT(msix->pdev->devfn);
>+        uint8_t func = PCI_FUNC(msix->pdev->devfn);
>+        uint16_t cmd = pci_conf_read16(seg, bus, slot, func, PCI_COMMAND);

Perhaps better to keep a cached copy of the command register value?

>+static int vpci_msix_read(struct vcpu *v, unsigned long addr,
>+                          unsigned int len, unsigned long *data)
>+{
>+    struct domain *d = v->domain;
>+    struct vpci_msix *msix;
>+    const struct vpci_msix_entry *entry;
>+    unsigned int offset;
>+
>+    vpci_lock(d);
>+
>+    msix = vpci_msix_find(d, addr);
>+    if ( !msix )
>+    {
>+        vpci_unlock(d);
>+        *data = ~0ul;
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( vpci_msix_access_check(msix->pdev, addr, len) )
>+    {
>+        vpci_unlock(d);
>+        *data = ~0ul;
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( MSIX_ADDR_IN_RANGE(addr, &msix->pba) )
>+    {
>+        /* Access to PBA. */
>+        switch ( len )
>+        {
>+        case 4:
>+            *data = readl(addr);
>+            break;
>+        case 8:
>+            *data = readq(addr);
>+            break;
>+        default:
>+            ASSERT_UNREACHABLE();

data = ~0ul;

>+static int vpci_msix_write(struct vcpu *v, unsigned long addr,
>+                                 unsigned int len, unsigned long data)
>+{
>+    struct domain *d = v->domain;
>+    struct vpci_msix *msix;
>+    struct vpci_msix_entry *entry;
>+    unsigned int offset;
>+
>+    vpci_lock(d);
>+    msix = vpci_msix_find(d, addr);
>+    if ( !msix )
>+    {
>+        vpci_unlock(d);
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( MSIX_ADDR_IN_RANGE(addr, &msix->pba) )
>+    {
>+        /* Ignore writes to the PBA; its behavior is undefined. */
>+        vpci_unlock(d);
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( vpci_msix_access_check(msix->pdev, addr, len) )
>+    {
>+        vpci_unlock(d);
>+        return X86EMUL_OKAY;
>+    }
>+
>+    /* Get the table entry and offset. */
>+    entry = vpci_msix_get_entry(msix, addr);
>+    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
>+
>+    switch ( offset )
>+    {
>+    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
>+        if ( len == 8 )
>+        {
>+            entry->addr = data;
>+            break;
>+        }
>+        entry->addr &= ~0xffffffff;

With this, why not ...

>+        entry->addr |= data;
>+        break;
>+    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
>+        entry->addr &= ~((uint64_t)0xffffffff << 32);

... simply 0xffffffff here?

>+        entry->addr |= data << 32;

Iirc we've already talked about this being undefined for 32-bit arches (e.g.
ARM32), and the resulting need to make "data" uint64_t.

>+        break;
>+    case PCI_MSIX_ENTRY_DATA_OFFSET:
>+        /*
>+         * 8 byte writes to the msg data and vector control fields are
>+         * only allowed if the entry is masked.
>+         */
>+        if ( len == 8 && !entry->masked && !msix->masked && msix->enabled )
>+        {
>+            vpci_unlock(d);
>+            return X86EMUL_OKAY;
>+        }

I don't think this is correct - iirc such writes simply don't take effect immediately
(but I then seem to recall this to apply to the address field and 32-bit writes to
the data field as well). They'd instead take effect the next time the entry is being
unmasked (or some such). A while ago I did fix the qemu code to behave in this
way.

>+        entry->data = data;
>+
>+        if ( len == 4 )
>+            break;
>+
>+        data >>= 32;
>+        /* fallthrough */
>+    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
>+    {
>+        bool new_masked = data & PCI_MSIX_VECTOR_BITMASK;
>+        struct pci_dev *pdev = msix->pdev;
>+        paddr_t table_base = pdev->vpci->header.bars[msix->table.bir].addr;
>+        int rc;
>+
>+        if ( !msix->enabled )
>+        {
>+            entry->masked = new_masked;
>+            break;
>+        }
>+
>+        if ( new_masked != entry->masked && !new_masked )

if ( !new_masked && entry->masked )

(or the other way around)

>+static int vpci_init_msix(struct pci_dev *pdev)
>+{
>+    struct domain *d = pdev->domain;
>+    uint8_t seg = pdev->seg, bus = pdev->bus;
>+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
>+    struct vpci_msix *msix;
>+    unsigned int msix_offset, i, max_entries;
>+    struct vpci_bar *table_bar, *pba_bar;
>+    uint16_t control;
>+    int rc;
>+
>+    msix_offset = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
>+    if ( !msix_offset )
>+        return 0;
>+
>+    control = pci_conf_read16(seg, bus, slot, func,
>+                              msix_control_reg(msix_offset));
>+
>+    /* Get the maximum number of vectors the device supports. */
>+    max_entries = msix_table_size(control);
>+
>+    msix = xzalloc_bytes(MSIX_SIZE(max_entries));
>+    if ( !msix )
>+        return -ENOMEM;
>+
>+    msix->max_entries = max_entries;
>+    msix->pdev = pdev;
>+
>+    /* Find the MSI-X table address. */
>+    msix->table.offset = pci_conf_read32(seg, bus, slot, func,
>+                                         msix_table_offset_reg(msix_offset));
>+    msix->table.bir = msix->table.offset & PCI_MSIX_BIRMASK;
>+    msix->table.offset &= ~PCI_MSIX_BIRMASK;
>+    msix->table.size = msix->max_entries * PCI_MSIX_ENTRY_SIZE;
>+    msix->table.addr = INVALID_PADDR;
>+
>+    /* Find the MSI-X pba address. */
>+    msix->pba.offset = pci_conf_read32(seg, bus, slot, func,
>+                                       msix_pba_offset_reg(msix_offset));
>+    msix->pba.bir = msix->pba.offset & PCI_MSIX_BIRMASK;
>+    msix->pba.offset &= ~PCI_MSIX_BIRMASK;
>+    msix->pba.size = DIV_ROUND_UP(msix->max_entries, 8);

I think you want to round up to at least the next 32-bit boundary; the
spec talking about bits 63..00 even suggests a 64-bit boundary. The
table addresses being required to be qword aligned also supports this.

>+void vpci_dump_msix(void)
>+{
>+    struct domain *d;
>+    struct pci_dev *pdev;

const for all pointers in dump handlers, as far as possible.

>+    for_each_domain ( d )
>+    {
>+        if ( !has_vpci(d) )
>+            continue;
>+
>+        printk("vPCI MSI-X information for guest %u\n", d->domain_id);

Wouldn't it be better (more useful) to dump the MSI and MSI-X data for a
domain next to each other?

Apart from the comments here, the ones given for the MSI patch apply
respectively.

Jan
Roger Pau Monné Aug. 10, 2017, 5:04 p.m. UTC | #2
On Wed, Aug 02, 2017 at 09:07:54AM -0600, Jan Beulich wrote:
> >>> Roger Pau Monne <roger.pau@citrix.com> 06/30/17 5:01 PM >>>
> >Note that accesses to the Table Offset, Table BIR, PBA Offset and PBA
> >BIR are not trapped by Xen at the moment.
> 
> They're mandated r/o by the spec anyway.

> 
> >@@ -113,6 +148,35 @@ static int vpci_modify_bar(struct domain *d, const struct vpci_bar *bar,
> >if ( IS_ERR(mem) )
> >return -PTR_ERR(mem);
>  >
> >+    /*
> >+     * Make sure the MSI-X regions of the BAR are not mapped into the domain
> >+     * p2m, or else the MSI-X handlers are useless. Only do this when mapping,
> >+     * since that's when the memory decoding on the device is enabled.
> >+     */
> >+    for ( i = 0; map && i < ARRAY_SIZE(bar->msix); i++ )
> >+    {
> >+        struct vpci_msix_mem *msix = bar->msix[i];
> >+
> >+        if ( !msix || msix->addr == INVALID_PADDR )
> >+            continue;
> >+
> >+        rc = vpci_unmap_msix(d, msix);
> 
> Why do you need this, instead of being able to simply rely on the rangeset
> based (un)mapping?

This is because the series I've sent, called "x86/pvh: implement
iommu_inclusive_mapping for PVH Dom0", will map the MSI-X memory areas
into the guest, and thus we need to make sure they are not mapped
here for the emulation path to work.

https://lists.xenproject.org/archives/html/xen-devel/2017-04/msg02849.html

> >@@ -405,7 +475,20 @@ static int vpci_init_bars(struct pci_dev *pdev)
> >continue;
> >}
>  >
> >-        bars[i].addr = (cmd & PCI_COMMAND_MEMORY) ? addr : INVALID_PADDR;
> >+        if ( cmd & PCI_COMMAND_MEMORY )
> >+        {
> >+            unsigned int j;
> >+
> >+            bars[i].addr = addr;
> >+
> >+            for ( j = 0; j < ARRAY_SIZE(bars[i].msix); j++ )
> >+                if ( bars[i].msix[j] )
> >+                    bars[i].msix[j]->addr = bars[i].addr +
> >+                                            bars[i].msix[j]->offset;
> >+        }
> >+        else
> >+            bars[i].addr = INVALID_PADDR;
> 
> As (I think) mentioned elsewhere already, this init-time special case looks
> dangerous (and unnecessary) to me (or else I'd expect you to also zap
> the field when the memory decode bit is being cleared).

OK, so I'm simply going to set this to addr + offset, regardless of
whether the BAR has memory decoding enabled or not. If the BAR is not
yet positioned Dom0 will have to position it anyway before enabling
memory decoding.

> >+        for ( i = 0; i < msix->max_entries; i++ )
> >+        {
> >+            if ( msix->entries[i].masked )
> >+                continue;
> 
> Why is the mask bit relevant here, but not the mask-all one?

Not taking mask-all into account here is wrong, since a mask-all
transition from 1 to 0 should force a recalculation of all the entries'
address and data fields. I will fix this in the next version.
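Roughly, that would mean handling the mask-all 1 -> 0 transition in
vpci_msix_control_write along these lines (an illustrative sketch only,
reusing this patch's helpers; error reporting and the existing
enable/disable paths are omitted):

    /* Function mask cleared while MSI-X stays enabled: refresh entries. */
    if ( msix->enabled && new_enabled && msix->masked && !new_masked )
    {
        for ( i = 0; i < msix->max_entries; i++ )
        {
            if ( msix->entries[i].masked )
                continue;

            rc = vpci_msix_arch_enable(&msix->entries[i].arch, pdev,
                                       msix->entries[i].addr,
                                       msix->entries[i].data,
                                       msix->entries[i].nr, table_base);
            if ( rc )
                return;

            vpci_msix_arch_mask(&msix->entries[i].arch, pdev, false);
        }
    }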

> >+            rc = vpci_msix_arch_enable(&msix->entries[i].arch, pdev,
> >+                                       msix->entries[i].addr,
> >+                                       msix->entries[i].data,
> >+                                       msix->entries[i].nr, table_base);
> >+            if ( rc )
> >+            {
> >+                gdprintk(XENLOG_ERR,
> >+                         "%04x:%02x:%02x.%u: unable to update entry %u: %d\n",
> >+                         seg, bus, slot, func, i, rc);
> >+                return;
> >+            }
> >+
> >+            vpci_msix_arch_mask(&msix->entries[i].arch, pdev, false);
> 
> Same question here.

This is needed because after a vpci_msix_arch_enable the pirq is still
masked, and hence needs to be unmasked to match the guest's view.

> >+        }
> >+    }
> >+    else if ( msix->enabled && !new_enabled )
> >+    {
> >+        /* MSI-X disabled. */
> >+        for ( i = 0; i < msix->max_entries; i++ )
> >+        {
> >+            rc = vpci_msix_arch_disable(&msix->entries[i].arch, pdev);
> >+            if ( rc )
> >+            {
> >+                gdprintk(XENLOG_ERR,
> >+                         "%04x:%02x:%02x.%u: unable to disable entry %u: %d\n",
> >+                         seg, bus, slot, func, i, rc);
> >+                return;
> >+            }
> >+        }
> >+    }
> >+
> >+    if ( (new_enabled != msix->enabled || new_masked != msix->masked) &&
> >+         pci_msi_conf_write_intercept(pdev, reg, 2, &val.u32) >= 0 )
> >+        pci_conf_write16(seg, bus, slot, func, reg, val.u32);
> 
> DYM val.u16 here?

Now this is simply val, since the union has been removed.

> >+static struct vpci_msix *vpci_msix_find(struct domain *d, unsigned long addr)
> >+{
> >+    struct vpci_msix *msix;
> >+
> >+    ASSERT(vpci_locked(d));
> >+    list_for_each_entry ( msix,  &d->arch.hvm_domain.msix_tables, next )
> >+    {
> >+        uint8_t seg = msix->pdev->seg, bus = msix->pdev->bus;
> >+        uint8_t slot = PCI_SLOT(msix->pdev->devfn);
> >+        uint8_t func = PCI_FUNC(msix->pdev->devfn);
> >+        uint16_t cmd = pci_conf_read16(seg, bus, slot, func, PCI_COMMAND);
> 
> Perhaps better to keep a cached copy of the command register value?

I'm now using the enabled field of the vpci_bar struct instead of
checking the command register.
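In vpci_msix_find that would look roughly like this (a sketch only,
assuming the new bool 'enabled' field in struct vpci_bar mentioned
above):

    const struct vpci_bar *bars = msix->pdev->vpci->header.bars;

    if ( (bars[msix->table.bir].enabled &&
          MSIX_ADDR_IN_RANGE(addr, &msix->table)) ||
         (bars[msix->pba.bir].enabled &&
          MSIX_ADDR_IN_RANGE(addr, &msix->pba)) )
        return msix;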

> >+        break;
> >+    case PCI_MSIX_ENTRY_DATA_OFFSET:
> >+        /*
> >+         * 8 byte writes to the msg data and vector control fields are
> >+         * only allowed if the entry is masked.
> >+         */
> >+        if ( len == 8 && !entry->masked && !msix->masked && msix->enabled )
> >+        {
> >+            vpci_unlock(d);
> >+            return X86EMUL_OKAY;
> >+        }
> 
> I don't think this is correct - iirc such writes simply don't take effect immediately
> (but I then seem to recall this to apply to the address field and 32-bit writes to
> the data field as well). They'd instead take effect the next time the entry is being
> unmasked (or some such). A while ago I did fix the qemu code to behave in this
> way.

There's an Implementation Note called "Special Considerations for QWORD
Accesses" in the MSI-X section of the PCI 3.0 spec that states:

If a given entry is currently masked (via its Mask bit or the Function
Mask bit), software is permitted to fill in the Message Data and
Vector Control fields with a single QWORD write, taking advantage of
the fact the Message Data field is guaranteed to become visible to
hardware no later than the Vector Control field.

So I think the above chunk is correct. The specification also states
that:

Software must not modify the Address or Data fields of an entry while
it is unmasked. Refer to Section 6.8.3.5 for details.

AFAICT this is not enforced by QEMU, and you can write to the
address/data fields while the message is not masked. The update will
only take effect once the message is masked and unmasked.

> >+static int vpci_init_msix(struct pci_dev *pdev)
> >+{
> >+    struct domain *d = pdev->domain;
> >+    uint8_t seg = pdev->seg, bus = pdev->bus;
> >+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> >+    struct vpci_msix *msix;
> >+    unsigned int msix_offset, i, max_entries;
> >+    struct vpci_bar *table_bar, *pba_bar;
> >+    uint16_t control;
> >+    int rc;
> >+
> >+    msix_offset = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
> >+    if ( !msix_offset )
> >+        return 0;
> >+
> >+    control = pci_conf_read16(seg, bus, slot, func,
> >+                              msix_control_reg(msix_offset));
> >+
> >+    /* Get the maximum number of vectors the device supports. */
> >+    max_entries = msix_table_size(control);
> >+
> >+    msix = xzalloc_bytes(MSIX_SIZE(max_entries));
> >+    if ( !msix )
> >+        return -ENOMEM;
> >+
> >+    msix->max_entries = max_entries;
> >+    msix->pdev = pdev;
> >+
> >+    /* Find the MSI-X table address. */
> >+    msix->table.offset = pci_conf_read32(seg, bus, slot, func,
> >+                                         msix_table_offset_reg(msix_offset));
> >+    msix->table.bir = msix->table.offset & PCI_MSIX_BIRMASK;
> >+    msix->table.offset &= ~PCI_MSIX_BIRMASK;
> >+    msix->table.size = msix->max_entries * PCI_MSIX_ENTRY_SIZE;
> >+    msix->table.addr = INVALID_PADDR;
> >+
> >+    /* Find the MSI-X pba address. */
> >+    msix->pba.offset = pci_conf_read32(seg, bus, slot, func,
> >+                                       msix_pba_offset_reg(msix_offset));
> >+    msix->pba.bir = msix->pba.offset & PCI_MSIX_BIRMASK;
> >+    msix->pba.offset &= ~PCI_MSIX_BIRMASK;
> >+    msix->pba.size = DIV_ROUND_UP(msix->max_entries, 8);
> 
> I think you want to round up to at least the next 32-bit boundary; the
> spec talking about bits 63..00 even suggests a 64-bit boundary. The
> table addresses being required to be qword aligned also supports this.

The spec mentions that the last QWORD of the PBA doesn't need to be
fully populated, so yes, I assume this needs to be rounded up to a
64-bit boundary.
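For example (just a sketch, assuming Xen's usual ROUNDUP(x, a)
round-up-to-a-multiple macro):

    msix->pba.size = ROUNDUP(DIV_ROUND_UP(msix->max_entries, 8), 8);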

> >+void vpci_dump_msix(void)
> >+{
> >+    struct domain *d;
> >+    struct pci_dev *pdev;
> 
> const for all pointers in dump handlers, as far as possible.
> 
> >+    for_each_domain ( d )
> >+    {
> >+        if ( !has_vpci(d) )
> >+            continue;
> >+
> >+        printk("vPCI MSI-X information for guest %u\n", d->domain_id);
> 
> Wouldn't it be better (more useful) to dump the MSI and MSI-X data for a
> domain next to each other?

Possibly yes, and printing the MSI and MSI-X data of each device
together would be even better IMHO.

> Apart from the comments here, the ones given for the MSI patch apply
> respectively.

I've added the MSI-X dumping to vpci_dump_msi instead.

Roger.
Jan Beulich Aug. 11, 2017, 10:01 a.m. UTC | #3
>>> On 10.08.17 at 19:04, <roger.pau@citrix.com> wrote:
> On Wed, Aug 02, 2017 at 09:07:54AM -0600, Jan Beulich wrote:
>> >>> Roger Pau Monne <roger.pau@citrix.com> 06/30/17 5:01 PM >>>
>> >@@ -113,6 +148,35 @@ static int vpci_modify_bar(struct domain *d, const struct vpci_bar *bar,
>> >if ( IS_ERR(mem) )
>> >return -PTR_ERR(mem);
>>  >
>> >+    /*
>> >+     * Make sure the MSI-X regions of the BAR are not mapped into the domain
>> >+     * p2m, or else the MSI-X handlers are useless. Only do this when mapping,
>> >+     * since that's when the memory decoding on the device is enabled.
>> >+     */
>> >+    for ( i = 0; map && i < ARRAY_SIZE(bar->msix); i++ )
>> >+    {
>> >+        struct vpci_msix_mem *msix = bar->msix[i];
>> >+
>> >+        if ( !msix || msix->addr == INVALID_PADDR )
>> >+            continue;
>> >+
>> >+        rc = vpci_unmap_msix(d, msix);
>> 
>> Why do you need this, instead of being able to simply rely on the rangeset
>> based (un)mapping?
> 
> This is because the series that I've sent called: "x86/pvh: implement
> iommu_inclusive_mapping for PVH Dom0" will map the MSI-X memory areas
> into the guest, and thus we need to make sure they are not mapped
> here for the emulation path to work.
> 
> https://lists.xenproject.org/archives/html/xen-devel/2017-04/msg02849.html 

Oh, okay. The patch description doesn't mention any such
dependency though.

>> >+        break;
>> >+    case PCI_MSIX_ENTRY_DATA_OFFSET:
>> >+        /*
>> >+         * 8 byte writes to the msg data and vector control fields are
>> >+         * only allowed if the entry is masked.
>> >+         */
>> >+        if ( len == 8 && !entry->masked && !msix->masked && msix->enabled )
>> >+        {
>> >+            vpci_unlock(d);
>> >+            return X86EMUL_OKAY;
>> >+        }
>> 
>> I don't think this is correct - iirc such writes simply don't take effect immediately
>> (but I then seem to recall this to apply to the address field and 32-bit writes to
>> the data field as well). They'd instead take effect the next time the entry is being
>> unmasked (or some such). A while ago I did fix the qemu code to behave in this
>> way.
> 
> There's an Implementation Note called "Special Considerations for QWORD
> Accesses" in the MSI-X section of the PCI 3.0 spec that states:
> 
> If a given entry is currently masked (via its Mask bit or the Function
> Mask bit), software is permitted to fill in the Message Data and
> Vector Control fields with a single QWORD write, taking advantage of
> the fact the Message Data field is guaranteed to become visible to
> hardware no later than the Vector Control field.
> 
> So I think the above chunk is correct. The specification also states
> that:
> 
> Software must not modify the Address or Data fields of an entry while
> it is unmasked. Refer to Section 6.8.3.5 for details.
> 
> AFAICT this is not enforced by QEMU, and you can write to the
> address/data fields while the message is not masked. The update will
> only take effect once the message is masked and unmasked.

The spec also says:

"For MSI-X, a function is permitted to cache Address and Data values
 from unmasked MSIX Table entries. However, anytime software
 unmasks a currently masked MSI-X Table entry either by clearing its
 Mask bit or by clearing the Function Mask bit, the function must
 update any Address or Data values that it cached from that entry. If
 software changes the Address or Data value of an entry while the
 entry is unmasked, the result is undefined."

>> >+    for_each_domain ( d )
>> >+    {
>> >+        if ( !has_vpci(d) )
>> >+            continue;
>> >+
>> >+        printk("vPCI MSI-X information for guest %u\n", d->domain_id);
>> 
>> Wouldn't it be better (more useful) to dump the MSI and MSI-X data for a
>> domain next to each other?
> 
> Possibly yes, and printing the MSI and MSI-X data of each device
> together would be even better IMHO.

Not sure about that last aspect - devices aren't permitted to enable
both at the same time, so only one of them can really be relevant.

Jan
Roger Pau Monné Aug. 11, 2017, 10:11 a.m. UTC | #4
On Fri, Aug 11, 2017 at 04:01:05AM -0600, Jan Beulich wrote:
> >>> On 10.08.17 at 19:04, <roger.pau@citrix.com> wrote:
> > On Wed, Aug 02, 2017 at 09:07:54AM -0600, Jan Beulich wrote:
> >> >>> Roger Pau Monne <roger.pau@citrix.com> 06/30/17 5:01 PM >>>
> >> >@@ -113,6 +148,35 @@ static int vpci_modify_bar(struct domain *d, const struct vpci_bar *bar,
> >> >if ( IS_ERR(mem) )
> >> >return -PTR_ERR(mem);
> >>  >
> >> >+    /*
> >> >+     * Make sure the MSI-X regions of the BAR are not mapped into the domain
> >> >+     * p2m, or else the MSI-X handlers are useless. Only do this when mapping,
> >> >+     * since that's when the memory decoding on the device is enabled.
> >> >+     */
> >> >+    for ( i = 0; map && i < ARRAY_SIZE(bar->msix); i++ )
> >> >+    {
> >> >+        struct vpci_msix_mem *msix = bar->msix[i];
> >> >+
> >> >+        if ( !msix || msix->addr == INVALID_PADDR )
> >> >+            continue;
> >> >+
> >> >+        rc = vpci_unmap_msix(d, msix);
> >> 
> >> Why do you need this, instead of being able to simply rely on the rangeset
> >> based (un)mapping?
> > 
> > This is because the series that I've sent called: "x86/pvh: implement
> > iommu_inclusive_mapping for PVH Dom0" will map the MSI-X memory areas
> > into the guest, and thus we need to make sure they are not mapped
> > here for the emulation path to work.
> > 
> > https://lists.xenproject.org/archives/html/xen-devel/2017-04/msg02849.html 
> 
> Oh, okay. The patch description doesn't mention any such
> dependency though.

Will make that clearer in the next version; in fact I'm going to send
this series rebased on top of the iommu_inclusive_mapping one. AFAICT
that one is closer to being committed, and in any case changing the
order is trivial, as there are no conflicts.

> >> >+        break;
> >> >+    case PCI_MSIX_ENTRY_DATA_OFFSET:
> >> >+        /*
> >> >+         * 8 byte writes to the msg data and vector control fields are
> >> >+         * only allowed if the entry is masked.
> >> >+         */
> >> >+        if ( len == 8 && !entry->masked && !msix->masked && msix->enabled )
> >> >+        {
> >> >+            vpci_unlock(d);
> >> >+            return X86EMUL_OKAY;
> >> >+        }
> >> 
> >> I don't think this is correct - iirc such writes simply don't take effect immediately
> >> (but I then seem to recall this to apply to the address field and 32-bit writes to
> >> the data field as well). They'd instead take effect the next time the entry is being
> >> unmasked (or some such). A while ago I did fix the qemu code to behave in this
> >> way.
> > 
> > There's an Implementation Note called "Special Considerations for QWORD
> > Accesses" in the MSI-X section of the PCI 3.0 spec that states:
> > 
> > If a given entry is currently masked (via its Mask bit or the Function
> > Mask bit), software is permitted to fill in the Message Data and
> > Vector Control fields with a single QWORD write, taking advantage of
> > the fact the Message Data field is guaranteed to become visible to
> > hardware no later than the Vector Control field.
> > 
> > So I think the above chunk is correct. The specification also states
> > that:
> > 
> > Software must not modify the Address or Data fields of an entry while
> > it is unmasked. Refer to Section 6.8.3.5 for details.
> > 
> > AFAICT this is not enforced by QEMU, and you can write to the
> > address/data fields while the message is not masked. The update will
> > only take effect once the message is masked and unmasked.
> 
> The spec also says:
> 
> "For MSI-X, a function is permitted to cache Address and Data values
>  from unmasked MSIX Table entries. However, anytime software
>  unmasks a currently masked MSI-X Table entry either by clearing its
>  Mask bit or by clearing the Function Mask bit, the function must
>  update any Address or Data values that it cached from that entry. If
>  software changes the Address or Data value of an entry while the
>  entry is unmasked, the result is undefined."

I'm not sure what to do if the guest writes to an entry while it is
unmasked. On the one hand the spec says that software must not modify
it, but OTOH it says the result is undefined when doing so.

Would you be fine with ignoring writes to the address and data fields
if the entry is unmasked?

> >> >+    for_each_domain ( d )
> >> >+    {
> >> >+        if ( !has_vpci(d) )
> >> >+            continue;
> >> >+
> >> >+        printk("vPCI MSI-X information for guest %u\n", d->domain_id);
> >> 
> >> Wouldn't it be better (more useful) to dump the MSI and MSI-X data for a
> >> domain next to each other?
> > 
> > Possibly yes, and printing the MSI and MSI-X data of each device
> > together would be even better IMHO.
> 
> Not sure about that last aspect - devices aren't permitted to enable
> both at the same time, so only one of them can really be relevant.

I think (for debugging purposes) it's useful to print both together
in order to spot if the guest is doing something wrong.

Roger.
Jan Beulich Aug. 11, 2017, 10:20 a.m. UTC | #5
>>> On 11.08.17 at 12:11, <roger.pau@citrix.com> wrote:
> On Fri, Aug 11, 2017 at 04:01:05AM -0600, Jan Beulich wrote:
>> >>> On 10.08.17 at 19:04, <roger.pau@citrix.com> wrote:
>> > On Wed, Aug 02, 2017 at 09:07:54AM -0600, Jan Beulich wrote:
>> >> >>> Roger Pau Monne <roger.pau@citrix.com> 06/30/17 5:01 PM >>>
>> >> >+    case PCI_MSIX_ENTRY_DATA_OFFSET:
>> >> >+        /*
>> >> >+         * 8 byte writes to the msg data and vector control fields are
>> >> >+         * only allowed if the entry is masked.
>> >> >+         */
>> >> >+        if ( len == 8 && !entry->masked && !msix->masked && msix->enabled )
>> >> >+        {
>> >> >+            vpci_unlock(d);
>> >> >+            return X86EMUL_OKAY;
>> >> >+        }
>> >> 
>> >> I don't think this is correct - iirc such writes simply don't take effect immediately
>> >> (but I then seem to recall this to apply to the address field and 32-bit writes to
>> >> the data field as well). They'd instead take effect the next time the entry is being
>> >> unmasked (or some such). A while ago I did fix the qemu code to behave in this
>> >> way.
>> > 
>> > There's an Implementation Note called "Special Considerations for QWORD
>> > Accesses" in the MSI-X section of the PCI 3.0 spec that states:
>> > 
>> > If a given entry is currently masked (via its Mask bit or the Function
>> > Mask bit), software is permitted to fill in the Message Data and
>> > Vector Control fields with a single QWORD write, taking advantage of
>> > the fact the Message Data field is guaranteed to become visible to
>> > hardware no later than the Vector Control field.
>> > 
>> > So I think the above chunk is correct. The specification also states
>> > that:
>> > 
>> > Software must not modify the Address or Data fields of an entry while
>> > it is unmasked. Refer to Section 6.8.3.5 for details.
>> > 
>> > AFAICT this is not enforced by QEMU, and you can write to the
>> > address/data fields while the message is not masked. The update will
>> > only take effect once the message is masked and unmasked.
>> 
>> The spec also says:
>> 
>> "For MSI-X, a function is permitted to cache Address and Data values
>>  from unmasked MSIX Table entries. However, anytime software
>>  unmasks a currently masked MSI-X Table entry either by clearing its
>>  Mask bit or by clearing the Function Mask bit, the function must
>>  update any Address or Data values that it cached from that entry. If
>>  software changes the Address or Data value of an entry while the
>>  entry is unmasked, the result is undefined."
> 
> I'm not sure what to do if the guest writes to an entry while it is
> unmasked. On the one hand the spec says that software must not modify
> it, but OTOH it says the result is undefined when doing so.
> 
> Would you be fine with ignoring writes to the address and data fields
> if the entry is unmasked?

No, not really. I've intentionally pointed you to the qemu code,
as there I've implemented the caching behavior described by the
quote above. I'd expect vPCI to behave similarly.
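In terms of this patch's structures that roughly means (a sketch only,
not the actual qemu logic; new_addr, new_data, old_masked and
new_masked stand for whatever the write emulation has just decoded):
cache the guest-written address/data unconditionally, and only
propagate the cached values when the entry goes from masked to
unmasked:

    /* Write path: just latch the guest values. */
    entry->addr = new_addr;
    entry->data = new_data;

    /* Unmask path (entry mask or mask-all): push the cached values out. */
    if ( old_masked && !new_masked && msix->enabled )
    {
        rc = vpci_msix_arch_enable(&entry->arch, pdev, entry->addr,
                                   entry->data, entry->nr, table_base);
        if ( !rc )
            vpci_msix_arch_mask(&entry->arch, pdev, false);
    }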

>> >> >+    for_each_domain ( d )
>> >> >+    {
>> >> >+        if ( !has_vpci(d) )
>> >> >+            continue;
>> >> >+
>> >> >+        printk("vPCI MSI-X information for guest %u\n", d->domain_id);
>> >> 
>> >> Wouldn't it be better (more useful) to dump the MSI and MSI-X data for a
>> >> domain next to each other?
>> > 
>> > Possibly yes, and printing the MSI and MSI-X data of each device
>> > together would be even better IMHO.
>> 
>> Not sure about that last aspect - devices aren't permitted to enable
>> both at the same time, so only one of them can really be relevant.
> 
> I think (for debugging purposes) it's useful to print both together
> in order to spot if the guest is doing something wrong.

For Dom0 maybe. For DomU we'd have to refuse guest attempts
to do anything possibly resulting in undefined behavior.

Jan

Patch

diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
index 6b9f76ec36..c060eb85eb 100644
--- a/xen/arch/x86/hvm/dom0_build.c
+++ b/xen/arch/x86/hvm/dom0_build.c
@@ -1091,7 +1091,7 @@  int __init dom0_construct_pvh(struct domain *d, const module_t *image,
         return rc;
     }
 
-    panic("Building a PVHv2 Dom0 is not yet supported.");
+    printk("WARNING: PVH is an experimental mode with limited functionality\n");
     return 0;
 }
 
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index f45e2bd23d..9277e84150 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -585,6 +585,7 @@  int hvm_domain_initialise(struct domain *d, unsigned long domcr_flags,
     INIT_LIST_HEAD(&d->arch.hvm_domain.write_map.list);
     INIT_LIST_HEAD(&d->arch.hvm_domain.g2m_ioport_list);
     INIT_LIST_HEAD(&d->arch.hvm_domain.mmcfg_regions);
+    INIT_LIST_HEAD(&d->arch.hvm_domain.msix_tables);
 
     rc = create_perdomain_mapping(d, PERDOMAIN_VIRT_START, 0, NULL, NULL);
     if ( rc )
diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 5732c70b5c..f1c72f23d9 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -643,17 +643,15 @@  static unsigned int msi_flags(uint16_t data, uint64_t addr)
            (trig_mode << GFLAGS_SHIFT_TRG_MODE);
 }
 
-void vpci_msi_arch_mask(struct vpci_arch_msi *arch, struct pci_dev *pdev,
-                        unsigned int entry, bool mask)
+static void vpci_mask_pirq(struct domain *d, int pirq, bool mask)
 {
-    struct domain *d = pdev->domain;
     const struct pirq *pinfo;
     struct irq_desc *desc;
     unsigned long flags;
     int irq;
 
-    ASSERT(arch->pirq >= 0);
-    pinfo = pirq_info(d, arch->pirq + entry);
+    ASSERT(pirq >= 0);
+    pinfo = pirq_info(d, pirq);
     ASSERT(pinfo);
 
     irq = pinfo->arch.irq;
@@ -667,6 +665,12 @@  void vpci_msi_arch_mask(struct vpci_arch_msi *arch, struct pci_dev *pdev,
     spin_unlock_irqrestore(&desc->lock, flags);
 }
 
+void vpci_msi_arch_mask(struct vpci_arch_msi *arch, struct pci_dev *pdev,
+                        unsigned int entry, bool mask)
+{
+    vpci_mask_pirq(pdev->domain, arch->pirq + entry, mask);
+}
+
 int vpci_msi_arch_enable(struct vpci_arch_msi *arch, struct pci_dev *pdev,
                          uint64_t address, uint32_t data, unsigned int vectors)
 {
@@ -771,3 +775,117 @@  void vpci_msi_arch_print(struct vpci_arch_msi *arch, uint16_t data,
            MASK_EXTR(addr, MSI_ADDR_DEST_ID_MASK),
            arch->pirq);
 }
+
+void vpci_msix_arch_mask(struct vpci_arch_msix_entry *arch,
+                         struct pci_dev *pdev, bool mask)
+{
+    vpci_mask_pirq(pdev->domain, arch->pirq, mask);
+}
+
+int vpci_msix_arch_enable(struct vpci_arch_msix_entry *arch,
+                          struct pci_dev *pdev, uint64_t address,
+                          uint32_t data, unsigned int entry_nr,
+                          paddr_t table_base)
+{
+    struct domain *d = pdev->domain;
+    xen_domctl_bind_pt_irq_t bind = {
+        .irq_type = PT_IRQ_TYPE_MSI,
+        .u.msi.gvec = msi_vector(data),
+        .u.msi.gflags = msi_flags(data, address),
+    };
+    int rc;
+
+    if ( arch->pirq == -1 )
+    {
+        struct msi_info msi_info = {
+            .seg = pdev->seg,
+            .bus = pdev->bus,
+            .devfn = pdev->devfn,
+            .table_base = table_base,
+            .entry_nr = entry_nr,
+        };
+
+        /* Map PIRQ. */
+        rc = allocate_and_map_msi_pirq(d, -1, &arch->pirq,
+                                       MAP_PIRQ_TYPE_MSI, &msi_info);
+        if ( rc )
+        {
+            dprintk(XENLOG_ERR,
+                    "%04x:%02x:%02x.%u: unable to map MSI-X PIRQ entry %u: %d\n",
+                    pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                    PCI_FUNC(pdev->devfn), entry_nr, rc);
+            return rc;
+        }
+    }
+
+    bind.machine_irq = arch->pirq;
+    pcidevs_lock();
+    rc = pt_irq_create_bind(d, &bind);
+    if ( rc )
+    {
+        dprintk(XENLOG_ERR,
+                "%04x:%02x:%02x.%u: unable to create MSI-X bind %u: %d\n",
+                pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                PCI_FUNC(pdev->devfn), entry_nr, rc);
+        spin_lock(&d->event_lock);
+        unmap_domain_pirq(d, arch->pirq);
+        spin_unlock(&d->event_lock);
+        pcidevs_unlock();
+        arch->pirq = -1;
+        return rc;
+    }
+    pcidevs_unlock();
+
+    return 0;
+}
+
+int vpci_msix_arch_disable(struct vpci_arch_msix_entry *arch,
+                           struct pci_dev *pdev)
+{
+    struct domain *d = pdev->domain;
+    xen_domctl_bind_pt_irq_t bind = {
+        .irq_type = PT_IRQ_TYPE_MSI,
+        .machine_irq = arch->pirq,
+    };
+    int rc;
+
+    if ( arch->pirq == -1 )
+        return 0;
+
+    pcidevs_lock();
+    rc = pt_irq_destroy_bind(d, &bind);
+    if ( rc )
+    {
+        pcidevs_unlock();
+        return rc;
+    }
+
+    spin_lock(&d->event_lock);
+    unmap_domain_pirq(d, arch->pirq);
+    spin_unlock(&d->event_lock);
+    pcidevs_unlock();
+
+    arch->pirq = -1;
+
+    return 0;
+}
+
+int vpci_msix_arch_init(struct vpci_arch_msix_entry *arch)
+{
+    arch->pirq = -1;
+    return 0;
+}
+
+void vpci_msix_arch_print(struct vpci_arch_msix_entry *entry, uint32_t data,
+                          uint64_t addr, bool masked, unsigned int pos)
+{
+    printk("%4u vec=%#02x%7s%6s%3sassert%5s%7s dest_id=%lu mask=%u pirq: %d\n",
+           pos, MASK_EXTR(data, MSI_DATA_VECTOR_MASK),
+           data & MSI_DATA_DELIVERY_LOWPRI ? "lowest" : "fixed",
+           data & MSI_DATA_TRIGGER_LEVEL ? "level" : "edge",
+           data & MSI_DATA_LEVEL_ASSERT ? "" : "de",
+           addr & MSI_ADDR_DESTMODE_LOGIC ? "log" : "phys",
+           addr & MSI_ADDR_REDIRECTION_LOWPRI ? "lowest" : "cpu",
+           MASK_EXTR(addr, MSI_ADDR_DEST_ID_MASK),
+           masked, entry->pirq);
+}
diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
index 573378d6c3..ad5c27df18 100644
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -1539,6 +1539,7 @@  static void dump_msi(unsigned char key)
     }
 
     vpci_dump_msi();
+    vpci_dump_msix();
 }
 
 static int __init msi_setup_keyhandler(void)
diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
index 62cec9e82b..55d1bdfda0 100644
--- a/xen/drivers/vpci/Makefile
+++ b/xen/drivers/vpci/Makefile
@@ -1 +1 @@ 
-obj-y += vpci.o header.o msi.o
+obj-y += vpci.o header.o msi.o msix.o
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index ae5719ab1a..07056350d6 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -20,6 +20,7 @@ 
 #include <xen/sched.h>
 #include <xen/vpci.h>
 #include <xen/p2m-common.h>
+#include <asm/p2m.h>
 
 #define MAPPABLE_BAR(x)                                                 \
     (((x)->type == VPCI_BAR_MEM32 || (x)->type == VPCI_BAR_MEM64_LO ||  \
@@ -100,11 +101,45 @@  static int vpci_map_range(unsigned long s, unsigned long e, void *data)
     return modify_mmio(map->d, _gfn(s), _mfn(s), e - s + 1, map->map);
 }
 
+static int vpci_unmap_msix(struct domain *d, struct vpci_msix_mem *msix)
+{
+    unsigned long gfn;
+
+    for ( gfn = PFN_DOWN(msix->addr); gfn <= PFN_UP(msix->addr + msix->size);
+          gfn++ )
+    {
+        p2m_type_t t;
+        mfn_t mfn = get_gfn(d, gfn, &t);
+        int rc;
+
+        if ( mfn_eq(mfn, INVALID_MFN) )
+        {
+            /* Nothing to do, this is already a hole. */
+            put_gfn(d, gfn);
+            continue;
+        }
+
+        if ( !p2m_is_mmio(t) )
+        {
+            put_gfn(d, gfn);
+            return -EINVAL;
+        }
+
+        rc = modify_mmio(d, _gfn(gfn), mfn, 1, false);
+        put_gfn(d, gfn);
+        if ( rc )
+            return rc;
+    }
+
+    return 0;
+}
+
 static int vpci_modify_bar(struct domain *d, const struct vpci_bar *bar,
                            const bool map)
 {
     struct rangeset *mem;
     struct map_data data = { .d = d, .map = map };
+    unsigned int i;
     int rc;
 
     ASSERT(MAPPABLE_BAR(bar));
@@ -113,6 +148,35 @@  static int vpci_modify_bar(struct domain *d, const struct vpci_bar *bar,
     if ( IS_ERR(mem) )
         return -PTR_ERR(mem);
 
+    /*
+     * Make sure the MSI-X regions of the BAR are not mapped into the domain
+     * p2m, or else the MSI-X handlers are useless. Only do this when mapping,
+     * since that's when the memory decoding on the device is enabled.
+     */
+    for ( i = 0; map && i < ARRAY_SIZE(bar->msix); i++ )
+    {
+        struct vpci_msix_mem *msix = bar->msix[i];
+
+        if ( !msix || msix->addr == INVALID_PADDR )
+            continue;
+
+        rc = vpci_unmap_msix(d, msix);
+        if ( rc )
+        {
+            rangeset_destroy(mem);
+            return rc;
+        }
+
+        rc = rangeset_remove_range(mem, PFN_DOWN(msix->addr),
+                                   PFN_DOWN(msix->addr + msix->size));
+        if ( rc )
+        {
+            rangeset_destroy(mem);
+            return rc;
+        }
+
+    }
+
     rc = rangeset_report_ranges(mem, 0, ~0ul, vpci_map_range, &data);
     rangeset_destroy(mem);
     if ( rc )
@@ -221,6 +285,7 @@  static void vpci_bar_write(struct pci_dev *pdev, unsigned int reg,
     uint8_t seg = pdev->seg, bus = pdev->bus;
     uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
     uint32_t wdata = val.u32, size_mask;
+    unsigned int i;
     bool hi = false;
 
     switch ( bar->type )
@@ -269,6 +334,11 @@  static void vpci_bar_write(struct pci_dev *pdev, unsigned int reg,
     bar->addr &= ~((uint64_t)0xffffffff << (hi ? 32 : 0));
     bar->addr |= (uint64_t)wdata << (hi ? 32 : 0);
 
+    /* Update any MSI-X areas contained in this BAR. */
+    for ( i = 0; i < ARRAY_SIZE(bar->msix); i++ )
+        if ( bar->msix[i] )
+            bar->msix[i]->addr = bar->addr + bar->msix[i]->offset;
+
     /* Make sure Xen writes back the same value for the BAR RO bits. */
     if ( !hi )
         wdata |= pci_conf_read32(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
@@ -405,7 +475,20 @@  static int vpci_init_bars(struct pci_dev *pdev)
             continue;
         }
 
-        bars[i].addr = (cmd & PCI_COMMAND_MEMORY) ? addr : INVALID_PADDR;
+        if ( cmd & PCI_COMMAND_MEMORY )
+        {
+            unsigned int j;
+
+            bars[i].addr = addr;
+
+            for ( j = 0; j < ARRAY_SIZE(bars[i].msix); j++ )
+                if ( bars[i].msix[j] )
+                    bars[i].msix[j]->addr = bars[i].addr +
+                                            bars[i].msix[j]->offset;
+        }
+        else
+            bars[i].addr = INVALID_PADDR;
+
         bars[i].size = size;
         bars[i].prefetchable = val & PCI_BASE_ADDRESS_MEM_PREFETCH;
 
diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c
new file mode 100644
index 0000000000..dc0e7070e4
--- /dev/null
+++ b/xen/drivers/vpci/msix.c
@@ -0,0 +1,503 @@ 
+/*
+ * Handlers for accesses to the MSI-X capability structure and the memory
+ * region.
+ *
+ * Copyright (C) 2017 Citrix Systems R&D
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/sched.h>
+#include <xen/vpci.h>
+#include <asm/msi.h>
+#include <xen/p2m-common.h>
+#include <xen/keyhandler.h>
+
+#define MSIX_SIZE(num) (offsetof(struct vpci_msix, entries[num]))
+#define MSIX_ADDR_IN_RANGE(a, table)                                    \
+    ((table)->addr != INVALID_PADDR && (a) >= (table)->addr &&          \
+     (a) < (table)->addr + (table)->size)
+
+static void vpci_msix_control_read(struct pci_dev *pdev, unsigned int reg,
+                                   union vpci_val *val, void *data)
+{
+    const struct vpci_msix *msix = data;
+
+    val->u16 = (msix->max_entries - 1) & PCI_MSIX_FLAGS_QSIZE;
+    val->u16 |= msix->enabled ? PCI_MSIX_FLAGS_ENABLE : 0;
+    val->u16 |= msix->masked ? PCI_MSIX_FLAGS_MASKALL : 0;
+}
+
+static void vpci_msix_control_write(struct pci_dev *pdev, unsigned int reg,
+                                    union vpci_val val, void *data)
+{
+    uint8_t seg = pdev->seg, bus = pdev->bus;
+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+    struct vpci_msix *msix = data;
+    paddr_t table_base = pdev->vpci->header.bars[msix->table.bir].addr;
+    bool new_masked, new_enabled;
+    unsigned int i;
+    int rc;
+
+    new_masked = val.u16 & PCI_MSIX_FLAGS_MASKALL;
+    new_enabled = val.u16 & PCI_MSIX_FLAGS_ENABLE;
+
+    if ( !msix->enabled && new_enabled )
+    {
+        /* MSI-X enabled. */
+        for ( i = 0; i < msix->max_entries; i++ )
+        {
+            if ( msix->entries[i].masked )
+                continue;
+
+            rc = vpci_msix_arch_enable(&msix->entries[i].arch, pdev,
+                                       msix->entries[i].addr,
+                                       msix->entries[i].data,
+                                       msix->entries[i].nr, table_base);
+            if ( rc )
+            {
+                gdprintk(XENLOG_ERR,
+                         "%04x:%02x:%02x.%u: unable to update entry %u: %d\n",
+                         seg, bus, slot, func, i, rc);
+                return;
+            }
+
+            vpci_msix_arch_mask(&msix->entries[i].arch, pdev, false);
+        }
+    }
+    else if ( msix->enabled && !new_enabled )
+    {
+        /* MSI-X disabled. */
+        for ( i = 0; i < msix->max_entries; i++ )
+        {
+            rc = vpci_msix_arch_disable(&msix->entries[i].arch, pdev);
+            if ( rc )
+            {
+                gdprintk(XENLOG_ERR,
+                         "%04x:%02x:%02x.%u: unable to disable entry %u: %d\n",
+                         seg, bus, slot, func, i, rc);
+                return;
+            }
+        }
+    }
+
+    if ( (new_enabled != msix->enabled || new_masked != msix->masked) &&
+         pci_msi_conf_write_intercept(pdev, reg, 2, &val.u32) >= 0 )
+        pci_conf_write16(seg, bus, slot, func, reg, val.u32);
+
+    msix->masked = new_masked;
+    msix->enabled = new_enabled;
+}
+
+static struct vpci_msix *vpci_msix_find(struct domain *d, unsigned long addr)
+{
+    struct vpci_msix *msix;
+
+    ASSERT(vpci_locked(d));
+    list_for_each_entry ( msix,  &d->arch.hvm_domain.msix_tables, next )
+    {
+        uint8_t seg = msix->pdev->seg, bus = msix->pdev->bus;
+        uint8_t slot = PCI_SLOT(msix->pdev->devfn);
+        uint8_t func = PCI_FUNC(msix->pdev->devfn);
+        uint16_t cmd = pci_conf_read16(seg, bus, slot, func, PCI_COMMAND);
+
+        if ( (cmd & PCI_COMMAND_MEMORY) &&
+             (MSIX_ADDR_IN_RANGE(addr, &msix->table) ||
+             MSIX_ADDR_IN_RANGE(addr, &msix->pba)) )
+            return msix;
+    }
+
+    return NULL;
+}
+
+static int vpci_msix_accept(struct vcpu *v, unsigned long addr)
+{
+    bool found;
+
+    vpci_lock(v->domain);
+    found = vpci_msix_find(v->domain, addr);
+    vpci_unlock(v->domain);
+
+    return found;
+}
+
+static int vpci_msix_access_check(struct pci_dev *pdev, unsigned long addr,
+                                  unsigned int len)
+{
+    uint8_t seg = pdev->seg, bus = pdev->bus;
+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+
+    /* Only allow 32/64b accesses. */
+    if ( len != 4 && len != 8 )
+    {
+        gdprintk(XENLOG_ERR,
+                 "%04x:%02x:%02x.%u: invalid MSI-X table access size: %u\n",
+                 seg, bus, slot, func, len);
+        return -EINVAL;
+    }
+
+    /* Only allow aligned accesses. */
+    if ( (addr & (len - 1)) != 0 )
+    {
+        gdprintk(XENLOG_ERR,
+                 "%04x:%02x:%02x.%u: MSI-X only allows aligned accesses\n",
+                 seg, bus, slot, func);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static struct vpci_msix_entry *vpci_msix_get_entry(struct vpci_msix *msix,
+                                                   unsigned long addr)
+{
+    return &msix->entries[(addr - msix->table.addr) / PCI_MSIX_ENTRY_SIZE];
+}
+
+static int vpci_msix_read(struct vcpu *v, unsigned long addr,
+                          unsigned int len, unsigned long *data)
+{
+    struct domain *d = v->domain;
+    struct vpci_msix *msix;
+    const struct vpci_msix_entry *entry;
+    unsigned int offset;
+
+    vpci_lock(d);
+
+    msix = vpci_msix_find(d, addr);
+    if ( !msix )
+    {
+        vpci_unlock(d);
+        *data = ~0ul;
+        return X86EMUL_OKAY;
+    }
+
+    if ( vpci_msix_access_check(msix->pdev, addr, len) )
+    {
+        vpci_unlock(d);
+        *data = ~0ul;
+        return X86EMUL_OKAY;
+    }
+
+    if ( MSIX_ADDR_IN_RANGE(addr, &msix->pba) )
+    {
+        /* Access to PBA. */
+        switch ( len )
+        {
+        case 4:
+            *data = readl(addr);
+            break;
+        case 8:
+            *data = readq(addr);
+            break;
+        default:
+            ASSERT_UNREACHABLE();
+            break;
+        }
+
+        vpci_unlock(d);
+        return X86EMUL_OKAY;
+    }
+
+    /* Get the table entry and offset. */
+    entry = vpci_msix_get_entry(msix, addr);
+    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
+
+    switch ( offset )
+    {
+    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
+        /*
+         * NB: do explicit truncation to the size of the access. This shouldn't
+         * be required here, since the caller of the handler should already
+         * take the appropriate measures to truncate the value before returning
+         * to the guest, but better be safe than sorry.
+         */
+        *data = len == 8 ? entry->addr : (uint32_t)entry->addr;
+        break;
+    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
+        *data = entry->addr >> 32;
+        break;
+    case PCI_MSIX_ENTRY_DATA_OFFSET:
+        *data = entry->data;
+        if ( len == 8 )
+            *data |=
+                (uint64_t)(entry->masked ? PCI_MSIX_VECTOR_BITMASK : 0) << 32;
+        break;
+    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
+        *data = entry->masked ? PCI_MSIX_VECTOR_BITMASK : 0;
+        break;
+    default:
+        BUG();
+    }
+    vpci_unlock(d);
+
+    return X86EMUL_OKAY;
+}
+
+static int vpci_msix_write(struct vcpu *v, unsigned long addr,
+                                 unsigned int len, unsigned long data)
+{
+    struct domain *d = v->domain;
+    struct vpci_msix *msix;
+    struct vpci_msix_entry *entry;
+    unsigned int offset;
+
+    vpci_lock(d);
+    msix = vpci_msix_find(d, addr);
+    if ( !msix )
+    {
+        vpci_unlock(d);
+        return X86EMUL_OKAY;
+    }
+
+    if ( MSIX_ADDR_IN_RANGE(addr, &msix->pba) )
+    {
+        /* Ignore writes to the PBA; its behavior is undefined. */
+        vpci_unlock(d);
+        return X86EMUL_OKAY;
+    }
+
+    if ( vpci_msix_access_check(msix->pdev, addr, len) )
+    {
+        vpci_unlock(d);
+        return X86EMUL_OKAY;
+    }
+
+    /* Get the table entry and offset. */
+    entry = vpci_msix_get_entry(msix, addr);
+    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
+
+    switch ( offset )
+    {
+    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
+        if ( len == 8 )
+        {
+            entry->addr = data;
+            break;
+        }
+        entry->addr &= ~0xffffffffull;
+        entry->addr |= data;
+        break;
+    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
+        entry->addr &= ~((uint64_t)0xffffffff << 32);
+        entry->addr |= data << 32;
+        break;
+    case PCI_MSIX_ENTRY_DATA_OFFSET:
+        /*
+         * 8-byte writes spanning the message data and vector control fields
+         * are only allowed if the entry is masked (or MSI-X as a whole is
+         * masked or disabled).
+         */
+        if ( len == 8 && !entry->masked && !msix->masked && msix->enabled )
+        {
+            vpci_unlock(d);
+            return X86EMUL_OKAY;
+        }
+
+        entry->data = data;
+
+        if ( len == 4 )
+            break;
+
+        data >>= 32;
+        /* fallthrough */
+    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
+    {
+        bool new_masked = data & PCI_MSIX_VECTOR_BITMASK;
+        struct pci_dev *pdev = msix->pdev;
+        paddr_t table_base = pdev->vpci->header.bars[msix->table.bir].addr;
+        int rc;
+
+        if ( !msix->enabled )
+        {
+            entry->masked = new_masked;
+            break;
+        }
+
+        if ( new_masked != entry->masked && !new_masked )
+        {
+            /* Unmasking an entry, update it. */
+            rc = vpci_msix_arch_enable(&entry->arch, pdev, entry->addr,
+                                       entry->data, entry->nr, table_base);
+            if ( rc )
+            {
+                vpci_unlock(d);
+                gdprintk(XENLOG_ERR,
+                         "%04x:%02x:%02x.%u: unable to update entry %u: %d\n",
+                         pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                         PCI_FUNC(pdev->devfn), entry->nr, rc);
+                return X86EMUL_OKAY;
+            }
+        }
+
+        vpci_msix_arch_mask(&entry->arch, pdev, new_masked);
+        entry->masked = new_masked;
+
+        break;
+    }
+    default:
+        BUG();
+    }
+    vpci_unlock(d);
+
+    return X86EMUL_OKAY;
+}
+
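To illustrate the fallthrough in the write handler above: an 8-byte write at the message data offset carries the vector control word in its upper half, which is why the handler shifts 'data' right by 32 before falling through. A standalone sketch of that split, with a made-up value:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t qword = 0x0000000100004021ULL;  /* made-up 8-byte guest write */

        uint32_t msg_data    = (uint32_t)qword;  /* low half -> message data    */
        uint32_t vector_ctrl = qword >> 32;      /* high half -> vector control */

        printf("data %#x, ctrl %#x (mask bit %u)\n",
               msg_data, vector_ctrl, vector_ctrl & 1);
        return 0;
    }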
+static const struct hvm_mmio_ops vpci_msix_table_ops = {
+    .check = vpci_msix_accept,
+    .read = vpci_msix_read,
+    .write = vpci_msix_write,
+};
+
+static int vpci_init_msix(struct pci_dev *pdev)
+{
+    struct domain *d = pdev->domain;
+    uint8_t seg = pdev->seg, bus = pdev->bus;
+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+    struct vpci_msix *msix;
+    unsigned int msix_offset, i, max_entries;
+    struct vpci_bar *table_bar, *pba_bar;
+    uint16_t control;
+    int rc;
+
+    msix_offset = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
+    if ( !msix_offset )
+        return 0;
+
+    control = pci_conf_read16(seg, bus, slot, func,
+                              msix_control_reg(msix_offset));
+
+    /* Get the maximum number of vectors the device supports. */
+    max_entries = msix_table_size(control);
+
+    msix = xzalloc_bytes(MSIX_SIZE(max_entries));
+    if ( !msix )
+        return -ENOMEM;
+
+    msix->max_entries = max_entries;
+    msix->pdev = pdev;
+
+    /* Find the MSI-X table address. */
+    msix->table.offset = pci_conf_read32(seg, bus, slot, func,
+                                         msix_table_offset_reg(msix_offset));
+    msix->table.bir = msix->table.offset & PCI_MSIX_BIRMASK;
+    msix->table.offset &= ~PCI_MSIX_BIRMASK;
+    msix->table.size = msix->max_entries * PCI_MSIX_ENTRY_SIZE;
+    msix->table.addr = INVALID_PADDR;
+
+    /* Find the MSI-X PBA address. */
+    msix->pba.offset = pci_conf_read32(seg, bus, slot, func,
+                                       msix_pba_offset_reg(msix_offset));
+    msix->pba.bir = msix->pba.offset & PCI_MSIX_BIRMASK;
+    msix->pba.offset &= ~PCI_MSIX_BIRMASK;
+    msix->pba.size = DIV_ROUND_UP(msix->max_entries, 8);
+    msix->pba.addr = INVALID_PADDR;
+
+    for ( i = 0; i < msix->max_entries; i++ )
+    {
+        msix->entries[i].masked = true;
+        msix->entries[i].nr = i;
+        vpci_msix_arch_init(&msix->entries[i].arch);
+    }
+
+    if ( list_empty(&d->arch.hvm_domain.msix_tables) )
+        register_mmio_handler(d, &vpci_msix_table_ops);
+
+    list_add(&msix->next, &d->arch.hvm_domain.msix_tables);
+
+    rc = vpci_add_register(pdev, vpci_msix_control_read,
+                           vpci_msix_control_write,
+                           msix_control_reg(msix_offset), 2, msix);
+    if ( rc )
+    {
+        dprintk(XENLOG_ERR,
+                "%04x:%02x:%02x.%u: failed to add handler for MSI-X control: %d\n",
+                seg, bus, slot, func, rc);
+        goto error;
+    }
+
+    table_bar = &pdev->vpci->header.bars[msix->table.bir];
+    pba_bar = &pdev->vpci->header.bars[msix->pba.bir];
+
+    /*
+     * The header handlers take care of leaving a hole for the MSI-X related
+     * areas, which is why MSI-X needs to be initialized before the header.
+     */
+    table_bar->msix[VPCI_BAR_MSIX_TABLE] = &msix->table;
+    pba_bar->msix[VPCI_BAR_MSIX_PBA] = &msix->pba;
+    pdev->vpci->msix = msix;
+
+    return 0;
+
+ error:
+    ASSERT(rc);
+    xfree(msix);
+    return rc;
+}
+
+REGISTER_VPCI_INIT(vpci_init_msix, VPCI_PRIORITY_HIGH);
+
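As a worked example of the sizing done in vpci_init_msix(): a device advertising 128 vectors needs a 128 * 16 = 2048 byte table and a 16 byte PBA. A standalone sketch of the arithmetic (the entry count is hypothetical; ENTRY_SIZE and DIV_ROUND_UP are redefined locally for the example):

    #include <stdio.h>

    #define ENTRY_SIZE 16  /* bytes per MSI-X table entry (PCI spec) */
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        unsigned int max_entries = 128;  /* hypothetical msix_table_size() */

        unsigned int table_bytes = max_entries * ENTRY_SIZE;      /* 2048 */
        unsigned int pba_bytes   = DIV_ROUND_UP(max_entries, 8);  /*   16 */

        printf("table %u bytes, PBA %u bytes\n", table_bytes, pba_bytes);
        return 0;
    }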
+void vpci_dump_msix(void)
+{
+    struct domain *d;
+    struct pci_dev *pdev;
+
+    for_each_domain ( d )
+    {
+        if ( !has_vpci(d) )
+            continue;
+
+        printk("vPCI MSI-X information for guest %u\n", d->domain_id);
+
+        if ( !vpci_trylock(d) )
+        {
+            printk("Unable to get vPCI lock, skipping\n");
+            continue;
+        }
+
+        list_for_each_entry ( pdev, &d->arch.pdev_list, domain_list )
+        {
+            uint8_t seg = pdev->seg, bus = pdev->bus;
+            uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+            struct vpci_msix *msix = pdev->vpci->msix;
+            unsigned int i;
+
+            if ( !msix )
+                continue;
+
+            printk("Device %04x:%02x:%02x.%u\n", seg, bus, slot, func);
+
+            printk("Max entries: %u maskall: %u enabled: %u\n",
+                   msix->max_entries, msix->masked, msix->enabled);
+
+            printk("Guest table entries:\n");
+            for ( i = 0; i < msix->max_entries; i++ )
+                vpci_msix_arch_print(&msix->entries[i].arch,
+                                     msix->entries[i].data,
+                                     msix->entries[i].addr,
+                                     msix->entries[i].masked, i);
+        }
+        vpci_unlock(d);
+    }
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index 7028f93861..980d718327 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -190,6 +190,9 @@  struct hvm_domain {
     /* List of ECAM (MMCFG) regions trapped by Xen. */
     struct list_head mmcfg_regions;
 
+    /* List of MSI-X tables. */
+    struct list_head msix_tables;
+
     /* List of permanently write-mapped pages. */
     struct {
         spinlock_t lock;
diff --git a/xen/include/asm-x86/hvm/io.h b/xen/include/asm-x86/hvm/io.h
index 55ed094734..739eefe541 100644
--- a/xen/include/asm-x86/hvm/io.h
+++ b/xen/include/asm-x86/hvm/io.h
@@ -144,6 +144,24 @@  int vpci_msi_arch_init(struct vpci_arch_msi *arch);
 void vpci_msi_arch_print(struct vpci_arch_msi *arch, uint16_t data,
                          uint64_t addr);
 
+/* Arch-specific MSI-X entry data for vPCI. */
+struct vpci_arch_msix_entry {
+    int pirq;
+};
+
+/* Arch-specific vPCI MSI-X helpers. */
+void vpci_msix_arch_mask(struct vpci_arch_msix_entry *arch,
+                         struct pci_dev *pdev, bool mask);
+int vpci_msix_arch_enable(struct vpci_arch_msix_entry *arch,
+                          struct pci_dev *pdev, uint64_t address,
+                          uint32_t data, unsigned int entry_nr,
+                          paddr_t table_base);
+int vpci_msix_arch_disable(struct vpci_arch_msix_entry *arch,
+                           struct pci_dev *pdev);
+int vpci_msix_arch_init(struct vpci_arch_msix_entry *arch);
+void vpci_msix_arch_print(struct vpci_arch_msix_entry *entry, uint32_t data,
+                          uint64_t addr, bool masked, unsigned int pos);
+
 enum stdvga_cache_state {
     STDVGA_CACHE_UNINITIALIZED,
     STDVGA_CACHE_ENABLED,
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index ca693f3667..5bc6380531 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -83,6 +83,10 @@  struct vpci {
             } type;
             paddr_t addr;
             uint64_t size;
+#define VPCI_BAR_MSIX_TABLE     0
+#define VPCI_BAR_MSIX_PBA       1
+#define VPCI_BAR_MSIX_NUM       2
+            struct vpci_msix_mem *msix[VPCI_BAR_MSIX_NUM];
             bool prefetchable;
             bool sizing;
             bool enabled;
@@ -112,10 +116,45 @@  struct vpci {
         /* Arch-specific data. */
         struct vpci_arch_msi arch;
     } *msi;
+
+    /* MSI-X data. */
+    struct vpci_msix {
+        struct pci_dev *pdev;
+        /* Maximum number of vectors supported by the device. */
+        unsigned int max_entries;
+        /* MSI-X enabled? */
+        bool enabled;
+        /* Masked? */
+        bool masked;
+        /* List link. */
+        struct list_head next;
+        /* Table information. */
+        struct vpci_msix_mem {
+            /* MSI-X table offset. */
+            unsigned int offset;
+            /* MSI-X table BIR. */
+            unsigned int bir;
+            /* Table addr. */
+            paddr_t addr;
+            /* Table size. */
+            unsigned int size;
+        } table;
+        /* PBA. */
+        struct vpci_msix_mem pba;
+        /* Entries. */
+        struct vpci_msix_entry {
+            unsigned int nr;
+            uint64_t addr;
+            uint32_t data;
+            bool masked;
+            struct vpci_arch_msix_entry arch;
+        } entries[];
+    } *msix;
 #endif
 };
 
 void vpci_dump_msi(void);
+void vpci_dump_msix(void);
 
 #endif
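Note the flexible 'entries[]' array closing struct vpci_msix above: the allocation done with xzalloc_bytes(MSIX_SIZE(max_entries)) has to cover the header plus that array. MSIX_SIZE() itself is not part of this hunk; a plausible definition is sketched below with cut-down stand-in types (an assumption about the helper, not necessarily the exact macro in msix.c):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Cut-down stand-ins for the vpci.h types, for illustration only. */
    struct arch_entry { int pirq; };
    struct msix_demo {
        unsigned int max_entries;
        struct {
            unsigned int nr;
            uint64_t addr;
            uint32_t data;
            int masked;
            struct arch_entry arch;
        } entries[];
    };

    /* Assumed sizing helper behind xzalloc_bytes(MSIX_SIZE(max_entries)). */
    #define MSIX_SIZE(n) offsetof(struct msix_demo, entries[n])

    int main(void)
    {
        printf("%zu bytes for 8 entries\n", MSIX_SIZE(8));
        return 0;
    }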