Message ID | 20161005130657.3399-4-rkrcmar@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, 5 Oct 2016 15:06:52 +0200 Radim Krčmář <rkrcmar@redhat.com> wrote: > The MMIO interface to APIC only allowed 8 bit addresses, which is not > enough for 32 bit addresses from EIM remapping. > Intel stored upper 24 bits in the high MSI address, so use the same > technique. The technique is also used in KVM MSI interface. > Other APICs are unlikely to handle those upper bits. > > Reviewed-by: Igor Mammedov <imammedo@redhat.com> I don't recall giving my RB to this patch but I do recall asking question, see below. > Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> > --- > v4: r-b Igor > v2: fix build with enabled DEBUG_INTEL_IOMMU [Peter] > --- > hw/i386/intel_iommu.c | 21 +++++++++------------ > 1 file changed, 9 insertions(+), 12 deletions(-) > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > index 9f4e64af1ad5..c39b62b898d8 100644 > --- a/hw/i386/intel_iommu.c > +++ b/hw/i386/intel_iommu.c > @@ -31,6 +31,7 @@ > #include "hw/i386/x86-iommu.h" > #include "hw/pci-host/q35.h" > #include "sysemu/kvm.h" > +#include "hw/i386/apic_internal.h" > > /*#define DEBUG_INTEL_IOMMU*/ > #ifdef DEBUG_INTEL_IOMMU > @@ -279,18 +280,17 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, > static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, > hwaddr mesg_data_reg) > { > - hwaddr addr; > - uint32_t data; > + MSIMessage msi; > > assert(mesg_data_reg < DMAR_REG_SIZE); > assert(mesg_addr_reg < DMAR_REG_SIZE); > > - addr = vtd_get_long_raw(s, mesg_addr_reg); > - data = vtd_get_long_raw(s, mesg_data_reg); > + msi.address = vtd_get_long_raw(s, mesg_addr_reg); > + msi.data = vtd_get_long_raw(s, mesg_data_reg); > > - VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32, addr, data); > - address_space_stl_le(&address_space_memory, addr, data, > - MEMTXATTRS_UNSPECIFIED, NULL); > + VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32, > + msi.address, msi.data); > + apic_get_class()->send_msi(&msi); > } > > /* Generate a fault event to software via MSI if conditions are met. > @@ -2133,6 +2133,7 @@ static void vtd_generate_msi_message(VTDIrq *irq, MSIMessage *msg_out) > msg.dest_mode = irq->dest_mode; > msg.redir_hint = irq->redir_hint; > msg.dest = irq->dest; > + msg.__addr_hi = irq->dest & 0xffffff00; what about BE host? should it be: msg.__addr_hi = cpu_to_le32(irq->dest & 0xffffff00) > msg.__addr_head = cpu_to_le32(0xfee); > /* Keep this from original MSI address bits */ > msg.__not_used = irq->msi_addr_last_bits; > @@ -2281,11 +2282,7 @@ static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, > " for device sid 0x%04x", > to.address, to.data, sid); > > - if (dma_memory_write(&address_space_memory, to.address, > - &to.data, size)) { > - VTD_DPRINTF(GENERAL, "error: fail to write 0x%"PRIx64 > - " value 0x%"PRIx32, to.address, to.data); > - } > + apic_get_class()->send_msi(&to); > > return MEMTX_OK; > }
2016-10-07 15:05+0200, Igor Mammedov: > On Wed, 5 Oct 2016 15:06:52 +0200 > Radim Krčmář <rkrcmar@redhat.com> wrote: > >> The MMIO interface to APIC only allowed 8 bit addresses, which is not >> enough for 32 bit addresses from EIM remapping. >> Intel stored upper 24 bits in the high MSI address, so use the same >> technique. The technique is also used in KVM MSI interface. >> Other APICs are unlikely to handle those upper bits. >> >> Reviewed-by: Igor Mammedov <imammedo@redhat.com> > I don't recall giving my RB to this patch but I do recall asking question, > see below. Crap, sorry. >> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> >> --- >> v4: r-b Igor >> v2: fix build with enabled DEBUG_INTEL_IOMMU [Peter] >> >> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c >> @@ -2133,6 +2133,7 @@ static void vtd_generate_msi_message(VTDIrq *irq, MSIMessage *msg_out) >> msg.dest_mode = irq->dest_mode; >> msg.redir_hint = irq->redir_hint; >> msg.dest = irq->dest; >> + msg.__addr_hi = irq->dest & 0xffffff00; > > what about BE host? KVM accepts the address in host endianess and QEMU/VTD code also uses host endianess for internal representation of memory addresses, so this hunk should be fine. It is confusing, because the VTD is definitely broken with respect to endianess -- it is even trying to swap the order of bits in a byte in the definition of VTD_MSIMessage. I don't believe that dma_memory_write() accepted LE address on BE hosts, so the existing code for filling the address is wrong: msg.__addr_head = cpu_to_le32(0xfee); > should it be: > msg.__addr_hi = cpu_to_le32(irq->dest & 0xffffff00) I don't think so. Howewer, there are endianess bugs in this patch: >> @@ -2281,11 +2282,7 @@ static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, >> " for device sid 0x%04x", >> to.address, to.data, sid); >> >> - if (dma_memory_write(&address_space_memory, to.address, >> - &to.data, size)) { >> - VTD_DPRINTF(GENERAL, "error: fail to write 0x%"PRIx64 >> - " value 0x%"PRIx32, to.address, to.data); >> - } >> + apic_get_class()->send_msi(&to); because dma_memory_write() does magic on data. I don't understand how the code should have worked before this series, because kvm_apic_mem_write() expects data in little endian and apic_mem_writel() expects data in host endian, even though both of them are DEVICE_NATIVE_ENDIAN and are called the same way, so one shouldn't work. And similarly, this hunk is wrong: >> @@ -279,18 +280,17 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, >> static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, >> hwaddr mesg_data_reg) >> { >> - hwaddr addr; >> - uint32_t data; >> + MSIMessage msi; >> >> assert(mesg_data_reg < DMAR_REG_SIZE); >> assert(mesg_addr_reg < DMAR_REG_SIZE); >> >> - addr = vtd_get_long_raw(s, mesg_addr_reg); >> - data = vtd_get_long_raw(s, mesg_data_reg); >> + msi.address = vtd_get_long_raw(s, mesg_addr_reg); >> + msi.data = vtd_get_long_raw(s, mesg_data_reg); >> >> - VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32, addr, data); >> - address_space_stl_le(&address_space_memory, addr, data, >> - MEMTXATTRS_UNSPECIFIED, NULL); >> + VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32, >> + msi.address, msi.data); >> + apic_get_class()->send_msi(&msi); >> } It should have been wrong even before, because address_space_stl_le() seems to accept the address in host endianess and not in LE ... UGH.
On Fri, Oct 07, 2016 at 06:24:15PM +0200, Radim Krčmář wrote: [...] > KVM accepts the address in host endianess and QEMU/VTD code also uses > host endianess for internal representation of memory addresses, so this > hunk should be fine. > > It is confusing, because the VTD is definitely broken with respect to > endianess -- it is even trying to swap the order of bits in a byte in > the definition of VTD_MSIMessage. > I don't believe that dma_memory_write() accepted LE address on BE hosts, > so the existing code for filling the address is wrong: > > msg.__addr_head = cpu_to_le32(0xfee); Yeah. This is my fault. Sorry for the troubles. I have a patch (as well...) to fix this in my local tree, but not posted (as mst suggested). Maybe it's time to post some of them now (I tried to make patches more into a bunch so that they won't be lost in mailing list in case maintainer missed it). I agree that current code is never tested on big endian machines yet. Here it should be: msg.__addr_head = 0xfee; > > > should it be: > > msg.__addr_hi = cpu_to_le32(irq->dest & 0xffffff00) > > I don't think so. > > Howewer, there are endianess bugs in this patch: > > >> @@ -2281,11 +2282,7 @@ static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, > >> " for device sid 0x%04x", > >> to.address, to.data, sid); > >> > >> - if (dma_memory_write(&address_space_memory, to.address, > >> - &to.data, size)) { > >> - VTD_DPRINTF(GENERAL, "error: fail to write 0x%"PRIx64 > >> - " value 0x%"PRIx32, to.address, to.data); > >> - } > >> + apic_get_class()->send_msi(&to); > > because dma_memory_write() does magic on data. > > I don't understand how the code should have worked before this series, > because kvm_apic_mem_write() expects data in little endian and > apic_mem_writel() expects data in host endian, even though both of them > are DEVICE_NATIVE_ENDIAN and are called the same way, so one shouldn't > work. I guess that's because APIC is only used for x86? Then it does not matter. And I agree with you that currently MSI is a little bit confused on endianess. First of all, I believe in the protocol MSI (along with PCI logics) should be LE. Instead, our MSIMessage struct looks more like to be for host endianess. E.g. in msi_get_message() we are using: msg.data = pci_get_word(dev->config + msi_data_off(dev, msi64bit)); and pci_get_word() is converting LE to host endianess. However, in all kvm_irqchip_*() APIs, we are assuming MSIMessage as LE. E.g.: int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) { struct kvm_msi msi; KVMMSIRoute *route; if (kvm_direct_msi_allowed) { ... msi.data = le32_to_cpu(msg.data); ... } } These things are conflicting. Maybe we need to clean this up. And I prefer MSIMessage to be host endianess. > > And similarly, this hunk is wrong: > > >> @@ -279,18 +280,17 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, > >> static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, > >> hwaddr mesg_data_reg) > >> { > >> - hwaddr addr; > >> - uint32_t data; > >> + MSIMessage msi; > >> > >> assert(mesg_data_reg < DMAR_REG_SIZE); > >> assert(mesg_addr_reg < DMAR_REG_SIZE); > >> > >> - addr = vtd_get_long_raw(s, mesg_addr_reg); > >> - data = vtd_get_long_raw(s, mesg_data_reg); > >> + msi.address = vtd_get_long_raw(s, mesg_addr_reg); > >> + msi.data = vtd_get_long_raw(s, mesg_data_reg); > >> > >> - VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32, addr, data); > >> - address_space_stl_le(&address_space_memory, addr, data, > >> - MEMTXATTRS_UNSPECIFIED, NULL); > >> + VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32, > >> + msi.address, msi.data); > >> + apic_get_class()->send_msi(&msi); > >> } > > It should have been wrong even before, because address_space_stl_le() > seems to accept the address in host endianess and not in LE ... UGH. Again, I guess no one is running VT-d in BE machines. So problems are not exposed. Thanks, -- peterx
On Sat, Oct 08, 2016 at 02:14:09PM +0800, Peter Xu wrote: > On Fri, Oct 07, 2016 at 06:24:15PM +0200, Radim Krčmář wrote: > > [...] > > > KVM accepts the address in host endianess and QEMU/VTD code also uses > > host endianess for internal representation of memory addresses, so this > > hunk should be fine. > > > > It is confusing, because the VTD is definitely broken with respect to > > endianess -- it is even trying to swap the order of bits in a byte in > > the definition of VTD_MSIMessage. > > I don't believe that dma_memory_write() accepted LE address on BE hosts, > > so the existing code for filling the address is wrong: > > > > msg.__addr_head = cpu_to_le32(0xfee); > > Yeah. This is my fault. Sorry for the troubles. > > I have a patch (as well...) to fix this in my local tree, but not > posted (as mst suggested). Maybe it's time to post some of them now (I > tried to make patches more into a bunch so that they won't be lost in > mailing list in case maintainer missed it). I'll send them after your series to avoid unecessary conflicts. -- peterx
On Wed, Oct 05, 2016 at 03:06:52PM +0200, Radim Krčmář wrote: > The MMIO interface to APIC only allowed 8 bit addresses, which is not > enough for 32 bit addresses from EIM remapping. > Intel stored upper 24 bits in the high MSI address, so use the same > technique. The technique is also used in KVM MSI interface. > Other APICs are unlikely to handle those upper bits. > > Reviewed-by: Igor Mammedov <imammedo@redhat.com> > Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> Again we are assuming MSIMessage as host endianess here. If there is more spin for this one, maybe we can consider providing a common function for: apic_get_class()->send_msi(&msi); And call it in the two places. But this one is good enough for me, so: Reviewed-by: Peter Xu <peterx@redhat.com> Thanks, -- peterx
On Sat, 8 Oct 2016 14:14:09 +0800 Peter Xu <peterx@redhat.com> wrote: > On Fri, Oct 07, 2016 at 06:24:15PM +0200, Radim Krčmář wrote: > > [...] > > > KVM accepts the address in host endianess and QEMU/VTD code also uses > > host endianess for internal representation of memory addresses, so this > > hunk should be fine. > > > > It is confusing, because the VTD is definitely broken with respect to > > endianess -- it is even trying to swap the order of bits in a byte in > > the definition of VTD_MSIMessage. > > I don't believe that dma_memory_write() accepted LE address on BE hosts, > > so the existing code for filling the address is wrong: > > > > msg.__addr_head = cpu_to_le32(0xfee); > > Yeah. This is my fault. Sorry for the troubles. > > I have a patch (as well...) to fix this in my local tree, but not > posted (as mst suggested). Maybe it's time to post some of them now (I > tried to make patches more into a bunch so that they won't be lost in > mailing list in case maintainer missed it). I agree that current code > is never tested on big endian machines yet. > > Here it should be: > > msg.__addr_head = 0xfee; > > > > > > should it be: > > > msg.__addr_hi = cpu_to_le32(irq->dest & 0xffffff00) > > > > I don't think so. > > > > Howewer, there are endianess bugs in this patch: > > > > >> @@ -2281,11 +2282,7 @@ static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, > > >> " for device sid 0x%04x", > > >> to.address, to.data, sid); > > >> > > >> - if (dma_memory_write(&address_space_memory, to.address, > > >> - &to.data, size)) { > > >> - VTD_DPRINTF(GENERAL, "error: fail to write 0x%"PRIx64 > > >> - " value 0x%"PRIx32, to.address, to.data); > > >> - } > > >> + apic_get_class()->send_msi(&to); > > > > because dma_memory_write() does magic on data. > > > > I don't understand how the code should have worked before this series, > > because kvm_apic_mem_write() expects data in little endian and > > apic_mem_writel() expects data in host endian, even though both of them > > are DEVICE_NATIVE_ENDIAN and are called the same way, so one shouldn't > > work. > > I guess that's because APIC is only used for x86? Then it does not > matter. And I agree with you that currently MSI is a little bit > confused on endianess. > > First of all, I believe in the protocol MSI (along with PCI logics) > should be LE. > > Instead, our MSIMessage struct looks more like to be for host > endianess. E.g. in msi_get_message() we are using: > > msg.data = pci_get_word(dev->config + msi_data_off(dev, msi64bit)); > > and pci_get_word() is converting LE to host endianess. > > However, in all kvm_irqchip_*() APIs, we are assuming MSIMessage as > LE. E.g.: > > int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) > { > struct kvm_msi msi; > KVMMSIRoute *route; > > if (kvm_direct_msi_allowed) { > ... > msi.data = le32_to_cpu(msg.data); > ... > } > } > > These things are conflicting. > > Maybe we need to clean this up. And I prefer MSIMessage to be host > endianess. Mostly internal QEMU structures are in host order and converted to guest byte-order on transfer, except for structures that are memcpy-ed, in which case they are typically marked QEMU_PACKED and that throws flag to reviewers to check if endianess is correct. At least struct VTD_MSIMessage definition need a comment saying what byte-order is expected. > > > > > And similarly, this hunk is wrong: > > > > >> @@ -279,18 +280,17 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, > > >> static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, > > >> hwaddr mesg_data_reg) > > >> { > > >> - hwaddr addr; > > >> - uint32_t data; > > >> + MSIMessage msi; > > >> > > >> assert(mesg_data_reg < DMAR_REG_SIZE); > > >> assert(mesg_addr_reg < DMAR_REG_SIZE); > > >> > > >> - addr = vtd_get_long_raw(s, mesg_addr_reg); > > >> - data = vtd_get_long_raw(s, mesg_data_reg); > > >> + msi.address = vtd_get_long_raw(s, mesg_addr_reg); > > >> + msi.data = vtd_get_long_raw(s, mesg_data_reg); > > >> > > >> - VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32, addr, data); > > >> - address_space_stl_le(&address_space_memory, addr, data, > > >> - MEMTXATTRS_UNSPECIFIED, NULL); > > >> + VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32, > > >> + msi.address, msi.data); > > >> + apic_get_class()->send_msi(&msi); > > >> } > > > > It should have been wrong even before, because address_space_stl_le() > > seems to accept the address in host endianess and not in LE ... UGH. > > Again, I guess no one is running VT-d in BE machines. So problems are > not exposed. > > Thanks, > > -- peterx >
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 9f4e64af1ad5..c39b62b898d8 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -31,6 +31,7 @@ #include "hw/i386/x86-iommu.h" #include "hw/pci-host/q35.h" #include "sysemu/kvm.h" +#include "hw/i386/apic_internal.h" /*#define DEBUG_INTEL_IOMMU*/ #ifdef DEBUG_INTEL_IOMMU @@ -279,18 +280,17 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, hwaddr mesg_data_reg) { - hwaddr addr; - uint32_t data; + MSIMessage msi; assert(mesg_data_reg < DMAR_REG_SIZE); assert(mesg_addr_reg < DMAR_REG_SIZE); - addr = vtd_get_long_raw(s, mesg_addr_reg); - data = vtd_get_long_raw(s, mesg_data_reg); + msi.address = vtd_get_long_raw(s, mesg_addr_reg); + msi.data = vtd_get_long_raw(s, mesg_data_reg); - VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32, addr, data); - address_space_stl_le(&address_space_memory, addr, data, - MEMTXATTRS_UNSPECIFIED, NULL); + VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32, + msi.address, msi.data); + apic_get_class()->send_msi(&msi); } /* Generate a fault event to software via MSI if conditions are met. @@ -2133,6 +2133,7 @@ static void vtd_generate_msi_message(VTDIrq *irq, MSIMessage *msg_out) msg.dest_mode = irq->dest_mode; msg.redir_hint = irq->redir_hint; msg.dest = irq->dest; + msg.__addr_hi = irq->dest & 0xffffff00; msg.__addr_head = cpu_to_le32(0xfee); /* Keep this from original MSI address bits */ msg.__not_used = irq->msi_addr_last_bits; @@ -2281,11 +2282,7 @@ static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, " for device sid 0x%04x", to.address, to.data, sid); - if (dma_memory_write(&address_space_memory, to.address, - &to.data, size)) { - VTD_DPRINTF(GENERAL, "error: fail to write 0x%"PRIx64 - " value 0x%"PRIx32, to.address, to.data); - } + apic_get_class()->send_msi(&to); return MEMTX_OK; }