Message ID | 571F4840.2060800@web.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Apr 26, 2016 at 12:51:44PM +0200, Jan Kiszka wrote: > On 2016-04-26 12:38, Peter Xu wrote: > >>>> Hi, Jan, > >>>> > >>>> The above issue should be caused by EOI missing of level-triggered > >>>> interrupts. Before that, I was always using edge-triggered > >>>> interrupts for test, so didn't encounter this one. Would you please > >>>> help try below patch? It can be applied directly onto the series, > >>>> and should solve the issue (it works on my test vm, and I'll take it > >>>> in v5 as well if it also works for you): > >>>> > >>> > >>> Works here as well. I even made EIM working with some hack, though > >>> Jailhouse spits out strange warnings, despite it works fine (x2apic > >>> mode, split irqchip). > >> > >> Corrections: the warnings are issued by qemu, not Jailhouse, e.g. > >> > >> qemu-system-x86_64: VT-d Failed to remap interrupt for gsi 22. > >> > >> I suspect that comes from the hand-over phase of Jailhouse, when it > >> mutes all interrupts in the system while reconfiguring IR and IOAPIC. > >> > >> Please convert this error (in kvm_arch_fixup_msi_route) into a trace > >> point. It shall not annoy the host. Also check if you have more of such > >> guest-triggerable error messages. > > > > Okay. This should be the only one. I can use trace instead. > > > > Meanwhile, I still suppose we should not seen it even with > > error_report().. Would this happen when boot e.g. generic kernels? > > No, this is - most probably, still need to validate in details - an > effect due to the special case with Jailhouse that interrupt and VT-d > management is handed over from a running OS to a hypervisor. > > Jan > > PS: Current quick hack for EIM support (lacks compat mode blocking at > least): > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > index 1082ab5..709a92c 100644 > --- a/hw/i386/intel_iommu.c > +++ b/hw/i386/intel_iommu.c > @@ -907,6 +907,7 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) > value = vtd_get_quad_raw(s, DMAR_IRTA_REG); > s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1); > s->intr_root = value & VTD_IRTA_ADDR_MASK; > + s->intr_eime = value & VTD_IRTA_EIME; > > QLIST_FOREACH(notifier, &s->iec_notifiers, list) { > if (notifier->iec_notify) { > @@ -2052,11 +2053,13 @@ static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, VTDIrq *irq > irq->trigger_mode = irte.trigger_mode; > irq->vector = irte.vector; > irq->delivery_mode = irte.delivery_mode; > - /* Not support EIM yet: please refer to vt-d 9.10 DST bits */ > + irq->dest = irte.dest_id; > + if (!iommu->intr_eime) { > #define VTD_IR_APIC_DEST_MASK (0xff00ULL) > #define VTD_IR_APIC_DEST_SHIFT (8) > - irq->dest = (irte.dest_id & VTD_IR_APIC_DEST_MASK) >> \ > - VTD_IR_APIC_DEST_SHIFT; > + irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >> > + VTD_IR_APIC_DEST_SHIFT; > + } > irq->dest_mode = irte.dest_mode; > irq->redir_hint = irte.redir_hint; > > @@ -2316,7 +2319,7 @@ static void vtd_init(IntelIOMMUState *s) > s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; > > if (ms->iommu_intr) { > - s->ecap |= VTD_ECAP_IR; > + s->ecap |= VTD_ECAP_IR | VTD_ECAP_EIM; > } > > vtd_reset_context_cache(s); > @@ -2370,10 +2373,9 @@ static void vtd_init(IntelIOMMUState *s) > vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL); > > /* > - * Interrupt remapping registers, not support extended interrupt > - * mode for now. > + * Interrupt remapping registers. > */ > - vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff00fULL, 0); > + vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0); > } > > /* Should not reset address_spaces when reset because devices will still use > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h > index 10c20fe..72b0114 100644 > --- a/hw/i386/intel_iommu_internal.h > +++ b/hw/i386/intel_iommu_internal.h > @@ -176,6 +176,7 @@ > > /* IRTA_REG */ > #define VTD_IRTA_ADDR_MASK (VTD_HAW_MASK ^ 0xfffULL) > +#define VTD_IRTA_EIME (1ULL << 11) > #define VTD_IRTA_SIZE_MASK (0xfULL) > > /* ECAP_REG */ > @@ -184,6 +185,7 @@ > #define VTD_ECAP_QI (1ULL << 1) > /* Interrupt Remapping support */ > #define VTD_ECAP_IR (1ULL << 3) > +#define VTD_ECAP_EIM (1ULL << 4) > > /* CAP_REG */ > /* (offset >> 4) << 24 */ > diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h > index d7613af..c6c53c6 100644 > --- a/include/hw/i386/intel_iommu.h > +++ b/include/hw/i386/intel_iommu.h > @@ -261,6 +261,7 @@ struct IntelIOMMUState { > bool intr_enabled; /* Whether guest enabled IR */ > dma_addr_t intr_root; /* Interrupt remapping table pointer */ > uint32_t intr_size; /* Number of IR table entries */ > + bool intr_eime; /* Extended interrupt mode enabled */ > QLIST_HEAD(, VTD_IEC_Notifier) iec_notifiers; /* IEC notify list */ > }; > Currently, all the interrupts will be translated into one MSI in vtd_generate_msi_message(), in which only 8 bits of dest_id is used (msg.dest = irq->dest). We may possibly need to use the high 32 bits of MSI address to store the rest dest[31:8]? Don't know whether this would be enough though. Thanks. -- peterx
On 2016-04-26 13:40, Peter Xu wrote: > On Tue, Apr 26, 2016 at 12:51:44PM +0200, Jan Kiszka wrote: >> On 2016-04-26 12:38, Peter Xu wrote: >>>>>> Hi, Jan, >>>>>> >>>>>> The above issue should be caused by EOI missing of level-triggered >>>>>> interrupts. Before that, I was always using edge-triggered >>>>>> interrupts for test, so didn't encounter this one. Would you please >>>>>> help try below patch? It can be applied directly onto the series, >>>>>> and should solve the issue (it works on my test vm, and I'll take it >>>>>> in v5 as well if it also works for you): >>>>>> >>>>> >>>>> Works here as well. I even made EIM working with some hack, though >>>>> Jailhouse spits out strange warnings, despite it works fine (x2apic >>>>> mode, split irqchip). >>>> >>>> Corrections: the warnings are issued by qemu, not Jailhouse, e.g. >>>> >>>> qemu-system-x86_64: VT-d Failed to remap interrupt for gsi 22. >>>> >>>> I suspect that comes from the hand-over phase of Jailhouse, when it >>>> mutes all interrupts in the system while reconfiguring IR and IOAPIC. >>>> >>>> Please convert this error (in kvm_arch_fixup_msi_route) into a trace >>>> point. It shall not annoy the host. Also check if you have more of such >>>> guest-triggerable error messages. >>> >>> Okay. This should be the only one. I can use trace instead. >>> >>> Meanwhile, I still suppose we should not seen it even with >>> error_report().. Would this happen when boot e.g. generic kernels? >> >> No, this is - most probably, still need to validate in details - an >> effect due to the special case with Jailhouse that interrupt and VT-d >> management is handed over from a running OS to a hypervisor. >> >> Jan >> >> PS: Current quick hack for EIM support (lacks compat mode blocking at >> least): >> >> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c >> index 1082ab5..709a92c 100644 >> --- a/hw/i386/intel_iommu.c >> +++ b/hw/i386/intel_iommu.c >> @@ -907,6 +907,7 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) >> value = vtd_get_quad_raw(s, DMAR_IRTA_REG); >> s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1); >> s->intr_root = value & VTD_IRTA_ADDR_MASK; >> + s->intr_eime = value & VTD_IRTA_EIME; >> >> QLIST_FOREACH(notifier, &s->iec_notifiers, list) { >> if (notifier->iec_notify) { >> @@ -2052,11 +2053,13 @@ static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, VTDIrq *irq >> irq->trigger_mode = irte.trigger_mode; >> irq->vector = irte.vector; >> irq->delivery_mode = irte.delivery_mode; >> - /* Not support EIM yet: please refer to vt-d 9.10 DST bits */ >> + irq->dest = irte.dest_id; >> + if (!iommu->intr_eime) { >> #define VTD_IR_APIC_DEST_MASK (0xff00ULL) >> #define VTD_IR_APIC_DEST_SHIFT (8) >> - irq->dest = (irte.dest_id & VTD_IR_APIC_DEST_MASK) >> \ >> - VTD_IR_APIC_DEST_SHIFT; >> + irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >> >> + VTD_IR_APIC_DEST_SHIFT; >> + } >> irq->dest_mode = irte.dest_mode; >> irq->redir_hint = irte.redir_hint; >> >> @@ -2316,7 +2319,7 @@ static void vtd_init(IntelIOMMUState *s) >> s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; >> >> if (ms->iommu_intr) { >> - s->ecap |= VTD_ECAP_IR; >> + s->ecap |= VTD_ECAP_IR | VTD_ECAP_EIM; >> } >> >> vtd_reset_context_cache(s); >> @@ -2370,10 +2373,9 @@ static void vtd_init(IntelIOMMUState *s) >> vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL); >> >> /* >> - * Interrupt remapping registers, not support extended interrupt >> - * mode for now. >> + * Interrupt remapping registers. >> */ >> - vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff00fULL, 0); >> + vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0); >> } >> >> /* Should not reset address_spaces when reset because devices will still use >> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h >> index 10c20fe..72b0114 100644 >> --- a/hw/i386/intel_iommu_internal.h >> +++ b/hw/i386/intel_iommu_internal.h >> @@ -176,6 +176,7 @@ >> >> /* IRTA_REG */ >> #define VTD_IRTA_ADDR_MASK (VTD_HAW_MASK ^ 0xfffULL) >> +#define VTD_IRTA_EIME (1ULL << 11) >> #define VTD_IRTA_SIZE_MASK (0xfULL) >> >> /* ECAP_REG */ >> @@ -184,6 +185,7 @@ >> #define VTD_ECAP_QI (1ULL << 1) >> /* Interrupt Remapping support */ >> #define VTD_ECAP_IR (1ULL << 3) >> +#define VTD_ECAP_EIM (1ULL << 4) >> >> /* CAP_REG */ >> /* (offset >> 4) << 24 */ >> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h >> index d7613af..c6c53c6 100644 >> --- a/include/hw/i386/intel_iommu.h >> +++ b/include/hw/i386/intel_iommu.h >> @@ -261,6 +261,7 @@ struct IntelIOMMUState { >> bool intr_enabled; /* Whether guest enabled IR */ >> dma_addr_t intr_root; /* Interrupt remapping table pointer */ >> uint32_t intr_size; /* Number of IR table entries */ >> + bool intr_eime; /* Extended interrupt mode enabled */ >> QLIST_HEAD(, VTD_IEC_Notifier) iec_notifiers; /* IEC notify list */ >> }; >> > > Currently, all the interrupts will be translated into one MSI in > vtd_generate_msi_message(), in which only 8 bits of dest_id is used > (msg.dest = irq->dest). We may possibly need to use the high 32 bits > of MSI address to store the rest dest[31:8]? Don't know whether this > would be enough though. Yes, I ran into this topic as well as I hacked up those line. Currently, KVM does not support more than 254 vCPUs, so 8 bits of those 32 are actually fine, and piggy-backing them in an MSI message works. Once KVM supports more CPUs, it has to come up with a new userspace interface to inject APIC events for more than 255 CPUs. Maybe the existing direct MSI inject with its unused flags could be "bended", maybe there are already better ideas (Radim?). In any case, the KVM layer in userspace will then have to pick up all 32 destination bits from the IRTE and deliver them via that new interface to the in-kernel APICs. Jan
2016-04-26 16:24+0200, Jan Kiszka: > On 2016-04-26 13:40, Peter Xu wrote: >> Currently, all the interrupts will be translated into one MSI in >> vtd_generate_msi_message(), in which only 8 bits of dest_id is used >> (msg.dest = irq->dest). We may possibly need to use the high 32 bits >> of MSI address to store the rest dest[31:8]? Don't know whether this >> would be enough though. > > Yes, I ran into this topic as well as I hacked up those line. Currently, > KVM does not support more than 254 vCPUs, so 8 bits of those 32 are > actually fine, and piggy-backing them in an MSI message works. > > Once KVM supports more CPUs, it has to come up with a new userspace > interface to inject APIC events for more than 255 CPUs. Maybe the > existing direct MSI inject with its unused flags could be "bended", > maybe there are already better ideas (Radim?). Adding a flag to msi_msg and taking 3-4 bytes from padding to express x2APIC addresses is reasonable. (It is what my prototype did. :]) The conceptually different idea is forcing all userspace interrupts through irqfd routes, which would obsolete the ad-host inject. > In any case, the KVM layer in userspace will then have to pick up all 32 > destination bits from the IRTE and deliver them via that new interface > to the in-kernel APICs. I'll keep that in mind, thanks.
On 2016-04-26 16:59, Radim Kr?má? wrote: > 2016-04-26 16:24+0200, Jan Kiszka: >> On 2016-04-26 13:40, Peter Xu wrote: >>> Currently, all the interrupts will be translated into one MSI in >>> vtd_generate_msi_message(), in which only 8 bits of dest_id is used >>> (msg.dest = irq->dest). We may possibly need to use the high 32 bits >>> of MSI address to store the rest dest[31:8]? Don't know whether this >>> would be enough though. >> >> Yes, I ran into this topic as well as I hacked up those line. Currently, >> KVM does not support more than 254 vCPUs, so 8 bits of those 32 are >> actually fine, and piggy-backing them in an MSI message works. >> >> Once KVM supports more CPUs, it has to come up with a new userspace >> interface to inject APIC events for more than 255 CPUs. Maybe the >> existing direct MSI inject with its unused flags could be "bended", >> maybe there are already better ideas (Radim?). > > Adding a flag to msi_msg and taking 3-4 bytes from padding to express > x2APIC addresses is reasonable. (It is what my prototype did. :]) > > The conceptually different idea is forcing all userspace interrupts > through irqfd routes, which would obsolete the ad-host inject. irqfd for userspace sources is a bit clumsy from the API POV. On the other hand, we need to tweak the routing API anyway to achieve the same address extension there, too. Jan > >> In any case, the KVM layer in userspace will then have to pick up all 32 >> destination bits from the IRTE and deliver them via that new interface >> to the in-kernel APICs. > > I'll keep that in mind, thanks. >
2016-04-26 17:28+0200, Jan Kiszka: > On 2016-04-26 16:59, Radim Kr?má? wrote: >> 2016-04-26 16:24+0200, Jan Kiszka: >>> On 2016-04-26 13:40, Peter Xu wrote: >>>> Currently, all the interrupts will be translated into one MSI in >>>> vtd_generate_msi_message(), in which only 8 bits of dest_id is used >>>> (msg.dest = irq->dest). We may possibly need to use the high 32 bits >>>> of MSI address to store the rest dest[31:8]? Don't know whether this >>>> would be enough though. >>> >>> Yes, I ran into this topic as well as I hacked up those line. Currently, >>> KVM does not support more than 254 vCPUs, so 8 bits of those 32 are >>> actually fine, and piggy-backing them in an MSI message works. >>> >>> Once KVM supports more CPUs, it has to come up with a new userspace >>> interface to inject APIC events for more than 255 CPUs. Maybe the >>> existing direct MSI inject with its unused flags could be "bended", >>> maybe there are already better ideas (Radim?). >> >> Adding a flag to msi_msg and taking 3-4 bytes from padding to express >> x2APIC addresses is reasonable. (It is what my prototype did. :]) >> >> The conceptually different idea is forcing all userspace interrupts >> through irqfd routes, which would obsolete the ad-host inject. > > irqfd for userspace sources is a bit clumsy from the API POV. On the > other hand, we need to tweak the routing API anyway to achieve the same > address extension there, too. I agree, both of them should change. KVM_SIGNAL_MSI was added just because the route-based injection in kvm_irqchip_send_msi() sucked; maybe we could find a good solution now, but the direct interface is already here ...
On 2016-04-26 18:07, Radim Kr?má? wrote: > 2016-04-26 17:28+0200, Jan Kiszka: >> On 2016-04-26 16:59, Radim Kr?má? wrote: >>> 2016-04-26 16:24+0200, Jan Kiszka: >>>> On 2016-04-26 13:40, Peter Xu wrote: >>>>> Currently, all the interrupts will be translated into one MSI in >>>>> vtd_generate_msi_message(), in which only 8 bits of dest_id is used >>>>> (msg.dest = irq->dest). We may possibly need to use the high 32 bits >>>>> of MSI address to store the rest dest[31:8]? Don't know whether this >>>>> would be enough though. >>>> >>>> Yes, I ran into this topic as well as I hacked up those line. Currently, >>>> KVM does not support more than 254 vCPUs, so 8 bits of those 32 are >>>> actually fine, and piggy-backing them in an MSI message works. >>>> >>>> Once KVM supports more CPUs, it has to come up with a new userspace >>>> interface to inject APIC events for more than 255 CPUs. Maybe the >>>> existing direct MSI inject with its unused flags could be "bended", >>>> maybe there are already better ideas (Radim?). >>> >>> Adding a flag to msi_msg and taking 3-4 bytes from padding to express >>> x2APIC addresses is reasonable. (It is what my prototype did. :]) >>> >>> The conceptually different idea is forcing all userspace interrupts >>> through irqfd routes, which would obsolete the ad-host inject. >> >> irqfd for userspace sources is a bit clumsy from the API POV. On the >> other hand, we need to tweak the routing API anyway to achieve the same >> address extension there, too. > > I agree, both of them should change. KVM_SIGNAL_MSI was added just > because the route-based injection in kvm_irqchip_send_msi() sucked; > maybe we could find a good solution now, but the direct interface is > already here ... We won't get away without irq routes because we need them for sources that only issue binary signals for interrupt events (in-kernel, eventfd from userspace). In fact, those sources should not know more than that about their irqs. But we also do not want them to route everything through userspace for on-the-fly translation and reinjection to kvm. Jan
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 1082ab5..709a92c 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -907,6 +907,7 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) value = vtd_get_quad_raw(s, DMAR_IRTA_REG); s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1); s->intr_root = value & VTD_IRTA_ADDR_MASK; + s->intr_eime = value & VTD_IRTA_EIME; QLIST_FOREACH(notifier, &s->iec_notifiers, list) { if (notifier->iec_notify) { @@ -2052,11 +2053,13 @@ static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, VTDIrq *irq irq->trigger_mode = irte.trigger_mode; irq->vector = irte.vector; irq->delivery_mode = irte.delivery_mode; - /* Not support EIM yet: please refer to vt-d 9.10 DST bits */ + irq->dest = irte.dest_id; + if (!iommu->intr_eime) { #define VTD_IR_APIC_DEST_MASK (0xff00ULL) #define VTD_IR_APIC_DEST_SHIFT (8) - irq->dest = (irte.dest_id & VTD_IR_APIC_DEST_MASK) >> \ - VTD_IR_APIC_DEST_SHIFT; + irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >> + VTD_IR_APIC_DEST_SHIFT; + } irq->dest_mode = irte.dest_mode; irq->redir_hint = irte.redir_hint; @@ -2316,7 +2319,7 @@ static void vtd_init(IntelIOMMUState *s) s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; if (ms->iommu_intr) { - s->ecap |= VTD_ECAP_IR; + s->ecap |= VTD_ECAP_IR | VTD_ECAP_EIM; } vtd_reset_context_cache(s); @@ -2370,10 +2373,9 @@ static void vtd_init(IntelIOMMUState *s) vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL); /* - * Interrupt remapping registers, not support extended interrupt - * mode for now. + * Interrupt remapping registers. */ - vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff00fULL, 0); + vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0); } /* Should not reset address_spaces when reset because devices will still use diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 10c20fe..72b0114 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -176,6 +176,7 @@ /* IRTA_REG */ #define VTD_IRTA_ADDR_MASK (VTD_HAW_MASK ^ 0xfffULL) +#define VTD_IRTA_EIME (1ULL << 11) #define VTD_IRTA_SIZE_MASK (0xfULL) /* ECAP_REG */ @@ -184,6 +185,7 @@ #define VTD_ECAP_QI (1ULL << 1) /* Interrupt Remapping support */ #define VTD_ECAP_IR (1ULL << 3) +#define VTD_ECAP_EIM (1ULL << 4) /* CAP_REG */ /* (offset >> 4) << 24 */ diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index d7613af..c6c53c6 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -261,6 +261,7 @@ struct IntelIOMMUState { bool intr_enabled; /* Whether guest enabled IR */ dma_addr_t intr_root; /* Interrupt remapping table pointer */ uint32_t intr_size; /* Number of IR table entries */ + bool intr_eime; /* Extended interrupt mode enabled */ QLIST_HEAD(, VTD_IEC_Notifier) iec_notifiers; /* IEC notify list */ };