Message ID | alpine.DEB.2.20.1712291406420.1899@nanos (mailing list archive)
---|---
State | New, archived
Delegated to | Bjorn Helgaas
On Fri, Dec 29, 2017 at 02:09:40PM +0100, Thomas Gleixner wrote:
> On Fri, 29 Dec 2017, Alexandru Chirvasitu wrote:
> > All right, I tried to do some more digging around, in the hope of
> > getting as close to the source of the problem as I can.
> >
> > I went back to the very first commit that went astray for me, 2db1f95
> > (which is the only one actually panicking), and tried to move from its
> > parent 90ad9e2 (that boots fine) to it gradually, altering the code in
> > small chunks.
> >
> > I tried to ignore the stuff that clearly shouldn't make a difference,
> > such as definitions. So in the end I get defined-but-unused-function
> > errors in my compilations, but I'm ignoring those for now. Some
> > results:
> >
> > (1) When I move from the good commit 90ad9e2 according to the attached
> > bad-diff (which moves partly towards 2db1f95), I get a panic.
> >
> > (2) On the other hand, when I further change this last panicking
> > commit by simply doing
> >
> >
> > ----------------------------------------------------------------
> > removed activate / deactivate from x86_vector_domain_ops
> >
> > diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
> > index 7317ba5a..063594d 100644
> > --- a/arch/x86/kernel/apic/vector.c
> > +++ b/arch/x86/kernel/apic/vector.c
> > @@ -514,8 +514,6 @@ void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,
> >  static const struct irq_domain_ops x86_vector_domain_ops = {
> >          .alloc          = x86_vector_alloc_irqs,
> >          .free           = x86_vector_free_irqs,
> > -        .activate       = x86_vector_activate,
> > -        .deactivate     = x86_vector_deactivate,
> >  #ifdef CONFIG_GENERIC_IRQ_DEBUGFS
> >          .debug_show     = x86_vector_debug_show,
> >  #endif
> > ----------------------------------------------------------------
> >
> > all is well.
>
> Nice detective work. Unfortunately that's not a real solution ...
>

Oh, of course. It was never intended as a solution, only as information
perhaps enabling someone who knows what they're doing (unlike myself :) )
to find one.

> Can you try the patch below on top of Linus tree, please?
>
> Thanks,
>

Applied it to 464e1d5 4.15-rc5 just now. It appears to be trouble-free:
booted, logged me in fine, the works.

>         tglx
>
> 8<---------------------
> --- a/kernel/irq/msi.c
> +++ b/kernel/irq/msi.c
> @@ -339,6 +339,40 @@ int msi_domain_populate_irqs(struct irq_
>          return ret;
>  }
>
> +/*
> + * Carefully check whether the device can use reservation mode. If
> + * reservation mode is enabled then the early activation will assign a
> + * dummy vector to the device. If the PCI/MSI device does not support
> + * masking of the entry then this can result in spurious interrupts when
> + * the device driver is not absolutely careful. But even then a malfunction
> + * of the hardware could result in a spurious interrupt on the dummy vector
> + * and render the device unusable. If the entry can be masked then the core
> + * logic will prevent the spurious interrupt and reservation mode can be
> + * used. For now reservation mode is restricted to PCI/MSI.
> + */
> +static bool msi_check_reservation_mode(struct irq_domain *domain,
> +                                       struct msi_domain_info *info,
> +                                       struct device *dev)
> +{
> +        struct msi_desc *desc;
> +
> +        if (domain->bus_token != DOMAIN_BUS_PCI_MSI)
> +                return false;
> +
> +        if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
> +                return false;
> +
> +        if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
> +                return false;
> +
> +        /*
> +         * Checking the first MSI descriptor is sufficient. MSIX supports
> +         * masking and MSI does so when the maskbit is set.
> +         */
> +        desc = first_msi_entry(dev);
> +        return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
> +}
> +
>  /**
>   * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
>   * @domain:    The domain to allocate from
> @@ -353,9 +387,11 @@ int msi_domain_alloc_irqs(struct irq_dom
>  {
>          struct msi_domain_info *info = domain->host_data;
>          struct msi_domain_ops *ops = info->ops;
> -        msi_alloc_info_t arg;
> +        struct irq_data *irq_data;
>          struct msi_desc *desc;
> +        msi_alloc_info_t arg;
>          int i, ret, virq;
> +        bool can_reserve;
>
>          ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
>          if (ret)
> @@ -385,6 +421,8 @@ int msi_domain_alloc_irqs(struct irq_dom
>          if (ops->msi_finish)
>                  ops->msi_finish(&arg, 0);
>
> +        can_reserve = msi_check_reservation_mode(domain, info, dev);
> +
>          for_each_msi_entry(desc, dev) {
>                  virq = desc->irq;
>                  if (desc->nvec_used == 1)
> @@ -397,17 +435,28 @@ int msi_domain_alloc_irqs(struct irq_dom
>                   * the MSI entries before the PCI layer enables MSI in the
>                   * card. Otherwise the card latches a random msi message.
>                   */
> -                if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
> -                        struct irq_data *irq_data;
> +                if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
> +                        continue;
>
> +                irq_data = irq_domain_get_irq_data(domain, desc->irq);
> +                if (!can_reserve)
> +                        irqd_clr_can_reserve(irq_data);
> +                ret = irq_domain_activate_irq(irq_data, can_reserve);
> +                if (ret)
> +                        goto cleanup;
> +        }
> +
> +        /*
> +         * If these interrupts use reservation mode clear the activated bit
> +         * so request_irq() will assign the final vector.
> +         */
> +        if (can_reserve) {
> +                for_each_msi_entry(desc, dev) {
>                          irq_data = irq_domain_get_irq_data(domain, desc->irq);
> -                        ret = irq_domain_activate_irq(irq_data, true);
> -                        if (ret)
> -                                goto cleanup;
> -                        if (info->flags & MSI_FLAG_MUST_REACTIVATE)
> -                                irqd_clr_activated(irq_data);
> +                        irqd_clr_activated(irq_data);
>                  }
>          }
> +
>          return 0;
>
>  cleanup:
> --- a/arch/x86/kernel/apic/vector.c
> +++ b/arch/x86/kernel/apic/vector.c
> @@ -184,6 +184,7 @@ static void reserve_irq_vector_locked(st
>          irq_matrix_reserve(vector_matrix);
>          apicd->can_reserve = true;
>          apicd->has_reserved = true;
> +        irqd_set_can_reserve(irqd);
>          trace_vector_reserve(irqd->irq, 0);
>          vector_assign_managed_shutdown(irqd);
>  }
> @@ -368,8 +369,18 @@ static int activate_reserved(struct irq_
>          int ret;
>
>          ret = assign_irq_vector_any_locked(irqd);
> -        if (!ret)
> +        if (!ret) {
>                  apicd->has_reserved = false;
> +                /*
> +                 * Core might have disabled reservation mode after
> +                 * allocating the irq descriptor. Ideally this should
> +                 * happen before allocation time, but that would require
> +                 * completely convoluted ways of transporting that
> +                 * information.
> +                 */
> +                if (!irqd_can_reserve(irqd))
> +                        apicd->can_reserve = false;
> +        }
>          return ret;
>  }
>
> @@ -478,6 +489,7 @@ static bool vector_configure_legacy(unsi
>          } else {
>                  /* Release the vector */
>                  apicd->can_reserve = true;
> +                irqd_set_can_reserve(irqd);
>                  clear_irq_vector(irqd);
>                  realloc = true;
>          }
> --- a/include/linux/irq.h
> +++ b/include/linux/irq.h
> @@ -212,6 +212,7 @@ struct irq_data {
>   *                                mask. Applies only to affinity managed irqs.
>   * IRQD_SINGLE_TARGET           - IRQ allows only a single affinity target
>   * IRQD_DEFAULT_TRIGGER_SET     - Expected trigger already been set
> + * IRQD_CAN_RESERVE             - Can use reservation mode
>   */
>  enum {
>          IRQD_TRIGGER_MASK               = 0xf,
> @@ -233,6 +234,7 @@ enum {
>          IRQD_MANAGED_SHUTDOWN           = (1 << 23),
>          IRQD_SINGLE_TARGET              = (1 << 24),
>          IRQD_DEFAULT_TRIGGER_SET        = (1 << 25),
> +        IRQD_CAN_RESERVE                = (1 << 26),
>  };
>
>  #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
> @@ -377,6 +379,21 @@ static inline bool irqd_is_managed_and_s
>          return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN;
>  }
>
> +static inline void irqd_set_can_reserve(struct irq_data *d)
> +{
> +        __irqd_to_state(d) |= IRQD_CAN_RESERVE;
> +}
> +
> +static inline void irqd_clr_can_reserve(struct irq_data *d)
> +{
> +        __irqd_to_state(d) &= ~IRQD_CAN_RESERVE;
> +}
> +
> +static inline bool irqd_can_reserve(struct irq_data *d)
> +{
> +        return __irqd_to_state(d) & IRQD_CAN_RESERVE;
> +}
> +
>  #undef __irqd_to_state
>
>  static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
> --- a/kernel/irq/debugfs.c
> +++ b/kernel/irq/debugfs.c
> @@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdat
>          BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
>          BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
>          BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
> +        BIT_MASK_DESCR(IRQD_CAN_RESERVE),
>
>          BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
>
> --- a/arch/x86/kernel/apic/apic_flat_64.c
> +++ b/arch/x86/kernel/apic/apic_flat_64.c
> @@ -151,7 +151,7 @@ static struct apic apic_flat __ro_after_
>          .apic_id_valid                  = default_apic_id_valid,
>          .apic_id_registered             = flat_apic_id_registered,
>
> -        .irq_delivery_mode              = dest_LowestPrio,
> +        .irq_delivery_mode              = dest_Fixed,
>          .irq_dest_mode                  = 1, /* logical */
>
>          .disable_esr                    = 0,
> --- a/arch/x86/kernel/apic/apic_noop.c
> +++ b/arch/x86/kernel/apic/apic_noop.c
> @@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init =
>          .apic_id_valid                  = default_apic_id_valid,
>          .apic_id_registered             = noop_apic_id_registered,
>
> -        .irq_delivery_mode              = dest_LowestPrio,
> +        .irq_delivery_mode              = dest_Fixed,
>          /* logical delivery broadcast to all CPUs: */
>          .irq_dest_mode                  = 1,
>
> --- a/arch/x86/kernel/apic/msi.c
> +++ b/arch/x86/kernel/apic/msi.c
> @@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct i
>                  ((apic->irq_dest_mode == 0) ?
>                          MSI_ADDR_DEST_MODE_PHYSICAL :
>                          MSI_ADDR_DEST_MODE_LOGICAL) |
> -                ((apic->irq_delivery_mode != dest_LowestPrio) ?
> -                        MSI_ADDR_REDIRECTION_CPU :
> -                        MSI_ADDR_REDIRECTION_LOWPRI) |
> +                MSI_ADDR_REDIRECTION_CPU |
>                  MSI_ADDR_DEST_ID(cfg->dest_apicid);
>
>          msg->data =
>                  MSI_DATA_TRIGGER_EDGE |
>                  MSI_DATA_LEVEL_ASSERT |
> -                ((apic->irq_delivery_mode != dest_LowestPrio) ?
> -                        MSI_DATA_DELIVERY_FIXED :
> -                        MSI_DATA_DELIVERY_LOWPRI) |
> +                MSI_DATA_DELIVERY_FIXED |
>                  MSI_DATA_VECTOR(cfg->vector);
>  }
>
> --- a/arch/x86/kernel/apic/probe_32.c
> +++ b/arch/x86/kernel/apic/probe_32.c
> @@ -105,7 +105,7 @@ static struct apic apic_default __ro_aft
>          .apic_id_valid                  = default_apic_id_valid,
>          .apic_id_registered             = default_apic_id_registered,
>
> -        .irq_delivery_mode              = dest_LowestPrio,
> +        .irq_delivery_mode              = dest_Fixed,
>          /* logical delivery broadcast to all CPUs: */
>          .irq_dest_mode                  = 1,
>
> --- a/arch/x86/kernel/apic/x2apic_cluster.c
> +++ b/arch/x86/kernel/apic/x2apic_cluster.c
> @@ -184,7 +184,7 @@ static struct apic apic_x2apic_cluster _
>          .apic_id_valid                  = x2apic_apic_id_valid,
>          .apic_id_registered             = x2apic_apic_id_registered,
>
> -        .irq_delivery_mode              = dest_LowestPrio,
> +        .irq_delivery_mode              = dest_Fixed,
>          .irq_dest_mode                  = 1, /* logical */
>
>          .disable_esr                    = 0,
> --- a/drivers/pci/host/pci-hyperv.c
> +++ b/drivers/pci/host/pci-hyperv.c
> @@ -985,9 +985,7 @@ static u32 hv_compose_msi_req_v1(
>          int_pkt->wslot.slot = slot;
>          int_pkt->int_desc.vector = vector;
>          int_pkt->int_desc.vector_count = 1;
> -        int_pkt->int_desc.delivery_mode =
> -                (apic->irq_delivery_mode == dest_LowestPrio) ?
> -                        dest_LowestPrio : dest_Fixed;
> +        int_pkt->int_desc.delivery_mode = dest_Fixed;
>
>          /*
>           * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in
> @@ -1008,9 +1006,7 @@ static u32 hv_compose_msi_req_v2(
>          int_pkt->wslot.slot = slot;
>          int_pkt->int_desc.vector = vector;
>          int_pkt->int_desc.vector_count = 1;
> -        int_pkt->int_desc.delivery_mode =
> -                (apic->irq_delivery_mode == dest_LowestPrio) ?
> -                        dest_LowestPrio : dest_Fixed;
> +        int_pkt->int_desc.delivery_mode = dest_Fixed;
>
>          /*
>           * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
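
The key decision in the patch is the msi_check_reservation_mode() helper
quoted above: reservation mode is only safe when a spurious interrupt on the
dummy vector can be held off, i.e. when the entry is maskable (MSI-X entries
always are; plain MSI only when the maskbit is implemented), and only for
PCI/MSI domains that reactivate on request_irq(). A minimal standalone sketch
of that decision table follows; the names (fake_msi_desc,
can_use_reservation_mode) are invented stand-ins for illustration, not the
kernel's API:

----------------------------------------------------------------
#include <stdbool.h>
#include <stdio.h>

/* Invented stand-in for the two msi_attrib bits the real check reads. */
struct fake_msi_desc {
        bool is_msix;   /* MSI-X entries are always maskable */
        bool maskbit;   /* plain MSI is maskable only if the mask bit exists */
};

/* Mirrors the guard conditions of msi_check_reservation_mode(). */
static bool can_use_reservation_mode(bool pci_msi_domain, bool must_reactivate,
                                     bool ignore_mask,
                                     const struct fake_msi_desc *desc)
{
        if (!pci_msi_domain)    /* for now restricted to PCI/MSI */
                return false;
        if (!must_reactivate)   /* no early dummy activation happens */
                return false;
        if (ignore_mask)        /* masking disabled, spurious irqs possible */
                return false;
        /* Maskability decides: MSI-X always, MSI only with the maskbit. */
        return desc->is_msix || desc->maskbit;
}

int main(void)
{
        const struct fake_msi_desc msix = { .is_msix = true,  .maskbit = false };
        const struct fake_msi_desc msi  = { .is_msix = false, .maskbit = false };

        /* MSI-X device: reservation mode is safe -> prints 1 */
        printf("%d\n", can_use_reservation_mode(true, true, false, &msix));
        /* Unmaskable MSI device: needs a real vector up front -> prints 0 */
        printf("%d\n", can_use_reservation_mode(true, true, false, &msi));
        return 0;
}
----------------------------------------------------------------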
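The other half of the rework is the two-phase activation in
msi_domain_alloc_irqs(): every entry is still activated early so the MSI
message is written before the device enables MSI, but when reservation mode
is in use the activated bit is cleared again so that the first request_irq()
performs the real activation and assigns the final vector. A toy model of
that sequence, with invented names (toy_irq, toy_activate, toy_request_irq)
and made-up vector numbers, purely illustrative:

----------------------------------------------------------------
#include <stdbool.h>
#include <stdio.h>

#define DUMMY_VECTOR 1  /* stands in for the shared reservation vector */
#define REAL_VECTOR  42 /* stands in for a vector from the matrix allocator */

/* Invented miniature of the per-irq state the patch juggles. */
struct toy_irq {
        bool activated;
        bool can_reserve;
        int  vector;    /* 0 = none assigned yet */
};

/* Early activation: in reservation mode only the dummy vector is parked. */
static void toy_activate(struct toy_irq *irq, bool reserve)
{
        irq->vector = reserve ? DUMMY_VECTOR : REAL_VECTOR;
        irq->activated = true;
}

/* request_irq() re-activates because the activated bit was cleared. */
static void toy_request_irq(struct toy_irq *irq)
{
        if (!irq->activated)
                toy_activate(irq, false);
}

int main(void)
{
        struct toy_irq irq = { .can_reserve = true };

        toy_activate(&irq, irq.can_reserve);    /* MSI message written early */
        if (irq.can_reserve)
                irq.activated = false;          /* mirrors irqd_clr_activated() */
        printf("after alloc:       vector=%d\n", irq.vector);  /* 1, dummy */

        toy_request_irq(&irq);                  /* driver requests the irq */
        printf("after request_irq: vector=%d\n", irq.vector);  /* 42, real */
        return 0;
}
----------------------------------------------------------------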