diff mbox series

[RFC,11/13] iommu/vt-d: Add an irq_chip for posted MSIs

Message ID 20231112041643.2868316-12-jacob.jun.pan@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series Coalesced Interrupt Delivery with posted MSI | expand

Commit Message

Jacob Pan Nov. 12, 2023, 4:16 a.m. UTC
With posted MSIs, end of interrupt is handled by the notification
handler. Each MSI handler does not go through local APIC IRR, ISR
processing. There's no need to do apic_eoi() in those handlers.

Add a new acpi_ack_irq_no_eoi() for the posted MSI IR chip. At runtime
the call trace looks like:

__sysvec_posted_msi_notification() {
  irq_chip_ack_parent() {
    apic_ack_irq_no_eoi();
  }
  handle_irq_event() {
    handle_irq_event_percpu() {
       driver_handler()
    }
  }

IO-APIC IR is excluded the from posted MSI, we need to make sure it
still performs EOI.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
 arch/x86/include/asm/apic.h         |  1 +
 arch/x86/kernel/apic/io_apic.c      |  2 +-
 arch/x86/kernel/apic/vector.c       |  5 ++++
 drivers/iommu/intel/irq_remapping.c | 38 ++++++++++++++++++++++++++++-
 4 files changed, 44 insertions(+), 2 deletions(-)

Comments

Thomas Gleixner Dec. 6, 2023, 8:15 p.m. UTC | #1
On Sat, Nov 11 2023 at 20:16, Jacob Pan wrote:
> With posted MSIs, end of interrupt is handled by the notification
> handler. Each MSI handler does not go through local APIC IRR, ISR
> processing. There's no need to do apic_eoi() in those handlers.
>
> Add a new acpi_ack_irq_no_eoi() for the posted MSI IR chip. At runtime
> the call trace looks like:
>
> __sysvec_posted_msi_notification() {
>   irq_chip_ack_parent() {
>     apic_ack_irq_no_eoi();
>   }

Huch? There is something missing here to make sense.

>   handle_irq_event() {
>     handle_irq_event_percpu() {
>        driver_handler()
>     }
>   }
>
> IO-APIC IR is excluded the from posted MSI, we need to make sure it
> still performs EOI.

We need to make the code correct and write changelogs which make
sense. This sentence makes no sense whatsoever.

What has the IO-APIC to do with posted MSIs?

It's a different interrupt chip hierarchy, no?

> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> index 00da6cf6b07d..ca398ee9075b 100644
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -1993,7 +1993,7 @@ static struct irq_chip ioapic_ir_chip __read_mostly = {
>  	.irq_startup		= startup_ioapic_irq,
>  	.irq_mask		= mask_ioapic_irq,
>  	.irq_unmask		= unmask_ioapic_irq,
> -	.irq_ack		= irq_chip_ack_parent,
> +	.irq_ack		= apic_ack_irq,

Why?

>  	.irq_eoi		= ioapic_ir_ack_level,
>  	.irq_set_affinity	= ioapic_set_affinity,
>  	.irq_retrigger		= irq_chip_retrigger_hierarchy,
> diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
> index 14fc33cfdb37..01223ac4f57a 100644
> --- a/arch/x86/kernel/apic/vector.c
> +++ b/arch/x86/kernel/apic/vector.c
> @@ -911,6 +911,11 @@ void apic_ack_irq(struct irq_data *irqd)
>  	apic_eoi();
>  }
>  
> +void apic_ack_irq_no_eoi(struct irq_data *irqd)
> +{
> +	irq_move_irq(irqd);
> +}
> +

The exact purpose of that function is to invoke irq_move_irq() which is
a completely pointless exercise for interrupts which are remapped.
Thomas Gleixner Dec. 6, 2023, 8:44 p.m. UTC | #2
On Sat, Nov 11 2023 at 20:16, Jacob Pan wrote:
>  static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle)
>  {
>  	memset(msg, 0, sizeof(*msg));
> @@ -1361,7 +1397,7 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain,
>  
>  		irq_data->hwirq = (index << 16) + i;
>  		irq_data->chip_data = ird;
> -		irq_data->chip = &intel_ir_chip;
> +		irq_data->chip = posted_msi_supported() ? &intel_ir_chip_post_msi : &intel_ir_chip;

This is just wrong because you change the chip to posted for _ALL_
domains unconditionally.

The only domains which want this chip are the PCI/MSI domains. And those
are distinct from the domains which serve IO/APIC, HPET, no?

So you can set that chip only for PCI/MSI and just let IO/APIC, HPET
domains keep the original chip, which spares any modification of the
IO/APIC domain.
Jacob Pan Dec. 13, 2023, 3:42 a.m. UTC | #3
Hi Thomas,

On Wed, 06 Dec 2023 21:44:02 +0100, Thomas Gleixner <tglx@linutronix.de>
wrote:

> On Sat, Nov 11 2023 at 20:16, Jacob Pan wrote:
> >  static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle)
> >  {
> >  	memset(msg, 0, sizeof(*msg));
> > @@ -1361,7 +1397,7 @@ static int intel_irq_remapping_alloc(struct
> > irq_domain *domain, 
> >  		irq_data->hwirq = (index << 16) + i;
> >  		irq_data->chip_data = ird;
> > -		irq_data->chip = &intel_ir_chip;
> > +		irq_data->chip = posted_msi_supported() ?
> > &intel_ir_chip_post_msi : &intel_ir_chip;  
> 
> This is just wrong because you change the chip to posted for _ALL_
> domains unconditionally.
> 
> The only domains which want this chip are the PCI/MSI domains. And those
> are distinct from the domains which serve IO/APIC, HPET, no?
> 
> So you can set that chip only for PCI/MSI and just let IO/APIC, HPET
> domains keep the original chip, which spares any modification of the
> IO/APIC domain.
> 
> 
make sense.
-               irq_data->chip = posted_msi_supported() ? &intel_ir_chip_post_msi : &intel_ir_chip;
+               if ((info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) && posted_msi_supported())
+                       irq_data->chip = &intel_ir_chip_post_msi;
+               else
+                       irq_data->chip = &intel_ir_chip;

Now in IRQ debugfs, I can see the correct IR chips for IOAPIC IRQs and
MSIs.

e.g


domain:  IO-APIC-8
 hwirq:   0x9
 chip:    IR-IO-APIC
  flags:   0x410
             IRQCHIP_SKIP_SET_WAKE
 parent:
    domain:  INTEL-IR-9-13
     hwirq:   0x80000
     chip:    INTEL-IR
      flags:   0x0
     parent:
        domain:  VECTOR


domain:  IR-PCI-MSI-0000:3d:00.4-11
 hwirq:   0x0
 chip:    IR-PCI-MSI-0000:3d:00.4
  flags:   0x430
             IRQCHIP_SKIP_SET_WAKE
             IRQCHIP_ONESHOT_SAFE
 parent:
    domain:  INTEL-IR-4-13
     hwirq:   0x0
     chip:    INTEL-IR-POST
      flags:   0x0
     parent:
        domain:  VECTOR


Thanks,

Jacob
Jacob Pan Jan. 26, 2024, 11:31 p.m. UTC | #4
Hi Thomas,

On Wed, 06 Dec 2023 21:15:24 +0100, Thomas Gleixner <tglx@linutronix.de>
wrote:

> On Sat, Nov 11 2023 at 20:16, Jacob Pan wrote:
> > With posted MSIs, end of interrupt is handled by the notification
> > handler. Each MSI handler does not go through local APIC IRR, ISR
> > processing. There's no need to do apic_eoi() in those handlers.
> >
> > Add a new acpi_ack_irq_no_eoi() for the posted MSI IR chip. At runtime
> > the call trace looks like:
> >
> > __sysvec_posted_msi_notification() {
> >   irq_chip_ack_parent() {
> >     apic_ack_irq_no_eoi();
> >   }  
> 
> Huch? There is something missing here to make sense.
Good point, I was too focused on eoi. The trace should be like

 * __sysvec_posted_msi_notification()
 *	irq_enter();
 *		handle_edge_irq()
 *			irq_chip_ack_parent()
 *				dummy(); // No EOI
 *			handle_irq_event()
 *				driver_handler()
 *	irq_enter();
 *		handle_edge_irq()
 *			irq_chip_ack_parent()
 *				dummy(); // No EOI
 *			handle_irq_event()
 *				driver_handler()
 *	irq_enter();
 *		handle_edge_irq()
 *			irq_chip_ack_parent()
 *				dummy(); // No EOI
 *			handle_irq_event()
 *				driver_handler()
 *	apic_eoi()
 * irq_exit()

> >   handle_irq_event() {
> >     handle_irq_event_percpu() {
> >        driver_handler()
> >     }
> >   }
> >
> > IO-APIC IR is excluded the from posted MSI, we need to make sure it
> > still performs EOI.  
> 
> We need to make the code correct and write changelogs which make
> sense. This sentence makes no sense whatsoever.
> 
> What has the IO-APIC to do with posted MSIs?
> 
> It's a different interrupt chip hierarchy, no?
Right, I should not modify IOAPIC chip. Just assign posted IR chip to
device MSI/x.

> > diff --git a/arch/x86/kernel/apic/io_apic.c
> > b/arch/x86/kernel/apic/io_apic.c index 00da6cf6b07d..ca398ee9075b 100644
> > --- a/arch/x86/kernel/apic/io_apic.c
> > +++ b/arch/x86/kernel/apic/io_apic.c
> > @@ -1993,7 +1993,7 @@ static struct irq_chip ioapic_ir_chip
> > __read_mostly = { .irq_startup		= startup_ioapic_irq,
> >  	.irq_mask		= mask_ioapic_irq,
> >  	.irq_unmask		= unmask_ioapic_irq,
> > -	.irq_ack		= irq_chip_ack_parent,
> > +	.irq_ack		= apic_ack_irq,  
> 
> Why?
ditto.

> 
> >  	.irq_eoi		= ioapic_ir_ack_level,
> >  	.irq_set_affinity	= ioapic_set_affinity,
> >  	.irq_retrigger		= irq_chip_retrigger_hierarchy,
> > diff --git a/arch/x86/kernel/apic/vector.c
> > b/arch/x86/kernel/apic/vector.c index 14fc33cfdb37..01223ac4f57a 100644
> > --- a/arch/x86/kernel/apic/vector.c
> > +++ b/arch/x86/kernel/apic/vector.c
> > @@ -911,6 +911,11 @@ void apic_ack_irq(struct irq_data *irqd)
> >  	apic_eoi();
> >  }
> >  
> > +void apic_ack_irq_no_eoi(struct irq_data *irqd)
> > +{
> > +	irq_move_irq(irqd);
> > +}
> > +  
> 
> The exact purpose of that function is to invoke irq_move_irq() which is
> a completely pointless exercise for interrupts which are remapped.

OK, I will replace this with a dummy .irq_ack() function.
Device MSIs do not have IRQD_SETAFFINITY_PENDING set.

Thanks,

Jacob
diff mbox series

Patch

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 5af4ec1a0f71..a88015d5638b 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -485,6 +485,7 @@  static inline void apic_setup_apic_calls(void) { }
 #endif /* CONFIG_X86_LOCAL_APIC */
 
 extern void apic_ack_irq(struct irq_data *data);
+extern void apic_ack_irq_no_eoi(struct irq_data *data);
 
 static inline bool lapic_vector_set_in_irr(unsigned int vector)
 {
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 00da6cf6b07d..ca398ee9075b 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1993,7 +1993,7 @@  static struct irq_chip ioapic_ir_chip __read_mostly = {
 	.irq_startup		= startup_ioapic_irq,
 	.irq_mask		= mask_ioapic_irq,
 	.irq_unmask		= unmask_ioapic_irq,
-	.irq_ack		= irq_chip_ack_parent,
+	.irq_ack		= apic_ack_irq,
 	.irq_eoi		= ioapic_ir_ack_level,
 	.irq_set_affinity	= ioapic_set_affinity,
 	.irq_retrigger		= irq_chip_retrigger_hierarchy,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 14fc33cfdb37..01223ac4f57a 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -911,6 +911,11 @@  void apic_ack_irq(struct irq_data *irqd)
 	apic_eoi();
 }
 
+void apic_ack_irq_no_eoi(struct irq_data *irqd)
+{
+	irq_move_irq(irqd);
+}
+
 void apic_ack_edge(struct irq_data *irqd)
 {
 	irq_complete_move(irqd_cfg(irqd));
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index 29b9e55dcf26..f2870d3c8313 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -1233,6 +1233,42 @@  static struct irq_chip intel_ir_chip = {
 	.irq_set_vcpu_affinity	= intel_ir_set_vcpu_affinity,
 };
 
+/*
+ * With posted MSIs, all vectors are multiplexed into a single notification
+ * vector. Devices MSIs are then dispatched in a demux loop where
+ * EOIs can be coalesced as well.
+ *
+ * IR chip "INTEL-IR-POST" does not do EOI on ACK instead letting posted
+ * interrupt notification handler to perform EOI.
+ *
+ * For the example below, 3 MSIs are coalesced in one CPU notification. Only
+ * one apic_eoi() is needed.
+ *
+ * __sysvec_posted_msi_notification() {
+ * irq_enter()
+ *	handle_edge_irq()
+ *		irq_chip_ack_parent()
+ *			apic_ack_irq_no_eoi()
+ *		handle_irq()
+ *	handle_edge_irq()
+ *		irq_chip_ack_parent()
+ *			apic_ack_irq_no_eoi()
+ *		handle_irq()
+ *	handle_edge_irq()
+ *		irq_chip_ack_parent()
+ *			apic_ack_irq_no_eoi()
+ *		handle_irq()
+ *	apic_eoi()
+ * irq_exit()
+ */
+static struct irq_chip intel_ir_chip_post_msi = {
+	.name			= "INTEL-IR-POST",
+	.irq_ack		= apic_ack_irq_no_eoi,
+	.irq_set_affinity	= intel_ir_set_affinity,
+	.irq_compose_msi_msg	= intel_ir_compose_msi_msg,
+	.irq_set_vcpu_affinity	= intel_ir_set_vcpu_affinity,
+};
+
 static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle)
 {
 	memset(msg, 0, sizeof(*msg));
@@ -1361,7 +1397,7 @@  static int intel_irq_remapping_alloc(struct irq_domain *domain,
 
 		irq_data->hwirq = (index << 16) + i;
 		irq_data->chip_data = ird;
-		irq_data->chip = &intel_ir_chip;
+		irq_data->chip = posted_msi_supported() ? &intel_ir_chip_post_msi : &intel_ir_chip;
 		intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i);
 		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
 	}