
[RFC,09/13] x86/irq: Install posted MSI notification handler

Message ID 20231112041643.2868316-10-jacob.jun.pan@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series Coalesced Interrupt Delivery with posted MSI

Commit Message

Jacob Pan Nov. 12, 2023, 4:16 a.m. UTC
All MSI vectors are multiplexed into a single notification vector when
posted MSI is enabled. It is the responsibility of the notification
vector handler to demultiplex MSI vectors. In this handler, for each
pending bit, the corresponding MSI vector handler is dispatched without
IDT delivery.

For example, the interrupt flow will change as follows:
(3 MSIs of different vectors arrive in a high-frequency burst)

BEFORE:
interrupt(MSI)
    irq_enter()
    handler() /* EOI */
    irq_exit()
        process_softirq()
interrupt(MSI)
    irq_enter()
    handler() /* EOI */
    irq_exit()
        process_softirq()
interrupt(MSI)
    irq_enter()
    handler() /* EOI */
    irq_exit()
        process_softirq()

AFTER:
interrupt /* Posted MSI notification vector */
    irq_enter()
	atomic_xchg(PIR)
	handler()
	handler()
	handler()
	pi_clear_on()
    apic_eoi()
    irq_exit()
        process_softirq()

Except for the leading MSI, CPU notifications are skipped/coalesced.

For MSIs that arrive at a low frequency, the demultiplexing loop does not
wait for more interrupts to coalesce. Therefore, there is no additional
latency other than the processing time.
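
For illustration, a condensed sketch of the demultiplexing step (excerpted
from the handler added by this patch; pid points at this CPU's posted
interrupt descriptor):

	u64 pir_copy[4];
	int i, vec = FIRST_EXTERNAL_VECTOR;

	/* Harvest and clear the pending bits posted by the IOMMU. */
	for (i = 0; i < 4; i++)
		pir_copy[i] = raw_atomic64_xchg((atomic64_t *)&pid->pir_l[i], 0);

	/* Dispatch each pending vector directly, without IDT delivery. */
	for_each_set_bit_from(vec, (unsigned long *)pir_copy, 256)
		call_irq_handler(vec, regs);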

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
 arch/x86/include/asm/hardirq.h  |  3 ++
 arch/x86/include/asm/idtentry.h |  3 ++
 arch/x86/kernel/idt.c           |  3 ++
 arch/x86/kernel/irq.c           | 91 +++++++++++++++++++++++++++++++++
 4 files changed, 100 insertions(+)

Comments

Peter Zijlstra Nov. 15, 2023, 12:42 p.m. UTC | #1
On Sat, Nov 11, 2023 at 08:16:39PM -0800, Jacob Pan wrote:

> +static __always_inline inline void handle_pending_pir(struct pi_desc *pid, struct pt_regs *regs)
> +{
> +	int i, vec = FIRST_EXTERNAL_VECTOR;
> +	u64 pir_copy[4];
> +
> +	/*
> +	 * Make a copy of PIR which contains IRQ pending bits for vectors,
> +	 * then invoke IRQ handlers for each pending vector.
> +	 * If any new interrupts were posted while we are processing, will
> +	 * do again before allowing new notifications. The idea is to
> +	 * minimize the number of the expensive notifications if IRQs come
> +	 * in a high frequency burst.
> +	 */
> +	for (i = 0; i < 4; i++)
> +		pir_copy[i] = raw_atomic64_xchg((atomic64_t *)&pid->pir_l[i], 0);

Might as well use arch_xchg() and save the atomic64_t casting.
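
For reference, a minimal sketch of that copy-and-clear loop using arch_xchg(),
assuming pir_l stays an array of plain u64:

	for (i = 0; i < 4; i++)
		pir_copy[i] = arch_xchg(&pid->pir_l[i], 0);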

> +
> +	/*
> +	 * Ideally, we should start from the high order bits set in the PIR
> +	 * since each bit represents a vector. Higher order bit position means
> +	 * the vector has higher priority. But external vectors are allocated
> +	 * based on availability not priority.
> +	 *
> +	 * EOI is included in the IRQ handlers call to apic_ack_irq, which
> +	 * allows higher priority system interrupt to get in between.
> +	 */
> +	for_each_set_bit_from(vec, (unsigned long *)&pir_copy[0], 256)
> +		call_irq_handler(vec, regs);
> +
> +}
Peter Zijlstra Nov. 15, 2023, 12:56 p.m. UTC | #2
On Sat, Nov 11, 2023 at 08:16:39PM -0800, Jacob Pan wrote:

> +static __always_inline inline void handle_pending_pir(struct pi_desc *pid, struct pt_regs *regs)
> +{

__always_inline means that... (A)

> +	int i, vec = FIRST_EXTERNAL_VECTOR;
> +	u64 pir_copy[4];
> +
> +	/*
> +	 * Make a copy of PIR which contains IRQ pending bits for vectors,
> +	 * then invoke IRQ handlers for each pending vector.
> +	 * If any new interrupts were posted while we are processing, will
> +	 * do again before allowing new notifications. The idea is to
> +	 * minimize the number of the expensive notifications if IRQs come
> +	 * in a high frequency burst.
> +	 */
> +	for (i = 0; i < 4; i++)
> +		pir_copy[i] = raw_atomic64_xchg((atomic64_t *)&pid->pir_l[i], 0);
> +
> +	/*
> +	 * Ideally, we should start from the high order bits set in the PIR
> +	 * since each bit represents a vector. Higher order bit position means
> +	 * the vector has higher priority. But external vectors are allocated
> +	 * based on availability not priority.
> +	 *
> +	 * EOI is included in the IRQ handlers call to apic_ack_irq, which
> +	 * allows higher priority system interrupt to get in between.
> +	 */
> +	for_each_set_bit_from(vec, (unsigned long *)&pir_copy[0], 256)
> +		call_irq_handler(vec, regs);
> +
> +}
> +
> +/*
> + * Performance data shows that 3 is good enough to harvest 90+% of the benefit
> + * on high IRQ rate workload.
> + * Alternatively, could make this tunable, use 3 as default.
> + */
> +#define MAX_POSTED_MSI_COALESCING_LOOP 3
> +
> +/*
> + * For MSIs that are delivered as posted interrupts, the CPU notifications
> + * can be coalesced if the MSIs arrive in high frequency bursts.
> + */
> +DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
> +{
> +	struct pt_regs *old_regs = set_irq_regs(regs);
> +	struct pi_desc *pid;
> +	int i = 0;
> +
> +	pid = this_cpu_ptr(&posted_interrupt_desc);
> +
> +	inc_irq_stat(posted_msi_notification_count);
> +	irq_enter();
> +
> +	while (i++ < MAX_POSTED_MSI_COALESCING_LOOP) {
> +		handle_pending_pir(pid, regs);
> +
> +		/*
> +		 * If there are new interrupts posted in PIR, do again. If
> +		 * nothing pending, no need to wait for more interrupts.
> +		 */
> +		if (is_pir_pending(pid))

So this reads those same 4 words we xchg in handle_pending_pir(), right?
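
A plausible sketch of that helper, assuming it simply ORs the four PIR words
(hypothetical; the actual is_pir_pending() is introduced elsewhere in the
series):

	/* Hypothetical sketch, not the series' actual implementation. */
	static inline bool is_pir_pending(struct pi_desc *pid)
	{
		return pid->pir_l[0] | pid->pir_l[1] | pid->pir_l[2] | pid->pir_l[3];
	}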

> +			continue;
> +		else
> +			break;
> +	}
> +
> +	/*
> +	 * Clear outstanding notification bit to allow new IRQ notifications,
> +	 * do this last to maximize the window of interrupt coalescing.
> +	 */
> +	pi_clear_on(pid);
> +
> +	/*
> +	 * There could be a race of PI notification and the clearing of ON bit,
> +	 * process PIR bits one last time such that handling the new interrupts
> +	 * are not delayed until the next IRQ.
> +	 */
> +	if (unlikely(is_pir_pending(pid)))
> +		handle_pending_pir(pid, regs);

(A) ... we get _two_ copies of that thing in this function. Does that
make sense ?

> +
> +	apic_eoi();
> +	irq_exit();
> +	set_irq_regs(old_regs);
> +}
>  #endif /* X86_POSTED_MSI */

Would it not make more sense to write things something like:

bool handle_pending_pir()
{
	bool handled = false;
	u64 pir_copy[4];

	for (i = 0; i < 4; i++) {
		if (!pid-pir_l[i]) {
			pir_copy[i] = 0;
			continue;
		}

		pir_copy[i] = arch_xchg(&pir->pir_l[i], 0);
		handled |= true;
	}

	if (!handled)
		return handled;

	for_each_set_bit()
		....

	return handled.
}

sysvec_posted_blah_blah()
{
	bool done = false;
	bool handled;

	for (;;) {
		handled = handle_pending_pir();
		if (done)
			break;
		if (!handled || ++loops > MAX_LOOPS) {
			pi_clear_on(pid);
			/* once more after clear_on */
			done = true;
		}
	}
}


Hmm?
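
For readability, a compilable rendering of the sketch above, with names and
types assumed from the patch context (an illustration, not the actual series
code):

	static bool handle_pending_pir(struct pi_desc *pid, struct pt_regs *regs)
	{
		int i, vec = FIRST_EXTERNAL_VECTOR;
		bool handled = false;
		u64 pir_copy[4];

		/* Copy and clear only the non-zero PIR words. */
		for (i = 0; i < 4; i++) {
			if (!pid->pir_l[i]) {
				pir_copy[i] = 0;
				continue;
			}
			pir_copy[i] = arch_xchg(&pid->pir_l[i], 0);
			handled = true;
		}

		if (!handled)
			return handled;

		/* Dispatch every pending vector found in the snapshot. */
		for_each_set_bit_from(vec, (unsigned long *)pir_copy, 256)
			call_irq_handler(vec, regs);

		return handled;
	}
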
Jacob Pan Nov. 15, 2023, 8:04 p.m. UTC | #3
Hi Peter,

On Wed, 15 Nov 2023 13:56:24 +0100, Peter Zijlstra <peterz@infradead.org>
wrote:

> On Sat, Nov 11, 2023 at 08:16:39PM -0800, Jacob Pan wrote:
> 
> > +static __always_inline inline void handle_pending_pir(struct pi_desc
> > *pid, struct pt_regs *regs) +{  
> 
> __always_inline means that... (A)
> 
> > +	int i, vec = FIRST_EXTERNAL_VECTOR;
> > +	u64 pir_copy[4];
> > +
> > +	/*
> > +	 * Make a copy of PIR which contains IRQ pending bits for
> > vectors,
> > +	 * then invoke IRQ handlers for each pending vector.
> > +	 * If any new interrupts were posted while we are processing,
> > will
> > +	 * do again before allowing new notifications. The idea is to
> > +	 * minimize the number of the expensive notifications if IRQs
> > come
> > +	 * in a high frequency burst.
> > +	 */
> > +	for (i = 0; i < 4; i++)
> > +		pir_copy[i] = raw_atomic64_xchg((atomic64_t
> > *)&pid->pir_l[i], 0); +
> > +	/*
> > +	 * Ideally, we should start from the high order bits set in
> > the PIR
> > +	 * since each bit represents a vector. Higher order bit
> > position means
> > +	 * the vector has higher priority. But external vectors are
> > allocated
> > +	 * based on availability not priority.
> > +	 *
> > +	 * EOI is included in the IRQ handlers call to apic_ack_irq,
> > which
> > +	 * allows higher priority system interrupt to get in between.
> > +	 */
> > +	for_each_set_bit_from(vec, (unsigned long *)&pir_copy[0], 256)
> > +		call_irq_handler(vec, regs);
> > +
> > +}
> > +
> > +/*
> > + * Performance data shows that 3 is good enough to harvest 90+% of the
> > benefit
> > + * on high IRQ rate workload.
> > + * Alternatively, could make this tunable, use 3 as default.
> > + */
> > +#define MAX_POSTED_MSI_COALESCING_LOOP 3
> > +
> > +/*
> > + * For MSIs that are delivered as posted interrupts, the CPU
> > notifications
> > + * can be coalesced if the MSIs arrive in high frequency bursts.
> > + */
> > +DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
> > +{
> > +	struct pt_regs *old_regs = set_irq_regs(regs);
> > +	struct pi_desc *pid;
> > +	int i = 0;
> > +
> > +	pid = this_cpu_ptr(&posted_interrupt_desc);
> > +
> > +	inc_irq_stat(posted_msi_notification_count);
> > +	irq_enter();
> > +
> > +	while (i++ < MAX_POSTED_MSI_COALESCING_LOOP) {
> > +		handle_pending_pir(pid, regs);
> > +
> > +		/*
> > +		 * If there are new interrupts posted in PIR, do
> > again. If
> > +		 * nothing pending, no need to wait for more
> > interrupts.
> > +		 */
> > +		if (is_pir_pending(pid))  
> 
> So this reads those same 4 words we xchg in handle_pending_pir(), right?
> 
> > +			continue;
> > +		else
> > +			break;
> > +	}
> > +
> > +	/*
> > +	 * Clear outstanding notification bit to allow new IRQ
> > notifications,
> > +	 * do this last to maximize the window of interrupt coalescing.
> > +	 */
> > +	pi_clear_on(pid);
> > +
> > +	/*
> > +	 * There could be a race of PI notification and the clearing
> > of ON bit,
> > +	 * process PIR bits one last time such that handling the new
> > interrupts
> > +	 * are not delayed until the next IRQ.
> > +	 */
> > +	if (unlikely(is_pir_pending(pid)))
> > +		handle_pending_pir(pid, regs);  
> 
> (A) ... we get _two_ copies of that thing in this function. Does that
> make sense ?
> 
> > +
> > +	apic_eoi();
> > +	irq_exit();
> > +	set_irq_regs(old_regs);
> > +}
> >  #endif /* X86_POSTED_MSI */  
> 
> Would it not make more sense to write things something like:
> 
It is a great idea: we can save the expensive xchg if pir[i] is 0. But I
had to tweak it a little to make it perform better.

> bool handle_pending_pir()
> {
> 	bool handled = false;
> 	u64 pir_copy[4];
> 
> 	for (i = 0; i < 4; i++) {
> 		if (!pid-pir_l[i]) {
> 			pir_copy[i] = 0;
> 			continue;
> 		}
> 
> 		pir_copy[i] = arch_xchg(&pir->pir_l[i], 0);
That interleaves cache line reads and xchg operations, so I changed it to:

	for (i = 0; i < 4; i++) {
		pir_copy[i] = pid->pir_l[i];
	}

	for (i = 0; i < 4; i++) {
		if (pir_copy[i]) {
			pir_copy[i] = arch_xchg(&pid->pir_l[i], 0);
			handled = true;
		}
	}

With the DSA MEMFILL test (just one queue, one MSI), we save 3 xchg operations per loop.
Here is the performance comparison in IRQ rate:

Original RFC:            9.29 M/sec
Optimized in your email: 8.82 M/sec
Tweaked above:           9.54 M/sec

I need to test with more MSI vectors spread out across all 4 u64 words. I
suspect the benefit will decrease since we need to do both the read and the
xchg for non-zero entries.

> 		handled |= true;
> 	}


> 
> 	if (!handled)
> 		return handled;
> 
> 	for_each_set_bit()
> 		....
> 
> 	return handled.
> }
> 
> sysvec_posted_blah_blah()
> {
> 	bool done = false;
> 	bool handled;
> 
> 	for (;;) {
> 		handled = handle_pending_pir();
> 		if (done)
> 			break;
> 		if (!handled || ++loops > MAX_LOOPS) {
> 			pi_clear_on(pid);
> 			/* once more after clear_on */
> 			done = true;
> 		}
> 	}
> }
> 
> 
> Hmm?




Thanks,

Jacob
Jacob Pan Nov. 15, 2023, 8:05 p.m. UTC | #4
Hi Peter,

On Wed, 15 Nov 2023 13:42:21 +0100, Peter Zijlstra <peterz@infradead.org>
wrote:

> On Sat, Nov 11, 2023 at 08:16:39PM -0800, Jacob Pan wrote:
> 
> > +static __always_inline inline void handle_pending_pir(struct pi_desc
> > *pid, struct pt_regs *regs) +{
> > +	int i, vec = FIRST_EXTERNAL_VECTOR;
> > +	u64 pir_copy[4];
> > +
> > +	/*
> > +	 * Make a copy of PIR which contains IRQ pending bits for
> > vectors,
> > +	 * then invoke IRQ handlers for each pending vector.
> > +	 * If any new interrupts were posted while we are processing,
> > will
> > +	 * do again before allowing new notifications. The idea is to
> > +	 * minimize the number of the expensive notifications if IRQs
> > come
> > +	 * in a high frequency burst.
> > +	 */
> > +	for (i = 0; i < 4; i++)
> > +		pir_copy[i] = raw_atomic64_xchg((atomic64_t
> > *)&pid->pir_l[i], 0);  
> 
> Might as well use arch_xchg() and save the atomic64_t casting.
will do

> 
>  [...]  


Thanks,

Jacob
Peter Zijlstra Nov. 15, 2023, 8:25 p.m. UTC | #5
On Wed, Nov 15, 2023 at 12:04:01PM -0800, Jacob Pan wrote:

> we are interleaving cacheline read and xchg. So made it to

Hmm, I wasn't expecting that to be a problem, but sure.

> 	for (i = 0; i < 4; i++) {
> 		pir_copy[i] = pid->pir_l[i];
> 	}
> 
> 	for (i = 0; i < 4; i++) {
> 		if (pir_copy[i]) {
> 			pir_copy[i] = arch_xchg(&pid->pir_l[i], 0);
> 			handled = true;
> 		}
> 	}
> 
> With DSA MEMFILL test just one queue one MSI, we are saving 3 xchg per loop.
> Here is the performance comparison in IRQ rate:
> 
> Original RFC 9.29 m/sec, 
> Optimized in your email 8.82m/sec,
> Tweaked above: 9.54m/s
> 
> I need to test with more MSI vectors spreading out to all 4 u64. I suspect
> the benefit will decrease since we need to do both read and xchg for
> non-zero entries.

Ah, but performance was not the reason I suggested this. Code
compactness and clarity was.

Possibly using less xchg is just a bonus :-)
Thomas Gleixner Dec. 6, 2023, 7:14 p.m. UTC | #6
On Sat, Nov 11 2023 at 20:16, Jacob Pan wrote:
> +	/*
> +	 * Ideally, we should start from the high order bits set in the PIR
> +	 * since each bit represents a vector. Higher order bit position means
> +	 * the vector has higher priority. But external vectors are allocated
> +	 * based on availability not priority.
> +	 *
> +	 * EOI is included in the IRQ handlers call to apic_ack_irq, which
> +	 * allows higher priority system interrupt to get in between.

What? This does not make sense.

_IF_ we go there then

     1) The EOI must be explicit in sysvec_posted_msi_notification()

     2) Interrupt enabling must happen explicit at a dedicated place in
        sysvec_posted_msi_notification()

        You _CANNOT_ run all the device handlers with interrupts
        enabled.

Please remove all traces of non-working wishful thinking from this series.

> +	 */
> +	for_each_set_bit_from(vec, (unsigned long *)&pir_copy[0], 256)

Why does this need to check up to vector 255? The vector space does not
magically expand just because of posted interrupts, really. At least not
without major modifications to the vector management.
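
A sketch of limiting the scan to the usable external vector range instead
(illustration only; FIRST_SYSTEM_VECTOR marks the end of the externally
allocatable vectors):

	for_each_set_bit_from(vec, (unsigned long *)pir_copy, FIRST_SYSTEM_VECTOR)
		call_irq_handler(vec, regs);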

> +		call_irq_handler(vec, regs);
> +

Stray newline.

> +}

Thanks,

        tglx
Thomas Gleixner Dec. 6, 2023, 7:50 p.m. UTC | #7
On Wed, Nov 15 2023 at 13:56, Peter Zijlstra wrote:
>
> Would it not make more sense to write things something like:
>
> bool handle_pending_pir()
> {
> 	bool handled = false;
> 	u64 pir_copy[4];
>
> 	for (i = 0; i < 4; i++) {
> 		if (!pid-pir_l[i]) {
> 			pir_copy[i] = 0;
> 			continue;
> 		}
>
> 		pir_copy[i] = arch_xchg(&pir->pir_l[i], 0);
> 		handled |= true;
> 	}
>
> 	if (!handled)
> 		return handled;
>
> 	for_each_set_bit()
> 		....
>
> 	return handled.
> }

I don't understand what the whole copy business is about. It's
absolutely not required.

static bool handle_pending_pir(unsigned long *pir)
{
        unsigned int idx, vec;
	bool handled = false;
        unsigned long pend;
        
        for (idx = 0; idx < 4; idx++) {
                if (!pir[idx])
                	continue;
		pend = arch_xchg(pir + idx, 0);
                for_each_set_bit(vec, &pend, 64)
			call_irq_handler(vec + idx * 64, NULL);
                handled = true;
	}
        return handled;
}

No?

> sysvec_posted_blah_blah()
> {
> 	bool done = false;
> 	bool handled;
>
> 	for (;;) {
> 		handled = handle_pending_pir();
> 		if (done)
> 			break;
> 		if (!handled || ++loops > MAX_LOOPS) {

That does one loop too many. Should be ++loops == MAX_LOOPS. No?

> 			pi_clear_on(pid);
> 			/* once more after clear_on */
> 			done = true;
> 		}
> 	}
> }
>
>
> Hmm?

I think that can be done less convoluted.

{
	struct pi_desc *pid = this_cpu_ptr(&posted_interrupt_desc);
	struct pt_regs *old_regs = set_irq_regs(regs);
        int loops;

	for (loops = 0;;) {
        	bool handled = handle_pending_pir((unsigned long)pid->pir);

                if (++loops > MAX_LOOPS)
                	break;

                if (!handled || loops == MAX_LOOPS) {
                	pi_clear_on(pid);
                        /* Break the loop after handle_pending_pir()! */
                        loops = MAX_LOOPS;
                }
	}

	...
	set_irq_regs(old_regs);
}

Hmm? :)
Jacob Pan Dec. 8, 2023, 4:46 a.m. UTC | #8
Hi Thomas,

On Wed, 06 Dec 2023 20:50:24 +0100, Thomas Gleixner <tglx@linutronix.de>
wrote:

> On Wed, Nov 15 2023 at 13:56, Peter Zijlstra wrote:
> >
> > Would it not make more sense to write things something like:
> >
> > bool handle_pending_pir()
> > {
> > 	bool handled = false;
> > 	u64 pir_copy[4];
> >
> > 	for (i = 0; i < 4; i++) {
> > 		if (!pid-pir_l[i]) {
> > 			pir_copy[i] = 0;
> > 			continue;
> > 		}
> >
> > 		pir_copy[i] = arch_xchg(&pir->pir_l[i], 0);
> > 		handled |= true;
> > 	}
> >
> > 	if (!handled)
> > 		return handled;
> >
> > 	for_each_set_bit()
> > 		....
> >
> > 	return handled.
> > }  
> 
> I don't understand what the whole copy business is about. It's
> absolutely not required.
> 
> static bool handle_pending_pir(unsigned long *pir)
> {
>         unsigned int idx, vec;
> 	bool handled = false;
>         unsigned long pend;
>         
>         for (idx = 0; offs < 4; idx++) {
>                 if (!pir[idx])
>                 	continue;
> 		pend = arch_xchg(pir + idx, 0);
>                 for_each_set_bit(vec, &pend, 64)
> 			call_irq_handler(vec + idx * 64, NULL);
>                 handled = true;
> 	}
>         return handled;
> }
> 
My thinking is the following:
The PIR cache line is contended between the CPU and the IOMMU, where the CPU
can access the PIR much faster. Nevertheless, when the IOMMU does an atomic
swap of the PID (PIR included), the L1 cache line gets evicted. Subsequent
CPU reads or xchg operations then deal with an invalid, cold cache line.

By making a copy of the PIR as quickly as possible and clearing the PIR with
xchg, we minimize the chance that the IOMMU does an atomic swap in the
middle, and therefore get fewer L1D misses.

In the code above, it does a read, an xchg, and call_irq_handler() in a loop,
handling the four 64-bit PIR words one at a time. The IOMMU has a greater
chance of doing an atomic xchg on the PIR cache line while call_irq_handler()
runs, and therefore it causes more L1D misses.

I might be missing something?

I tested the two versions below with my DSA memory fill test and measured
DMA bandwidth and perf cache misses:

#ifdef NO_PIR_COPY
static __always_inline inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
{
	int i, vec;
	bool handled = false;
	unsigned long pending;

	for (i = 0; i < 4; i++) {
		if (!pir[i])
			continue;

		pending = arch_xchg(pir + i, 0);
		for_each_set_bit(vec, &pending, 64)
			call_irq_handler(i * 64 + vec, regs);
		handled = true;
	}

	return handled;
}
#else
static __always_inline inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
{
	int i, vec = FIRST_EXTERNAL_VECTOR;
	bool handled = false;
	unsigned long pir_copy[4];

	for (i = 0; i < 4; i++)
		pir_copy[i] = pir[i];

	for (i = 0; i < 4; i++) {
		if (!pir_copy[i])
			continue;

		pir_copy[i] = arch_xchg(pir + i, 0);
		handled = true;
	}

	if (handled) {
		for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
			call_irq_handler(vec, regs);
	}

	return handled;
}
#endif

DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
{
	struct pt_regs *old_regs = set_irq_regs(regs);
	struct pi_desc *pid;
	int i = 0;

	pid = this_cpu_ptr(&posted_interrupt_desc);

	inc_irq_stat(posted_msi_notification_count);
	irq_enter();

	while (i++ < MAX_POSTED_MSI_COALESCING_LOOP) {
		if (!handle_pending_pir(pid->pir64, regs))
			break;
	}

	/*
	 * Clear outstanding notification bit to allow new IRQ notifications,
	 * do this last to maximize the window of interrupt coalescing.
	 */
	pi_clear_on(pid);

	/*
	 * There could be a race of PI notification and the clearing of ON bit,
	 * process PIR bits one last time such that handling the new interrupts
	 * are not delayed until the next IRQ.
	 */
	handle_pending_pir(pid->pir64, regs);

	apic_eoi();
	irq_exit();
	set_irq_regs(old_regs);
}

Without PIR copy:

DMA memfill bandwidth: 4.944 Gbps
Performance counter stats for './run_intr.sh 512 30':                                                             
                                                                                                                   
    77,313,298,506      L1-dcache-loads                                               (79.98%)                     
         8,279,458      L1-dcache-load-misses     #    0.01% of all L1-dcache accesses  (80.03%)                   
    41,654,221,245      L1-dcache-stores                                              (80.01%)                     
            10,476      LLC-load-misses           #    0.31% of all LL-cache accesses  (79.99%)                    
         3,332,748      LLC-loads                                                     (80.00%)                     
                                                                                                                   
      30.212055434 seconds time elapsed                                                                            
                                                                                                                   
       0.002149000 seconds user                                                                                    
      30.183292000 seconds sys
                        

With PIR copy:
DMA memfill bandwidth: 5.029 Gbps
Performance counter stats for './run_intr.sh 512 30':

    78,327,247,423      L1-dcache-loads                                               (80.01%)
         7,762,311      L1-dcache-load-misses     #    0.01% of all L1-dcache accesses  (80.01%)
    42,203,221,466      L1-dcache-stores                                              (79.99%)
            23,691      LLC-load-misses           #    0.67% of all LL-cache accesses  (80.01%)
         3,561,890      LLC-loads                                                     (80.00%)

      30.201065706 seconds time elapsed

       0.005950000 seconds user
      30.167885000 seconds sys


> No?
> 
> > sysvec_posted_blah_blah()
> > {
> > 	bool done = false;
> > 	bool handled;
> >
> > 	for (;;) {
> > 		handled = handle_pending_pir();
> > 		if (done)
> > 			break;
> > 		if (!handled || ++loops > MAX_LOOPS) {  
> 
> That does one loop too many. Should be ++loops == MAX_LOOPS. No?
> 
> > 			pi_clear_on(pid);
> > 			/* once more after clear_on */
> > 			done = true;
> > 		}
> > 	}
> > }
> >
> >
> > Hmm?  
> 
> I think that can be done less convoluted.
> 
> {
> 	struct pi_desc *pid = this_cpu_ptr(&posted_interrupt_desc);
> 	struct pt_regs *old_regs = set_irq_regs(regs);
>         int loops;
> 
> 	for (loops = 0;;) {
>         	bool handled = handle_pending_pir((unsigned
> long)pid->pir);
> 
>                 if (++loops > MAX_LOOPS)
>                 	break;
> 
>                 if (!handled || loops == MAX_LOOPS) {
>                 	pi_clear_on(pid);
>                         /* Break the loop after handle_pending_pir()! */
>                         loops = MAX_LOOPS;
>                 }
> 	}
> 
> 	...
> 	set_irq_regs(old_regs);
> }
> 
> Hmm? :)


Thanks,

Jacob
Thomas Gleixner Dec. 8, 2023, 11:52 a.m. UTC | #9
On Thu, Dec 07 2023 at 20:46, Jacob Pan wrote:
> On Wed, 06 Dec 2023 20:50:24 +0100, Thomas Gleixner <tglx@linutronix.de>
> wrote:
>> I don't understand what the whole copy business is about. It's
>> absolutely not required.
>
> My thinking is the following:
> The PIR cache line is contended by between CPU and IOMMU, where CPU can
> access PIR much faster. Nevertheless, when IOMMU does atomic swap of the
> PID (PIR included), L1 cache gets evicted. Subsequent CPU read or xchg will
> deal with invalid cold cache.
>
> By making a copy of PIR as quickly as possible and clearing PIR with xchg,
> we minimized the chance that IOMMU does atomic swap in the middle.
> Therefore, having less L1D misses.
>
> In the code above, it does read, xchg, and call_irq_handler() in a loop
> to handle the 4 64bit PIR bits at a time. IOMMU has a greater chance to do
> atomic xchg on the PIR cache line while doing call_irq_handler(). Therefore,
> it causes more L1D misses.

That makes sense and if we go there it wants to be documented.

> Without PIR copy:
>
> DMA memfill bandwidth: 4.944 Gbps
> Performance counter stats for './run_intr.sh 512 30':                                                             
>                                                                                                                    
>     77,313,298,506      L1-dcache-loads                                               (79.98%)                     
>          8,279,458      L1-dcache-load-misses     #    0.01% of all L1-dcache accesses  (80.03%)                   
>     41,654,221,245      L1-dcache-stores                                              (80.01%)                     
>             10,476      LLC-load-misses           #    0.31% of all LL-cache accesses  (79.99%)                    
>          3,332,748      LLC-loads                                                     (80.00%)                     
>                                                                                                                    
>       30.212055434 seconds time elapsed                                                                            
>                                                                                                                    
>        0.002149000 seconds user                                                                                    
>       30.183292000 seconds sys
>                         
>
> With PIR copy:
> DMA memfill bandwidth: 5.029 Gbps
> Performance counter stats for './run_intr.sh 512 30':
>
>     78,327,247,423      L1-dcache-loads                                               (80.01%)
>          7,762,311      L1-dcache-load-misses     #    0.01% of all L1-dcache accesses  (80.01%)
>     42,203,221,466      L1-dcache-stores                                              (79.99%)
>             23,691      LLC-load-misses           #    0.67% of all LL-cache accesses  (80.01%)
>          3,561,890      LLC-loads                                                     (80.00%)
>
>       30.201065706 seconds time elapsed
>
>        0.005950000 seconds user
>       30.167885000 seconds sys

Interesting, though I'm not really convinced that this DMA memfill
microbenchmark resembles real workloads.

Did you test with something realistic, e.g. storage or networking, too?

Thanks,

        tglx
Jacob Pan Dec. 8, 2023, 8:02 p.m. UTC | #10
Hi Thomas,

On Fri, 08 Dec 2023 12:52:49 +0100, Thomas Gleixner <tglx@linutronix.de>
wrote:

> On Thu, Dec 07 2023 at 20:46, Jacob Pan wrote:
> > On Wed, 06 Dec 2023 20:50:24 +0100, Thomas Gleixner <tglx@linutronix.de>
> > wrote:  
> >> I don't understand what the whole copy business is about. It's
> >> absolutely not required.  
> >
> > My thinking is the following:
> > The PIR cache line is contended by between CPU and IOMMU, where CPU can
> > access PIR much faster. Nevertheless, when IOMMU does atomic swap of the
> > PID (PIR included), L1 cache gets evicted. Subsequent CPU read or xchg
> > will deal with invalid cold cache.
> >
> > By making a copy of PIR as quickly as possible and clearing PIR with
> > xchg, we minimized the chance that IOMMU does atomic swap in the middle.
> > Therefore, having less L1D misses.
> >
> > In the code above, it does read, xchg, and call_irq_handler() in a loop
> > to handle the 4 64bit PIR bits at a time. IOMMU has a greater chance to
> > do atomic xchg on the PIR cache line while doing call_irq_handler().
> > Therefore, it causes more L1D misses.  
> 
> That makes sense and if we go there it wants to be documented.
Will do. How about this explanation:
"
The posted interrupt descriptor (PID) fits in a cache line that is frequently
accessed by both the CPU and the IOMMU.

During posted MSI processing, the CPU needs to do 64-bit reads and xchg
operations to check and clear the posted interrupt request (PIR), a 256-bit
field within the PID. On the other side, the IOMMU does atomic swaps of the
entire PID cache line when posting interrupts. The CPU can access the cache
line much faster than the IOMMU.

The cache line states after each operation are as follows:

CPU		IOMMU			PID Cache line state
-------------------------------------------------------------
read64					exclusive
lock xchg64				modified
		post/atomic swap	invalid
-------------------------------------------------------------
Note that the PID cache line is evicted after each IOMMU interrupt posting.

The posted MSI demuxing loop is written to optimize cache performance based
on two considerations around the PID cache line:

1. Reduce L1 data cache misses by avoiding contention with the IOMMU's
interrupt posting/atomic swap: a copy of the PIR is used to dispatch the
interrupt handlers.

2. Keep the cache line state as consistent as possible, e.g. when making a
copy and clearing the PIR (assuming non-zero PIR bits are present in the
entire PIR), do:
read, read, read, read, xchg, xchg, xchg, xchg
instead of:
read, xchg, read, xchg, read, xchg, read, xchg
"

> 
> > Without PIR copy:
> >
> > DMA memfill bandwidth: 4.944 Gbps
> > Performance counter stats for './run_intr.sh 512 30':
> > 
> >     77,313,298,506      L1-dcache-loads                                               (79.98%)
> >          8,279,458      L1-dcache-load-misses     #    0.01% of all L1-dcache accesses  (80.03%)
> >     41,654,221,245      L1-dcache-stores                                              (80.01%)
> >             10,476      LLC-load-misses           #    0.31% of all LL-cache accesses  (79.99%)
> >          3,332,748      LLC-loads                                                     (80.00%)
> >
> >       30.212055434 seconds time elapsed
> >
> >        0.002149000 seconds user
> >       30.183292000 seconds sys
> >
> >
> > With PIR copy:
> > DMA memfill bandwidth: 5.029 Gbps
> > Performance counter stats for './run_intr.sh 512 30':
> >
> >     78,327,247,423      L1-dcache-loads                                               (80.01%)
> >          7,762,311      L1-dcache-load-misses     #    0.01% of all L1-dcache accesses  (80.01%)
> >     42,203,221,466      L1-dcache-stores                                              (79.99%)
> >             23,691      LLC-load-misses           #    0.67% of all LL-cache accesses  (80.01%)
> >          3,561,890      LLC-loads                                                     (80.00%)
> >
> >       30.201065706 seconds time elapsed
> >
> >        0.005950000 seconds user
> >       30.167885000 seconds sys  
> 
> Interesting, though I'm not really convinced that this DMA memfill
> microbenchmark resembles real work loads.
> 
It is just a tool for quick experiments, not a realistic workload, though I
am adding various knobs to make it more useful, e.g. an adjustable interrupt
rate and delays in the idxd hardirq handler.

> Did you test with something realistic, e.g. storage or networking, too?
> 
Not yet for this particular code; I am working on testing with FIO on Samsung
Gen5 NVMe disks and getting help from the people who have the setup.


Thanks,

Jacob
Jacob Pan Jan. 26, 2024, 11:32 p.m. UTC | #11
Hi Thomas,

On Fri, 08 Dec 2023 12:52:49 +0100, Thomas Gleixner <tglx@linutronix.de>
wrote:

> > Without PIR copy:
> >
> > DMA memfill bandwidth: 4.944 Gbps
> > Performance counter stats for './run_intr.sh 512 30':
> > 
> >     77,313,298,506      L1-dcache-loads                                               (79.98%)
> >          8,279,458      L1-dcache-load-misses     #    0.01% of all L1-dcache accesses  (80.03%)
> >     41,654,221,245      L1-dcache-stores                                              (80.01%)
> >             10,476      LLC-load-misses           #    0.31% of all LL-cache accesses  (79.99%)
> >          3,332,748      LLC-loads                                                     (80.00%)
> >
> >       30.212055434 seconds time elapsed
> >
> >        0.002149000 seconds user
> >       30.183292000 seconds sys
> >
> >
> > With PIR copy:
> > DMA memfill bandwidth: 5.029 Gbps
> > Performance counter stats for './run_intr.sh 512 30':
> >
> >     78,327,247,423      L1-dcache-loads                                               (80.01%)
> >          7,762,311      L1-dcache-load-misses     #    0.01% of all L1-dcache accesses  (80.01%)
> >     42,203,221,466      L1-dcache-stores                                              (79.99%)
> >             23,691      LLC-load-misses           #    0.67% of all LL-cache accesses  (80.01%)
> >          3,561,890      LLC-loads                                                     (80.00%)
> >
> >       30.201065706 seconds time elapsed
> >
> >        0.005950000 seconds user
> >       30.167885000 seconds sys  
> 
> Interesting, though I'm not really convinced that this DMA memfill
> microbenchmark resembles real work loads.
> 
> Did you test with something realistic, e.g. storage or networking, too?
I have done the following FIO test on NVMe drives and am not seeing any
meaningful difference in IOPS between the two implementations.

Here are my setup and results with 4 NVMe drives connected to an x16 PCIe slot:

 +-[0000:62]-
 |           +-01.0-[63]----00.0  Samsung Electronics Co Ltd NVMe SSD Controller PM174X
 |           +-03.0-[64]----00.0  Samsung Electronics Co Ltd NVMe SSD Controller PM174X
 |           +-05.0-[65]----00.0  Samsung Electronics Co Ltd NVMe SSD Controller PM174X
 |           \-07.0-[66]----00.0  Samsung Electronics Co Ltd NVMe SSD Controller PM174X


libaio, no PIR_COPY
======================================
fio-3.35                                                                                                           
Starting 512 processes                                                                                             
Jobs: 512 (f=512): [r(512)][100.0%][r=32.2GiB/s][r=8445k IOPS][eta 00m:00s]                                        
disk_nvme6n1_thread_1: (groupid=0, jobs=512): err= 0: pid=31559: Mon Jan  8 21:49:22 2024                          
  read: IOPS=8419k, BW=32.1GiB/s (34.5GB/s)(964GiB/30006msec)                                                      
    slat (nsec): min=1325, max=115807k, avg=42368.34, stdev=1517031.57                                             
    clat (usec): min=2, max=499085, avg=15139.97, stdev=25682.25                                                   
     lat (usec): min=68, max=499089, avg=15182.33, stdev=25709.81                                                  
    clat percentiles (usec):                                                                                       
     |  1.00th=[   734],  5.00th=[   783], 10.00th=[   816], 20.00th=[   857],                                     
     | 30.00th=[   906], 40.00th=[   971], 50.00th=[  1074], 60.00th=[  1369],                                     
     | 70.00th=[ 13042], 80.00th=[ 19792], 90.00th=[ 76022], 95.00th=[ 76022],                                     
     | 99.00th=[ 77071], 99.50th=[ 81265], 99.90th=[ 85459], 99.95th=[ 91751],                                     
     | 99.99th=[200279]                                                                                            
   bw (  MiB/s): min=18109, max=51859, per=100.00%, avg=32965.98, stdev=16.88, samples=14839                       
   iops        : min=4633413, max=13281470, avg=8439278.47, stdev=4324.70, samples=14839                           
  lat (usec)   : 4=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 100=0.01%                                                  
  lat (usec)   : 250=0.01%, 500=0.01%, 750=1.84%, 1000=41.96%                                                      
  lat (msec)   : 2=18.37%, 4=0.20%, 10=3.88%, 20=13.95%, 50=5.42%                                                  
  lat (msec)   : 100=14.33%, 250=0.02%, 500=0.01%                                                                  
  cpu          : usr=1.16%, sys=3.54%, ctx=4932752, majf=0, minf=192764                                            
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=0.1%, >=64=100.0%                                     
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%                                    
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.1%                                    
     issued rwts: total=252616589,0,0,0 short=0,0,0,0 dropped=0,0,0,0                                              
     latency   : target=0, window=0, percentile=100.00%, depth=256                                                 
                                                                                                                   
Run status group 0 (all jobs):                                                                                     
   READ: bw=32.1GiB/s (34.5GB/s), 32.1GiB/s-32.1GiB/s (34.5GB/s-34.5GB/s), io=964GiB (1035GB), run=30006-30006msec 
                                                                                                                   
Disk stats (read/write):                                                                                           
  nvme6n1: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=96.31%                                                  
  nvme5n1: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=97.15%                                                  
  nvme4n1: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=98.06%                                                  
  nvme3n1: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=98.94%                                                  
                                                                                                                   
 Performance counter stats for 'system wide':                                                                      
                                                                                                                   
    22,985,903,515      L1-dcache-load-misses                                                   (42.86%)           
    22,989,992,126      L1-dcache-load-misses                                                   (57.14%)           
   751,228,710,993      L1-dcache-stores                                                        (57.14%)           
       465,033,820      LLC-load-misses                  #   18.27% of all LL-cache accesses    (57.15%)           
     2,545,570,669      LLC-loads                                                               (57.14%)           
     1,058,582,881      LLC-stores                                                              (28.57%)           
       326,135,823      LLC-store-misses                                                        (28.57%)           
                                                                                                                   
      32.045718194 seconds time elapsed                                                                       
-------------------------------------------
libaio with PIR_COPY
-------------------------------------------
fio-3.35                                                                                                           
Starting 512 processes                                                                                             
Jobs: 512 (f=512): [r(512)][100.0%][r=32.2GiB/s][r=8445k IOPS][eta 00m:00s]                                        
disk_nvme6n1_thread_1: (groupid=0, jobs=512): err= 0: pid=5103: Mon Jan  8 23:12:12 2024                           
  read: IOPS=8420k, BW=32.1GiB/s (34.5GB/s)(964GiB/30011msec)                                                      
    slat (nsec): min=1339, max=97021k, avg=42447.84, stdev=1442726.09                                              
    clat (usec): min=2, max=369410, avg=14820.01, stdev=24112.59                                                   
     lat (usec): min=69, max=369412, avg=14862.46, stdev=24139.33                                                  
    clat percentiles (usec):                                                                                       
     |  1.00th=[   717],  5.00th=[   783], 10.00th=[   824], 20.00th=[   873],                                     
     | 30.00th=[   930], 40.00th=[  1012], 50.00th=[  1172], 60.00th=[  8094],                                     
     | 70.00th=[ 14222], 80.00th=[ 18744], 90.00th=[ 76022], 95.00th=[ 76022],                                     
     | 99.00th=[ 76022], 99.50th=[ 78119], 99.90th=[ 81265], 99.95th=[ 81265],                                     
     | 99.99th=[135267]                                                                                            
   bw (  MiB/s): min=19552, max=62819, per=100.00%, avg=33774.56, stdev=31.02, samples=14540                       
   iops        : min=5005807, max=16089892, avg=8646500.17, stdev=7944.42, samples=14540                           
  lat (usec)   : 4=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 100=0.01%                                                  
  lat (usec)   : 250=0.01%, 500=0.01%, 750=2.50%, 1000=36.41%                                                      
  lat (msec)   : 2=17.39%, 4=0.27%, 10=5.83%, 20=18.94%, 50=5.59%                                                  
  lat (msec)   : 100=13.06%, 250=0.01%, 500=0.01%                                                                  
  cpu          : usr=1.20%, sys=3.74%, ctx=6758326, majf=0, minf=193128                                            
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=0.1%, >=64=100.0%                                     
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%                                    
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.1%                                    
     issued rwts: total=252677827,0,0,0 short=0,0,0,0 dropped=0,0,0,0                                              
     latency   : target=0, window=0, percentile=100.00%, depth=256                                                 
                                                                                                                   
Run status group 0 (all jobs):                                                                                     
   READ: bw=32.1GiB/s (34.5GB/s), 32.1GiB/s-32.1GiB/s (34.5GB/s-34.5GB/s), io=964GiB (1035GB), run=30011-30011msec 
                                                                                                                   
Disk stats (read/write):                                                                                           
  nvme6n1: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=96.36%                                                  
  nvme5n1: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=97.18%                                                  
  nvme4n1: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=98.08%                                                  
  nvme3n1: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=98.96%                                                  
                                                                                                                   
 Performance counter stats for 'system wide':                                                                      
                                                                                                                   
    24,762,800,042      L1-dcache-load-misses                                                   (42.86%)           
    24,764,415,765      L1-dcache-load-misses                                                   (57.14%)           
   756,096,467,595      L1-dcache-stores                                                        (57.14%)           
       483,611,270      LLC-load-misses                  #   16.21% of all LL-cache accesses    (57.14%)           
     2,982,610,898      LLC-loads                                                               (57.14%)           
     1,283,077,818      LLC-stores                                                              (28.57%)           
       313,253,711      LLC-store-misses                                                        (28.57%)           
                                                                                                                   
      32.059810215 seconds time elapsed
                                       


Thanks,

Jacob

Patch

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 72c6a084dba3..6c8daa7518eb 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -44,6 +44,9 @@  typedef struct {
 	unsigned int irq_hv_reenlightenment_count;
 	unsigned int hyperv_stimer0_count;
 #endif
+#ifdef CONFIG_X86_POSTED_MSI
+	unsigned int posted_msi_notification_count;
+#endif
 } ____cacheline_aligned irq_cpustat_t;
 
 DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 05fd175cec7d..f756e761e7c0 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -644,6 +644,9 @@  DECLARE_IDTENTRY_SYSVEC(ERROR_APIC_VECTOR,		sysvec_error_interrupt);
 DECLARE_IDTENTRY_SYSVEC(SPURIOUS_APIC_VECTOR,		sysvec_spurious_apic_interrupt);
 DECLARE_IDTENTRY_SYSVEC(LOCAL_TIMER_VECTOR,		sysvec_apic_timer_interrupt);
 DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI_VECTOR,	sysvec_x86_platform_ipi);
+# ifdef CONFIG_X86_POSTED_MSI
+DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR,	sysvec_posted_msi_notification);
+# endif
 #endif
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index b786d48f5a0f..d5840d777469 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -159,6 +159,9 @@  static const __initconst struct idt_data apic_idts[] = {
 # endif
 	INTG(SPURIOUS_APIC_VECTOR,		asm_sysvec_spurious_apic_interrupt),
 	INTG(ERROR_APIC_VECTOR,			asm_sysvec_error_interrupt),
+# ifdef CONFIG_X86_POSTED_MSI
+	INTG(POSTED_MSI_NOTIFICATION_VECTOR,	asm_sysvec_posted_msi_notification),
+# endif
 #endif
 };
 
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 0bffe8152385..786c2c8330f4 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -183,6 +183,13 @@  int arch_show_interrupts(struct seq_file *p, int prec)
 		seq_printf(p, "%10u ",
 			   irq_stats(j)->kvm_posted_intr_wakeup_ipis);
 	seq_puts(p, "  Posted-interrupt wakeup event\n");
+#endif
+#ifdef CONFIG_X86_POSTED_MSI
+	seq_printf(p, "%*s: ", prec, "PMN");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ",
+			   irq_stats(j)->posted_msi_notification_count);
+	seq_puts(p, "  Posted MSI notification event\n");
 #endif
 	return 0;
 }
@@ -351,6 +358,90 @@  void intel_posted_msi_init(void)
 	this_cpu_write(posted_interrupt_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR);
 	this_cpu_write(posted_interrupt_desc.ndst, this_cpu_read(x86_cpu_to_apicid));
 }
+
+static __always_inline inline void handle_pending_pir(struct pi_desc *pid, struct pt_regs *regs)
+{
+	int i, vec = FIRST_EXTERNAL_VECTOR;
+	u64 pir_copy[4];
+
+	/*
+	 * Make a copy of PIR which contains IRQ pending bits for vectors,
+	 * then invoke IRQ handlers for each pending vector.
+	 * If any new interrupts were posted while we are processing, will
+	 * do again before allowing new notifications. The idea is to
+	 * minimize the number of the expensive notifications if IRQs come
+	 * in a high frequency burst.
+	 */
+	for (i = 0; i < 4; i++)
+		pir_copy[i] = raw_atomic64_xchg((atomic64_t *)&pid->pir_l[i], 0);
+
+	/*
+	 * Ideally, we should start from the high order bits set in the PIR
+	 * since each bit represents a vector. Higher order bit position means
+	 * the vector has higher priority. But external vectors are allocated
+	 * based on availability not priority.
+	 *
+	 * EOI is included in the IRQ handlers call to apic_ack_irq, which
+	 * allows higher priority system interrupt to get in between.
+	 */
+	for_each_set_bit_from(vec, (unsigned long *)&pir_copy[0], 256)
+		call_irq_handler(vec, regs);
+
+}
+
+/*
+ * Performance data shows that 3 is good enough to harvest 90+% of the benefit
+ * on high IRQ rate workload.
+ * Alternatively, could make this tunable, use 3 as default.
+ */
+#define MAX_POSTED_MSI_COALESCING_LOOP 3
+
+/*
+ * For MSIs that are delivered as posted interrupts, the CPU notifications
+ * can be coalesced if the MSIs arrive in high frequency bursts.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+	struct pi_desc *pid;
+	int i = 0;
+
+	pid = this_cpu_ptr(&posted_interrupt_desc);
+
+	inc_irq_stat(posted_msi_notification_count);
+	irq_enter();
+
+	while (i++ < MAX_POSTED_MSI_COALESCING_LOOP) {
+		handle_pending_pir(pid, regs);
+
+		/*
+		 * If there are new interrupts posted in PIR, do again. If
+		 * nothing pending, no need to wait for more interrupts.
+		 */
+		if (is_pir_pending(pid))
+			continue;
+		else
+			break;
+	}
+
+	/*
+	 * Clear outstanding notification bit to allow new IRQ notifications,
+	 * do this last to maximize the window of interrupt coalescing.
+	 */
+	pi_clear_on(pid);
+
+	/*
+	 * There could be a race of PI notification and the clearing of ON bit,
+	 * process PIR bits one last time such that handling the new interrupts
+	 * are not delayed until the next IRQ.
+	 */
+	if (unlikely(is_pir_pending(pid)))
+		handle_pending_pir(pid, regs);
+
+	apic_eoi();
+	irq_exit();
+	set_irq_regs(old_regs);
+}
 #endif /* X86_POSTED_MSI */
 
 #ifdef CONFIG_HOTPLUG_CPU