diff mbox series

[PATCHv2,1/5] arm64/entry-common: push the judgement of nmi ahead

Message ID 20210924132837.45994-2-kernelfans@gmail.com (mailing list archive)
State New, archived
Headers show
Series arm64/irqentry: remove duplicate housekeeping of | expand

Commit Message

Pingfan Liu Sept. 24, 2021, 1:28 p.m. UTC
In enter_el1_irq_or_nmi(), it can be the case which NMI interrupts an
irq, which makes the condition !interrupts_enabled(regs) fail to detect
the NMI. This will cause a mistaken account for irq.

Introducing two interfaces: handle_arch_nmi and interrupt_is_nmi to
judge NMI at this stage.

Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Joey Gouly <joey.gouly@arm.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Julien Thierry <julien.thierry@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yuichi Ito <ito-yuichi@fujitsu.com>
Cc: linux-kernel@vger.kernel.org
To: linux-arm-kernel@lists.infradead.org
---
 arch/arm64/include/asm/irq.h     |  5 ++++
 arch/arm64/kernel/entry-common.c | 45 ++++++++++++++++++++++----------
 arch/arm64/kernel/irq.c          | 29 ++++++++++++++++++++
 3 files changed, 65 insertions(+), 14 deletions(-)

Comments

Mark Rutland Sept. 24, 2021, 5:53 p.m. UTC | #1
On Fri, Sep 24, 2021 at 09:28:33PM +0800, Pingfan Liu wrote:
> In enter_el1_irq_or_nmi(), it can be the case which NMI interrupts an
> irq, which makes the condition !interrupts_enabled(regs) fail to detect
> the NMI. This will cause a mistaken account for irq.

Can you please explain this in more detail? It's not clear which
specific case you mean when you say "NMI interrupts an irq", as that
could mean a number of distinct scenarios.

AFAICT, if we're in an IRQ handler (with NMIs unmasked), and an NMI
causes a new exception we'll do the right thing. So either I'm missing a
subtlety or you're describing a different scenario..

Note that the entry code is only trying to distinguish between:

a) This exception is *definitely* an NMI (because regular interrupts
   were masked).

b) This exception is *either* and IRQ or an NMI (and this *cannot* be
   distinguished until we acknowledge the interrupt), so we treat it as
   an IRQ for now.

... and we leave it to the irqchip to handle the gory details. We only
need to distinguish (a) early to avoid nesting IRQ logic within itself
in an unsafe way.

Thanks,
Mark.

> Introducing two interfaces: handle_arch_nmi and interrupt_is_nmi to
> judge NMI at this stage.
> 
> Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Marc Zyngier <maz@kernel.org>
> Cc: Joey Gouly <joey.gouly@arm.com>
> Cc: Sami Tolvanen <samitolvanen@google.com>
> Cc: Julien Thierry <julien.thierry@arm.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Yuichi Ito <ito-yuichi@fujitsu.com>
> Cc: linux-kernel@vger.kernel.org
> To: linux-arm-kernel@lists.infradead.org
> ---
>  arch/arm64/include/asm/irq.h     |  5 ++++
>  arch/arm64/kernel/entry-common.c | 45 ++++++++++++++++++++++----------
>  arch/arm64/kernel/irq.c          | 29 ++++++++++++++++++++
>  3 files changed, 65 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h
> index fac08e18bcd5..a59b1745f458 100644
> --- a/arch/arm64/include/asm/irq.h
> +++ b/arch/arm64/include/asm/irq.h
> @@ -12,6 +12,11 @@ int set_handle_irq(void (*handle_irq)(struct pt_regs *));
>  #define set_handle_irq	set_handle_irq
>  int set_handle_fiq(void (*handle_fiq)(struct pt_regs *));
>  
> +extern void (*handle_arch_irq)(struct pt_regs *regs);
> +extern void (*handle_arch_fiq)(struct pt_regs *regs);
> +extern void (*handle_arch_nmi)(struct pt_regs *regs);
> +extern bool (*interrupt_is_nmi)(void);
> +
>  static inline int nr_legacy_irqs(void)
>  {
>  	return 0;
> diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
> index 32f9796c4ffe..69a8cc082712 100644
> --- a/arch/arm64/kernel/entry-common.c
> +++ b/arch/arm64/kernel/entry-common.c
> @@ -24,6 +24,7 @@
>  #include <asm/stacktrace.h>
>  #include <asm/sysreg.h>
>  #include <asm/system_misc.h>
> +#include <asm/irq.h>
>  
>  /*
>   * Handle IRQ/context state management when entering from kernel mode.
> @@ -219,17 +220,28 @@ static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs)
>  		lockdep_hardirqs_on(CALLER_ADDR0);
>  }
>  
> -static void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs)
> +static inline bool arm64_in_nmi(struct pt_regs *regs)
>  {
> -	if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs))
> +	if (!interrupts_enabled(regs) || (*interrupt_is_nmi)())
> +		return true;
> +	return false;
> +}
> +
> +/* return true if in irq, otherwise in nmi */
> +static bool noinstr enter_el1_irq_or_nmi(struct pt_regs *regs)
> +{
> +	if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && arm64_in_nmi(regs)) {
>  		arm64_enter_nmi(regs);
> -	else
> +		return false;
> +	} else {
>  		enter_from_kernel_mode(regs);
> +		return true;
> +	}
>  }
>  
> -static void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs)
> +static void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs, bool in_irq)
>  {
> -	if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs))
> +	if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !in_irq)
>  		arm64_exit_nmi(regs);
>  	else
>  		exit_to_kernel_mode(regs);
> @@ -269,9 +281,6 @@ static void do_interrupt_handler(struct pt_regs *regs,
>  		handler(regs);
>  }
>  
> -extern void (*handle_arch_irq)(struct pt_regs *);
> -extern void (*handle_arch_fiq)(struct pt_regs *);
> -
>  static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector,
>  				      unsigned int esr)
>  {
> @@ -433,12 +442,20 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
>  }
>  
>  static void noinstr el1_interrupt(struct pt_regs *regs,
> -				  void (*handler)(struct pt_regs *))
> +				  void (*handler)(struct pt_regs *),
> +				  void (*nmi_handler)(struct pt_regs *))
>  {
> +	bool in_irq;
> +	void (*h)(struct pt_regs *regs);
> +
>  	write_sysreg(DAIF_PROCCTX_NOIRQ, daif);
>  
> -	enter_el1_irq_or_nmi(regs);
> -	do_interrupt_handler(regs, handler);
> +	in_irq = enter_el1_irq_or_nmi(regs);
> +	if (in_irq)
> +		h = handler;
> +	else
> +		h = nmi_handler;
> +	do_interrupt_handler(regs, h);
>  
>  	/*
>  	 * Note: thread_info::preempt_count includes both thread_info::count
> @@ -449,17 +466,17 @@ static void noinstr el1_interrupt(struct pt_regs *regs,
>  	    READ_ONCE(current_thread_info()->preempt_count) == 0)
>  		arm64_preempt_schedule_irq();
>  
> -	exit_el1_irq_or_nmi(regs);
> +	exit_el1_irq_or_nmi(regs, in_irq);
>  }
>  
>  asmlinkage void noinstr el1h_64_irq_handler(struct pt_regs *regs)
>  {
> -	el1_interrupt(regs, handle_arch_irq);
> +	el1_interrupt(regs, handle_arch_irq, handle_arch_nmi);
>  }
>  
>  asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs)
>  {
> -	el1_interrupt(regs, handle_arch_fiq);
> +	el1_interrupt(regs, handle_arch_fiq, handle_arch_nmi);
>  }
>  
>  asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs)
> diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
> index bda49430c9ea..e67435eb4cba 100644
> --- a/arch/arm64/kernel/irq.c
> +++ b/arch/arm64/kernel/irq.c
> @@ -81,8 +81,19 @@ static void default_handle_fiq(struct pt_regs *regs)
>  	panic("FIQ taken without a root FIQ handler\n");
>  }
>  
> +static void default_handle_nmi(struct pt_regs *unused)
> +{
> +}
> +
> +static bool default_nmi_discriminator(void)
> +{
> +	return false;
> +}
> +
>  void (*handle_arch_irq)(struct pt_regs *) __ro_after_init = default_handle_irq;
>  void (*handle_arch_fiq)(struct pt_regs *) __ro_after_init = default_handle_fiq;
> +void (*handle_arch_nmi)(struct pt_regs *) __ro_after_init = default_handle_nmi;
> +bool (*interrupt_is_nmi)(void) __ro_after_init = default_nmi_discriminator;
>  
>  int __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
>  {
> @@ -104,6 +115,24 @@ int __init set_handle_fiq(void (*handle_fiq)(struct pt_regs *))
>  	return 0;
>  }
>  
> +int __init set_handle_nmi(void (*handle_nmi)(struct pt_regs *))
> +{
> +	if (handle_arch_nmi != default_handle_nmi)
> +		return -EBUSY;
> +
> +	handle_arch_nmi = handle_nmi;
> +	return 0;
> +}
> +
> +int __init set_nmi_discriminator(bool (*discriminator)(void))
> +{
> +	if (interrupt_is_nmi != default_nmi_discriminator)
> +		return -EBUSY;
> +
> +	interrupt_is_nmi = discriminator;
> +	return 0;
> +}
> +
>  void __init init_IRQ(void)
>  {
>  	init_irq_stacks();
> -- 
> 2.31.1
>
Pingfan Liu Sept. 25, 2021, 3:39 p.m. UTC | #2
On Fri, Sep 24, 2021 at 06:53:06PM +0100, Mark Rutland wrote:
> On Fri, Sep 24, 2021 at 09:28:33PM +0800, Pingfan Liu wrote:
> > In enter_el1_irq_or_nmi(), it can be the case which NMI interrupts an
> > irq, which makes the condition !interrupts_enabled(regs) fail to detect
> > the NMI. This will cause a mistaken account for irq.
> 
Sorry about the confusing word "account", it should be "lockdep/rcu/.."

> Can you please explain this in more detail? It's not clear which
> specific case you mean when you say "NMI interrupts an irq", as that
> could mean a number of distinct scenarios.
> 
> AFAICT, if we're in an IRQ handler (with NMIs unmasked), and an NMI
> causes a new exception we'll do the right thing. So either I'm missing a
> subtlety or you're describing a different scenario..
> 
> Note that the entry code is only trying to distinguish between:
> 
> a) This exception is *definitely* an NMI (because regular interrupts
>    were masked).
> 
> b) This exception is *either* and IRQ or an NMI (and this *cannot* be
>    distinguished until we acknowledge the interrupt), so we treat it as
>    an IRQ for now.
> 
b) is the aim.

At the entry, enter_el1_irq_or_nmi() -> enter_from_kernel_mode()->rcu_irq_enter()/rcu_irq_enter_check_tick() etc.
While at irqchip level, gic_handle_irq()->gic_handle_nmi()->nmi_enter(),
which does not call rcu_irq_enter_check_tick(). So it is not proper to
"treat it as an IRQ for now"

> ... and we leave it to the irqchip to handle the gory details. We only

The detail should hide in irqchip to decide if an exception is either
NMI or IRQ. But could irqchip export the interface to entry? (This patch
export two: handle_arch_nmi() and interrupt_is_nmi() ).

Thanks,

	Pingfan
Mark Rutland Sept. 30, 2021, 1:32 p.m. UTC | #3
On Sat, Sep 25, 2021 at 11:39:55PM +0800, Pingfan Liu wrote:
> On Fri, Sep 24, 2021 at 06:53:06PM +0100, Mark Rutland wrote:
> > On Fri, Sep 24, 2021 at 09:28:33PM +0800, Pingfan Liu wrote:
> > > In enter_el1_irq_or_nmi(), it can be the case which NMI interrupts an
> > > irq, which makes the condition !interrupts_enabled(regs) fail to detect
> > > the NMI. This will cause a mistaken account for irq.
> > 
> Sorry about the confusing word "account", it should be "lockdep/rcu/.."
> 
> > Can you please explain this in more detail? It's not clear which
> > specific case you mean when you say "NMI interrupts an irq", as that
> > could mean a number of distinct scenarios.
> > 
> > AFAICT, if we're in an IRQ handler (with NMIs unmasked), and an NMI
> > causes a new exception we'll do the right thing. So either I'm missing a
> > subtlety or you're describing a different scenario..
> > 
> > Note that the entry code is only trying to distinguish between:
> > 
> > a) This exception is *definitely* an NMI (because regular interrupts
> >    were masked).
> > 
> > b) This exception is *either* and IRQ or an NMI (and this *cannot* be
> >    distinguished until we acknowledge the interrupt), so we treat it as
> >    an IRQ for now.
> > 
> b) is the aim.
> 
> At the entry, enter_el1_irq_or_nmi() -> enter_from_kernel_mode()->rcu_irq_enter()/rcu_irq_enter_check_tick() etc.
> While at irqchip level, gic_handle_irq()->gic_handle_nmi()->nmi_enter(),
> which does not call rcu_irq_enter_check_tick(). So it is not proper to
> "treat it as an IRQ for now"

I'm struggling to understand the problem here. What is "not proper", and
why?

Do you think there's a correctness problem, or that we're doing more
work than necessary? 

If you could give a specific example of a problem, it would really help.

I'm aware that we do more work than strictly necessary when we take a
pNMI from a context with IRQs enabled, but that's how we'd intended this
to work, as it's vastly simpler to manage the state that way. Unless
there's a real problem with that approach I'd prefer to leave it as-is.

Thanks,
Mark.
Pingfan Liu Oct. 8, 2021, 4:01 a.m. UTC | #4
Sorry that I missed this message and I am just back from a long
festival.

Adding Paul for RCU guidance.

On Thu, Sep 30, 2021 at 02:32:57PM +0100, Mark Rutland wrote:
> On Sat, Sep 25, 2021 at 11:39:55PM +0800, Pingfan Liu wrote:
> > On Fri, Sep 24, 2021 at 06:53:06PM +0100, Mark Rutland wrote:
> > > On Fri, Sep 24, 2021 at 09:28:33PM +0800, Pingfan Liu wrote:
> > > > In enter_el1_irq_or_nmi(), it can be the case which NMI interrupts an
> > > > irq, which makes the condition !interrupts_enabled(regs) fail to detect
> > > > the NMI. This will cause a mistaken account for irq.
> > > 
> > Sorry about the confusing word "account", it should be "lockdep/rcu/.."
> > 
> > > Can you please explain this in more detail? It's not clear which
> > > specific case you mean when you say "NMI interrupts an irq", as that
> > > could mean a number of distinct scenarios.
> > > 
> > > AFAICT, if we're in an IRQ handler (with NMIs unmasked), and an NMI
> > > causes a new exception we'll do the right thing. So either I'm missing a
> > > subtlety or you're describing a different scenario..
> > > 
> > > Note that the entry code is only trying to distinguish between:
> > > 
> > > a) This exception is *definitely* an NMI (because regular interrupts
> > >    were masked).
> > > 
> > > b) This exception is *either* and IRQ or an NMI (and this *cannot* be
> > >    distinguished until we acknowledge the interrupt), so we treat it as
> > >    an IRQ for now.
> > > 
> > b) is the aim.
> > 
> > At the entry, enter_el1_irq_or_nmi() -> enter_from_kernel_mode()->rcu_irq_enter()/rcu_irq_enter_check_tick() etc.
> > While at irqchip level, gic_handle_irq()->gic_handle_nmi()->nmi_enter(),
> > which does not call rcu_irq_enter_check_tick(). So it is not proper to
> > "treat it as an IRQ for now"
> 
> I'm struggling to understand the problem here. What is "not proper", and
> why?
> 
> Do you think there's a correctness problem, or that we're doing more
> work than necessary? 
> 
I had thought it just did redundant accounting. But after revisiting RCU
code, I think it confronts a real bug.

> If you could give a specific example of a problem, it would really help.
> 
Refer to rcu_nmi_enter(), which can be called by
enter_from_kernel_mode():

||noinstr void rcu_nmi_enter(void)
||{
||        ...
||        if (rcu_dynticks_curr_cpu_in_eqs()) {
||
||                if (!in_nmi())
||                        rcu_dynticks_task_exit();
||
||                // RCU is not watching here ...
||                rcu_dynticks_eqs_exit();
||                // ... but is watching here.
||
||                if (!in_nmi()) {
||                        instrumentation_begin();
||                        rcu_cleanup_after_idle();
||                        instrumentation_end();
||                }
||
||                instrumentation_begin();
||                // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
||                instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
||                // instrumentation for the noinstr rcu_dynticks_eqs_exit()
||                instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
||
||                incby = 1;
||        } else if (!in_nmi()) {
||                instrumentation_begin();
||                rcu_irq_enter_check_tick();
||        } else  {
||                instrumentation_begin();
||        }
||        ...
||}

There is 3 pieces of code put under the
protection of if (!in_nmi()). At least the last one
"rcu_irq_enter_check_tick()" can trigger a hard lock up bug. Because it
is supposed to hold a spin lock with irqoff by
"raw_spin_lock_rcu_node(rdp->mynode)", but pNMI can breach it. The same
scenario in rcu_nmi_exit()->rcu_prepare_for_idle().

As for the first two "if (!in_nmi())", I have no idea of why, except
breaching spin_lock_irq() by NMI. Hope Paul can give some guide.


Thanks,

	Pingfan


> I'm aware that we do more work than strictly necessary when we take a
> pNMI from a context with IRQs enabled, but that's how we'd intended this
> to work, as it's vastly simpler to manage the state that way. Unless
> there's a real problem with that approach I'd prefer to leave it as-is.
> 
> Thanks,
> Mark.
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Pingfan Liu Oct. 8, 2021, 2:55 p.m. UTC | #5
On Fri, Oct 08, 2021 at 12:01:25PM +0800, Pingfan Liu wrote:
> Sorry that I missed this message and I am just back from a long
> festival.
> 
> Adding Paul for RCU guidance.
> 
> On Thu, Sep 30, 2021 at 02:32:57PM +0100, Mark Rutland wrote:
> > On Sat, Sep 25, 2021 at 11:39:55PM +0800, Pingfan Liu wrote:
> > > On Fri, Sep 24, 2021 at 06:53:06PM +0100, Mark Rutland wrote:
> > > > On Fri, Sep 24, 2021 at 09:28:33PM +0800, Pingfan Liu wrote:
> > > > > In enter_el1_irq_or_nmi(), it can be the case which NMI interrupts an
> > > > > irq, which makes the condition !interrupts_enabled(regs) fail to detect
> > > > > the NMI. This will cause a mistaken account for irq.
> > > > 
> > > Sorry about the confusing word "account", it should be "lockdep/rcu/.."
> > > 
> > > > Can you please explain this in more detail? It's not clear which
> > > > specific case you mean when you say "NMI interrupts an irq", as that
> > > > could mean a number of distinct scenarios.
> > > > 
> > > > AFAICT, if we're in an IRQ handler (with NMIs unmasked), and an NMI
> > > > causes a new exception we'll do the right thing. So either I'm missing a
> > > > subtlety or you're describing a different scenario..
> > > > 
> > > > Note that the entry code is only trying to distinguish between:
> > > > 
> > > > a) This exception is *definitely* an NMI (because regular interrupts
> > > >    were masked).
> > > > 
> > > > b) This exception is *either* and IRQ or an NMI (and this *cannot* be
> > > >    distinguished until we acknowledge the interrupt), so we treat it as
> > > >    an IRQ for now.
> > > > 
> > > b) is the aim.
> > > 
> > > At the entry, enter_el1_irq_or_nmi() -> enter_from_kernel_mode()->rcu_irq_enter()/rcu_irq_enter_check_tick() etc.
> > > While at irqchip level, gic_handle_irq()->gic_handle_nmi()->nmi_enter(),
> > > which does not call rcu_irq_enter_check_tick(). So it is not proper to
> > > "treat it as an IRQ for now"
> > 
> > I'm struggling to understand the problem here. What is "not proper", and
> > why?
> > 
> > Do you think there's a correctness problem, or that we're doing more
> > work than necessary? 
> > 
> I had thought it just did redundant accounting. But after revisiting RCU
> code, I think it confronts a real bug.
> 
> > If you could give a specific example of a problem, it would really help.
> > 
> Refer to rcu_nmi_enter(), which can be called by
> enter_from_kernel_mode():
> 
> ||noinstr void rcu_nmi_enter(void)
> ||{
> ||        ...
> ||        if (rcu_dynticks_curr_cpu_in_eqs()) {
> ||
> ||                if (!in_nmi())
> ||                        rcu_dynticks_task_exit();
> ||
> ||                // RCU is not watching here ...
> ||                rcu_dynticks_eqs_exit();
> ||                // ... but is watching here.
> ||
> ||                if (!in_nmi()) {
> ||                        instrumentation_begin();
> ||                        rcu_cleanup_after_idle();
> ||                        instrumentation_end();
> ||                }
> ||
> ||                instrumentation_begin();
> ||                // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
> ||                instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
> ||                // instrumentation for the noinstr rcu_dynticks_eqs_exit()
> ||                instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
> ||
> ||                incby = 1;
> ||        } else if (!in_nmi()) {
> ||                instrumentation_begin();
> ||                rcu_irq_enter_check_tick();
> ||        } else  {
> ||                instrumentation_begin();
> ||        }
> ||        ...
> ||}
> 

Forget to supplement the context for understanding the case:
  On arm64, at present, a pNMI (akin to NMI) may call rcu_nmi_enter()
  without calling "__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);".
  As a result it can be mistaken as an normal interrupt in
  rcu_nmi_enter().

And this may cause the following issue:
> There is 3 pieces of code put under the
> protection of if (!in_nmi()). At least the last one
> "rcu_irq_enter_check_tick()" can trigger a hard lock up bug. Because it
> is supposed to hold a spin lock with irqoff by
> "raw_spin_lock_rcu_node(rdp->mynode)", but pNMI can breach it. The same
> scenario in rcu_nmi_exit()->rcu_prepare_for_idle().
> 
> As for the first two "if (!in_nmi())", I have no idea of why, except
> breaching spin_lock_irq() by NMI. Hope Paul can give some guide.
> 
> 
> Thanks,
> 
> 	Pingfan
> 
> 
> > I'm aware that we do more work than strictly necessary when we take a
> > pNMI from a context with IRQs enabled, but that's how we'd intended this
> > to work, as it's vastly simpler to manage the state that way. Unless
> > there's a real problem with that approach I'd prefer to leave it as-is.
> > 
> > Thanks,
> > Mark.
> > 
> > _______________________________________________
> > linux-arm-kernel mailing list
> > linux-arm-kernel@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Paul E. McKenney Oct. 8, 2021, 3:45 p.m. UTC | #6
On Fri, Oct 08, 2021 at 12:01:25PM +0800, Pingfan Liu wrote:
> Sorry that I missed this message and I am just back from a long
> festival.
> 
> Adding Paul for RCU guidance.

Didn't the recent patch series cover this, or is this a new problem?

							Thanx, Paul

> On Thu, Sep 30, 2021 at 02:32:57PM +0100, Mark Rutland wrote:
> > On Sat, Sep 25, 2021 at 11:39:55PM +0800, Pingfan Liu wrote:
> > > On Fri, Sep 24, 2021 at 06:53:06PM +0100, Mark Rutland wrote:
> > > > On Fri, Sep 24, 2021 at 09:28:33PM +0800, Pingfan Liu wrote:
> > > > > In enter_el1_irq_or_nmi(), it can be the case which NMI interrupts an
> > > > > irq, which makes the condition !interrupts_enabled(regs) fail to detect
> > > > > the NMI. This will cause a mistaken account for irq.
> > > > 
> > > Sorry about the confusing word "account", it should be "lockdep/rcu/.."
> > > 
> > > > Can you please explain this in more detail? It's not clear which
> > > > specific case you mean when you say "NMI interrupts an irq", as that
> > > > could mean a number of distinct scenarios.
> > > > 
> > > > AFAICT, if we're in an IRQ handler (with NMIs unmasked), and an NMI
> > > > causes a new exception we'll do the right thing. So either I'm missing a
> > > > subtlety or you're describing a different scenario..
> > > > 
> > > > Note that the entry code is only trying to distinguish between:
> > > > 
> > > > a) This exception is *definitely* an NMI (because regular interrupts
> > > >    were masked).
> > > > 
> > > > b) This exception is *either* and IRQ or an NMI (and this *cannot* be
> > > >    distinguished until we acknowledge the interrupt), so we treat it as
> > > >    an IRQ for now.
> > > > 
> > > b) is the aim.
> > > 
> > > At the entry, enter_el1_irq_or_nmi() -> enter_from_kernel_mode()->rcu_irq_enter()/rcu_irq_enter_check_tick() etc.
> > > While at irqchip level, gic_handle_irq()->gic_handle_nmi()->nmi_enter(),
> > > which does not call rcu_irq_enter_check_tick(). So it is not proper to
> > > "treat it as an IRQ for now"
> > 
> > I'm struggling to understand the problem here. What is "not proper", and
> > why?
> > 
> > Do you think there's a correctness problem, or that we're doing more
> > work than necessary? 
> > 
> I had thought it just did redundant accounting. But after revisiting RCU
> code, I think it confronts a real bug.
> 
> > If you could give a specific example of a problem, it would really help.
> > 
> Refer to rcu_nmi_enter(), which can be called by
> enter_from_kernel_mode():
> 
> ||noinstr void rcu_nmi_enter(void)
> ||{
> ||        ...
> ||        if (rcu_dynticks_curr_cpu_in_eqs()) {
> ||
> ||                if (!in_nmi())
> ||                        rcu_dynticks_task_exit();
> ||
> ||                // RCU is not watching here ...
> ||                rcu_dynticks_eqs_exit();
> ||                // ... but is watching here.
> ||
> ||                if (!in_nmi()) {
> ||                        instrumentation_begin();
> ||                        rcu_cleanup_after_idle();
> ||                        instrumentation_end();
> ||                }
> ||
> ||                instrumentation_begin();
> ||                // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
> ||                instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
> ||                // instrumentation for the noinstr rcu_dynticks_eqs_exit()
> ||                instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
> ||
> ||                incby = 1;
> ||        } else if (!in_nmi()) {
> ||                instrumentation_begin();
> ||                rcu_irq_enter_check_tick();
> ||        } else  {
> ||                instrumentation_begin();
> ||        }
> ||        ...
> ||}
> 
> There is 3 pieces of code put under the
> protection of if (!in_nmi()). At least the last one
> "rcu_irq_enter_check_tick()" can trigger a hard lock up bug. Because it
> is supposed to hold a spin lock with irqoff by
> "raw_spin_lock_rcu_node(rdp->mynode)", but pNMI can breach it. The same
> scenario in rcu_nmi_exit()->rcu_prepare_for_idle().
> 
> As for the first two "if (!in_nmi())", I have no idea of why, except
> breaching spin_lock_irq() by NMI. Hope Paul can give some guide.
> 
> 
> Thanks,
> 
> 	Pingfan
> 
> 
> > I'm aware that we do more work than strictly necessary when we take a
> > pNMI from a context with IRQs enabled, but that's how we'd intended this
> > to work, as it's vastly simpler to manage the state that way. Unless
> > there's a real problem with that approach I'd prefer to leave it as-is.
> > 
> > Thanks,
> > Mark.
> > 
> > _______________________________________________
> > linux-arm-kernel mailing list
> > linux-arm-kernel@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Mark Rutland Oct. 8, 2021, 5:25 p.m. UTC | #7
On Fri, Oct 08, 2021 at 10:55:04PM +0800, Pingfan Liu wrote:
> On Fri, Oct 08, 2021 at 12:01:25PM +0800, Pingfan Liu wrote:
> > Sorry that I missed this message and I am just back from a long
> > festival.
> > 
> > Adding Paul for RCU guidance.
> > 
> > On Thu, Sep 30, 2021 at 02:32:57PM +0100, Mark Rutland wrote:
> > > On Sat, Sep 25, 2021 at 11:39:55PM +0800, Pingfan Liu wrote:
> > > > On Fri, Sep 24, 2021 at 06:53:06PM +0100, Mark Rutland wrote:
> > > > > On Fri, Sep 24, 2021 at 09:28:33PM +0800, Pingfan Liu wrote:
> > > > > > In enter_el1_irq_or_nmi(), it can be the case which NMI interrupts an
> > > > > > irq, which makes the condition !interrupts_enabled(regs) fail to detect
> > > > > > the NMI. This will cause a mistaken account for irq.
> > > > > 
> > > > Sorry about the confusing word "account", it should be "lockdep/rcu/.."
> > > > 
> > > > > Can you please explain this in more detail? It's not clear which
> > > > > specific case you mean when you say "NMI interrupts an irq", as that
> > > > > could mean a number of distinct scenarios.
> > > > > 
> > > > > AFAICT, if we're in an IRQ handler (with NMIs unmasked), and an NMI
> > > > > causes a new exception we'll do the right thing. So either I'm missing a
> > > > > subtlety or you're describing a different scenario..
> > > > > 
> > > > > Note that the entry code is only trying to distinguish between:
> > > > > 
> > > > > a) This exception is *definitely* an NMI (because regular interrupts
> > > > >    were masked).
> > > > > 
> > > > > b) This exception is *either* and IRQ or an NMI (and this *cannot* be
> > > > >    distinguished until we acknowledge the interrupt), so we treat it as
> > > > >    an IRQ for now.
> > > > > 
> > > > b) is the aim.
> > > > 
> > > > At the entry, enter_el1_irq_or_nmi() -> enter_from_kernel_mode()->rcu_irq_enter()/rcu_irq_enter_check_tick() etc.
> > > > While at irqchip level, gic_handle_irq()->gic_handle_nmi()->nmi_enter(),
> > > > which does not call rcu_irq_enter_check_tick(). So it is not proper to
> > > > "treat it as an IRQ for now"
> > > 
> > > I'm struggling to understand the problem here. What is "not proper", and
> > > why?
> > > 
> > > Do you think there's a correctness problem, or that we're doing more
> > > work than necessary? 
> > > 
> > I had thought it just did redundant accounting. But after revisiting RCU
> > code, I think it confronts a real bug.
> > 
> > > If you could give a specific example of a problem, it would really help.
> > > 
> > Refer to rcu_nmi_enter(), which can be called by
> > enter_from_kernel_mode():
> > 
> > ||noinstr void rcu_nmi_enter(void)
> > ||{
> > ||        ...
> > ||        if (rcu_dynticks_curr_cpu_in_eqs()) {
> > ||
> > ||                if (!in_nmi())
> > ||                        rcu_dynticks_task_exit();
> > ||
> > ||                // RCU is not watching here ...
> > ||                rcu_dynticks_eqs_exit();
> > ||                // ... but is watching here.
> > ||
> > ||                if (!in_nmi()) {
> > ||                        instrumentation_begin();
> > ||                        rcu_cleanup_after_idle();
> > ||                        instrumentation_end();
> > ||                }
> > ||
> > ||                instrumentation_begin();
> > ||                // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
> > ||                instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
> > ||                // instrumentation for the noinstr rcu_dynticks_eqs_exit()
> > ||                instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
> > ||
> > ||                incby = 1;
> > ||        } else if (!in_nmi()) {
> > ||                instrumentation_begin();
> > ||                rcu_irq_enter_check_tick();
> > ||        } else  {
> > ||                instrumentation_begin();
> > ||        }
> > ||        ...
> > ||}
> > 
> 
> Forget to supplement the context for understanding the case:
>   On arm64, at present, a pNMI (akin to NMI) may call rcu_nmi_enter()
>   without calling "__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);".
>   As a result it can be mistaken as an normal interrupt in
>   rcu_nmi_enter().

I appreciate that there's a window where we treat the pNMI like an IRQ,
but that's by design, and we account for this in gic_handle_irq() and
gic_handle_nmi() where we "upgrade" to NMI context with
nmi_enter()..nmi_exit().

The idea is that we have two cases: 

1) If we take a pNMI from a context where IRQs were masked, we know it
   must be a pNMI, and perform the NMI entry immediately to avoid
   reentrancy problems. 

   I think we're all happy with this case.

2) If we take a pNMI from a context where IRQs were unmasked, we don't know
   whether the trigger was a pNMI/IRQ until we read from the GIC, and
   since we *could* have taken an IRQ, this is equivalent to taking a
   spurious IRQ, and while handling that, taking the NMI, e.g.
   
   < run with IRQs unmasked >
     ~~~ take IRQ ~~~
     < enter IRQ >
       ~~~ take NMI exception ~~~
       < enter NMI >
       < handle NMI >
       < exit NMI > 
       ~~~ return from NMI exception ~~~
     < handle IRQ / spurious / do-nothing >
     < exit IRQ >
     ~~~ return from IRQ exception ~~~
   < continue running with IRQs unmasked >

   ... except that we don't do the HW NMI exception entry/exit, just all
   the necessary SW accounting.


Note that case (2) can *never* nest within itself or within case (1).

Do you have a specific example of something that goes wrong with the
above? e.g. something that's inconsistent with that rationale?

> And this may cause the following issue:
> > There is 3 pieces of code put under the
> > protection of if (!in_nmi()). At least the last one
> > "rcu_irq_enter_check_tick()" can trigger a hard lock up bug. Because it
> > is supposed to hold a spin lock with irqoff by
> > "raw_spin_lock_rcu_node(rdp->mynode)", but pNMI can breach it. The same
> > scenario in rcu_nmi_exit()->rcu_prepare_for_idle().
> > 
> > As for the first two "if (!in_nmi())", I have no idea of why, except
> > breaching spin_lock_irq() by NMI. Hope Paul can give some guide.

That code (in enter_from_kernel_mode()) only runs in case 2, where it
cannot be nested within a pNMI, so I struggle to see how this can
deadlock. It it can, then I would expect the general case of a pNMI
nesting within and IRQ would be broken?

Can you give a concrete example of a sequence that would lockup?
Currently I can't see how that's possible.

Thanks,
Mark.
Pingfan Liu Oct. 9, 2021, 3:49 a.m. UTC | #8
On Fri, Oct 08, 2021 at 06:25:13PM +0100, Mark Rutland wrote:
> On Fri, Oct 08, 2021 at 10:55:04PM +0800, Pingfan Liu wrote:
> > On Fri, Oct 08, 2021 at 12:01:25PM +0800, Pingfan Liu wrote:
> > > Sorry that I missed this message and I am just back from a long
> > > festival.
> > > 
> > > Adding Paul for RCU guidance.
> > > 
> > > On Thu, Sep 30, 2021 at 02:32:57PM +0100, Mark Rutland wrote:
> > > > On Sat, Sep 25, 2021 at 11:39:55PM +0800, Pingfan Liu wrote:
> > > > > On Fri, Sep 24, 2021 at 06:53:06PM +0100, Mark Rutland wrote:
> > > > > > On Fri, Sep 24, 2021 at 09:28:33PM +0800, Pingfan Liu wrote:
> > > > > > > In enter_el1_irq_or_nmi(), it can be the case which NMI interrupts an
> > > > > > > irq, which makes the condition !interrupts_enabled(regs) fail to detect
> > > > > > > the NMI. This will cause a mistaken account for irq.
> > > > > > 
> > > > > Sorry about the confusing word "account", it should be "lockdep/rcu/.."
> > > > > 
> > > > > > Can you please explain this in more detail? It's not clear which
> > > > > > specific case you mean when you say "NMI interrupts an irq", as that
> > > > > > could mean a number of distinct scenarios.
> > > > > > 
> > > > > > AFAICT, if we're in an IRQ handler (with NMIs unmasked), and an NMI
> > > > > > causes a new exception we'll do the right thing. So either I'm missing a
> > > > > > subtlety or you're describing a different scenario..
> > > > > > 
> > > > > > Note that the entry code is only trying to distinguish between:
> > > > > > 
> > > > > > a) This exception is *definitely* an NMI (because regular interrupts
> > > > > >    were masked).
> > > > > > 
> > > > > > b) This exception is *either* and IRQ or an NMI (and this *cannot* be
> > > > > >    distinguished until we acknowledge the interrupt), so we treat it as
> > > > > >    an IRQ for now.
> > > > > > 
> > > > > b) is the aim.
> > > > > 
> > > > > At the entry, enter_el1_irq_or_nmi() -> enter_from_kernel_mode()->rcu_irq_enter()/rcu_irq_enter_check_tick() etc.
> > > > > While at irqchip level, gic_handle_irq()->gic_handle_nmi()->nmi_enter(),
> > > > > which does not call rcu_irq_enter_check_tick(). So it is not proper to
> > > > > "treat it as an IRQ for now"
> > > > 
> > > > I'm struggling to understand the problem here. What is "not proper", and
> > > > why?
> > > > 
> > > > Do you think there's a correctness problem, or that we're doing more
> > > > work than necessary? 
> > > > 
> > > I had thought it just did redundant accounting. But after revisiting RCU
> > > code, I think it confronts a real bug.
> > > 
> > > > If you could give a specific example of a problem, it would really help.
> > > > 
> > > Refer to rcu_nmi_enter(), which can be called by
> > > enter_from_kernel_mode():
> > > 
> > > ||noinstr void rcu_nmi_enter(void)
> > > ||{
> > > ||        ...
> > > ||        if (rcu_dynticks_curr_cpu_in_eqs()) {
> > > ||
> > > ||                if (!in_nmi())
> > > ||                        rcu_dynticks_task_exit();
> > > ||
> > > ||                // RCU is not watching here ...
> > > ||                rcu_dynticks_eqs_exit();
> > > ||                // ... but is watching here.
> > > ||
> > > ||                if (!in_nmi()) {
> > > ||                        instrumentation_begin();
> > > ||                        rcu_cleanup_after_idle();
> > > ||                        instrumentation_end();
> > > ||                }
> > > ||
> > > ||                instrumentation_begin();
> > > ||                // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
> > > ||                instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
> > > ||                // instrumentation for the noinstr rcu_dynticks_eqs_exit()
> > > ||                instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
> > > ||
> > > ||                incby = 1;
> > > ||        } else if (!in_nmi()) {
> > > ||                instrumentation_begin();
> > > ||                rcu_irq_enter_check_tick();
> > > ||        } else  {
> > > ||                instrumentation_begin();
> > > ||        }
> > > ||        ...
> > > ||}
> > > 
> > 
> > Forget to supplement the context for understanding the case:
> >   On arm64, at present, a pNMI (akin to NMI) may call rcu_nmi_enter()
> >   without calling "__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);".
> >   As a result it can be mistaken as an normal interrupt in
> >   rcu_nmi_enter().
> 
> I appreciate that there's a window where we treat the pNMI like an IRQ,
> but that's by design, and we account for this in gic_handle_irq() and
> gic_handle_nmi() where we "upgrade" to NMI context with
> nmi_enter()..nmi_exit().
> 
> The idea is that we have two cases: 
> 
> 1) If we take a pNMI from a context where IRQs were masked, we know it
>    must be a pNMI, and perform the NMI entry immediately to avoid
>    reentrancy problems. 
> 
>    I think we're all happy with this case.
> 
Right.

> 2) If we take a pNMI from a context where IRQs were unmasked, we don't know
>    whether the trigger was a pNMI/IRQ until we read from the GIC, and
>    since we *could* have taken an IRQ, this is equivalent to taking a
>    spurious IRQ, and while handling that, taking the NMI, e.g.
>    
>    < run with IRQs unmasked >
>      ~~~ take IRQ ~~~
>      < enter IRQ >
>        ~~~ take NMI exception ~~~
>        < enter NMI >
>        < handle NMI >
>        < exit NMI > 
>        ~~~ return from NMI exception ~~~
>      < handle IRQ / spurious / do-nothing >
>      < exit IRQ >
>      ~~~ return from IRQ exception ~~~
>    < continue running with IRQs unmasked >
> 
Yes, here I am on the same page. (I think I used a wrong example in
previous email, which caused the confusion)

>    ... except that we don't do the HW NMI exception entry/exit, just all
>    the necessary SW accounting.
> 
A little but important thing: local_irq_save() etc can not disable pNMI.

> 
> Note that case (2) can *never* nest within itself or within case (1).
> 
> Do you have a specific example of something that goes wrong with the
> above? e.g. something that's inconsistent with that rationale?
> 
Please see the following comment.

> > And this may cause the following issue:
> > > There is 3 pieces of code put under the
> > > protection of if (!in_nmi()). At least the last one
> > > "rcu_irq_enter_check_tick()" can trigger a hard lock up bug. Because it
> > > is supposed to hold a spin lock with irqoff by
> > > "raw_spin_lock_rcu_node(rdp->mynode)", but pNMI can breach it. The same
> > > scenario in rcu_nmi_exit()->rcu_prepare_for_idle().

Sorry that this should be an wrong example, since here it takes the case (1).

Concentrating on the spin lock rcu_node->lock, there are two operators:
  raw_spin_lock_rcu_node()
  raw_spin_trylock_rcu_node()

Then suppose the scenario for deadlock:
note_gp_changes() in non-irq-context
{
    local_irq_save(flags);
    ...
    raw_spin_trylock_rcu_node(rnp) // hold lock
    needwake = __note_gp_changes(rnp, rdp);        ------\
    raw_spin_unlock_irqrestore_rcu_node(rnp, flags);      \
}                                                          \
                                                            \---> pNMI breaks in due to local_irq_save() can not disable it.
                                                                 rcu_irq_enter() without __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET)
								   ->rcu_nmi_enter()
								     {
								         else if (!in_nmi())
									     rcu_irq_enter_check_tick()
									       ->__rcu_irq_enter_check_tick()
									         {
										     ...
									             raw_spin_lock_rcu_node(rdp->mynode);

										     //Oops deadlock!
									         }
								     }


> > > 
> > > As for the first two "if (!in_nmi())", I have no idea of why, except
> > > breaching spin_lock_irq() by NMI. Hope Paul can give some guide.
> 
> That code (in enter_from_kernel_mode()) only runs in case 2, where it
> cannot be nested within a pNMI, so I struggle to see how this can
> deadlock. It it can, then I would expect the general case of a pNMI
> nesting within and IRQ would be broken?
> 
Sorry again for the previous misleading wrong example. Hope my new
example can help.

> Can you give a concrete example of a sequence that would lockup?
> Currently I can't see how that's possible.
> 

It seems the RCU subsystem has a strict semantic on NMI and normal
interrupt. Besides the deadlock example, there may be other supprise to
confront with (will trace it on another mail with Paul)

Thanks,

	Pingfan
Pingfan Liu Oct. 9, 2021, 4:14 a.m. UTC | #9
On Fri, Oct 08, 2021 at 08:45:23AM -0700, Paul E. McKenney wrote:
> On Fri, Oct 08, 2021 at 12:01:25PM +0800, Pingfan Liu wrote:
> > Sorry that I missed this message and I am just back from a long
> > festival.
> > 
> > Adding Paul for RCU guidance.
> 
> Didn't the recent patch series cover this, or is this a new problem?
> 
Sorry not to explain it clearly. This is a new problem.

The acked recent series derive from [3-4/5], which addresses the nested calling:
in a single normal interrupt handler
    rcu_irq_enter()
        rcu_irq_enter()
	...
        rcu_irq_exit()
    rcu_irq_exit()


While this new problem [1-2/5] is about pNMI (similar to NMI in this context).
On arm64, the current process in a pNMI handler looks like:
    rcu_irq_enter(){ rcu_nmi_enter()}
        ^^^ At this point, the handler is treated as a normal interrupt temporary, (no chance to __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);).
	    So rcu_nmi_enter() can not distinguish NMI, because "if (!in_nmi())" can not tell it. (goto "questionA")
        nmi_enter()
	NMI handler
	nmi_exit()
    rcu_irq_exit()

[...]
> > Refer to rcu_nmi_enter(), which can be called by
> > enter_from_kernel_mode():
> > 
> > ||noinstr void rcu_nmi_enter(void)
> > ||{
> > ||        ...
> > ||        if (rcu_dynticks_curr_cpu_in_eqs()) {
> > ||
> > ||                if (!in_nmi())
> > ||                        rcu_dynticks_task_exit();
> > ||
> > ||                // RCU is not watching here ...
> > ||                rcu_dynticks_eqs_exit();
> > ||                // ... but is watching here.
> > ||
> > ||                if (!in_nmi()) {
> > ||                        instrumentation_begin();
> > ||                        rcu_cleanup_after_idle();
> > ||                        instrumentation_end();
> > ||                }
> > ||
> > ||                instrumentation_begin();
> > ||                // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
> > ||                instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
> > ||                // instrumentation for the noinstr rcu_dynticks_eqs_exit()
> > ||                instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
> > ||
> > ||                incby = 1;
> > ||        } else if (!in_nmi()) {
> > ||                instrumentation_begin();
> > ||                rcu_irq_enter_check_tick();
> > ||        } else  {
> > ||                instrumentation_begin();
> > ||        }
> > ||        ...
> > ||}
> > 
> > There is 3 pieces of code put under the
> > protection of if (!in_nmi()). At least the last one
> > "rcu_irq_enter_check_tick()" can trigger a hard lock up bug. Because it
> > is supposed to hold a spin lock with irqoff by
> > "raw_spin_lock_rcu_node(rdp->mynode)", but pNMI can breach it. The same
> > scenario in rcu_nmi_exit()->rcu_prepare_for_idle().
> > 

questionA:
> > As for the first two "if (!in_nmi())", I have no idea of why, except
> > breaching spin_lock_irq() by NMI. Hope Paul can give some guide.
> > 

Thanks,

	Pingfan
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h
index fac08e18bcd5..a59b1745f458 100644
--- a/arch/arm64/include/asm/irq.h
+++ b/arch/arm64/include/asm/irq.h
@@ -12,6 +12,11 @@  int set_handle_irq(void (*handle_irq)(struct pt_regs *));
 #define set_handle_irq	set_handle_irq
 int set_handle_fiq(void (*handle_fiq)(struct pt_regs *));
 
+extern void (*handle_arch_irq)(struct pt_regs *regs);
+extern void (*handle_arch_fiq)(struct pt_regs *regs);
+extern void (*handle_arch_nmi)(struct pt_regs *regs);
+extern bool (*interrupt_is_nmi)(void);
+
 static inline int nr_legacy_irqs(void)
 {
 	return 0;
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 32f9796c4ffe..69a8cc082712 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -24,6 +24,7 @@ 
 #include <asm/stacktrace.h>
 #include <asm/sysreg.h>
 #include <asm/system_misc.h>
+#include <asm/irq.h>
 
 /*
  * Handle IRQ/context state management when entering from kernel mode.
@@ -219,17 +220,28 @@  static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs)
 		lockdep_hardirqs_on(CALLER_ADDR0);
 }
 
-static void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs)
+static inline bool arm64_in_nmi(struct pt_regs *regs)
 {
-	if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs))
+	if (!interrupts_enabled(regs) || (*interrupt_is_nmi)())
+		return true;
+	return false;
+}
+
+/* return true if in irq, otherwise in nmi */
+static bool noinstr enter_el1_irq_or_nmi(struct pt_regs *regs)
+{
+	if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && arm64_in_nmi(regs)) {
 		arm64_enter_nmi(regs);
-	else
+		return false;
+	} else {
 		enter_from_kernel_mode(regs);
+		return true;
+	}
 }
 
-static void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs)
+static void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs, bool in_irq)
 {
-	if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs))
+	if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !in_irq)
 		arm64_exit_nmi(regs);
 	else
 		exit_to_kernel_mode(regs);
@@ -269,9 +281,6 @@  static void do_interrupt_handler(struct pt_regs *regs,
 		handler(regs);
 }
 
-extern void (*handle_arch_irq)(struct pt_regs *);
-extern void (*handle_arch_fiq)(struct pt_regs *);
-
 static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector,
 				      unsigned int esr)
 {
@@ -433,12 +442,20 @@  asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
 }
 
 static void noinstr el1_interrupt(struct pt_regs *regs,
-				  void (*handler)(struct pt_regs *))
+				  void (*handler)(struct pt_regs *),
+				  void (*nmi_handler)(struct pt_regs *))
 {
+	bool in_irq;
+	void (*h)(struct pt_regs *regs);
+
 	write_sysreg(DAIF_PROCCTX_NOIRQ, daif);
 
-	enter_el1_irq_or_nmi(regs);
-	do_interrupt_handler(regs, handler);
+	in_irq = enter_el1_irq_or_nmi(regs);
+	if (in_irq)
+		h = handler;
+	else
+		h = nmi_handler;
+	do_interrupt_handler(regs, h);
 
 	/*
 	 * Note: thread_info::preempt_count includes both thread_info::count
@@ -449,17 +466,17 @@  static void noinstr el1_interrupt(struct pt_regs *regs,
 	    READ_ONCE(current_thread_info()->preempt_count) == 0)
 		arm64_preempt_schedule_irq();
 
-	exit_el1_irq_or_nmi(regs);
+	exit_el1_irq_or_nmi(regs, in_irq);
 }
 
 asmlinkage void noinstr el1h_64_irq_handler(struct pt_regs *regs)
 {
-	el1_interrupt(regs, handle_arch_irq);
+	el1_interrupt(regs, handle_arch_irq, handle_arch_nmi);
 }
 
 asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs)
 {
-	el1_interrupt(regs, handle_arch_fiq);
+	el1_interrupt(regs, handle_arch_fiq, handle_arch_nmi);
 }
 
 asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs)
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
index bda49430c9ea..e67435eb4cba 100644
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -81,8 +81,19 @@  static void default_handle_fiq(struct pt_regs *regs)
 	panic("FIQ taken without a root FIQ handler\n");
 }
 
+static void default_handle_nmi(struct pt_regs *unused)
+{
+}
+
+static bool default_nmi_discriminator(void)
+{
+	return false;
+}
+
 void (*handle_arch_irq)(struct pt_regs *) __ro_after_init = default_handle_irq;
 void (*handle_arch_fiq)(struct pt_regs *) __ro_after_init = default_handle_fiq;
+void (*handle_arch_nmi)(struct pt_regs *) __ro_after_init = default_handle_nmi;
+bool (*interrupt_is_nmi)(void) __ro_after_init = default_nmi_discriminator;
 
 int __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
 {
@@ -104,6 +115,24 @@  int __init set_handle_fiq(void (*handle_fiq)(struct pt_regs *))
 	return 0;
 }
 
+int __init set_handle_nmi(void (*handle_nmi)(struct pt_regs *))
+{
+	if (handle_arch_nmi != default_handle_nmi)
+		return -EBUSY;
+
+	handle_arch_nmi = handle_nmi;
+	return 0;
+}
+
+int __init set_nmi_discriminator(bool (*discriminator)(void))
+{
+	if (interrupt_is_nmi != default_nmi_discriminator)
+		return -EBUSY;
+
+	interrupt_is_nmi = discriminator;
+	return 0;
+}
+
 void __init init_IRQ(void)
 {
 	init_irq_stacks();