
[v6,7/7] arm64: kgdb: Roundup cpus using IPI as NMI

Message ID 1603983387-8738-8-git-send-email-sumit.garg@linaro.org (mailing list archive)
State New, archived
Series arm64: Add framework to turn an IPI as NMI

Commit Message

Sumit Garg Oct. 29, 2020, 2:56 p.m. UTC
arm64 platforms with GICv3 or later support pseudo NMIs, which can be
leveraged to round up CPUs that are stuck in a hard lockup state with
interrupts disabled, something that isn't possible with a normal IPI.

So switch to rounding up CPUs using an IPI turned into an NMI. In case
a particular arm64 platform doesn't support pseudo NMIs, it will fall
back to the default kgdb CPU round-up mechanism.

Signed-off-by: Sumit Garg <sumit.garg@linaro.org>
---
 arch/arm64/include/asm/kgdb.h |  9 +++++++++
 arch/arm64/kernel/ipi_nmi.c   |  5 +++++
 arch/arm64/kernel/kgdb.c      | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+)

Comments

Daniel Thompson Oct. 29, 2020, 4:22 p.m. UTC | #1
On Thu, Oct 29, 2020 at 08:26:27PM +0530, Sumit Garg wrote:
> arm64 platforms with GICv3 or later support pseudo NMIs, which can be
> leveraged to round up CPUs that are stuck in a hard lockup state with
> interrupts disabled, something that isn't possible with a normal IPI.
> 
> So switch to rounding up CPUs using an IPI turned into an NMI. In case
> a particular arm64 platform doesn't support pseudo NMIs, it will fall
> back to the default kgdb CPU round-up mechanism.
> 
> Signed-off-by: Sumit Garg <sumit.garg@linaro.org>
> ---
>  arch/arm64/include/asm/kgdb.h |  9 +++++++++
>  arch/arm64/kernel/ipi_nmi.c   |  5 +++++
>  arch/arm64/kernel/kgdb.c      | 35 +++++++++++++++++++++++++++++++++++
>  3 files changed, 49 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/kgdb.h b/arch/arm64/include/asm/kgdb.h
> index 21fc85e..c3d2425 100644
> --- a/arch/arm64/include/asm/kgdb.h
> +++ b/arch/arm64/include/asm/kgdb.h
> @@ -24,6 +24,15 @@ static inline void arch_kgdb_breakpoint(void)
>  extern void kgdb_handle_bus_error(void);
>  extern int kgdb_fault_expected;
>  
> +#ifdef CONFIG_KGDB
> +extern bool kgdb_ipi_nmicallback(int cpu, void *regs);
> +#else
> +static inline bool kgdb_ipi_nmicallback(int cpu, void *regs)
> +{
> +	return false;
> +}
> +#endif
> +
>  #endif /* !__ASSEMBLY__ */
>  
>  /*
> diff --git a/arch/arm64/kernel/ipi_nmi.c b/arch/arm64/kernel/ipi_nmi.c
> index 597dcf7..6ace182 100644
> --- a/arch/arm64/kernel/ipi_nmi.c
> +++ b/arch/arm64/kernel/ipi_nmi.c
> @@ -8,6 +8,7 @@
>  
>  #include <linux/interrupt.h>
>  #include <linux/irq.h>
> +#include <linux/kgdb.h>
>  #include <linux/nmi.h>
>  #include <linux/smp.h>
>  
> @@ -45,10 +46,14 @@ bool arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
>  static irqreturn_t ipi_nmi_handler(int irq, void *data)
>  {
>  	irqreturn_t ret = IRQ_NONE;
> +	unsigned int cpu = smp_processor_id();
>  
>  	if (nmi_cpu_backtrace(get_irq_regs()))
>  		ret = IRQ_HANDLED;
>  
> +	if (kgdb_ipi_nmicallback(cpu, get_irq_regs()))
> +		ret = IRQ_HANDLED;
> +
>  	return ret;

It would be better to declare the existing return value of
kgdb_nmicallback() to be dangerously stupid and fix it so that it
returns an irqreturn_t (that's easy since most callers do not need to
check the return value).

Then this code simply becomes:

	return kgdb_nmicallback(cpu, get_irq_regs());
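
For illustration, a minimal sketch of the whole handler under that
assumption (hypothetical only: kgdb_nmicallback() converted to return
irqreturn_t, with the nmi_cpu_backtrace() result still folded in; the
includes are the ones ipi_nmi.c already has):

	static irqreturn_t ipi_nmi_handler(int irq, void *data)
	{
		irqreturn_t ret = IRQ_NONE;

		if (nmi_cpu_backtrace(get_irq_regs()))
			ret = IRQ_HANDLED;

		/* Hypothetical signature: kgdb_nmicallback() returns irqreturn_t. */
		if (kgdb_nmicallback(smp_processor_id(), get_irq_regs()) == IRQ_HANDLED)
			ret = IRQ_HANDLED;

		return ret;
	}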


>  }
>  
> diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
> index 1a157ca3..c26e710 100644
> --- a/arch/arm64/kernel/kgdb.c
> +++ b/arch/arm64/kernel/kgdb.c
> @@ -17,6 +17,7 @@
>  
>  #include <asm/debug-monitors.h>
>  #include <asm/insn.h>
> +#include <asm/nmi.h>
>  #include <asm/traps.h>
>  
>  struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
> @@ -353,3 +354,37 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
>  	return aarch64_insn_write((void *)bpt->bpt_addr,
>  			*(u32 *)bpt->saved_instr);
>  }
> +
> +bool kgdb_ipi_nmicallback(int cpu, void *regs)
> +{
> +	if (atomic_read(&kgdb_active) != -1) {
> +		kgdb_nmicallback(cpu, regs);
> +		return true;
> +	}
> +
> +	return false;
> +}

I *really* don't like this function.

If the return code of kgdb_nmicallback() is broken then fix it; don't
just wrap it and invent a new criterion for the return code.

To be honest, I don't actually think the logic in kgdb_nmicallback() is
broken. As mentioned above, the return value has a weird definition (0
for "handled it OK" and 1 for "nothing for me to do"), but the logic
that calculates the return code looks OK.


> +
> +static void kgdb_smp_callback(void *data)
> +{
> +	unsigned int cpu = smp_processor_id();
> +
> +	if (atomic_read(&kgdb_active) != -1)
> +		kgdb_nmicallback(cpu, get_irq_regs());
> +}

This is unused. I presume it is litter from a previous revision of the
code and can be deleted?


> +
> +bool kgdb_arch_roundup_cpus(void)
> +{
> +	struct cpumask mask;
> +
> +	if (!arm64_supports_nmi())
> +		return false;
> +
> +	cpumask_copy(&mask, cpu_online_mask);
> +	cpumask_clear_cpu(raw_smp_processor_id(), &mask);
> +	if (cpumask_empty(&mask))
> +		return false;

Why do we need to fall back if there is no work to do? There will still
be no work to do when we call the fallback.


Daniel.
Daniel Thompson Oct. 29, 2020, 4:39 p.m. UTC | #2
On Thu, Oct 29, 2020 at 04:22:34PM +0000, Daniel Thompson wrote:
> On Thu, Oct 29, 2020 at 08:26:27PM +0530, Sumit Garg wrote:
> > arm64 platforms with GICv3 or later support pseudo NMIs, which can be
> > leveraged to round up CPUs that are stuck in a hard lockup state with
> > interrupts disabled, something that isn't possible with a normal IPI.
> > 
> > So switch to rounding up CPUs using an IPI turned into an NMI. In case
> > a particular arm64 platform doesn't support pseudo NMIs, it will fall
> > back to the default kgdb CPU round-up mechanism.
> > 
> > Signed-off-by: Sumit Garg <sumit.garg@linaro.org>
> > ---
> >  arch/arm64/include/asm/kgdb.h |  9 +++++++++
> >  arch/arm64/kernel/ipi_nmi.c   |  5 +++++
> >  arch/arm64/kernel/kgdb.c      | 35 +++++++++++++++++++++++++++++++++++
> >  3 files changed, 49 insertions(+)
> > 
> > diff --git a/arch/arm64/include/asm/kgdb.h b/arch/arm64/include/asm/kgdb.h
> > index 21fc85e..c3d2425 100644
> > --- a/arch/arm64/include/asm/kgdb.h
> > +++ b/arch/arm64/include/asm/kgdb.h
> > @@ -24,6 +24,15 @@ static inline void arch_kgdb_breakpoint(void)
> >  extern void kgdb_handle_bus_error(void);
> >  extern int kgdb_fault_expected;
> >  
> > +#ifdef CONFIG_KGDB
> > +extern bool kgdb_ipi_nmicallback(int cpu, void *regs);
> > +#else
> > +static inline bool kgdb_ipi_nmicallback(int cpu, void *regs)
> > +{
> > +	return false;
> > +}
> > +#endif
> > +
> >  #endif /* !__ASSEMBLY__ */
> >  
> >  /*
> > diff --git a/arch/arm64/kernel/ipi_nmi.c b/arch/arm64/kernel/ipi_nmi.c
> > index 597dcf7..6ace182 100644
> > --- a/arch/arm64/kernel/ipi_nmi.c
> > +++ b/arch/arm64/kernel/ipi_nmi.c
> > @@ -8,6 +8,7 @@
> >  
> >  #include <linux/interrupt.h>
> >  #include <linux/irq.h>
> > +#include <linux/kgdb.h>
> >  #include <linux/nmi.h>
> >  #include <linux/smp.h>
> >  
> > @@ -45,10 +46,14 @@ bool arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
> >  static irqreturn_t ipi_nmi_handler(int irq, void *data)
> >  {
> >  	irqreturn_t ret = IRQ_NONE;
> > +	unsigned int cpu = smp_processor_id();
> >  
> >  	if (nmi_cpu_backtrace(get_irq_regs()))
> >  		ret = IRQ_HANDLED;
> >  
> > +	if (kgdb_ipi_nmicallback(cpu, get_irq_regs()))
> > +		ret = IRQ_HANDLED;
> > +
> >  	return ret;
> 
> It would be better to declare the existing return value of
> kgdb_nmicallback() to be dangerously stupid and fix it so that it
> returns an irqreturn_t (that's easy since most callers do not need to
> check the return value).
> 
> Then this code simply becomes:
> 
> 	return kgdb_nmicallback(cpu, get_irq_regs());

Actually, reflecting on this, maybe it is better to keep kgdb_nmicallin()
and kgdb_nmicallback() aligned w.r.t. return codes (even if they are a
little unusual).

I'm still not sure why we'd keep kgdb_ipi_nmicallback() though.
kgdb_nmicallback() is intended to be called from arch code...


Daniel.


> 
> 
> >  }
> >  
> > diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
> > index 1a157ca3..c26e710 100644
> > --- a/arch/arm64/kernel/kgdb.c
> > +++ b/arch/arm64/kernel/kgdb.c
> > @@ -17,6 +17,7 @@
> >  
> >  #include <asm/debug-monitors.h>
> >  #include <asm/insn.h>
> > +#include <asm/nmi.h>
> >  #include <asm/traps.h>
> >  
> >  struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
> > @@ -353,3 +354,37 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
> >  	return aarch64_insn_write((void *)bpt->bpt_addr,
> >  			*(u32 *)bpt->saved_instr);
> >  }
> > +
> > +bool kgdb_ipi_nmicallback(int cpu, void *regs)
> > +{
> > +	if (atomic_read(&kgdb_active) != -1) {
> > +		kgdb_nmicallback(cpu, regs);
> > +		return true;
> > +	}
> > +
> > +	return false;
> > +}
> 
> I *really* don't like this function.
> 
> If the return code of kgdb_nmicallback() is broken then fix it; don't
> just wrap it and invent a new criterion for the return code.
> 
> To be honest, I don't actually think the logic in kgdb_nmicallback() is
> broken. As mentioned above, the return value has a weird definition (0
> for "handled it OK" and 1 for "nothing for me to do"), but the logic
> that calculates the return code looks OK.
> 
> 
> > +
> > +static void kgdb_smp_callback(void *data)
> > +{
> > +	unsigned int cpu = smp_processor_id();
> > +
> > +	if (atomic_read(&kgdb_active) != -1)
> > +		kgdb_nmicallback(cpu, get_irq_regs());
> > +}
> 
> This is unused. I presume it is litter from a previous revision of the
> code and can be deleted?
> 
> 
> > +
> > +bool kgdb_arch_roundup_cpus(void)
> > +{
> > +	struct cpumask mask;
> > +
> > +	if (!arm64_supports_nmi())
> > +		return false;
> > +
> > +	cpumask_copy(&mask, cpu_online_mask);
> > +	cpumask_clear_cpu(raw_smp_processor_id(), &mask);
> > +	if (cpumask_empty(&mask))
> > +		return false;
> 
> Why do we need to fall back if there is no work to do? There will still
> be no work to do when we call the fallback.
> 
> 
> Daniel.
Sumit Garg Nov. 2, 2020, 6:59 a.m. UTC | #3
On Thu, 29 Oct 2020 at 22:09, Daniel Thompson
<daniel.thompson@linaro.org> wrote:
>
> On Thu, Oct 29, 2020 at 04:22:34PM +0000, Daniel Thompson wrote:
> > On Thu, Oct 29, 2020 at 08:26:27PM +0530, Sumit Garg wrote:
> > > arm64 platforms with GICv3 or later support pseudo NMIs, which can be
> > > leveraged to round up CPUs that are stuck in a hard lockup state with
> > > interrupts disabled, something that isn't possible with a normal IPI.
> > >
> > > So switch to rounding up CPUs using an IPI turned into an NMI. In case
> > > a particular arm64 platform doesn't support pseudo NMIs, it will fall
> > > back to the default kgdb CPU round-up mechanism.
> > >
> > > Signed-off-by: Sumit Garg <sumit.garg@linaro.org>
> > > ---
> > >  arch/arm64/include/asm/kgdb.h |  9 +++++++++
> > >  arch/arm64/kernel/ipi_nmi.c   |  5 +++++
> > >  arch/arm64/kernel/kgdb.c      | 35 +++++++++++++++++++++++++++++++++++
> > >  3 files changed, 49 insertions(+)
> > >
> > > diff --git a/arch/arm64/include/asm/kgdb.h b/arch/arm64/include/asm/kgdb.h
> > > index 21fc85e..c3d2425 100644
> > > --- a/arch/arm64/include/asm/kgdb.h
> > > +++ b/arch/arm64/include/asm/kgdb.h
> > > @@ -24,6 +24,15 @@ static inline void arch_kgdb_breakpoint(void)
> > >  extern void kgdb_handle_bus_error(void);
> > >  extern int kgdb_fault_expected;
> > >
> > > +#ifdef CONFIG_KGDB
> > > +extern bool kgdb_ipi_nmicallback(int cpu, void *regs);
> > > +#else
> > > +static inline bool kgdb_ipi_nmicallback(int cpu, void *regs)
> > > +{
> > > +   return false;
> > > +}
> > > +#endif
> > > +
> > >  #endif /* !__ASSEMBLY__ */
> > >
> > >  /*
> > > diff --git a/arch/arm64/kernel/ipi_nmi.c b/arch/arm64/kernel/ipi_nmi.c
> > > index 597dcf7..6ace182 100644
> > > --- a/arch/arm64/kernel/ipi_nmi.c
> > > +++ b/arch/arm64/kernel/ipi_nmi.c
> > > @@ -8,6 +8,7 @@
> > >
> > >  #include <linux/interrupt.h>
> > >  #include <linux/irq.h>
> > > +#include <linux/kgdb.h>
> > >  #include <linux/nmi.h>
> > >  #include <linux/smp.h>
> > >
> > > @@ -45,10 +46,14 @@ bool arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
> > >  static irqreturn_t ipi_nmi_handler(int irq, void *data)
> > >  {
> > >     irqreturn_t ret = IRQ_NONE;
> > > +   unsigned int cpu = smp_processor_id();
> > >
> > >     if (nmi_cpu_backtrace(get_irq_regs()))
> > >             ret = IRQ_HANDLED;
> > >
> > > +   if (kgdb_ipi_nmicallback(cpu, get_irq_regs()))
> > > +           ret = IRQ_HANDLED;
> > > +
> > >     return ret;
> >
> > It would be better to declare the existing return value of
> > kgdb_nmicallback() to be dangerously stupid and fix it so that it
> > returns an irqreturn_t (that's easy since most callers do not need to
> > check the return value).
> >
> > Then this code simply becomes:
> >
> >       return kgdb_nmicallback(cpu, get_irq_regs());
>
> Actually, reflecting on this, maybe it is better to keep kgdb_nmicallin()
> and kgdb_nmicallback() aligned w.r.t. return codes (even if they are a
> little unusual).
>
> I'm still not sure why we'd keep kgdb_ipi_nmicallback() though.
> kgdb_nmicallback() is intended to be called from arch code...
>

I added kgdb_ipi_nmicallback() just to add a check for "kgdb_active"
prior to entering kgdb, since here we are sharing the NMI between the
backtrace and kgdb handlers.

But after your comments, I looked carefully at kgdb_nmicallback() and
I see it already has the "raw_spin_is_locked(&dbg_master_lock)" check
as well. So it looks sufficient to call kgdb_nmicallback() directly
from the arch code, and hence I will remove kgdb_ipi_nmicallback() in
the next version.
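
Roughly, what I have in mind for the next version is a sketch along
these lines (relying on the existing return convention, where 0 means
this CPU entered the debugger; the final shape may still change):

	static irqreturn_t ipi_nmi_handler(int irq, void *data)
	{
		irqreturn_t ret = IRQ_NONE;
		unsigned int cpu = smp_processor_id();

		if (nmi_cpu_backtrace(get_irq_regs()))
			ret = IRQ_HANDLED;

		/* 0 == this CPU entered the debugger, 1 == nothing to do. */
		if (!kgdb_nmicallback(cpu, get_irq_regs()))
			ret = IRQ_HANDLED;

		return ret;
	}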

>
> Daniel.
>
>
> >
> >
> > >  }
> > >
> > > diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
> > > index 1a157ca3..c26e710 100644
> > > --- a/arch/arm64/kernel/kgdb.c
> > > +++ b/arch/arm64/kernel/kgdb.c
> > > @@ -17,6 +17,7 @@
> > >
> > >  #include <asm/debug-monitors.h>
> > >  #include <asm/insn.h>
> > > +#include <asm/nmi.h>
> > >  #include <asm/traps.h>
> > >
> > >  struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
> > > @@ -353,3 +354,37 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
> > >     return aarch64_insn_write((void *)bpt->bpt_addr,
> > >                     *(u32 *)bpt->saved_instr);
> > >  }
> > > +
> > > +bool kgdb_ipi_nmicallback(int cpu, void *regs)
> > > +{
> > > +   if (atomic_read(&kgdb_active) != -1) {
> > > +           kgdb_nmicallback(cpu, regs);
> > > +           return true;
> > > +   }
> > > +
> > > +   return false;
> > > +}
> >
> > I *really* don't like this function.
> >
> > If the return code of kgdb_nmicallback() is broken then fix it; don't
> > just wrap it and invent a new criterion for the return code.
> >
> > To be honest, I don't actually think the logic in kgdb_nmicallback() is
> > broken. As mentioned above, the return value has a weird definition (0
> > for "handled it OK" and 1 for "nothing for me to do"), but the logic
> > that calculates the return code looks OK.
> >

Makes sense, will remove it instead.

> >
> > > +
> > > +static void kgdb_smp_callback(void *data)
> > > +{
> > > +   unsigned int cpu = smp_processor_id();
> > > +
> > > +   if (atomic_read(&kgdb_active) != -1)
> > > +           kgdb_nmicallback(cpu, get_irq_regs());
> > > +}
> >
> > This is unused. I presume it is litter from a previous revision of the
> > code and can be deleted?
> >

Yeah.

> >
> > > +
> > > +bool kgdb_arch_roundup_cpus(void)
> > > +{
> > > +   struct cpumask mask;
> > > +
> > > +   if (!arm64_supports_nmi())
> > > +           return false;
> > > +
> > > +   cpumask_copy(&mask, cpu_online_mask);
> > > +   cpumask_clear_cpu(raw_smp_processor_id(), &mask);
> > > +   if (cpumask_empty(&mask))
> > > +           return false;
> >
> > Why do we need to fall back if there is no work to do? There will still
> > be no work to do when we call the fallback.

Okay, I won't switch back to the fallback mechanism here.
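
With that, kgdb_arch_roundup_cpus() would roughly become the following
(sketch only: it claims the roundup whenever pseudo NMIs are supported,
even if there happen to be no other online CPUs):

	bool kgdb_arch_roundup_cpus(void)
	{
		struct cpumask mask;

		if (!arm64_supports_nmi())
			return false;

		cpumask_copy(&mask, cpu_online_mask);
		cpumask_clear_cpu(raw_smp_processor_id(), &mask);

		/* Nothing to round up, but no need to fall back either. */
		if (!cpumask_empty(&mask))
			arm64_send_nmi(&mask);

		return true;
	}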

-Sumit

> >
> >
> > Daniel.

Patch

diff --git a/arch/arm64/include/asm/kgdb.h b/arch/arm64/include/asm/kgdb.h
index 21fc85e..c3d2425 100644
--- a/arch/arm64/include/asm/kgdb.h
+++ b/arch/arm64/include/asm/kgdb.h
@@ -24,6 +24,15 @@  static inline void arch_kgdb_breakpoint(void)
 extern void kgdb_handle_bus_error(void);
 extern int kgdb_fault_expected;
 
+#ifdef CONFIG_KGDB
+extern bool kgdb_ipi_nmicallback(int cpu, void *regs);
+#else
+static inline bool kgdb_ipi_nmicallback(int cpu, void *regs)
+{
+	return false;
+}
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 /*
diff --git a/arch/arm64/kernel/ipi_nmi.c b/arch/arm64/kernel/ipi_nmi.c
index 597dcf7..6ace182 100644
--- a/arch/arm64/kernel/ipi_nmi.c
+++ b/arch/arm64/kernel/ipi_nmi.c
@@ -8,6 +8,7 @@ 
 
 #include <linux/interrupt.h>
 #include <linux/irq.h>
+#include <linux/kgdb.h>
 #include <linux/nmi.h>
 #include <linux/smp.h>
 
@@ -45,10 +46,14 @@  bool arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
 static irqreturn_t ipi_nmi_handler(int irq, void *data)
 {
 	irqreturn_t ret = IRQ_NONE;
+	unsigned int cpu = smp_processor_id();
 
 	if (nmi_cpu_backtrace(get_irq_regs()))
 		ret = IRQ_HANDLED;
 
+	if (kgdb_ipi_nmicallback(cpu, get_irq_regs()))
+		ret = IRQ_HANDLED;
+
 	return ret;
 }
 
diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
index 1a157ca3..c26e710 100644
--- a/arch/arm64/kernel/kgdb.c
+++ b/arch/arm64/kernel/kgdb.c
@@ -17,6 +17,7 @@ 
 
 #include <asm/debug-monitors.h>
 #include <asm/insn.h>
+#include <asm/nmi.h>
 #include <asm/traps.h>
 
 struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
@@ -353,3 +354,37 @@  int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
 	return aarch64_insn_write((void *)bpt->bpt_addr,
 			*(u32 *)bpt->saved_instr);
 }
+
+bool kgdb_ipi_nmicallback(int cpu, void *regs)
+{
+	if (atomic_read(&kgdb_active) != -1) {
+		kgdb_nmicallback(cpu, regs);
+		return true;
+	}
+
+	return false;
+}
+
+static void kgdb_smp_callback(void *data)
+{
+	unsigned int cpu = smp_processor_id();
+
+	if (atomic_read(&kgdb_active) != -1)
+		kgdb_nmicallback(cpu, get_irq_regs());
+}
+
+bool kgdb_arch_roundup_cpus(void)
+{
+	struct cpumask mask;
+
+	if (!arm64_supports_nmi())
+		return false;
+
+	cpumask_copy(&mask, cpu_online_mask);
+	cpumask_clear_cpu(raw_smp_processor_id(), &mask);
+	if (cpumask_empty(&mask))
+		return false;
+
+	arm64_send_nmi(&mask);
+	return true;
+}