diff mbox series

[v4] arm64: sdei: abort running SDEI handlers during crash

Message ID 20230625234033.672594-1-scott@os.amperecomputing.com (mailing list archive)
State New, archived
Headers show
Series [v4] arm64: sdei: abort running SDEI handlers during crash | expand

Commit Message

D Scott Phillips June 25, 2023, 11:40 p.m. UTC
Interrupts are blocked in SDEI context, per the SDEI spec: "The client
interrupts cannot preempt the event handler." If we crashed in the SDEI
handler-running context (as with ACPI's AGDI) then we need to clean up the
SDEI state before proceeding to the crash kernel so that the crash kernel
can have working interrupts.

Track the active SDEI handler per-cpu so that we can COMPLETE_AND_RESUME
the handler, discarding the interrupted context.

Fixes: f5df26961853 ("arm64: kernel: Add arch-specific SDEI entry code and CPU masking")
Signed-off-by: D Scott Phillips <scott@os.amperecomputing.com>
Reviewed-by: James Morse <james.morse@arm.com>
Cc: stable@vger.kernel.org
---
 Changes since v3:
 - Fixed messed up #ifdef logic in entry.S
 - Moved sdei_handler_abort() logic from smp.c to sdei.c
 v3 Link: https://lore.kernel.org/linux-arm-kernel/20230607195546.2896-1-scott@os.amperecomputing.com/

 Changes since v2:
 - Dropped the patch fiddling with the sdei conduit.
 v2 Link: https://lore.kernel.org/linux-arm-kernel/20230329202519.6110-1-scott@os.amperecomputing.com/
 
 Changes since v1:
 - Store the active SDEI event being handled per-cpu, use the per-cpu active
   handler information to know when to abort.
 - Add prints before attempting to abort sdei handlers.
 v1 Link: https://lore.kernel.org/linux-arm-kernel/20230204000851.3871-1-scott@os.amperecomputing.com/

 arch/arm64/include/asm/sdei.h | 11 +++++++++++
 arch/arm64/kernel/entry.S     | 27 +++++++++++++++++++++++++--
 arch/arm64/kernel/sdei.c      | 22 ++++++++++++++++++++++
 arch/arm64/kernel/smp.c       |  8 ++++----
 4 files changed, 62 insertions(+), 6 deletions(-)

Comments

kernel test robot June 26, 2023, 2:01 a.m. UTC | #1
Hi Scott,

kernel test robot noticed the following build errors:

[auto build test ERROR on arm64/for-next/core]
[also build test ERROR on arm/for-next arm/fixes kvmarm/next soc/for-next linus/master v6.4 next-20230623]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/D-Scott-Phillips/arm64-sdei-abort-running-SDEI-handlers-during-crash/20230626-074210
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
patch link:    https://lore.kernel.org/r/20230625234033.672594-1-scott%40os.amperecomputing.com
patch subject: [PATCH v4] arm64: sdei: abort running SDEI handlers during crash
config: arm64-defconfig (https://download.01.org/0day-ci/archive/20230626/202306260925.6Qm42hTs-lkp@intel.com/config)
compiler: aarch64-linux-gcc (GCC) 12.3.0
reproduce: (https://download.01.org/0day-ci/archive/20230626/202306260925.6Qm42hTs-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202306260925.6Qm42hTs-lkp@intel.com/

All errors (new ones prefixed by >>):

   arch/arm64/kernel/smp.c: In function 'crash_smp_send_stop':
>> arch/arm64/kernel/smp.c:1073:9: error: implicit declaration of function 'sdei_handler_abort'; did you mean 'acpi_handle_alert'? [-Werror=implicit-function-declaration]
    1073 |         sdei_handler_abort();
         |         ^~~~~~~~~~~~~~~~~~
         |         acpi_handle_alert
   cc1: some warnings being treated as errors


vim +1073 arch/arm64/kernel/smp.c

  1030	
  1031	#ifdef CONFIG_KEXEC_CORE
  1032	void crash_smp_send_stop(void)
  1033	{
  1034		static int cpus_stopped;
  1035		cpumask_t mask;
  1036		unsigned long timeout;
  1037	
  1038		/*
  1039		 * This function can be called twice in panic path, but obviously
  1040		 * we execute this only once.
  1041		 */
  1042		if (cpus_stopped)
  1043			return;
  1044	
  1045		cpus_stopped = 1;
  1046	
  1047		/*
  1048		 * If this cpu is the only one alive at this point in time, online or
  1049		 * not, there are no stop messages to be sent around, so just back out.
  1050		 */
  1051		if (num_other_online_cpus() == 0)
  1052			goto skip_ipi;
  1053	
  1054		cpumask_copy(&mask, cpu_online_mask);
  1055		cpumask_clear_cpu(smp_processor_id(), &mask);
  1056	
  1057		atomic_set(&waiting_for_crash_ipi, num_other_online_cpus());
  1058	
  1059		pr_crit("SMP: stopping secondary CPUs\n");
  1060		smp_cross_call(&mask, IPI_CPU_CRASH_STOP);
  1061	
  1062		/* Wait up to one second for other CPUs to stop */
  1063		timeout = USEC_PER_SEC;
  1064		while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--)
  1065			udelay(1);
  1066	
  1067		if (atomic_read(&waiting_for_crash_ipi) > 0)
  1068			pr_warn("SMP: failed to stop secondary CPUs %*pbl\n",
  1069				cpumask_pr_args(&mask));
  1070	
  1071	skip_ipi:
  1072		sdei_mask_local_cpu();
> 1073		sdei_handler_abort();
  1074	}
  1075
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h
index 4292d9bafb9d..98786108c493 100644
--- a/arch/arm64/include/asm/sdei.h
+++ b/arch/arm64/include/asm/sdei.h
@@ -17,6 +17,9 @@ 
 
 #include <asm/virt.h>
 
+DECLARE_PER_CPU(struct sdei_registered_event *, sdei_active_normal_event);
+DECLARE_PER_CPU(struct sdei_registered_event *, sdei_active_critical_event);
+
 extern unsigned long sdei_exit_mode;
 
 /* Software Delegated Exception entry point from firmware*/
@@ -29,6 +32,14 @@  asmlinkage void __sdei_asm_entry_trampoline(unsigned long event_num,
 						   unsigned long pc,
 						   unsigned long pstate);
 
+#ifdef CONFIG_ARM_SDE_INTERFACE
+/* Abort a running handler. Context is discarded. */
+void sdei_handler_abort(void);
+void __sdei_handler_abort(void);
+#else
+static inline void sdei_handler_abort(void) { }
+#endif
+
 /*
  * The above entry point does the minimum to call C code. This function does
  * anything else, before calling the driver.
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index ab2a6e33c052..1b4a65a33186 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -1003,9 +1003,13 @@  SYM_CODE_START(__sdei_asm_handler)
 
 	mov	x19, x1
 
-#if defined(CONFIG_VMAP_STACK) || defined(CONFIG_SHADOW_CALL_STACK)
+	/* Store the registered-event for crash_smp_send_stop() */
 	ldrb	w4, [x19, #SDEI_EVENT_PRIORITY]
-#endif
+	cbnz	w4, 1f
+	adr_this_cpu dst=x5, sym=sdei_active_normal_event, tmp=x6
+	b	2f
+1:	adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6
+2:	str	x19, [x5]
 
 #ifdef CONFIG_VMAP_STACK
 	/*
@@ -1072,6 +1076,14 @@  SYM_CODE_START(__sdei_asm_handler)
 
 	ldr_l	x2, sdei_exit_mode
 
+	/* Clear the registered-event seen by crash_smp_send_stop() */
+	ldrb	w3, [x4, #SDEI_EVENT_PRIORITY]
+	cbnz	w3, 1f
+	adr_this_cpu dst=x5, sym=sdei_active_normal_event, tmp=x6
+	b	2f
+1:	adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6
+2:	str	xzr, [x5]
+
 alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0
 	sdei_handler_exit exit_mode=x2
 alternative_else_nop_endif
@@ -1082,4 +1094,15 @@  alternative_else_nop_endif
 #endif
 SYM_CODE_END(__sdei_asm_handler)
 NOKPROBE(__sdei_asm_handler)
+
+SYM_CODE_START(__sdei_handler_abort)
+	mov_q	x0, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME
+	adr	x1, 1f
+	ldr_l	x2, sdei_exit_mode
+	sdei_handler_exit exit_mode=x2
+	// exit the handler and jump to the next instruction.
+	// Exit will stomp x0-x17, PSTATE, ELR_ELx, and SPSR_ELx.
+1:	ret
+SYM_CODE_END(__sdei_handler_abort)
+NOKPROBE(__sdei_handler_abort)
 #endif /* CONFIG_ARM_SDE_INTERFACE */
diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c
index 830be01af32d..fcd418af386e 100644
--- a/arch/arm64/kernel/sdei.c
+++ b/arch/arm64/kernel/sdei.c
@@ -47,6 +47,9 @@  DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr);
 DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr);
 #endif
 
+DEFINE_PER_CPU(struct sdei_registered_event *, sdei_active_normal_event);
+DEFINE_PER_CPU(struct sdei_registered_event *, sdei_active_critical_event);
+
 static void _free_sdei_stack(unsigned long * __percpu *ptr, int cpu)
 {
 	unsigned long *p;
@@ -262,3 +265,22 @@  unsigned long __kprobes do_sdei_event(struct pt_regs *regs,
 
 	return vbar + 0x480;
 }
+
+void sdei_handler_abort(void)
+{
+	/*
+	 * If the crash happened in an SDEI event handler then we need to
+	 * finish the handler with the firmware so that we can have working
+	 * interrupts in the crash kernel.
+	 */
+	if (__this_cpu_read(sdei_active_critical_event)) {
+		pr_warn("SDEI: still in SDEI critical event context, attempting to finish handler.\n");
+		__sdei_handler_abort();
+		__this_cpu_write(sdei_active_critical_event, NULL);
+	}
+	if (__this_cpu_read(sdei_active_normal_event)) {
+		pr_warn("SDEI: still in SDEI normal event context, attempting to finish handler.\n");
+		__sdei_handler_abort();
+		__this_cpu_write(sdei_active_normal_event, NULL);
+	}
+}
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index d00d4cbb31b1..c6b882e589e6 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -1048,10 +1048,8 @@  void crash_smp_send_stop(void)
 	 * If this cpu is the only one alive at this point in time, online or
 	 * not, there are no stop messages to be sent around, so just back out.
 	 */
-	if (num_other_online_cpus() == 0) {
-		sdei_mask_local_cpu();
-		return;
-	}
+	if (num_other_online_cpus() == 0)
+		goto skip_ipi;
 
 	cpumask_copy(&mask, cpu_online_mask);
 	cpumask_clear_cpu(smp_processor_id(), &mask);
@@ -1070,7 +1068,9 @@  void crash_smp_send_stop(void)
 		pr_warn("SMP: failed to stop secondary CPUs %*pbl\n",
 			cpumask_pr_args(&mask));
 
+skip_ipi:
 	sdei_mask_local_cpu();
+	sdei_handler_abort();
 }
 
 bool smp_crash_stop_failed(void)