diff mbox series

ACPI / APEI: restore interrupt before panic in sdei flow

Message ID 20211011151028.105215-1-zhangliguang@linux.alibaba.com (mailing list archive)
State Changes Requested, archived
Headers show
Series ACPI / APEI: restore interrupt before panic in sdei flow | expand

Commit Message

luanshi Oct. 11, 2021, 3:10 p.m. UTC
We use the ACPI_HEST_NOTIFY_SOFTWARE_DELEGATED notification type for RAS events.

In ATF:
ehf_activate_priority()
   dispatch sdei()
ehf_deactivate_priority()

If the RAS error severity is fatal, panic() is called in sdei() and
ehf_deactivate_priority() is never called. We should restore the interrupt
before calling panic(); otherwise kdump will trigger an error.

Signed-off-by: Liguang Zhang <zhangliguang@linux.alibaba.com>
---
 drivers/acpi/apei/ghes.c    | 25 +++++++++++++++++++++----
 drivers/firmware/arm_sdei.c | 14 ++++++++++++++
 include/linux/arm_sdei.h    |  2 ++
 3 files changed, 37 insertions(+), 4 deletions(-)

Comments

Borislav Petkov Oct. 11, 2021, 4:23 p.m. UTC | #1
On Mon, Oct 11, 2021 at 11:10:28PM +0800, Liguang Zhang wrote:
> We use ACPI_HEST_NOTIFY_SOFTWARE_DELEGATED Notify type for ras event.
> 
> In ATF:
> ehf_activate_priority()
>    dispatch sdei()
> ehf_deactivate_priority()
> 
> If ras error severity is fatal, panic was called in sdei(),
> ehf_deactivate_priority was not called. we should restore interrupt before panic
> otherwise kdump will trigger error.

I have *absolutely* no clue what this commit message is trying to tell
me - sorry, you'll have to try again. Maybe structuring it this way would
help:

--
Problem is A.

It happens because of B.

Fix it by doing C.

(Potentially do D).

For more detailed info, see
Documentation/process/submitting-patches.rst, Section "2) Describe your
changes".

Also, to the tone, from Documentation/process/submitting-patches.rst:

 "Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
  instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
  to do frotz", as if you are giving orders to the codebase to change
  its behaviour."

Also, do not talk about what your patch does - that should hopefully be
visible in the diff itself. Rather, talk about *why* you're doing what
you're doing.
kernel test robot Oct. 11, 2021, 11:06 p.m. UTC | #2
Hi Liguang,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on rafael-pm/linux-next]
[also build test WARNING on linux/master linus/master v5.15-rc5 next-20211011]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Liguang-Zhang/ACPI-APEI-restore-interrupt-before-panic-in-sdei-flow/20211011-231126
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git linux-next
config: x86_64-randconfig-a012-20211011 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/abb332bc84212b2ef288bb3e7ad24a04cd4853e6
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Liguang-Zhang/ACPI-APEI-restore-interrupt-before-panic-in-sdei-flow/20211011-231126
        git checkout abb332bc84212b2ef288bb3e7ad24a04cd4853e6
        # save the attached .config to linux build tree
        make W=1 ARCH=x86_64 

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from drivers/acpi/apei/ghes.c:20:
>> include/linux/arm_sdei.h:53:5: warning: no previous prototype for 'sdei_api_event_complete_and_resume' [-Wmissing-prototypes]
      53 | int sdei_api_event_complete_and_resume(u64 addr) { return 0; }
         |     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


vim +/sdei_api_event_complete_and_resume +53 include/linux/arm_sdei.h

    44	
    45	#ifdef CONFIG_ARM_SDE_INTERFACE
    46	/* For use by arch code when CPU hotplug notifiers are not appropriate. */
    47	int sdei_mask_local_cpu(void);
    48	int sdei_unmask_local_cpu(void);
    49	int sdei_api_event_complete_and_resume(u64 addr);
    50	#else
    51	static inline int sdei_mask_local_cpu(void) { return 0; }
    52	static inline int sdei_unmask_local_cpu(void) { return 0; }
  > 53	int sdei_api_event_complete_and_resume(u64 addr) { return 0; }
    54	#endif /* CONFIG_ARM_SDE_INTERFACE */
    55	
    56	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
diff mbox series

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 0c8330ed1ffd..4f734c60987c 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -141,6 +141,7 @@  static unsigned long ghes_estatus_pool_size_request;
 static struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE];
 static atomic_t ghes_estatus_cache_alloced;
 
+static bool ghes_sdei_callback;
 static int ghes_panic_timeout __read_mostly = 30;
 
 static void __iomem *ghes_map(u64 pfn, enum fixed_addresses fixmap_idx)
@@ -837,18 +838,30 @@  static void ghes_estatus_cache_add(
 	rcu_read_unlock();
 }
 
+static void sdei_api_restore_ras(void)
+{
+	/* reboot to log the error! */
+	if (!panic_timeout)
+		panic_timeout = ghes_panic_timeout;
+	panic("Fatal hardware error!");
+}
+
 static void __ghes_panic(struct ghes *ghes,
 			 struct acpi_hest_generic_status *estatus,
 			 u64 buf_paddr, enum fixed_addresses fixmap_idx)
 {
+	int err;
+
 	__ghes_print_estatus(KERN_EMERG, ghes->generic, estatus);
 
 	ghes_clear_estatus(ghes, estatus, buf_paddr, fixmap_idx);
 
-	/* reboot to log the error! */
-	if (!panic_timeout)
-		panic_timeout = ghes_panic_timeout;
-	panic("Fatal hardware error!");
+	if (ghes_sdei_callback) {
+		err = sdei_api_event_complete_and_resume((unsigned long)sdei_api_restore_ras);
+		if (err)
+			sdei_api_restore_ras();
+	} else
+		sdei_api_restore_ras();
 }
 
 static int ghes_proc(struct ghes *ghes)
@@ -1224,7 +1237,9 @@  static int ghes_sdei_normal_callback(u32 event_num, struct pt_regs *regs,
 	int err;
 
 	raw_spin_lock(&ghes_notify_lock_sdei_normal);
+	ghes_sdei_callback = true;
 	err = __ghes_sdei_callback(ghes, FIX_APEI_GHES_SDEI_NORMAL);
+	ghes_sdei_callback = false;
 	raw_spin_unlock(&ghes_notify_lock_sdei_normal);
 
 	return err;
@@ -1238,7 +1253,9 @@  static int ghes_sdei_critical_callback(u32 event_num, struct pt_regs *regs,
 	int err;
 
 	raw_spin_lock(&ghes_notify_lock_sdei_critical);
+	ghes_sdei_callback = true;
 	err = __ghes_sdei_callback(ghes, FIX_APEI_GHES_SDEI_CRITICAL);
+	ghes_sdei_callback = false;
 	raw_spin_unlock(&ghes_notify_lock_sdei_critical);
 
 	return err;
diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
index a7e762c352f9..1af6b6b55c57 100644
--- a/drivers/firmware/arm_sdei.c
+++ b/drivers/firmware/arm_sdei.c
@@ -473,6 +473,20 @@  static int sdei_api_event_unregister(u32 event_num)
 			      0, 0, 0, NULL);
 }
 
+int sdei_api_event_complete_and_resume(u64 addr)
+{
+	int err;
+
+	err = invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME, addr,
+			      0, 0, 0, 0, NULL);
+	if (err && err != -EIO) {
+		pr_warn_once("failed to restore CPU[%u]: %d\n", smp_processor_id(), err);
+		return err;
+	}
+
+	return 0;
+}
+
 /* Called directly by the hotplug callbacks */
 static void _local_event_unregister(void *data)
 {
diff --git a/include/linux/arm_sdei.h b/include/linux/arm_sdei.h
index 0a241c5c911d..b6d347085834 100644
--- a/include/linux/arm_sdei.h
+++ b/include/linux/arm_sdei.h
@@ -46,9 +46,11 @@  int sdei_unregister_ghes(struct ghes *ghes);
 /* For use by arch code when CPU hotplug notifiers are not appropriate. */
 int sdei_mask_local_cpu(void);
 int sdei_unmask_local_cpu(void);
+int sdei_api_event_complete_and_resume(u64 addr);
 #else
 static inline int sdei_mask_local_cpu(void) { return 0; }
 static inline int sdei_unmask_local_cpu(void) { return 0; }
+static inline int sdei_api_event_complete_and_resume(u64 addr) { return 0; }
 #endif /* CONFIG_ARM_SDE_INTERFACE */