diff mbox

[1/2] ACPI, APEI, GHES: Remove strict check for memory error handling

Message ID 1385099825-31765-1-git-send-email-gong.chen@linux.intel.com (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Chen Gong Nov. 22, 2013, 5:57 a.m. UTC
Usually SCI is employed to handle corrected errors, especially
corrected memory errors, but in fact SCI can also be used
to handle any error, such as an uncorrected memory error, if the
BIOS enables it. In this situation, uncorrected memory errors
should be logged just as corrected errors are.

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
---
 arch/x86/include/asm/mce.h            | 3 +--
 arch/x86/kernel/cpu/mcheck/mce-apei.c | 6 ++----
 drivers/acpi/apei/ghes.c              | 3 +--
 3 files changed, 4 insertions(+), 8 deletions(-)

Comments

Naveen N. Rao Nov. 25, 2013, 6:08 a.m. UTC | #1
On 2013/11/22 12:57AM, Chen Gong wrote:
> Usually SCI is employed to handle corrected error, especially
> for memory corrected error but in fact SCI still can be used
> to handle any error like memory uncorrected error if BIOS
> enable it. For this situation, memory uncorrected error
> should be logged as corrected error does, too.
> 
> Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
> ---
>  arch/x86/include/asm/mce.h            | 3 +--
>  arch/x86/kernel/cpu/mcheck/mce-apei.c | 6 ++----
>  drivers/acpi/apei/ghes.c              | 3 +--
>  3 files changed, 4 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> index cbe6b9e..94b263f 100644
> --- a/arch/x86/include/asm/mce.h
> +++ b/arch/x86/include/asm/mce.h
> @@ -244,7 +244,6 @@ static inline void mcheck_intel_therm_init(void) { }
>   */
>  
>  struct cper_sec_mem_err;
> -extern void apei_mce_report_mem_error(int corrected,
> -				      struct cper_sec_mem_err *mem_err);
> +extern void apei_mce_report_mem_error(struct cper_sec_mem_err *mem_err);
>  
>  #endif /* _ASM_X86_MCE_H */
> diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
> index cd8b166..f09da48 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
> @@ -37,13 +37,11 @@
>  
>  #include "mce-internal.h"
>  
> -void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
> +void apei_mce_report_mem_error(struct cper_sec_mem_err *mem_err)
>  {
>  	struct mce m;
>  
> -	/* Only corrected MC is reported */
> -	if (!corrected || !(mem_err->validation_bits &
> -				CPER_MEM_VALID_PHYSICAL_ADDRESS))
> +	if (!(mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS))
>  		return;

This won't be enough. Further down, you'll see that all memory errors
get logged as corrected errors due to the hardcoded MCE status. A lot
more work will be needed if we have to log GHES errors through mcelog
properly.

- Naveen

>  
>  	mce_setup(&m);
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 8ec37bb..039c23c 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -453,8 +453,7 @@ static void ghes_do_proc(struct ghes *ghes,
>  			ghes_edac_report_mem_error(ghes, sev, mem_err);
>  
>  #ifdef CONFIG_X86_MCE
> -			apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
> -						  mem_err);
> +			apei_mce_report_mem_error(mem_err);
>  #endif
>  			ghes_handle_memory_failure(gdata, sev);
>  		}
> -- 
> 1.8.4.rc3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chen Gong Nov. 25, 2013, 6:26 a.m. UTC | #2
On Mon, Nov 25, 2013 at 11:38:23AM +0530, Naveen N. Rao wrote:
> Date: Mon, 25 Nov 2013 11:38:23 +0530
> From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
> To: "Chen, Gong" <gong.chen@linux.intel.com>
> Cc: tony.luck@intel.com, bp@alien8.de, linux-acpi@vger.kernel.org
> Subject: Re: [PATCH 1/2] ACPI, APEI, GHES: Remove strict check for memory
>  error handling
> User-Agent: Mutt/1.5.21 (2010-09-15)
> 
> On 2013/11/22 12:57AM, Chen Gong wrote:
> > Usually SCI is employed to handle corrected error, especially
> > for memory corrected error but in fact SCI still can be used
> > to handle any error like memory uncorrected error if BIOS
> > enable it. For this situation, memory uncorrected error
> > should be logged as corrected error does, too.
> > 
> > Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
> > ---
> >  arch/x86/include/asm/mce.h            | 3 +--
> >  arch/x86/kernel/cpu/mcheck/mce-apei.c | 6 ++----
> >  drivers/acpi/apei/ghes.c              | 3 +--
> >  3 files changed, 4 insertions(+), 8 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> > index cbe6b9e..94b263f 100644
> > --- a/arch/x86/include/asm/mce.h
> > +++ b/arch/x86/include/asm/mce.h
> > @@ -244,7 +244,6 @@ static inline void mcheck_intel_therm_init(void) { }
> >   */
> >  
> >  struct cper_sec_mem_err;
> > -extern void apei_mce_report_mem_error(int corrected,
> > -				      struct cper_sec_mem_err *mem_err);
> > +extern void apei_mce_report_mem_error(struct cper_sec_mem_err *mem_err);
> >  
> >  #endif /* _ASM_X86_MCE_H */
> > diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
> > index cd8b166..f09da48 100644
> > --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
> > +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
> > @@ -37,13 +37,11 @@
> >  
> >  #include "mce-internal.h"
> >  
> > -void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
> > +void apei_mce_report_mem_error(struct cper_sec_mem_err *mem_err)
> >  {
> >  	struct mce m;
> >  
> > -	/* Only corrected MC is reported */
> > -	if (!corrected || !(mem_err->validation_bits &
> > -				CPER_MEM_VALID_PHYSICAL_ADDRESS))
> > +	if (!(mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS))
> >  		return;
> 
> This won't be enough. Further down, you'll see that all memory errors
> get logged as corrected errors due to the hardcoded MCE status. A lot
> more work will be needed if we have to log GHES errors through mcelog
> properly.
> 
> - Naveen
> 
Sure. In fact, the most valuable information from CPER is the physical
address; most of the other data are faked or of little value. But
at least we can make it more precise.
diff mbox

Patch

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index cbe6b9e..94b263f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -244,7 +244,6 @@  static inline void mcheck_intel_therm_init(void) { }
  */
 
 struct cper_sec_mem_err;
-extern void apei_mce_report_mem_error(int corrected,
-				      struct cper_sec_mem_err *mem_err);
+extern void apei_mce_report_mem_error(struct cper_sec_mem_err *mem_err);
 
 #endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index cd8b166..f09da48 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -37,13 +37,11 @@ 
 
 #include "mce-internal.h"
 
-void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
+void apei_mce_report_mem_error(struct cper_sec_mem_err *mem_err)
 {
 	struct mce m;
 
-	/* Only corrected MC is reported */
-	if (!corrected || !(mem_err->validation_bits &
-				CPER_MEM_VALID_PHYSICAL_ADDRESS))
+	if (!(mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS))
 		return;
 
 	mce_setup(&m);
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 8ec37bb..039c23c 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -453,8 +453,7 @@  static void ghes_do_proc(struct ghes *ghes,
 			ghes_edac_report_mem_error(ghes, sev, mem_err);
 
 #ifdef CONFIG_X86_MCE
-			apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
-						  mem_err);
+			apei_mce_report_mem_error(mem_err);
 #endif
 			ghes_handle_memory_failure(gdata, sev);
 		}