diff mbox

[v2,1/2] ACPI, APEI, GHES: Remove strict check for memory error handling

Message ID 1385363701-12387-1-git-send-email-gong.chen@linux.intel.com (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Chen Gong Nov. 25, 2013, 7:15 a.m. UTC
Usually SCI is employed to handle corrected errors, especially
memory corrected errors, but in fact SCI can still be used to
handle any error, such as memory uncorrected errors or even fatal
errors, if the BIOS enables it. In this kind of situation, the
error should be logged, too.

v1 -> v2: make the event record more precise

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-apei.c | 10 +++++++---
 drivers/acpi/apei/ghes.c              |  3 +--
 2 files changed, 8 insertions(+), 5 deletions(-)

Comments

Borislav Petkov Nov. 25, 2013, 5:13 p.m. UTC | #1
On Mon, Nov 25, 2013 at 02:15:00AM -0500, Chen, Gong wrote:
> Usually SCI is employed to handle corrected error, especially
> for memory corrected error but in fact SCI still can be used
> to handle any error like memory uncorrected error even fatal
> error if BIOS enable it. For this kind of situation, it
> should be logged, too.
> 
> v2 -> v1: make the event record more precisely
> 
> Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>

Looks ok to me.

Acked-by: Borislav Petkov <bp@suse.de>
Naveen N. Rao Nov. 26, 2013, 9:02 a.m. UTC | #2
On 11/25/2013 12:45 PM, Chen, Gong wrote:
> Usually SCI is employed to handle corrected error, especially
> for memory corrected error but in fact SCI still can be used
> to handle any error like memory uncorrected error even fatal
> error if BIOS enable it. For this kind of situation, it
> should be logged, too.
>
> v2 -> v1: make the event record more precisely
>
> Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
> ---
>   arch/x86/kernel/cpu/mcheck/mce-apei.c | 10 +++++++---
>   drivers/acpi/apei/ghes.c              |  3 +--
>   2 files changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
> index de8b60a..d137ab8 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
> @@ -33,6 +33,7 @@
>   #include <linux/acpi.h>
>   #include <linux/cper.h>
>   #include <acpi/apei.h>
> +#include <acpi/ghes.h>
>   #include <asm/mce.h>
>
>   #include "mce-internal.h"
> @@ -41,14 +42,17 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
>   {
>   	struct mce m;
>
> -	/* Only corrected MC is reported */
> -	if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA))
> +	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
>   		return;
>
>   	mce_setup(&m);
>   	m.bank = 1;
> -	/* Fake a memory read corrected error with unknown channel */
> +	/* Fake a memory read error with unknown channel */
>   	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
> +	if (corrected >= GHES_SEV_RECOVERABLE)
> +		m.status |= MCI_STATUS_UC;
> +	if (corrected >= GHES_SEV_PANIC)
> +		m.status |= MCI_STATUS_PCC;

Hmm... so you only fill up the most basic information from the cper 
record. In the absence of 'S', 'AR' bits, I am not sure how useful this 
is - except for logging the error through /dev/mcelog for legacy users. 
If that is the intent, you have my

Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>


- Naveen

>   	m.addr = mem_err->physical_addr;
>   	mce_log(&m);
>   	mce_notify_irq();
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index a30bc31..ce3683d 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -453,8 +453,7 @@ static void ghes_do_proc(struct ghes *ghes,
>   			ghes_edac_report_mem_error(ghes, sev, mem_err);
>
>   #ifdef CONFIG_X86_MCE
> -			apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
> -						  mem_err);
> +			apei_mce_report_mem_error(sev, mem_err);
>   #endif
>   			ghes_handle_memory_failure(gdata, sev);
>   		}
>

--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chen Gong Nov. 26, 2013, 9:31 a.m. UTC | #3
On Tue, Nov 26, 2013 at 02:32:53PM +0530, Naveen N. Rao wrote:
> Date: Tue, 26 Nov 2013 14:32:53 +0530
> From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
> To: "Chen, Gong" <gong.chen@linux.intel.com>, tony.luck@intel.com,
>  bp@alien8.de
> CC: linux-acpi@vger.kernel.org
> Subject: Re: [PATCH v2 1/2] ACPI, APEI, GHES: Remove strict check for
>  memory error handling
> User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101
>  Thunderbird/24.1.0
> 
> On 11/25/2013 12:45 PM, Chen, Gong wrote:
> >Usually SCI is employed to handle corrected error, especially
> >for memory corrected error but in fact SCI still can be used
> >to handle any error like memory uncorrected error even fatal
> >error if BIOS enable it. For this kind of situation, it
> >should be logged, too.
> >
> >v2 -> v1: make the event record more precisely
> >
> >Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
> >---
> >  arch/x86/kernel/cpu/mcheck/mce-apei.c | 10 +++++++---
> >  drivers/acpi/apei/ghes.c              |  3 +--
> >  2 files changed, 8 insertions(+), 5 deletions(-)
> >
> >diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
> >index de8b60a..d137ab8 100644
> >--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
> >+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
> >@@ -33,6 +33,7 @@
> >  #include <linux/acpi.h>
> >  #include <linux/cper.h>
> >  #include <acpi/apei.h>
> >+#include <acpi/ghes.h>
> >  #include <asm/mce.h>
> >
> >  #include "mce-internal.h"
> >@@ -41,14 +42,17 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
> >  {
> >  	struct mce m;
> >
> >-	/* Only corrected MC is reported */
> >-	if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA))
> >+	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
> >  		return;
> >
> >  	mce_setup(&m);
> >  	m.bank = 1;
> >-	/* Fake a memory read corrected error with unknown channel */
> >+	/* Fake a memory read error with unknown channel */
> >  	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
> >+	if (corrected >= GHES_SEV_RECOVERABLE)
> >+		m.status |= MCI_STATUS_UC;
> >+	if (corrected >= GHES_SEV_PANIC)
> >+		m.status |= MCI_STATUS_PCC;
> 
> Hmm... so you only fill up the most basic information from the cper
> record. In the absence of 'S', 'AR' bits, I am not sure how useful
> this is - except for logging the error through /dev/mcelog for
> legacy users. If that is the intent, you have my
> 
> Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
> 
> 
> - Naveen
> 

Thanks for your ACK. We would like to record more information, but
UEFI/CPER is essentially unrelated to MCE, so we can't figure out
all the information necessary to construct an MCE record. IOW, we can
only fill in the most valuable information, such as the physical
address, and fake the other fields. From this point of view, this kind
of H/W error event reporting method is still not perfect.
Chen Gong Dec. 14, 2013, 1:42 p.m. UTC | #4
On Tue, Nov 26, 2013 at 04:31:36AM -0500, Chen, Gong wrote:
> Date:	Tue, 26 Nov 2013 04:31:36 -0500
> From: "Chen, Gong" <gong.chen@linux.intel.com>
> To: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
> Cc: tony.luck@intel.com, bp@alien8.de, linux-acpi@vger.kernel.org
> Subject: Re: [PATCH v2 1/2] ACPI, APEI, GHES: Remove strict check for
>  memory error handling
> User-Agent: Mutt/1.5.21 (2010-09-15)
> 
> On Tue, Nov 26, 2013 at 02:32:53PM +0530, Naveen N. Rao wrote:
> > Date: Tue, 26 Nov 2013 14:32:53 +0530
> > From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
> > To: "Chen, Gong" <gong.chen@linux.intel.com>, tony.luck@intel.com,
> >  bp@alien8.de
> > CC: linux-acpi@vger.kernel.org
> > Subject: Re: [PATCH v2 1/2] ACPI, APEI, GHES: Remove strict check for
> >  memory error handling
> > User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101
> >  Thunderbird/24.1.0
> > 
> > On 11/25/2013 12:45 PM, Chen, Gong wrote:
> > >Usually SCI is employed to handle corrected error, especially
> > >for memory corrected error but in fact SCI still can be used
> > >to handle any error like memory uncorrected error even fatal
> > >error if BIOS enable it. For this kind of situation, it
> > >should be logged, too.
> > >
> > >v2 -> v1: make the event record more precisely
> > >
> > >Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
> > >---
> > >  arch/x86/kernel/cpu/mcheck/mce-apei.c | 10 +++++++---
> > >  drivers/acpi/apei/ghes.c              |  3 +--
> > >  2 files changed, 8 insertions(+), 5 deletions(-)
> > >
> > >diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
> > >index de8b60a..d137ab8 100644
> > >--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
> > >+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
> > >@@ -33,6 +33,7 @@
> > >  #include <linux/acpi.h>
> > >  #include <linux/cper.h>
> > >  #include <acpi/apei.h>
> > >+#include <acpi/ghes.h>
> > >  #include <asm/mce.h>
> > >
> > >  #include "mce-internal.h"
> > >@@ -41,14 +42,17 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
> > >  {
> > >  	struct mce m;
> > >
> > >-	/* Only corrected MC is reported */
> > >-	if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA))
> > >+	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
> > >  		return;
> > >
> > >  	mce_setup(&m);
> > >  	m.bank = 1;
> > >-	/* Fake a memory read corrected error with unknown channel */
> > >+	/* Fake a memory read error with unknown channel */
> > >  	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
> > >+	if (corrected >= GHES_SEV_RECOVERABLE)
> > >+		m.status |= MCI_STATUS_UC;
> > >+	if (corrected >= GHES_SEV_PANIC)
> > >+		m.status |= MCI_STATUS_PCC;
> > 
> > Hmm... so you only fill up the most basic information from the cper
> > record. In the absence of 'S', 'AR' bits, I am not sure how useful
> > this is - except for logging the error through /dev/mcelog for
> > legacy users. If that is the intent, you have my
> > 
> > Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
> > 
> > 
> > - Naveen
> > 
> 
> Thanks for your ACK. We want to record more information but you know
> UEFI/CPER is not related to MCE in essentially. So we can't figure
> out all necessary information to construct MCE record. IOW, we can
> just apply the most valuable information like physical address and
> fake other fields. From this point of view, this kind of H/W error
> event report method is still not perfect.

Hi, Boris

Will you pick up this patch in your RAS pull request?
diff mbox

Patch

diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index de8b60a..d137ab8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -33,6 +33,7 @@ 
 #include <linux/acpi.h>
 #include <linux/cper.h>
 #include <acpi/apei.h>
+#include <acpi/ghes.h>
 #include <asm/mce.h>
 
 #include "mce-internal.h"
@@ -41,14 +42,17 @@  void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
 {
 	struct mce m;
 
-	/* Only corrected MC is reported */
-	if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA))
+	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
 		return;
 
 	mce_setup(&m);
 	m.bank = 1;
-	/* Fake a memory read corrected error with unknown channel */
+	/* Fake a memory read error with unknown channel */
 	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+	if (corrected >= GHES_SEV_RECOVERABLE)
+		m.status |= MCI_STATUS_UC;
+	if (corrected >= GHES_SEV_PANIC)
+		m.status |= MCI_STATUS_PCC;
 	m.addr = mem_err->physical_addr;
 	mce_log(&m);
 	mce_notify_irq();
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index a30bc31..ce3683d 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -453,8 +453,7 @@  static void ghes_do_proc(struct ghes *ghes,
 			ghes_edac_report_mem_error(ghes, sev, mem_err);
 
 #ifdef CONFIG_X86_MCE
-			apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
-						  mem_err);
+			apei_mce_report_mem_error(sev, mem_err);
 #endif
 			ghes_handle_memory_failure(gdata, sev);
 		}