diff mbox

[v3,8/9] ACPI, APEI, CPER: Cleanup CPER memory error output format

Message ID 1382084624-10857-9-git-send-email-gong.chen@linux.intel.com (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Chen Gong Oct. 18, 2013, 8:23 a.m. UTC
Keep up only the most important fields for memory error
reporting. The detail information will be moved to perf/trace
interface.

Suggested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
Reviewed-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
---
 drivers/acpi/apei/cper.c | 67 ++++++++++++++++++++++--------------------------
 1 file changed, 31 insertions(+), 36 deletions(-)

Comments

Naveen N. Rao Oct. 18, 2013, 12:01 p.m. UTC | #1
On 10/18/2013 01:53 PM, Chen, Gong wrote:
> Keep up only the most important fields for memory error
> reporting. The detail information will be moved to perf/trace
> interface.
>
> Suggested-by: Tony Luck <tony.luck@intel.com>
> Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
> Reviewed-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
> ---
>   drivers/acpi/apei/cper.c | 67 ++++++++++++++++++++++--------------------------
>   1 file changed, 31 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c
> index b1a8a55..9dd54e1 100644
> --- a/drivers/acpi/apei/cper.c
> +++ b/drivers/acpi/apei/cper.c
> @@ -33,6 +33,7 @@
>   #include <linux/pci.h>
>   #include <linux/aer.h>
>
> +#define INDENT_SP	" "
>   /*
>    * CPER record ID need to be unique even after reboot, because record
>    * ID is used as index for ERST storage, while CPER records from
> @@ -206,29 +207,29 @@ static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
>   		printk("%s""physical_address_mask: 0x%016llx\n",
>   		       pfx, mem->physical_addr_mask);

Can you also change the above address mask to pr_debug(). I don't think 
this is useful at all if set, since we always deal at a page granularity.

>   	if (mem->validation_bits & CPER_MEM_VALID_NODE)
> -		printk("%s""node: %d\n", pfx, mem->node);
> +		pr_debug("node: %d\n", mem->node);
>   	if (mem->validation_bits & CPER_MEM_VALID_CARD)
> -		printk("%s""card: %d\n", pfx, mem->card);
> +		pr_debug("card: %d\n", mem->card);
>   	if (mem->validation_bits & CPER_MEM_VALID_MODULE)
> -		printk("%s""module: %d\n", pfx, mem->module);
> +		pr_debug("module: %d\n", mem->module);
>   	if (mem->validation_bits & CPER_MEM_VALID_RANK_NUMBER)
> -		printk("%s""rank: %d\n", pfx, mem->rank);
> +		pr_debug("rank: %d\n", mem->rank);
>   	if (mem->validation_bits & CPER_MEM_VALID_BANK)
> -		printk("%s""bank: %d\n", pfx, mem->bank);
> +		pr_debug("bank: %d\n", mem->bank);
>   	if (mem->validation_bits & CPER_MEM_VALID_DEVICE)
> -		printk("%s""device: %d\n", pfx, mem->device);
> +		pr_debug("device: %d\n", mem->device);
>   	if (mem->validation_bits & CPER_MEM_VALID_ROW)
> -		printk("%s""row: %d\n", pfx, mem->row);
> +		pr_debug("row: %d\n", mem->row);
>   	if (mem->validation_bits & CPER_MEM_VALID_COLUMN)
> -		printk("%s""column: %d\n", pfx, mem->column);
> +		pr_debug("column: %d\n", mem->column);
>   	if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION)
> -		printk("%s""bit_position: %d\n", pfx, mem->bit_pos);
> +		pr_debug("bit_position: %d\n", mem->bit_pos);
>   	if (mem->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
> -		printk("%s""requestor_id: 0x%016llx\n", pfx, mem->requestor_id);
> +		pr_debug("requestor_id: 0x%016llx\n", mem->requestor_id);
>   	if (mem->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
> -		printk("%s""responder_id: 0x%016llx\n", pfx, mem->responder_id);
> +		pr_debug("responder_id: 0x%016llx\n", mem->responder_id);
>   	if (mem->validation_bits & CPER_MEM_VALID_TARGET_ID)
> -		printk("%s""target_id: 0x%016llx\n", pfx, mem->target_id);
> +		pr_debug("target_id: 0x%016llx\n", mem->target_id);
>   	if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
>   		u8 etype = mem->error_type;
>   		printk("%s""error_type: %d, %s\n", pfx, etype,
> @@ -296,55 +297,45 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
>   	pfx, pcie->bridge.secondary_status, pcie->bridge.control);
>   }
>
> -static const char * const cper_estatus_section_flag_strs[] = {
> -	"primary",
> -	"containment warning",
> -	"reset",
> -	"error threshold exceeded",
> -	"resource not accessible",
> -	"latent error",
> -};
> -
>   static void cper_estatus_print_section(
>   	const char *pfx, const struct acpi_generic_data *gdata, int sec_no)
>   {
>   	uuid_le *sec_type = (uuid_le *)gdata->section_type;
>   	__u16 severity;
> +	char newpfx[64];
>
>   	severity = gdata->error_severity;
> -	printk("%s""section: %d, severity: %d, %s\n", pfx, sec_no, severity,
> +	printk("%s""Error %d, type: %s\n", pfx, sec_no,

Nit: Isn't the original text more appropriate here? We are printing each 
section in the error status block. So, section 0, 1 makes better sense 
for me rather than calling these as errors. Each of these sub-sections 
(if more than one) refer to the same error event per the ACPI spec.

>   	       cper_severity_str(severity));
> -	printk("%s""flags: 0x%02x\n", pfx, gdata->flags);
> -	cper_print_bits(pfx, gdata->flags, cper_estatus_section_flag_strs,
> -			ARRAY_SIZE(cper_estatus_section_flag_strs));
>   	if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
>   		printk("%s""fru_id: %pUl\n", pfx, (uuid_le *)gdata->fru_id);
>   	if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
>   		printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text);
>
> +	snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
>   	if (!uuid_le_cmp(*sec_type, CPER_SEC_PROC_GENERIC)) {
>   		struct cper_sec_proc_generic *proc_err = (void *)(gdata + 1);
> -		printk("%s""section_type: general processor error\n", pfx);
> +		printk("%s""section_type: general processor error\n", newpfx);
>   		if (gdata->error_data_length >= sizeof(*proc_err))
> -			cper_print_proc_generic(pfx, proc_err);
> +			cper_print_proc_generic(newpfx, proc_err);
>   		else
>   			goto err_section_too_small;
>   	} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
>   		struct cper_sec_mem_err *mem_err = (void *)(gdata + 1);
> -		printk("%s""section_type: memory error\n", pfx);
> +		printk("%s""section_type: memory error\n", newpfx);
>   		if (gdata->error_data_length >= sizeof(*mem_err))
> -			cper_print_mem(pfx, mem_err);
> +			cper_print_mem(newpfx, mem_err);
>   		else
>   			goto err_section_too_small;
>   	} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) {
>   		struct cper_sec_pcie *pcie = (void *)(gdata + 1);
> -		printk("%s""section_type: PCIe error\n", pfx);
> +		printk("%s""section_type: PCIe error\n", newpfx);
>   		if (gdata->error_data_length >= sizeof(*pcie))
> -			cper_print_pcie(pfx, pcie, gdata);
> +			cper_print_pcie(newpfx, pcie, gdata);
>   		else
>   			goto err_section_too_small;
>   	} else
> -		printk("%s""section type: unknown, %pUl\n", pfx, sec_type);
> +		printk("%s""section type: unknown, %pUl\n", newpfx, sec_type);
>
>   	return;
>
> @@ -358,17 +349,21 @@ void cper_estatus_print(const char *pfx,
>   	struct acpi_generic_data *gdata;
>   	unsigned int data_len, gedata_len;
>   	int sec_no = 0;
> +	char newpfx[64];
>   	__u16 severity;
>
> -	printk("%s""Generic Hardware Error Status\n", pfx);
>   	severity = estatus->error_severity;
> -	printk("%s""severity: %d, %s\n", pfx, severity,
> -	       cper_severity_str(severity));
> +	if (severity != CPER_SEV_FATAL)

Shouldn't this just be (severity == CPER_SEV_CORRECTED)?

Thanks,
Naveen

> +		printk("%s%s\n", pfx,
> +		       "It has been corrected by h/w "
> +		       "and requires no further action");
> +	printk("%s""event severity: %s\n", pfx, cper_severity_str(severity));
>   	data_len = estatus->data_length;
>   	gdata = (struct acpi_generic_data *)(estatus + 1);
> +	snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
>   	while (data_len >= sizeof(*gdata)) {
>   		gedata_len = gdata->error_data_length;
> -		cper_estatus_print_section(pfx, gdata, sec_no);
> +		cper_estatus_print_section(newpfx, gdata, sec_no);
>   		data_len -= gedata_len + sizeof(*gdata);
>   		gdata = (void *)(gdata + 1) + gedata_len;
>   		sec_no++;
>

--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chen Gong Oct. 19, 2013, 11:26 a.m. UTC | #2
On Fri, Oct 18, 2013 at 05:31:21PM +0530, Naveen N. Rao wrote:
> Date: Fri, 18 Oct 2013 17:31:21 +0530
> From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
> To: "Chen, Gong" <gong.chen@linux.intel.com>, tony.luck@intel.com,
>  bp@alien8.de, joe@perches.com, m.chehab@samsung.com
> CC: arozansk@redhat.com, linux-acpi@vger.kernel.org,
>  linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3 8/9] ACPI, APEI, CPER: Cleanup CPER memory error
>  output format
> User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101
>  Thunderbird/24.0
> 
[...]
> >
> >@@ -358,17 +349,21 @@ void cper_estatus_print(const char *pfx,
> >  	struct acpi_generic_data *gdata;
> >  	unsigned int data_len, gedata_len;
> >  	int sec_no = 0;
> >+	char newpfx[64];
> >  	__u16 severity;
> >
> >-	printk("%s""Generic Hardware Error Status\n", pfx);
> >  	severity = estatus->error_severity;
> >-	printk("%s""severity: %d, %s\n", pfx, severity,
> >-	       cper_severity_str(severity));
> >+	if (severity != CPER_SEV_FATAL)
> 
> Shouldn't this just be (severity == CPER_SEV_CORRECTED)?
> 
> Thanks,
> Naveen
> 
IMO, only fatal error can't be handlered gracefully in current
kernel plus H/W. Once it can be recovered by H/W and OS, we
can call it recovered.
Naveen N. Rao Oct. 21, 2013, 4:22 p.m. UTC | #3
On 10/19/2013 04:56 PM, Chen Gong wrote:
> On Fri, Oct 18, 2013 at 05:31:21PM +0530, Naveen N. Rao wrote:
>> Date: Fri, 18 Oct 2013 17:31:21 +0530
>> From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
>> To: "Chen, Gong" <gong.chen@linux.intel.com>, tony.luck@intel.com,
>>   bp@alien8.de, joe@perches.com, m.chehab@samsung.com
>> CC: arozansk@redhat.com, linux-acpi@vger.kernel.org,
>>   linux-kernel@vger.kernel.org
>> Subject: Re: [PATCH v3 8/9] ACPI, APEI, CPER: Cleanup CPER memory error
>>   output format
>> User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101
>>   Thunderbird/24.0
>>
> [...]
>>>
>>> @@ -358,17 +349,21 @@ void cper_estatus_print(const char *pfx,
>>>   	struct acpi_generic_data *gdata;
>>>   	unsigned int data_len, gedata_len;
>>>   	int sec_no = 0;
>>> +	char newpfx[64];
>>>   	__u16 severity;
>>>
>>> -	printk("%s""Generic Hardware Error Status\n", pfx);
>>>   	severity = estatus->error_severity;
>>> -	printk("%s""severity: %d, %s\n", pfx, severity,
>>> -	       cper_severity_str(severity));
>>> +	if (severity != CPER_SEV_FATAL)
>>
>> Shouldn't this just be (severity == CPER_SEV_CORRECTED)?
>>
>> Thanks,
>> Naveen
>>
> IMO, only fatal error can't be handlered gracefully in current
> kernel plus H/W. Once it can be recovered by H/W and OS, we
> can call it recovered.
>

Sure, but we don't recover in all scenarios. So, calling it corrected 
seems incorrect to me.



- Naveen

--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tony Luck Oct. 21, 2013, 5:14 p.m. UTC | #4
>>>> +	if (severity != CPER_SEV_FATAL)

>>>

>>> Shouldn't this just be (severity == CPER_SEV_CORRECTED)?



>> IMO, only fatal error can't be handlered gracefully in current

>> kernel plus H/W. Once it can be recovered by H/W and OS, we

>> can call it recovered.


> Sure, but we don't recover in all scenarios. So, calling it corrected 

> seems incorrect to me.


Even if we recovered from a UC error (which is by no means a sure
thing) ... I don't think the "requires no further action" message applies.

Soft single bit errors are common (well, common-ish ... they should still
be somewhat rare by most objective standard).  Double bit errors are
much rarer ... and are very unlikely to be the result of two single bit errors
happening to be inside the same cache line.  I'd recommend further investigation
of the source of a UC error (even one that is "recovered" in software).

-Tony
Borislav Petkov Oct. 22, 2013, 8:42 a.m. UTC | #5
On Mon, Oct 21, 2013 at 05:14:05PM +0000, Luck, Tony wrote:
> Even if we recovered from a UC error (which is by no means a sure
> thing) ... I don't think the "requires no further action" message
> applies.
>
> Soft single bit errors are common (well, common-ish ... they should
> still be somewhat rare by most objective standard). Double bit errors
> are much rarer ... and are very unlikely to be the result of two
> single bit errors happening to be inside the same cache line. I'd
> recommend further investigation of the source of a UC error (even one
> that is "recovered" in software).

Btw, do we even need to make this distinction? I mean, do we even reach
this path on an error where we need to raise a #MC exception? In the
initial design we were called from machine_check_poll which is not the
exception path and now we're on the decode_chain which gets all errors.

Are we ready to handle all? And also, why do we even need to
differentiate the error types on reporting? I mean, if it is, say, a
contained UC error and we can start a recovery action from userspace
like killing the process, we probably want to have that same detailed
report too?

 [ This is purely hypothetical, of course, as we do the poisoning game
 and killing of processes from kernel space now but still... ]

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c
index b1a8a55..9dd54e1 100644
--- a/drivers/acpi/apei/cper.c
+++ b/drivers/acpi/apei/cper.c
@@ -33,6 +33,7 @@ 
 #include <linux/pci.h>
 #include <linux/aer.h>
 
+#define INDENT_SP	" "
 /*
  * CPER record ID need to be unique even after reboot, because record
  * ID is used as index for ERST storage, while CPER records from
@@ -206,29 +207,29 @@  static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
 		printk("%s""physical_address_mask: 0x%016llx\n",
 		       pfx, mem->physical_addr_mask);
 	if (mem->validation_bits & CPER_MEM_VALID_NODE)
-		printk("%s""node: %d\n", pfx, mem->node);
+		pr_debug("node: %d\n", mem->node);
 	if (mem->validation_bits & CPER_MEM_VALID_CARD)
-		printk("%s""card: %d\n", pfx, mem->card);
+		pr_debug("card: %d\n", mem->card);
 	if (mem->validation_bits & CPER_MEM_VALID_MODULE)
-		printk("%s""module: %d\n", pfx, mem->module);
+		pr_debug("module: %d\n", mem->module);
 	if (mem->validation_bits & CPER_MEM_VALID_RANK_NUMBER)
-		printk("%s""rank: %d\n", pfx, mem->rank);
+		pr_debug("rank: %d\n", mem->rank);
 	if (mem->validation_bits & CPER_MEM_VALID_BANK)
-		printk("%s""bank: %d\n", pfx, mem->bank);
+		pr_debug("bank: %d\n", mem->bank);
 	if (mem->validation_bits & CPER_MEM_VALID_DEVICE)
-		printk("%s""device: %d\n", pfx, mem->device);
+		pr_debug("device: %d\n", mem->device);
 	if (mem->validation_bits & CPER_MEM_VALID_ROW)
-		printk("%s""row: %d\n", pfx, mem->row);
+		pr_debug("row: %d\n", mem->row);
 	if (mem->validation_bits & CPER_MEM_VALID_COLUMN)
-		printk("%s""column: %d\n", pfx, mem->column);
+		pr_debug("column: %d\n", mem->column);
 	if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION)
-		printk("%s""bit_position: %d\n", pfx, mem->bit_pos);
+		pr_debug("bit_position: %d\n", mem->bit_pos);
 	if (mem->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
-		printk("%s""requestor_id: 0x%016llx\n", pfx, mem->requestor_id);
+		pr_debug("requestor_id: 0x%016llx\n", mem->requestor_id);
 	if (mem->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
-		printk("%s""responder_id: 0x%016llx\n", pfx, mem->responder_id);
+		pr_debug("responder_id: 0x%016llx\n", mem->responder_id);
 	if (mem->validation_bits & CPER_MEM_VALID_TARGET_ID)
-		printk("%s""target_id: 0x%016llx\n", pfx, mem->target_id);
+		pr_debug("target_id: 0x%016llx\n", mem->target_id);
 	if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
 		u8 etype = mem->error_type;
 		printk("%s""error_type: %d, %s\n", pfx, etype,
@@ -296,55 +297,45 @@  static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
 	pfx, pcie->bridge.secondary_status, pcie->bridge.control);
 }
 
-static const char * const cper_estatus_section_flag_strs[] = {
-	"primary",
-	"containment warning",
-	"reset",
-	"error threshold exceeded",
-	"resource not accessible",
-	"latent error",
-};
-
 static void cper_estatus_print_section(
 	const char *pfx, const struct acpi_generic_data *gdata, int sec_no)
 {
 	uuid_le *sec_type = (uuid_le *)gdata->section_type;
 	__u16 severity;
+	char newpfx[64];
 
 	severity = gdata->error_severity;
-	printk("%s""section: %d, severity: %d, %s\n", pfx, sec_no, severity,
+	printk("%s""Error %d, type: %s\n", pfx, sec_no,
 	       cper_severity_str(severity));
-	printk("%s""flags: 0x%02x\n", pfx, gdata->flags);
-	cper_print_bits(pfx, gdata->flags, cper_estatus_section_flag_strs,
-			ARRAY_SIZE(cper_estatus_section_flag_strs));
 	if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
 		printk("%s""fru_id: %pUl\n", pfx, (uuid_le *)gdata->fru_id);
 	if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
 		printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text);
 
+	snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
 	if (!uuid_le_cmp(*sec_type, CPER_SEC_PROC_GENERIC)) {
 		struct cper_sec_proc_generic *proc_err = (void *)(gdata + 1);
-		printk("%s""section_type: general processor error\n", pfx);
+		printk("%s""section_type: general processor error\n", newpfx);
 		if (gdata->error_data_length >= sizeof(*proc_err))
-			cper_print_proc_generic(pfx, proc_err);
+			cper_print_proc_generic(newpfx, proc_err);
 		else
 			goto err_section_too_small;
 	} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
 		struct cper_sec_mem_err *mem_err = (void *)(gdata + 1);
-		printk("%s""section_type: memory error\n", pfx);
+		printk("%s""section_type: memory error\n", newpfx);
 		if (gdata->error_data_length >= sizeof(*mem_err))
-			cper_print_mem(pfx, mem_err);
+			cper_print_mem(newpfx, mem_err);
 		else
 			goto err_section_too_small;
 	} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) {
 		struct cper_sec_pcie *pcie = (void *)(gdata + 1);
-		printk("%s""section_type: PCIe error\n", pfx);
+		printk("%s""section_type: PCIe error\n", newpfx);
 		if (gdata->error_data_length >= sizeof(*pcie))
-			cper_print_pcie(pfx, pcie, gdata);
+			cper_print_pcie(newpfx, pcie, gdata);
 		else
 			goto err_section_too_small;
 	} else
-		printk("%s""section type: unknown, %pUl\n", pfx, sec_type);
+		printk("%s""section type: unknown, %pUl\n", newpfx, sec_type);
 
 	return;
 
@@ -358,17 +349,21 @@  void cper_estatus_print(const char *pfx,
 	struct acpi_generic_data *gdata;
 	unsigned int data_len, gedata_len;
 	int sec_no = 0;
+	char newpfx[64];
 	__u16 severity;
 
-	printk("%s""Generic Hardware Error Status\n", pfx);
 	severity = estatus->error_severity;
-	printk("%s""severity: %d, %s\n", pfx, severity,
-	       cper_severity_str(severity));
+	if (severity != CPER_SEV_FATAL)
+		printk("%s%s\n", pfx,
+		       "It has been corrected by h/w "
+		       "and requires no further action");
+	printk("%s""event severity: %s\n", pfx, cper_severity_str(severity));
 	data_len = estatus->data_length;
 	gdata = (struct acpi_generic_data *)(estatus + 1);
+	snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
 	while (data_len >= sizeof(*gdata)) {
 		gedata_len = gdata->error_data_length;
-		cper_estatus_print_section(pfx, gdata, sec_no);
+		cper_estatus_print_section(newpfx, gdata, sec_no);
 		data_len -= gedata_len + sizeof(*gdata);
 		gdata = (void *)(gdata + 1) + gedata_len;
 		sec_no++;