diff mbox series

[3/3] EDAC/mce_amd: Add support for FRU Text in MCA

Message ID 20220418174440.334336-4-yazen.ghannam@amd.com (mailing list archive)
State New, archived
Headers show
Series New SMCA Syndrome registers and FRU Text feature | expand

Commit Message

Yazen Ghannam April 18, 2022, 5:44 p.m. UTC
A new "FRU Text in MCA" feature is defined where the Field Replaceable
Unit (FRU) Text for a device is represented by a string in the new
MCA_SYND1 and MCA_SYND2 registers. This feature is supported per MCA
bank, and it is advertised by the McaFruTextInMca bit (MCA_CONFIG[9]).

The FRU Text is populated dynamically for each individual error state
(MCA_STATUS, MCA_ADDR, et al.). This handles the case where an MCA bank
covers multiple devices, for example, a Unified Memory Controller (UMC)
bank that manages two DIMMs.

Print the FRU Text string, if available, when decoding an MCA error.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
 drivers/edac/mce_amd.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

Comments

Borislav Petkov July 4, 2022, 9:13 a.m. UTC | #1
On Mon, Apr 18, 2022 at 05:44:40PM +0000, Yazen Ghannam wrote:
> @@ -1254,11 +1255,10 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
>  		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));
>  
>  	if (boot_cpu_has(X86_FEATURE_SMCA)) {
> -		u32 low, high;
>  		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
>  
> -		if (!rdmsr_safe(addr, &low, &high) &&
> -		    (low & MCI_CONFIG_MCAX))
> +		if (!rdmsrl_safe_on_cpu(m->extcpu, addr, &mca_config) &&

This change needs to be mentioned in the commit message.

> +		    (mca_config & MCI_CONFIG_MCAX))
>  			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
>  
>  		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
> @@ -1300,6 +1300,17 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
>  		pr_cont("\n");

So here above the code prints SYND1 and SYND2.

If they contain FRU strings, then this printing should be an if-else by
checking bit 9.

	if (BIT(9))
		print fru text
	else
		print naked syndromes


>  		decode_smca_error(m);
> +
> +		if (mca_config & BIT(9)) {
> +			char frutext[32];

Why 32?

> +			memset(frutext, 0, sizeof(frutext));
> +			memcpy(&frutext[0], &m->synd1, 8);
> +			memcpy(&frutext[8], &m->synd2, 8);
> +
> +			pr_emerg(HW_ERR "FRU Text: %s\n", frutext);
> +		}
> +
>  		goto err_code;
>  	}
Yazen Ghannam July 11, 2022, 5:41 p.m. UTC | #2
On Mon, Jul 04, 2022 at 11:13:56AM +0200, Borislav Petkov wrote:
> On Mon, Apr 18, 2022 at 05:44:40PM +0000, Yazen Ghannam wrote:
> > @@ -1254,11 +1255,10 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
> >  		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));
> >  
> >  	if (boot_cpu_has(X86_FEATURE_SMCA)) {
> > -		u32 low, high;
> >  		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
> >  
> > -		if (!rdmsr_safe(addr, &low, &high) &&
> > -		    (low & MCI_CONFIG_MCAX))
> > +		if (!rdmsrl_safe_on_cpu(m->extcpu, addr, &mca_config) &&
> 
> This change needs to be mentioned in the commit message.
>

Okay.

> > +		    (mca_config & MCI_CONFIG_MCAX))
> >  			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
> >  
> >  		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
> > @@ -1300,6 +1300,17 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
> >  		pr_cont("\n");
> 
> So here above the code prints SYND1 and SYND2.
> 
> If they contain FRU strings, then this printing should be an if-else by
> checking bit 9.
> 
> 	if (BIT(9))
> 		print fru text
> 	else
> 		print naked syndromes
> 
>

Okay, will change.

> >  		decode_smca_error(m);
> > +
> > +		if (mca_config & BIT(9)) {
> > +			char frutext[32];
> 
> Why 32?
>

I picked the next power of 2 after 16 in order to have some space for
terminating NULL in the string. I can't think of a good reason it needs to be
a power of 2, so I can change this to 17.

> > +			memset(frutext, 0, sizeof(frutext));
> > +			memcpy(&frutext[0], &m->synd1, 8);
> > +			memcpy(&frutext[8], &m->synd2, 8);
> > +
> > +			pr_emerg(HW_ERR "FRU Text: %s\n", frutext);
> > +		}
> > +
> >  		goto err_code;
> >  	}
>

Thanks,
Yazen
diff mbox series

Patch

diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 28b48c711fe0..3cacc3f22379 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -1235,6 +1235,7 @@  amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 {
 	struct mce *m = (struct mce *)data;
 	unsigned int fam = x86_family(m->cpuid);
+	u64 mca_config = 0;
 	int ecc;
 
 	if (m->kflags & MCE_HANDLED_CEC)
@@ -1254,11 +1255,10 @@  amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));
 
 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
-		u32 low, high;
 		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
 
-		if (!rdmsr_safe(addr, &low, &high) &&
-		    (low & MCI_CONFIG_MCAX))
+		if (!rdmsrl_safe_on_cpu(m->extcpu, addr, &mca_config) &&
+		    (mca_config & MCI_CONFIG_MCAX))
 			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
 
 		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
@@ -1300,6 +1300,17 @@  amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 		pr_cont("\n");
 
 		decode_smca_error(m);
+
+		if (mca_config & BIT(9)) {
+			char frutext[32];
+
+			memset(frutext, 0, sizeof(frutext));
+			memcpy(&frutext[0], &m->synd1, 8);
+			memcpy(&frutext[8], &m->synd2, 8);
+
+			pr_emerg(HW_ERR "FRU Text: %s\n", frutext);
+		}
+
 		goto err_code;
 	}