diff mbox series

[06/20] x86/mce/amd: Use helper for GPU UMC bank type checks

Message ID 20231118193248.1296798-7-yazen.ghannam@amd.com (mailing list archive)
State Handled Elsewhere
Headers show
Series MCA Updates | expand

Commit Message

Yazen Ghannam Nov. 18, 2023, 7:32 p.m. UTC
The type of an Scalable MCA bank should be determined solely using the
values in its MCA_IPID register.

Define and use a helper function to determine if a bank represents a GPU
Unified Memory Controller (UMC), and where the exact bank type is not
needed.

Use bitops and rename old mask until removed.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
 arch/x86/include/asm/mce.h              |  4 +++-
 arch/x86/kernel/cpu/mce/amd.c           | 12 +++++++++++-
 drivers/edac/amd64_edac.c               |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |  9 ++++-----
 4 files changed, 19 insertions(+), 8 deletions(-)

Comments

Borislav Petkov Nov. 27, 2023, 11:46 a.m. UTC | #1
On Sat, Nov 18, 2023 at 01:32:34PM -0600, Yazen Ghannam wrote:
> +/* GPU UMCs have MCATYPE=0x1.*/
> +bool smca_gpu_umc_bank_type(u64 ipid)
> +{
> +	if (!smca_umc_bank_type(ipid))
> +		return false;
> +
> +	return FIELD_GET(MCI_IPID_MCATYPE, ipid) == 0x1;
> +}

And now this tells you that you want to use

	u32 hwid_mcatype;       /* (hwid,mcatype) tuple */

everywhere and do your macros around that thing.

No need for yet another helper as this all is static information which
doesn't change.

> +EXPORT_SYMBOL_GPL(smca_gpu_umc_bank_type);

Definitely not another export.

Thx.
Yazen Ghannam Nov. 27, 2023, 3:12 p.m. UTC | #2
On 11/27/2023 6:46 AM, Borislav Petkov wrote:
> On Sat, Nov 18, 2023 at 01:32:34PM -0600, Yazen Ghannam wrote:
>> +/* GPU UMCs have MCATYPE=0x1.*/
>> +bool smca_gpu_umc_bank_type(u64 ipid)
>> +{
>> +	if (!smca_umc_bank_type(ipid))
>> +		return false;
>> +
>> +	return FIELD_GET(MCI_IPID_MCATYPE, ipid) == 0x1;
>> +}
> 
> And now this tells you that you want to use
> 
> 	u32 hwid_mcatype;       /* (hwid,mcatype) tuple */
> 
> everywhere and do your macros around that thing.
> 
> No need for yet another helper as this all is static information which
> doesn't change.
> 
>> +EXPORT_SYMBOL_GPL(smca_gpu_umc_bank_type);
> 
> Definitely not another export.
> 

Right, we already use the tuple thing. Patch 8 in this set uses the 
tuple to look up (search for) the bank type at runtime. This is in order 
to get rid of all the per-CPU stuff.

Now I figured it'd be good to have special helpers to do a quick check 
for the UMC bank types. Because memory errors will be the most commonly 
reported errors.

But it's likely not important to save a few cycles here. Especially 
since the helpers will be used in the notifier chain and not in the #MC 
handler, etc.

I'll think on it a bit more, but this patch and the previous one can 
likely be dropped.

Thanks,
Yazen
diff mbox series

Patch

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c43b41677a3e..012caf68dcbb 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -59,8 +59,9 @@ 
  *  - TCC bit is present in MCx_STATUS.
  */
 #define MCI_CONFIG_MCAX		0x1
-#define MCI_IPID_MCATYPE	0xFFFF0000
+#define MCI_IPID_MCATYPE_OLD	0xFFFF0000
 #define MCI_IPID_HWID_OLD	0xFFF
+#define MCI_IPID_MCATYPE	GENMASK_ULL(63, 48)
 #define MCI_IPID_HWID		GENMASK_ULL(43, 32)
 
 /*
@@ -341,6 +342,7 @@  extern int mce_threshold_remove_device(unsigned int cpu);
 
 void mce_amd_feature_init(struct cpuinfo_x86 *c);
 enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank);
+bool smca_gpu_umc_bank_type(u64 ipid);
 #else
 
 static inline int mce_threshold_create_device(unsigned int cpu)		{ return 0; };
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index c8fb6c24170f..6fc35967b11b 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -150,6 +150,16 @@  static bool smca_umc_bank_type(u64 ipid)
 	return FIELD_GET(MCI_IPID_HWID, ipid) == 0x96;
 }
 
+/* GPU UMCs have MCATYPE=0x1.*/
+bool smca_gpu_umc_bank_type(u64 ipid)
+{
+	if (!smca_umc_bank_type(ipid))
+		return false;
+
+	return FIELD_GET(MCI_IPID_MCATYPE, ipid) == 0x1;
+}
+EXPORT_SYMBOL_GPL(smca_gpu_umc_bank_type);
+
 static const struct smca_hwid smca_hwid_mcatypes[] = {
 	/* { bank_type, hwid_mcatype } */
 
@@ -312,7 +322,7 @@  static void smca_configure(unsigned int bank, unsigned int cpu)
 	}
 
 	hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID_OLD,
-				    (high & MCI_IPID_MCATYPE) >> 16);
+				    (high & MCI_IPID_MCATYPE_OLD) >> 16);
 
 	for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
 		s_hwid = &smca_hwid_mcatypes[i];
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 9b6642d00871..b593795e1e6b 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1032,7 +1032,7 @@  static int fixup_node_id(int node_id, struct mce *m)
 	/* MCA_IPID[InstanceIdHi] give the AMD Node ID for the bank. */
 	u8 nid = (m->ipid >> 44) & 0xF;
 
-	if (smca_get_bank_type(m->extcpu, m->bank) != SMCA_UMC_V2)
+	if (!smca_gpu_umc_bank_type(m->ipid))
 		return node_id;
 
 	/* Nodes below the GPU base node are CPU nodes and don't need a fixup. */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 84e5987b14e0..7235668b3cc2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3279,12 +3279,11 @@  static int amdgpu_bad_page_notifier(struct notifier_block *nb,
 	uint32_t umc_inst = 0, ch_inst = 0;
 
 	/*
-	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
-	 * and error occurred in DramECC (Extended error code = 0) then only
-	 * process the error, else bail out.
+	 * If the error was generated in a GPU UMC and error occurred in
+	 * DramECC (Extended error code = 0) then only process the error,
+	 * else bail out.
 	 */
-	if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
-		    (XEC(m->status, 0x3f) == 0x0)))
+	if (!m || !(smca_gpu_umc_bank_type(m->ipid) && (XEC(m->status, 0x3f) == 0x0)))
 		return NOTIFY_DONE;
 
 	/*