Message ID | 20240918055436.15551-4-TonyWWang-oc@zhaoxin.com
---|---
State | New
Series | x86/mce: Add Zhaoxin MCE support
On Wed, Sep 18, 2024 at 01:54:36PM +0800, Tony W Wang-oc wrote:
> From: Lyle Li <LyleLi@zhaoxin.com>
>
> Zhaoxin CPUs support CMCI compatible with Intel, because
> Zhaoxin's UCR error is not reported through CMCI, and in
> order to be compatible with intel's CMCI code, so add Zhaoxin
> CMCI storm toggle to support the new CMCI storm switching
> in mce/intel.c, mce/zhaoxin.c, mce/threshold.c, and mce/internal.h.
>
> Signed-off-by: Lyle Li <LyleLi@zhaoxin.com>
> Signed-off-by: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
> ---
>  arch/x86/kernel/cpu/mce/intel.c     |  5 ++---
>  arch/x86/kernel/cpu/mce/internal.h  |  7 ++++++-
>  arch/x86/kernel/cpu/mce/threshold.c |  4 ++++
>  arch/x86/kernel/cpu/mce/zhaoxin.c   | 18 ++++++++++++++++++
>  4 files changed, 30 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
> index b7e67f4f7..aa75e2848 100644
> --- a/arch/x86/kernel/cpu/mce/intel.c
> +++ b/arch/x86/kernel/cpu/mce/intel.c
> @@ -45,7 +45,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
>   * cmci_discover_lock protects against parallel discovery attempts
>   * which could race against each other.
>   */
> -static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
> +DEFINE_RAW_SPINLOCK(cmci_discover_lock);
>  
>  /*
>   * On systems that do support CMCI but it's disabled, polling for MCEs can
> @@ -61,7 +61,7 @@ static DEFINE_SPINLOCK(cmci_poll_lock);
>   * MCi_CTL2 threshold for each bank when there is no storm.
>   * Default value for each bank may have been set by BIOS.
>   */
> -static u16 cmci_threshold[MAX_NR_BANKS];
> +u16 cmci_threshold[MAX_NR_BANKS];
>  
>  /*
>   * High threshold to limit CMCI rate during storms. Max supported is
> @@ -73,7 +73,6 @@ static u16 cmci_threshold[MAX_NR_BANKS];
>   * to corrected errors, so keeping CMCI enabled means that uncorrected
>   * errors will still be processed in a timely fashion.
>   */
> -#define CMCI_STORM_THRESHOLD 32749
>  
>  static int cmci_supported(int *banks)
>  {
> diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
> index 836e56027..086b833c5 100644
> --- a/arch/x86/kernel/cpu/mce/internal.h
> +++ b/arch/x86/kernel/cpu/mce/internal.h
> @@ -7,7 +7,7 @@
>  
>  #include <linux/device.h>
>  #include <asm/mce.h>
> -
> +#include <linux/spinlock.h>
>  enum severity_level {
>          MCE_NO_SEVERITY,
>          MCE_DEFERRED_SEVERITY,
> @@ -334,11 +334,16 @@ static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
>  }
>  
>  extern void (*mc_poll_banks)(void);
> +#define CMCI_STORM_THRESHOLD 32749
> +extern raw_spinlock_t cmci_discover_lock;
> +extern u16 cmci_threshold[MAX_NR_BANKS];
>  #ifdef CONFIG_X86_MCE_ZHAOXIN
>  void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c);
>  void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c);
> +void mce_zhaoxin_handle_storm(int bank, bool on);
>  #else
>  static inline void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) { }
>  static inline void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) { }
> +static inline void mce_zhaoxin_handle_storm(int bank, bool on) { }
>  #endif
>  #endif /* __X86_MCE_INTERNAL_H__ */
> diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
> index 89e31e1e5..200280387 100644
> --- a/arch/x86/kernel/cpu/mce/threshold.c
> +++ b/arch/x86/kernel/cpu/mce/threshold.c
> @@ -63,6 +63,10 @@ static void mce_handle_storm(unsigned int bank, bool on)
>          case X86_VENDOR_INTEL:
>                  mce_intel_handle_storm(bank, on);
>                  break;
> +        case X86_VENDOR_ZHAOXIN:
> +        case X86_VENDOR_CENTAUR:
> +                mce_zhaoxin_handle_storm(bank, on);
> +                break;
>          }
>  }
>  
> diff --git a/arch/x86/kernel/cpu/mce/zhaoxin.c b/arch/x86/kernel/cpu/mce/zhaoxin.c
> index de69c560f..994b8520a 100644
> --- a/arch/x86/kernel/cpu/mce/zhaoxin.c
> +++ b/arch/x86/kernel/cpu/mce/zhaoxin.c
> @@ -63,3 +63,21 @@ void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
>  {
>          intel_clear_lmce();
>  }
> +
> +void mce_zhaoxin_handle_storm(int bank, bool on)
> +{
> +        unsigned long flags;
> +        u64 val;
> +
> +        raw_spin_lock_irqsave(&cmci_discover_lock, flags);
> +        rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
> +        if (on) {
> +                val &= ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK);
> +                val |= CMCI_STORM_THRESHOLD;
> +        } else {
> +                val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
> +                val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
> +        }
> +        wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
> +        raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
> +}

Why does this need to be different than mce_intel_handle_storm()?

Thanks,
Yazen
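For comparison, the mce_intel_handle_storm() that Yazen refers to only swaps the MCi_CTL2 threshold field and leaves CMCI signalling enabled. A rough sketch, paraphrased from the mainline storm-handling code in arch/x86/kernel/cpu/mce/intel.c at the time of this thread (details may differ):

static void cmci_set_threshold(int bank, int thresh)
{
        unsigned long flags;
        u64 val;

        raw_spin_lock_irqsave(&cmci_discover_lock, flags);
        rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
        /* Only the threshold field changes; MCI_CTL2_CMCI_EN stays set. */
        val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
        wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh);
        raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}

void mce_intel_handle_storm(int bank, bool on)
{
        if (on)
                cmci_set_threshold(bank, CMCI_STORM_THRESHOLD); /* throttle, keep the interrupt */
        else
                cmci_set_threshold(bank, cmci_threshold[bank]); /* restore the saved threshold */
}

The proposed mce_zhaoxin_handle_storm() differs only in additionally clearing MCI_CTL2_CMCI_EN while a storm is active.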
> From: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
> [...]
> Subject: [PATCH v3 3/3] x86/mce: Add CMCI storm switching support for
> [...]
> --- a/arch/x86/kernel/cpu/mce/zhaoxin.c
> +++ b/arch/x86/kernel/cpu/mce/zhaoxin.c
> @@ -63,3 +63,21 @@ void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
> {
>         intel_clear_lmce();
> }
> +
> +void mce_zhaoxin_handle_storm(int bank, bool on) {
> +        unsigned long flags;
> +        u64 val;
> +
> +        raw_spin_lock_irqsave(&cmci_discover_lock, flags);
> +        rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
> +        if (on) {
> +                val &= ~(MCI_CTL2_CMCI_EN |
> MCI_CTL2_CMCI_THRESHOLD_MASK);
> +                val |= CMCI_STORM_THRESHOLD;
> +        } else {
> +                val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
> +                val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
> +        }
> +        wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
> +        raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); }

Are there any reasons or comments why it needs to disable/enable the CMCI interrupt
here during a CMCI storm on/off? If not, then reuse mce_intel_handle_storm() to avoid
duplicating the code.

-Qiuxu
On 2024/9/19 22:06, Yazen Ghannam wrote:
>
>> +void mce_zhaoxin_handle_storm(int bank, bool on)
>> +{
>> +        unsigned long flags;
>> +        u64 val;
>> +
>> +        raw_spin_lock_irqsave(&cmci_discover_lock, flags);
>> +        rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
>> +        if (on) {
>> +                val &= ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK);
>> +                val |= CMCI_STORM_THRESHOLD;
>> +        } else {
>> +                val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
>> +                val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
>> +        }
>> +        wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
>> +        raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
>> +}
>
> Why does this need to be different than mce_intel_handle_storm()?
>

The reason is actually mentioned in the cover letter: "because Zhaoxin's UCR
error is not reported through CMCI", and we want to disable CMCI interrupt
when CMCI storm happened.

Sincerely
TonyWWang-oc
On 2024/9/20 17:17, Zhuo, Qiuxu wrote:
>
>
>> From: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
>> [...]
>> Subject: [PATCH v3 3/3] x86/mce: Add CMCI storm switching support for
>> [...]
>> --- a/arch/x86/kernel/cpu/mce/zhaoxin.c
>> +++ b/arch/x86/kernel/cpu/mce/zhaoxin.c
>> @@ -63,3 +63,21 @@ void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
>> {
>>         intel_clear_lmce();
>> }
>> +
>> +void mce_zhaoxin_handle_storm(int bank, bool on) {
>> +        unsigned long flags;
>> +        u64 val;
>> +
>> +        raw_spin_lock_irqsave(&cmci_discover_lock, flags);
>> +        rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
>> +        if (on) {
>> +                val &= ~(MCI_CTL2_CMCI_EN |
>> MCI_CTL2_CMCI_THRESHOLD_MASK);
>> +                val |= CMCI_STORM_THRESHOLD;
>> +        } else {
>> +                val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
>> +                val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
>> +        }
>> +        wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
>> +        raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); }
>
> Are there any reasons or comments why it needs to disable/enable the CMCI interrupt
> here during a CMCI storm on/off? If not, then reuse mce_intel_handle_storm() to avoid
> duplicating the code.
>

As explained in another email.
The reason is actually mentioned in the cover letter: "because Zhaoxin's UCR
error is not reported through CMCI", and we want to disable CMCI interrupt
when CMCI storm happened.

Sincerely
TonyWWang-oc
> From: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
> [...]
> >> --- a/arch/x86/kernel/cpu/mce/zhaoxin.c
> >> +++ b/arch/x86/kernel/cpu/mce/zhaoxin.c
> >> @@ -63,3 +63,21 @@ void mce_zhaoxin_feature_clear(struct cpuinfo_x86
> >> *c) {
> >>         intel_clear_lmce();
> >> }
> >> +
> >> +void mce_zhaoxin_handle_storm(int bank, bool on) {
> >> +        unsigned long flags;
> >> +        u64 val;
> >> +
> >> +        raw_spin_lock_irqsave(&cmci_discover_lock, flags);
> >> +        rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
> >> +        if (on) {
> >> +                val &= ~(MCI_CTL2_CMCI_EN |
> >> MCI_CTL2_CMCI_THRESHOLD_MASK);
> >> +                val |= CMCI_STORM_THRESHOLD;
> >> +        } else {
> >> +                val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
> >> +                val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
> >> +        }
> >> +        wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
> >> +        raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); }
> >
> > Are there any reasons or comments why it needs to disable/enable the
> > CMCI interrupt here during a CMCI storm on/off? If not, then reuse
> > mce_intel_handle_storm() to avoid duplicating the code.
> >
>
> As explained in another email.
> The reason is actually mentioned in the cover letter: "because Zhaoxin's UCR
> error is not reported through CMCI", and we want to disable CMCI interrupt
> when CMCI storm happened.

So, this is just you want to disable CMCI when a CMCI storm happens.
This doesn't explain much to me.
What's the problem if not disable CMCI when a CMCI storm happens?
On Fri, Sep 20, 2024 at 06:42:15PM +0800, Tony W Wang-oc wrote:
>
>
> On 2024/9/19 22:06, Yazen Ghannam wrote:
> >
> > > +void mce_zhaoxin_handle_storm(int bank, bool on)
> > > +{
> > > +        unsigned long flags;
> > > +        u64 val;
> > > +
> > > +        raw_spin_lock_irqsave(&cmci_discover_lock, flags);
> > > +        rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
> > > +        if (on) {
> > > +                val &= ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK);
> > > +                val |= CMCI_STORM_THRESHOLD;
> > > +        } else {
> > > +                val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
> > > +                val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
> > > +        }
> > > +        wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
> > > +        raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
> > > +}
> >
> > Why does this need to be different than mce_intel_handle_storm()?
> >
>
> The reason is actually mentioned in the cover letter: "because Zhaoxin's UCR
> error is not reported through CMCI", and we want to disable CMCI interrupt
> when CMCI storm happened.
>

I see, so you want to disable the interrupt entirely rather than
reprogramming to a high value. Makes sense.

Thanks,
Yazen
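The distinction Yazen draws here is easiest to see in the resulting IA32_MCi_CTL2 values. A minimal standalone illustration, assuming the usual bit layout (MCI_CTL2_CMCI_EN is bit 30, the corrected-error count threshold occupies bits 14:0) and the 32749 storm threshold used by the series:

#include <stdio.h>
#include <stdint.h>

#define MCI_CTL2_CMCI_EN                (1ULL << 30)    /* bit 30: CMCI signalling enable */
#define MCI_CTL2_CMCI_THRESHOLD_MASK    0x7fffULL       /* bits 14:0: corrected error count threshold */
#define CMCI_STORM_THRESHOLD            32749

int main(void)
{
        /* Typical pre-storm state: CMCI enabled, threshold of 1. */
        uint64_t val = MCI_CTL2_CMCI_EN | 1;

        /* Intel storm handling: keep CMCI enabled, only raise the threshold. */
        uint64_t intel = (val & ~MCI_CTL2_CMCI_THRESHOLD_MASK) | CMCI_STORM_THRESHOLD;

        /*
         * Zhaoxin storm handling (this patch): clear CMCI_EN as well, so the
         * bank stops signalling entirely and is left to the polling timer.
         */
        uint64_t zhaoxin = (val & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK)) |
                           CMCI_STORM_THRESHOLD;

        printf("intel:   %#llx (CMCI_EN %s)\n", (unsigned long long)intel,
               (intel & MCI_CTL2_CMCI_EN) ? "set" : "clear");
        printf("zhaoxin: %#llx (CMCI_EN %s)\n", (unsigned long long)zhaoxin,
               (zhaoxin & MCI_CTL2_CMCI_EN) ? "set" : "clear");
        return 0;
}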
On Fri, Sep 20, 2024 at 11:44:59AM +0000, Zhuo, Qiuxu wrote:
> > From: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
> > [...]
> > >> --- a/arch/x86/kernel/cpu/mce/zhaoxin.c
> > >> +++ b/arch/x86/kernel/cpu/mce/zhaoxin.c
> > >> @@ -63,3 +63,21 @@ void mce_zhaoxin_feature_clear(struct cpuinfo_x86
> > >> *c) {
> > >>         intel_clear_lmce();
> > >> }
> > >> +
> > >> +void mce_zhaoxin_handle_storm(int bank, bool on) {
> > >> +        unsigned long flags;
> > >> +        u64 val;
> > >> +
> > >> +        raw_spin_lock_irqsave(&cmci_discover_lock, flags);
> > >> +        rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
> > >> +        if (on) {
> > >> +                val &= ~(MCI_CTL2_CMCI_EN |
> > >> MCI_CTL2_CMCI_THRESHOLD_MASK);
> > >> +                val |= CMCI_STORM_THRESHOLD;
> > >> +        } else {
> > >> +                val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
> > >> +                val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
> > >> +        }
> > >> +        wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
> > >> +        raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); }
> > >
> > > Are there any reasons or comments why it needs to disable/enable the
> > > CMCI interrupt here during a CMCI storm on/off? If not, then reuse
> > > mce_intel_handle_storm() to avoid duplicating the code.
> > >
> >
> > As explained in another email.
> > The reason is actually mentioned in the cover letter: "because Zhaoxin's UCR
> > error is not reported through CMCI", and we want to disable CMCI interrupt
> > when CMCI storm happened.
>
> So, this is just you want to disable CMCI when a CMCI storm happens.
> This doesn't explain much to me.
> What's the problem if not disable CMCI when a CMCI storm happens?
>

A more direct way to handle an interrupt storm is to turn off the interrupt.
The proposed AMD solution would also do this.

Reprogramming the threshold to a high value does not 100% guarantee that a
storm will be mitigated. But this is a necessary trade-off given that the
CMCI is used to report other error types on Intel systems.

Thanks,
Yazen
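As a rough illustration of the trade-off Yazen describes: with the default threshold of 1, a bank logging on the order of 10,000 corrected errors per second signals roughly 10,000 CMCIs per second, while a threshold of 32749 drops that to about one interrupt every three seconds. That usually tames the storm but still scales with the error rate; clearing CMCI_EN instead removes the interrupt source for that bank entirely and leaves the errors to be picked up by the periodic poller.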
> From: Yazen Ghannam <yazen.ghannam@amd.com>
> Sent: Friday, September 20, 2024 9:36 PM
> [...]
> > So, this is just you want to disable CMCI when a CMCI storm happens.
> > This doesn't explain much to me.
> > What's the problem if not disable CMCI when a CMCI storm happens?
> >
>
> A more direct way to handle an interrupt storm is to turn off the interrupt. The
> proposed AMD solution would also do this.
>
> Reprogramming the threshold to a high value does not 100% guarantee that a
> storm will be mitigated. But this is a necessary trade-off given that the CMCI is
> used to report other error types on Intel systems.

Thanks for your comments.
Resend this mail because I received the message: Undelivered Mail Returned to Sender

On 2024/9/20 20:09, Tony W Wang-oc wrote:
>
>
> On 2024/9/20 19:44, Zhuo, Qiuxu wrote:
>>
>>
>>> From: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
>>> [...]
>>>>> --- a/arch/x86/kernel/cpu/mce/zhaoxin.c
>>>>> +++ b/arch/x86/kernel/cpu/mce/zhaoxin.c
>>>>> @@ -63,3 +63,21 @@ void mce_zhaoxin_feature_clear(struct cpuinfo_x86
>>>>> *c) {
>>>>>         intel_clear_lmce();
>>>>> }
>>>>> +
>>>>> +void mce_zhaoxin_handle_storm(int bank, bool on) {
>>>>> +        unsigned long flags;
>>>>> +        u64 val;
>>>>> +
>>>>> +        raw_spin_lock_irqsave(&cmci_discover_lock, flags);
>>>>> +        rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
>>>>> +        if (on) {
>>>>> +                val &= ~(MCI_CTL2_CMCI_EN |
>>>>> MCI_CTL2_CMCI_THRESHOLD_MASK);
>>>>> +                val |= CMCI_STORM_THRESHOLD;
>>>>> +        } else {
>>>>> +                val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
>>>>> +                val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
>>>>> +        }
>>>>> +        wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
>>>>> +        raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); }
>>>>
>>>> Are there any reasons or comments why it needs to disable/enable the
>>>> CMCI interrupt here during a CMCI storm on/off? If not, then reuse
>>>> mce_intel_handle_storm() to avoid duplicating the code.
>>>>
>>>
>>> As explained in another email.
>>> The reason is actually mentioned in the cover letter: "because
>>> Zhaoxin's UCR
>>> error is not reported through CMCI", and we want to disable CMCI
>>> interrupt
>>> when CMCI storm happened.
>>
>> So, this is just you want to disable CMCI when a CMCI storm happens.
>> This doesn't explain much to me.
>> What's the problem if not disable CMCI when a CMCI storm happens?
>>
>
> In practice, we have encountered a lot of CE errors such as DRAM CE
> errors, so it feels safer to disable CMCI interrupt than to set a large
> threshold. At the same time, Zhaoxin's UCR is not reported through CMCI,
> so we implemented like this.
>
> Sincerely
> TonyWWang-oc
>
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index b7e67f4f7..aa75e2848 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -45,7 +45,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
  * cmci_discover_lock protects against parallel discovery attempts
  * which could race against each other.
  */
-static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
+DEFINE_RAW_SPINLOCK(cmci_discover_lock);
 
 /*
  * On systems that do support CMCI but it's disabled, polling for MCEs can
@@ -61,7 +61,7 @@ static DEFINE_SPINLOCK(cmci_poll_lock);
  * MCi_CTL2 threshold for each bank when there is no storm.
  * Default value for each bank may have been set by BIOS.
  */
-static u16 cmci_threshold[MAX_NR_BANKS];
+u16 cmci_threshold[MAX_NR_BANKS];
 
 /*
  * High threshold to limit CMCI rate during storms. Max supported is
@@ -73,7 +73,6 @@ static u16 cmci_threshold[MAX_NR_BANKS];
  * to corrected errors, so keeping CMCI enabled means that uncorrected
  * errors will still be processed in a timely fashion.
  */
-#define CMCI_STORM_THRESHOLD 32749
 
 static int cmci_supported(int *banks)
 {
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 836e56027..086b833c5 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -7,7 +7,7 @@
 
 #include <linux/device.h>
 #include <asm/mce.h>
-
+#include <linux/spinlock.h>
 enum severity_level {
         MCE_NO_SEVERITY,
         MCE_DEFERRED_SEVERITY,
@@ -334,11 +334,16 @@ static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
 }
 
 extern void (*mc_poll_banks)(void);
+#define CMCI_STORM_THRESHOLD 32749
+extern raw_spinlock_t cmci_discover_lock;
+extern u16 cmci_threshold[MAX_NR_BANKS];
 #ifdef CONFIG_X86_MCE_ZHAOXIN
 void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c);
 void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c);
+void mce_zhaoxin_handle_storm(int bank, bool on);
 #else
 static inline void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) { }
 static inline void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) { }
+static inline void mce_zhaoxin_handle_storm(int bank, bool on) { }
 #endif
 #endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index 89e31e1e5..200280387 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -63,6 +63,10 @@ static void mce_handle_storm(unsigned int bank, bool on)
         case X86_VENDOR_INTEL:
                 mce_intel_handle_storm(bank, on);
                 break;
+        case X86_VENDOR_ZHAOXIN:
+        case X86_VENDOR_CENTAUR:
+                mce_zhaoxin_handle_storm(bank, on);
+                break;
         }
 }
 
diff --git a/arch/x86/kernel/cpu/mce/zhaoxin.c b/arch/x86/kernel/cpu/mce/zhaoxin.c
index de69c560f..994b8520a 100644
--- a/arch/x86/kernel/cpu/mce/zhaoxin.c
+++ b/arch/x86/kernel/cpu/mce/zhaoxin.c
@@ -63,3 +63,21 @@ void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
 {
         intel_clear_lmce();
 }
+
+void mce_zhaoxin_handle_storm(int bank, bool on)
+{
+        unsigned long flags;
+        u64 val;
+
+        raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+        rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+        if (on) {
+                val &= ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK);
+                val |= CMCI_STORM_THRESHOLD;
+        } else {
+                val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+                val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
+        }
+        wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+        raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}