diff mbox series

[v3,3/3] x86/mce: Add CMCI storm switching support for Zhaoxin

Message ID 20240918055436.15551-4-TonyWWang-oc@zhaoxin.com (mailing list archive)
State New
Headers show
Series x86/mce: Add Zhaoxin MCE support | expand

Commit Message

Tony W Wang-oc Sept. 18, 2024, 5:54 a.m. UTC
From: Lyle Li <LyleLi@zhaoxin.com>

Zhaoxin CPUs support CMCI compatible with Intel, because
Zhaoxin's UCR error is not reported through CMCI, and in
order to be compatible with intel's CMCI code, so add Zhaoxin
CMCI storm toggle to support the new CMCI storm switching
in mce/intel.c, mce/zhaoxin.c, mce/threshold.c, and mce/internal.h.

Signed-off-by: Lyle Li <LyleLi@zhaoxin.com>
Signed-off-by: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
---
 arch/x86/kernel/cpu/mce/intel.c     |  5 ++---
 arch/x86/kernel/cpu/mce/internal.h  |  7 ++++++-
 arch/x86/kernel/cpu/mce/threshold.c |  4 ++++
 arch/x86/kernel/cpu/mce/zhaoxin.c   | 18 ++++++++++++++++++
 4 files changed, 30 insertions(+), 4 deletions(-)

Comments

Yazen Ghannam Sept. 19, 2024, 2:06 p.m. UTC | #1
On Wed, Sep 18, 2024 at 01:54:36PM +0800, Tony W Wang-oc wrote:
> From: Lyle Li <LyleLi@zhaoxin.com>
> 
> Zhaoxin CPUs support CMCI compatible with Intel, because
> Zhaoxin's UCR error is not reported through CMCI, and in
> order to be compatible with intel's CMCI code, so add Zhaoxin
> CMCI storm toggle to support the new CMCI storm switching
> in mce/intel.c, mce/zhaoxin.c, mce/threshold.c, and mce/internal.h.
> 
> Signed-off-by: Lyle Li <LyleLi@zhaoxin.com>
> Signed-off-by: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
> ---
>  arch/x86/kernel/cpu/mce/intel.c     |  5 ++---
>  arch/x86/kernel/cpu/mce/internal.h  |  7 ++++++-
>  arch/x86/kernel/cpu/mce/threshold.c |  4 ++++
>  arch/x86/kernel/cpu/mce/zhaoxin.c   | 18 ++++++++++++++++++
>  4 files changed, 30 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
> index b7e67f4f7..aa75e2848 100644
> --- a/arch/x86/kernel/cpu/mce/intel.c
> +++ b/arch/x86/kernel/cpu/mce/intel.c
> @@ -45,7 +45,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
>   * cmci_discover_lock protects against parallel discovery attempts
>   * which could race against each other.
>   */
> -static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
> +DEFINE_RAW_SPINLOCK(cmci_discover_lock);
>  
>  /*
>   * On systems that do support CMCI but it's disabled, polling for MCEs can
> @@ -61,7 +61,7 @@ static DEFINE_SPINLOCK(cmci_poll_lock);
>   * MCi_CTL2 threshold for each bank when there is no storm.
>   * Default value for each bank may have been set by BIOS.
>   */
> -static u16 cmci_threshold[MAX_NR_BANKS];
> +u16 cmci_threshold[MAX_NR_BANKS];
>  
>  /*
>   * High threshold to limit CMCI rate during storms. Max supported is
> @@ -73,7 +73,6 @@ static u16 cmci_threshold[MAX_NR_BANKS];
>   * to corrected errors, so keeping CMCI enabled means that uncorrected
>   * errors will still be processed in a timely fashion.
>   */
> -#define CMCI_STORM_THRESHOLD	32749
>  
>  static int cmci_supported(int *banks)
>  {
> diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
> index 836e56027..086b833c5 100644
> --- a/arch/x86/kernel/cpu/mce/internal.h
> +++ b/arch/x86/kernel/cpu/mce/internal.h
> @@ -7,7 +7,7 @@
>  
>  #include <linux/device.h>
>  #include <asm/mce.h>
> -
> +#include <linux/spinlock.h>
>  enum severity_level {
>  	MCE_NO_SEVERITY,
>  	MCE_DEFERRED_SEVERITY,
> @@ -334,11 +334,16 @@ static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
>  }
>  
>  extern void (*mc_poll_banks)(void);
> +#define CMCI_STORM_THRESHOLD    32749
> +extern raw_spinlock_t cmci_discover_lock;
> +extern u16 cmci_threshold[MAX_NR_BANKS];
>  #ifdef CONFIG_X86_MCE_ZHAOXIN
>  void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c);
>  void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c);
> +void mce_zhaoxin_handle_storm(int bank, bool on);
>  #else
>  static inline void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) { }
>  static inline void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) { }
> +static inline void mce_zhaoxin_handle_storm(int bank, bool on) { }
>  #endif
>  #endif /* __X86_MCE_INTERNAL_H__ */
> diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
> index 89e31e1e5..200280387 100644
> --- a/arch/x86/kernel/cpu/mce/threshold.c
> +++ b/arch/x86/kernel/cpu/mce/threshold.c
> @@ -63,6 +63,10 @@ static void mce_handle_storm(unsigned int bank, bool on)
>  	case X86_VENDOR_INTEL:
>  		mce_intel_handle_storm(bank, on);
>  		break;
> +	case X86_VENDOR_ZHAOXIN:
> +	case X86_VENDOR_CENTAUR:
> +		mce_zhaoxin_handle_storm(bank, on);
> +		break;
>  	}
>  }
>  
> diff --git a/arch/x86/kernel/cpu/mce/zhaoxin.c b/arch/x86/kernel/cpu/mce/zhaoxin.c
> index de69c560f..994b8520a 100644
> --- a/arch/x86/kernel/cpu/mce/zhaoxin.c
> +++ b/arch/x86/kernel/cpu/mce/zhaoxin.c
> @@ -63,3 +63,21 @@ void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
>  {
>  	intel_clear_lmce();
>  }
> +
> +void mce_zhaoxin_handle_storm(int bank, bool on)
> +{
> +	unsigned long flags;
> +	u64 val;
> +
> +	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
> +	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
> +	if (on) {
> +		val &= ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK);
> +		val |= CMCI_STORM_THRESHOLD;
> +	} else {
> +		val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
> +		val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
> +	}
> +	wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
> +	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
> +}

Why does this need to be different than mce_intel_handle_storm()?

Thanks,
Yazen
Zhuo, Qiuxu Sept. 20, 2024, 9:17 a.m. UTC | #2
> From: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
> [...]
> Subject: [PATCH v3 3/3] x86/mce: Add CMCI storm switching support for
> [...]
> --- a/arch/x86/kernel/cpu/mce/zhaoxin.c
> +++ b/arch/x86/kernel/cpu/mce/zhaoxin.c
> @@ -63,3 +63,21 @@ void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
> {
>  	intel_clear_lmce();
>  }
> +
> +void mce_zhaoxin_handle_storm(int bank, bool on) {
> +	unsigned long flags;
> +	u64 val;
> +
> +	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
> +	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
> +	if (on) {
> +		val &= ~(MCI_CTL2_CMCI_EN |
> MCI_CTL2_CMCI_THRESHOLD_MASK);
> +		val |= CMCI_STORM_THRESHOLD;
> +	} else {
> +		val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
> +		val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
> +	}
> +	wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
> +	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); }

Are there any reasons or comments why it needs to disable/enable the CMCI interrupt 
here during a CMCI storm on/off? If not, then reuse mce_intel_handle_storm() to avoid 
duplicating the code.

-Qiuxu
Tony W Wang-oc Sept. 20, 2024, 10:42 a.m. UTC | #3
On 2024/9/19 22:06, Yazen Ghannam wrote:
> 
>> +void mce_zhaoxin_handle_storm(int bank, bool on)
>> +{
>> +     unsigned long flags;
>> +     u64 val;
>> +
>> +     raw_spin_lock_irqsave(&cmci_discover_lock, flags);
>> +     rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
>> +     if (on) {
>> +             val &= ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK);
>> +             val |= CMCI_STORM_THRESHOLD;
>> +     } else {
>> +             val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
>> +             val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
>> +     }
>> +     wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
>> +     raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
>> +}
> 
> Why does this need to be different than mce_intel_handle_storm()?
> 

The reason is actually mentioned in the cover letter: "because Zhaoxin's 
UCR error is not reported through CMCI", and we want to disable CMCI 
interrupt when CMCI storm happened.

Sincerely
TonyWWang-oc
Tony W Wang-oc Sept. 20, 2024, 10:42 a.m. UTC | #4
On 2024/9/20 17:17, Zhuo, Qiuxu wrote:
> 
>> From: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
>> [...]
>> Subject: [PATCH v3 3/3] x86/mce: Add CMCI storm switching support for
>> [...]
>> --- a/arch/x86/kernel/cpu/mce/zhaoxin.c
>> +++ b/arch/x86/kernel/cpu/mce/zhaoxin.c
>> @@ -63,3 +63,21 @@ void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
>> {
>>        intel_clear_lmce();
>>   }
>> +
>> +void mce_zhaoxin_handle_storm(int bank, bool on) {
>> +     unsigned long flags;
>> +     u64 val;
>> +
>> +     raw_spin_lock_irqsave(&cmci_discover_lock, flags);
>> +     rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
>> +     if (on) {
>> +             val &= ~(MCI_CTL2_CMCI_EN |
>> MCI_CTL2_CMCI_THRESHOLD_MASK);
>> +             val |= CMCI_STORM_THRESHOLD;
>> +     } else {
>> +             val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
>> +             val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
>> +     }
>> +     wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
>> +     raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); }
> 
> Are there any reasons or comments why it needs to disable/enable the CMCI interrupt
> here during a CMCI storm on/off? If not, then reuse mce_intel_handle_storm() to avoid
> duplicating the code.
> 

As explained in another email.
The reason is actually mentioned in the cover letter: "because Zhaoxin's 
UCR error is not reported through CMCI", and we want to disable CMCI 
interrupt when CMCI storm happened.

Sincerely
TonyWWang-oc
Zhuo, Qiuxu Sept. 20, 2024, 11:44 a.m. UTC | #5
> From: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
> [...]
> >> --- a/arch/x86/kernel/cpu/mce/zhaoxin.c
> >> +++ b/arch/x86/kernel/cpu/mce/zhaoxin.c
> >> @@ -63,3 +63,21 @@ void mce_zhaoxin_feature_clear(struct cpuinfo_x86
> >> *c) {
> >>        intel_clear_lmce();
> >>   }
> >> +
> >> +void mce_zhaoxin_handle_storm(int bank, bool on) {
> >> +     unsigned long flags;
> >> +     u64 val;
> >> +
> >> +     raw_spin_lock_irqsave(&cmci_discover_lock, flags);
> >> +     rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
> >> +     if (on) {
> >> +             val &= ~(MCI_CTL2_CMCI_EN |
> >> MCI_CTL2_CMCI_THRESHOLD_MASK);
> >> +             val |= CMCI_STORM_THRESHOLD;
> >> +     } else {
> >> +             val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
> >> +             val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
> >> +     }
> >> +     wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
> >> +     raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); }
> >
> > Are there any reasons or comments why it needs to disable/enable the
> > CMCI interrupt here during a CMCI storm on/off? If not, then reuse
> > mce_intel_handle_storm() to avoid duplicating the code.
> >
> 
> As explained in another email.
> The reason is actually mentioned in the cover letter: "because Zhaoxin's UCR
> error is not reported through CMCI", and we want to disable CMCI interrupt
> when CMCI storm happened.

So, this is just you want to disable CMCI when a CMCI storm happens. 
This doesn't explain much to me.
What's the problem if not disable CMCI when a CMCI storm happens?
Yazen Ghannam Sept. 20, 2024, 1:23 p.m. UTC | #6
On Fri, Sep 20, 2024 at 06:42:15PM +0800, Tony W Wang-oc wrote:
> 
> 
> On 2024/9/19 22:06, Yazen Ghannam wrote:
> > 
> > > +void mce_zhaoxin_handle_storm(int bank, bool on)
> > > +{
> > > +     unsigned long flags;
> > > +     u64 val;
> > > +
> > > +     raw_spin_lock_irqsave(&cmci_discover_lock, flags);
> > > +     rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
> > > +     if (on) {
> > > +             val &= ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK);
> > > +             val |= CMCI_STORM_THRESHOLD;
> > > +     } else {
> > > +             val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
> > > +             val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
> > > +     }
> > > +     wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
> > > +     raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
> > > +}
> > 
> > Why does this need to be different than mce_intel_handle_storm()?
> > 
> 
> The reason is actually mentioned in the cover letter: "because Zhaoxin's UCR
> error is not reported through CMCI", and we want to disable CMCI interrupt
> when CMCI storm happened.
>

I see, so you want to disable the interrupt entirely rather than
reprogramming to a high value. Makes sense.

Thanks,
Yazen
Yazen Ghannam Sept. 20, 2024, 1:36 p.m. UTC | #7
On Fri, Sep 20, 2024 at 11:44:59AM +0000, Zhuo, Qiuxu wrote:
> > From: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
> > [...]
> > >> --- a/arch/x86/kernel/cpu/mce/zhaoxin.c
> > >> +++ b/arch/x86/kernel/cpu/mce/zhaoxin.c
> > >> @@ -63,3 +63,21 @@ void mce_zhaoxin_feature_clear(struct cpuinfo_x86
> > >> *c) {
> > >>        intel_clear_lmce();
> > >>   }
> > >> +
> > >> +void mce_zhaoxin_handle_storm(int bank, bool on) {
> > >> +     unsigned long flags;
> > >> +     u64 val;
> > >> +
> > >> +     raw_spin_lock_irqsave(&cmci_discover_lock, flags);
> > >> +     rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
> > >> +     if (on) {
> > >> +             val &= ~(MCI_CTL2_CMCI_EN |
> > >> MCI_CTL2_CMCI_THRESHOLD_MASK);
> > >> +             val |= CMCI_STORM_THRESHOLD;
> > >> +     } else {
> > >> +             val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
> > >> +             val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
> > >> +     }
> > >> +     wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
> > >> +     raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); }
> > >
> > > Are there any reasons or comments why it needs to disable/enable the
> > > CMCI interrupt here during a CMCI storm on/off? If not, then reuse
> > > mce_intel_handle_storm() to avoid duplicating the code.
> > >
> > 
> > As explained in another email.
> > The reason is actually mentioned in the cover letter: "because Zhaoxin's UCR
> > error is not reported through CMCI", and we want to disable CMCI interrupt
> > when CMCI storm happened.
> 
> So, this is just you want to disable CMCI when a CMCI storm happens. 
> This doesn't explain much to me.
> What's the problem if not disable CMCI when a CMCI storm happens?
>

A more direct way to handle an interrupt storm is to turn off the
interrupt. The proposed AMD solution would also do this.

Reprogramming the threshold to a high value does not 100% guarantee that
a storm will be mitigated. But this is a necessary trade-off given that
the CMCI is used to report other error types on Intel systems.

Thanks,
Yazen
Zhuo, Qiuxu Sept. 21, 2024, 5:39 a.m. UTC | #8
> From: Yazen Ghannam <yazen.ghannam@amd.com>
> Sent: Friday, September 20, 2024 9:36 PM
> [...]
> > So, this is just you want to disable CMCI when a CMCI storm happens.
> > This doesn't explain much to me.
> > What's the problem if not disable CMCI when a CMCI storm happens?
> >
> 
> A more direct way to handle an interrupt storm is to turn off the interrupt. The
> proposed AMD solution would also do this.
> 
> Reprogramming the threshold to a high value does not 100% guarantee that a
> storm will be mitigated. But this is a necessary trade-off given that the CMCI is
> used to report other error types on Intel systems.

Thanks for your comments.
Tony W Wang-oc Sept. 23, 2024, 3:23 a.m. UTC | #9
Resend this mail because I received the message: Undelivered Mail 
Returned to Sender

On 2024/9/20 20:09, Tony W Wang-oc wrote:
> 
> 
> On 2024/9/20 19:44, Zhuo, Qiuxu wrote:
>>
>>
>>> From: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
>>> [...]
>>>>> --- a/arch/x86/kernel/cpu/mce/zhaoxin.c
>>>>> +++ b/arch/x86/kernel/cpu/mce/zhaoxin.c
>>>>> @@ -63,3 +63,21 @@ void mce_zhaoxin_feature_clear(struct cpuinfo_x86
>>>>> *c) {
>>>>>         intel_clear_lmce();
>>>>>    }
>>>>> +
>>>>> +void mce_zhaoxin_handle_storm(int bank, bool on) {
>>>>> +     unsigned long flags;
>>>>> +     u64 val;
>>>>> +
>>>>> +     raw_spin_lock_irqsave(&cmci_discover_lock, flags);
>>>>> +     rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
>>>>> +     if (on) {
>>>>> +             val &= ~(MCI_CTL2_CMCI_EN |
>>>>> MCI_CTL2_CMCI_THRESHOLD_MASK);
>>>>> +             val |= CMCI_STORM_THRESHOLD;
>>>>> +     } else {
>>>>> +             val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
>>>>> +             val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
>>>>> +     }
>>>>> +     wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
>>>>> +     raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); }
>>>>
>>>> Are there any reasons or comments why it needs to disable/enable the
>>>> CMCI interrupt here during a CMCI storm on/off? If not, then reuse
>>>> mce_intel_handle_storm() to avoid duplicating the code.
>>>>
>>>
>>> As explained in another email.
>>> The reason is actually mentioned in the cover letter: "because 
>>> Zhaoxin's UCR
>>> error is not reported through CMCI", and we want to disable CMCI 
>>> interrupt
>>> when CMCI storm happened.
>>
>> So, this is just you want to disable CMCI when a CMCI storm happens.
>> This doesn't explain much to me.
>> What's the problem if not disable CMCI when a CMCI storm happens?
>>
> 
> In practice, we have encountered a lot of CE errors such as DRAM CE 
> errors, so it feels safer to disable CMCI interrupt than to set a large 
> threshold. At the same time, Zhaoxin's UCR is not reported through CMCI, 
> so we implemented like this.
> 
> Sincerely
> TonyWWang-oc
>
diff mbox series

Patch

diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index b7e67f4f7..aa75e2848 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -45,7 +45,7 @@  static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
  * cmci_discover_lock protects against parallel discovery attempts
  * which could race against each other.
  */
-static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
+DEFINE_RAW_SPINLOCK(cmci_discover_lock);
 
 /*
  * On systems that do support CMCI but it's disabled, polling for MCEs can
@@ -61,7 +61,7 @@  static DEFINE_SPINLOCK(cmci_poll_lock);
  * MCi_CTL2 threshold for each bank when there is no storm.
  * Default value for each bank may have been set by BIOS.
  */
-static u16 cmci_threshold[MAX_NR_BANKS];
+u16 cmci_threshold[MAX_NR_BANKS];
 
 /*
  * High threshold to limit CMCI rate during storms. Max supported is
@@ -73,7 +73,6 @@  static u16 cmci_threshold[MAX_NR_BANKS];
  * to corrected errors, so keeping CMCI enabled means that uncorrected
  * errors will still be processed in a timely fashion.
  */
-#define CMCI_STORM_THRESHOLD	32749
 
 static int cmci_supported(int *banks)
 {
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 836e56027..086b833c5 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -7,7 +7,7 @@ 
 
 #include <linux/device.h>
 #include <asm/mce.h>
-
+#include <linux/spinlock.h>
 enum severity_level {
 	MCE_NO_SEVERITY,
 	MCE_DEFERRED_SEVERITY,
@@ -334,11 +334,16 @@  static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
 }
 
 extern void (*mc_poll_banks)(void);
+#define CMCI_STORM_THRESHOLD    32749
+extern raw_spinlock_t cmci_discover_lock;
+extern u16 cmci_threshold[MAX_NR_BANKS];
 #ifdef CONFIG_X86_MCE_ZHAOXIN
 void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c);
 void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c);
+void mce_zhaoxin_handle_storm(int bank, bool on);
 #else
 static inline void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) { }
 static inline void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) { }
+static inline void mce_zhaoxin_handle_storm(int bank, bool on) { }
 #endif
 #endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index 89e31e1e5..200280387 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -63,6 +63,10 @@  static void mce_handle_storm(unsigned int bank, bool on)
 	case X86_VENDOR_INTEL:
 		mce_intel_handle_storm(bank, on);
 		break;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		mce_zhaoxin_handle_storm(bank, on);
+		break;
 	}
 }
 
diff --git a/arch/x86/kernel/cpu/mce/zhaoxin.c b/arch/x86/kernel/cpu/mce/zhaoxin.c
index de69c560f..994b8520a 100644
--- a/arch/x86/kernel/cpu/mce/zhaoxin.c
+++ b/arch/x86/kernel/cpu/mce/zhaoxin.c
@@ -63,3 +63,21 @@  void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
 {
 	intel_clear_lmce();
 }
+
+void mce_zhaoxin_handle_storm(int bank, bool on)
+{
+	unsigned long flags;
+	u64 val;
+
+	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+	if (on) {
+		val &= ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK);
+		val |= CMCI_STORM_THRESHOLD;
+	} else {
+		val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+		val |= (MCI_CTL2_CMCI_EN | cmci_threshold[bank]);
+	}
+	wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}