Message ID | 20230616182744.17632-4-tony.luck@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Handle corrected machine check interrupt storms | expand |
On Fri, Jun 16, 2023 at 11:27:43AM -0700, Tony Luck wrote: > +static void _reset_block(struct threshold_block *block) > +{ > + struct thresh_restart tr; > + > + memset(&tr, 0, sizeof(tr)); > + tr.b = block; > + threshold_restart_bank(&tr); > +} > + > +static void toggle_interrupt_reset_block(struct threshold_block *block, bool on) > +{ > + if (!block) > + return; > + > + block->interrupt_enable = !!on; > + _reset_block(block); > +} > + > +void mce_amd_handle_storm(int bank, bool on) > +{ > + struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; > + struct threshold_bank **bp = this_cpu_read(threshold_banks); > + unsigned long flags; > + > + if (!bp) > + return; > + > + local_irq_save(flags); > + > + first_block = bp[bank]->blocks; > + if (!first_block) > + goto end; > + > + toggle_interrupt_reset_block(first_block, on); > + > + list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) > + toggle_interrupt_reset_block(block, on); > +end: > + local_irq_restore(flags); > +} There's already other code which does this threshold block control. Pls refactor and unify it instead of adding almost redundant similar functions. > static void mce_threshold_block_init(struct threshold_block *b, int offset) > { > struct thresh_restart tr = { > @@ -868,6 +909,7 @@ static void amd_threshold_interrupt(void) > struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; > struct threshold_bank **bp = this_cpu_read(threshold_banks); > unsigned int bank, cpu = smp_processor_id(); > + u64 status; > > /* > * Validate that the threshold bank has been initialized already. The > @@ -881,6 +923,13 @@ static void amd_threshold_interrupt(void) > if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) > continue; > > + rdmsrl(mca_msr_reg(bank, MCA_STATUS), status); > + track_cmci_storm(bank, status); So this is called from interrupt context. There's another track_cmci_storm() from machine_check_poll() which can happen in process context. And there's no sync (locking) between the two. Not good. Why are even two calls needed on AMD?
On 6/23/2023 10:45 AM, Borislav Petkov wrote: > On Fri, Jun 16, 2023 at 11:27:43AM -0700, Tony Luck wrote: >> +static void _reset_block(struct threshold_block *block) >> +{ >> + struct thresh_restart tr; >> + >> + memset(&tr, 0, sizeof(tr)); >> + tr.b = block; >> + threshold_restart_bank(&tr); >> +} > >> + >> +static void toggle_interrupt_reset_block(struct threshold_block *block, bool on) >> +{ >> + if (!block) >> + return; >> + >> + block->interrupt_enable = !!on; >> + _reset_block(block); >> +} >> + >> +void mce_amd_handle_storm(int bank, bool on) >> +{ >> + struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; >> + struct threshold_bank **bp = this_cpu_read(threshold_banks); >> + unsigned long flags; >> + >> + if (!bp) >> + return; >> + >> + local_irq_save(flags); >> + >> + first_block = bp[bank]->blocks; >> + if (!first_block) >> + goto end; >> + >> + toggle_interrupt_reset_block(first_block, on); >> + >> + list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) >> + toggle_interrupt_reset_block(block, on); >> +end: >> + local_irq_restore(flags); >> +} > > There's already other code which does this threshold block control. Pls > refactor and unify it instead of adding almost redundant similar functions. > Okay, will do. >> static void mce_threshold_block_init(struct threshold_block *b, int offset) >> { >> struct thresh_restart tr = { >> @@ -868,6 +909,7 @@ static void amd_threshold_interrupt(void) >> struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; >> struct threshold_bank **bp = this_cpu_read(threshold_banks); >> unsigned int bank, cpu = smp_processor_id(); >> + u64 status; >> >> /* >> * Validate that the threshold bank has been initialized already. The >> @@ -881,6 +923,13 @@ static void amd_threshold_interrupt(void) >> if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) >> continue; >> >> + rdmsrl(mca_msr_reg(bank, MCA_STATUS), status); >> + track_cmci_storm(bank, status); > > So this is called from interrupt context. > > There's another track_cmci_storm() from machine_check_poll() which can > happen in process context. > > And there's no sync (locking) between the two. Not good. > > Why are even two calls needed on AMD? > I think because the AMD interrupt handlers don't call machine_check_poll(). This is a good opportunity to unify the AMD thresholding and deferred error interrupt handlers with machine_check_poll(). Tony, Please leave out this AMD patch for now. I'll work on refactoring it. Thanks, Yazen
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index eae88a824d97..22899d28138f 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -232,6 +232,7 @@ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); +void mce_amd_handle_storm(int bank, bool on); /* * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits @@ -259,6 +260,7 @@ static __always_inline void smca_extract_err_addr(struct mce *m) #else static inline bool amd_filter_mce(struct mce *m) { return false; } +static inline void mce_amd_handle_storm(int bank, bool on) {} static inline void smca_extract_err_addr(struct mce *m) { } #endif diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 0b971f974096..b19f3eb70187 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -468,6 +468,47 @@ static void threshold_restart_bank(void *_tr) wrmsr(tr->b->address, lo, hi); } +static void _reset_block(struct threshold_block *block) +{ + struct thresh_restart tr; + + memset(&tr, 0, sizeof(tr)); + tr.b = block; + threshold_restart_bank(&tr); +} + +static void toggle_interrupt_reset_block(struct threshold_block *block, bool on) +{ + if (!block) + return; + + block->interrupt_enable = !!on; + _reset_block(block); +} + +void mce_amd_handle_storm(int bank, bool on) +{ + struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; + struct threshold_bank **bp = this_cpu_read(threshold_banks); + unsigned long flags; + + if (!bp) + return; + + local_irq_save(flags); + + first_block = bp[bank]->blocks; + if (!first_block) + goto end; + + toggle_interrupt_reset_block(first_block, on); + + list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + toggle_interrupt_reset_block(block, on); +end: + local_irq_restore(flags); +} + static void mce_threshold_block_init(struct threshold_block *b, int offset) { struct thresh_restart tr = { @@ -868,6 +909,7 @@ static void amd_threshold_interrupt(void) struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; struct threshold_bank **bp = this_cpu_read(threshold_banks); unsigned int bank, cpu = smp_processor_id(); + u64 status; /* * Validate that the threshold bank has been initialized already. The @@ -881,6 +923,13 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) continue; + rdmsrl(mca_msr_reg(bank, MCA_STATUS), status); + track_cmci_storm(bank, status); + + /* Return early on an interrupt storm */ + if (this_cpu_read(storm_desc.bank_storm[bank])) + return; + first_block = bp[bank]->blocks; if (!first_block) continue; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index cd9d9ea5bb0a..d4c9dc194d56 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2055,6 +2055,9 @@ static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) void mce_handle_storm(int bank, bool on) { switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + mce_amd_handle_storm(bank, on); + break; } }