Message ID | 20240523155641.2805411-5-yazen.ghannam@amd.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | AMD MCA interrupts rework | expand |
On Thu, May 23, 2024 at 10:56:36AM -0500, Yazen Ghannam wrote: > @@ -709,48 +747,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) > if (!mca_cfg.cmci_disabled) > mce_track_storm(&m); > > - /* If this entry is not valid, ignore it */ > - if (!(m.status & MCI_STATUS_VAL)) > + if (!log_poll_error(flags, &m)) > continue; > > - /* > - * If we are logging everything (at CPU online) or this > - * is a corrected error, then we must log it. > - */ > - if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) > - goto log_it; > - > - /* > - * Newer Intel systems that support software error > - * recovery need to make additional checks. Other > - * CPUs should skip over uncorrected errors, but log > - * everything else. > - */ You lost that comment. > - if (!mca_cfg.ser) { > - if (m.status & MCI_STATUS_UC) > - continue; > - goto log_it; > - } > - > - /* Log "not enabled" (speculative) errors */ > - if (!(m.status & MCI_STATUS_EN)) > - goto log_it; > - > - /* > - * Log UCNA (SDM: 15.6.3 "UCR Error Classification") > - * UC == 1 && PCC == 0 && S == 0 > - */ > - if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) > - goto log_it; > - > - /* > - * Skip anything else. Presumption is that our read of this > - * bank is racing with a machine check. Leave the log alone > - * for do_machine_check() to deal with it. > - */ > - continue; > - > -log_it: > if (flags & MCP_DONTLOG) > goto clear_it; Btw, the code looks really weird now: if (!log_poll_error(flags, &m)) continue; if (flags & MCP_DONTLOG) goto clear_it; i.e., 1. Should I log it? 2. Should I not log it? Oh well, it was like that before logically so...
On Mon, Jun 03, 2024 at 07:37:27PM +0200, Borislav Petkov wrote: > On Thu, May 23, 2024 at 10:56:36AM -0500, Yazen Ghannam wrote: > > @@ -709,48 +747,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) > > if (!mca_cfg.cmci_disabled) > > mce_track_storm(&m); > > > > - /* If this entry is not valid, ignore it */ > > - if (!(m.status & MCI_STATUS_VAL)) > > + if (!log_poll_error(flags, &m)) > > continue; > > > > - /* > > - * If we are logging everything (at CPU online) or this > > - * is a corrected error, then we must log it. > > - */ > > - if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) > > - goto log_it; > > - > > - /* > > - * Newer Intel systems that support software error > > - * recovery need to make additional checks. Other > > - * CPUs should skip over uncorrected errors, but log > > - * everything else. > > - */ > > You lost that comment. > Sorry, will keep it. > > - if (!mca_cfg.ser) { > > - if (m.status & MCI_STATUS_UC) > > - continue; > > - goto log_it; > > - } > > - > > - /* Log "not enabled" (speculative) errors */ > > - if (!(m.status & MCI_STATUS_EN)) > > - goto log_it; > > - > > - /* > > - * Log UCNA (SDM: 15.6.3 "UCR Error Classification") > > - * UC == 1 && PCC == 0 && S == 0 > > - */ > > - if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) > > - goto log_it; > > - > > - /* > > - * Skip anything else. Presumption is that our read of this > > - * bank is racing with a machine check. Leave the log alone > > - * for do_machine_check() to deal with it. > > - */ > > - continue; > > - > > -log_it: > > if (flags & MCP_DONTLOG) > > goto clear_it; > > Btw, the code looks really weird now: > > if (!log_poll_error(flags, &m)) > continue; > > if (flags & MCP_DONTLOG) > goto clear_it; > > i.e., > > 1. Should I log it? > > 2. Should I not log it? > > Oh well, it was like that before logically so... > We can rename the new function and add comments. What do you think of the change below? Thanks, Yazen @@ -797,9 +797,11 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!mca_cfg.cmci_disabled) mce_track_storm(&m); - if (!log_poll_error(flags, &m, &status_reg)) + /* Verify that the error should be logged based on hardware conditions. */ + if (!loggable_poll_error(flags, &m, &status_reg)) continue; + /* Clear a loggable error, e.g., one leftover from boot time. */ if (flags & MCP_DONTLOG) goto clear_it;
On Fri, Jul 26, 2024 at 10:00:30AM -0400, Yazen Ghannam wrote: > @@ -797,9 +797,11 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) > if (!mca_cfg.cmci_disabled) > mce_track_storm(&m); > > - if (!log_poll_error(flags, &m, &status_reg)) > + /* Verify that the error should be logged based on hardware conditions. */ > + if (!loggable_poll_error(flags, &m, &status_reg)) Right, or do if (!should_log_poll_error(...) as it becomes maximally readable then - like a plain English sentence. > continue; > > + /* Clear a loggable error, e.g., one leftover from boot time. */ > if (flags & MCP_DONTLOG) > goto clear_it; So yeah, I guess.
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 70c8df1a766a..704e651203b4 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -662,6 +662,44 @@ static noinstr void mce_read_aux(struct mce *m, int i) DEFINE_PER_CPU(unsigned, mce_poll_count); +static bool ser_log_poll_error(struct mce *m) +{ + /* Log "not enabled" (speculative) errors */ + if (!(m->status & MCI_STATUS_EN)) + return true; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) + return true; + + return false; +} + +static bool log_poll_error(enum mcp_flags flags, struct mce *m) +{ + /* If this entry is not valid, ignore it. */ + if (!(m->status & MCI_STATUS_VAL)) + return false; + + /* + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. + */ + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) + return true; + + if (mca_cfg.ser) + return ser_log_poll_error(m); + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} + /* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. @@ -709,48 +747,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!mca_cfg.cmci_disabled) mce_track_storm(&m); - /* If this entry is not valid, ignore it */ - if (!(m.status & MCI_STATUS_VAL)) + if (!log_poll_error(flags, &m)) continue; - /* - * If we are logging everything (at CPU online) or this - * is a corrected error, then we must log it. - */ - if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) - goto log_it; - - /* - * Newer Intel systems that support software error - * recovery need to make additional checks. Other - * CPUs should skip over uncorrected errors, but log - * everything else. - */ - if (!mca_cfg.ser) { - if (m.status & MCI_STATUS_UC) - continue; - goto log_it; - } - - /* Log "not enabled" (speculative) errors */ - if (!(m.status & MCI_STATUS_EN)) - goto log_it; - - /* - * Log UCNA (SDM: 15.6.3 "UCR Error Classification") - * UC == 1 && PCC == 0 && S == 0 - */ - if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) - goto log_it; - - /* - * Skip anything else. Presumption is that our read of this - * bank is racing with a machine check. Leave the log alone - * for do_machine_check() to deal with it. - */ - continue; - -log_it: if (flags & MCP_DONTLOG) goto clear_it;
There are a number of generic and vendor-specific status checks in machine_check_poll(). These are used to determine if an error should be skipped. Move these into helper functions. Future vendor-specific checks will be added to the helpers. Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com> --- arch/x86/kernel/cpu/mce/core.c | 79 +++++++++++++++++----------------- 1 file changed, 39 insertions(+), 40 deletions(-)