diff mbox series

drivers/cros_ec: Handle CrOS EC panics

Message ID 20221221185540.2265771-1-robbarnes@google.com (mailing list archive)
State Superseded
Headers show
Series drivers/cros_ec: Handle CrOS EC panics | expand

Commit Message

Rob Barnes Dec. 21, 2022, 6:55 p.m. UTC
Add handler for CrOS EC panic events. When a panic is reported,
poll EC log then force an orderly shutdown.

This will preserve the EC log leading up to the crash.

Signed-off-by: Rob Barnes <robbarnes@google.com>
---
 drivers/platform/chrome/cros_ec_debugfs.c   | 24 +++++++++++++++++++++
 drivers/platform/chrome/cros_ec_lpc.c       | 10 +++++++++
 include/linux/platform_data/cros_ec_proto.h |  9 ++++++++
 3 files changed, 43 insertions(+)

Comments

Prashant Malani Dec. 21, 2022, 7:23 p.m. UTC | #1
Hi Rob,

I'd suggest using the commit title tag "platform/chrome: cros_ec: ..."
for commits which are "ChromeOS EC" wide. That's in line with
other recent commits in this directory.

On Wed, Dec 21, 2022 at 10:56 AM Rob Barnes <robbarnes@google.com> wrote:
>
> Add handler for CrOS EC panic events. When a panic is reported,
> poll EC log then force an orderly shutdown.
>
> This will preserve the EC log leading up to the crash.
>
> Signed-off-by: Rob Barnes <robbarnes@google.com>
> ---
>  drivers/platform/chrome/cros_ec_debugfs.c   | 24 +++++++++++++++++++++
>  drivers/platform/chrome/cros_ec_lpc.c       | 10 +++++++++
>  include/linux/platform_data/cros_ec_proto.h |  9 ++++++++
>  3 files changed, 43 insertions(+)
>
> diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c
> index 21d973fc6be2..31637a4e4cf9 100644
> --- a/drivers/platform/chrome/cros_ec_debugfs.c
> +++ b/drivers/platform/chrome/cros_ec_debugfs.c
> @@ -49,6 +49,7 @@ struct cros_ec_debugfs {
>         struct delayed_work log_poll_work;
>         /* EC panicinfo */
>         struct debugfs_blob_wrapper panicinfo_blob;
> +       struct notifier_block notifier_panic;
>  };
>
>  /*
> @@ -437,6 +438,23 @@ static int cros_ec_create_panicinfo(struct cros_ec_debugfs *debug_info)
>         return ret;
>  }
>
> +static int cros_ec_debugfs_panic_event(struct notifier_block *nb,
> +                                      unsigned long queued_during_suspend,
> +                                      void *_notify)
> +{
> +       struct cros_ec_debugfs *debug_info =
> +               container_of(nb, struct cros_ec_debugfs, notifier_panic);
> +
> +       if (debug_info->log_buffer.buf) {
> +               /* Force log poll work to run immediately */
> +               mod_delayed_work(debug_info->log_poll_work.wq, &debug_info->log_poll_work, 0);
> +               /* Block until log poll work finishes */
> +               flush_delayed_work(&debug_info->log_poll_work);
> +       }
> +
> +       return NOTIFY_DONE;
> +}
> +
>  static int cros_ec_debugfs_probe(struct platform_device *pd)
>  {
>         struct cros_ec_dev *ec = dev_get_drvdata(pd->dev.parent);
> @@ -473,6 +491,12 @@ static int cros_ec_debugfs_probe(struct platform_device *pd)
>         debugfs_create_u16("suspend_timeout_ms", 0664, debug_info->dir,
>                            &ec->ec_dev->suspend_timeout_ms);
>
> +       debug_info->notifier_panic.notifier_call = cros_ec_debugfs_panic_event;
> +       ret = blocking_notifier_chain_register(&ec->ec_dev->panic_notifier,
> +                                              &debug_info->notifier_panic);
> +       if (ret)
> +               goto remove_debugfs;
> +
>         ec->debug_info = debug_info;
>
>         dev_set_drvdata(&pd->dev, ec);
> diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
> index 7fc8f82280ac..21958c3b0c28 100644
> --- a/drivers/platform/chrome/cros_ec_lpc.c
> +++ b/drivers/platform/chrome/cros_ec_lpc.c
> @@ -21,6 +21,7 @@
>  #include <linux/platform_data/cros_ec_proto.h>
>  #include <linux/platform_device.h>
>  #include <linux/printk.h>
> +#include <linux/reboot.h>
>  #include <linux/suspend.h>
>
>  #include "cros_ec.h"
> @@ -332,6 +333,15 @@ static void cros_ec_lpc_acpi_notify(acpi_handle device, u32 value, void *data)
>
>         if (value == ACPI_NOTIFY_DEVICE_WAKE)
>                 pm_system_wakeup();
> +
> +       if (value == ACPI_NOTIFY_CROS_EC_PANIC) {
> +               dev_err(ec_dev->dev,
> +                       "CrOS EC Panic Reported. Shutdown is imminent!");
> +               blocking_notifier_call_chain(&ec_dev->panic_notifier, 0,
> +                                            ec_dev);
> +               /* Begin orderly shutdown. Force shutdown after 1 second. */
> +               hw_protection_shutdown("CrOS EC Panic", 1000);

I feel like this patch is doing 2 things: pulling the logs, and then
starting a shutdown.
This should be split into 2 patches.

> +       }
>  }

Line limits are now 100 chars[1], so most of these lines can fit on 1 line.

>
>  static int cros_ec_lpc_probe(struct platform_device *pdev)
> diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
> index e43107e0bee1..1c4487271836 100644
> --- a/include/linux/platform_data/cros_ec_proto.h
> +++ b/include/linux/platform_data/cros_ec_proto.h
> @@ -41,6 +41,13 @@
>  #define EC_MAX_REQUEST_OVERHEAD                1
>  #define EC_MAX_RESPONSE_OVERHEAD       32
>
> +/*
> + * EC panic is not covered by the standard (0-F) ACPI notify values.
> + * Arbitrarily choosing B0 to notify ec panic, which is in the 84-BF
> + * device specific ACPI notify range.
> + */
> +#define ACPI_NOTIFY_CROS_EC_PANIC      0xB0

Can you provide a link (either in the commit, or here in the comment)
to the coreboot/BIOS code which uses this value? I feel this should
be documented in some form that correlates the caller and the callee.

> +
>  /*
>   * Command interface between EC and AP, for LPC, I2C and SPI interfaces.
>   */
> @@ -176,6 +183,8 @@ struct cros_ec_device {
>         /* The platform devices used by the mfd driver */
>         struct platform_device *ec;
>         struct platform_device *pd;
> +
> +       struct blocking_notifier_head panic_notifier;

Any reason we cannot use the existing event_notifier (with value argument)?
It's a system panic, so I doubt that computational overhead for other
notifier block
listeners is a concern.

BR,

-Prashant

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bdc48fa11e46f867ea4d75fa59ee87a7f48be144
Guenter Roeck Dec. 21, 2022, 8:07 p.m. UTC | #2
On Wed, Dec 21, 2022 at 10:56 AM Rob Barnes <robbarnes@google.com> wrote:
>
> Add handler for CrOS EC panic events. When a panic is reported,
> poll EC log then force an orderly shutdown.
>
> This will preserve the EC log leading up to the crash.
>
> Signed-off-by: Rob Barnes <robbarnes@google.com>
> ---
>  drivers/platform/chrome/cros_ec_debugfs.c   | 24 +++++++++++++++++++++
>  drivers/platform/chrome/cros_ec_lpc.c       | 10 +++++++++
>  include/linux/platform_data/cros_ec_proto.h |  9 ++++++++
>  3 files changed, 43 insertions(+)
>
> diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c
> index 21d973fc6be2..31637a4e4cf9 100644
> --- a/drivers/platform/chrome/cros_ec_debugfs.c
> +++ b/drivers/platform/chrome/cros_ec_debugfs.c
> @@ -49,6 +49,7 @@ struct cros_ec_debugfs {
>         struct delayed_work log_poll_work;
>         /* EC panicinfo */
>         struct debugfs_blob_wrapper panicinfo_blob;
> +       struct notifier_block notifier_panic;
>  };
>
>  /*
> @@ -437,6 +438,23 @@ static int cros_ec_create_panicinfo(struct cros_ec_debugfs *debug_info)
>         return ret;
>  }
>
> +static int cros_ec_debugfs_panic_event(struct notifier_block *nb,
> +                                      unsigned long queued_during_suspend,
> +                                      void *_notify)
> +{
> +       struct cros_ec_debugfs *debug_info =
> +               container_of(nb, struct cros_ec_debugfs, notifier_panic);
> +
> +       if (debug_info->log_buffer.buf) {
> +               /* Force log poll work to run immediately */
> +               mod_delayed_work(debug_info->log_poll_work.wq, &debug_info->log_poll_work, 0);
> +               /* Block until log poll work finishes */
> +               flush_delayed_work(&debug_info->log_poll_work);
> +       }
> +
> +       return NOTIFY_DONE;
> +}
> +
>  static int cros_ec_debugfs_probe(struct platform_device *pd)
>  {
>         struct cros_ec_dev *ec = dev_get_drvdata(pd->dev.parent);
> @@ -473,6 +491,12 @@ static int cros_ec_debugfs_probe(struct platform_device *pd)
>         debugfs_create_u16("suspend_timeout_ms", 0664, debug_info->dir,
>                            &ec->ec_dev->suspend_timeout_ms);
>
> +       debug_info->notifier_panic.notifier_call = cros_ec_debugfs_panic_event;
> +       ret = blocking_notifier_chain_register(&ec->ec_dev->panic_notifier,
> +                                              &debug_info->notifier_panic);
> +       if (ret)
> +               goto remove_debugfs;
> +
>         ec->debug_info = debug_info;
>
>         dev_set_drvdata(&pd->dev, ec);
> diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
> index 7fc8f82280ac..21958c3b0c28 100644
> --- a/drivers/platform/chrome/cros_ec_lpc.c
> +++ b/drivers/platform/chrome/cros_ec_lpc.c
> @@ -21,6 +21,7 @@
>  #include <linux/platform_data/cros_ec_proto.h>
>  #include <linux/platform_device.h>
>  #include <linux/printk.h>
> +#include <linux/reboot.h>
>  #include <linux/suspend.h>
>
>  #include "cros_ec.h"
> @@ -332,6 +333,15 @@ static void cros_ec_lpc_acpi_notify(acpi_handle device, u32 value, void *data)
>
>         if (value == ACPI_NOTIFY_DEVICE_WAKE)
>                 pm_system_wakeup();
> +
> +       if (value == ACPI_NOTIFY_CROS_EC_PANIC) {
> +               dev_err(ec_dev->dev,
> +                       "CrOS EC Panic Reported. Shutdown is imminent!");

dev_emerg() would seem to be more appropriate, given that we'll be
crashing a second later.

> +               blocking_notifier_call_chain(&ec_dev->panic_notifier, 0,
> +                                            ec_dev);
> +               /* Begin orderly shutdown. Force shutdown after 1 second. */
> +               hw_protection_shutdown("CrOS EC Panic", 1000);

That seems to be the wrong API. From its description:

 * Initiate an emergency system shutdown in order to protect hardware from
 * further damage. Usage examples include a thermal protection or a voltage or
 * current regulator failures.

This is an EC crash that apparently can not be handled gracefully.
That has nothing to do with hardware protection. Why not just call
panic() ?

> +       }

It troubles me that this comes last. That means the entire
mkbp_event_supported handling executes first. What troubles me even
more is that 'value' is, with the exception of
ACPI_NOTIFY_DEVICE_WAKE, not checked at all. Presumably the EC sends
some well defined value if it has an event to report. I don't think it
is a good idea to depend on cros_ec_get_next_event() returning
something useful after the EC crashed.

How is this handled today ? Does the EC just reset the AP ?

>  }
>
>  static int cros_ec_lpc_probe(struct platform_device *pdev)
> diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
> index e43107e0bee1..1c4487271836 100644
> --- a/include/linux/platform_data/cros_ec_proto.h
> +++ b/include/linux/platform_data/cros_ec_proto.h
> @@ -41,6 +41,13 @@
>  #define EC_MAX_REQUEST_OVERHEAD                1
>  #define EC_MAX_RESPONSE_OVERHEAD       32
>
> +/*
> + * EC panic is not covered by the standard (0-F) ACPI notify values.
> + * Arbitrarily choosing B0 to notify ec panic, which is in the 84-BF
> + * device specific ACPI notify range.
> + */
> +#define ACPI_NOTIFY_CROS_EC_PANIC      0xB0
> +
>  /*
>   * Command interface between EC and AP, for LPC, I2C and SPI interfaces.
>   */
> @@ -176,6 +183,8 @@ struct cros_ec_device {
>         /* The platform devices used by the mfd driver */
>         struct platform_device *ec;
>         struct platform_device *pd;
> +
> +       struct blocking_notifier_head panic_notifier;
>  };
>
>  /**
> --
> 2.39.0.314.g84b9a713c41-goog
>
Rob Barnes Dec. 21, 2022, 11:55 p.m. UTC | #3
On Wed, Dec 21, 2022 at 12:23 PM Prashant Malani <pmalani@chromium.org> wrote:
>
> Hi Rob,
>
> I'd suggest using the commit title tag "platform/chrome: cros_ec: ..."
> for commits which are "ChromeOS EC" wide. That's in line with
> other recent commits in this directory.

Ack. Will do.

>
> On Wed, Dec 21, 2022 at 10:56 AM Rob Barnes <robbarnes@google.com> wrote:
> >
> > Add handler for CrOS EC panic events. When a panic is reported,
> > poll EC log then force an orderly shutdown.
> >
> > This will preserve the EC log leading up to the crash.
> >
> > Signed-off-by: Rob Barnes <robbarnes@google.com>
> > ---
> >  drivers/platform/chrome/cros_ec_debugfs.c   | 24 +++++++++++++++++++++
> >  drivers/platform/chrome/cros_ec_lpc.c       | 10 +++++++++
> >  include/linux/platform_data/cros_ec_proto.h |  9 ++++++++
> >  3 files changed, 43 insertions(+)
> >
> > diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c
> > index 21d973fc6be2..31637a4e4cf9 100644
> > --- a/drivers/platform/chrome/cros_ec_debugfs.c
> > +++ b/drivers/platform/chrome/cros_ec_debugfs.c
> > @@ -49,6 +49,7 @@ struct cros_ec_debugfs {
> >         struct delayed_work log_poll_work;
> >         /* EC panicinfo */
> >         struct debugfs_blob_wrapper panicinfo_blob;
> > +       struct notifier_block notifier_panic;
> >  };
> >
> >  /*
> > @@ -437,6 +438,23 @@ static int cros_ec_create_panicinfo(struct cros_ec_debugfs *debug_info)
> >         return ret;
> >  }
> >
> > +static int cros_ec_debugfs_panic_event(struct notifier_block *nb,
> > +                                      unsigned long queued_during_suspend,
> > +                                      void *_notify)
> > +{
> > +       struct cros_ec_debugfs *debug_info =
> > +               container_of(nb, struct cros_ec_debugfs, notifier_panic);
> > +
> > +       if (debug_info->log_buffer.buf) {
> > +               /* Force log poll work to run immediately */
> > +               mod_delayed_work(debug_info->log_poll_work.wq, &debug_info->log_poll_work, 0);
> > +               /* Block until log poll work finishes */
> > +               flush_delayed_work(&debug_info->log_poll_work);
> > +       }
> > +
> > +       return NOTIFY_DONE;
> > +}
> > +
> >  static int cros_ec_debugfs_probe(struct platform_device *pd)
> >  {
> >         struct cros_ec_dev *ec = dev_get_drvdata(pd->dev.parent);
> > @@ -473,6 +491,12 @@ static int cros_ec_debugfs_probe(struct platform_device *pd)
> >         debugfs_create_u16("suspend_timeout_ms", 0664, debug_info->dir,
> >                            &ec->ec_dev->suspend_timeout_ms);
> >
> > +       debug_info->notifier_panic.notifier_call = cros_ec_debugfs_panic_event;
> > +       ret = blocking_notifier_chain_register(&ec->ec_dev->panic_notifier,
> > +                                              &debug_info->notifier_panic);
> > +       if (ret)
> > +               goto remove_debugfs;
> > +
> >         ec->debug_info = debug_info;
> >
> >         dev_set_drvdata(&pd->dev, ec);
> > diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
> > index 7fc8f82280ac..21958c3b0c28 100644
> > --- a/drivers/platform/chrome/cros_ec_lpc.c
> > +++ b/drivers/platform/chrome/cros_ec_lpc.c
> > @@ -21,6 +21,7 @@
> >  #include <linux/platform_data/cros_ec_proto.h>
> >  #include <linux/platform_device.h>
> >  #include <linux/printk.h>
> > +#include <linux/reboot.h>
> >  #include <linux/suspend.h>
> >
> >  #include "cros_ec.h"
> > @@ -332,6 +333,15 @@ static void cros_ec_lpc_acpi_notify(acpi_handle device, u32 value, void *data)
> >
> >         if (value == ACPI_NOTIFY_DEVICE_WAKE)
> >                 pm_system_wakeup();
> > +
> > +       if (value == ACPI_NOTIFY_CROS_EC_PANIC) {
> > +               dev_err(ec_dev->dev,
> > +                       "CrOS EC Panic Reported. Shutdown is imminent!");
> > +               blocking_notifier_call_chain(&ec_dev->panic_notifier, 0,
> > +                                            ec_dev);
> > +               /* Begin orderly shutdown. Force shutdown after 1 second. */
> > +               hw_protection_shutdown("CrOS EC Panic", 1000);
>
> I feel like this patch is doing 2 things: pulling the logs, and then
> starting a shutdown.
> This should be split into 2 patches.

Ack. Will do.

>
> > +       }
> >  }
>
> Line limits are now 100 chars[1], so most of these lines can fit on 1 line.
>
> >
> >  static int cros_ec_lpc_probe(struct platform_device *pdev)
> > diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
> > index e43107e0bee1..1c4487271836 100644
> > --- a/include/linux/platform_data/cros_ec_proto.h
> > +++ b/include/linux/platform_data/cros_ec_proto.h
> > @@ -41,6 +41,13 @@
> >  #define EC_MAX_REQUEST_OVERHEAD                1
> >  #define EC_MAX_RESPONSE_OVERHEAD       32
> >
> > +/*
> > + * EC panic is not covered by the standard (0-F) ACPI notify values.
> > + * Arbitrarily choosing B0 to notify ec panic, which is in the 84-BF
> > + * device specific ACPI notify range.
> > + */
> > +#define ACPI_NOTIFY_CROS_EC_PANIC      0xB0
>
> Can you provide a link (either in the commit, or here in the comment)
> to the coreboot/BIOS code which uses this value? I feel this should
> be documented in some form that correlates the caller and the callee.

Link: https://chromium-review.googlesource.com/c/chromiumos/third_party/coreboot/+/4023535

>
> > +
> >  /*
> >   * Command interface between EC and AP, for LPC, I2C and SPI interfaces.
> >   */
> > @@ -176,6 +183,8 @@ struct cros_ec_device {
> >         /* The platform devices used by the mfd driver */
> >         struct platform_device *ec;
> >         struct platform_device *pd;
> > +
> > +       struct blocking_notifier_head panic_notifier;
>
> Any reason we cannot use the existing event_notifier (with value argument)?
> It's a system panic, so I doubt that computational overhead for other
> notifier block
> listeners is a concern.

The value field is already being used for "queued_during_suspend" in
event_notifier.
>
> BR,
>
> -Prashant
>
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bdc48fa11e46f867ea4d75fa59ee87a7f48be144
Rob Barnes Dec. 21, 2022, 11:55 p.m. UTC | #4
On Wed, Dec 21, 2022 at 1:07 PM Guenter Roeck <groeck@google.com> wrote:
>
> On Wed, Dec 21, 2022 at 10:56 AM Rob Barnes <robbarnes@google.com> wrote:
> >
> > Add handler for CrOS EC panic events. When a panic is reported,
> > poll EC log then force an orderly shutdown.
> >
> > This will preserve the EC log leading up to the crash.
> >
> > Signed-off-by: Rob Barnes <robbarnes@google.com>
> > ---
> >  drivers/platform/chrome/cros_ec_debugfs.c   | 24 +++++++++++++++++++++
> >  drivers/platform/chrome/cros_ec_lpc.c       | 10 +++++++++
> >  include/linux/platform_data/cros_ec_proto.h |  9 ++++++++
> >  3 files changed, 43 insertions(+)
> >
> > diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c
> > index 21d973fc6be2..31637a4e4cf9 100644
> > --- a/drivers/platform/chrome/cros_ec_debugfs.c
> > +++ b/drivers/platform/chrome/cros_ec_debugfs.c
> > @@ -49,6 +49,7 @@ struct cros_ec_debugfs {
> >         struct delayed_work log_poll_work;
> >         /* EC panicinfo */
> >         struct debugfs_blob_wrapper panicinfo_blob;
> > +       struct notifier_block notifier_panic;
> >  };
> >
> >  /*
> > @@ -437,6 +438,23 @@ static int cros_ec_create_panicinfo(struct cros_ec_debugfs *debug_info)
> >         return ret;
> >  }
> >
> > +static int cros_ec_debugfs_panic_event(struct notifier_block *nb,
> > +                                      unsigned long queued_during_suspend,
> > +                                      void *_notify)
> > +{
> > +       struct cros_ec_debugfs *debug_info =
> > +               container_of(nb, struct cros_ec_debugfs, notifier_panic);
> > +
> > +       if (debug_info->log_buffer.buf) {
> > +               /* Force log poll work to run immediately */
> > +               mod_delayed_work(debug_info->log_poll_work.wq, &debug_info->log_poll_work, 0);
> > +               /* Block until log poll work finishes */
> > +               flush_delayed_work(&debug_info->log_poll_work);
> > +       }
> > +
> > +       return NOTIFY_DONE;
> > +}
> > +
> >  static int cros_ec_debugfs_probe(struct platform_device *pd)
> >  {
> >         struct cros_ec_dev *ec = dev_get_drvdata(pd->dev.parent);
> > @@ -473,6 +491,12 @@ static int cros_ec_debugfs_probe(struct platform_device *pd)
> >         debugfs_create_u16("suspend_timeout_ms", 0664, debug_info->dir,
> >                            &ec->ec_dev->suspend_timeout_ms);
> >
> > +       debug_info->notifier_panic.notifier_call = cros_ec_debugfs_panic_event;
> > +       ret = blocking_notifier_chain_register(&ec->ec_dev->panic_notifier,
> > +                                              &debug_info->notifier_panic);
> > +       if (ret)
> > +               goto remove_debugfs;
> > +
> >         ec->debug_info = debug_info;
> >
> >         dev_set_drvdata(&pd->dev, ec);
> > diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
> > index 7fc8f82280ac..21958c3b0c28 100644
> > --- a/drivers/platform/chrome/cros_ec_lpc.c
> > +++ b/drivers/platform/chrome/cros_ec_lpc.c
> > @@ -21,6 +21,7 @@
> >  #include <linux/platform_data/cros_ec_proto.h>
> >  #include <linux/platform_device.h>
> >  #include <linux/printk.h>
> > +#include <linux/reboot.h>
> >  #include <linux/suspend.h>
> >
> >  #include "cros_ec.h"
> > @@ -332,6 +333,15 @@ static void cros_ec_lpc_acpi_notify(acpi_handle device, u32 value, void *data)
> >
> >         if (value == ACPI_NOTIFY_DEVICE_WAKE)
> >                 pm_system_wakeup();
> > +
> > +       if (value == ACPI_NOTIFY_CROS_EC_PANIC) {
> > +               dev_err(ec_dev->dev,
> > +                       "CrOS EC Panic Reported. Shutdown is imminent!");
>
> dev_emerg() would seem to be more appropriate, given that we'll be
> crashing a second later.

Makes sense. Will do.

>
> > +               blocking_notifier_call_chain(&ec_dev->panic_notifier, 0,
> > +                                            ec_dev);
> > +               /* Begin orderly shutdown. Force shutdown after 1 second. */
> > +               hw_protection_shutdown("CrOS EC Panic", 1000);
>
> That seems to be the wrong API. From its description:
>
>  * Initiate an emergency system shutdown in order to protect hardware from
>  * further damage. Usage examples include a thermal protection or a voltage or
>  * current regulator failures.
>
> This is an EC crash that apparently can not be handled gracefully.
> That has nothing to do with hardware protection. Why not just call
> panic() ?

I'm adding the ability for the EC to somewhat gracefully handle a
class of panics. The EC will enter "system safe mode" aka limp mode.
All tasks are disabled except for a few critical ones. The purpose of
this mode is to give the OS a couple of seconds to extract info about
the EC panic and shutdown as gracefully as possible. EC will still
force a reset after 2 seconds.

Since the EC is in a precarious, non deterministic state and the EC is
at least partially responsible for thermal and power regulation, it is
arguably a hardware protection event.

The primary motivation for allowing the OS to attempt an orderly
shutdown is to sync user data and EC panic data to storage. Will panic
sync storage? Or will that need to be a seperate call?

>
> > +       }
>
> It troubles me that this comes last. That means the entire
> mkbp_event_supported handling executes first. What troubles me even
> more is that 'value' is, with the exception of
> ACPI_NOTIFY_DEVICE_WAKE, not checked at all. Presumably the EC sends
> some well defined value if it has an event to report. I don't think it
> is a good idea to depend on cros_ec_get_next_event() returning
> something useful after the EC crashed.

Could certainly move the ACPI_NOTIFY_CROS_EC_PANIC to the top. I think
'value' will be 0x80 when there's MKBP events. I will validate this
assumption.

> How is this handled today ? Does the EC just reset the AP ?

EC just immediately resets the AP and itself. System safe mode (see
go/ec-system-safe-mode) will allow the EC to run a few seconds after
some types of crashes.



>
> >  }
> >
> >  static int cros_ec_lpc_probe(struct platform_device *pdev)
> > diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
> > index e43107e0bee1..1c4487271836 100644
> > --- a/include/linux/platform_data/cros_ec_proto.h
> > +++ b/include/linux/platform_data/cros_ec_proto.h
> > @@ -41,6 +41,13 @@
> >  #define EC_MAX_REQUEST_OVERHEAD                1
> >  #define EC_MAX_RESPONSE_OVERHEAD       32
> >
> > +/*
> > + * EC panic is not covered by the standard (0-F) ACPI notify values.
> > + * Arbitrarily choosing B0 to notify ec panic, which is in the 84-BF
> > + * device specific ACPI notify range.
> > + */
> > +#define ACPI_NOTIFY_CROS_EC_PANIC      0xB0
> > +
> >  /*
> >   * Command interface between EC and AP, for LPC, I2C and SPI interfaces.
> >   */
> > @@ -176,6 +183,8 @@ struct cros_ec_device {
> >         /* The platform devices used by the mfd driver */
> >         struct platform_device *ec;
> >         struct platform_device *pd;
> > +
> > +       struct blocking_notifier_head panic_notifier;
> >  };
> >
> >  /**
> > --
> > 2.39.0.314.g84b9a713c41-goog
> >
Guenter Roeck Dec. 22, 2022, 12:08 a.m. UTC | #5
On Wed, Dec 21, 2022 at 3:56 PM Rob Barnes <robbarnes@google.com> wrote:
>
> On Wed, Dec 21, 2022 at 1:07 PM Guenter Roeck <groeck@google.com> wrote:
> >
> > On Wed, Dec 21, 2022 at 10:56 AM Rob Barnes <robbarnes@google.com> wrote:
> > >
> > > Add handler for CrOS EC panic events. When a panic is reported,
> > > poll EC log then force an orderly shutdown.
> > >
> > > This will preserve the EC log leading up to the crash.
> > >
> > > Signed-off-by: Rob Barnes <robbarnes@google.com>
> > > ---
> > >  drivers/platform/chrome/cros_ec_debugfs.c   | 24 +++++++++++++++++++++
> > >  drivers/platform/chrome/cros_ec_lpc.c       | 10 +++++++++
> > >  include/linux/platform_data/cros_ec_proto.h |  9 ++++++++
> > >  3 files changed, 43 insertions(+)
> > >
> > > diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c
> > > index 21d973fc6be2..31637a4e4cf9 100644
> > > --- a/drivers/platform/chrome/cros_ec_debugfs.c
> > > +++ b/drivers/platform/chrome/cros_ec_debugfs.c
> > > @@ -49,6 +49,7 @@ struct cros_ec_debugfs {
> > >         struct delayed_work log_poll_work;
> > >         /* EC panicinfo */
> > >         struct debugfs_blob_wrapper panicinfo_blob;
> > > +       struct notifier_block notifier_panic;
> > >  };
> > >
> > >  /*
> > > @@ -437,6 +438,23 @@ static int cros_ec_create_panicinfo(struct cros_ec_debugfs *debug_info)
> > >         return ret;
> > >  }
> > >
> > > +static int cros_ec_debugfs_panic_event(struct notifier_block *nb,
> > > +                                      unsigned long queued_during_suspend,
> > > +                                      void *_notify)
> > > +{
> > > +       struct cros_ec_debugfs *debug_info =
> > > +               container_of(nb, struct cros_ec_debugfs, notifier_panic);
> > > +
> > > +       if (debug_info->log_buffer.buf) {
> > > +               /* Force log poll work to run immediately */
> > > +               mod_delayed_work(debug_info->log_poll_work.wq, &debug_info->log_poll_work, 0);
> > > +               /* Block until log poll work finishes */
> > > +               flush_delayed_work(&debug_info->log_poll_work);
> > > +       }
> > > +
> > > +       return NOTIFY_DONE;
> > > +}
> > > +
> > >  static int cros_ec_debugfs_probe(struct platform_device *pd)
> > >  {
> > >         struct cros_ec_dev *ec = dev_get_drvdata(pd->dev.parent);
> > > @@ -473,6 +491,12 @@ static int cros_ec_debugfs_probe(struct platform_device *pd)
> > >         debugfs_create_u16("suspend_timeout_ms", 0664, debug_info->dir,
> > >                            &ec->ec_dev->suspend_timeout_ms);
> > >
> > > +       debug_info->notifier_panic.notifier_call = cros_ec_debugfs_panic_event;
> > > +       ret = blocking_notifier_chain_register(&ec->ec_dev->panic_notifier,
> > > +                                              &debug_info->notifier_panic);
> > > +       if (ret)
> > > +               goto remove_debugfs;
> > > +
> > >         ec->debug_info = debug_info;
> > >
> > >         dev_set_drvdata(&pd->dev, ec);
> > > diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
> > > index 7fc8f82280ac..21958c3b0c28 100644
> > > --- a/drivers/platform/chrome/cros_ec_lpc.c
> > > +++ b/drivers/platform/chrome/cros_ec_lpc.c
> > > @@ -21,6 +21,7 @@
> > >  #include <linux/platform_data/cros_ec_proto.h>
> > >  #include <linux/platform_device.h>
> > >  #include <linux/printk.h>
> > > +#include <linux/reboot.h>
> > >  #include <linux/suspend.h>
> > >
> > >  #include "cros_ec.h"
> > > @@ -332,6 +333,15 @@ static void cros_ec_lpc_acpi_notify(acpi_handle device, u32 value, void *data)
> > >
> > >         if (value == ACPI_NOTIFY_DEVICE_WAKE)
> > >                 pm_system_wakeup();
> > > +
> > > +       if (value == ACPI_NOTIFY_CROS_EC_PANIC) {
> > > +               dev_err(ec_dev->dev,
> > > +                       "CrOS EC Panic Reported. Shutdown is imminent!");
> >
> > dev_emerg() would seem to be more appropriate, given that we'll be
> > crashing a second later.
>
> Makes sense. Will do.
>
> >
> > > +               blocking_notifier_call_chain(&ec_dev->panic_notifier, 0,
> > > +                                            ec_dev);
> > > +               /* Begin orderly shutdown. Force shutdown after 1 second. */
> > > +               hw_protection_shutdown("CrOS EC Panic", 1000);
> >
> > That seems to be the wrong API. From its description:
> >
> >  * Initiate an emergency system shutdown in order to protect hardware from
> >  * further damage. Usage examples include a thermal protection or a voltage or
> >  * current regulator failures.
> >
> > This is an EC crash that apparently can not be handled gracefully.
> > That has nothing to do with hardware protection. Why not just call
> > panic() ?
>
> I'm adding the ability for the EC to somewhat gracefully handle a
> class of panics. The EC will enter "system safe mode" aka limp mode.
> All tasks are disabled except for a few critical ones. The purpose of
> this mode is to give the OS a couple of seconds to extract info about
> the EC panic and shutdown as gracefully as possible. EC will still
> force a reset after 2 seconds.
>
> Since the EC is in a precarious, non deterministic state and the EC is
> at least partially responsible for thermal and power regulation, it is
> arguably a hardware protection event.
>

In my opinion it is an abuse of the API. I guess we have to agree to disagree.

Guenter

> The primary motivation for allowing the OS to attempt an orderly
> shutdown is to sync user data and EC panic data to storage. Will panic
> sync storage? Or will that need to be a seperate call?
>
> >
> > > +       }
> >
> > It troubles me that this comes last. That means the entire
> > mkbp_event_supported handling executes first. What troubles me even
> > more is that 'value' is, with the exception of
> > ACPI_NOTIFY_DEVICE_WAKE, not checked at all. Presumably the EC sends
> > some well defined value if it has an event to report. I don't think it
> > is a good idea to depend on cros_ec_get_next_event() returning
> > something useful after the EC crashed.
>
> Could certainly move the ACPI_NOTIFY_CROS_EC_PANIC to the top. I think
> 'value' will be 0x80 when there's MKBP events. I will validate this
> assumption.
>
> > How is this handled today ? Does the EC just reset the AP ?
>
> EC just immediately resets the AP and itself. System safe mode (see
> go/ec-system-safe-mode) will allow the EC to run a few seconds after
> some types of crashes.
>
>
>
> >
> > >  }
> > >
> > >  static int cros_ec_lpc_probe(struct platform_device *pdev)
> > > diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
> > > index e43107e0bee1..1c4487271836 100644
> > > --- a/include/linux/platform_data/cros_ec_proto.h
> > > +++ b/include/linux/platform_data/cros_ec_proto.h
> > > @@ -41,6 +41,13 @@
> > >  #define EC_MAX_REQUEST_OVERHEAD                1
> > >  #define EC_MAX_RESPONSE_OVERHEAD       32
> > >
> > > +/*
> > > + * EC panic is not covered by the standard (0-F) ACPI notify values.
> > > + * Arbitrarily choosing B0 to notify ec panic, which is in the 84-BF
> > > + * device specific ACPI notify range.
> > > + */
> > > +#define ACPI_NOTIFY_CROS_EC_PANIC      0xB0
> > > +
> > >  /*
> > >   * Command interface between EC and AP, for LPC, I2C and SPI interfaces.
> > >   */
> > > @@ -176,6 +183,8 @@ struct cros_ec_device {
> > >         /* The platform devices used by the mfd driver */
> > >         struct platform_device *ec;
> > >         struct platform_device *pd;
> > > +
> > > +       struct blocking_notifier_head panic_notifier;
> > >  };
> > >
> > >  /**
> > > --
> > > 2.39.0.314.g84b9a713c41-goog
> > >
Prashant Malani Dec. 22, 2022, 12:38 a.m. UTC | #6
On Wed, Dec 21, 2022 at 3:55 PM Rob Barnes <robbarnes@google.com> wrote:
>
> On Wed, Dec 21, 2022 at 12:23 PM Prashant Malani <pmalani@chromium.org> wrote:
> >
> >
> > Can you provide a link (either in the commit, or here in the comment)
> > to the coreboot/BIOS code which uses this value? I feel this should
> > be documented in some form that correlates the caller and the callee.
>
> Link: https://chromium-review.googlesource.com/c/chromiumos/third_party/coreboot/+/4023535

Thanks. Please add a code link (for example, I could find [2], but you
could use another code mirror
if there is a canonical one for coreboot) to the commit description,
or in the comment when you send
out v2.

> > Any reason we cannot use the existing event_notifier (with value argument)?
> > It's a system panic, so I doubt that computational overhead for other
> > notifier block
> > listeners is a concern.
>
> The value field is already being used for "queued_during_suspend" in
> event_notifier.

OK, I suppose you could use the data pointer...

It's just I find having a notifier for a single use case overkill(even
2 would be fine); one could get away with exposing a method
in cros_typec_debugfs via a local .h file (it can compile to a stub if
cros_typec_debugfs is not compiled to the kernel);
the LPC code can then just call that method instead of invoking a notifier.

Best regards,

-Prashant

> > [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bdc48fa11e46f867ea4d75fa59ee87a7f48be144
[2] https://github.com/coreboot/coreboot/blob/ff6b3af113f84a1957dcdf8a8179a751ce08becf/src/ec/google/chromeec/acpi/ec.asl#L15
Rob Barnes Jan. 3, 2023, 11:14 p.m. UTC | #7
On Wed, Dec 21, 2022 at 5:38 PM Prashant Malani <pmalani@chromium.org> wrote:
>
> On Wed, Dec 21, 2022 at 3:55 PM Rob Barnes <robbarnes@google.com> wrote:
> >
> > On Wed, Dec 21, 2022 at 12:23 PM Prashant Malani <pmalani@chromium.org> wrote:
> > >
> > >
> > > Can you provide a link (either in the commit, or here in the comment)
> > > to the coreboot/BIOS code which uses this value? I feel this should
> > > be documented in some form that correlates the caller and the callee.
> >
> > Link: https://chromium-review.googlesource.com/c/chromiumos/third_party/coreboot/+/4023535
>
> Thanks. Please add a code link (for example, I could find [2], but you
> could use another code mirror
> if there is a canonical one for coreboot) to the commit description,
> or in the comment when you send
> out v2.
>
> > > Any reason we cannot use the existing event_notifier (with value argument)?
> > > It's a system panic, so I doubt that computational overhead for other
> > > notifier block
> > > listeners is a concern.
> >
> > The value field is already being used for "queued_during_suspend" in
> > event_notifier.
>
> OK, I suppose you could use the data pointer...

The data pointer contains ec_dev, so that's not really an option either.

>
> It's just I find having a notifier for a single use case overkill(even
> 2 would be fine); one could get away with exposing a method
> in cros_typec_debugfs via a local .h file (it can compile to a stub if
> cros_typec_debugfs is not compiled to the kernel);
> the LPC code can then just call that method instead of invoking a notifier.

My first implementation did make a direct call to cros_ec_debugfs.c,
but an internal reviewer recommended using an event notifier instead.
So I'm histent to go back to a direct call.

There may be other sub drivers that want to handle EC panics. So I
think keeping this as a separate notifier makes sense given the
constraints.

>
> Best regards,
>
> -Prashant
>
> > > [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bdc48fa11e46f867ea4d75fa59ee87a7f48be144
> [2] https://github.com/coreboot/coreboot/blob/ff6b3af113f84a1957dcdf8a8179a751ce08becf/src/ec/google/chromeec/acpi/ec.asl#L15
Prashant Malani Jan. 3, 2023, 11:27 p.m. UTC | #8
On Tue, Jan 3, 2023 at 3:15 PM Rob Barnes <robbarnes@google.com> wrote:
>
> On Wed, Dec 21, 2022 at 5:38 PM Prashant Malani <pmalani@chromium.org> wrote:
> >
> > On Wed, Dec 21, 2022 at 3:55 PM Rob Barnes <robbarnes@google.com> wrote:
> > >
> > > On Wed, Dec 21, 2022 at 12:23 PM Prashant Malani <pmalani@chromium.org> wrote:
> > > >
>
> >
> > It's just I find having a notifier for a single use case overkill(even
> > 2 would be fine); one could get away with exposing a method
> > in cros_typec_debugfs via a local .h file (it can compile to a stub if
> > cros_typec_debugfs is not compiled to the kernel);
> > the LPC code can then just call that method instead of invoking a notifier.
>
> My first implementation did make a direct call to cros_ec_debugfs.c,
> but an internal reviewer recommended using an event notifier instead.
> So I'm histent to go back to a direct call.
>
> There may be other sub drivers that want to handle EC panics. So I
> think keeping this as a separate notifier makes sense given the
> constraints.

The issue with that reasoning vis-à-vis your implementation is that
the panic notifier is tied to cros_ec_debugfs. What if another
(sub)-driver wants to use the
panic notifier to do something, but that system doesn't have CONFIG_DEBUGFS
enabled?

Having a direct/explicit dependency avoids that issue; LPC depends on
debugfs being compiled
to have the log printed out (a stub is used when debugfs is not
enabled), but nothing else
relies on debugfs for a panic notifier.

BR,

-Prashant
Rob Barnes Jan. 4, 2023, 1:19 a.m. UTC | #9
On Tue, Jan 3, 2023 at 4:28 PM Prashant Malani <pmalani@chromium.org> wrote:
>
> On Tue, Jan 3, 2023 at 3:15 PM Rob Barnes <robbarnes@google.com> wrote:
> >
> > On Wed, Dec 21, 2022 at 5:38 PM Prashant Malani <pmalani@chromium.org> wrote:
> > >
> > > On Wed, Dec 21, 2022 at 3:55 PM Rob Barnes <robbarnes@google.com> wrote:
> > > >
> > > > On Wed, Dec 21, 2022 at 12:23 PM Prashant Malani <pmalani@chromium.org> wrote:
> > > > >
> >
> > >
> > > It's just I find having a notifier for a single use case overkill(even
> > > 2 would be fine); one could get away with exposing a method
> > > in cros_typec_debugfs via a local .h file (it can compile to a stub if
> > > cros_typec_debugfs is not compiled to the kernel);
> > > the LPC code can then just call that method instead of invoking a notifier.
> >
> > My first implementation did make a direct call to cros_ec_debugfs.c,
> > but an internal reviewer recommended using an event notifier instead.
> > So I'm histent to go back to a direct call.
> >
> > There may be other sub drivers that want to handle EC panics. So I
> > think keeping this as a separate notifier makes sense given the
> > constraints.
>
> The issue with that reasoning vis-à-vis your implementation is that
> the panic notifier is tied to cros_ec_debugfs. What if another
> (sub)-driver wants to use the
> panic notifier to do something, but that system doesn't have CONFIG_DEBUGFS
> enabled?

There isn't a dependency between cros_ec_debugfs.c and
`panic_notifier` inside `cros_ec_proto.h`. So (sub)-drivers can handle
EC panics when CONFIG_DEBUGFS is not enabled.

>
> Having a direct/explicit dependency avoids that issue; LPC depends on
> debugfs being compiled
> to have the log printed out (a stub is used when debugfs is not
> enabled), but nothing else
> relies on debugfs for a panic notifier.
>
> BR,
>
> -Prashant
diff mbox series

Patch

diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c
index 21d973fc6be2..31637a4e4cf9 100644
--- a/drivers/platform/chrome/cros_ec_debugfs.c
+++ b/drivers/platform/chrome/cros_ec_debugfs.c
@@ -49,6 +49,7 @@  struct cros_ec_debugfs {
 	struct delayed_work log_poll_work;
 	/* EC panicinfo */
 	struct debugfs_blob_wrapper panicinfo_blob;
+	struct notifier_block notifier_panic;
 };
 
 /*
@@ -437,6 +438,23 @@  static int cros_ec_create_panicinfo(struct cros_ec_debugfs *debug_info)
 	return ret;
 }
 
+static int cros_ec_debugfs_panic_event(struct notifier_block *nb,
+				       unsigned long queued_during_suspend,
+				       void *_notify)
+{
+	struct cros_ec_debugfs *debug_info =
+		container_of(nb, struct cros_ec_debugfs, notifier_panic);
+
+	if (debug_info->log_buffer.buf) {
+		/* Force log poll work to run immediately */
+		mod_delayed_work(debug_info->log_poll_work.wq, &debug_info->log_poll_work, 0);
+		/* Block until log poll work finishes */
+		flush_delayed_work(&debug_info->log_poll_work);
+	}
+
+	return NOTIFY_DONE;
+}
+
 static int cros_ec_debugfs_probe(struct platform_device *pd)
 {
 	struct cros_ec_dev *ec = dev_get_drvdata(pd->dev.parent);
@@ -473,6 +491,12 @@  static int cros_ec_debugfs_probe(struct platform_device *pd)
 	debugfs_create_u16("suspend_timeout_ms", 0664, debug_info->dir,
 			   &ec->ec_dev->suspend_timeout_ms);
 
+	debug_info->notifier_panic.notifier_call = cros_ec_debugfs_panic_event;
+	ret = blocking_notifier_chain_register(&ec->ec_dev->panic_notifier,
+					       &debug_info->notifier_panic);
+	if (ret)
+		goto remove_debugfs;
+
 	ec->debug_info = debug_info;
 
 	dev_set_drvdata(&pd->dev, ec);
diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
index 7fc8f82280ac..21958c3b0c28 100644
--- a/drivers/platform/chrome/cros_ec_lpc.c
+++ b/drivers/platform/chrome/cros_ec_lpc.c
@@ -21,6 +21,7 @@ 
 #include <linux/platform_data/cros_ec_proto.h>
 #include <linux/platform_device.h>
 #include <linux/printk.h>
+#include <linux/reboot.h>
 #include <linux/suspend.h>
 
 #include "cros_ec.h"
@@ -332,6 +333,15 @@  static void cros_ec_lpc_acpi_notify(acpi_handle device, u32 value, void *data)
 
 	if (value == ACPI_NOTIFY_DEVICE_WAKE)
 		pm_system_wakeup();
+
+	if (value == ACPI_NOTIFY_CROS_EC_PANIC) {
+		dev_err(ec_dev->dev,
+			"CrOS EC Panic Reported. Shutdown is imminent!");
+		blocking_notifier_call_chain(&ec_dev->panic_notifier, 0,
+					     ec_dev);
+		/* Begin orderly shutdown. Force shutdown after 1 second. */
+		hw_protection_shutdown("CrOS EC Panic", 1000);
+	}
 }
 
 static int cros_ec_lpc_probe(struct platform_device *pdev)
diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index e43107e0bee1..1c4487271836 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -41,6 +41,13 @@ 
 #define EC_MAX_REQUEST_OVERHEAD		1
 #define EC_MAX_RESPONSE_OVERHEAD	32
 
+/*
+ * EC panic is not covered by the standard (0-F) ACPI notify values.
+ * Arbitrarily choosing B0 to notify ec panic, which is in the 84-BF
+ * device specific ACPI notify range.
+ */
+#define ACPI_NOTIFY_CROS_EC_PANIC	0xB0
+
 /*
  * Command interface between EC and AP, for LPC, I2C and SPI interfaces.
  */
@@ -176,6 +183,8 @@  struct cros_ec_device {
 	/* The platform devices used by the mfd driver */
 	struct platform_device *ec;
 	struct platform_device *pd;
+
+	struct blocking_notifier_head panic_notifier;
 };
 
 /**