diff mbox series

[4/4] thermal: intel: hfi: Add a suspend notifier

Message ID 20231227062940.10780-5-ricardo.neri-calderon@linux.intel.com (mailing list archive)
State Changes Requested, archived
Headers show
Series thermal: intel: hfi: Fix memory corruption on resume from hibernation | expand

Commit Message

Ricardo Neri Dec. 27, 2023, 6:29 a.m. UTC
The kernel gives the HFI hardware a memory region that the latter uses to
provide updates to the HFI table. The kernel allocates this memory region
at boot. It remains constant throughout runtime.

When resuming from suspend or hibernation, the restore kernel allocates a
second memory buffer and reprograms the HFI hardware with the new location
as part of a normal boot. The location of the second memory buffer may
differ from the one allocated by the image kernel. Subsequently, when the
restore kernel transfers control to the image kernel, the second buffer
becomes invalid, potentially leading to memory corruption if the hardware
writes to it (hardware continues using the buffer from the restore kernel).

Add a suspend notifier to disable all HFI instances before jumping to the
image kernel and enable them once the image kernel has been restored. Use
the memory buffer that the image kernel allocated.

For non-boot CPUs, rely on the CPU hotplug callbacks as CPUs are disabled
and enabled during suspend and resume, respectively.

The CPU hotplug callbacks do not cover the boot CPU. Handle the HFI
instance of the boot CPU from the suspend notifier callback.

Cc: Chen Yu <yu.c.chen@intel.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>
Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Zhao Liu <zhao1.liu@linux.intel.com>
Cc: linux-pm@vger.kernel.org
Cc: stable@vger.kernel.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 drivers/thermal/intel/intel_hfi.c | 53 +++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

Comments

Rafael J. Wysocki Dec. 29, 2023, 5:27 p.m. UTC | #1
On Wed, Dec 27, 2023 at 7:28 AM Ricardo Neri
<ricardo.neri-calderon@linux.intel.com> wrote:
>
> The kernel gives the HFI hardware a memory region that the latter uses to
> provide updates to the HFI table. The kernel allocates this memory region
> at boot. It remains constant throughout runtime time.
>
> When resuming from suspend or hibernation, the restore kernel allocates a
> second memory buffer and reprograms the HFI hardware with the new location
> as part of a normal boot. The location of the second memory buffer may
> differ from the one allocated by the image kernel. Subsequently, when the
> restore kernel transfers control to the image kernel, the second buffer
> becomes invalid, potentially leading to memory corruption if the hardware
> writes to it (hardware continues using the buffer from the restore kernel).
>
> Add a suspend notifier to disable all HFI instances before jumping to the
> image kernel and enable them once the image kernel has been restored. Use
> the memory buffer that the image kernel allocated.
>
> For non-boot CPUs, rely on the CPU hotplug callbacks as CPUs are disabled
> and enabled during suspend and resume, respectively.
>
> The CPU hotplug callbacks do not cover the boot CPU. Handle the HFI
> instance of the boot CPU from the suspend notifier callback.
>
> Cc: Chen Yu <yu.c.chen@intel.com>
> Cc: Len Brown <len.brown@intel.com>
> Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
> Cc: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>
> Cc: Zhang Rui <rui.zhang@intel.com>
> Cc: Zhao Liu <zhao1.liu@linux.intel.com>
> Cc: linux-pm@vger.kernel.org
> Cc: stable@vger.kernel.org
> Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
> ---
>  drivers/thermal/intel/intel_hfi.c | 53 +++++++++++++++++++++++++++++++
>  1 file changed, 53 insertions(+)
>
> diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
> index d2c874f43786..965c245e5e78 100644
> --- a/drivers/thermal/intel/intel_hfi.c
> +++ b/drivers/thermal/intel/intel_hfi.c
> @@ -30,11 +30,13 @@
>  #include <linux/kernel.h>
>  #include <linux/math.h>
>  #include <linux/mutex.h>
> +#include <linux/notifier.h>
>  #include <linux/percpu-defs.h>
>  #include <linux/printk.h>
>  #include <linux/processor.h>
>  #include <linux/slab.h>
>  #include <linux/spinlock.h>
> +#include <linux/suspend.h>
>  #include <linux/string.h>
>  #include <linux/topology.h>
>  #include <linux/workqueue.h>
> @@ -569,11 +571,62 @@ static __init int hfi_parse_features(void)
>         return 0;
>  }
>
> +static void hfi_do_pm_enable(void *info)
> +{
> +       struct hfi_instance *hfi_instance = info;
> +
> +       hfi_set_hw_table(hfi_instance);
> +       hfi_enable();

The above do RMW, so should locking be used here?

> +}
> +
> +static void hfi_do_pm_disable(void *info)
> +{
> +       hfi_disable();
> +}

And here?

> +
> +static int hfi_pm_notify(struct notifier_block *nb,
> +                        unsigned long mode, void *unused)
> +{
> +       struct hfi_cpu_info *info = &per_cpu(hfi_cpu_info, 0);
> +       struct hfi_instance *hfi_instance = info->hfi_instance;
> +
> +       /* HFI may not be in use. */
> +       if (!hfi_instance)
> +               return 0;
> +
> +       /*
> +        * Only handle the HFI instance of the package of the boot CPU. The
> +        * instances of other packages are handled in the CPU hotplug callbacks.
> +        */
> +       switch (mode) {
> +       case PM_HIBERNATION_PREPARE:
> +       case PM_SUSPEND_PREPARE:
> +       case PM_RESTORE_PREPARE:
> +               return smp_call_function_single(0, hfi_do_pm_disable,
> +                                               NULL, true);
> +
> +       case PM_POST_RESTORE:
> +       case PM_POST_HIBERNATION:
> +       case PM_POST_SUSPEND:
> +               return smp_call_function_single(0, hfi_do_pm_enable,
> +                                               hfi_instance, true);
> +       default:
> +               return -EINVAL;
> +       }
> +}
> +
> +static struct notifier_block hfi_pm_nb = {
> +       .notifier_call = hfi_pm_notify,
> +};
> +
>  void __init intel_hfi_init(void)
>  {
>         struct hfi_instance *hfi_instance;
>         int i, j;
>
> +       if (register_pm_notifier(&hfi_pm_nb))
> +               return;
> +
>         if (hfi_parse_features())
>                 return;
>
> --
Ricardo Neri Jan. 2, 2024, 3:40 a.m. UTC | #2
On Fri, Dec 29, 2023 at 06:27:30PM +0100, Rafael J. Wysocki wrote:
> On Wed, Dec 27, 2023 at 7:28 AM Ricardo Neri
> <ricardo.neri-calderon@linux.intel.com> wrote:
> >
> > The kernel gives the HFI hardware a memory region that the latter uses to
> > provide updates to the HFI table. The kernel allocates this memory region
> > at boot. It remains constant throughout runtime time.
> >
> > When resuming from suspend or hibernation, the restore kernel allocates a
> > second memory buffer and reprograms the HFI hardware with the new location
> > as part of a normal boot. The location of the second memory buffer may
> > differ from the one allocated by the image kernel. Subsequently, when the
> > restore kernel transfers control to the image kernel, the second buffer
> > becomes invalid, potentially leading to memory corruption if the hardware
> > writes to it (hardware continues using the buffer from the restore kernel).
> >
> > Add a suspend notifier to disable all HFI instances before jumping to the
> > image kernel and enable them once the image kernel has been restored. Use
> > the memory buffer that the image kernel allocated.
> >
> > For non-boot CPUs, rely on the CPU hotplug callbacks as CPUs are disabled
> > and enabled during suspend and resume, respectively.
> >
> > The CPU hotplug callbacks do not cover the boot CPU. Handle the HFI
> > instance of the boot CPU from the suspend notifier callback.
> >
> > Cc: Chen Yu <yu.c.chen@intel.com>
> > Cc: Len Brown <len.brown@intel.com>
> > Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
> > Cc: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>
> > Cc: Zhang Rui <rui.zhang@intel.com>
> > Cc: Zhao Liu <zhao1.liu@linux.intel.com>
> > Cc: linux-pm@vger.kernel.org
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
> > ---
> >  drivers/thermal/intel/intel_hfi.c | 53 +++++++++++++++++++++++++++++++
> >  1 file changed, 53 insertions(+)
> >
> > diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
> > index d2c874f43786..965c245e5e78 100644
> > --- a/drivers/thermal/intel/intel_hfi.c
> > +++ b/drivers/thermal/intel/intel_hfi.c
> > @@ -30,11 +30,13 @@
> >  #include <linux/kernel.h>
> >  #include <linux/math.h>
> >  #include <linux/mutex.h>
> > +#include <linux/notifier.h>
> >  #include <linux/percpu-defs.h>
> >  #include <linux/printk.h>
> >  #include <linux/processor.h>
> >  #include <linux/slab.h>
> >  #include <linux/spinlock.h>
> > +#include <linux/suspend.h>
> >  #include <linux/string.h>
> >  #include <linux/topology.h>
> >  #include <linux/workqueue.h>
> > @@ -569,11 +571,62 @@ static __init int hfi_parse_features(void)
> >         return 0;
> >  }
> >
> > +static void hfi_do_pm_enable(void *info)
> > +{
> > +       struct hfi_instance *hfi_instance = info;
> > +
> > +       hfi_set_hw_table(hfi_instance);
> > +       hfi_enable();
> 
> The above do RMW, so should locking be used here?
> 
> > +}
> > +
> > +static void hfi_do_pm_disable(void *info)
> > +{
> > +       hfi_disable();
> > +}
> 
> And here?

On single-package systems, HFI enable/disable only happens on the boot CPU,
via either the CPU hotplug callbacks or the suspend notifier. It is
unlikely they will run concurrently, IMO. It also looks like good hygiene to
me to use locking, just in case. I will use the hfi_instance_lock in my
next version.

On multi-package systems, the HFI instances of non-boot packages are always
handled via the CPU hotplug callbacks.
 
> 
> > +
> > +static int hfi_pm_notify(struct notifier_block *nb,
> > +                        unsigned long mode, void *unused)
> > +{
> > +       struct hfi_cpu_info *info = &per_cpu(hfi_cpu_info, 0);
> > +       struct hfi_instance *hfi_instance = info->hfi_instance;
> > +
> > +       /* HFI may not be in use. */
> > +       if (!hfi_instance)
> > +               return 0;
> > +
> > +       /*
> > +        * Only handle the HFI instance of the package of the boot CPU. The
> > +        * instances of other packages are handled in the CPU hotplug callbacks.
> > +        */
> > +       switch (mode) {
> > +       case PM_HIBERNATION_PREPARE:
> > +       case PM_SUSPEND_PREPARE:
> > +       case PM_RESTORE_PREPARE:
> > +               return smp_call_function_single(0, hfi_do_pm_disable,
> > +                                               NULL, true);
> > +
> > +       case PM_POST_RESTORE:
> > +       case PM_POST_HIBERNATION:
> > +       case PM_POST_SUSPEND:
> > +               return smp_call_function_single(0, hfi_do_pm_enable,
> > +                                               hfi_instance, true);
> > +       default:
> > +               return -EINVAL;
> > +       }
> > +}
> > +
> > +static struct notifier_block hfi_pm_nb = {
> > +       .notifier_call = hfi_pm_notify,
> > +};
> > +
> >  void __init intel_hfi_init(void)
> >  {
> >         struct hfi_instance *hfi_instance;
> >         int i, j;
> >
> > +       if (register_pm_notifier(&hfi_pm_nb))
> > +               return;
> > +
> >         if (hfi_parse_features())
> >                 return;
> >
> > --
diff mbox series

Patch

diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
index d2c874f43786..965c245e5e78 100644
--- a/drivers/thermal/intel/intel_hfi.c
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -30,11 +30,13 @@ 
 #include <linux/kernel.h>
 #include <linux/math.h>
 #include <linux/mutex.h>
+#include <linux/notifier.h>
 #include <linux/percpu-defs.h>
 #include <linux/printk.h>
 #include <linux/processor.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/suspend.h>
 #include <linux/string.h>
 #include <linux/topology.h>
 #include <linux/workqueue.h>
@@ -569,11 +571,62 @@  static __init int hfi_parse_features(void)
 	return 0;
 }
 
+static void hfi_do_pm_enable(void *info)
+{
+	struct hfi_instance *hfi_instance = info;
+
+	hfi_set_hw_table(hfi_instance);
+	hfi_enable();
+}
+
+static void hfi_do_pm_disable(void *info)
+{
+	hfi_disable();
+}
+
+static int hfi_pm_notify(struct notifier_block *nb,
+			 unsigned long mode, void *unused)
+{
+	struct hfi_cpu_info *info = &per_cpu(hfi_cpu_info, 0);
+	struct hfi_instance *hfi_instance = info->hfi_instance;
+
+	/* HFI may not be in use. */
+	if (!hfi_instance)
+		return 0;
+
+	/*
+	 * Only handle the HFI instance of the package of the boot CPU. The
+	 * instances of other packages are handled in the CPU hotplug callbacks.
+	 */
+	switch (mode) {
+	case PM_HIBERNATION_PREPARE:
+	case PM_SUSPEND_PREPARE:
+	case PM_RESTORE_PREPARE:
+		return smp_call_function_single(0, hfi_do_pm_disable,
+						NULL, true);
+
+	case PM_POST_RESTORE:
+	case PM_POST_HIBERNATION:
+	case PM_POST_SUSPEND:
+		return smp_call_function_single(0, hfi_do_pm_enable,
+						hfi_instance, true);
+	default:
+		return -EINVAL;
+	}
+}
+
+static struct notifier_block hfi_pm_nb = {
+	.notifier_call = hfi_pm_notify,
+};
+
 void __init intel_hfi_init(void)
 {
 	struct hfi_instance *hfi_instance;
 	int i, j;
 
+	if (register_pm_notifier(&hfi_pm_nb))
+		return;
+
 	if (hfi_parse_features())
 		return;