diff mbox series

thermal: intel_powerclamp: Fix cur_state for multi package system

Message ID 20230201180625.2156520-1-srinivas.pandruvada@linux.intel.com (mailing list archive)
State Superseded, archived
Headers show
Series thermal: intel_powerclamp: Fix cur_state for multi package system | expand

Commit Message

Srinivas Pandruvada Feb. 1, 2023, 6:06 p.m. UTC
The powerclamp cooling device cur_state shows actual idle observed by
package C-state idle counters. But the implementation is not sufficient
for multi package or multi die system. The cur_state value is incorrect.
On these systems, these counters must be read from each package/die and
somehow aggregated. But there is no good method for aggregation.

It was not a problem when explicit CPU model addition was required to
enable intel powerclamp. In this way certain CPU models could have
been avoided. But with the removal of CPU model check with the
availability of Package C-state counters, the driver is loaded on most
of the recent systems.

For multi package/die systems, just show the actual target idle state
that the system is trying to achieve. In powerclamp this is the user-set
state minus one.

Also, on these systems there is no use in starting a worker thread for
polling package C-state counters and applying any compensation.

Fixes: b721ca0d1927 ("thermal/powerclamp: remove cpu whitelist")
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: stable@vger.kernel.org # 4.14+
---
 drivers/thermal/intel/intel_powerclamp.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

Comments

Rafael J. Wysocki Feb. 1, 2023, 7:10 p.m. UTC | #1
On Wed, Feb 1, 2023 at 7:06 PM Srinivas Pandruvada
<srinivas.pandruvada@linux.intel.com> wrote:
>
> The powerclamp cooling device cur_state shows actual idle observed by
> package C-state idle counters. But the implementation is not sufficient
> for multi package or multi die system. The cur_state value is incorrect.
> On these systems, these counters must be read from each package/die and
> somehow aggregate them. But there is no good method for aggregation.
>
> It was not a problem when explicit CPU model addition was required to
> enable intel powerclamp. In this way certain CPU models could have
> been avoided. But with the removal of CPU model check with the
> availability of Package C-state counters, the driver is loaded on most
> of the recent systems.
>
> For multi package/die systems, just show the actual target idle state,
> the system is trying to achieve. In powerclamp this is the user set
> state minus one.
>
> Also there is no use of starting a worker thread for polling package
> C-state counters and applying any compensation.

I think that the last paragraph applies to systems with multiple dies/packages?

> Fixes: b721ca0d1927 ("thermal/powerclamp: remove cpu whitelist")



> Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
> Cc: stable@vger.kernel.org # 4.14+
> ---
>  drivers/thermal/intel/intel_powerclamp.c | 20 ++++++++++++++++----
>  1 file changed, 16 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c
> index b80e25ec1261..64f082c584b2 100644
> --- a/drivers/thermal/intel/intel_powerclamp.c
> +++ b/drivers/thermal/intel/intel_powerclamp.c
> @@ -57,6 +57,7 @@
>
>  static unsigned int target_mwait;
>  static struct dentry *debug_dir;
> +static bool poll_pkg_cstate_enable;
>
>  /* user selected target */
>  static unsigned int set_target_ratio;
> @@ -261,6 +262,9 @@ static unsigned int get_compensation(int ratio)
>  {
>         unsigned int comp = 0;
>
> +       if (!poll_pkg_cstate_enable)
> +               return 0;
> +
>         /* we only use compensation if all adjacent ones are good */
>         if (ratio == 1 &&
>                 cal_data[ratio].confidence >= CONFIDENCE_OK &&
> @@ -519,7 +523,8 @@ static int start_power_clamp(void)
>         control_cpu = cpumask_first(cpu_online_mask);
>
>         clamping = true;
> -       schedule_delayed_work(&poll_pkg_cstate_work, 0);
> +       if (poll_pkg_cstate_enable)
> +               schedule_delayed_work(&poll_pkg_cstate_work, 0);
>
>         /* start one kthread worker per online cpu */
>         for_each_online_cpu(cpu) {
> @@ -585,11 +590,15 @@ static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
>  static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
>                                  unsigned long *state)
>  {
> -       if (true == clamping)
> -               *state = pkg_cstate_ratio_cur;
> -       else
> +       if (true == clamping) {

This really should be

        if (clamping) {

> +               if (poll_pkg_cstate_enable)
> +                       *state = pkg_cstate_ratio_cur;
> +               else
> +                       *state = set_target_ratio;
> +       } else {
>                 /* to save power, do not poll idle ratio while not clamping */
>                 *state = -1; /* indicates invalid state */
> +       }
>
>         return 0;
>  }
> @@ -712,6 +721,9 @@ static int __init powerclamp_init(void)
>                 goto exit_unregister;
>         }
>
> +       if (topology_max_packages() == 1 && topology_max_die_per_package() == 1)
> +               poll_pkg_cstate_enable = true;
> +
>         cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
>                                                 &powerclamp_cooling_ops);
>         if (IS_ERR(cooling_dev)) {
> --

This fixes a rather old bug and we are late in the cycle, so I'm a bit
reluctant to push it for -rc7 or -rc8.  I would prefer to apply it for
6.3, but let it go before the other powerclamp driver changes from
you.  This way, if anyone needs to backport it or put it into
-stable, they will be able to do that without pulling in the more
intrusive material.

Now, I do realize that this avoids changing the current behavior too
much, but I think that it is plain confusing to return
pkg_cstate_ratio_cur from powerclamp_get_cur_state() in any case.  It
should always return set_target_ratio IMV.
Srinivas Pandruvada Feb. 1, 2023, 7:19 p.m. UTC | #2
On Wed, 2023-02-01 at 20:10 +0100, Rafael J. Wysocki wrote:
> On Wed, Feb 1, 2023 at 7:06 PM Srinivas Pandruvada
> <srinivas.pandruvada@linux.intel.com> wrote:
> > 
> > The powerclamp cooling device cur_state shows actual idle observed
> > by
> > package C-state idle counters. But the implementation is not
> > sufficient
> > for multi package or multi die system. The cur_state value is
> > incorrect.
> > On these systems, these counters must be read from each package/die
> > and
> > somehow aggregate them. But there is no good method for
> > aggregation.
> > 
> > It was not a problem when explicit CPU model addition was required
> > to
> > enable intel powerclamp. In this way certain CPU models could have
> > been avoided. But with the removal of CPU model check with the
> > availability of Package C-state counters, the driver is loaded on
> > most
> > of the recent systems.
> > 
> > For multi package/die systems, just show the actual target idle
> > state,
> > the system is trying to achieve. In powerclamp this is the user set
> > state minus one.
> > 
> > Also there is no use of starting a worker thread for polling
> > package
> > C-state counters and applying any compensation.
> 
> I think that the last paragraph applies to systems with multiple
> dies/packages?
Yes.

> 
> > Fixes: b721ca0d1927 ("thermal/powerclamp: remove cpu whitelist")
> 
> 
> 
> > Signed-off-by: Srinivas Pandruvada
> > <srinivas.pandruvada@linux.intel.com>
> > Cc: stable@vger.kernel.org # 4.14+
> > ---
> >  drivers/thermal/intel/intel_powerclamp.c | 20 ++++++++++++++++----
> >  1 file changed, 16 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/thermal/intel/intel_powerclamp.c
> > b/drivers/thermal/intel/intel_powerclamp.c
> > index b80e25ec1261..64f082c584b2 100644
> > --- a/drivers/thermal/intel/intel_powerclamp.c
> > +++ b/drivers/thermal/intel/intel_powerclamp.c
> > @@ -57,6 +57,7 @@
> > 
> >  static unsigned int target_mwait;
> >  static struct dentry *debug_dir;
> > +static bool poll_pkg_cstate_enable;
> > 
> >  /* user selected target */
> >  static unsigned int set_target_ratio;
> > @@ -261,6 +262,9 @@ static unsigned int get_compensation(int ratio)
> >  {
> >         unsigned int comp = 0;
> > 
> > +       if (!poll_pkg_cstate_enable)
> > +               return 0;
> > +
> >         /* we only use compensation if all adjacent ones are good
> > */
> >         if (ratio == 1 &&
> >                 cal_data[ratio].confidence >= CONFIDENCE_OK &&
> > @@ -519,7 +523,8 @@ static int start_power_clamp(void)
> >         control_cpu = cpumask_first(cpu_online_mask);
> > 
> >         clamping = true;
> > -       schedule_delayed_work(&poll_pkg_cstate_work, 0);
> > +       if (poll_pkg_cstate_enable)
> > +               schedule_delayed_work(&poll_pkg_cstate_work, 0);
> > 
> >         /* start one kthread worker per online cpu */
> >         for_each_online_cpu(cpu) {
> > @@ -585,11 +590,15 @@ static int powerclamp_get_max_state(struct
> > thermal_cooling_device *cdev,
> >  static int powerclamp_get_cur_state(struct thermal_cooling_device
> > *cdev,
> >                                  unsigned long *state)
> >  {
> > -       if (true == clamping)
> > -               *state = pkg_cstate_ratio_cur;
> > -       else
> > +       if (true == clamping) {
> 
> This really should be
I can change that, just kept the old style.
I will send an update.

> 
>         if (clamping) {
> 
> > +               if (poll_pkg_cstate_enable)
> > +                       *state = pkg_cstate_ratio_cur;
> > +               else
> > +                       *state = set_target_ratio;
> > +       } else {
> >                 /* to save power, do not poll idle ratio while not
> > clamping */
> >                 *state = -1; /* indicates invalid state */
> > +       }
> > 
> >         return 0;
> >  }
> > @@ -712,6 +721,9 @@ static int __init powerclamp_init(void)
> >                 goto exit_unregister;
> >         }
> > 
> > +       if (topology_max_packages() == 1 &&
> > topology_max_die_per_package() == 1)
> > +               poll_pkg_cstate_enable = true;
> > +
> >         cooling_dev =
> > thermal_cooling_device_register("intel_powerclamp", NULL,
> >                                                
> > &powerclamp_cooling_ops);
> >         if (IS_ERR(cooling_dev)) {
> > --
> 
> This fixes a rather old bug and we are late in the cycle, so I'm a
> bit
> reluctant to push it for -rc7 or -rc8.  I would prefer to apply it
> for
> 6.3, but let it go before the other powerclamp driver changes from
> you. 
Yes, that's why I rebased other patches on top of this.

>  This way, if anyone needs to backport it or put it into
> -stable, they will be able to do that without pulling in the more
> intrusive material.
> 
> Now, I do realize that this avoids changing the current behavior too
> much, but I think that it is plain confusing to return
> pkg_cstate_ratio_cur from powerclamp_get_cur_state() in any case.  It
> should always return set_target_ratio IMV.
It should. It is an unnecessary complication. When I use it in thermald, I
don't look at the returned value from cur_state, as it doesn't matter
if the temperature is not under control. I will change this for all
cases.

Thanks,
Srinivas
Rafael J. Wysocki Feb. 1, 2023, 7:22 p.m. UTC | #3
On Wed, Feb 1, 2023 at 8:19 PM srinivas pandruvada
<srinivas.pandruvada@linux.intel.com> wrote:
>
> On Wed, 2023-02-01 at 20:10 +0100, Rafael J. Wysocki wrote:
> > On Wed, Feb 1, 2023 at 7:06 PM Srinivas Pandruvada
> > <srinivas.pandruvada@linux.intel.com> wrote:
> > >
> > > The powerclamp cooling device cur_state shows actual idle observed
> > > by
> > > package C-state idle counters. But the implementation is not
> > > sufficient
> > > for multi package or multi die system. The cur_state value is
> > > incorrect.
> > > On these systems, these counters must be read from each package/die
> > > and
> > > somehow aggregate them. But there is no good method for
> > > aggregation.
> > >
> > > It was not a problem when explicit CPU model addition was required
> > > to
> > > enable intel powerclamp. In this way certain CPU models could have
> > > been avoided. But with the removal of CPU model check with the
> > > availability of Package C-state counters, the driver is loaded on
> > > most
> > > of the recent systems.
> > >
> > > For multi package/die systems, just show the actual target idle
> > > state,
> > > the system is trying to achieve. In powerclamp this is the user set
> > > state minus one.
> > >
> > > Also there is no use of starting a worker thread for polling
> > > package
> > > C-state counters and applying any compensation.
> >
> > I think that the last paragraph applies to systems with multiple
> > dies/packages?
> Yes.
>
> >
> > > Fixes: b721ca0d1927 ("thermal/powerclamp: remove cpu whitelist")
> >
> >
> >
> > > Signed-off-by: Srinivas Pandruvada
> > > <srinivas.pandruvada@linux.intel.com>
> > > Cc: stable@vger.kernel.org # 4.14+
> > > ---
> > >  drivers/thermal/intel/intel_powerclamp.c | 20 ++++++++++++++++----
> > >  1 file changed, 16 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/drivers/thermal/intel/intel_powerclamp.c
> > > b/drivers/thermal/intel/intel_powerclamp.c
> > > index b80e25ec1261..64f082c584b2 100644
> > > --- a/drivers/thermal/intel/intel_powerclamp.c
> > > +++ b/drivers/thermal/intel/intel_powerclamp.c
> > > @@ -57,6 +57,7 @@
> > >
> > >  static unsigned int target_mwait;
> > >  static struct dentry *debug_dir;
> > > +static bool poll_pkg_cstate_enable;
> > >
> > >  /* user selected target */
> > >  static unsigned int set_target_ratio;
> > > @@ -261,6 +262,9 @@ static unsigned int get_compensation(int ratio)
> > >  {
> > >         unsigned int comp = 0;
> > >
> > > +       if (!poll_pkg_cstate_enable)
> > > +               return 0;
> > > +
> > >         /* we only use compensation if all adjacent ones are good
> > > */
> > >         if (ratio == 1 &&
> > >                 cal_data[ratio].confidence >= CONFIDENCE_OK &&
> > > @@ -519,7 +523,8 @@ static int start_power_clamp(void)
> > >         control_cpu = cpumask_first(cpu_online_mask);
> > >
> > >         clamping = true;
> > > -       schedule_delayed_work(&poll_pkg_cstate_work, 0);
> > > +       if (poll_pkg_cstate_enable)
> > > +               schedule_delayed_work(&poll_pkg_cstate_work, 0);
> > >
> > >         /* start one kthread worker per online cpu */
> > >         for_each_online_cpu(cpu) {
> > > @@ -585,11 +590,15 @@ static int powerclamp_get_max_state(struct
> > > thermal_cooling_device *cdev,
> > >  static int powerclamp_get_cur_state(struct thermal_cooling_device
> > > *cdev,
> > >                                  unsigned long *state)
> > >  {
> > > -       if (true == clamping)
> > > -               *state = pkg_cstate_ratio_cur;
> > > -       else
> > > +       if (true == clamping) {
> >
> > This really should be
> I can change that, just kept the old style.
> I will send an update.
>
> >
> >         if (clamping) {
> >
> > > +               if (poll_pkg_cstate_enable)
> > > +                       *state = pkg_cstate_ratio_cur;
> > > +               else
> > > +                       *state = set_target_ratio;
> > > +       } else {
> > >                 /* to save power, do not poll idle ratio while not
> > > clamping */
> > >                 *state = -1; /* indicates invalid state */
> > > +       }
> > >
> > >         return 0;
> > >  }
> > > @@ -712,6 +721,9 @@ static int __init powerclamp_init(void)
> > >                 goto exit_unregister;
> > >         }
> > >
> > > +       if (topology_max_packages() == 1 &&
> > > topology_max_die_per_package() == 1)
> > > +               poll_pkg_cstate_enable = true;
> > > +
> > >         cooling_dev =
> > > thermal_cooling_device_register("intel_powerclamp", NULL,
> > >
> > > &powerclamp_cooling_ops);
> > >         if (IS_ERR(cooling_dev)) {
> > > --
> >
> > This fixes a rather old bug and we are late in the cycle, so I'm a
> > bit
> > reluctant to push it for -rc7 or -rc8.  I would prefer to apply it
> > for
> > 6.3, but let it go before the other powerclamp driver changes from
> > you.
> Yes, that's why I rebased other patches on top of this.
>
> >  This way, if anyone needs to backport it or put it into
> > -stable, they will be able to do that without pulling in the more
> > intrusive material.
> >
> > Now, I do realize that this avoids changing the current behavior too
> > much, but I think that it is plain confusing to return
> > pkg_cstate_ratio_cur from powerclamp_get_cur_state() in any case.  It
> > should always return set_target_ratio IMV.
> It should. It in unnecessary complications. When I use in thermald, I
> don't look at the returned value from cur_state as this doesn't matter
> if the temperature is not under control. I will change this for all
> cases.

I think that this should be a separate patch, though, not to be
confused with the fix.
diff mbox series

Patch

diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c
index b80e25ec1261..64f082c584b2 100644
--- a/drivers/thermal/intel/intel_powerclamp.c
+++ b/drivers/thermal/intel/intel_powerclamp.c
@@ -57,6 +57,7 @@ 
 
 static unsigned int target_mwait;
 static struct dentry *debug_dir;
+static bool poll_pkg_cstate_enable;
 
 /* user selected target */
 static unsigned int set_target_ratio;
@@ -261,6 +262,9 @@  static unsigned int get_compensation(int ratio)
 {
 	unsigned int comp = 0;
 
+	if (!poll_pkg_cstate_enable)
+		return 0;
+
 	/* we only use compensation if all adjacent ones are good */
 	if (ratio == 1 &&
 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
@@ -519,7 +523,8 @@  static int start_power_clamp(void)
 	control_cpu = cpumask_first(cpu_online_mask);
 
 	clamping = true;
-	schedule_delayed_work(&poll_pkg_cstate_work, 0);
+	if (poll_pkg_cstate_enable)
+		schedule_delayed_work(&poll_pkg_cstate_work, 0);
 
 	/* start one kthread worker per online cpu */
 	for_each_online_cpu(cpu) {
@@ -585,11 +590,15 @@  static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
 static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
 				 unsigned long *state)
 {
-	if (true == clamping)
-		*state = pkg_cstate_ratio_cur;
-	else
+	if (true == clamping) {
+		if (poll_pkg_cstate_enable)
+			*state = pkg_cstate_ratio_cur;
+		else
+			*state = set_target_ratio;
+	} else {
 		/* to save power, do not poll idle ratio while not clamping */
 		*state = -1; /* indicates invalid state */
+	}
 
 	return 0;
 }
@@ -712,6 +721,9 @@  static int __init powerclamp_init(void)
 		goto exit_unregister;
 	}
 
+	if (topology_max_packages() == 1 && topology_max_die_per_package() == 1)
+		poll_pkg_cstate_enable = true;
+
 	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
 						&powerclamp_cooling_ops);
 	if (IS_ERR(cooling_dev)) {