diff mbox series

[v2,1/2] drm/nouveau: return temperatures in temp_get() via parameter

Message ID 20200916194711.999602-2-jcline@redhat.com (mailing list archive)
State New, archived
Headers show
Series Add fine-grain temperature reporting | expand

Commit Message

Jeremy Cline Sept. 16, 2020, 7:47 p.m. UTC
The temp_get() function currently returns negative error numbers or a
temperature. However, the thermal sensors can (in theory) measure
negative temperatures. Some implementations of temp_get() correctly
clamp negative temperature readings to 0 so that users do not mistake
them for errors, but some, like gp100_temp_get(), do not.

Rather than relying on implementations remembering to clamp values,
dedicate the function return value to error codes and accept a pointer
to an integer where the temperature reading should be stored.

Signed-off-by: Jeremy Cline <jcline@redhat.com>
---
 .../drm/nouveau/include/nvkm/subdev/therm.h   |  2 +-
 drivers/gpu/drm/nouveau/nouveau_hwmon.c       | 12 ++++++------
 .../gpu/drm/nouveau/nvkm/subdev/therm/base.c  | 19 ++++++++++++++-----
 .../gpu/drm/nouveau/nvkm/subdev/therm/g84.c   | 11 ++++++-----
 .../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 11 ++++++-----
 .../gpu/drm/nouveau/nvkm/subdev/therm/nv40.c  |  9 +++------
 .../gpu/drm/nouveau/nvkm/subdev/therm/nv50.c  |  9 +++------
 .../gpu/drm/nouveau/nvkm/subdev/therm/priv.h  | 17 +++++++++++++++--
 .../gpu/drm/nouveau/nvkm/subdev/therm/temp.c  | 12 ++++++++----
 9 files changed, 62 insertions(+), 40 deletions(-)

Comments

Karol Herbst Sept. 16, 2020, 8:01 p.m. UTC | #1
On Wed, Sep 16, 2020 at 9:47 PM Jeremy Cline <jcline@redhat.com> wrote:
>
> The temp_get() function currently returns negative error numbers or a
> temperature. However, the thermal sensors can (in theory) measure
> negative temperatures. Some implementations of temp_get() correctly
> clamp negative temperature readings to 0 so that users do not mistake
> them for errors, but some, like gp100_temp_get(), do not.
>
> Rather than relying on implementations remembering to clamp values,
> dedicate the function return value to error codes and accept a pointer
> to an integer where the temperature reading should be stored.
>
> Signed-off-by: Jeremy Cline <jcline@redhat.com>
> ---
>  .../drm/nouveau/include/nvkm/subdev/therm.h   |  2 +-
>  drivers/gpu/drm/nouveau/nouveau_hwmon.c       | 12 ++++++------
>  .../gpu/drm/nouveau/nvkm/subdev/therm/base.c  | 19 ++++++++++++++-----
>  .../gpu/drm/nouveau/nvkm/subdev/therm/g84.c   | 11 ++++++-----
>  .../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 11 ++++++-----
>  .../gpu/drm/nouveau/nvkm/subdev/therm/nv40.c  |  9 +++------
>  .../gpu/drm/nouveau/nvkm/subdev/therm/nv50.c  |  9 +++------
>  .../gpu/drm/nouveau/nvkm/subdev/therm/priv.h  | 17 +++++++++++++++--
>  .../gpu/drm/nouveau/nvkm/subdev/therm/temp.c  | 12 ++++++++----
>  9 files changed, 62 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> index 62c34f98c930..bfe9779216fc 100644
> --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> @@ -99,7 +99,7 @@ struct nvkm_therm {
>         bool clkgating_enabled;
>  };
>
> -int nvkm_therm_temp_get(struct nvkm_therm *);
> +int nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp);
>  int nvkm_therm_fan_sense(struct nvkm_therm *);
>  int nvkm_therm_cstate(struct nvkm_therm *, int, int);
>  void nvkm_therm_clkgate_init(struct nvkm_therm *,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> index 1c3104d20571..aff6aa296678 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> @@ -325,7 +325,7 @@ nouveau_temp_is_visible(const void *data, u32 attr, int channel)
>         struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
>         struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
>
> -       if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm) < 0)
> +       if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm, NULL) < 0)
>                 return 0;
>
>         switch (attr) {
> @@ -419,7 +419,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
>         struct drm_device *drm_dev = dev_get_drvdata(dev);
>         struct nouveau_drm *drm = nouveau_drm(drm_dev);
>         struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
> -       int ret;
> +       int ret = 0, temp;
>
>         if (!therm || !therm->attr_get)
>                 return -EOPNOTSUPP;
> @@ -428,8 +428,8 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
>         case hwmon_temp_input:
>                 if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
>                         return -EINVAL;
> -               ret = nvkm_therm_temp_get(therm);
> -               *val = ret < 0 ? ret : (ret * 1000);
> +               ret = nvkm_therm_temp_get(therm, &temp);
> +               *val = temp * 1000;
>                 break;
>         case hwmon_temp_max:
>                 *val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
> @@ -459,7 +459,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
>                 return -EOPNOTSUPP;
>         }
>
> -       return 0;
> +       return ret;
>  }
>
>  static int
> @@ -735,7 +735,7 @@ nouveau_hwmon_init(struct drm_device *dev)
>         hwmon->dev = dev;
>
>         if (therm && therm->attr_get && therm->attr_set) {
> -               if (nvkm_therm_temp_get(therm) >= 0)
> +               if (nvkm_therm_temp_get(therm, NULL) >= 0)
>                         special_groups[i++] = &temp1_auto_point_sensor_group;
>                 if (therm->fan_get && therm->fan_get(therm) >= 0)
>                         special_groups[i++] = &pwm_fan_sensor_group;
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> index 4a4d1e224126..e7ffea1512e6 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> @@ -27,10 +27,15 @@
>  #include <subdev/pmu.h>
>
>  int
> -nvkm_therm_temp_get(struct nvkm_therm *therm)
> +nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp)
>  {
> +       int ignored_reading;
> +
> +       if (temp == NULL)
> +               temp = &ignored_reading;
> +
>         if (therm->func->temp_get)
> -               return therm->func->temp_get(therm);
> +               return therm->func->temp_get(therm, temp);
>         return -ENODEV;
>  }
>
> @@ -40,9 +45,11 @@ nvkm_therm_update_trip(struct nvkm_therm *therm)
>         struct nvbios_therm_trip_point *trip = therm->fan->bios.trip,
>                                        *cur_trip = NULL,
>                                        *last_trip = therm->last_trip;
> -       u8  temp = therm->func->temp_get(therm);
> +       int temp;
>         u16 duty, i;
>
> +       WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> +

I think adding WARN_ONs like this is fine, I am just not sure how much
our code is actually protected against the GPU being runtime
suspended... so on laptops this could trigger some warnings.. might
even be just because nouveau_pmops_runtime_suspend and the hwmon API
race here, even though it's quite unlikely that the hwmon thread
stalls enough so it survives the entire suspend path. But in theory
that could be causing random errors getting reported.

>         /* look for the trip point corresponding to the current temperature */
>         cur_trip = NULL;
>         for (i = 0; i < therm->fan->bios.nr_fan_trip; i++) {
> @@ -70,9 +77,11 @@ static int
>  nvkm_therm_compute_linear_duty(struct nvkm_therm *therm, u8 linear_min_temp,
>                                 u8 linear_max_temp)
>  {
> -       u8  temp = therm->func->temp_get(therm);
> +       int temp;
>         u16 duty;
>
> +       WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> +
>         /* handle the non-linear part first */
>         if (temp < linear_min_temp)
>                 return therm->fan->bios.min_duty;
> @@ -200,7 +209,7 @@ nvkm_therm_fan_mode(struct nvkm_therm *therm, int mode)
>         /* do not allow automatic fan management if the thermal sensor is
>          * not available */
>         if (mode == NVKM_THERM_CTRL_AUTO &&
> -           therm->func->temp_get(therm) < 0)
> +           nvkm_therm_temp_get(therm, NULL) < 0)
>                 return -EINVAL;
>
>         if (therm->mode == mode)
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> index 96f8da40ac82..e2f891d5c7ba 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> @@ -27,14 +27,15 @@
>  #include <subdev/fuse.h>
>
>  int
> -g84_temp_get(struct nvkm_therm *therm)
> +g84_temp_get(struct nvkm_therm *therm, int *temp)
>  {
>         struct nvkm_device *device = therm->subdev.device;
>
> -       if (nvkm_fuse_read(device->fuse, 0x1a8) == 1)
> -               return nvkm_rd32(device, 0x20400);
> -       else
> +       if (nvkm_fuse_read(device->fuse, 0x1a8) != 1)
>                 return -ENODEV;
> +
> +       *temp = nvkm_rd32(device, 0x20400);
> +       return 0;
>  }
>
>  void
> @@ -115,7 +116,7 @@ g84_therm_threshold_hyst_emulation(struct nvkm_therm *therm,
>         }
>
>         /* fix the state (in case someone reprogrammed the alarms) */
> -       cur = therm->func->temp_get(therm);
> +       WARN_ON(nvkm_therm_temp_get(therm, &cur) < 0);
>         if (new_state == NVKM_THERM_THRS_LOWER && cur > thrs->temp)
>                 new_state = NVKM_THERM_THRS_HIGHER;
>         else if (new_state == NVKM_THERM_THRS_HIGHER &&
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> index 9f0dea3f61dc..4c32e4f21bec 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> @@ -24,7 +24,7 @@
>  #include "priv.h"
>
>  static int
> -gp100_temp_get(struct nvkm_therm *therm)
> +gp100_temp_get(struct nvkm_therm *therm, int *temp)
>  {
>         struct nvkm_device *device = therm->subdev.device;
>         struct nvkm_subdev *subdev = &therm->subdev;
> @@ -35,11 +35,12 @@ gp100_temp_get(struct nvkm_therm *therm)
>         if (tsensor & 0x40000000)
>                 nvkm_trace(subdev, "reading temperature from SHADOWed sensor\n");
>
> -       /* device valid */
> -       if (tsensor & 0x20000000)
> -               return (inttemp >> 8);
> -       else
> +       /* device invalid */
> +       if (!(tsensor & 0x20000000))
>                 return -ENODEV;
> +
> +       *temp = inttemp >> 8;
> +       return 0;
>  }
>
>  static const struct nvkm_therm_func
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> index 2c92ffb5f9d0..9753ad4bee4a 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> @@ -70,7 +70,7 @@ nv40_sensor_setup(struct nvkm_therm *therm)
>  }
>
>  static int
> -nv40_temp_get(struct nvkm_therm *therm)
> +nv40_temp_get(struct nvkm_therm *therm, int *temp)
>  {
>         struct nvkm_device *device = therm->subdev.device;
>         struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> @@ -95,11 +95,8 @@ nv40_temp_get(struct nvkm_therm *therm)
>         core_temp = core_temp + sensor->offset_num / sensor->offset_den;
>         core_temp = core_temp + sensor->offset_constant - 8;
>
> -       /* reserve negative temperatures for errors */
> -       if (core_temp < 0)
> -               core_temp = 0;
> -
> -       return core_temp;
> +       *temp = core_temp;
> +       return 0;
>  }
>
>  static int
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> index 9b57b433d4cf..38fa6777c129 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> @@ -126,7 +126,7 @@ nv50_sensor_setup(struct nvkm_therm *therm)
>  }
>
>  static int
> -nv50_temp_get(struct nvkm_therm *therm)
> +nv50_temp_get(struct nvkm_therm *therm, int *temp)
>  {
>         struct nvkm_device *device = therm->subdev.device;
>         struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> @@ -143,11 +143,8 @@ nv50_temp_get(struct nvkm_therm *therm)
>         core_temp = core_temp + sensor->offset_num / sensor->offset_den;
>         core_temp = core_temp + sensor->offset_constant - 8;
>
> -       /* reserve negative temperatures for errors */
> -       if (core_temp < 0)
> -               core_temp = 0;
> -
> -       return core_temp;
> +       *temp = core_temp;
> +       return 0;
>  }
>
>  static void
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> index 21659daf1864..04607d8b1755 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> @@ -91,7 +91,15 @@ struct nvkm_therm_func {
>         int (*pwm_set)(struct nvkm_therm *, int line, u32, u32);
>         int (*pwm_clock)(struct nvkm_therm *, int line);
>
> -       int (*temp_get)(struct nvkm_therm *);
> +       /**
> +        * @temp_get: Get the temperature reading from a thermal device
> +        *
> +        * @therm: The thermal device instance.
> +        * @temp: A pointer to write the temperature reading to.
> +        *
> +        * Returns: Zero on success, or a negative error code on failure.
> +        */
> +       int (*temp_get)(struct nvkm_therm *therm, int *temp);
>
>         int (*fan_sense)(struct nvkm_therm *);
>
> @@ -110,7 +118,12 @@ int  nv50_fan_pwm_get(struct nvkm_therm *, int, u32 *, u32 *);
>  int  nv50_fan_pwm_set(struct nvkm_therm *, int, u32, u32);
>  int  nv50_fan_pwm_clock(struct nvkm_therm *, int);
>
> -int  g84_temp_get(struct nvkm_therm *);
> +/**
> + * g84_temp_get() - An implementation of the &struct nvkm_therm_func temp_get()
> + * @therm: The thermal device instance.
> + * @temp: A pointer to write the temperature reading to.
> + */
> +int  g84_temp_get(struct nvkm_therm *therm, int *temp);
>  void g84_sensor_setup(struct nvkm_therm *);
>  void g84_therm_fini(struct nvkm_therm *);
>
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> index ddb2b2c600ca..1e8803901abc 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> @@ -86,7 +86,9 @@ nvkm_therm_sensor_event(struct nvkm_therm *therm, enum nvkm_therm_thrs thrs,
>         static const char * const thresholds[] = {
>                 "fanboost", "downclock", "critical", "shutdown"
>         };
> -       int temperature = therm->func->temp_get(therm);
> +       int temperature;
> +
> +       WARN_ON(nvkm_therm_temp_get(therm, &temperature) < 0);
>
>         if (thrs < 0 || thrs > 3)
>                 return;
> @@ -140,7 +142,9 @@ nvkm_therm_threshold_hyst_polling(struct nvkm_therm *therm,
>  {
>         enum nvkm_therm_thrs_direction direction;
>         enum nvkm_therm_thrs_state prev_state, new_state;
> -       int temp = therm->func->temp_get(therm);
> +       int temp;
> +
> +       WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
>
>         prev_state = nvkm_therm_sensor_get_threshold_state(therm, thrs_name);
>
> @@ -185,7 +189,7 @@ alarm_timer_callback(struct nvkm_alarm *alarm)
>         spin_unlock_irqrestore(&therm->sensor.alarm_program_lock, flags);
>
>         /* schedule the next poll in one second */
> -       if (therm->func->temp_get(therm) >= 0)
> +       if (nvkm_therm_temp_get(therm, NULL) >= 0)
>                 nvkm_timer_alarm(tmr, 1000000000ULL, alarm);
>  }
>
> @@ -229,7 +233,7 @@ nvkm_therm_sensor_preinit(struct nvkm_therm *therm)
>  {
>         const char *sensor_avail = "yes";
>
> -       if (therm->func->temp_get(therm) < 0)
> +       if (nvkm_therm_temp_get(therm, NULL) < 0)
>                 sensor_avail = "no";
>
>         nvkm_debug(&therm->subdev, "internal sensor: %s\n", sensor_avail);
> --
> 2.26.2
>
Karol Herbst Sept. 16, 2020, 8:03 p.m. UTC | #2
On Wed, Sep 16, 2020 at 10:01 PM Karol Herbst <kherbst@redhat.com> wrote:
>
> On Wed, Sep 16, 2020 at 9:47 PM Jeremy Cline <jcline@redhat.com> wrote:
> >
> > The temp_get() function currently returns negative error numbers or a
> > temperature. However, the thermal sensors can (in theory) measure
> > negative temperatures. Some implementations of temp_get() correctly
> > clamp negative temperature readings to 0 so that users do not mistake
> > them for errors, but some, like gp100_temp_get(), do not.
> >
> > Rather than relying on implementations remembering to clamp values,
> > dedicate the function return value to error codes and accept a pointer
> > to an integer where the temperature reading should be stored.
> >
> > Signed-off-by: Jeremy Cline <jcline@redhat.com>
> > ---
> >  .../drm/nouveau/include/nvkm/subdev/therm.h   |  2 +-
> >  drivers/gpu/drm/nouveau/nouveau_hwmon.c       | 12 ++++++------
> >  .../gpu/drm/nouveau/nvkm/subdev/therm/base.c  | 19 ++++++++++++++-----
> >  .../gpu/drm/nouveau/nvkm/subdev/therm/g84.c   | 11 ++++++-----
> >  .../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 11 ++++++-----
> >  .../gpu/drm/nouveau/nvkm/subdev/therm/nv40.c  |  9 +++------
> >  .../gpu/drm/nouveau/nvkm/subdev/therm/nv50.c  |  9 +++------
> >  .../gpu/drm/nouveau/nvkm/subdev/therm/priv.h  | 17 +++++++++++++++--
> >  .../gpu/drm/nouveau/nvkm/subdev/therm/temp.c  | 12 ++++++++----
> >  9 files changed, 62 insertions(+), 40 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > index 62c34f98c930..bfe9779216fc 100644
> > --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > @@ -99,7 +99,7 @@ struct nvkm_therm {
> >         bool clkgating_enabled;
> >  };
> >
> > -int nvkm_therm_temp_get(struct nvkm_therm *);
> > +int nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp);
> >  int nvkm_therm_fan_sense(struct nvkm_therm *);
> >  int nvkm_therm_cstate(struct nvkm_therm *, int, int);
> >  void nvkm_therm_clkgate_init(struct nvkm_therm *,
> > diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > index 1c3104d20571..aff6aa296678 100644
> > --- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > +++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > @@ -325,7 +325,7 @@ nouveau_temp_is_visible(const void *data, u32 attr, int channel)
> >         struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
> >         struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
> >
> > -       if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm) < 0)
> > +       if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm, NULL) < 0)
> >                 return 0;
> >
> >         switch (attr) {
> > @@ -419,7 +419,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> >         struct drm_device *drm_dev = dev_get_drvdata(dev);
> >         struct nouveau_drm *drm = nouveau_drm(drm_dev);
> >         struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
> > -       int ret;
> > +       int ret = 0, temp;
> >
> >         if (!therm || !therm->attr_get)
> >                 return -EOPNOTSUPP;
> > @@ -428,8 +428,8 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> >         case hwmon_temp_input:
> >                 if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
> >                         return -EINVAL;
> > -               ret = nvkm_therm_temp_get(therm);
> > -               *val = ret < 0 ? ret : (ret * 1000);
> > +               ret = nvkm_therm_temp_get(therm, &temp);
> > +               *val = temp * 1000;
> >                 break;
> >         case hwmon_temp_max:
> >                 *val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
> > @@ -459,7 +459,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> >                 return -EOPNOTSUPP;
> >         }
> >
> > -       return 0;
> > +       return ret;
> >  }
> >
> >  static int
> > @@ -735,7 +735,7 @@ nouveau_hwmon_init(struct drm_device *dev)
> >         hwmon->dev = dev;
> >
> >         if (therm && therm->attr_get && therm->attr_set) {
> > -               if (nvkm_therm_temp_get(therm) >= 0)
> > +               if (nvkm_therm_temp_get(therm, NULL) >= 0)
> >                         special_groups[i++] = &temp1_auto_point_sensor_group;
> >                 if (therm->fan_get && therm->fan_get(therm) >= 0)
> >                         special_groups[i++] = &pwm_fan_sensor_group;
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > index 4a4d1e224126..e7ffea1512e6 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > @@ -27,10 +27,15 @@
> >  #include <subdev/pmu.h>
> >
> >  int
> > -nvkm_therm_temp_get(struct nvkm_therm *therm)
> > +nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp)
> >  {
> > +       int ignored_reading;
> > +
> > +       if (temp == NULL)
> > +               temp = &ignored_reading;
> > +
> >         if (therm->func->temp_get)
> > -               return therm->func->temp_get(therm);
> > +               return therm->func->temp_get(therm, temp);
> >         return -ENODEV;
> >  }
> >
> > @@ -40,9 +45,11 @@ nvkm_therm_update_trip(struct nvkm_therm *therm)
> >         struct nvbios_therm_trip_point *trip = therm->fan->bios.trip,
> >                                        *cur_trip = NULL,
> >                                        *last_trip = therm->last_trip;
> > -       u8  temp = therm->func->temp_get(therm);
> > +       int temp;
> >         u16 duty, i;
> >
> > +       WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > +
>
> I think adding WARN_ONs like this is fine, I am just not sure how much
> our code is actually protected against the GPU being runtime
> suspended... so on laptops this could trigger some warnings.. might
> even be just because nouveau_pmops_runtime_suspend and the hwmon API
> race here, even though it's quite unlikely that the hwmon thread
> stalls enough so it survives the entire suspend path. But in theory
> that could be causing random errors getting reported.
>

Actually, we never set the status to DRM_SWITCH_POWER_CHANGING, so I
guess this race is indeed real. I thought we did that, but maybe
that's because I never send out the patch to use them.

> >         /* look for the trip point corresponding to the current temperature */
> >         cur_trip = NULL;
> >         for (i = 0; i < therm->fan->bios.nr_fan_trip; i++) {
> > @@ -70,9 +77,11 @@ static int
> >  nvkm_therm_compute_linear_duty(struct nvkm_therm *therm, u8 linear_min_temp,
> >                                 u8 linear_max_temp)
> >  {
> > -       u8  temp = therm->func->temp_get(therm);
> > +       int temp;
> >         u16 duty;
> >
> > +       WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > +
> >         /* handle the non-linear part first */
> >         if (temp < linear_min_temp)
> >                 return therm->fan->bios.min_duty;
> > @@ -200,7 +209,7 @@ nvkm_therm_fan_mode(struct nvkm_therm *therm, int mode)
> >         /* do not allow automatic fan management if the thermal sensor is
> >          * not available */
> >         if (mode == NVKM_THERM_CTRL_AUTO &&
> > -           therm->func->temp_get(therm) < 0)
> > +           nvkm_therm_temp_get(therm, NULL) < 0)
> >                 return -EINVAL;
> >
> >         if (therm->mode == mode)
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > index 96f8da40ac82..e2f891d5c7ba 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > @@ -27,14 +27,15 @@
> >  #include <subdev/fuse.h>
> >
> >  int
> > -g84_temp_get(struct nvkm_therm *therm)
> > +g84_temp_get(struct nvkm_therm *therm, int *temp)
> >  {
> >         struct nvkm_device *device = therm->subdev.device;
> >
> > -       if (nvkm_fuse_read(device->fuse, 0x1a8) == 1)
> > -               return nvkm_rd32(device, 0x20400);
> > -       else
> > +       if (nvkm_fuse_read(device->fuse, 0x1a8) != 1)
> >                 return -ENODEV;
> > +
> > +       *temp = nvkm_rd32(device, 0x20400);
> > +       return 0;
> >  }
> >
> >  void
> > @@ -115,7 +116,7 @@ g84_therm_threshold_hyst_emulation(struct nvkm_therm *therm,
> >         }
> >
> >         /* fix the state (in case someone reprogrammed the alarms) */
> > -       cur = therm->func->temp_get(therm);
> > +       WARN_ON(nvkm_therm_temp_get(therm, &cur) < 0);
> >         if (new_state == NVKM_THERM_THRS_LOWER && cur > thrs->temp)
> >                 new_state = NVKM_THERM_THRS_HIGHER;
> >         else if (new_state == NVKM_THERM_THRS_HIGHER &&
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > index 9f0dea3f61dc..4c32e4f21bec 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > @@ -24,7 +24,7 @@
> >  #include "priv.h"
> >
> >  static int
> > -gp100_temp_get(struct nvkm_therm *therm)
> > +gp100_temp_get(struct nvkm_therm *therm, int *temp)
> >  {
> >         struct nvkm_device *device = therm->subdev.device;
> >         struct nvkm_subdev *subdev = &therm->subdev;
> > @@ -35,11 +35,12 @@ gp100_temp_get(struct nvkm_therm *therm)
> >         if (tsensor & 0x40000000)
> >                 nvkm_trace(subdev, "reading temperature from SHADOWed sensor\n");
> >
> > -       /* device valid */
> > -       if (tsensor & 0x20000000)
> > -               return (inttemp >> 8);
> > -       else
> > +       /* device invalid */
> > +       if (!(tsensor & 0x20000000))
> >                 return -ENODEV;
> > +
> > +       *temp = inttemp >> 8;
> > +       return 0;
> >  }
> >
> >  static const struct nvkm_therm_func
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > index 2c92ffb5f9d0..9753ad4bee4a 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > @@ -70,7 +70,7 @@ nv40_sensor_setup(struct nvkm_therm *therm)
> >  }
> >
> >  static int
> > -nv40_temp_get(struct nvkm_therm *therm)
> > +nv40_temp_get(struct nvkm_therm *therm, int *temp)
> >  {
> >         struct nvkm_device *device = therm->subdev.device;
> >         struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> > @@ -95,11 +95,8 @@ nv40_temp_get(struct nvkm_therm *therm)
> >         core_temp = core_temp + sensor->offset_num / sensor->offset_den;
> >         core_temp = core_temp + sensor->offset_constant - 8;
> >
> > -       /* reserve negative temperatures for errors */
> > -       if (core_temp < 0)
> > -               core_temp = 0;
> > -
> > -       return core_temp;
> > +       *temp = core_temp;
> > +       return 0;
> >  }
> >
> >  static int
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > index 9b57b433d4cf..38fa6777c129 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > @@ -126,7 +126,7 @@ nv50_sensor_setup(struct nvkm_therm *therm)
> >  }
> >
> >  static int
> > -nv50_temp_get(struct nvkm_therm *therm)
> > +nv50_temp_get(struct nvkm_therm *therm, int *temp)
> >  {
> >         struct nvkm_device *device = therm->subdev.device;
> >         struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> > @@ -143,11 +143,8 @@ nv50_temp_get(struct nvkm_therm *therm)
> >         core_temp = core_temp + sensor->offset_num / sensor->offset_den;
> >         core_temp = core_temp + sensor->offset_constant - 8;
> >
> > -       /* reserve negative temperatures for errors */
> > -       if (core_temp < 0)
> > -               core_temp = 0;
> > -
> > -       return core_temp;
> > +       *temp = core_temp;
> > +       return 0;
> >  }
> >
> >  static void
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > index 21659daf1864..04607d8b1755 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > @@ -91,7 +91,15 @@ struct nvkm_therm_func {
> >         int (*pwm_set)(struct nvkm_therm *, int line, u32, u32);
> >         int (*pwm_clock)(struct nvkm_therm *, int line);
> >
> > -       int (*temp_get)(struct nvkm_therm *);
> > +       /**
> > +        * @temp_get: Get the temperature reading from a thermal device
> > +        *
> > +        * @therm: The thermal device instance.
> > +        * @temp: A pointer to write the temperature reading to.
> > +        *
> > +        * Returns: Zero on success, or a negative error code on failure.
> > +        */
> > +       int (*temp_get)(struct nvkm_therm *therm, int *temp);
> >
> >         int (*fan_sense)(struct nvkm_therm *);
> >
> > @@ -110,7 +118,12 @@ int  nv50_fan_pwm_get(struct nvkm_therm *, int, u32 *, u32 *);
> >  int  nv50_fan_pwm_set(struct nvkm_therm *, int, u32, u32);
> >  int  nv50_fan_pwm_clock(struct nvkm_therm *, int);
> >
> > -int  g84_temp_get(struct nvkm_therm *);
> > +/**
> > + * g84_temp_get() - An implementation of the &struct nvkm_therm_func temp_get()
> > + * @therm: The thermal device instance.
> > + * @temp: A pointer to write the temperature reading to.
> > + */
> > +int  g84_temp_get(struct nvkm_therm *therm, int *temp);
> >  void g84_sensor_setup(struct nvkm_therm *);
> >  void g84_therm_fini(struct nvkm_therm *);
> >
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > index ddb2b2c600ca..1e8803901abc 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > @@ -86,7 +86,9 @@ nvkm_therm_sensor_event(struct nvkm_therm *therm, enum nvkm_therm_thrs thrs,
> >         static const char * const thresholds[] = {
> >                 "fanboost", "downclock", "critical", "shutdown"
> >         };
> > -       int temperature = therm->func->temp_get(therm);
> > +       int temperature;
> > +
> > +       WARN_ON(nvkm_therm_temp_get(therm, &temperature) < 0);
> >
> >         if (thrs < 0 || thrs > 3)
> >                 return;
> > @@ -140,7 +142,9 @@ nvkm_therm_threshold_hyst_polling(struct nvkm_therm *therm,
> >  {
> >         enum nvkm_therm_thrs_direction direction;
> >         enum nvkm_therm_thrs_state prev_state, new_state;
> > -       int temp = therm->func->temp_get(therm);
> > +       int temp;
> > +
> > +       WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> >
> >         prev_state = nvkm_therm_sensor_get_threshold_state(therm, thrs_name);
> >
> > @@ -185,7 +189,7 @@ alarm_timer_callback(struct nvkm_alarm *alarm)
> >         spin_unlock_irqrestore(&therm->sensor.alarm_program_lock, flags);
> >
> >         /* schedule the next poll in one second */
> > -       if (therm->func->temp_get(therm) >= 0)
> > +       if (nvkm_therm_temp_get(therm, NULL) >= 0)
> >                 nvkm_timer_alarm(tmr, 1000000000ULL, alarm);
> >  }
> >
> > @@ -229,7 +233,7 @@ nvkm_therm_sensor_preinit(struct nvkm_therm *therm)
> >  {
> >         const char *sensor_avail = "yes";
> >
> > -       if (therm->func->temp_get(therm) < 0)
> > +       if (nvkm_therm_temp_get(therm, NULL) < 0)
> >                 sensor_avail = "no";
> >
> >         nvkm_debug(&therm->subdev, "internal sensor: %s\n", sensor_avail);
> > --
> > 2.26.2
> >
Jeremy Cline Sept. 17, 2020, 2:11 p.m. UTC | #3
On Wed, Sep 16, 2020 at 10:03:22PM +0200, Karol Herbst wrote:
> On Wed, Sep 16, 2020 at 10:01 PM Karol Herbst <kherbst@redhat.com> wrote:
> >
> > On Wed, Sep 16, 2020 at 9:47 PM Jeremy Cline <jcline@redhat.com> wrote:
> > >
> > > The temp_get() function currently returns negative error numbers or a
> > > temperature. However, the thermal sensors can (in theory) measure
> > > negative temperatures. Some implementations of temp_get() correctly
> > > clamp negative temperature readings to 0 so that users do not mistake
> > > them for errors, but some, like gp100_temp_get(), do not.
> > >
> > > Rather than relying on implementations remembering to clamp values,
> > > dedicate the function return value to error codes and accept a pointer
> > > to an integer where the temperature reading should be stored.
> > >
> > > Signed-off-by: Jeremy Cline <jcline@redhat.com>
> > > ---
> > >  .../drm/nouveau/include/nvkm/subdev/therm.h   |  2 +-
> > >  drivers/gpu/drm/nouveau/nouveau_hwmon.c       | 12 ++++++------
> > >  .../gpu/drm/nouveau/nvkm/subdev/therm/base.c  | 19 ++++++++++++++-----
> > >  .../gpu/drm/nouveau/nvkm/subdev/therm/g84.c   | 11 ++++++-----
> > >  .../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 11 ++++++-----
> > >  .../gpu/drm/nouveau/nvkm/subdev/therm/nv40.c  |  9 +++------
> > >  .../gpu/drm/nouveau/nvkm/subdev/therm/nv50.c  |  9 +++------
> > >  .../gpu/drm/nouveau/nvkm/subdev/therm/priv.h  | 17 +++++++++++++++--
> > >  .../gpu/drm/nouveau/nvkm/subdev/therm/temp.c  | 12 ++++++++----
> > >  9 files changed, 62 insertions(+), 40 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > > index 62c34f98c930..bfe9779216fc 100644
> > > --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > > +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > > @@ -99,7 +99,7 @@ struct nvkm_therm {
> > >         bool clkgating_enabled;
> > >  };
> > >
> > > -int nvkm_therm_temp_get(struct nvkm_therm *);
> > > +int nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp);
> > >  int nvkm_therm_fan_sense(struct nvkm_therm *);
> > >  int nvkm_therm_cstate(struct nvkm_therm *, int, int);
> > >  void nvkm_therm_clkgate_init(struct nvkm_therm *,
> > > diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > > index 1c3104d20571..aff6aa296678 100644
> > > --- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > > +++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > > @@ -325,7 +325,7 @@ nouveau_temp_is_visible(const void *data, u32 attr, int channel)
> > >         struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
> > >         struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
> > >
> > > -       if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm) < 0)
> > > +       if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm, NULL) < 0)
> > >                 return 0;
> > >
> > >         switch (attr) {
> > > @@ -419,7 +419,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > >         struct drm_device *drm_dev = dev_get_drvdata(dev);
> > >         struct nouveau_drm *drm = nouveau_drm(drm_dev);
> > >         struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
> > > -       int ret;
> > > +       int ret = 0, temp;
> > >
> > >         if (!therm || !therm->attr_get)
> > >                 return -EOPNOTSUPP;
> > > @@ -428,8 +428,8 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > >         case hwmon_temp_input:
> > >                 if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
> > >                         return -EINVAL;
> > > -               ret = nvkm_therm_temp_get(therm);
> > > -               *val = ret < 0 ? ret : (ret * 1000);
> > > +               ret = nvkm_therm_temp_get(therm, &temp);
> > > +               *val = temp * 1000;
> > >                 break;
> > >         case hwmon_temp_max:
> > >                 *val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
> > > @@ -459,7 +459,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > >                 return -EOPNOTSUPP;
> > >         }
> > >
> > > -       return 0;
> > > +       return ret;
> > >  }
> > >
> > >  static int
> > > @@ -735,7 +735,7 @@ nouveau_hwmon_init(struct drm_device *dev)
> > >         hwmon->dev = dev;
> > >
> > >         if (therm && therm->attr_get && therm->attr_set) {
> > > -               if (nvkm_therm_temp_get(therm) >= 0)
> > > +               if (nvkm_therm_temp_get(therm, NULL) >= 0)
> > >                         special_groups[i++] = &temp1_auto_point_sensor_group;
> > >                 if (therm->fan_get && therm->fan_get(therm) >= 0)
> > >                         special_groups[i++] = &pwm_fan_sensor_group;
> > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > > index 4a4d1e224126..e7ffea1512e6 100644
> > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > > @@ -27,10 +27,15 @@
> > >  #include <subdev/pmu.h>
> > >
> > >  int
> > > -nvkm_therm_temp_get(struct nvkm_therm *therm)
> > > +nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp)
> > >  {
> > > +       int ignored_reading;
> > > +
> > > +       if (temp == NULL)
> > > +               temp = &ignored_reading;
> > > +
> > >         if (therm->func->temp_get)
> > > -               return therm->func->temp_get(therm);
> > > +               return therm->func->temp_get(therm, temp);
> > >         return -ENODEV;
> > >  }
> > >
> > > @@ -40,9 +45,11 @@ nvkm_therm_update_trip(struct nvkm_therm *therm)
> > >         struct nvbios_therm_trip_point *trip = therm->fan->bios.trip,
> > >                                        *cur_trip = NULL,
> > >                                        *last_trip = therm->last_trip;
> > > -       u8  temp = therm->func->temp_get(therm);
> > > +       int temp;
> > >         u16 duty, i;
> > >
> > > +       WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > > +
> >
> > I think adding WARN_ONs like this is fine, I am just not sure how much
> > our code is actually protected against the GPU being runtime
> > suspended... so on laptops this could trigger some warnings.. might
> > even be just because nouveau_pmops_runtime_suspend and the hwmon API
> > race here, even though it's quite unlikely that the hwmon thread
> > stalls enough so it survives the entire suspend path. But in theory
> > that could be causing random errors getting reported.
> >
> 
> Actually, we never set the status to DRM_SWITCH_POWER_CHANGING, so I
> guess this race is indeed real. I thought we did that, but maybe
> that's because I never send out the patch to use them.
> 

Yeah, so my thinking was that since before there was no error handling
it would show whether or not it's a real problem, but it sounds like it
is so I think it'd be better to handle that.

I see nvkm_therm_update() is checking to make sure the duty is positive.
Should it always be safe to skip setting the fan in cases where the
temperature isn't readable? If so we can just return the error here.

> > >         /* look for the trip point corresponding to the current temperature */
> > >         cur_trip = NULL;
> > >         for (i = 0; i < therm->fan->bios.nr_fan_trip; i++) {
> > > @@ -70,9 +77,11 @@ static int
> > >  nvkm_therm_compute_linear_duty(struct nvkm_therm *therm, u8 linear_min_temp,
> > >                                 u8 linear_max_temp)
> > >  {
> > > -       u8  temp = therm->func->temp_get(therm);
> > > +       int temp;
> > >         u16 duty;
> > >
> > > +       WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > > +
> > >         /* handle the non-linear part first */
> > >         if (temp < linear_min_temp)
> > >                 return therm->fan->bios.min_duty;
> > > @@ -200,7 +209,7 @@ nvkm_therm_fan_mode(struct nvkm_therm *therm, int mode)
> > >         /* do not allow automatic fan management if the thermal sensor is
> > >          * not available */
> > >         if (mode == NVKM_THERM_CTRL_AUTO &&
> > > -           therm->func->temp_get(therm) < 0)
> > > +           nvkm_therm_temp_get(therm, NULL) < 0)
> > >                 return -EINVAL;
> > >
> > >         if (therm->mode == mode)
> > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > > index 96f8da40ac82..e2f891d5c7ba 100644
> > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > > @@ -27,14 +27,15 @@
> > >  #include <subdev/fuse.h>
> > >
> > >  int
> > > -g84_temp_get(struct nvkm_therm *therm)
> > > +g84_temp_get(struct nvkm_therm *therm, int *temp)
> > >  {
> > >         struct nvkm_device *device = therm->subdev.device;
> > >
> > > -       if (nvkm_fuse_read(device->fuse, 0x1a8) == 1)
> > > -               return nvkm_rd32(device, 0x20400);
> > > -       else
> > > +       if (nvkm_fuse_read(device->fuse, 0x1a8) != 1)
> > >                 return -ENODEV;
> > > +
> > > +       *temp = nvkm_rd32(device, 0x20400);
> > > +       return 0;
> > >  }
> > >
> > >  void
> > > @@ -115,7 +116,7 @@ g84_therm_threshold_hyst_emulation(struct nvkm_therm *therm,
> > >         }
> > >
> > >         /* fix the state (in case someone reprogrammed the alarms) */
> > > -       cur = therm->func->temp_get(therm);
> > > +       WARN_ON(nvkm_therm_temp_get(therm, &cur) < 0);
> > >         if (new_state == NVKM_THERM_THRS_LOWER && cur > thrs->temp)
> > >                 new_state = NVKM_THERM_THRS_HIGHER;
> > >         else if (new_state == NVKM_THERM_THRS_HIGHER &&
> > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > > index 9f0dea3f61dc..4c32e4f21bec 100644
> > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > > @@ -24,7 +24,7 @@
> > >  #include "priv.h"
> > >
> > >  static int
> > > -gp100_temp_get(struct nvkm_therm *therm)
> > > +gp100_temp_get(struct nvkm_therm *therm, int *temp)
> > >  {
> > >         struct nvkm_device *device = therm->subdev.device;
> > >         struct nvkm_subdev *subdev = &therm->subdev;
> > > @@ -35,11 +35,12 @@ gp100_temp_get(struct nvkm_therm *therm)
> > >         if (tsensor & 0x40000000)
> > >                 nvkm_trace(subdev, "reading temperature from SHADOWed sensor\n");
> > >
> > > -       /* device valid */
> > > -       if (tsensor & 0x20000000)
> > > -               return (inttemp >> 8);
> > > -       else
> > > +       /* device invalid */
> > > +       if (!(tsensor & 0x20000000))
> > >                 return -ENODEV;
> > > +
> > > +       *temp = inttemp >> 8;
> > > +       return 0;
> > >  }
> > >
> > >  static const struct nvkm_therm_func
> > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > > index 2c92ffb5f9d0..9753ad4bee4a 100644
> > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > > @@ -70,7 +70,7 @@ nv40_sensor_setup(struct nvkm_therm *therm)
> > >  }
> > >
> > >  static int
> > > -nv40_temp_get(struct nvkm_therm *therm)
> > > +nv40_temp_get(struct nvkm_therm *therm, int *temp)
> > >  {
> > >         struct nvkm_device *device = therm->subdev.device;
> > >         struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> > > @@ -95,11 +95,8 @@ nv40_temp_get(struct nvkm_therm *therm)
> > >         core_temp = core_temp + sensor->offset_num / sensor->offset_den;
> > >         core_temp = core_temp + sensor->offset_constant - 8;
> > >
> > > -       /* reserve negative temperatures for errors */
> > > -       if (core_temp < 0)
> > > -               core_temp = 0;
> > > -
> > > -       return core_temp;
> > > +       *temp = core_temp;
> > > +       return 0;
> > >  }
> > >
> > >  static int
> > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > > index 9b57b433d4cf..38fa6777c129 100644
> > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > > @@ -126,7 +126,7 @@ nv50_sensor_setup(struct nvkm_therm *therm)
> > >  }
> > >
> > >  static int
> > > -nv50_temp_get(struct nvkm_therm *therm)
> > > +nv50_temp_get(struct nvkm_therm *therm, int *temp)
> > >  {
> > >         struct nvkm_device *device = therm->subdev.device;
> > >         struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> > > @@ -143,11 +143,8 @@ nv50_temp_get(struct nvkm_therm *therm)
> > >         core_temp = core_temp + sensor->offset_num / sensor->offset_den;
> > >         core_temp = core_temp + sensor->offset_constant - 8;
> > >
> > > -       /* reserve negative temperatures for errors */
> > > -       if (core_temp < 0)
> > > -               core_temp = 0;
> > > -
> > > -       return core_temp;
> > > +       *temp = core_temp;
> > > +       return 0;
> > >  }
> > >
> > >  static void
> > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > > index 21659daf1864..04607d8b1755 100644
> > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > > @@ -91,7 +91,15 @@ struct nvkm_therm_func {
> > >         int (*pwm_set)(struct nvkm_therm *, int line, u32, u32);
> > >         int (*pwm_clock)(struct nvkm_therm *, int line);
> > >
> > > -       int (*temp_get)(struct nvkm_therm *);
> > > +       /**
> > > +        * @temp_get: Get the temperature reading from a thermal device
> > > +        *
> > > +        * @therm: The thermal device instance.
> > > +        * @temp: A pointer to write the temperature reading to.
> > > +        *
> > > +        * Returns: Zero on success, or a negative error code on failure.
> > > +        */
> > > +       int (*temp_get)(struct nvkm_therm *therm, int *temp);
> > >
> > >         int (*fan_sense)(struct nvkm_therm *);
> > >
> > > @@ -110,7 +118,12 @@ int  nv50_fan_pwm_get(struct nvkm_therm *, int, u32 *, u32 *);
> > >  int  nv50_fan_pwm_set(struct nvkm_therm *, int, u32, u32);
> > >  int  nv50_fan_pwm_clock(struct nvkm_therm *, int);
> > >
> > > -int  g84_temp_get(struct nvkm_therm *);
> > > +/**
> > > + * g84_temp_get() - An implementation of the &struct nvkm_therm_func temp_get()
> > > + * @therm: The thermal device instance.
> > > + * @temp: A pointer to write the temperature reading to.
> > > + */
> > > +int  g84_temp_get(struct nvkm_therm *therm, int *temp);
> > >  void g84_sensor_setup(struct nvkm_therm *);
> > >  void g84_therm_fini(struct nvkm_therm *);
> > >
> > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > > index ddb2b2c600ca..1e8803901abc 100644
> > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > > @@ -86,7 +86,9 @@ nvkm_therm_sensor_event(struct nvkm_therm *therm, enum nvkm_therm_thrs thrs,
> > >         static const char * const thresholds[] = {
> > >                 "fanboost", "downclock", "critical", "shutdown"
> > >         };
> > > -       int temperature = therm->func->temp_get(therm);
> > > +       int temperature;
> > > +
> > > +       WARN_ON(nvkm_therm_temp_get(therm, &temperature) < 0);
> > >
> > >         if (thrs < 0 || thrs > 3)
> > >                 return;
> > > @@ -140,7 +142,9 @@ nvkm_therm_threshold_hyst_polling(struct nvkm_therm *therm,
> > >  {
> > >         enum nvkm_therm_thrs_direction direction;
> > >         enum nvkm_therm_thrs_state prev_state, new_state;
> > > -       int temp = therm->func->temp_get(therm);
> > > +       int temp;
> > > +
> > > +       WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > >
> > >         prev_state = nvkm_therm_sensor_get_threshold_state(therm, thrs_name);
> > >
> > > @@ -185,7 +189,7 @@ alarm_timer_callback(struct nvkm_alarm *alarm)
> > >         spin_unlock_irqrestore(&therm->sensor.alarm_program_lock, flags);
> > >
> > >         /* schedule the next poll in one second */
> > > -       if (therm->func->temp_get(therm) >= 0)
> > > +       if (nvkm_therm_temp_get(therm, NULL) >= 0)
> > >                 nvkm_timer_alarm(tmr, 1000000000ULL, alarm);
> > >  }
> > >
> > > @@ -229,7 +233,7 @@ nvkm_therm_sensor_preinit(struct nvkm_therm *therm)
> > >  {
> > >         const char *sensor_avail = "yes";
> > >
> > > -       if (therm->func->temp_get(therm) < 0)
> > > +       if (nvkm_therm_temp_get(therm, NULL) < 0)
> > >                 sensor_avail = "no";
> > >
> > >         nvkm_debug(&therm->subdev, "internal sensor: %s\n", sensor_avail);
> > > --
> > > 2.26.2
> > >
>
Karol Herbst Sept. 17, 2020, 2:23 p.m. UTC | #4
On Thu, Sep 17, 2020 at 4:11 PM Jeremy Cline <jcline@redhat.com> wrote:
>
> On Wed, Sep 16, 2020 at 10:03:22PM +0200, Karol Herbst wrote:
> > On Wed, Sep 16, 2020 at 10:01 PM Karol Herbst <kherbst@redhat.com> wrote:
> > >
> > > On Wed, Sep 16, 2020 at 9:47 PM Jeremy Cline <jcline@redhat.com> wrote:
> > > >
> > > > The temp_get() function currently returns negative error numbers or a
> > > > temperature. However, the thermal sensors can (in theory) measure
> > > > negative temperatures. Some implementations of temp_get() correctly
> > > > clamp negative temperature readings to 0 so that users do not mistake
> > > > them for errors, but some, like gp100_temp_get(), do not.
> > > >
> > > > Rather than relying on implementations remembering to clamp values,
> > > > dedicate the function return value to error codes and accept a pointer
> > > > to an integer where the temperature reading should be stored.
> > > >
> > > > Signed-off-by: Jeremy Cline <jcline@redhat.com>
> > > > ---
> > > >  .../drm/nouveau/include/nvkm/subdev/therm.h   |  2 +-
> > > >  drivers/gpu/drm/nouveau/nouveau_hwmon.c       | 12 ++++++------
> > > >  .../gpu/drm/nouveau/nvkm/subdev/therm/base.c  | 19 ++++++++++++++-----
> > > >  .../gpu/drm/nouveau/nvkm/subdev/therm/g84.c   | 11 ++++++-----
> > > >  .../gpu/drm/nouveau/nvkm/subdev/therm/gp100.c | 11 ++++++-----
> > > >  .../gpu/drm/nouveau/nvkm/subdev/therm/nv40.c  |  9 +++------
> > > >  .../gpu/drm/nouveau/nvkm/subdev/therm/nv50.c  |  9 +++------
> > > >  .../gpu/drm/nouveau/nvkm/subdev/therm/priv.h  | 17 +++++++++++++++--
> > > >  .../gpu/drm/nouveau/nvkm/subdev/therm/temp.c  | 12 ++++++++----
> > > >  9 files changed, 62 insertions(+), 40 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > > > index 62c34f98c930..bfe9779216fc 100644
> > > > --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > > > +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
> > > > @@ -99,7 +99,7 @@ struct nvkm_therm {
> > > >         bool clkgating_enabled;
> > > >  };
> > > >
> > > > -int nvkm_therm_temp_get(struct nvkm_therm *);
> > > > +int nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp);
> > > >  int nvkm_therm_fan_sense(struct nvkm_therm *);
> > > >  int nvkm_therm_cstate(struct nvkm_therm *, int, int);
> > > >  void nvkm_therm_clkgate_init(struct nvkm_therm *,
> > > > diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > > > index 1c3104d20571..aff6aa296678 100644
> > > > --- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > > > +++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
> > > > @@ -325,7 +325,7 @@ nouveau_temp_is_visible(const void *data, u32 attr, int channel)
> > > >         struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
> > > >         struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
> > > >
> > > > -       if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm) < 0)
> > > > +       if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm, NULL) < 0)
> > > >                 return 0;
> > > >
> > > >         switch (attr) {
> > > > @@ -419,7 +419,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > > >         struct drm_device *drm_dev = dev_get_drvdata(dev);
> > > >         struct nouveau_drm *drm = nouveau_drm(drm_dev);
> > > >         struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
> > > > -       int ret;
> > > > +       int ret = 0, temp;
> > > >
> > > >         if (!therm || !therm->attr_get)
> > > >                 return -EOPNOTSUPP;
> > > > @@ -428,8 +428,8 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > > >         case hwmon_temp_input:
> > > >                 if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
> > > >                         return -EINVAL;
> > > > -               ret = nvkm_therm_temp_get(therm);
> > > > -               *val = ret < 0 ? ret : (ret * 1000);
> > > > +               ret = nvkm_therm_temp_get(therm, &temp);
> > > > +               *val = temp * 1000;
> > > >                 break;
> > > >         case hwmon_temp_max:
> > > >                 *val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
> > > > @@ -459,7 +459,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
> > > >                 return -EOPNOTSUPP;
> > > >         }
> > > >
> > > > -       return 0;
> > > > +       return ret;
> > > >  }
> > > >
> > > >  static int
> > > > @@ -735,7 +735,7 @@ nouveau_hwmon_init(struct drm_device *dev)
> > > >         hwmon->dev = dev;
> > > >
> > > >         if (therm && therm->attr_get && therm->attr_set) {
> > > > -               if (nvkm_therm_temp_get(therm) >= 0)
> > > > +               if (nvkm_therm_temp_get(therm, NULL) >= 0)
> > > >                         special_groups[i++] = &temp1_auto_point_sensor_group;
> > > >                 if (therm->fan_get && therm->fan_get(therm) >= 0)
> > > >                         special_groups[i++] = &pwm_fan_sensor_group;
> > > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > > > index 4a4d1e224126..e7ffea1512e6 100644
> > > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
> > > > @@ -27,10 +27,15 @@
> > > >  #include <subdev/pmu.h>
> > > >
> > > >  int
> > > > -nvkm_therm_temp_get(struct nvkm_therm *therm)
> > > > +nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp)
> > > >  {
> > > > +       int ignored_reading;
> > > > +
> > > > +       if (temp == NULL)
> > > > +               temp = &ignored_reading;
> > > > +
> > > >         if (therm->func->temp_get)
> > > > -               return therm->func->temp_get(therm);
> > > > +               return therm->func->temp_get(therm, temp);
> > > >         return -ENODEV;
> > > >  }
> > > >
> > > > @@ -40,9 +45,11 @@ nvkm_therm_update_trip(struct nvkm_therm *therm)
> > > >         struct nvbios_therm_trip_point *trip = therm->fan->bios.trip,
> > > >                                        *cur_trip = NULL,
> > > >                                        *last_trip = therm->last_trip;
> > > > -       u8  temp = therm->func->temp_get(therm);
> > > > +       int temp;
> > > >         u16 duty, i;
> > > >
> > > > +       WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > > > +
> > >
> > > I think adding WARN_ONs like this is fine, I am just not sure how much
> > > our code is actually protected against the GPU being runtime
> > > suspended... so on laptops this could trigger some warnings.. might
> > > even be just because nouveau_pmops_runtime_suspend and the hwmon API
> > > race here, even though it's quite unlikely that the hwmon thread
> > > stalls enough so it survives the entire suspend path. But in theory
> > > that could be causing random errors getting reported.
> > >
> >
> > Actually, we never set the status to DRM_SWITCH_POWER_CHANGING, so I
> > guess this race is indeed real. I thought we did that, but maybe
> > that's because I never send out the patch to use them.
> >
>
> Yeah, so my thinking was that since before there was no error handling
> it would show whether or not it's a real problem, but it sounds like it
> is so I think it'd be better to handle that.
>

yeah.. I guess that issue existed before already although I think we
treated negative values special enough so it never showed? mhh..
anyway, I think we want to set the power state to _CHANGING anyway and
I can send out a patch for that. The other part is to make sure that
the suspend paths are stopping all the relevant kworkers which I'd
assume they do anyway.

> I see nvkm_therm_update() is checking to make sure the duty is positive.
> Should it always be safe to skip setting the fan in cases where the
> temperature isn't readable? If so we can just return the error here.
>

I think the fan workers should be stopped once the GPU runtime
suspends, but I am not 100% on this, so we might want to check up on
that.

> > > >         /* look for the trip point corresponding to the current temperature */
> > > >         cur_trip = NULL;
> > > >         for (i = 0; i < therm->fan->bios.nr_fan_trip; i++) {
> > > > @@ -70,9 +77,11 @@ static int
> > > >  nvkm_therm_compute_linear_duty(struct nvkm_therm *therm, u8 linear_min_temp,
> > > >                                 u8 linear_max_temp)
> > > >  {
> > > > -       u8  temp = therm->func->temp_get(therm);
> > > > +       int temp;
> > > >         u16 duty;
> > > >
> > > > +       WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > > > +
> > > >         /* handle the non-linear part first */
> > > >         if (temp < linear_min_temp)
> > > >                 return therm->fan->bios.min_duty;
> > > > @@ -200,7 +209,7 @@ nvkm_therm_fan_mode(struct nvkm_therm *therm, int mode)
> > > >         /* do not allow automatic fan management if the thermal sensor is
> > > >          * not available */
> > > >         if (mode == NVKM_THERM_CTRL_AUTO &&
> > > > -           therm->func->temp_get(therm) < 0)
> > > > +           nvkm_therm_temp_get(therm, NULL) < 0)
> > > >                 return -EINVAL;
> > > >
> > > >         if (therm->mode == mode)
> > > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > > > index 96f8da40ac82..e2f891d5c7ba 100644
> > > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
> > > > @@ -27,14 +27,15 @@
> > > >  #include <subdev/fuse.h>
> > > >
> > > >  int
> > > > -g84_temp_get(struct nvkm_therm *therm)
> > > > +g84_temp_get(struct nvkm_therm *therm, int *temp)
> > > >  {
> > > >         struct nvkm_device *device = therm->subdev.device;
> > > >
> > > > -       if (nvkm_fuse_read(device->fuse, 0x1a8) == 1)
> > > > -               return nvkm_rd32(device, 0x20400);
> > > > -       else
> > > > +       if (nvkm_fuse_read(device->fuse, 0x1a8) != 1)
> > > >                 return -ENODEV;
> > > > +
> > > > +       *temp = nvkm_rd32(device, 0x20400);
> > > > +       return 0;
> > > >  }
> > > >
> > > >  void
> > > > @@ -115,7 +116,7 @@ g84_therm_threshold_hyst_emulation(struct nvkm_therm *therm,
> > > >         }
> > > >
> > > >         /* fix the state (in case someone reprogrammed the alarms) */
> > > > -       cur = therm->func->temp_get(therm);
> > > > +       WARN_ON(nvkm_therm_temp_get(therm, &cur) < 0);
> > > >         if (new_state == NVKM_THERM_THRS_LOWER && cur > thrs->temp)
> > > >                 new_state = NVKM_THERM_THRS_HIGHER;
> > > >         else if (new_state == NVKM_THERM_THRS_HIGHER &&
> > > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > > > index 9f0dea3f61dc..4c32e4f21bec 100644
> > > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
> > > > @@ -24,7 +24,7 @@
> > > >  #include "priv.h"
> > > >
> > > >  static int
> > > > -gp100_temp_get(struct nvkm_therm *therm)
> > > > +gp100_temp_get(struct nvkm_therm *therm, int *temp)
> > > >  {
> > > >         struct nvkm_device *device = therm->subdev.device;
> > > >         struct nvkm_subdev *subdev = &therm->subdev;
> > > > @@ -35,11 +35,12 @@ gp100_temp_get(struct nvkm_therm *therm)
> > > >         if (tsensor & 0x40000000)
> > > >                 nvkm_trace(subdev, "reading temperature from SHADOWed sensor\n");
> > > >
> > > > -       /* device valid */
> > > > -       if (tsensor & 0x20000000)
> > > > -               return (inttemp >> 8);
> > > > -       else
> > > > +       /* device invalid */
> > > > +       if (!(tsensor & 0x20000000))
> > > >                 return -ENODEV;
> > > > +
> > > > +       *temp = inttemp >> 8;
> > > > +       return 0;
> > > >  }
> > > >
> > > >  static const struct nvkm_therm_func
> > > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > > > index 2c92ffb5f9d0..9753ad4bee4a 100644
> > > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
> > > > @@ -70,7 +70,7 @@ nv40_sensor_setup(struct nvkm_therm *therm)
> > > >  }
> > > >
> > > >  static int
> > > > -nv40_temp_get(struct nvkm_therm *therm)
> > > > +nv40_temp_get(struct nvkm_therm *therm, int *temp)
> > > >  {
> > > >         struct nvkm_device *device = therm->subdev.device;
> > > >         struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> > > > @@ -95,11 +95,8 @@ nv40_temp_get(struct nvkm_therm *therm)
> > > >         core_temp = core_temp + sensor->offset_num / sensor->offset_den;
> > > >         core_temp = core_temp + sensor->offset_constant - 8;
> > > >
> > > > -       /* reserve negative temperatures for errors */
> > > > -       if (core_temp < 0)
> > > > -               core_temp = 0;
> > > > -
> > > > -       return core_temp;
> > > > +       *temp = core_temp;
> > > > +       return 0;
> > > >  }
> > > >
> > > >  static int
> > > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > > > index 9b57b433d4cf..38fa6777c129 100644
> > > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
> > > > @@ -126,7 +126,7 @@ nv50_sensor_setup(struct nvkm_therm *therm)
> > > >  }
> > > >
> > > >  static int
> > > > -nv50_temp_get(struct nvkm_therm *therm)
> > > > +nv50_temp_get(struct nvkm_therm *therm, int *temp)
> > > >  {
> > > >         struct nvkm_device *device = therm->subdev.device;
> > > >         struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
> > > > @@ -143,11 +143,8 @@ nv50_temp_get(struct nvkm_therm *therm)
> > > >         core_temp = core_temp + sensor->offset_num / sensor->offset_den;
> > > >         core_temp = core_temp + sensor->offset_constant - 8;
> > > >
> > > > -       /* reserve negative temperatures for errors */
> > > > -       if (core_temp < 0)
> > > > -               core_temp = 0;
> > > > -
> > > > -       return core_temp;
> > > > +       *temp = core_temp;
> > > > +       return 0;
> > > >  }
> > > >
> > > >  static void
> > > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > > > index 21659daf1864..04607d8b1755 100644
> > > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
> > > > @@ -91,7 +91,15 @@ struct nvkm_therm_func {
> > > >         int (*pwm_set)(struct nvkm_therm *, int line, u32, u32);
> > > >         int (*pwm_clock)(struct nvkm_therm *, int line);
> > > >
> > > > -       int (*temp_get)(struct nvkm_therm *);
> > > > +       /**
> > > > +        * @temp_get: Get the temperature reading from a thermal device
> > > > +        *
> > > > +        * @therm: The thermal device instance.
> > > > +        * @temp: A pointer to write the temperature reading to.
> > > > +        *
> > > > +        * Returns: Zero on success, or a negative error code on failure.
> > > > +        */
> > > > +       int (*temp_get)(struct nvkm_therm *therm, int *temp);
> > > >
> > > >         int (*fan_sense)(struct nvkm_therm *);
> > > >
> > > > @@ -110,7 +118,12 @@ int  nv50_fan_pwm_get(struct nvkm_therm *, int, u32 *, u32 *);
> > > >  int  nv50_fan_pwm_set(struct nvkm_therm *, int, u32, u32);
> > > >  int  nv50_fan_pwm_clock(struct nvkm_therm *, int);
> > > >
> > > > -int  g84_temp_get(struct nvkm_therm *);
> > > > +/**
> > > > + * g84_temp_get() - An implementation of the &struct nvkm_therm_func temp_get()
> > > > + * @therm: The thermal device instance.
> > > > + * @temp: A pointer to write the temperature reading to.
> > > > + */
> > > > +int  g84_temp_get(struct nvkm_therm *therm, int *temp);
> > > >  void g84_sensor_setup(struct nvkm_therm *);
> > > >  void g84_therm_fini(struct nvkm_therm *);
> > > >
> > > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > > > index ddb2b2c600ca..1e8803901abc 100644
> > > > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > > > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
> > > > @@ -86,7 +86,9 @@ nvkm_therm_sensor_event(struct nvkm_therm *therm, enum nvkm_therm_thrs thrs,
> > > >         static const char * const thresholds[] = {
> > > >                 "fanboost", "downclock", "critical", "shutdown"
> > > >         };
> > > > -       int temperature = therm->func->temp_get(therm);
> > > > +       int temperature;
> > > > +
> > > > +       WARN_ON(nvkm_therm_temp_get(therm, &temperature) < 0);
> > > >
> > > >         if (thrs < 0 || thrs > 3)
> > > >                 return;
> > > > @@ -140,7 +142,9 @@ nvkm_therm_threshold_hyst_polling(struct nvkm_therm *therm,
> > > >  {
> > > >         enum nvkm_therm_thrs_direction direction;
> > > >         enum nvkm_therm_thrs_state prev_state, new_state;
> > > > -       int temp = therm->func->temp_get(therm);
> > > > +       int temp;
> > > > +
> > > > +       WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
> > > >
> > > >         prev_state = nvkm_therm_sensor_get_threshold_state(therm, thrs_name);
> > > >
> > > > @@ -185,7 +189,7 @@ alarm_timer_callback(struct nvkm_alarm *alarm)
> > > >         spin_unlock_irqrestore(&therm->sensor.alarm_program_lock, flags);
> > > >
> > > >         /* schedule the next poll in one second */
> > > > -       if (therm->func->temp_get(therm) >= 0)
> > > > +       if (nvkm_therm_temp_get(therm, NULL) >= 0)
> > > >                 nvkm_timer_alarm(tmr, 1000000000ULL, alarm);
> > > >  }
> > > >
> > > > @@ -229,7 +233,7 @@ nvkm_therm_sensor_preinit(struct nvkm_therm *therm)
> > > >  {
> > > >         const char *sensor_avail = "yes";
> > > >
> > > > -       if (therm->func->temp_get(therm) < 0)
> > > > +       if (nvkm_therm_temp_get(therm, NULL) < 0)
> > > >                 sensor_avail = "no";
> > > >
> > > >         nvkm_debug(&therm->subdev, "internal sensor: %s\n", sensor_avail);
> > > > --
> > > > 2.26.2
> > > >
> >
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
index 62c34f98c930..bfe9779216fc 100644
--- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
+++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h
@@ -99,7 +99,7 @@  struct nvkm_therm {
 	bool clkgating_enabled;
 };
 
-int nvkm_therm_temp_get(struct nvkm_therm *);
+int nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp);
 int nvkm_therm_fan_sense(struct nvkm_therm *);
 int nvkm_therm_cstate(struct nvkm_therm *, int, int);
 void nvkm_therm_clkgate_init(struct nvkm_therm *,
diff --git a/drivers/gpu/drm/nouveau/nouveau_hwmon.c b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
index 1c3104d20571..aff6aa296678 100644
--- a/drivers/gpu/drm/nouveau/nouveau_hwmon.c
+++ b/drivers/gpu/drm/nouveau/nouveau_hwmon.c
@@ -325,7 +325,7 @@  nouveau_temp_is_visible(const void *data, u32 attr, int channel)
 	struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
 	struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
 
-	if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm) < 0)
+	if (!therm || !therm->attr_get || nvkm_therm_temp_get(therm, NULL) < 0)
 		return 0;
 
 	switch (attr) {
@@ -419,7 +419,7 @@  nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
 	struct drm_device *drm_dev = dev_get_drvdata(dev);
 	struct nouveau_drm *drm = nouveau_drm(drm_dev);
 	struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
-	int ret;
+	int ret = 0, temp;
 
 	if (!therm || !therm->attr_get)
 		return -EOPNOTSUPP;
@@ -428,8 +428,8 @@  nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
 	case hwmon_temp_input:
 		if (drm_dev->switch_power_state != DRM_SWITCH_POWER_ON)
 			return -EINVAL;
-		ret = nvkm_therm_temp_get(therm);
-		*val = ret < 0 ? ret : (ret * 1000);
+		ret = nvkm_therm_temp_get(therm, &temp);
+		*val = temp * 1000;
 		break;
 	case hwmon_temp_max:
 		*val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
@@ -459,7 +459,7 @@  nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
 		return -EOPNOTSUPP;
 	}
 
-	return 0;
+	return ret;
 }
 
 static int
@@ -735,7 +735,7 @@  nouveau_hwmon_init(struct drm_device *dev)
 	hwmon->dev = dev;
 
 	if (therm && therm->attr_get && therm->attr_set) {
-		if (nvkm_therm_temp_get(therm) >= 0)
+		if (nvkm_therm_temp_get(therm, NULL) >= 0)
 			special_groups[i++] = &temp1_auto_point_sensor_group;
 		if (therm->fan_get && therm->fan_get(therm) >= 0)
 			special_groups[i++] = &pwm_fan_sensor_group;
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
index 4a4d1e224126..e7ffea1512e6 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
@@ -27,10 +27,15 @@ 
 #include <subdev/pmu.h>
 
 int
-nvkm_therm_temp_get(struct nvkm_therm *therm)
+nvkm_therm_temp_get(struct nvkm_therm *therm, int *temp)
 {
+	int ignored_reading;
+
+	if (temp == NULL)
+		temp = &ignored_reading;
+
 	if (therm->func->temp_get)
-		return therm->func->temp_get(therm);
+		return therm->func->temp_get(therm, temp);
 	return -ENODEV;
 }
 
@@ -40,9 +45,11 @@  nvkm_therm_update_trip(struct nvkm_therm *therm)
 	struct nvbios_therm_trip_point *trip = therm->fan->bios.trip,
 				       *cur_trip = NULL,
 				       *last_trip = therm->last_trip;
-	u8  temp = therm->func->temp_get(therm);
+	int temp;
 	u16 duty, i;
 
+	WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
+
 	/* look for the trip point corresponding to the current temperature */
 	cur_trip = NULL;
 	for (i = 0; i < therm->fan->bios.nr_fan_trip; i++) {
@@ -70,9 +77,11 @@  static int
 nvkm_therm_compute_linear_duty(struct nvkm_therm *therm, u8 linear_min_temp,
                                u8 linear_max_temp)
 {
-	u8  temp = therm->func->temp_get(therm);
+	int temp;
 	u16 duty;
 
+	WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
+
 	/* handle the non-linear part first */
 	if (temp < linear_min_temp)
 		return therm->fan->bios.min_duty;
@@ -200,7 +209,7 @@  nvkm_therm_fan_mode(struct nvkm_therm *therm, int mode)
 	/* do not allow automatic fan management if the thermal sensor is
 	 * not available */
 	if (mode == NVKM_THERM_CTRL_AUTO &&
-	    therm->func->temp_get(therm) < 0)
+	    nvkm_therm_temp_get(therm, NULL) < 0)
 		return -EINVAL;
 
 	if (therm->mode == mode)
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
index 96f8da40ac82..e2f891d5c7ba 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/g84.c
@@ -27,14 +27,15 @@ 
 #include <subdev/fuse.h>
 
 int
-g84_temp_get(struct nvkm_therm *therm)
+g84_temp_get(struct nvkm_therm *therm, int *temp)
 {
 	struct nvkm_device *device = therm->subdev.device;
 
-	if (nvkm_fuse_read(device->fuse, 0x1a8) == 1)
-		return nvkm_rd32(device, 0x20400);
-	else
+	if (nvkm_fuse_read(device->fuse, 0x1a8) != 1)
 		return -ENODEV;
+
+	*temp = nvkm_rd32(device, 0x20400);
+	return 0;
 }
 
 void
@@ -115,7 +116,7 @@  g84_therm_threshold_hyst_emulation(struct nvkm_therm *therm,
 	}
 
 	/* fix the state (in case someone reprogrammed the alarms) */
-	cur = therm->func->temp_get(therm);
+	WARN_ON(nvkm_therm_temp_get(therm, &cur) < 0);
 	if (new_state == NVKM_THERM_THRS_LOWER && cur > thrs->temp)
 		new_state = NVKM_THERM_THRS_HIGHER;
 	else if (new_state == NVKM_THERM_THRS_HIGHER &&
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
index 9f0dea3f61dc..4c32e4f21bec 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c
@@ -24,7 +24,7 @@ 
 #include "priv.h"
 
 static int
-gp100_temp_get(struct nvkm_therm *therm)
+gp100_temp_get(struct nvkm_therm *therm, int *temp)
 {
 	struct nvkm_device *device = therm->subdev.device;
 	struct nvkm_subdev *subdev = &therm->subdev;
@@ -35,11 +35,12 @@  gp100_temp_get(struct nvkm_therm *therm)
 	if (tsensor & 0x40000000)
 		nvkm_trace(subdev, "reading temperature from SHADOWed sensor\n");
 
-	/* device valid */
-	if (tsensor & 0x20000000)
-		return (inttemp >> 8);
-	else
+	/* device invalid */
+	if (!(tsensor & 0x20000000))
 		return -ENODEV;
+
+	*temp = inttemp >> 8;
+	return 0;
 }
 
 static const struct nvkm_therm_func
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
index 2c92ffb5f9d0..9753ad4bee4a 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv40.c
@@ -70,7 +70,7 @@  nv40_sensor_setup(struct nvkm_therm *therm)
 }
 
 static int
-nv40_temp_get(struct nvkm_therm *therm)
+nv40_temp_get(struct nvkm_therm *therm, int *temp)
 {
 	struct nvkm_device *device = therm->subdev.device;
 	struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
@@ -95,11 +95,8 @@  nv40_temp_get(struct nvkm_therm *therm)
 	core_temp = core_temp + sensor->offset_num / sensor->offset_den;
 	core_temp = core_temp + sensor->offset_constant - 8;
 
-	/* reserve negative temperatures for errors */
-	if (core_temp < 0)
-		core_temp = 0;
-
-	return core_temp;
+	*temp = core_temp;
+	return 0;
 }
 
 static int
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
index 9b57b433d4cf..38fa6777c129 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/nv50.c
@@ -126,7 +126,7 @@  nv50_sensor_setup(struct nvkm_therm *therm)
 }
 
 static int
-nv50_temp_get(struct nvkm_therm *therm)
+nv50_temp_get(struct nvkm_therm *therm, int *temp)
 {
 	struct nvkm_device *device = therm->subdev.device;
 	struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
@@ -143,11 +143,8 @@  nv50_temp_get(struct nvkm_therm *therm)
 	core_temp = core_temp + sensor->offset_num / sensor->offset_den;
 	core_temp = core_temp + sensor->offset_constant - 8;
 
-	/* reserve negative temperatures for errors */
-	if (core_temp < 0)
-		core_temp = 0;
-
-	return core_temp;
+	*temp = core_temp;
+	return 0;
 }
 
 static void
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
index 21659daf1864..04607d8b1755 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/priv.h
@@ -91,7 +91,15 @@  struct nvkm_therm_func {
 	int (*pwm_set)(struct nvkm_therm *, int line, u32, u32);
 	int (*pwm_clock)(struct nvkm_therm *, int line);
 
-	int (*temp_get)(struct nvkm_therm *);
+	/**
+	 * @temp_get: Get the temperature reading from a thermal device
+	 *
+	 * @therm: The thermal device instance.
+	 * @temp: A pointer to write the temperature reading to.
+	 *
+	 * Returns: Zero on success, or a negative error code on failure.
+	 */
+	int (*temp_get)(struct nvkm_therm *therm, int *temp);
 
 	int (*fan_sense)(struct nvkm_therm *);
 
@@ -110,7 +118,12 @@  int  nv50_fan_pwm_get(struct nvkm_therm *, int, u32 *, u32 *);
 int  nv50_fan_pwm_set(struct nvkm_therm *, int, u32, u32);
 int  nv50_fan_pwm_clock(struct nvkm_therm *, int);
 
-int  g84_temp_get(struct nvkm_therm *);
+/**
+ * g84_temp_get() - An implementation of the &struct nvkm_therm_func temp_get()
+ * @therm: The thermal device instance.
+ * @temp: A pointer to write the temperature reading to.
+ */
+int  g84_temp_get(struct nvkm_therm *therm, int *temp);
 void g84_sensor_setup(struct nvkm_therm *);
 void g84_therm_fini(struct nvkm_therm *);
 
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
index ddb2b2c600ca..1e8803901abc 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/temp.c
@@ -86,7 +86,9 @@  nvkm_therm_sensor_event(struct nvkm_therm *therm, enum nvkm_therm_thrs thrs,
 	static const char * const thresholds[] = {
 		"fanboost", "downclock", "critical", "shutdown"
 	};
-	int temperature = therm->func->temp_get(therm);
+	int temperature;
+
+	WARN_ON(nvkm_therm_temp_get(therm, &temperature) < 0);
 
 	if (thrs < 0 || thrs > 3)
 		return;
@@ -140,7 +142,9 @@  nvkm_therm_threshold_hyst_polling(struct nvkm_therm *therm,
 {
 	enum nvkm_therm_thrs_direction direction;
 	enum nvkm_therm_thrs_state prev_state, new_state;
-	int temp = therm->func->temp_get(therm);
+	int temp;
+
+	WARN_ON(nvkm_therm_temp_get(therm, &temp) < 0);
 
 	prev_state = nvkm_therm_sensor_get_threshold_state(therm, thrs_name);
 
@@ -185,7 +189,7 @@  alarm_timer_callback(struct nvkm_alarm *alarm)
 	spin_unlock_irqrestore(&therm->sensor.alarm_program_lock, flags);
 
 	/* schedule the next poll in one second */
-	if (therm->func->temp_get(therm) >= 0)
+	if (nvkm_therm_temp_get(therm, NULL) >= 0)
 		nvkm_timer_alarm(tmr, 1000000000ULL, alarm);
 }
 
@@ -229,7 +233,7 @@  nvkm_therm_sensor_preinit(struct nvkm_therm *therm)
 {
 	const char *sensor_avail = "yes";
 
-	if (therm->func->temp_get(therm) < 0)
+	if (nvkm_therm_temp_get(therm, NULL) < 0)
 		sensor_avail = "no";
 
 	nvkm_debug(&therm->subdev, "internal sensor: %s\n", sensor_avail);