diff mbox

[11/15] thermal: thermal: Add support for hardware-tracked trip points

Message ID 1431507163-19933-12-git-send-email-s.hauer@pengutronix.de
State New, archived
Headers show

Commit Message

Sascha Hauer May 13, 2015, 8:52 a.m. UTC
This adds support for hardware-tracked trip points to the device tree
thermal sensor framework.

The framework supports an arbitrary number of trip points. Whenever
the current temperature is updated, the trip points immediately
below and above the current temperature are found. A .set_trips
callback is then called with the temperatures. If there is no trip
point above or below the current temperature, the passed trip
temperature will be -INT_MAX or INT_MAX respectively. In this callback,
the driver should program the hardware such that it is notified
when either of these trip points are triggered. When a trip point
is triggered, the driver should call `thermal_zone_device_update'
for the respective thermal zone. This will cause the trip points
to be updated again.

If .set_trips is not implemented, the framework behaves as before.

This patch is based on an earlier version from Mikko Perttunen
<mikko.perttunen@kapsi.fi>

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
---
 drivers/thermal/thermal_core.c | 43 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/thermal.h        |  3 +++
 2 files changed, 46 insertions(+)

Comments

Mikko Perttunen May 18, 2015, 9:06 a.m. UTC | #1
Thanks for picking up this patch :)

Sorry for being late with these, but here's a few comments..

On 05/13/2015 11:52 AM, Sascha Hauer wrote:
 > This adds support for hardware-tracked trip points to the device tree
 > thermal sensor framework.
 >
 > The framework supports an arbitrary number of trip points. Whenever
 > the current temperature is updated, the trip points immediately
 > below and above the current temperature are found. A .set_trips
 > callback is then called with the temperatures. If there is no trip
 > point above or below the current temperature, the passed trip
 > temperature will be -INT_MAX or INT_MAX respectively. In this callback,
 > the driver should program the hardware such that it is notified
 > when either of these trip points are triggered. When a trip point
 > is triggered, the driver should call `thermal_zone_device_update'
 > for the respective thermal zone. This will cause the trip points
 > to be updated again.
 >
 > If .set_trips is not implemented, the framework behaves as before.
 >
 > This patch is based on an earlier version from Mikko Perttunen
 > <mikko.perttunen@kapsi.fi>
 >
 > Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
 > ---
 >  drivers/thermal/thermal_core.c | 43 
++++++++++++++++++++++++++++++++++++++++++
 >  include/linux/thermal.h        |  3 +++
 >  2 files changed, 46 insertions(+)
 >
 > diff --git a/drivers/thermal/thermal_core.c 
b/drivers/thermal/thermal_core.c
 > index 6bbf61f..3ae1795 100644
 > --- a/drivers/thermal/thermal_core.c
 > +++ b/drivers/thermal/thermal_core.c
 > @@ -453,6 +453,45 @@ int thermal_zone_get_temp(struct 
thermal_zone_device *tz, int *temp)
 >  }
 >  EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
 >
 > +static void thermal_zone_set_trips(struct thermal_zone_device *tz)
 > +{
 > +	int low = -INT_MAX;
 > +	int high = INT_MAX;
 > +	int trip_temp, hysteresis;
 > +	int temp = tz->temperature;
 > +	int i;
 > +
 > +	if (!tz->ops->set_trips)
 > +		return;
 > +
 > +	/* No need to change trip points */
 > +	if (temp > tz->prev_low_trip && temp < tz->prev_high_trip)
 > +		return;
 > +
 > +	for (i = 0; i < tz->trips; i++) {
 > +		int trip_low;
 > +
 > +		tz->ops->get_trip_temp(tz, i, &trip_temp);
 > +		tz->ops->get_trip_hyst(tz, i, &hysteresis);
 > +
 > +		trip_low = trip_temp - hysteresis;
 > +
 > +		if (trip_low < temp && trip_low > low)
 > +			low = trip_low;
 > +
 > +		if (trip_temp > temp && trip_temp < high)
 > +			high = trip_temp;
 > +	}
 > +
 > +	tz->prev_low_trip = low;
 > +	tz->prev_high_trip = high;
 > +
 > +	dev_dbg(&tz->device, "new temperature boundaries: %d < x < %d\n",
 > +			low, high);
 > +
 > +	tz->ops->set_trips(tz, low, high);

This should probably do something if set_trips returns an error code; at 
least an error message, perhaps enable polling? I'm not exactly sure 
what safety features the thermal framework has in general if errors happen..

One interesting thing I noticed was that at least the bang-bang governor 
only acts if the temperature is properly smaller than (trip temp - 
hysteresis). So perhaps we should specify the non-tripping range as 
[low, high)? Or we could change bang-bang.

 > +}
 > +
 >  void thermal_zone_device_update(struct thermal_zone_device *tz)
 >  {
 >  	int temp, ret, count;
 > @@ -479,6 +518,8 @@ void thermal_zone_device_update(struct 
thermal_zone_device *tz)
 >  	dev_dbg(&tz->device, "last_temperature=%d, current_temperature=%d\n",
 >  				tz->last_temperature, tz->temperature);
 >
 > +	thermal_zone_set_trips(tz);
 > +
 >  	for (count = 0; count < tz->trips; count++)
 >  		handle_thermal_trip(tz, count);
 >  }

set_trips should also be called from temp_store and other places that 
modify values that affect the trip points.

 > @@ -1494,6 +1535,8 @@ struct thermal_zone_device 
*thermal_zone_device_register(const char *type,
 >  	tz->trips = trips;
 >  	tz->passive_delay = passive_delay;
 >  	tz->polling_delay = polling_delay;
 > +	tz->prev_low_trip = INT_MAX;
 > +	tz->prev_high_trip = -INT_MAX;
 >
 >  	dev_set_name(&tz->device, "thermal_zone%d", tz->id);
 >  	result = device_register(&tz->device);
 > diff --git a/include/linux/thermal.h b/include/linux/thermal.h
 > index 07bd5e8..aef6e13 100644
 > --- a/include/linux/thermal.h
 > +++ b/include/linux/thermal.h
 > @@ -87,6 +87,7 @@ struct thermal_zone_device_ops {
 >  	int (*unbind) (struct thermal_zone_device *,
 >  		       struct thermal_cooling_device *);
 >  	int (*get_temp) (struct thermal_zone_device *, int *);
 > +	int (*set_trips) (struct thermal_zone_device *, int, int);
 >  	int (*get_mode) (struct thermal_zone_device *,
 >  			 enum thermal_device_mode *);
 >  	int (*set_mode) (struct thermal_zone_device *,
 > @@ -180,6 +181,8 @@ struct thermal_zone_device {
 >  	int last_temperature;
 >  	int emul_temperature;
 >  	int passive;
 > +	int prev_low_trip;
 > +	int prev_high_trip;
 >  	unsigned int forced_passive;
 >  	const struct thermal_zone_device_ops *ops;
 >  	const struct thermal_zone_params *tzp;
 >
Sascha Hauer May 18, 2015, 12:09 p.m. UTC | #2
Hi Mikko,

On Mon, May 18, 2015 at 12:06:50PM +0300, Mikko Perttunen wrote:
> > +	for (i = 0; i < tz->trips; i++) {
> > +		int trip_low;
> > +
> > +		tz->ops->get_trip_temp(tz, i, &trip_temp);
> > +		tz->ops->get_trip_hyst(tz, i, &hysteresis);
> > +
> > +		trip_low = trip_temp - hysteresis;
> > +
> > +		if (trip_low < temp && trip_low > low)
> > +			low = trip_low;
> > +
> > +		if (trip_temp > temp && trip_temp < high)
> > +			high = trip_temp;
> > +	}
> > +
> > +	tz->prev_low_trip = low;
> > +	tz->prev_high_trip = high;
> > +
> > +	dev_dbg(&tz->device, "new temperature boundaries: %d < x < %d\n",
> > +			low, high);
> > +
> > +	tz->ops->set_trips(tz, low, high);
> 
> This should probably do something if set_trips returns an error
> code; at least an error message, perhaps enable polling? I'm not
> exactly sure what safety features the thermal framework has in
> general if errors happen..

Currently a thermal zone has the passive_delay and polling_delay
variables. If these are nonzero the thermal core will always poll. A
purely interrupt driven thermal zone would set these values to zero.
In this case the thermal core has no basis for polling, so we would
have to make up polling intervals when set_trips fails. Another
possibility would be to interpret the *_delay variables as 'when
set_trips is available, do not poll. When something goes wrong, use
*_delay as polling intervals'

> 
> One interesting thing I noticed was that at least the bang-bang
> governor only acts if the temperature is properly smaller than (trip
> temp - hysteresis). So perhaps we should specify the non-tripping
> range as [low, high)? Or we could change bang-bang.

I wonder how we can protect against such off-by-one errors anyway.
Generally a hardware might operate on raw values rather than directly
in temperature values in °C. This means a driver for this must have
celsius_to_raw and raw_to_celsius conversion functions. Now it can
happen that due to rounding errors celsius_to_raw(Tcrit) returns a raw
value that when converted back to celsius is different from the
original value in °C. This would mean the hardware triggers an interrupt
for a trip point and the thermal core does not react because get_temp
actually returns a different temperature than previously programmed as
interrupt trigger. This way we would lose hot (or cold) events.

> 
> > +}
> > +
> >  void thermal_zone_device_update(struct thermal_zone_device *tz)
> >  {
> >  	int temp, ret, count;
> > @@ -479,6 +518,8 @@ void thermal_zone_device_update(struct
> thermal_zone_device *tz)
> >  	dev_dbg(&tz->device, "last_temperature=%d, current_temperature=%d\n",
> >  				tz->last_temperature, tz->temperature);
> >
> > +	thermal_zone_set_trips(tz);
> > +
> >  	for (count = 0; count < tz->trips; count++)
> >  		handle_thermal_trip(tz, count);
> >  }
> 
> set_trips should also be called from temp_store and other places
> that modify values that affect the trip points.

Good point.

Sascha
Brian Norris May 18, 2015, 6:44 p.m. UTC | #3
On Mon, May 18, 2015 at 02:09:44PM +0200, Sascha Hauer wrote:
> On Mon, May 18, 2015 at 12:06:50PM +0300, Mikko Perttunen wrote:
> > One interesting thing I noticed was that at least the bang-bang
> > governor only acts if the temperature is properly smaller than (trip
> > temp - hysteresis). So perhaps we should specify the non-tripping
> > range as [low, high)? Or we could change bang-bang.
> 
> I wonder how we can protect against such off-by-one errors anyway.
> Generally a hardware might operate on raw values rather than directly
> in temperature values in °C. This means a driver for this must have
> celsius_to_raw and raw_to_celsius conversion functions. Now it can
> happen that due to rounding errors celsius_to_raw(Tcrit) returns a raw
> value that when converted back to celsius is different from the
> original value in °C. This would mean the hardware triggers an interrupt
> for a trip point and the thermal core does not react because get_temp
> actually returns a different temperature than previously programmed as
> interrupt trigger. This way we would lose hot (or cold) events.

This also highlights another fact: there's a race between interrupt
generation and temperature reading (->get_temp()). I would expect any
hardware interrupt thermal sensor would also have a latched temperature
reading to correspond with it, and there would be no guarantee that this
latched temperature will match the polled reading seen once you reach
thermal_zone_device_update(). So a hardware driver might report a
thermal update, but the temperature reported to the core won't
necessarily match what interrupt was meant for.

I have a patch that adds a thermal_zone_device_update_temp() API, so
drivers can report the temperature along with the interrupt
notification. (Such a patch also helps so that the driver can choose to
round down on cold events and up on hot events, resolving your rounding
issue too.)

Brian
Mikko Perttunen May 18, 2015, 7:13 p.m. UTC | #4
On 05/18/2015 09:44 PM, Brian Norris wrote:
> On Mon, May 18, 2015 at 02:09:44PM +0200, Sascha Hauer wrote:
>> On Mon, May 18, 2015 at 12:06:50PM +0300, Mikko Perttunen wrote:
>>> One interesting thing I noticed was that at least the bang-bang
>>> governor only acts if the temperature is properly smaller than (trip
>>> temp - hysteresis). So perhaps we should specify the non-tripping
>>> range as [low, high)? Or we could change bang-bang.
>>
>> I wonder how we can protect against such off-by-one errors anyway.
>> Generally a hardware might operate on raw values rather than directly
>> in temperature values in °C. This means a driver for this must have
>> celsius_to_raw and raw_to_celsius conversion functions. Now it can
>> happen that due to rounding errors celsius_to_raw(Tcrit) returns a raw
>> value that when converted back to celsius is different from the
>> original value in °C. This would mean the hardware triggers an interrupt
>> for a trip point and the thermal core does not react because get_temp
>> actually returns a different temperature than previously programmed as
>> interrupt trigger. This way we would lose hot (or cold) events.
>
> This also highlights another fact: there's a race between interrupt
> generation and temperature reading (->get_temp()). I would expect any
> hardware interrupt thermal sensor would also have a latched temperature
> reading to correspond with it, and there would be no guarantee that this
> latched temperature will match the polled reading seen once you reach
> thermal_zone_device_update(). So a hardware driver might report a
> thermal update, but the temperature reported to the core won't
> necessarily match what interrupt was meant for.

Does this actually matter? The thermal core will reset trips and apply 
cooling using the new - most recent - value. Using bang bang as example, 
if the temperature has risen since the interrupt fired, the cooling 
device will correctly not be switched off. If the temperature has 
fallen, it will again be correctly switched off. The only issue is then 
if the temperature is exactly 'trip temp - trip hyst' which will cause 
set_trips to load the trip points below, but not cause bang bang to turn 
off the cooling device, and the next chance it will have will only be at 
the next below trip point. Well, this is still safe (at least until you 
replace "cooling device" with "heating device"), so maybe it isn't that 
big of an issue.

Please point out if there's a problem with my line of reasoning.

FWIW - at least Tegra doesn't have a latched register like this. There's 
just a bit indicating that an interrupt was raised and a temperature 
register that updates according to the sensor's input clock.

>
> I have a patch that adds a thermal_zone_device_update_temp() API, so
> drivers can report the temperature along with the interrupt
> notification. (Such a patch also helps so that the driver can choose to
> round down on cold events and up on hot events, resolving your rounding
> issue too.)
>
> Brian
>

Cheers,
Mikko
Brian Norris May 18, 2015, 8:28 p.m. UTC | #5
On Mon, May 18, 2015 at 10:13:46PM +0300, Mikko Perttunen wrote:
> On 05/18/2015 09:44 PM, Brian Norris wrote:
> >On Mon, May 18, 2015 at 02:09:44PM +0200, Sascha Hauer wrote:
> >>On Mon, May 18, 2015 at 12:06:50PM +0300, Mikko Perttunen wrote:
> >>>One interesting thing I noticed was that at least the bang-bang
> >>>governor only acts if the temperature is properly smaller than (trip
> >>>temp - hysteresis). So perhaps we should specify the non-tripping
> >>>range as [low, high)? Or we could change bang-bang.
> >>
> >>I wonder how we can protect against such off-by-one errors anyway.
> >>Generally a hardware might operate on raw values rather than directly
> >>in temperature values in °C. This means a driver for this must have
> >>celsius_to_raw and raw_to_celsius conversion functions. Now it can
> >>happen that due to rounding errors celsius_to_raw(Tcrit) returns a raw
> >>value that when converted back to celsius is different from the
> >>original value in °C. This would mean the hardware triggers an interrupt
> >>for a trip point and the thermal core does not react because get_temp
> >>actually returns a different temperature than previously programmed as
> >>interrupt trigger. This way we would lose hot (or cold) events.
> >
> >This also highlights another fact: there's a race between interrupt
> >generation and temperature reading (->get_temp()). I would expect any
> >hardware interrupt thermal sensor would also have a latched temperature
> >reading to correspond with it, and there would be no guarantee that this
> >latched temperature will match the polled reading seen once you reach
> >thermal_zone_device_update(). So a hardware driver might report a
> >thermal update, but the temperature reported to the core won't
> >necessarily match what interrupt was meant for.
> 
> Does this actually matter? The thermal core will reset trips and
> apply cooling using the new - most recent - value. Using bang bang
> as example, if the temperature has risen since the interrupt fired,
> the cooling device will correctly not be switched off. If the
> temperature has fallen, it will again be correctly switched off. The
> only issue is then if the temperature is exactly 'trip temp - trip
> hyst' which will cause set_trips to load the trip points below, but
> not cause bang bang to turn off the cooling device, and the next
> chance it will have will only be at the next below trip point. Well,
> this is still safe (at least until you replace "cooling device" with
> "heating device"), so maybe it isn't that big of an issue.
> 
> Please point out if there's a problem with my line of reasoning.

I'm not sure I followed exactly the reason for the low-temp/hyst corner
case, but otherwise I guess that makes sense. The only problem IMO, is
that you're encouraging the generation of spurious notifications; if the
temperature is constantly changing right around 'trip temp', but it
never settles above 'trip temp' long enough for the core to re-capture
the high temperature scenario, you'll just keep making useless calls to
thermal_zone_device_update(). This kind of defeats the purpose of the
hysteresis, right?

I'd really rather have a high temperature interrupt generate exactly one
notification to the core framework, and that the sensor driver can rely
on that one interrupt being handled as a high temperature situation,
allowing it to disable the high-temp interrupt.

One of my biggest problems with the thermal subsystem so far is that
thermal_zone_device_update() doesn't actually seem to have any specific
semantic meaning. It just means that there was some reason to update
something. So then, you have to reason about a particular thermal
governor (bang bang) in order to make something sensible. If I want to
use a different sort of user-space governor, then I have to reevaluate
all these same assumptions, and it seems like I end up with a sub-par
solution.

As a side note: I have patches to extend some of the uevent information
passed by the user-space governor too, to accomplish what I'm suggesting
above. Perhaps that would be a better way to discuss what I'm thinking.

> FWIW - at least Tegra doesn't have a latched register like this.
> There's just a bit indicating that an interrupt was raised and a
> temperature register that updates according to the sensor's input
> clock.

A sensor for Broadcom's BCM7xxx has a latched register. If I get the
time, I'll post my driver soon.

Brian
Mikko Perttunen May 19, 2015, 12:43 p.m. UTC | #6
On 05/18/15 23:28, Brian Norris wrote:
> On Mon, May 18, 2015 at 10:13:46PM +0300, Mikko Perttunen wrote:
>> On 05/18/2015 09:44 PM, Brian Norris wrote:
>>> On Mon, May 18, 2015 at 02:09:44PM +0200, Sascha Hauer wrote:
>>>> On Mon, May 18, 2015 at 12:06:50PM +0300, Mikko Perttunen wrote:
>>>>> One interesting thing I noticed was that at least the bang-bang
>>>>> governor only acts if the temperature is properly smaller than (trip
>>>>> temp - hysteresis). So perhaps we should specify the non-tripping
>>>>> range as [low, high)? Or we could change bang-bang.
>>>>
>>>> I wonder how we can protect against such off-by-one errors anyway.
>>>> Generally a hardware might operate on raw values rather than directly
>>>> in temperature values in °C. This means a driver for this must have
>>>> celsius_to_raw and raw_to_celsius conversion functions. Now it can
>>>> happen that due to rounding errors celsius_to_raw(Tcrit) returns a raw
>>>> value that when converted back to celsius is different from the
>>>> original value in °C. This would mean the hardware triggers an interrupt
>>>> for a trip point and the thermal core does not react because get_temp
>>>> actually returns a different temperature than previously programmed as
>>>> interrupt trigger. This way we would lose hot (or cold) events.
>>>
>>> This also highlights another fact: there's a race between interrupt
>>> generation and temperature reading (->get_temp()). I would expect any
>>> hardware interrupt thermal sensor would also have a latched temperature
>>> reading to correspond with it, and there would be no guarantee that this
>>> latched temperature will match the polled reading seen once you reach
>>> thermal_zone_device_update(). So a hardware driver might report a
>>> thermal update, but the temperature reported to the core won't
>>> necessarily match what interrupt was meant for.
>>
>> Does this actually matter? The thermal core will reset trips and
>> apply cooling using the new - most recent - value. Using bang bang
>> as example, if the temperature has risen since the interrupt fired,
>> the cooling device will correctly not be switched off. If the
>> temperature has fallen, it will again be correctly switched off. The
>> only issue is then if the temperature is exactly 'trip temp - trip
>> hyst' which will cause set_trips to load the trip points below, but
>> not cause bang bang to turn off the cooling device, and the next
>> chance it will have will only be at the next below trip point. Well,
>> this is still safe (at least until you replace "cooling device" with
>> "heating device"), so maybe it isn't that big of an issue.
>>
>> Please point out if there's a problem with my line of reasoning.
>
> I'm not sure I followed exactly the reason for the low-temp/hyst corner
> case, but otherwise I guess that makes sense. The only problem IMO, is
> that you're encouraging the generation of spurious notifications; if the
> temperature is constantly changing right around 'trip temp', but it
> never settles above 'trip temp' long enough for the core to re-capture
> the high temperature scenario, you'll just keep making useless calls to
> thermal_zone_device_update(). This kind of defeats the purpose of the
> hysteresis, right?

The corner case with bang bang is as follows:
- Say we have trip points as 50C and 80C, both with 5C hysteresis, and 
these are programmed into hardware. So the actual hardware trip points 
are 45C and 80C.
- Currently the temperature is, say, 60C and the fan is turned on.
- Temperature drops to 45C, the lower trip point is triggered.
- 45C >= 50C - 5C, so the fan is not turned off.

If we said that the hysteresis was 0C, then bang bang is certainly 
correct in that if the trip point was at 50C, it shouldn't turn the fan 
off, since that is greater than or equal to the requested temperature 
for cooling.

The function you describe would certainly be useful for eliminating 
possible superfluous interrupts due to temperature wobble, though I'm 
not sure how much of a problem that even would be.

>
> I'd really rather have a high temperature interrupt generate exactly one
> notification to the core framework, and that the sensor driver can rely
> on that one interrupt being handled as a high temperature situation,
> allowing it to disable the high-temp interrupt.
>
> One of my biggest problems with the thermal subsystem so far is that
> thermal_zone_device_update() doesn't actually seem to have any specific
> semantic meaning. It just means that there was some reason to update
> something. So then, you have to reason about a particular thermal
> governor (bang bang) in order to make something sensible. If I want to
> use a different sort of user-space governor, then I have to reevaluate
> all these same assumptions, and it seems like I end up with a sub-par
> solution.

Yeah, though I'm not sure if you can ever be sure that the governor is 
fine not getting regular temperature updates, so I imagine you might 
always end up needing to pick your governors with that in mind. In 
practice, this might not be so horrible.

>
> As a side note: I have patches to extend some of the uevent information
> passed by the user-space governor too, to accomplish what I'm suggesting
> above. Perhaps that would be a better way to discuss what I'm thinking.
>
>> FWIW - at least Tegra doesn't have a latched register like this.
>> There's just a bit indicating that an interrupt was raised and a
>> temperature register that updates according to the sensor's input
>> clock.
>
> A sensor for Broadcom's BCM7xxx has a latched register. If I get the
> time, I'll post my driver soon.
>
> Brian
>
Sascha Hauer May 19, 2015, 1:58 p.m. UTC | #7
On Mon, May 18, 2015 at 02:09:44PM +0200, Sascha Hauer wrote:
> Hi Mikko,
> 
> On Mon, May 18, 2015 at 12:06:50PM +0300, Mikko Perttunen wrote:
> > > +	for (i = 0; i < tz->trips; i++) {
> > > +		int trip_low;
> > > +
> > > +		tz->ops->get_trip_temp(tz, i, &trip_temp);
> > > +		tz->ops->get_trip_hyst(tz, i, &hysteresis);
> > > +
> > > +		trip_low = trip_temp - hysteresis;
> > > +
> > > +		if (trip_low < temp && trip_low > low)
> > > +			low = trip_low;
> > > +
> > > +		if (trip_temp > temp && trip_temp < high)
> > > +			high = trip_temp;
> > > +	}
> > > +
> > > +	tz->prev_low_trip = low;
> > > +	tz->prev_high_trip = high;
> > > +
> > > +	dev_dbg(&tz->device, "new temperature boundaries: %d < x < %d\n",
> > > +			low, high);
> > > +
> > > +	tz->ops->set_trips(tz, low, high);
> > 
> > This should probably do something if set_trips returns an error
> > code; at least an error message, perhaps enable polling? I'm not
> > exactly sure what safety features the thermal framework has in
> > general if errors happen..
> 
> Currently a thermal zone has the passive_delay and polling_delay
> variables. If these are nonzero the thermal core will always poll. A
> purely interrupt driven thermal zone would set these values to zero.
> In this case the thermal core has no basis for polling, so we would
> have to make up polling intervals when set_trips fails. Another
> possibility would be to interpret the *_delay variables as 'when
> set_trips is available, do not poll. When something goes wrong, use
> *_delay as polling intervals'
> 
> > 
> > One interesting thing I noticed was that at least the bang-bang
> > governor only acts if the temperature is properly smaller than (trip
> > temp - hysteresis). So perhaps we should specify the non-tripping
> > range as [low, high)? Or we could change bang-bang.
> 
> I wonder how we can protect against such off-by-one errors anyway.
> Generally a hardware might operate on raw values rather than directly
> in temperature values in °C. This means a driver for this must have
> celsius_to_raw and raw_to_celsius conversion functions. Now it can
> happen that due to rounding errors celsius_to_raw(Tcrit) returns a raw
> value that when converted back to celsius is different from the
> original value in °C. This would mean the hardware triggers an interrupt
> for a trip point and the thermal core does not react because get_temp
> actually returns a different temperature than previously programmed as
> interrupt trigger. This way we would lose hot (or cold) events.

As a simple example we could imagine a 12bit adc which has:

u32 mcelsius_to_raw(int temp)
{
	return temp / 30;
}

int raw_to_mcelsius(u32 raw)
{
	return temp * 30;
}

Now if the thermal framework requests an interrupt at 77000mC we
would program a raw value of 77000 / 30 = 2566.666667, due to integer
rounding we would program 2566. Now when the interrupt is triggered with
this exact raw value we would convert it back to 2566 * 30 = 76980. The
thermal framework would realize that this is below the threshold, do
nothing and go back to sleep.
I am beginning to think that implementing interrupts like this is not a
good idea, at least I found no convenient way out of this situation.

Sascha
Mikko Perttunen May 19, 2015, 2:05 p.m. UTC | #8
On 05/19/15 16:58, Sascha Hauer wrote:
> On Mon, May 18, 2015 at 02:09:44PM +0200, Sascha Hauer wrote:
>> Hi Mikko,
>>
>> On Mon, May 18, 2015 at 12:06:50PM +0300, Mikko Perttunen wrote:
>>>> +	for (i = 0; i < tz->trips; i++) {
>>>> +		int trip_low;
>>>> +
>>>> +		tz->ops->get_trip_temp(tz, i, &trip_temp);
>>>> +		tz->ops->get_trip_hyst(tz, i, &hysteresis);
>>>> +
>>>> +		trip_low = trip_temp - hysteresis;
>>>> +
>>>> +		if (trip_low < temp && trip_low > low)
>>>> +			low = trip_low;
>>>> +
>>>> +		if (trip_temp > temp && trip_temp < high)
>>>> +			high = trip_temp;
>>>> +	}
>>>> +
>>>> +	tz->prev_low_trip = low;
>>>> +	tz->prev_high_trip = high;
>>>> +
>>>> +	dev_dbg(&tz->device, "new temperature boundaries: %d < x < %d\n",
>>>> +			low, high);
>>>> +
>>>> +	tz->ops->set_trips(tz, low, high);
>>>
>>> This should probably do something if set_trips returns an error
>>> code; at least an error message, perhaps enable polling? I'm not
>>> exactly sure what safety features the thermal framework has in
>>> general if errors happen..
>>
>> Currently a thermal zone has the passive_delay and polling_delay
>> variables. If these are nonzero the thermal core will always poll. A
>> purely interrupt driven thermal zone would set these values to zero.
>> In this case the thermal core has no basis for polling, so we would
>> have to make up polling intervals when set_trips fails. Another
>> possibility would be to interpret the *_delay variables as 'when
>> set_trips is available, do not poll. When something goes wrong, use
>> *_delay as polling intervals'
>>
>>>
>>> One interesting thing I noticed was that at least the bang-bang
>>> governor only acts if the temperature is properly smaller than (trip
>>> temp - hysteresis). So perhaps we should specify the non-tripping
>>> range as [low, high)? Or we could change bang-bang.
>>
>> I wonder how we can protect against such off-by-one errors anyway.
>> Generally a hardware might operate on raw values rather than directly
>> in temperature values in °C. This means a driver for this must have
>> celsius_to_raw and raw_to_celsius conversion functions. Now it can
>> happen that due to rounding errors celsius_to_raw(Tcrit) returns a raw
>> value that when converted back to celsius is different from the
>> original value in °C. This would mean the hardware triggers an interrupt
>> for a trip point and the thermal core does not react because get_temp
>> actually returns a different temperature than previously programmed as
>> interrupt trigger. This way we would lose hot (or cold) events.
>
> As a simple example we could imagine a 12bit adc which has:
>
> u32 mcelsius_to_raw(int temp)
> {
> 	return temp / 30;
> }
>
> int raw_to_mcelsius(u32 raw)
> {
> 	return temp * 30;
> }
>
> Now if the thermal framework requests an interrupt at 77000mC we
> would program a raw value of 77000 / 30 = 2566.666667, due to integer
> rounding we would program 2566. Now when the interrupt is triggered with
> this exact raw value we would convert it back to 2566 * 30 = 76980. The
> thermal framework would realize that this is below the threshold, do
> nothing and go back to sleep.
> I am beginning to think that implementing interrupts like this is not a
> good idea, at least I found no convenient way out of this situation.

Couldn't you just specify that the driver should do the best it can? 
That is, in this case, the driver would program the hardware for the 
least possible value x for which raw_to_mcelsius(x) >= 77000.

>
> Sascha
>

Cheers,
Mikko.
Sascha Hauer May 19, 2015, 2:05 p.m. UTC | #9
On Mon, May 18, 2015 at 11:44:33AM -0700, Brian Norris wrote:
> On Mon, May 18, 2015 at 02:09:44PM +0200, Sascha Hauer wrote:
> > On Mon, May 18, 2015 at 12:06:50PM +0300, Mikko Perttunen wrote:
> > > One interesting thing I noticed was that at least the bang-bang
> > > governor only acts if the temperature is properly smaller than (trip
> > > temp - hysteresis). So perhaps we should specify the non-tripping
> > > range as [low, high)? Or we could change bang-bang.
> > 
> > I wonder how we can protect against such off-by-one errors anyway.
> > Generally a hardware might operate on raw values rather than directly
> > in temperature values in °C. This means a driver for this must have
> > celsius_to_raw and raw_to_celsius conversion functions. Now it can
> > happen that due to rounding errors celsius_to_raw(Tcrit) returns a raw
> > value that when converted back to celsius is different from the
> > original value in °C. This would mean the hardware triggers an interrupt
> > for a trip point and the thermal core does not react because get_temp
> > actually returns a different temperature than previously programmed as
> > interrupt trigger. This way we would lose hot (or cold) events.
> 
> This also highlights another fact: there's a race between interrupt
> generation and temperature reading (->get_temp()). I would expect any
> hardware interrupt thermal sensor would also have a latched temperature
> reading to correspond with it, and there would be no guarantee that this
> latched temperature will match the polled reading seen once you reach
> thermal_zone_device_update(). So a hardware driver might report a
> thermal update, but the temperature reported to the core won't
> necessarily match what interrupt was meant for.
> 
> I have a patch that adds a thermal_zone_device_update_temp() API, so
> drivers can report the temperature along with the interrupt
> notification. (Such a patch also helps so that the driver can choose to
> round down on cold events and up on hot events, resolving your rounding
> issue too.)

Could you send me that patch? Thinking about it this might indeed work.
The only thing that a driver needs to make sure then is that it actually
at least one time reports a temperature beyond the currently programmed
thresholds. With the patch you describe a driver could simply do that by
ignoring the current ADC values and simply reporting the previously
desired values.

Sascha
Sascha Hauer May 20, 2015, 1:21 p.m. UTC | #10
On Tue, May 19, 2015 at 05:05:29PM +0300, Mikko Perttunen wrote:
> On 05/19/15 16:58, Sascha Hauer wrote:
> >On Mon, May 18, 2015 at 02:09:44PM +0200, Sascha Hauer wrote:
> >>Hi Mikko,
> >>
> >>On Mon, May 18, 2015 at 12:06:50PM +0300, Mikko Perttunen wrote:
> >>>>+	for (i = 0; i < tz->trips; i++) {
> >>>>+		int trip_low;
> >>>>+
> >>>>+		tz->ops->get_trip_temp(tz, i, &trip_temp);
> >>>>+		tz->ops->get_trip_hyst(tz, i, &hysteresis);
> >>>>+
> >>>>+		trip_low = trip_temp - hysteresis;
> >>>>+
> >>>>+		if (trip_low < temp && trip_low > low)
> >>>>+			low = trip_low;
> >>>>+
> >>>>+		if (trip_temp > temp && trip_temp < high)
> >>>>+			high = trip_temp;
> >>>>+	}
> >>>>+
> >>>>+	tz->prev_low_trip = low;
> >>>>+	tz->prev_high_trip = high;
> >>>>+
> >>>>+	dev_dbg(&tz->device, "new temperature boundaries: %d < x < %d\n",
> >>>>+			low, high);
> >>>>+
> >>>>+	tz->ops->set_trips(tz, low, high);
> >>>
> >>>This should probably do something if set_trips returns an error
> >>>code; at least an error message, perhaps enable polling? I'm not
> >>>exactly sure what safety features the thermal framework has in
> >>>general if errors happen..
> >>
> >>Currently a thermal zone has the passive_delay and polling_delay
> >>variables. If these are nonzero the thermal core will always poll. A
> >>purely interrupt driven thermal zone would set these values to zero.
> >>In this case the thermal core has no basis for polling, so we would
> >>have to make up polling intervals when set_trips fails. Another
> >>possibility would be to interpret the *_delay variables as 'when
> >>set_trips is available, do not poll. When something goes wrong, use
> >>*_delay as polling intervals'
> >>
> >>>
> >>>One interesting thing I noticed was that at least the bang-bang
> >>>governor only acts if the temperature is properly smaller than (trip
> >>>temp - hysteresis). So perhaps we should specify the non-tripping
> >>>range as [low, high)? Or we could change bang-bang.
> >>
> >>I wonder how we can protect against such off-by-one errors anyway.
> >>Generally a hardware might operate on raw values rather than directly
> >>in temperature values in °C. This means a driver for this must have
> >>celsius_to_raw and raw_to_celsius conversion functions. Now it can
> >>happen that due to rounding errors celsius_to_raw(Tcrit) returns a raw
> >>value that when converted back to celsius is different from the
> >>original value in °C. This would mean the hardware triggers an interrupt
> >>for a trip point and the thermal core does not react because get_temp
> >>actually returns a different temperature than previously programmed as
> >>interrupt trigger. This way we would lose hot (or cold) events.
> >
> >As a simple example we could imagine a 12bit adc which has:
> >
> >u32 mcelsius_to_raw(int temp)
> >{
> >	return temp / 30;
> >}
> >
> >int raw_to_mcelsius(u32 raw)
> >{
> >	return temp * 30;
> >}
> >
> >Now if the thermal framework requests an interrupt at 77000mC we
> >would program a raw value of 77000 / 30 = 2566.666667, due to integer
> >rounding we would program 2566. Now when the interrupt is triggered with
> >this exact raw value we would convert it back to 2566 * 30 = 76980. The
> >thermal framework would realize that this is below the threshold, do
> >nothing and go back to sleep.
> >I am beginning to think that implementing interrupts like this is not a
> >good idea, at least I found no convenient way out of this situation.
> 
> Couldn't you just specify that the driver should do the best it can?
> That is, in this case, the driver would program the hardware for the
> least possible value x for which raw_to_mcelsius(x) >= 77000.

That's what I did now.

Sascha
diff mbox

Patch

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 6bbf61f..3ae1795 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -453,6 +453,45 @@  int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
 }
 EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
 
+static void thermal_zone_set_trips(struct thermal_zone_device *tz)
+{
+	int low = -INT_MAX;
+	int high = INT_MAX;
+	int trip_temp, hysteresis;
+	int temp = tz->temperature;
+	int i;
+
+	if (!tz->ops->set_trips)
+		return;
+
+	/* No need to change trip points */
+	if (temp > tz->prev_low_trip && temp < tz->prev_high_trip)
+		return;
+
+	for (i = 0; i < tz->trips; i++) {
+		int trip_low;
+
+		tz->ops->get_trip_temp(tz, i, &trip_temp);
+		tz->ops->get_trip_hyst(tz, i, &hysteresis);
+
+		trip_low = trip_temp - hysteresis;
+
+		if (trip_low < temp && trip_low > low)
+			low = trip_low;
+
+		if (trip_temp > temp && trip_temp < high)
+			high = trip_temp;
+	}
+
+	tz->prev_low_trip = low;
+	tz->prev_high_trip = high;
+
+	dev_dbg(&tz->device, "new temperature boundaries: %d < x < %d\n",
+			low, high);
+
+	tz->ops->set_trips(tz, low, high);
+}
+
 void thermal_zone_device_update(struct thermal_zone_device *tz)
 {
 	int temp, ret, count;
@@ -479,6 +518,8 @@  void thermal_zone_device_update(struct thermal_zone_device *tz)
 	dev_dbg(&tz->device, "last_temperature=%d, current_temperature=%d\n",
 				tz->last_temperature, tz->temperature);
 
+	thermal_zone_set_trips(tz);
+
 	for (count = 0; count < tz->trips; count++)
 		handle_thermal_trip(tz, count);
 }
@@ -1494,6 +1535,8 @@  struct thermal_zone_device *thermal_zone_device_register(const char *type,
 	tz->trips = trips;
 	tz->passive_delay = passive_delay;
 	tz->polling_delay = polling_delay;
+	tz->prev_low_trip = INT_MAX;
+	tz->prev_high_trip = -INT_MAX;
 
 	dev_set_name(&tz->device, "thermal_zone%d", tz->id);
 	result = device_register(&tz->device);
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 07bd5e8..aef6e13 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -87,6 +87,7 @@  struct thermal_zone_device_ops {
 	int (*unbind) (struct thermal_zone_device *,
 		       struct thermal_cooling_device *);
 	int (*get_temp) (struct thermal_zone_device *, int *);
+	int (*set_trips) (struct thermal_zone_device *, int, int);
 	int (*get_mode) (struct thermal_zone_device *,
 			 enum thermal_device_mode *);
 	int (*set_mode) (struct thermal_zone_device *,
@@ -180,6 +181,8 @@  struct thermal_zone_device {
 	int last_temperature;
 	int emul_temperature;
 	int passive;
+	int prev_low_trip;
+	int prev_high_trip;
 	unsigned int forced_passive;
 	const struct thermal_zone_device_ops *ops;
 	const struct thermal_zone_params *tzp;