diff mbox series

[v3,1/1] arm64: dts: sdm845: wireup the thermal trip points to cpufreq

Message ID 6a21a9ee7663e1b32d8ea81ac5e51d187aed25fb.1548093127.git.amit.kucheria@linaro.org (mailing list archive)
State New, archived
Headers show
Series Thermal throttling for SDM845 | expand

Commit Message

Amit Kucheria Jan. 21, 2019, 6:08 p.m. UTC
Since all cpus in the big and little clusters, respectively, are in the
same frequency domain, use all of them for mitigation in the
cooling-map. We end up with two cooling devices - one each for the big
and little clusters.

We throttle lightly at the first trip point, just removing the boost
frequency. At the next trip point we allow ourselves to be throttled to
any extent.

Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
---
 arch/arm64/boot/dts/qcom/sdm845.dtsi | 225 +++++++++++++++++++++++++--
 1 file changed, 209 insertions(+), 16 deletions(-)

Comments

Matthias Kaehlcke Jan. 23, 2019, 2:12 a.m. UTC | #1
Hi Amit,

On Mon, Jan 21, 2019 at 11:38:34PM +0530, Amit Kucheria wrote:
> Since all cpus in the big and little clusters, respectively, are in the
> same frequency domain, use all of them for mitigation in the
> cooling-map. We end up with two cooling devices - one each for the big
> and little clusters.
> 
> We throttle lightly at the first trip point, just removing the boost
> frequency. At the next trip point we allow ourselves to be throttled to
> any extent.
> 
> Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
> ---
>  arch/arm64/boot/dts/qcom/sdm845.dtsi | 225 +++++++++++++++++++++++++--
>  1 file changed, 209 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi
> index c27cbd3bcb0a..878f661d16eb 100644
> --- a/arch/arm64/boot/dts/qcom/sdm845.dtsi
> +++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi
> @@ -13,6 +13,7 @@
>  #include <dt-bindings/reset/qcom,sdm845-aoss.h>
>  #include <dt-bindings/soc/qcom,rpmh-rsc.h>
>  #include <dt-bindings/clock/qcom,gcc-sdm845.h>
> +#include <dt-bindings/thermal/thermal.h>
>  
>  / {
>  	interrupt-parent = <&intc>;
> @@ -99,6 +100,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x0>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_0>;
>  			L2_0: l2-cache {
>  				compatible = "cache";
> @@ -114,6 +116,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x100>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_100>;
>  			L2_100: l2-cache {
>  				compatible = "cache";
> @@ -126,6 +129,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x200>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_200>;
>  			L2_200: l2-cache {
>  				compatible = "cache";
> @@ -138,6 +142,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x300>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_300>;
>  			L2_300: l2-cache {
>  				compatible = "cache";
> @@ -150,6 +155,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x400>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_400>;
>  			L2_400: l2-cache {
>  				compatible = "cache";
> @@ -162,6 +168,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x500>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_500>;
>  			L2_500: l2-cache {
>  				compatible = "cache";
> @@ -174,6 +181,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x600>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_600>;
>  			L2_600: l2-cache {
>  				compatible = "cache";
> @@ -186,6 +194,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x700>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_700>;
>  			L2_700: l2-cache {
>  				compatible = "cache";
> @@ -1691,18 +1700,41 @@
>  			thermal-sensors = <&tsens0 1>;
>  
>  			trips {
> -				cpu_alert0: trip0 {
> +				cpu0_alert1: trip-point@0 {
>  					temperature = <75000>;

In my observations a 'switch on/threshold' temperature of 75 degrees
leads to aggressive throttling with IPA when the temperature is above
this threshold:

[  716.760804] cpu_cooling_ratelimit: 31 callbacks suppressed
[  716.760836] cpu cpu4: Cooling state set to 10. New max freq = 1920000
[  716.773390] power_allocator_ratelimit: 15 callbacks suppressed
[  716.773405] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=73500, curr_temp=75200 total_requested_power=39025 total_granted_power=18654
[  749.609336] cpu_cooling_ratelimit: 45 callbacks suppressed
[  749.609371] cpu cpu4: Cooling state set to 11. New max freq = 1843200
[  749.624300] power_allocator_ratelimit: 24 callbacks suppressed
[  749.624323] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=70800, curr_temp=77200 total_requested_power=40136 total_granted_power=17402
[  780.152633] cpu_cooling_ratelimit: 41 callbacks suppressed
[  780.152666] cpu cpu4: Cooling state set to 11. New max freq = 1843200
[  780.165247] power_allocator_ratelimit: 21 callbacks suppressed
[  780.165261] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=64800, curr_temp=76900 total_requested_power=39719 total_granted_power=1759

(the logs come from a local patch in our tree:
https://chromium.googlesource.com/chromiumos/third_party/kernel/+/ec1c501a8093fed44a6697a5913ef2765f518e1f)

At this point I don't have a clear idea what would be a reasonable
value for the 'switch on/threshold' temperature, but probably it
should to be higher than 75 degrees, at least with IPA. If there is
no reasonable common configuration for different thermal governors I
guess we'll have to target a commonly used governor and systems
using other 'incompatible' governors need to override the config in
their <board>.dtsi.

I should also say that the system I'm testing on isn't a
representative environment (if such a thing exists at all...). It
isn't running an upstream kernel (it's a recent version though,
4.19). We try to stay as close to upstream as possible, however our
tree includes EAS related patches that affect thermal throttling which
haven't landed upstream yet. Also we currently use a guesstimated
value for 'dynamic-power-coefficient', which impacts IPA. And our
device doesn't have it's final thermal envelope yet, possible future
hardware changes (e.g. heatsink) may alter the behavior.

>  					hysteresis = <2000>;
>  					type = "passive";
>  				};
>  
> -				cpu_crit0: trip1 {
> +				cpu0_alert0: trip-point@1 {

The labels of the two trip points (cpu0_alert0 and cpu0_alert1) are
inverted.

> +					temperature = <95000>;
> +					hysteresis = <2000>;
> +					type = "passive";
> +				};
> +
> +				cpu0_crit: cpu_crit {
>  					temperature = <110000>;
>  					hysteresis = <1000>;
>  					type = "critical";
>  				};
>  			};
> +
> +			cooling-maps {
> +				map0 {
> +					trip = <&cpu0_alert0>;
> +					cooling-device = <&CPU0 THERMAL_NO_LIMIT 1>,
> +							 <&CPU1 THERMAL_NO_LIMIT 1>,
> +							 <&CPU2 THERMAL_NO_LIMIT 1>,
> +							 <&CPU3 THERMAL_NO_LIMIT 1>;
> +				};

With IPA this doesn't really limit throttling to the boost
frequency. Not sure if it has a negative impact, some other platforms
with a thermal configuration that targets IPA only have a cooling map
entry for the 'desired/target' temperature.

Cheers

Matthias
Matthias Kaehlcke Jan. 24, 2019, 11:35 p.m. UTC | #2
On Tue, Jan 22, 2019 at 06:12:51PM -0800, Matthias Kaehlcke wrote:
> Hi Amit,
> 
> On Mon, Jan 21, 2019 at 11:38:34PM +0530, Amit Kucheria wrote:
> > Since all cpus in the big and little clusters, respectively, are in the
> > same frequency domain, use all of them for mitigation in the
> > cooling-map. We end up with two cooling devices - one each for the big
> > and little clusters.
> > 
> > We throttle lightly at the first trip point, just removing the boost
> > frequency. At the next trip point we allow ourselves to be throttled to
> > any extent.
> > 
> > Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
> > ---
> >  arch/arm64/boot/dts/qcom/sdm845.dtsi | 225 +++++++++++++++++++++++++--
> >  1 file changed, 209 insertions(+), 16 deletions(-)
> > 
> > diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi
> > index c27cbd3bcb0a..878f661d16eb 100644
> > --- a/arch/arm64/boot/dts/qcom/sdm845.dtsi
> > +++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi
> > @@ -13,6 +13,7 @@
> >  #include <dt-bindings/reset/qcom,sdm845-aoss.h>
> >  #include <dt-bindings/soc/qcom,rpmh-rsc.h>
> >  #include <dt-bindings/clock/qcom,gcc-sdm845.h>
> > +#include <dt-bindings/thermal/thermal.h>
> >  
> >  / {
> >  	interrupt-parent = <&intc>;
> > @@ -99,6 +100,7 @@
> >  			compatible = "qcom,kryo385";
> >  			reg = <0x0 0x0>;
> >  			enable-method = "psci";
> > +			#cooling-cells = <2>;
> >  			next-level-cache = <&L2_0>;
> >  			L2_0: l2-cache {
> >  				compatible = "cache";
> > @@ -114,6 +116,7 @@
> >  			compatible = "qcom,kryo385";
> >  			reg = <0x0 0x100>;
> >  			enable-method = "psci";
> > +			#cooling-cells = <2>;
> >  			next-level-cache = <&L2_100>;
> >  			L2_100: l2-cache {
> >  				compatible = "cache";
> > @@ -126,6 +129,7 @@
> >  			compatible = "qcom,kryo385";
> >  			reg = <0x0 0x200>;
> >  			enable-method = "psci";
> > +			#cooling-cells = <2>;
> >  			next-level-cache = <&L2_200>;
> >  			L2_200: l2-cache {
> >  				compatible = "cache";
> > @@ -138,6 +142,7 @@
> >  			compatible = "qcom,kryo385";
> >  			reg = <0x0 0x300>;
> >  			enable-method = "psci";
> > +			#cooling-cells = <2>;
> >  			next-level-cache = <&L2_300>;
> >  			L2_300: l2-cache {
> >  				compatible = "cache";
> > @@ -150,6 +155,7 @@
> >  			compatible = "qcom,kryo385";
> >  			reg = <0x0 0x400>;
> >  			enable-method = "psci";
> > +			#cooling-cells = <2>;
> >  			next-level-cache = <&L2_400>;
> >  			L2_400: l2-cache {
> >  				compatible = "cache";
> > @@ -162,6 +168,7 @@
> >  			compatible = "qcom,kryo385";
> >  			reg = <0x0 0x500>;
> >  			enable-method = "psci";
> > +			#cooling-cells = <2>;
> >  			next-level-cache = <&L2_500>;
> >  			L2_500: l2-cache {
> >  				compatible = "cache";
> > @@ -174,6 +181,7 @@
> >  			compatible = "qcom,kryo385";
> >  			reg = <0x0 0x600>;
> >  			enable-method = "psci";
> > +			#cooling-cells = <2>;
> >  			next-level-cache = <&L2_600>;
> >  			L2_600: l2-cache {
> >  				compatible = "cache";
> > @@ -186,6 +194,7 @@
> >  			compatible = "qcom,kryo385";
> >  			reg = <0x0 0x700>;
> >  			enable-method = "psci";
> > +			#cooling-cells = <2>;
> >  			next-level-cache = <&L2_700>;
> >  			L2_700: l2-cache {
> >  				compatible = "cache";
> > @@ -1691,18 +1700,41 @@
> >  			thermal-sensors = <&tsens0 1>;
> >  
> >  			trips {
> > -				cpu_alert0: trip0 {
> > +				cpu0_alert1: trip-point@0 {
> >  					temperature = <75000>;
> 
> In my observations a 'switch on/threshold' temperature of 75 degrees
> leads to aggressive throttling with IPA when the temperature is above
> this threshold:
> 
> [  716.760804] cpu_cooling_ratelimit: 31 callbacks suppressed
> [  716.760836] cpu cpu4: Cooling state set to 10. New max freq = 1920000
> [  716.773390] power_allocator_ratelimit: 15 callbacks suppressed
> [  716.773405] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=73500, curr_temp=75200 total_requested_power=39025 total_granted_power=18654
> [  749.609336] cpu_cooling_ratelimit: 45 callbacks suppressed
> [  749.609371] cpu cpu4: Cooling state set to 11. New max freq = 1843200
> [  749.624300] power_allocator_ratelimit: 24 callbacks suppressed
> [  749.624323] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=70800, curr_temp=77200 total_requested_power=40136 total_granted_power=17402
> [  780.152633] cpu_cooling_ratelimit: 41 callbacks suppressed
> [  780.152666] cpu cpu4: Cooling state set to 11. New max freq = 1843200
> [  780.165247] power_allocator_ratelimit: 21 callbacks suppressed
> [  780.165261] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=64800, curr_temp=76900 total_requested_power=39719 total_granted_power=1759
> 
> (the logs come from a local patch in our tree:
> https://chromium.googlesource.com/chromiumos/third_party/kernel/+/ec1c501a8093fed44a6697a5913ef2765f518e1f)
> 
> At this point I don't have a clear idea what would be a reasonable
> value for the 'switch on/threshold' temperature, but probably it
> should to be higher than 75 degrees, at least with IPA. If there is
> no reasonable common configuration for different thermal governors I
> guess we'll have to target a commonly used governor and systems
> using other 'incompatible' governors need to override the config in
> their <board>.dtsi.

On my system I don't see a significant delta in core temperatures for
'threshold' temperatures of 80, 85 or 90°C. However Dhrystone
performance goes up by ~8% when changing the trip point from 80 to
85°C. For a switch from 85 to 90°C I see a ~2% performance delta. For
all trip points the average core temperatures are ~80°C (silver) and
~85°C (gold). Interestingly I observed the highest average
temperatures with the trip point at 80°C (repeated measurements were
taken for different temperatures).

Supposedly LMH throttling is disabled in the firmware I used for
these tests, however data suggests that it is still active
(temperature doesn't rise beyond 95°C, even without throttling in
Linux; Dhrystone performance drops when raising the temperature beyond
95°C with a heat gun. I will do some more testing when I get my hands
on a FW that effectively disables LMH (or raises the threshold to
something like 105°C).

From the data collected so far I'd suggest a 'threshold' temperature
of 90°C or if that seems to high 85°C. Behavior might be different
with other thermal governors or without LMH throttling..

> I should also say that the system I'm testing on isn't a
> representative environment (if such a thing exists at all...). It
> isn't running an upstream kernel (it's a recent version though,
> 4.19). We try to stay as close to upstream as possible, however our
> tree includes EAS related patches that affect thermal throttling which
> haven't landed upstream yet. Also we currently use a guesstimated
> value for 'dynamic-power-coefficient', which impacts IPA. And our
> device doesn't have it's final thermal envelope yet, possible future
> hardware changes (e.g. heatsink) may alter the behavior.
> 
> >  					hysteresis = <2000>;
> >  					type = "passive";
> >  				};
> >  
> > -				cpu_crit0: trip1 {
> > +				cpu0_alert0: trip-point@1 {
> 
> The labels of the two trip points (cpu0_alert0 and cpu0_alert1) are
> inverted.
> 
> > +					temperature = <95000>;
> > +					hysteresis = <2000>;
> > +					type = "passive";
> > +				};
> > +
> > +				cpu0_crit: cpu_crit {
> >  					temperature = <110000>;
> >  					hysteresis = <1000>;
> >  					type = "critical";
> >  				};
> >  			};
> > +
> > +			cooling-maps {
> > +				map0 {
> > +					trip = <&cpu0_alert0>;
> > +					cooling-device = <&CPU0 THERMAL_NO_LIMIT 1>,
> > +							 <&CPU1 THERMAL_NO_LIMIT 1>,
> > +							 <&CPU2 THERMAL_NO_LIMIT 1>,
> > +							 <&CPU3 THERMAL_NO_LIMIT 1>;
> > +				};
> 
> With IPA this doesn't really limit throttling to the boost
> frequency. Not sure if it has a negative impact, some other platforms
> with a thermal configuration that targets IPA only have a cooling map
> entry for the 'desired/target' temperature.
> 
> Cheers
> 
> Matthias
Matthias Kaehlcke Jan. 25, 2019, 10:20 p.m. UTC | #3
On Thu, Jan 24, 2019 at 03:35:28PM -0800, Matthias Kaehlcke wrote:
> On Tue, Jan 22, 2019 at 06:12:51PM -0800, Matthias Kaehlcke wrote:
> > Hi Amit,
> > 
> > On Mon, Jan 21, 2019 at 11:38:34PM +0530, Amit Kucheria wrote:
> > > Since all cpus in the big and little clusters, respectively, are in the
> > > same frequency domain, use all of them for mitigation in the
> > > cooling-map. We end up with two cooling devices - one each for the big
> > > and little clusters.
> > > 
> > > We throttle lightly at the first trip point, just removing the boost
> > > frequency. At the next trip point we allow ourselves to be throttled to
> > > any extent.
> > > 
> > > Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
> > > ---
> > >  arch/arm64/boot/dts/qcom/sdm845.dtsi | 225 +++++++++++++++++++++++++--
> > >  1 file changed, 209 insertions(+), 16 deletions(-)
> > > 
> > > diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi
> > > index c27cbd3bcb0a..878f661d16eb 100644
> > > --- a/arch/arm64/boot/dts/qcom/sdm845.dtsi
> > > +++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi
> > > @@ -13,6 +13,7 @@
> > >  #include <dt-bindings/reset/qcom,sdm845-aoss.h>
> > >  #include <dt-bindings/soc/qcom,rpmh-rsc.h>
> > >  #include <dt-bindings/clock/qcom,gcc-sdm845.h>
> > > +#include <dt-bindings/thermal/thermal.h>
> > >  
> > >  / {
> > >  	interrupt-parent = <&intc>;
> > > @@ -99,6 +100,7 @@
> > >  			compatible = "qcom,kryo385";
> > >  			reg = <0x0 0x0>;
> > >  			enable-method = "psci";
> > > +			#cooling-cells = <2>;
> > >  			next-level-cache = <&L2_0>;
> > >  			L2_0: l2-cache {
> > >  				compatible = "cache";
> > > @@ -114,6 +116,7 @@
> > >  			compatible = "qcom,kryo385";
> > >  			reg = <0x0 0x100>;
> > >  			enable-method = "psci";
> > > +			#cooling-cells = <2>;
> > >  			next-level-cache = <&L2_100>;
> > >  			L2_100: l2-cache {
> > >  				compatible = "cache";
> > > @@ -126,6 +129,7 @@
> > >  			compatible = "qcom,kryo385";
> > >  			reg = <0x0 0x200>;
> > >  			enable-method = "psci";
> > > +			#cooling-cells = <2>;
> > >  			next-level-cache = <&L2_200>;
> > >  			L2_200: l2-cache {
> > >  				compatible = "cache";
> > > @@ -138,6 +142,7 @@
> > >  			compatible = "qcom,kryo385";
> > >  			reg = <0x0 0x300>;
> > >  			enable-method = "psci";
> > > +			#cooling-cells = <2>;
> > >  			next-level-cache = <&L2_300>;
> > >  			L2_300: l2-cache {
> > >  				compatible = "cache";
> > > @@ -150,6 +155,7 @@
> > >  			compatible = "qcom,kryo385";
> > >  			reg = <0x0 0x400>;
> > >  			enable-method = "psci";
> > > +			#cooling-cells = <2>;
> > >  			next-level-cache = <&L2_400>;
> > >  			L2_400: l2-cache {
> > >  				compatible = "cache";
> > > @@ -162,6 +168,7 @@
> > >  			compatible = "qcom,kryo385";
> > >  			reg = <0x0 0x500>;
> > >  			enable-method = "psci";
> > > +			#cooling-cells = <2>;
> > >  			next-level-cache = <&L2_500>;
> > >  			L2_500: l2-cache {
> > >  				compatible = "cache";
> > > @@ -174,6 +181,7 @@
> > >  			compatible = "qcom,kryo385";
> > >  			reg = <0x0 0x600>;
> > >  			enable-method = "psci";
> > > +			#cooling-cells = <2>;
> > >  			next-level-cache = <&L2_600>;
> > >  			L2_600: l2-cache {
> > >  				compatible = "cache";
> > > @@ -186,6 +194,7 @@
> > >  			compatible = "qcom,kryo385";
> > >  			reg = <0x0 0x700>;
> > >  			enable-method = "psci";
> > > +			#cooling-cells = <2>;
> > >  			next-level-cache = <&L2_700>;
> > >  			L2_700: l2-cache {
> > >  				compatible = "cache";
> > > @@ -1691,18 +1700,41 @@
> > >  			thermal-sensors = <&tsens0 1>;
> > >  
> > >  			trips {
> > > -				cpu_alert0: trip0 {
> > > +				cpu0_alert1: trip-point@0 {
> > >  					temperature = <75000>;
> > 
> > In my observations a 'switch on/threshold' temperature of 75 degrees
> > leads to aggressive throttling with IPA when the temperature is above
> > this threshold:
> > 
> > [  716.760804] cpu_cooling_ratelimit: 31 callbacks suppressed
> > [  716.760836] cpu cpu4: Cooling state set to 10. New max freq = 1920000
> > [  716.773390] power_allocator_ratelimit: 15 callbacks suppressed
> > [  716.773405] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=73500, curr_temp=75200 total_requested_power=39025 total_granted_power=18654
> > [  749.609336] cpu_cooling_ratelimit: 45 callbacks suppressed
> > [  749.609371] cpu cpu4: Cooling state set to 11. New max freq = 1843200
> > [  749.624300] power_allocator_ratelimit: 24 callbacks suppressed
> > [  749.624323] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=70800, curr_temp=77200 total_requested_power=40136 total_granted_power=17402
> > [  780.152633] cpu_cooling_ratelimit: 41 callbacks suppressed
> > [  780.152666] cpu cpu4: Cooling state set to 11. New max freq = 1843200
> > [  780.165247] power_allocator_ratelimit: 21 callbacks suppressed
> > [  780.165261] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=64800, curr_temp=76900 total_requested_power=39719 total_granted_power=1759
> > 
> > (the logs come from a local patch in our tree:
> > https://chromium.googlesource.com/chromiumos/third_party/kernel/+/ec1c501a8093fed44a6697a5913ef2765f518e1f)
> > 
> > At this point I don't have a clear idea what would be a reasonable
> > value for the 'switch on/threshold' temperature, but probably it
> > should to be higher than 75 degrees, at least with IPA. If there is
> > no reasonable common configuration for different thermal governors I
> > guess we'll have to target a commonly used governor and systems
> > using other 'incompatible' governors need to override the config in
> > their <board>.dtsi.
> 
> On my system I don't see a significant delta in core temperatures for
> 'threshold' temperatures of 80, 85 or 90°C. However Dhrystone
> performance goes up by ~8% when changing the trip point from 80 to
> 85°C. For a switch from 85 to 90°C I see a ~2% performance delta. For
> all trip points the average core temperatures are ~80°C (silver) and
> ~85°C (gold). Interestingly I observed the highest average
> temperatures with the trip point at 80°C (repeated measurements were
> taken for different temperatures).
> 
> Supposedly LMH throttling is disabled in the firmware I used for
> these tests, however data suggests that it is still active
> (temperature doesn't rise beyond 95°C, even without throttling in
> Linux; Dhrystone performance drops when raising the temperature beyond
> 95°C with a heat gun. I will do some more testing when I get my hands
> on a FW that effectively disables LMH (or raises the threshold to
> something like 105°C).
> 
> From the data collected so far I'd suggest a 'threshold' temperature
> of 90°C or if that seems to high 85°C. Behavior might be different
> with other thermal governors or without LMH throttling..

Some more data from measurements with different trips points, for the
IPA and the Fair Share governors, LMH throttling was enabled:

			IPA
	Dhrystone	Temp Silver	Temp Gold
75	6M		78.4		84.9
80	6.21M		81.4		89.8
85	6.74M		81.7		88.2
90	6.88M		79.4		84.6

			Fair Share
	Dhrystone	Temp Silver	Temp Gold
75	6.63M		80.1		88.5
80	6.71M		80.1		88.5
85	6.77M		81.1		87.8
90	7.12M		81.2		87.8

Within this range the 'threshold' temperature doesn't seem to have a
large impact on the average CPU temperature. There is a bit of
fluctuation between individual measurements, I wouldn't be surprised
if the outliers of Temp Gold for 75 and 90°C converged more with the
other values with some more measurements.

I learned how to effectively disable LMH throttling, however with that
it was fairly easy to have the CPUs overheat, even with throttling in
Linux. If it is feasible at all to run with LMH disabled some more
actions will be needed (e.g. attaching a heatsink or interrupt support
for thermal sensors instead of polling, ...).

Cheers

Matthias
Eduardo Valentin Feb. 6, 2019, 12:04 a.m. UTC | #4
On Mon, Jan 21, 2019 at 11:38:34PM +0530, Amit Kucheria wrote:
> Since all cpus in the big and little clusters, respectively, are in the
> same frequency domain, use all of them for mitigation in the
> cooling-map. We end up with two cooling devices - one each for the big
> and little clusters.
> 
> We throttle lightly at the first trip point, just removing the boost
> frequency. At the next trip point we allow ourselves to be throttled to
> any extent.
> 

From OF thermal descriptor, this patch looks fine to me:

Acked-by: Eduardo Valentin <edubezval@gmail.com>

> Signed-off-by: Amit Kucheria <amit.kucheria@linaro.org>
> ---
>  arch/arm64/boot/dts/qcom/sdm845.dtsi | 225 +++++++++++++++++++++++++--
>  1 file changed, 209 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi
> index c27cbd3bcb0a..878f661d16eb 100644
> --- a/arch/arm64/boot/dts/qcom/sdm845.dtsi
> +++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi
> @@ -13,6 +13,7 @@
>  #include <dt-bindings/reset/qcom,sdm845-aoss.h>
>  #include <dt-bindings/soc/qcom,rpmh-rsc.h>
>  #include <dt-bindings/clock/qcom,gcc-sdm845.h>
> +#include <dt-bindings/thermal/thermal.h>
>  
>  / {
>  	interrupt-parent = <&intc>;
> @@ -99,6 +100,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x0>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_0>;
>  			L2_0: l2-cache {
>  				compatible = "cache";
> @@ -114,6 +116,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x100>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_100>;
>  			L2_100: l2-cache {
>  				compatible = "cache";
> @@ -126,6 +129,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x200>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_200>;
>  			L2_200: l2-cache {
>  				compatible = "cache";
> @@ -138,6 +142,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x300>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_300>;
>  			L2_300: l2-cache {
>  				compatible = "cache";
> @@ -150,6 +155,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x400>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_400>;
>  			L2_400: l2-cache {
>  				compatible = "cache";
> @@ -162,6 +168,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x500>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_500>;
>  			L2_500: l2-cache {
>  				compatible = "cache";
> @@ -174,6 +181,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x600>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_600>;
>  			L2_600: l2-cache {
>  				compatible = "cache";
> @@ -186,6 +194,7 @@
>  			compatible = "qcom,kryo385";
>  			reg = <0x0 0x700>;
>  			enable-method = "psci";
> +			#cooling-cells = <2>;
>  			next-level-cache = <&L2_700>;
>  			L2_700: l2-cache {
>  				compatible = "cache";
> @@ -1691,18 +1700,41 @@
>  			thermal-sensors = <&tsens0 1>;
>  
>  			trips {
> -				cpu_alert0: trip0 {
> +				cpu0_alert1: trip-point@0 {
>  					temperature = <75000>;
>  					hysteresis = <2000>;
>  					type = "passive";
>  				};
>  
> -				cpu_crit0: trip1 {
> +				cpu0_alert0: trip-point@1 {
> +					temperature = <95000>;
> +					hysteresis = <2000>;
> +					type = "passive";
> +				};
> +
> +				cpu0_crit: cpu_crit {
>  					temperature = <110000>;
>  					hysteresis = <1000>;
>  					type = "critical";
>  				};
>  			};
> +
> +			cooling-maps {
> +				map0 {
> +					trip = <&cpu0_alert0>;
> +					cooling-device = <&CPU0 THERMAL_NO_LIMIT 1>,
> +							 <&CPU1 THERMAL_NO_LIMIT 1>,
> +							 <&CPU2 THERMAL_NO_LIMIT 1>,
> +							 <&CPU3 THERMAL_NO_LIMIT 1>;
> +				};
> +				map1 {
> +					trip = <&cpu0_alert1>;
> +					cooling-device = <&CPU0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU1 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU2 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU3 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
> +				};
> +			};
>  		};
>  
>  		cpu1-thermal {
> @@ -1712,18 +1744,41 @@
>  			thermal-sensors = <&tsens0 2>;
>  
>  			trips {
> -				cpu_alert1: trip0 {
> +				cpu1_alert0: trip-point@0 {
>  					temperature = <75000>;
>  					hysteresis = <2000>;
>  					type = "passive";
>  				};
>  
> -				cpu_crit1: trip1 {
> +				cpu1_alert1: trip-point@1 {
> +					temperature = <95000>;
> +					hysteresis = <2000>;
> +					type = "passive";
> +				};
> +
> +				cpu1_crit: cpu_crit {
>  					temperature = <110000>;
>  					hysteresis = <1000>;
>  					type = "critical";
>  				};
>  			};
> +
> +			cooling-maps {
> +				map0 {
> +					trip = <&cpu1_alert0>;
> +					cooling-device = <&CPU0 THERMAL_NO_LIMIT 1>,
> +							 <&CPU1 THERMAL_NO_LIMIT 1>,
> +							 <&CPU2 THERMAL_NO_LIMIT 1>,
> +							 <&CPU3 THERMAL_NO_LIMIT 1>;
> +				};
> +				map1 {
> +					trip = <&cpu1_alert1>;
> +					cooling-device = <&CPU0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU1 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU2 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU3 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
> +				};
> +			};
>  		};
>  
>  		cpu2-thermal {
> @@ -1733,18 +1788,41 @@
>  			thermal-sensors = <&tsens0 3>;
>  
>  			trips {
> -				cpu_alert2: trip0 {
> +				cpu2_alert0: trip-point@0 {
>  					temperature = <75000>;
>  					hysteresis = <2000>;
>  					type = "passive";
>  				};
>  
> -				cpu_crit2: trip1 {
> +				cpu2_alert1: trip-point@1 {
> +					temperature = <95000>;
> +					hysteresis = <2000>;
> +					type = "passive";
> +				};
> +
> +				cpu2_crit: cpu_crit {
>  					temperature = <110000>;
>  					hysteresis = <1000>;
>  					type = "critical";
>  				};
>  			};
> +
> +			cooling-maps {
> +				map0 {
> +					trip = <&cpu2_alert0>;
> +					cooling-device = <&CPU0 THERMAL_NO_LIMIT 1>,
> +							 <&CPU1 THERMAL_NO_LIMIT 1>,
> +							 <&CPU2 THERMAL_NO_LIMIT 1>,
> +							 <&CPU3 THERMAL_NO_LIMIT 1>;
> +				};
> +				map1 {
> +					trip = <&cpu2_alert1>;
> +					cooling-device = <&CPU0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU1 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU2 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU3 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
> +				};
> +			};
>  		};
>  
>  		cpu3-thermal {
> @@ -1754,18 +1832,41 @@
>  			thermal-sensors = <&tsens0 4>;
>  
>  			trips {
> -				cpu_alert3: trip0 {
> +				cpu3_alert0: trip-point@0 {
>  					temperature = <75000>;
>  					hysteresis = <2000>;
>  					type = "passive";
>  				};
>  
> -				cpu_crit3: trip1 {
> +				cpu3_alert1: trip-point@1 {
> +					temperature = <95000>;
> +					hysteresis = <2000>;
> +					type = "passive";
> +				};
> +
> +				cpu3_crit: cpu_crit {
>  					temperature = <110000>;
>  					hysteresis = <1000>;
>  					type = "critical";
>  				};
>  			};
> +
> +			cooling-maps {
> +				map0 {
> +					trip = <&cpu3_alert0>;
> +					cooling-device = <&CPU0 THERMAL_NO_LIMIT 1>,
> +							 <&CPU1 THERMAL_NO_LIMIT 1>,
> +							 <&CPU2 THERMAL_NO_LIMIT 1>,
> +							 <&CPU3 THERMAL_NO_LIMIT 1>;
> +				};
> +				map1 {
> +					trip = <&cpu3_alert1>;
> +					cooling-device = <&CPU0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU1 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU2 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU3 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
> +				};
> +			};
>  		};
>  
>  		cpu4-thermal {
> @@ -1775,18 +1876,41 @@
>  			thermal-sensors = <&tsens0 7>;
>  
>  			trips {
> -				cpu_alert4: trip0 {
> +				cpu4_alert0: trip-point@0 {
>  					temperature = <75000>;
>  					hysteresis = <2000>;
>  					type = "passive";
>  				};
>  
> -				cpu_crit4: trip1 {
> +				cpu4_alert1: trip-point@1 {
> +					temperature = <95000>;
> +					hysteresis = <2000>;
> +					type = "passive";
> +				};
> +
> +				cpu4_crit: cpu_crit {
>  					temperature = <110000>;
>  					hysteresis = <1000>;
>  					type = "critical";
>  				};
>  			};
> +
> +			cooling-maps {
> +				map0 {
> +					trip = <&cpu4_alert0>;
> +					cooling-device = <&CPU4 THERMAL_NO_LIMIT 1>,
> +							 <&CPU5 THERMAL_NO_LIMIT 1>,
> +							 <&CPU6 THERMAL_NO_LIMIT 1>,
> +							 <&CPU7 THERMAL_NO_LIMIT 1>;
> +				};
> +				map1 {
> +					trip = <&cpu4_alert1>;
> +					cooling-device = <&CPU4 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU5 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU6 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU7 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
> +				};
> +			};
>  		};
>  
>  		cpu5-thermal {
> @@ -1796,18 +1920,41 @@
>  			thermal-sensors = <&tsens0 8>;
>  
>  			trips {
> -				cpu_alert5: trip0 {
> +				cpu5_alert0: trip-point@0 {
>  					temperature = <75000>;
>  					hysteresis = <2000>;
>  					type = "passive";
>  				};
>  
> -				cpu_crit5: trip1 {
> +				cpu5_alert1: trip-point@1 {
> +					temperature = <95000>;
> +					hysteresis = <2000>;
> +					type = "passive";
> +				};
> +
> +				cpu5_crit: cpu_crit {
>  					temperature = <110000>;
>  					hysteresis = <1000>;
>  					type = "critical";
>  				};
>  			};
> +
> +			cooling-maps {
> +				map0 {
> +					trip = <&cpu5_alert0>;
> +					cooling-device = <&CPU4 THERMAL_NO_LIMIT 1>,
> +							 <&CPU5 THERMAL_NO_LIMIT 1>,
> +							 <&CPU6 THERMAL_NO_LIMIT 1>,
> +							 <&CPU7 THERMAL_NO_LIMIT 1>;
> +				};
> +				map1 {
> +					trip = <&cpu5_alert1>;
> +					cooling-device = <&CPU4 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU5 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU6 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU7 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
> +				};
> +			};
>  		};
>  
>  		cpu6-thermal {
> @@ -1817,18 +1964,41 @@
>  			thermal-sensors = <&tsens0 9>;
>  
>  			trips {
> -				cpu_alert6: trip0 {
> +				cpu6_alert0: trip-point@0 {
>  					temperature = <75000>;
>  					hysteresis = <2000>;
>  					type = "passive";
>  				};
>  
> -				cpu_crit6: trip1 {
> +				cpu6_alert1: trip-point@1 {
> +					temperature = <95000>;
> +					hysteresis = <2000>;
> +					type = "passive";
> +				};
> +
> +				cpu6_crit: cpu_crit {
>  					temperature = <110000>;
>  					hysteresis = <1000>;
>  					type = "critical";
>  				};
>  			};
> +
> +			cooling-maps {
> +				map0 {
> +					trip = <&cpu6_alert0>;
> +					cooling-device = <&CPU4 THERMAL_NO_LIMIT 1>,
> +							 <&CPU5 THERMAL_NO_LIMIT 1>,
> +							 <&CPU6 THERMAL_NO_LIMIT 1>,
> +							 <&CPU7 THERMAL_NO_LIMIT 1>;
> +				};
> +				map1 {
> +					trip = <&cpu6_alert1>;
> +					cooling-device = <&CPU4 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU5 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU6 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU7 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
> +				};
> +			};
>  		};
>  
>  		cpu7-thermal {
> @@ -1838,18 +2008,41 @@
>  			thermal-sensors = <&tsens0 10>;
>  
>  			trips {
> -				cpu_alert7: trip0 {
> +				cpu7_alert0: trip-point@0 {
>  					temperature = <75000>;
>  					hysteresis = <2000>;
>  					type = "passive";
>  				};
>  
> -				cpu_crit7: trip1 {
> +				cpu7_alert1: trip-point@1 {
> +					temperature = <95000>;
> +					hysteresis = <2000>;
> +					type = "passive";
> +				};
> +
> +				cpu7_crit: cpu_crit {
>  					temperature = <110000>;
>  					hysteresis = <1000>;
>  					type = "critical";
>  				};
>  			};
> +
> +			cooling-maps {
> +				map0 {
> +					trip = <&cpu7_alert0>;
> +					cooling-device = <&CPU4 THERMAL_NO_LIMIT 1>,
> +							 <&CPU5 THERMAL_NO_LIMIT 1>,
> +							 <&CPU6 THERMAL_NO_LIMIT 1>,
> +							 <&CPU7 THERMAL_NO_LIMIT 1>;
> +				};
> +				map1 {
> +					trip = <&cpu7_alert1>;
> +					cooling-device = <&CPU4 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU5 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU6 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
> +							 <&CPU7 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
> +				};
> +			};
>  		};
>  	};
>  };
> -- 
> 2.17.1
>
Amit Kucheria Feb. 6, 2019, 10:35 a.m. UTC | #5
On Sat, Jan 26, 2019 at 3:50 AM Matthias Kaehlcke <mka@chromium.org> wrote:
> > > >                   trips {
> > > > -                         cpu_alert0: trip0 {
> > > > +                         cpu0_alert1: trip-point@0 {
> > > >                                   temperature = <75000>;
> > >
> > > In my observations a 'switch on/threshold' temperature of 75 degrees
> > > leads to aggressive throttling with IPA when the temperature is above
> > > this threshold:
> > >
> > > [  716.760804] cpu_cooling_ratelimit: 31 callbacks suppressed
> > > [  716.760836] cpu cpu4: Cooling state set to 10. New max freq = 1920000
> > > [  716.773390] power_allocator_ratelimit: 15 callbacks suppressed
> > > [  716.773405] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=73500, curr_temp=75200 total_requested_power=39025 total_granted_power=18654
> > > [  749.609336] cpu_cooling_ratelimit: 45 callbacks suppressed
> > > [  749.609371] cpu cpu4: Cooling state set to 11. New max freq = 1843200
> > > [  749.624300] power_allocator_ratelimit: 24 callbacks suppressed
> > > [  749.624323] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=70800, curr_temp=77200 total_requested_power=40136 total_granted_power=17402
> > > [  780.152633] cpu_cooling_ratelimit: 41 callbacks suppressed
> > > [  780.152666] cpu cpu4: Cooling state set to 11. New max freq = 1843200
> > > [  780.165247] power_allocator_ratelimit: 21 callbacks suppressed
> > > [  780.165261] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=64800, curr_temp=76900 total_requested_power=39719 total_granted_power=1759
> > >
> > > (the logs come from a local patch in our tree:
> > > https://chromium.googlesource.com/chromiumos/third_party/kernel/+/ec1c501a8093fed44a6697a5913ef2765f518e1f)
> > >
> > > At this point I don't have a clear idea what would be a reasonable
> > > value for the 'switch on/threshold' temperature, but probably it
> > > should to be higher than 75 degrees, at least with IPA. If there is
> > > no reasonable common configuration for different thermal governors I
> > > guess we'll have to target a commonly used governor and systems
> > > using other 'incompatible' governors need to override the config in
> > > their <board>.dtsi.
> >

Thanks for the elaborate testing and for sharing the numbers. This is
very useful information.

> > On my system I don't see a significant delta in core temperatures for
> > 'threshold' temperatures of 80, 85 or 90°C. However Dhrystone
> > performance goes up by ~8% when changing the trip point from 80 to
> > 85°C. For a switch from 85 to 90°C I see a ~2% performance delta. For
> > all trip points the average core temperatures are ~80°C (silver) and
> > ~85°C (gold). Interestingly I observed the highest average
> > temperatures with the trip point at 80°C (repeated measurements were
> > taken for different temperatures).
> >
> > Supposedly LMH throttling is disabled in the firmware I used for
> > these tests, however data suggests that it is still active
> > (temperature doesn't rise beyond 95°C, even without throttling in
> > Linux; Dhrystone performance drops when raising the temperature beyond
> > 95°C with a heat gun. I will do some more testing when I get my hands
> > on a FW that effectively disables LMH (or raises the threshold to
> > something like 105°C).
> >
> > From the data collected so far I'd suggest a 'threshold' temperature
> > of 90°C or if that seems to high 85°C. Behavior might be different
> > with other thermal governors or without LMH throttling..
>
> Some more data from measurements with different trips points, for the
> IPA and the Fair Share governors, LMH throttling was enabled:
>
>                         IPA
>         Dhrystone       Temp Silver     Temp Gold
> 75      6M              78.4            84.9
> 80      6.21M           81.4            89.8
> 85      6.74M           81.7            88.2
> 90      6.88M           79.4            84.6
>
>                         Fair Share
>         Dhrystone       Temp Silver     Temp Gold
> 75      6.63M           80.1            88.5
> 80      6.71M           80.1            88.5
> 85      6.77M           81.1            87.8
> 90      7.12M           81.2            87.8

Interesting that you get more MIPs out of fair share governor when
compared to IPA across the board. What devices were providing energy
cost information (dynamic-power-coefficient) to the IPA engine? Just
CPU and GPU? Can you point me to those patches in gerrit?

> Within this range the 'threshold' temperature doesn't seem to have a
> large impact on the average CPU temperature. There is a bit of
> fluctuation between individual measurements, I wouldn't be surprised
> if the outliers of Temp Gold for 75 and 90°C converged more with the
> other values with some more measurements.
>
> I learned how to effectively disable LMH throttling, however with that
> it was fairly easy to have the CPUs overheat, even with throttling in
> Linux. If it is feasible at all to run with LMH disabled some more
> actions will be needed (e.g. attaching a heatsink or interrupt support
> for thermal sensors instead of polling, ...).

Given that LMH kicks in at 95 and IPA manages to maintain temperatures
in the ballpark of 80-90 regardless of the trip point value, I agree
that we should move the 1st trip point to 90. This will give maximum
performance. So in "threshold" and "target" terms 90 becomes the
threshold. And since LMH kicks in at 95, I've left it as the target
trip.

These should be sane defaults for upstream and any device can override
those numbers in their board file.

Thanks again for your thorough review.

Regards,
Amit
Matthias Kaehlcke Feb. 6, 2019, 7:34 p.m. UTC | #6
On Wed, Feb 06, 2019 at 04:05:41PM +0530, Amit Kucheria wrote:
> On Sat, Jan 26, 2019 at 3:50 AM Matthias Kaehlcke <mka@chromium.org> wrote:
> > > > >                   trips {
> > > > > -                         cpu_alert0: trip0 {
> > > > > +                         cpu0_alert1: trip-point@0 {
> > > > >                                   temperature = <75000>;
> > > >
> > > > In my observations a 'switch on/threshold' temperature of 75 degrees
> > > > leads to aggressive throttling with IPA when the temperature is above
> > > > this threshold:
> > > >
> > > > [  716.760804] cpu_cooling_ratelimit: 31 callbacks suppressed
> > > > [  716.760836] cpu cpu4: Cooling state set to 10. New max freq = 1920000
> > > > [  716.773390] power_allocator_ratelimit: 15 callbacks suppressed
> > > > [  716.773405] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=73500, curr_temp=75200 total_requested_power=39025 total_granted_power=18654
> > > > [  749.609336] cpu_cooling_ratelimit: 45 callbacks suppressed
> > > > [  749.609371] cpu cpu4: Cooling state set to 11. New max freq = 1843200
> > > > [  749.624300] power_allocator_ratelimit: 24 callbacks suppressed
> > > > [  749.624323] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=70800, curr_temp=77200 total_requested_power=40136 total_granted_power=17402
> > > > [  780.152633] cpu_cooling_ratelimit: 41 callbacks suppressed
> > > > [  780.152666] cpu cpu4: Cooling state set to 11. New max freq = 1843200
> > > > [  780.165247] power_allocator_ratelimit: 21 callbacks suppressed
> > > > [  780.165261] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=64800, curr_temp=76900 total_requested_power=39719 total_granted_power=1759
> > > >
> > > > (the logs come from a local patch in our tree:
> > > > https://chromium.googlesource.com/chromiumos/third_party/kernel/+/ec1c501a8093fed44a6697a5913ef2765f518e1f)
> > > >
> > > > At this point I don't have a clear idea what would be a reasonable
> > > > value for the 'switch on/threshold' temperature, but probably it
> > > > should to be higher than 75 degrees, at least with IPA. If there is
> > > > no reasonable common configuration for different thermal governors I
> > > > guess we'll have to target a commonly used governor and systems
> > > > using other 'incompatible' governors need to override the config in
> > > > their <board>.dtsi.
> > >
> 
> Thanks for the elaborate testing and for sharing the numbers. This is
> very useful information.
> 
> > > On my system I don't see a significant delta in core temperatures for
> > > 'threshold' temperatures of 80, 85 or 90°C. However Dhrystone
> > > performance goes up by ~8% when changing the trip point from 80 to
> > > 85°C. For a switch from 85 to 90°C I see a ~2% performance delta. For
> > > all trip points the average core temperatures are ~80°C (silver) and
> > > ~85°C (gold). Interestingly I observed the highest average
> > > temperatures with the trip point at 80°C (repeated measurements were
> > > taken for different temperatures).
> > >
> > > Supposedly LMH throttling is disabled in the firmware I used for
> > > these tests, however data suggests that it is still active
> > > (temperature doesn't rise beyond 95°C, even without throttling in
> > > Linux; Dhrystone performance drops when raising the temperature beyond
> > > 95°C with a heat gun. I will do some more testing when I get my hands
> > > on a FW that effectively disables LMH (or raises the threshold to
> > > something like 105°C).
> > >
> > > From the data collected so far I'd suggest a 'threshold' temperature
> > > of 90°C or if that seems to high 85°C. Behavior might be different
> > > with other thermal governors or without LMH throttling..
> >
> > Some more data from measurements with different trips points, for the
> > IPA and the Fair Share governors, LMH throttling was enabled:
> >
> >                         IPA
> >         Dhrystone       Temp Silver     Temp Gold
> > 75      6M              78.4            84.9
> > 80      6.21M           81.4            89.8
> > 85      6.74M           81.7            88.2
> > 90      6.88M           79.4            84.6
> >
> >                         Fair Share
> >         Dhrystone       Temp Silver     Temp Gold
> > 75      6.63M           80.1            88.5
> > 80      6.71M           80.1            88.5
> > 85      6.77M           81.1            87.8
> > 90      7.12M           81.2            87.8
> 
> Interesting that you get more MIPs out of fair share governor when
> compared to IPA across the board. What devices were providing energy
> cost information (dynamic-power-coefficient) to the IPA engine? Just
> CPU and GPU? Can you point me to those patches in gerrit?

Only the CPUs provide energy cost information, the GPU isn't fully
hooked up in our tree yet. The cause of the delta could be that for
temperatures < 'target' Fair Share only uses the performance states
specified in 'threshold' for throttling (currently only the boost
frequency), while IPA may use the full range of states  of the
'target' trip point.

You can find the patches/configuration here:

https://chromium.googlesource.com/chromiumos/third_party/kernel/+log/refs/sandbox/dianders/190130-wip-tree

arch/arm64/boot/dts/qcom/sdm845.dtsi
arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi

> > Within this range the 'threshold' temperature doesn't seem to have a
> > large impact on the average CPU temperature. There is a bit of
> > fluctuation between individual measurements, I wouldn't be surprised
> > if the outliers of Temp Gold for 75 and 90°C converged more with the
> > other values with some more measurements.
> >
> > I learned how to effectively disable LMH throttling, however with that
> > it was fairly easy to have the CPUs overheat, even with throttling in
> > Linux. If it is feasible at all to run with LMH disabled some more
> > actions will be needed (e.g. attaching a heatsink or interrupt support
> > for thermal sensors instead of polling, ...).
> 
> Given that LMH kicks in at 95 and IPA manages to maintain temperatures
> in the ballpark of 80-90 regardless of the trip point value, I agree
> that we should move the 1st trip point to 90. This will give maximum
> performance. So in "threshold" and "target" terms 90 becomes the
> threshold. And since LMH kicks in at 95, I've left it as the target
> trip.
> 
> These should be sane defaults for upstream and any device can override
> those numbers in their board file.

Sounds good, thanks!

FYI, since I mentioned this earlier: a small heatsink on the SoC
makes a huge difference, with that I didn't encounter thermal
shutdowns during my (limited) tests with LMH disabled. Temperatures
were only slightly higher than with LMH throttling, so it might be
feasible to raise the LMH threshold and only use it as last resort.

Cheers

Matthias
Matthias Kaehlcke Feb. 7, 2019, 1:57 a.m. UTC | #7
On Wed, Feb 06, 2019 at 11:34:41AM -0800, Matthias Kaehlcke wrote:
> On Wed, Feb 06, 2019 at 04:05:41PM +0530, Amit Kucheria wrote:
> > On Sat, Jan 26, 2019 at 3:50 AM Matthias Kaehlcke <mka@chromium.org> wrote:
> > > > > >                   trips {
> > > > > > -                         cpu_alert0: trip0 {
> > > > > > +                         cpu0_alert1: trip-point@0 {
> > > > > >                                   temperature = <75000>;
> > > > >
> > > > > In my observations a 'switch on/threshold' temperature of 75 degrees
> > > > > leads to aggressive throttling with IPA when the temperature is above
> > > > > this threshold:
> > > > >
> > > > > [  716.760804] cpu_cooling_ratelimit: 31 callbacks suppressed
> > > > > [  716.760836] cpu cpu4: Cooling state set to 10. New max freq = 1920000
> > > > > [  716.773390] power_allocator_ratelimit: 15 callbacks suppressed
> > > > > [  716.773405] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=73500, curr_temp=75200 total_requested_power=39025 total_granted_power=18654
> > > > > [  749.609336] cpu_cooling_ratelimit: 45 callbacks suppressed
> > > > > [  749.609371] cpu cpu4: Cooling state set to 11. New max freq = 1843200
> > > > > [  749.624300] power_allocator_ratelimit: 24 callbacks suppressed
> > > > > [  749.624323] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=70800, curr_temp=77200 total_requested_power=40136 total_granted_power=17402
> > > > > [  780.152633] cpu_cooling_ratelimit: 41 callbacks suppressed
> > > > > [  780.152666] cpu cpu4: Cooling state set to 11. New max freq = 1843200
> > > > > [  780.165247] power_allocator_ratelimit: 21 callbacks suppressed
> > > > > [  780.165261] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=64800, curr_temp=76900 total_requested_power=39719 total_granted_power=1759
> > > > >
> > > > > (the logs come from a local patch in our tree:
> > > > > https://chromium.googlesource.com/chromiumos/third_party/kernel/+/ec1c501a8093fed44a6697a5913ef2765f518e1f)
> > > > >
> > > > > At this point I don't have a clear idea what would be a reasonable
> > > > > value for the 'switch on/threshold' temperature, but probably it
> > > > > should to be higher than 75 degrees, at least with IPA. If there is
> > > > > no reasonable common configuration for different thermal governors I
> > > > > guess we'll have to target a commonly used governor and systems
> > > > > using other 'incompatible' governors need to override the config in
> > > > > their <board>.dtsi.
> > > >
> > 
> > Thanks for the elaborate testing and for sharing the numbers. This is
> > very useful information.
> > 
> > > > On my system I don't see a significant delta in core temperatures for
> > > > 'threshold' temperatures of 80, 85 or 90°C. However Dhrystone
> > > > performance goes up by ~8% when changing the trip point from 80 to
> > > > 85°C. For a switch from 85 to 90°C I see a ~2% performance delta. For
> > > > all trip points the average core temperatures are ~80°C (silver) and
> > > > ~85°C (gold). Interestingly I observed the highest average
> > > > temperatures with the trip point at 80°C (repeated measurements were
> > > > taken for different temperatures).
> > > >
> > > > Supposedly LMH throttling is disabled in the firmware I used for
> > > > these tests, however data suggests that it is still active
> > > > (temperature doesn't rise beyond 95°C, even without throttling in
> > > > Linux; Dhrystone performance drops when raising the temperature beyond
> > > > 95°C with a heat gun. I will do some more testing when I get my hands
> > > > on a FW that effectively disables LMH (or raises the threshold to
> > > > something like 105°C).
> > > >
> > > > From the data collected so far I'd suggest a 'threshold' temperature
> > > > of 90°C or if that seems to high 85°C. Behavior might be different
> > > > with other thermal governors or without LMH throttling..
> > >
> > > Some more data from measurements with different trips points, for the
> > > IPA and the Fair Share governors, LMH throttling was enabled:
> > >
> > >                         IPA
> > >         Dhrystone       Temp Silver     Temp Gold
> > > 75      6M              78.4            84.9
> > > 80      6.21M           81.4            89.8
> > > 85      6.74M           81.7            88.2
> > > 90      6.88M           79.4            84.6
> > >
> > >                         Fair Share
> > >         Dhrystone       Temp Silver     Temp Gold
> > > 75      6.63M           80.1            88.5
> > > 80      6.71M           80.1            88.5
> > > 85      6.77M           81.1            87.8
> > > 90      7.12M           81.2            87.8
> > 
> > Interesting that you get more MIPs out of fair share governor when
> > compared to IPA across the board. What devices were providing energy
> > cost information (dynamic-power-coefficient) to the IPA engine? Just
> > CPU and GPU? Can you point me to those patches in gerrit?
> 
> Only the CPUs provide energy cost information, the GPU isn't fully
> hooked up in our tree yet. The cause of the delta could be that for
> temperatures < 'target' Fair Share only uses the performance states
> specified in 'threshold' for throttling (currently only the boost
> frequency), while IPA may use the full range of states  of the
> 'target' trip point.

I saw that in v4 you allow all performance state to be used for
throttling at the 'threshold' temperature. With this configuration
I get:

               Dhrystone      Temp Silver    Temp Gold

Fair Share     7.29M	      81.4           87.7

IPA            7.14M          81.7           88.3


I have no good sense why we are seeing more MIPs for IPA than with the
previous configuration. As for earlier tests the values are the
average from 4 runs.

In any case it seems like a reasonable default configuration with the
data we have at this point.

> You can find the patches/configuration here:
> 
> https://chromium.googlesource.com/chromiumos/third_party/kernel/+log/refs/sandbox/dianders/190130-wip-tree
> 
> arch/arm64/boot/dts/qcom/sdm845.dtsi
> arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi
> 
> > > Within this range the 'threshold' temperature doesn't seem to have a
> > > large impact on the average CPU temperature. There is a bit of
> > > fluctuation between individual measurements, I wouldn't be surprised
> > > if the outliers of Temp Gold for 75 and 90°C converged more with the
> > > other values with some more measurements.
> > >
> > > I learned how to effectively disable LMH throttling, however with that
> > > it was fairly easy to have the CPUs overheat, even with throttling in
> > > Linux. If it is feasible at all to run with LMH disabled some more
> > > actions will be needed (e.g. attaching a heatsink or interrupt support
> > > for thermal sensors instead of polling, ...).
> > 
> > Given that LMH kicks in at 95 and IPA manages to maintain temperatures
> > in the ballpark of 80-90 regardless of the trip point value, I agree
> > that we should move the 1st trip point to 90. This will give maximum
> > performance. So in "threshold" and "target" terms 90 becomes the
> > threshold. And since LMH kicks in at 95, I've left it as the target
> > trip.
> > 
> > These should be sane defaults for upstream and any device can override
> > those numbers in their board file.
> 
> Sounds good, thanks!
> 
> FYI, since I mentioned this earlier: a small heatsink on the SoC
> makes a huge difference, with that I didn't encounter thermal
> shutdowns during my (limited) tests with LMH disabled. Temperatures
> were only slightly higher than with LMH throttling, so it might be
> feasible to raise the LMH threshold and only use it as last resort.
> 
> Cheers
> 
> Matthias
Amit Kucheria Feb. 7, 2019, 4:39 a.m. UTC | #8
On Thu, Feb 7, 2019 at 7:27 AM Matthias Kaehlcke <mka@chromium.org> wrote:
>
> On Wed, Feb 06, 2019 at 11:34:41AM -0800, Matthias Kaehlcke wrote:
> > On Wed, Feb 06, 2019 at 04:05:41PM +0530, Amit Kucheria wrote:
> > > On Sat, Jan 26, 2019 at 3:50 AM Matthias Kaehlcke <mka@chromium.org> wrote:
> > > > > > >                   trips {
> > > > > > > -                         cpu_alert0: trip0 {
> > > > > > > +                         cpu0_alert1: trip-point@0 {
> > > > > > >                                   temperature = <75000>;
> > > > > >
> > > > > > In my observations a 'switch on/threshold' temperature of 75 degrees
> > > > > > leads to aggressive throttling with IPA when the temperature is above
> > > > > > this threshold:
> > > > > >
> > > > > > [  716.760804] cpu_cooling_ratelimit: 31 callbacks suppressed
> > > > > > [  716.760836] cpu cpu4: Cooling state set to 10. New max freq = 1920000
> > > > > > [  716.773390] power_allocator_ratelimit: 15 callbacks suppressed
> > > > > > [  716.773405] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=73500, curr_temp=75200 total_requested_power=39025 total_granted_power=18654
> > > > > > [  749.609336] cpu_cooling_ratelimit: 45 callbacks suppressed
> > > > > > [  749.609371] cpu cpu4: Cooling state set to 11. New max freq = 1843200
> > > > > > [  749.624300] power_allocator_ratelimit: 24 callbacks suppressed
> > > > > > [  749.624323] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=70800, curr_temp=77200 total_requested_power=40136 total_granted_power=17402
> > > > > > [  780.152633] cpu_cooling_ratelimit: 41 callbacks suppressed
> > > > > > [  780.152666] cpu cpu4: Cooling state set to 11. New max freq = 1843200
> > > > > > [  780.165247] power_allocator_ratelimit: 21 callbacks suppressed
> > > > > > [  780.165261] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=64800, curr_temp=76900 total_requested_power=39719 total_granted_power=1759
> > > > > >
> > > > > > (the logs come from a local patch in our tree:
> > > > > > https://chromium.googlesource.com/chromiumos/third_party/kernel/+/ec1c501a8093fed44a6697a5913ef2765f518e1f)
> > > > > >
> > > > > > At this point I don't have a clear idea what would be a reasonable
> > > > > > value for the 'switch on/threshold' temperature, but probably it
> > > > > > should to be higher than 75 degrees, at least with IPA. If there is
> > > > > > no reasonable common configuration for different thermal governors I
> > > > > > guess we'll have to target a commonly used governor and systems
> > > > > > using other 'incompatible' governors need to override the config in
> > > > > > their <board>.dtsi.
> > > > >
> > >
> > > Thanks for the elaborate testing and for sharing the numbers. This is
> > > very useful information.
> > >
> > > > > On my system I don't see a significant delta in core temperatures for
> > > > > 'threshold' temperatures of 80, 85 or 90°C. However Dhrystone
> > > > > performance goes up by ~8% when changing the trip point from 80 to
> > > > > 85°C. For a switch from 85 to 90°C I see a ~2% performance delta. For
> > > > > all trip points the average core temperatures are ~80°C (silver) and
> > > > > ~85°C (gold). Interestingly I observed the highest average
> > > > > temperatures with the trip point at 80°C (repeated measurements were
> > > > > taken for different temperatures).
> > > > >
> > > > > Supposedly LMH throttling is disabled in the firmware I used for
> > > > > these tests, however data suggests that it is still active
> > > > > (temperature doesn't rise beyond 95°C, even without throttling in
> > > > > Linux; Dhrystone performance drops when raising the temperature beyond
> > > > > 95°C with a heat gun. I will do some more testing when I get my hands
> > > > > on a FW that effectively disables LMH (or raises the threshold to
> > > > > something like 105°C).
> > > > >
> > > > > From the data collected so far I'd suggest a 'threshold' temperature
> > > > > of 90°C or if that seems to high 85°C. Behavior might be different
> > > > > with other thermal governors or without LMH throttling..
> > > >
> > > > Some more data from measurements with different trips points, for the
> > > > IPA and the Fair Share governors, LMH throttling was enabled:
> > > >
> > > >                         IPA
> > > >         Dhrystone       Temp Silver     Temp Gold
> > > > 75      6M              78.4            84.9
> > > > 80      6.21M           81.4            89.8
> > > > 85      6.74M           81.7            88.2
> > > > 90      6.88M           79.4            84.6
> > > >
> > > >                         Fair Share
> > > >         Dhrystone       Temp Silver     Temp Gold
> > > > 75      6.63M           80.1            88.5
> > > > 80      6.71M           80.1            88.5
> > > > 85      6.77M           81.1            87.8
> > > > 90      7.12M           81.2            87.8
> > >
> > > Interesting that you get more MIPs out of fair share governor when
> > > compared to IPA across the board. What devices were providing energy
> > > cost information (dynamic-power-coefficient) to the IPA engine? Just
> > > CPU and GPU? Can you point me to those patches in gerrit?
> >
> > Only the CPUs provide energy cost information, the GPU isn't fully
> > hooked up in our tree yet. The cause of the delta could be that for
> > temperatures < 'target' Fair Share only uses the performance states
> > specified in 'threshold' for throttling (currently only the boost
> > frequency), while IPA may use the full range of states  of the
> > 'target' trip point.
>
> I saw that in v4 you allow all performance state to be used for

Yes, I found during my testing that I got better convergence at the
threshold temperature when I allowed the entire range of operating
points with almost no decrease in MIPs.

> throttling at the 'threshold' temperature. With this configuration
> I get:
>
>                Dhrystone      Temp Silver    Temp Gold
>
> Fair Share     7.29M          81.4           87.7
>
> IPA            7.14M          81.7           88.3
>
>
> I have no good sense why we are seeing more MIPs for IPA than with the
> previous configuration. As for earlier tests the values are the
> average from 4 runs.

I suspect that EAS task placement might be at work here. Were your
test threads locked to CPUs or were they free to be scheduled around?

> In any case it seems like a reasonable default configuration with the
> data we have at this point.

Thanks for your thorough reviews to get this point. Will you send out
the patches to add support for IPA at some point?

Regards,
Amit
Matthias Kaehlcke Feb. 7, 2019, 7:10 p.m. UTC | #9
On Thu, Feb 07, 2019 at 10:09:29AM +0530, Amit Kucheria wrote:
> On Thu, Feb 7, 2019 at 7:27 AM Matthias Kaehlcke <mka@chromium.org> wrote:
> >
> > On Wed, Feb 06, 2019 at 11:34:41AM -0800, Matthias Kaehlcke wrote:
> > > On Wed, Feb 06, 2019 at 04:05:41PM +0530, Amit Kucheria wrote:
> > > > On Sat, Jan 26, 2019 at 3:50 AM Matthias Kaehlcke <mka@chromium.org> wrote:
> > > > > > > >                   trips {
> > > > > > > > -                         cpu_alert0: trip0 {
> > > > > > > > +                         cpu0_alert1: trip-point@0 {
> > > > > > > >                                   temperature = <75000>;
> > > > > > >
> > > > > > > In my observations a 'switch on/threshold' temperature of 75 degrees
> > > > > > > leads to aggressive throttling with IPA when the temperature is above
> > > > > > > this threshold:
> > > > > > >
> > > > > > > [  716.760804] cpu_cooling_ratelimit: 31 callbacks suppressed
> > > > > > > [  716.760836] cpu cpu4: Cooling state set to 10. New max freq = 1920000
> > > > > > > [  716.773390] power_allocator_ratelimit: 15 callbacks suppressed
> > > > > > > [  716.773405] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=73500, curr_temp=75200 total_requested_power=39025 total_granted_power=18654
> > > > > > > [  749.609336] cpu_cooling_ratelimit: 45 callbacks suppressed
> > > > > > > [  749.609371] cpu cpu4: Cooling state set to 11. New max freq = 1843200
> > > > > > > [  749.624300] power_allocator_ratelimit: 24 callbacks suppressed
> > > > > > > [  749.624323] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=70800, curr_temp=77200 total_requested_power=40136 total_granted_power=17402
> > > > > > > [  780.152633] cpu_cooling_ratelimit: 41 callbacks suppressed
> > > > > > > [  780.152666] cpu cpu4: Cooling state set to 11. New max freq = 1843200
> > > > > > > [  780.165247] power_allocator_ratelimit: 21 callbacks suppressed
> > > > > > > [  780.165261] thermal thermal_zone5: Controlling power: control_temp=95000 last_temp=64800, curr_temp=76900 total_requested_power=39719 total_granted_power=1759
> > > > > > >
> > > > > > > (the logs come from a local patch in our tree:
> > > > > > > https://chromium.googlesource.com/chromiumos/third_party/kernel/+/ec1c501a8093fed44a6697a5913ef2765f518e1f)
> > > > > > >
> > > > > > > At this point I don't have a clear idea what would be a reasonable
> > > > > > > value for the 'switch on/threshold' temperature, but probably it
> > > > > > > should to be higher than 75 degrees, at least with IPA. If there is
> > > > > > > no reasonable common configuration for different thermal governors I
> > > > > > > guess we'll have to target a commonly used governor and systems
> > > > > > > using other 'incompatible' governors need to override the config in
> > > > > > > their <board>.dtsi.
> > > > > >
> > > >
> > > > Thanks for the elaborate testing and for sharing the numbers. This is
> > > > very useful information.
> > > >
> > > > > > On my system I don't see a significant delta in core temperatures for
> > > > > > 'threshold' temperatures of 80, 85 or 90°C. However Dhrystone
> > > > > > performance goes up by ~8% when changing the trip point from 80 to
> > > > > > 85°C. For a switch from 85 to 90°C I see a ~2% performance delta. For
> > > > > > all trip points the average core temperatures are ~80°C (silver) and
> > > > > > ~85°C (gold). Interestingly I observed the highest average
> > > > > > temperatures with the trip point at 80°C (repeated measurements were
> > > > > > taken for different temperatures).
> > > > > >
> > > > > > Supposedly LMH throttling is disabled in the firmware I used for
> > > > > > these tests, however data suggests that it is still active
> > > > > > (temperature doesn't rise beyond 95°C, even without throttling in
> > > > > > Linux; Dhrystone performance drops when raising the temperature beyond
> > > > > > 95°C with a heat gun. I will do some more testing when I get my hands
> > > > > > on a FW that effectively disables LMH (or raises the threshold to
> > > > > > something like 105°C).
> > > > > >
> > > > > > From the data collected so far I'd suggest a 'threshold' temperature
> > > > > > of 90°C or if that seems to high 85°C. Behavior might be different
> > > > > > with other thermal governors or without LMH throttling..
> > > > >
> > > > > Some more data from measurements with different trips points, for the
> > > > > IPA and the Fair Share governors, LMH throttling was enabled:
> > > > >
> > > > >                         IPA
> > > > >         Dhrystone       Temp Silver     Temp Gold
> > > > > 75      6M              78.4            84.9
> > > > > 80      6.21M           81.4            89.8
> > > > > 85      6.74M           81.7            88.2
> > > > > 90      6.88M           79.4            84.6
> > > > >
> > > > >                         Fair Share
> > > > >         Dhrystone       Temp Silver     Temp Gold
> > > > > 75      6.63M           80.1            88.5
> > > > > 80      6.71M           80.1            88.5
> > > > > 85      6.77M           81.1            87.8
> > > > > 90      7.12M           81.2            87.8
> > > >
> > > > Interesting that you get more MIPs out of fair share governor when
> > > > compared to IPA across the board. What devices were providing energy
> > > > cost information (dynamic-power-coefficient) to the IPA engine? Just
> > > > CPU and GPU? Can you point me to those patches in gerrit?
> > >
> > > Only the CPUs provide energy cost information, the GPU isn't fully
> > > hooked up in our tree yet. The cause of the delta could be that for
> > > temperatures < 'target' Fair Share only uses the performance states
> > > specified in 'threshold' for throttling (currently only the boost
> > > frequency), while IPA may use the full range of states  of the
> > > 'target' trip point.
> >
> > I saw that in v4 you allow all performance state to be used for
> 
> Yes, I found during my testing that I got better convergence at the
> threshold temperature when I allowed the entire range of operating
> points with almost no decrease in MIPs.
> 
> > throttling at the 'threshold' temperature. With this configuration
> > I get:
> >
> >                Dhrystone      Temp Silver    Temp Gold
> >
> > Fair Share     7.29M          81.4           87.7
> >
> > IPA            7.14M          81.7           88.3
> >
> >
> > I have no good sense why we are seeing more MIPs for IPA than with the
> > previous configuration. As for earlier tests the values are the
> > average from 4 runs.
> 
> I suspect that EAS task placement might be at work here. Were your
> test threads locked to CPUs or were they free to be scheduled around?

The tasks where pinned to CPUs.

It could also be that my previous tests just hit a series of lower
performance runs, there is some level of fluctuation. Temperatures
for both clusters were also lower than with lower 'threshold'
temperatures, which a priori isn't expected.

> > In any case it seems like a reasonable default configuration with the
> > data we have at this point.
> 
> Thanks for your thorough reviews to get this point. Will you send out
> the patches to add support for IPA at some point?

I currently don't have plans to work on this personally, I believe
Quentin Perret has this on his radar. Quentin, please correct me if
I'm wrong, I'm not intending to 'volunteer' you ;-)
diff mbox series

Patch

diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi
index c27cbd3bcb0a..878f661d16eb 100644
--- a/arch/arm64/boot/dts/qcom/sdm845.dtsi
+++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi
@@ -13,6 +13,7 @@ 
 #include <dt-bindings/reset/qcom,sdm845-aoss.h>
 #include <dt-bindings/soc/qcom,rpmh-rsc.h>
 #include <dt-bindings/clock/qcom,gcc-sdm845.h>
+#include <dt-bindings/thermal/thermal.h>
 
 / {
 	interrupt-parent = <&intc>;
@@ -99,6 +100,7 @@ 
 			compatible = "qcom,kryo385";
 			reg = <0x0 0x0>;
 			enable-method = "psci";
+			#cooling-cells = <2>;
 			next-level-cache = <&L2_0>;
 			L2_0: l2-cache {
 				compatible = "cache";
@@ -114,6 +116,7 @@ 
 			compatible = "qcom,kryo385";
 			reg = <0x0 0x100>;
 			enable-method = "psci";
+			#cooling-cells = <2>;
 			next-level-cache = <&L2_100>;
 			L2_100: l2-cache {
 				compatible = "cache";
@@ -126,6 +129,7 @@ 
 			compatible = "qcom,kryo385";
 			reg = <0x0 0x200>;
 			enable-method = "psci";
+			#cooling-cells = <2>;
 			next-level-cache = <&L2_200>;
 			L2_200: l2-cache {
 				compatible = "cache";
@@ -138,6 +142,7 @@ 
 			compatible = "qcom,kryo385";
 			reg = <0x0 0x300>;
 			enable-method = "psci";
+			#cooling-cells = <2>;
 			next-level-cache = <&L2_300>;
 			L2_300: l2-cache {
 				compatible = "cache";
@@ -150,6 +155,7 @@ 
 			compatible = "qcom,kryo385";
 			reg = <0x0 0x400>;
 			enable-method = "psci";
+			#cooling-cells = <2>;
 			next-level-cache = <&L2_400>;
 			L2_400: l2-cache {
 				compatible = "cache";
@@ -162,6 +168,7 @@ 
 			compatible = "qcom,kryo385";
 			reg = <0x0 0x500>;
 			enable-method = "psci";
+			#cooling-cells = <2>;
 			next-level-cache = <&L2_500>;
 			L2_500: l2-cache {
 				compatible = "cache";
@@ -174,6 +181,7 @@ 
 			compatible = "qcom,kryo385";
 			reg = <0x0 0x600>;
 			enable-method = "psci";
+			#cooling-cells = <2>;
 			next-level-cache = <&L2_600>;
 			L2_600: l2-cache {
 				compatible = "cache";
@@ -186,6 +194,7 @@ 
 			compatible = "qcom,kryo385";
 			reg = <0x0 0x700>;
 			enable-method = "psci";
+			#cooling-cells = <2>;
 			next-level-cache = <&L2_700>;
 			L2_700: l2-cache {
 				compatible = "cache";
@@ -1691,18 +1700,41 @@ 
 			thermal-sensors = <&tsens0 1>;
 
 			trips {
-				cpu_alert0: trip0 {
+				cpu0_alert1: trip-point@0 {
 					temperature = <75000>;
 					hysteresis = <2000>;
 					type = "passive";
 				};
 
-				cpu_crit0: trip1 {
+				cpu0_alert0: trip-point@1 {
+					temperature = <95000>;
+					hysteresis = <2000>;
+					type = "passive";
+				};
+
+				cpu0_crit: cpu_crit {
 					temperature = <110000>;
 					hysteresis = <1000>;
 					type = "critical";
 				};
 			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu0_alert0>;
+					cooling-device = <&CPU0 THERMAL_NO_LIMIT 1>,
+							 <&CPU1 THERMAL_NO_LIMIT 1>,
+							 <&CPU2 THERMAL_NO_LIMIT 1>,
+							 <&CPU3 THERMAL_NO_LIMIT 1>;
+				};
+				map1 {
+					trip = <&cpu0_alert1>;
+					cooling-device = <&CPU0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU1 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU2 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU3 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+				};
+			};
 		};
 
 		cpu1-thermal {
@@ -1712,18 +1744,41 @@ 
 			thermal-sensors = <&tsens0 2>;
 
 			trips {
-				cpu_alert1: trip0 {
+				cpu1_alert0: trip-point@0 {
 					temperature = <75000>;
 					hysteresis = <2000>;
 					type = "passive";
 				};
 
-				cpu_crit1: trip1 {
+				cpu1_alert1: trip-point@1 {
+					temperature = <95000>;
+					hysteresis = <2000>;
+					type = "passive";
+				};
+
+				cpu1_crit: cpu_crit {
 					temperature = <110000>;
 					hysteresis = <1000>;
 					type = "critical";
 				};
 			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu1_alert0>;
+					cooling-device = <&CPU0 THERMAL_NO_LIMIT 1>,
+							 <&CPU1 THERMAL_NO_LIMIT 1>,
+							 <&CPU2 THERMAL_NO_LIMIT 1>,
+							 <&CPU3 THERMAL_NO_LIMIT 1>;
+				};
+				map1 {
+					trip = <&cpu1_alert1>;
+					cooling-device = <&CPU0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU1 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU2 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU3 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+				};
+			};
 		};
 
 		cpu2-thermal {
@@ -1733,18 +1788,41 @@ 
 			thermal-sensors = <&tsens0 3>;
 
 			trips {
-				cpu_alert2: trip0 {
+				cpu2_alert0: trip-point@0 {
 					temperature = <75000>;
 					hysteresis = <2000>;
 					type = "passive";
 				};
 
-				cpu_crit2: trip1 {
+				cpu2_alert1: trip-point@1 {
+					temperature = <95000>;
+					hysteresis = <2000>;
+					type = "passive";
+				};
+
+				cpu2_crit: cpu_crit {
 					temperature = <110000>;
 					hysteresis = <1000>;
 					type = "critical";
 				};
 			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu2_alert0>;
+					cooling-device = <&CPU0 THERMAL_NO_LIMIT 1>,
+							 <&CPU1 THERMAL_NO_LIMIT 1>,
+							 <&CPU2 THERMAL_NO_LIMIT 1>,
+							 <&CPU3 THERMAL_NO_LIMIT 1>;
+				};
+				map1 {
+					trip = <&cpu2_alert1>;
+					cooling-device = <&CPU0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU1 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU2 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU3 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+				};
+			};
 		};
 
 		cpu3-thermal {
@@ -1754,18 +1832,41 @@ 
 			thermal-sensors = <&tsens0 4>;
 
 			trips {
-				cpu_alert3: trip0 {
+				cpu3_alert0: trip-point@0 {
 					temperature = <75000>;
 					hysteresis = <2000>;
 					type = "passive";
 				};
 
-				cpu_crit3: trip1 {
+				cpu3_alert1: trip-point@1 {
+					temperature = <95000>;
+					hysteresis = <2000>;
+					type = "passive";
+				};
+
+				cpu3_crit: cpu_crit {
 					temperature = <110000>;
 					hysteresis = <1000>;
 					type = "critical";
 				};
 			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu3_alert0>;
+					cooling-device = <&CPU0 THERMAL_NO_LIMIT 1>,
+							 <&CPU1 THERMAL_NO_LIMIT 1>,
+							 <&CPU2 THERMAL_NO_LIMIT 1>,
+							 <&CPU3 THERMAL_NO_LIMIT 1>;
+				};
+				map1 {
+					trip = <&cpu3_alert1>;
+					cooling-device = <&CPU0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU1 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU2 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU3 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+				};
+			};
 		};
 
 		cpu4-thermal {
@@ -1775,18 +1876,41 @@ 
 			thermal-sensors = <&tsens0 7>;
 
 			trips {
-				cpu_alert4: trip0 {
+				cpu4_alert0: trip-point@0 {
 					temperature = <75000>;
 					hysteresis = <2000>;
 					type = "passive";
 				};
 
-				cpu_crit4: trip1 {
+				cpu4_alert1: trip-point@1 {
+					temperature = <95000>;
+					hysteresis = <2000>;
+					type = "passive";
+				};
+
+				cpu4_crit: cpu_crit {
 					temperature = <110000>;
 					hysteresis = <1000>;
 					type = "critical";
 				};
 			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu4_alert0>;
+					cooling-device = <&CPU4 THERMAL_NO_LIMIT 1>,
+							 <&CPU5 THERMAL_NO_LIMIT 1>,
+							 <&CPU6 THERMAL_NO_LIMIT 1>,
+							 <&CPU7 THERMAL_NO_LIMIT 1>;
+				};
+				map1 {
+					trip = <&cpu4_alert1>;
+					cooling-device = <&CPU4 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU5 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU6 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU7 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+				};
+			};
 		};
 
 		cpu5-thermal {
@@ -1796,18 +1920,41 @@ 
 			thermal-sensors = <&tsens0 8>;
 
 			trips {
-				cpu_alert5: trip0 {
+				cpu5_alert0: trip-point@0 {
 					temperature = <75000>;
 					hysteresis = <2000>;
 					type = "passive";
 				};
 
-				cpu_crit5: trip1 {
+				cpu5_alert1: trip-point@1 {
+					temperature = <95000>;
+					hysteresis = <2000>;
+					type = "passive";
+				};
+
+				cpu5_crit: cpu_crit {
 					temperature = <110000>;
 					hysteresis = <1000>;
 					type = "critical";
 				};
 			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu5_alert0>;
+					cooling-device = <&CPU4 THERMAL_NO_LIMIT 1>,
+							 <&CPU5 THERMAL_NO_LIMIT 1>,
+							 <&CPU6 THERMAL_NO_LIMIT 1>,
+							 <&CPU7 THERMAL_NO_LIMIT 1>;
+				};
+				map1 {
+					trip = <&cpu5_alert1>;
+					cooling-device = <&CPU4 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU5 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU6 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU7 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+				};
+			};
 		};
 
 		cpu6-thermal {
@@ -1817,18 +1964,41 @@ 
 			thermal-sensors = <&tsens0 9>;
 
 			trips {
-				cpu_alert6: trip0 {
+				cpu6_alert0: trip-point@0 {
 					temperature = <75000>;
 					hysteresis = <2000>;
 					type = "passive";
 				};
 
-				cpu_crit6: trip1 {
+				cpu6_alert1: trip-point@1 {
+					temperature = <95000>;
+					hysteresis = <2000>;
+					type = "passive";
+				};
+
+				cpu6_crit: cpu_crit {
 					temperature = <110000>;
 					hysteresis = <1000>;
 					type = "critical";
 				};
 			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu6_alert0>;
+					cooling-device = <&CPU4 THERMAL_NO_LIMIT 1>,
+							 <&CPU5 THERMAL_NO_LIMIT 1>,
+							 <&CPU6 THERMAL_NO_LIMIT 1>,
+							 <&CPU7 THERMAL_NO_LIMIT 1>;
+				};
+				map1 {
+					trip = <&cpu6_alert1>;
+					cooling-device = <&CPU4 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU5 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU6 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU7 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+				};
+			};
 		};
 
 		cpu7-thermal {
@@ -1838,18 +2008,41 @@ 
 			thermal-sensors = <&tsens0 10>;
 
 			trips {
-				cpu_alert7: trip0 {
+				cpu7_alert0: trip-point@0 {
 					temperature = <75000>;
 					hysteresis = <2000>;
 					type = "passive";
 				};
 
-				cpu_crit7: trip1 {
+				cpu7_alert1: trip-point@1 {
+					temperature = <95000>;
+					hysteresis = <2000>;
+					type = "passive";
+				};
+
+				cpu7_crit: cpu_crit {
 					temperature = <110000>;
 					hysteresis = <1000>;
 					type = "critical";
 				};
 			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu7_alert0>;
+					cooling-device = <&CPU4 THERMAL_NO_LIMIT 1>,
+							 <&CPU5 THERMAL_NO_LIMIT 1>,
+							 <&CPU6 THERMAL_NO_LIMIT 1>,
+							 <&CPU7 THERMAL_NO_LIMIT 1>;
+				};
+				map1 {
+					trip = <&cpu7_alert1>;
+					cooling-device = <&CPU4 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU5 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU6 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>,
+							 <&CPU7 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+				};
+			};
 		};
 	};
 };