diff mbox series

[v3,RESEND] thermal: qoriq: Only enable sites that actually exist

Message ID 7115709.31r3eYUQgx@pliszka (mailing list archive)
State New, archived
Delegated to: Daniel Lezcano
Headers show
Series [v3,RESEND] thermal: qoriq: Only enable sites that actually exist | expand

Commit Message

Sebastian Krzyszkowiak Sept. 27, 2022, 6:15 a.m. UTC
On i.MX8MQ, enabling monitoring sites that aren't connected to anything
can cause unwanted side effects on some units. This seems to happen
once some of these sites report out-of-range readings and results in
sensor misbehavior, such as thermal zone readings getting stuck or even
suddenly reporting an impossibly high value, triggering emergency
shutdowns.

The datasheet lists all non-existent sites as "reserved" and doesn't
make any guarantees about being able to enable them at all, so let's
not do that. Instead, iterate over sensor DT nodes and only enable
monitoring sites that are specified there prior to registering their
thermal zones. This still fixes the issue with bogus data being
reported on the first reading, but doesn't introduce problems that
come with reading from non-existent sites.

Fixes: 45038e03d633 ("thermal: qoriq: Enable all sensors before registering them")
Cc: stable@vger.kernel.org
Signed-off-by: Sebastian Krzyszkowiak <sebastian.krzyszkowiak@puri.sm>
---
Resent <20220321170852.654094-1-sebastian.krzyszkowiak@puri.sm>
v3: add cc: stable
v2: augment the commit message with details on what the patch is doing
---
 drivers/thermal/qoriq_thermal.c | 63 ++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 20 deletions(-)

Comments

Daniel Lezcano Sept. 27, 2022, 8:34 a.m. UTC | #1
Hi Sebastian,

On 27/09/2022 08:15, Sebastian Krzyszkowiak wrote:
> On i.MX8MQ, enabling monitoring sites that aren't connected to anything
> can cause unwanted side effects on some units. This seems to happen
> once some of these sites report out-of-range readings and results in
> sensor misbehavior, such as thermal zone readings getting stuck or even
> suddenly reporting an impossibly high value, triggering emergency
> shutdowns.
> 
> The datasheet lists all non-existent sites as "reserved" and doesn't
> make any guarantees about being able to enable them at all, so let's
> not do that. Instead, iterate over sensor DT nodes and only enable
> monitoring sites that are specified there prior to registering their
> thermal zones. This still fixes the issue with bogus data being
> reported on the first reading, but doesn't introduce problems that
> come with reading from non-existent sites.

Can you have a look at these patches:

https://git.kernel.org/pub/scm/linux/kernel/git/thermal/linux.git/commit/?h=thermal/linux-next&id=ab2266ecaa3254811f9f83992cf53fdfe3c62c86

and

https://git.kernel.org/pub/scm/linux/kernel/git/thermal/linux.git/commit/?h=thermal/linux-next&id=7be4288625df54887b444991d743c6e1af21e27a

Thanks
   -- Daniel

> Fixes: 45038e03d633 ("thermal: qoriq: Enable all sensors before registering them")
> Cc: stable@vger.kernel.org
> Signed-off-by: Sebastian Krzyszkowiak <sebastian.krzyszkowiak@puri.sm>
> ---
> Resent <20220321170852.654094-1-sebastian.krzyszkowiak@puri.sm>
> v3: add cc: stable
> v2: augment the commit message with details on what the patch is doing
> ---
>   drivers/thermal/qoriq_thermal.c | 63 ++++++++++++++++++++++-----------
>   1 file changed, 43 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c
> index 73049f9bea25..ef0848849ee2 100644
> --- a/drivers/thermal/qoriq_thermal.c
> +++ b/drivers/thermal/qoriq_thermal.c
> @@ -32,7 +32,6 @@
>   #define TMR_DISABLE	0x0
>   #define TMR_ME		0x80000000
>   #define TMR_ALPF	0x0c000000
> -#define TMR_MSITE_ALL	GENMASK(15, 0)
>   
>   #define REGS_TMTMIR	0x008	/* Temperature measurement interval Register */
>   #define TMTMIR_DEFAULT	0x0000000f
> @@ -129,33 +128,51 @@ static const struct thermal_zone_of_device_ops tmu_tz_ops = {
>   static int qoriq_tmu_register_tmu_zone(struct device *dev,
>   				       struct qoriq_tmu_data *qdata)
>   {
> -	int id;
> +	int ret = 0;
> +	struct device_node *np, *child, *sensor_np;
>   
> -	if (qdata->ver == TMU_VER1) {
> -		regmap_write(qdata->regmap, REGS_TMR,
> -			     TMR_MSITE_ALL | TMR_ME | TMR_ALPF);
> -	} else {
> -		regmap_write(qdata->regmap, REGS_V2_TMSR, TMR_MSITE_ALL);
> -		regmap_write(qdata->regmap, REGS_TMR, TMR_ME | TMR_ALPF_V2);
> -	}
> +	np = of_find_node_by_name(NULL, "thermal-zones");
> +	if (!np)
> +		return -ENODEV;
> +
> +	sensor_np = of_node_get(dev->of_node);
>   
> -	for (id = 0; id < SITES_MAX; id++) {
> +	for_each_available_child_of_node(np, child) {
>   		struct thermal_zone_device *tzd;
> -		struct qoriq_sensor *sensor = &qdata->sensor[id];
> -		int ret;
> +		struct qoriq_sensor *sensor;
> +		int id, site;
> +
> +		ret = thermal_zone_of_get_sensor_id(child, sensor_np, &id);
> +
> +		if (ret < 0) {
> +			dev_err(dev, "failed to get valid sensor id: %d\n", ret);
> +			of_node_put(child);
> +			break;
> +		}
>   
> +		sensor = &qdata->sensor[id];
>   		sensor->id = id;
>   
> +		/* Enable monitoring */
> +		if (qdata->ver == TMU_VER1) {
> +			site = 0x1 << (15 - id);
> +			regmap_update_bits(qdata->regmap, REGS_TMR,
> +					   site | TMR_ME | TMR_ALPF,
> +					   site | TMR_ME | TMR_ALPF);
> +		} else {
> +			site = 0x1 << id;
> +			regmap_update_bits(qdata->regmap, REGS_V2_TMSR, site, site);
> +			regmap_write(qdata->regmap, REGS_TMR, TMR_ME | TMR_ALPF_V2);
> +		}
> +
>   		tzd = devm_thermal_zone_of_sensor_register(dev, id,
>   							   sensor,
>   							   &tmu_tz_ops);
> -		ret = PTR_ERR_OR_ZERO(tzd);
> -		if (ret) {
> -			if (ret == -ENODEV)
> -				continue;
> -
> -			regmap_write(qdata->regmap, REGS_TMR, TMR_DISABLE);
> -			return ret;
> +		if (IS_ERR(tzd)) {
> +			ret = PTR_ERR(tzd);
> +			dev_err(dev, "failed to register thermal zone: %d\n", ret);
> +			of_node_put(child);
> +			break;
>   		}
>   
>   		if (devm_thermal_add_hwmon_sysfs(tzd))
> @@ -164,7 +181,13 @@ static int qoriq_tmu_register_tmu_zone(struct device *dev,
>   
>   	}
>   
> -	return 0;
> +	of_node_put(sensor_np);
> +	of_node_put(np);
> +
> +	if (ret)
> +		regmap_write(qdata->regmap, REGS_TMR, TMR_DISABLE);
> +
> +	return ret;
>   }
>   
>   static int qoriq_tmu_calibration(struct device *dev,
Sebastian Krzyszkowiak Sept. 28, 2022, 2:05 p.m. UTC | #2
On wtorek, 27 września 2022 10:34:00 CEST Daniel Lezcano wrote:
> Hi Sebastian,
> 
> On 27/09/2022 08:15, Sebastian Krzyszkowiak wrote:
> > On i.MX8MQ, enabling monitoring sites that aren't connected to anything
> > can cause unwanted side effects on some units. This seems to happen
> > once some of these sites report out-of-range readings and results in
> > sensor misbehavior, such as thermal zone readings getting stuck or even
> > suddenly reporting an impossibly high value, triggering emergency
> > shutdowns.
> > 
> > The datasheet lists all non-existent sites as "reserved" and doesn't
> > make any guarantees about being able to enable them at all, so let's
> > not do that. Instead, iterate over sensor DT nodes and only enable
> > monitoring sites that are specified there prior to registering their
> > thermal zones. This still fixes the issue with bogus data being
> > reported on the first reading, but doesn't introduce problems that
> > come with reading from non-existent sites.
> 
> Can you have a look at these patches:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/thermal/linux.git/commit/?h=thermal/linux-next&id=ab2266ecaa3254811f9f83992cf53fdfe3c62c86
> 
> and
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/thermal/linux.git/commit/?h=thermal/linux-next&id=7be4288625df54887b444991d743c6e1af21e27a
> 
> Thanks
>    -- Daniel


Hi Daniel,

I'm not sure if that's a good idea. qoriq-thermal has used 
thermal_zone_of_get_sensor_id up until 45038e03d633, which was a change meant 
to fix a bug with bogus data being present on the first report (as zones were 
being registered before their monitoring was being enabled), but it was done 
in a problematic way that introduced erratic behavior. The only ways to fix 
this regression that I see are either to make the driver aware of which zones 
are present on a particular platform (like my patch does by using 
thermal_zone_of_get_sensor_id again), or to have some way to attempt 
registering a thermal zone that isn't necessarily ready to report data yet.

Looking at device trees where qoriq-thermal is used, it seems like zone 
configuration is pretty diverse across SoCs:

arm/ls1021a.dtsi: 0 cpu_thermal
arm64/fsl-ls1012a.dtsi: 0 cpu_thermal
arm64/fsl-ls1028a.dtsi: 0 ddr_controller 1 core_cluster
arm64/fsl-ls1043a.dtsi: 0 ddr_controller 1 serdes 2 fman 3 core-cluster 4 sec
arm64/fsl-ls1046a.dtsi: 0 ddr_controller 1 serdes 2 fman 3 core-cluster 4 sec
arm64/fsl-ls1088a.dtsi: 0 core-cluster 1 soc
arm64/fsl-ls208xa.dtsi: 1 ddr-controller1 2 ddr-controller2 3 ddr-controller3 
4 core-cluster1 5 core-cluster2 6 core-cluster3 7 core-cluster4
arm64/fsl-lx2160a.dtsi: 0 cluster6-7 1 ddr-cluster5 2 wriop 3 dce-qbman-hsio2 
4 ccn-dpaa-tbu 5 cluster4-hsio3 6 cluster2-3
arm64/imx8mq.dtsi: 0 cpu-thermal 1 gpu-thermal 2 vpu-thermal
powerpc/t1023si-post.dtsi: 0 cpu_thermal
powerpc/t1040si-post.dtsi: 2 cpu_thermal

I haven't checked dts files where those get included, but I believe it's rather 
unlikely that any additional zones are defined there.

Do you mean that this should all go as platform data into the driver? If so, 
should all the calibration data that's currently in device trees go there as 
well? (if not, why not?)

Cheers,
Sebastian

> > Fixes: 45038e03d633 ("thermal: qoriq: Enable all sensors before registering them")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Sebastian Krzyszkowiak <sebastian.krzyszkowiak@puri.sm>
> > ---
> > Resent <20220321170852.654094-1-sebastian.krzyszkowiak@puri.sm>
> > v3: add cc: stable
> > v2: augment the commit message with details on what the patch is doing
> > ---
> > 
> >   drivers/thermal/qoriq_thermal.c | 63 ++++++++++++++++++++++-----------
> >   1 file changed, 43 insertions(+), 20 deletions(-)
> > 
> > diff --git a/drivers/thermal/qoriq_thermal.c
> > b/drivers/thermal/qoriq_thermal.c index 73049f9bea25..ef0848849ee2 100644
> > --- a/drivers/thermal/qoriq_thermal.c
> > +++ b/drivers/thermal/qoriq_thermal.c
> > @@ -32,7 +32,6 @@
> > 
> >   #define TMR_DISABLE	0x0
> >   #define TMR_ME		0x80000000
> >   #define TMR_ALPF	0x0c000000
> > 
> > -#define TMR_MSITE_ALL	GENMASK(15, 0)
> > 
> >   #define REGS_TMTMIR	0x008	/* Temperature measurement interval 
Register
> >   */
> >   #define TMTMIR_DEFAULT	0x0000000f
> > 
> > @@ -129,33 +128,51 @@ static const struct thermal_zone_of_device_ops
> > tmu_tz_ops = {> 
> >   static int qoriq_tmu_register_tmu_zone(struct device *dev,
> >   
> >   				       struct qoriq_tmu_data 
*qdata)
> >   
> >   {
> > 
> > -	int id;
> > +	int ret = 0;
> > +	struct device_node *np, *child, *sensor_np;
> > 
> > -	if (qdata->ver == TMU_VER1) {
> > -		regmap_write(qdata->regmap, REGS_TMR,
> > -			     TMR_MSITE_ALL | TMR_ME | TMR_ALPF);
> > -	} else {
> > -		regmap_write(qdata->regmap, REGS_V2_TMSR, 
TMR_MSITE_ALL);
> > -		regmap_write(qdata->regmap, REGS_TMR, TMR_ME | 
TMR_ALPF_V2);
> > -	}
> > +	np = of_find_node_by_name(NULL, "thermal-zones");
> > +	if (!np)
> > +		return -ENODEV;
> > +
> > +	sensor_np = of_node_get(dev->of_node);
> > 
> > -	for (id = 0; id < SITES_MAX; id++) {
> > +	for_each_available_child_of_node(np, child) {
> > 
> >   		struct thermal_zone_device *tzd;
> > 
> > -		struct qoriq_sensor *sensor = &qdata->sensor[id];
> > -		int ret;
> > +		struct qoriq_sensor *sensor;
> > +		int id, site;
> > +
> > +		ret = thermal_zone_of_get_sensor_id(child, sensor_np, 
&id);
> > +
> > +		if (ret < 0) {
> > +			dev_err(dev, "failed to get valid sensor id: 
%d\n", ret);
> > +			of_node_put(child);
> > +			break;
> > +		}
> > 
> > +		sensor = &qdata->sensor[id];
> > 
> >   		sensor->id = id;
> > 
> > +		/* Enable monitoring */
> > +		if (qdata->ver == TMU_VER1) {
> > +			site = 0x1 << (15 - id);
> > +			regmap_update_bits(qdata->regmap, REGS_TMR,
> > +					   site | TMR_ME | 
TMR_ALPF,
> > +					   site | TMR_ME | 
TMR_ALPF);
> > +		} else {
> > +			site = 0x1 << id;
> > +			regmap_update_bits(qdata->regmap, 
REGS_V2_TMSR, site, site);
> > +			regmap_write(qdata->regmap, REGS_TMR, TMR_ME 
| TMR_ALPF_V2);
> > +		}
> > +
> > 
> >   		tzd = devm_thermal_zone_of_sensor_register(dev, id,
> >   		
> >   							
   sensor,
> >   							
   &tmu_tz_ops);
> > 
> > -		ret = PTR_ERR_OR_ZERO(tzd);
> > -		if (ret) {
> > -			if (ret == -ENODEV)
> > -				continue;
> > -
> > -			regmap_write(qdata->regmap, REGS_TMR, 
TMR_DISABLE);
> > -			return ret;
> > +		if (IS_ERR(tzd)) {
> > +			ret = PTR_ERR(tzd);
> > +			dev_err(dev, "failed to register thermal 
zone: %d\n", ret);
> > +			of_node_put(child);
> > +			break;
> > 
> >   		}
> >   		
> >   		if (devm_thermal_add_hwmon_sysfs(tzd))
> > 
> > @@ -164,7 +181,13 @@ static int qoriq_tmu_register_tmu_zone(struct device
> > *dev,> 
> >   	}
> > 
> > -	return 0;
> > +	of_node_put(sensor_np);
> > +	of_node_put(np);
> > +
> > +	if (ret)
> > +		regmap_write(qdata->regmap, REGS_TMR, TMR_DISABLE);
> > +
> > +	return ret;
> > 
> >   }
> >   
> >   static int qoriq_tmu_calibration(struct device *dev,
diff mbox series

Patch

diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c
index 73049f9bea25..ef0848849ee2 100644
--- a/drivers/thermal/qoriq_thermal.c
+++ b/drivers/thermal/qoriq_thermal.c
@@ -32,7 +32,6 @@ 
 #define TMR_DISABLE	0x0
 #define TMR_ME		0x80000000
 #define TMR_ALPF	0x0c000000
-#define TMR_MSITE_ALL	GENMASK(15, 0)
 
 #define REGS_TMTMIR	0x008	/* Temperature measurement interval Register */
 #define TMTMIR_DEFAULT	0x0000000f
@@ -129,33 +128,51 @@  static const struct thermal_zone_of_device_ops tmu_tz_ops = {
 static int qoriq_tmu_register_tmu_zone(struct device *dev,
 				       struct qoriq_tmu_data *qdata)
 {
-	int id;
+	int ret = 0;
+	struct device_node *np, *child, *sensor_np;
 
-	if (qdata->ver == TMU_VER1) {
-		regmap_write(qdata->regmap, REGS_TMR,
-			     TMR_MSITE_ALL | TMR_ME | TMR_ALPF);
-	} else {
-		regmap_write(qdata->regmap, REGS_V2_TMSR, TMR_MSITE_ALL);
-		regmap_write(qdata->regmap, REGS_TMR, TMR_ME | TMR_ALPF_V2);
-	}
+	np = of_find_node_by_name(NULL, "thermal-zones");
+	if (!np)
+		return -ENODEV;
+
+	sensor_np = of_node_get(dev->of_node);
 
-	for (id = 0; id < SITES_MAX; id++) {
+	for_each_available_child_of_node(np, child) {
 		struct thermal_zone_device *tzd;
-		struct qoriq_sensor *sensor = &qdata->sensor[id];
-		int ret;
+		struct qoriq_sensor *sensor;
+		int id, site;
+
+		ret = thermal_zone_of_get_sensor_id(child, sensor_np, &id);
+
+		if (ret < 0) {
+			dev_err(dev, "failed to get valid sensor id: %d\n", ret);
+			of_node_put(child);
+			break;
+		}
 
+		sensor = &qdata->sensor[id];
 		sensor->id = id;
 
+		/* Enable monitoring */
+		if (qdata->ver == TMU_VER1) {
+			site = 0x1 << (15 - id);
+			regmap_update_bits(qdata->regmap, REGS_TMR,
+					   site | TMR_ME | TMR_ALPF,
+					   site | TMR_ME | TMR_ALPF);
+		} else {
+			site = 0x1 << id;
+			regmap_update_bits(qdata->regmap, REGS_V2_TMSR, site, site);
+			regmap_write(qdata->regmap, REGS_TMR, TMR_ME | TMR_ALPF_V2);
+		}
+
 		tzd = devm_thermal_zone_of_sensor_register(dev, id,
 							   sensor,
 							   &tmu_tz_ops);
-		ret = PTR_ERR_OR_ZERO(tzd);
-		if (ret) {
-			if (ret == -ENODEV)
-				continue;
-
-			regmap_write(qdata->regmap, REGS_TMR, TMR_DISABLE);
-			return ret;
+		if (IS_ERR(tzd)) {
+			ret = PTR_ERR(tzd);
+			dev_err(dev, "failed to register thermal zone: %d\n", ret);
+			of_node_put(child);
+			break;
 		}
 
 		if (devm_thermal_add_hwmon_sysfs(tzd))
@@ -164,7 +181,13 @@  static int qoriq_tmu_register_tmu_zone(struct device *dev,
 
 	}
 
-	return 0;
+	of_node_put(sensor_np);
+	of_node_put(np);
+
+	if (ret)
+		regmap_write(qdata->regmap, REGS_TMR, TMR_DISABLE);
+
+	return ret;
 }
 
 static int qoriq_tmu_calibration(struct device *dev,