diff mbox series

[net-next,v1] net: phy: nxp-tja11xx: log critical health state

Message ID 20210810125618.20255-1-o.rempel@pengutronix.de (mailing list archive)
State Not Applicable
Headers show
Series [net-next,v1] net: phy: nxp-tja11xx: log critical health state | expand

Commit Message

Oleksij Rempel Aug. 10, 2021, 12:56 p.m. UTC
TJA1102 provides interrupt notification for the critical health states
like overtemperature and undervoltage.

The overtemperature bit is set if package temperature is beyond 155C°.
This functionality was tested by heating the package up to 200C°

The undervoltage bit is set if supply voltage drops beyond some critical
threshold. Currently not tested.

In a typical use case, both of this events should be logged and stored
(or send to some remote system) for further investigations.

To make testing and troubleshooting more easier, other bits are exported
over the ethtool phy stats interface:
- level of overtemperature. There is no access to thermal sensor.
- power supply line where undervoltage was detected

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
---
 drivers/net/phy/nxp-tja11xx.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

Comments

Andrew Lunn Aug. 10, 2021, 3:05 p.m. UTC | #1
Hi Oleksij

> @@ -89,6 +91,12 @@ static struct tja11xx_phy_stats tja11xx_hw_stats[] = {
>  	{ "phy_polarity_detect", 25, 6, BIT(6) },
>  	{ "phy_open_detect", 25, 7, BIT(7) },
>  	{ "phy_short_detect", 25, 8, BIT(8) },
> +	{ "phy_temp_warn (temp > 155C°)", 25, 9, BIT(9) },
> +	{ "phy_temp_high (temp > 180C°)", 25, 10, BIT(10) },
> +	{ "phy_uv_vddio", 25, 11, BIT(11) },
> +	{ "phy_uv_vddd_1v8", 25, 13, BIT(13) },
> +	{ "phy_uv_vdda_3v3", 25, 14, BIT(14) },
> +	{ "phy_uv_vddd_3v3", 25, 15, BIT(15) },
>  	{ "phy_rem_rcvr_count", 26, 0, GENMASK(7, 0) },
>  	{ "phy_loc_rcvr_count", 26, 8, GENMASK(15, 8) },

I'm not so happy abusing the statistic counters like this. Especially
when we have a better API for temperature and voltage: hwmon.

phy_temp_warn maps to hwmon_temp_max_alarm. phy_temp_high maps to
either hwmon_temp_crit_alarm or hwmon_temp_emergency_alarm.

The under voltage maps to hwmon_in_lcrit_alarm.

> @@ -630,6 +640,11 @@ static irqreturn_t tja11xx_handle_interrupt(struct phy_device *phydev)
>  		return IRQ_NONE;
>  	}
>  
> +	if (irq_status & MII_INTSRC_TEMP_ERR)
> +		dev_err(dev, "Overtemperature error detected (temp > 155C°).\n");
> +	if (irq_status & MII_INTSRC_UV_ERR)
> +		dev_err(dev, "Undervoltage error detected.\n");
> +

These are not actual errors, in the linux sense. So dev_warn() or
maybe dev_info().

      Andrew
Guenter Roeck Aug. 10, 2021, 6:18 p.m. UTC | #2
On 8/10/21 8:05 AM, Andrew Lunn wrote:
> Hi Oleksij
> 
>> @@ -89,6 +91,12 @@ static struct tja11xx_phy_stats tja11xx_hw_stats[] = {
>>   	{ "phy_polarity_detect", 25, 6, BIT(6) },
>>   	{ "phy_open_detect", 25, 7, BIT(7) },
>>   	{ "phy_short_detect", 25, 8, BIT(8) },
>> +	{ "phy_temp_warn (temp > 155C°)", 25, 9, BIT(9) },
>> +	{ "phy_temp_high (temp > 180C°)", 25, 10, BIT(10) },
>> +	{ "phy_uv_vddio", 25, 11, BIT(11) },
>> +	{ "phy_uv_vddd_1v8", 25, 13, BIT(13) },
>> +	{ "phy_uv_vdda_3v3", 25, 14, BIT(14) },
>> +	{ "phy_uv_vddd_3v3", 25, 15, BIT(15) },
>>   	{ "phy_rem_rcvr_count", 26, 0, GENMASK(7, 0) },
>>   	{ "phy_loc_rcvr_count", 26, 8, GENMASK(15, 8) },
> 
> I'm not so happy abusing the statistic counters like this. Especially
> when we have a better API for temperature and voltage: hwmon.
> 
> phy_temp_warn maps to hwmon_temp_max_alarm. phy_temp_high maps to
> either hwmon_temp_crit_alarm or hwmon_temp_emergency_alarm.
> 
> The under voltage maps to hwmon_in_lcrit_alarm.
> 

FWIW, the statistics counters in this driver are already abused
(phy_polarity_detect, phy_open_detect, phy_short_detect), so
I am not sure if adding more abuse makes a difference (and/or
if such abuse is common for phy drivers in general).

Guenter

>> @@ -630,6 +640,11 @@ static irqreturn_t tja11xx_handle_interrupt(struct phy_device *phydev)
>>   		return IRQ_NONE;
>>   	}
>>   
>> +	if (irq_status & MII_INTSRC_TEMP_ERR)
>> +		dev_err(dev, "Overtemperature error detected (temp > 155C°).\n");
>> +	if (irq_status & MII_INTSRC_UV_ERR)
>> +		dev_err(dev, "Undervoltage error detected.\n");
>> +
> 
> These are not actual errors, in the linux sense. So dev_warn() or
> maybe dev_info().
> 
>        Andrew
>
Andrew Lunn Aug. 10, 2021, 6:42 p.m. UTC | #3
> > I'm not so happy abusing the statistic counters like this. Especially
> > when we have a better API for temperature and voltage: hwmon.
> > 
> > phy_temp_warn maps to hwmon_temp_max_alarm. phy_temp_high maps to
> > either hwmon_temp_crit_alarm or hwmon_temp_emergency_alarm.
> > 
> > The under voltage maps to hwmon_in_lcrit_alarm.
> > 
> 
> FWIW, the statistics counters in this driver are already abused
> (phy_polarity_detect, phy_open_detect, phy_short_detect), so
> I am not sure if adding more abuse makes a difference (and/or
> if such abuse is common for phy drivers in general).

Hi Guenter

Abuse is not common in general. I think this is the only driver
abusing stats to return flags.  At the time those where added, we did
not have phy cable test support. Now we do, i would also suggest that
the driver makes use of that infrastructure to issue a cable test
report. These 'stats' need to stay, since they are ABI, but we should
not add more.

That is also why i said "Especially when we have a better API".

     Andrew
diff mbox series

Patch

diff --git a/drivers/net/phy/nxp-tja11xx.c b/drivers/net/phy/nxp-tja11xx.c
index afd7afa1f498..4c37b427a53b 100644
--- a/drivers/net/phy/nxp-tja11xx.c
+++ b/drivers/net/phy/nxp-tja11xx.c
@@ -47,12 +47,14 @@ 
 #define MII_INTSRC_LINK_FAIL		BIT(10)
 #define MII_INTSRC_LINK_UP		BIT(9)
 #define MII_INTSRC_MASK			(MII_INTSRC_LINK_FAIL | MII_INTSRC_LINK_UP)
-#define MII_INTSRC_TEMP_ERR		BIT(1)
 #define MII_INTSRC_UV_ERR		BIT(3)
+#define MII_INTSRC_TEMP_ERR		BIT(1)
 
 #define MII_INTEN			22
 #define MII_INTEN_LINK_FAIL		BIT(10)
 #define MII_INTEN_LINK_UP		BIT(9)
+#define MII_INTEN_UV_ERR		BIT(3)
+#define MII_INTEN_TEMP_ERR		BIT(1)
 
 #define MII_COMMSTAT			23
 #define MII_COMMSTAT_LINK_UP		BIT(15)
@@ -89,6 +91,12 @@  static struct tja11xx_phy_stats tja11xx_hw_stats[] = {
 	{ "phy_polarity_detect", 25, 6, BIT(6) },
 	{ "phy_open_detect", 25, 7, BIT(7) },
 	{ "phy_short_detect", 25, 8, BIT(8) },
+	{ "phy_temp_warn (temp > 155C°)", 25, 9, BIT(9) },
+	{ "phy_temp_high (temp > 180C°)", 25, 10, BIT(10) },
+	{ "phy_uv_vddio", 25, 11, BIT(11) },
+	{ "phy_uv_vddd_1v8", 25, 13, BIT(13) },
+	{ "phy_uv_vdda_3v3", 25, 14, BIT(14) },
+	{ "phy_uv_vddd_3v3", 25, 15, BIT(15) },
 	{ "phy_rem_rcvr_count", 26, 0, GENMASK(7, 0) },
 	{ "phy_loc_rcvr_count", 26, 8, GENMASK(15, 8) },
 };
@@ -607,7 +615,8 @@  static int tja11xx_config_intr(struct phy_device *phydev)
 		if (err)
 			return err;
 
-		value = MII_INTEN_LINK_FAIL | MII_INTEN_LINK_UP;
+		value = MII_INTEN_LINK_FAIL | MII_INTEN_LINK_UP |
+			MII_INTEN_UV_ERR | MII_INTEN_TEMP_ERR;
 		err = phy_write(phydev, MII_INTEN, value);
 	} else {
 		err = phy_write(phydev, MII_INTEN, value);
@@ -622,6 +631,7 @@  static int tja11xx_config_intr(struct phy_device *phydev)
 
 static irqreturn_t tja11xx_handle_interrupt(struct phy_device *phydev)
 {
+	struct device *dev = &phydev->mdio.dev;
 	int irq_status;
 
 	irq_status = phy_read(phydev, MII_INTSRC);
@@ -630,6 +640,11 @@  static irqreturn_t tja11xx_handle_interrupt(struct phy_device *phydev)
 		return IRQ_NONE;
 	}
 
+	if (irq_status & MII_INTSRC_TEMP_ERR)
+		dev_err(dev, "Overtemperature error detected (temp > 155C°).\n");
+	if (irq_status & MII_INTSRC_UV_ERR)
+		dev_err(dev, "Undervoltage error detected.\n");
+
 	if (!(irq_status & MII_INTSRC_MASK))
 		return IRQ_NONE;