diff mbox series

[net,2/2] net: usb: aqc111: Support for thermal throttling feature

Message ID 658f5fab466face2aed2d53dabfe3af16595019a.1544622414.git.igor.russkikh@aquantia.com (mailing list archive)
State New, archived
Headers show
Series aqc111: Thermal throttling feature | expand

Commit Message

Igor Russkikh Dec. 12, 2018, 1:50 p.m. UTC
From: Dmitry Bezrukov <dmitry.bezrukov@aquantia.com>

Lab testing shows that chip may get overheated when
network link is connected on high speed (2.5G/5G).

Default hardware design uses only passive heatsink without a cooler,
and that makes things worse.

To prevent possible chip damage here we add throttling mechanism.

There is a worker that monitors the PHY's temperature via reading
PHY registers. When PHY's temperature reaches the critical threshold
(it is 106 degrees of Celsius) it changes the link speed to the lowest
regardless user/default link speed settings. It should reduce the PHY's
temperature to normal numbers.

When the PHY's temparature is back to normal (low threshold, it is
85 degrees) it restores user/default link speed settings.

Signed-off-by: Dmitry Bezrukov <dmitry.bezrukov@aquantia.com>
Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
---
 drivers/net/usb/aqc111.c | 78 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/usb/aqc111.h |  8 +++++
 2 files changed, 86 insertions(+)

Comments

Andrew Lunn Dec. 12, 2018, 4:08 p.m. UTC | #1
On Wed, Dec 12, 2018 at 01:50:10PM +0000, Igor Russkikh wrote:
> From: Dmitry Bezrukov <dmitry.bezrukov@aquantia.com>
> 
> Lab testing shows that chip may get overheated when
> network link is connected on high speed (2.5G/5G).
> 
> Default hardware design uses only passive heatsink without a cooler,
> and that makes things worse.
> 
> To prevent possible chip damage here we add throttling mechanism.
> 
> There is a worker that monitors the PHY's temperature via reading
> PHY registers. When PHY's temperature reaches the critical threshold
> (it is 106 degrees of Celsius) it changes the link speed to the lowest
> regardless user/default link speed settings. It should reduce the PHY's
> temperature to normal numbers.
> 
> When the PHY's temparature is back to normal (low threshold, it is
> 85 degrees) it restores user/default link speed settings.

Hi Igor

Please could you also export the temperature using HWMON. The Marvell
PHY drivers are good examples.

I also have a patch for driver/net/phy/aquantia.c which adds HWMON
support to that PHY.

> 
> Signed-off-by: Dmitry Bezrukov <dmitry.bezrukov@aquantia.com>
> Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
> ---
>  drivers/net/usb/aqc111.c | 78 ++++++++++++++++++++++++++++++++++++++++
>  drivers/net/usb/aqc111.h |  8 +++++
>  2 files changed, 86 insertions(+)
> 
> diff --git a/drivers/net/usb/aqc111.c b/drivers/net/usb/aqc111.c
> index 8867f6a3eaa7..fa6b9dfc2a6f 100644
> --- a/drivers/net/usb/aqc111.c
> +++ b/drivers/net/usb/aqc111.c
> @@ -17,6 +17,7 @@
>  #include <linux/usb/cdc.h>
>  #include <linux/usb/usbnet.h>
>  #include <linux/linkmode.h>
> +#include <linux/workqueue.h>
>  
>  #include "aqc111.h"
>  
> @@ -334,6 +335,9 @@ static void aqc111_set_phy_speed(struct usbnet *dev, u8 autoneg, u16 speed)
>  	aqc111_data->phy_cfg |= (3 << AQ_DSH_RETRIES_SHIFT) &
>  				AQ_DSH_RETRIES_MASK;
>  
> +	if (aqc111_data->thermal_throttling)
> +		speed = SPEED_100;
> +

That is a big jump. Do you need to cool is down quickly, or would
1Gbps be sufficient? I think as a user, i would prefer it ping-pongs
between 5G and 1G, not 5G and 100M.

>  	if (autoneg == AUTONEG_ENABLE) {
>  		switch (speed) {
>  		case SPEED_5000:

It looks like this only works when auto-neg is enabled. If i've fixed
configured it i don't think this works?

> @@ -714,6 +718,8 @@ static int aqc111_bind(struct usbnet *dev, struct usb_interface *intf)
>  	/* store aqc111_data pointer in device data field */
>  	dev->driver_priv = aqc111_data;
>  
> +	aqc111_data->dev = dev;
> +
>  	/* Init the MAC address */
>  	ret = aqc111_read_perm_mac(dev);
>  	if (ret)
> @@ -991,6 +997,71 @@ static int aqc111_link_reset(struct usbnet *dev)
>  	return 0;
>  }
>  
> +int aqc111_get_temperature(struct usbnet *dev, u32 *temperature)
> +{
> +	u16 reg16 = 0;
> +
> +	if (aqc111_mdio_read(dev, AQ_GLB_THERMAL_STATUS2, AQ_PHY_GLOBAL_ADDR,
> +			     &reg16) < 0)
> +		goto err;
> +
> +	if (!(reg16 & AQ_THERM_ST_READY))
> +		goto err;
> +
> +	if (aqc111_mdio_read(dev, AQ_GLB_THERMAL_STATUS1, AQ_PHY_GLOBAL_ADDR,
> +			     &reg16) < 0)
> +		goto err;
> +
> +	/*convert from 1/256 to 1/100 degrees of Celsius*/
> +	*temperature = (u32)reg16 * 100 / 256;
> +	return 0;
> +
> +err:
> +	*temperature = 0;
> +	return -1;
> +}
> +
> +void aqc111_thermal_work_cb(struct work_struct *w)
> +{
> +	struct delayed_work *dw = to_delayed_work(w);
> +	struct aqc111_data *aqc111_data = container_of(dw, struct aqc111_data,
> +						       therm_work);
> +	unsigned long timeout = msecs_to_jiffies(AQ_THERMAL_TIMER_MS);
> +	struct usbnet *dev = aqc111_data->dev;
> +	u32 temperature = 0;
> +	u8 reset_speed = 0;
> +
> +	if (!aqc111_data->link)
> +		/* poll not so frequently */
> +		timeout *= 2;
> +
> +	if (aqc111_get_temperature(dev, &temperature) != 0)
> +		goto end;
> +
> +	if (aqc111_data->thermal_throttling &&
> +	    temperature <= AQ_NORMAL_TEMP_THRESHOLD) {
> +		netdev_info(dev->net, "The temperature is back to normal(%u)",
> +			    AQ_NORMAL_TEMP_THRESHOLD / 100);

How often do you see these messages? I'm wondering if they need to be
rate limited?

> +		aqc111_data->thermal_throttling = 0;
> +		reset_speed = 1;
> +	}
> +
> +	if (!aqc111_data->thermal_throttling &&
> +	    temperature >= AQ_CRITICAL_TEMP_THRESHOLD) {

Should there be some hysteresis in here? In fact, if temperature is
AQ_CRITICAL_TEMP_THRESHOLD it is both back to normal, and throttled at
the same time!

> +		netdev_warn(dev->net, "Critical temperature(%u) is reached",
> +			    AQ_CRITICAL_TEMP_THRESHOLD / 100);
> +		aqc111_data->thermal_throttling = 1;
> +		reset_speed = 1;

update_speed might be a better name, since you are not always
resetting it.

	  Andrew
Igor Russkikh Dec. 12, 2018, 7:34 p.m. UTC | #2
Hi Andrew,

>> When the PHY's temparature is back to normal (low threshold, it is
>> 85 degrees) it restores user/default link speed settings.
> 
> Hi Igor
> 
> Please could you also export the temperature using HWMON. The Marvell
> PHY drivers are good examples.
> 
> I also have a patch for driver/net/phy/aquantia.c which adds HWMON
> support to that PHY.

We actually have almost ready patches to submit hwmon support separately
for both AQC USB and AQC PCI MAC drivers.
Will do that in near time.

>> +	if (aqc111_data->thermal_throttling)
>> +		speed = SPEED_100;
>> +
> 
> That is a big jump. Do you need to cool is down quickly, or would
> 1Gbps be sufficient? I think as a user, i would prefer it ping-pongs
> between 5G and 1G, not 5G and 100M.

In case overheat happens that really means something bad is happening outside,
or heatsink is bad/detached. I'll consult our HW team once again, but 100M was
the original request from our phy/hw team. It seems it really much less on power and
1G may not be enough.

> 
>>  	if (autoneg == AUTONEG_ENABLE) {
>>  		switch (speed) {
>>  		case SPEED_5000:
> 
> It looks like this only works when auto-neg is enabled. If i've fixed
> configured it i don't think this works?

Looks you are right, will check this.


>> +	if (aqc111_data->thermal_throttling &&
>> +	    temperature <= AQ_NORMAL_TEMP_THRESHOLD) {
>> +		netdev_info(dev->net, "The temperature is back to normal(%u)",
>> +			    AQ_NORMAL_TEMP_THRESHOLD / 100);
> 
> How often do you see these messages? I'm wondering if they need to be
> rate limited?

In worst case that will be at least limited by link down/up internal,
which in case of 5G normally takes 3-5secs.

>> +	if (!aqc111_data->thermal_throttling &&
>> +	    temperature >= AQ_CRITICAL_TEMP_THRESHOLD) {
> 
> Should there be some hysteresis in here? In fact, if temperature is
> AQ_CRITICAL_TEMP_THRESHOLD it is both back to normal, and throttled at
> the same time!

NORMAL_TEMP and CRITICAL_TEMP values are actually a hysteresis.
In cool down case only after TEMP_NORMAL temperature link will be flipped back again.

> 
>> +		netdev_warn(dev->net, "Critical temperature(%u) is reached",
>> +			    AQ_CRITICAL_TEMP_THRESHOLD / 100);
>> +		aqc111_data->thermal_throttling = 1;
>> +		reset_speed = 1;
> 
> update_speed might be a better name, since you are not always
> resetting it.

Agreed.

Regards,
  Igor
Andrew Lunn Dec. 12, 2018, 8:28 p.m. UTC | #3
> NORMAL_TEMP and CRITICAL_TEMP values are actually a hysteresis.  In
> cool down case only after TEMP_NORMAL temperature link will be
> flipped back again.

Ah, yes, sorry, i read that wrong.

    Andrew
diff mbox series

Patch

diff --git a/drivers/net/usb/aqc111.c b/drivers/net/usb/aqc111.c
index 8867f6a3eaa7..fa6b9dfc2a6f 100644
--- a/drivers/net/usb/aqc111.c
+++ b/drivers/net/usb/aqc111.c
@@ -17,6 +17,7 @@ 
 #include <linux/usb/cdc.h>
 #include <linux/usb/usbnet.h>
 #include <linux/linkmode.h>
+#include <linux/workqueue.h>
 
 #include "aqc111.h"
 
@@ -334,6 +335,9 @@  static void aqc111_set_phy_speed(struct usbnet *dev, u8 autoneg, u16 speed)
 	aqc111_data->phy_cfg |= (3 << AQ_DSH_RETRIES_SHIFT) &
 				AQ_DSH_RETRIES_MASK;
 
+	if (aqc111_data->thermal_throttling)
+		speed = SPEED_100;
+
 	if (autoneg == AUTONEG_ENABLE) {
 		switch (speed) {
 		case SPEED_5000:
@@ -714,6 +718,8 @@  static int aqc111_bind(struct usbnet *dev, struct usb_interface *intf)
 	/* store aqc111_data pointer in device data field */
 	dev->driver_priv = aqc111_data;
 
+	aqc111_data->dev = dev;
+
 	/* Init the MAC address */
 	ret = aqc111_read_perm_mac(dev);
 	if (ret)
@@ -991,6 +997,71 @@  static int aqc111_link_reset(struct usbnet *dev)
 	return 0;
 }
 
+int aqc111_get_temperature(struct usbnet *dev, u32 *temperature)
+{
+	u16 reg16 = 0;
+
+	if (aqc111_mdio_read(dev, AQ_GLB_THERMAL_STATUS2, AQ_PHY_GLOBAL_ADDR,
+			     &reg16) < 0)
+		goto err;
+
+	if (!(reg16 & AQ_THERM_ST_READY))
+		goto err;
+
+	if (aqc111_mdio_read(dev, AQ_GLB_THERMAL_STATUS1, AQ_PHY_GLOBAL_ADDR,
+			     &reg16) < 0)
+		goto err;
+
+	/*convert from 1/256 to 1/100 degrees of Celsius*/
+	*temperature = (u32)reg16 * 100 / 256;
+	return 0;
+
+err:
+	*temperature = 0;
+	return -1;
+}
+
+void aqc111_thermal_work_cb(struct work_struct *w)
+{
+	struct delayed_work *dw = to_delayed_work(w);
+	struct aqc111_data *aqc111_data = container_of(dw, struct aqc111_data,
+						       therm_work);
+	unsigned long timeout = msecs_to_jiffies(AQ_THERMAL_TIMER_MS);
+	struct usbnet *dev = aqc111_data->dev;
+	u32 temperature = 0;
+	u8 reset_speed = 0;
+
+	if (!aqc111_data->link)
+		/* poll not so frequently */
+		timeout *= 2;
+
+	if (aqc111_get_temperature(dev, &temperature) != 0)
+		goto end;
+
+	if (aqc111_data->thermal_throttling &&
+	    temperature <= AQ_NORMAL_TEMP_THRESHOLD) {
+		netdev_info(dev->net, "The temperature is back to normal(%u)",
+			    AQ_NORMAL_TEMP_THRESHOLD / 100);
+		aqc111_data->thermal_throttling = 0;
+		reset_speed = 1;
+	}
+
+	if (!aqc111_data->thermal_throttling &&
+	    temperature >= AQ_CRITICAL_TEMP_THRESHOLD) {
+		netdev_warn(dev->net, "Critical temperature(%u) is reached",
+			    AQ_CRITICAL_TEMP_THRESHOLD / 100);
+		aqc111_data->thermal_throttling = 1;
+		reset_speed = 1;
+	}
+
+	if (reset_speed)
+		aqc111_set_phy_speed(dev, aqc111_data->autoneg,
+				     aqc111_data->advertised_speed);
+
+end:
+	schedule_delayed_work(&aqc111_data->therm_work, timeout);
+}
+
 static int aqc111_reset(struct usbnet *dev)
 {
 	struct aqc111_data *aqc111_data = dev->driver_priv;
@@ -1032,6 +1103,10 @@  static int aqc111_reset(struct usbnet *dev)
 	aqc111_set_phy_speed(dev, aqc111_data->autoneg,
 			     aqc111_data->advertised_speed);
 
+	INIT_DELAYED_WORK(&aqc111_data->therm_work, &aqc111_thermal_work_cb);
+	schedule_delayed_work(&aqc111_data->therm_work,
+			      msecs_to_jiffies(AQ_THERMAL_TIMER_MS));
+
 	return 0;
 }
 
@@ -1040,6 +1115,9 @@  static int aqc111_stop(struct usbnet *dev)
 	struct aqc111_data *aqc111_data = dev->driver_priv;
 	u16 reg16 = 0;
 
+	cancel_delayed_work_sync(&aqc111_data->therm_work);
+	aqc111_data->thermal_throttling = 0;
+
 	aqc111_read16_cmd(dev, AQ_ACCESS_MAC, SFR_MEDIUM_STATUS_MODE,
 			  2, &reg16);
 	reg16 &= ~SFR_MEDIUM_RECEIVE_EN;
diff --git a/drivers/net/usb/aqc111.h b/drivers/net/usb/aqc111.h
index 359663635b49..7b50e3cd217b 100644
--- a/drivers/net/usb/aqc111.h
+++ b/drivers/net/usb/aqc111.h
@@ -35,6 +35,11 @@ 
 #define AQ_USB_PHY_SET_TIMEOUT		10000
 #define AQ_USB_SET_TIMEOUT		4000
 
+#define AQ_THERMAL_TIMER_MS		500
+/* Temperature thresholds in units 1/100 degree of Celsius */
+#define AQ_NORMAL_TEMP_THRESHOLD	8500
+#define AQ_CRITICAL_TEMP_THRESHOLD	10600
+
 /* Feature. ********************************************/
 #define AQ_SUPPORT_FEATURE	(NETIF_F_SG | NETIF_F_IP_CSUM |\
 				 NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM |\
@@ -183,6 +188,9 @@  struct aqc111_data {
 	} fw_ver;
 	u32 phy_cfg;
 	u8 wol_flags;
+	struct delayed_work therm_work;
+	u8 thermal_throttling;
+	struct usbnet *dev;
 };
 
 #define AQ_LS_MASK		0x8000