diff mbox series

[net-next,11/12] bnxt_en: Expose threshold temperatures through hwmon

Message ID 20230815045658.80494-12-michael.chan@broadcom.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series bnxt_en: Update for net-next | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1330 this patch: 1330
netdev/cc_maintainers success CCed 5 of 5 maintainers
netdev/build_clang success Errors and warnings before: 1353 this patch: 1353
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1353 this patch: 1353
netdev/checkpatch warning WARNING: line length of 85 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Michael Chan Aug. 15, 2023, 4:56 a.m. UTC
From: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>

HWRM_TEMP_MONITOR_QUERY response now indicates various
threshold temperatures. Expose these threshold temperatures
through the hwmon sysfs.
Also, provide temp1_max_alarm through which the user can check
whether the threshold temperature has been reached or not.

Example:
cat /sys/class/hwmon/hwmon3/temp1_input
75000
cat /sys/class/hwmon/hwmon3/temp1_max
105000
cat /sys/class/hwmon/hwmon3/temp1_max_alarm
0

Cc: Jean Delvare <jdelvare@suse.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: linux-hwmon@vger.kernel.org
Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.h     |  7 ++
 .../net/ethernet/broadcom/bnxt/bnxt_hwmon.c   | 71 +++++++++++++++++--
 2 files changed, 73 insertions(+), 5 deletions(-)

Comments

Guenter Roeck Aug. 15, 2023, 3:05 p.m. UTC | #1
On Mon, Aug 14, 2023 at 09:56:57PM -0700, Michael Chan wrote:
> From: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
> 
> HWRM_TEMP_MONITOR_QUERY response now indicates various
> threshold temperatures. Expose these threshold temperatures
> through the hwmon sysfs.
> Also, provide temp1_max_alarm through which the user can check
> whether the threshold temperature has been reached or not.
> 
> Example:
> cat /sys/class/hwmon/hwmon3/temp1_input
> 75000
> cat /sys/class/hwmon/hwmon3/temp1_max
> 105000
> cat /sys/class/hwmon/hwmon3/temp1_max_alarm
> 0
> 
> Cc: Jean Delvare <jdelvare@suse.com>
> Cc: Guenter Roeck <linux@roeck-us.net>
> Cc: linux-hwmon@vger.kernel.org
> Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
> Signed-off-by: Michael Chan <michael.chan@broadcom.com>
> ---
>  drivers/net/ethernet/broadcom/bnxt/bnxt.h     |  7 ++
>  .../net/ethernet/broadcom/bnxt/bnxt_hwmon.c   | 71 +++++++++++++++++--
>  2 files changed, 73 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> index 84cbcfa61bc1..43a07d84f815 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> @@ -2013,6 +2013,7 @@ struct bnxt {
>  	#define BNXT_FW_CAP_RING_MONITOR		BIT_ULL(30)
>  	#define BNXT_FW_CAP_DBG_QCAPS			BIT_ULL(31)
>  	#define BNXT_FW_CAP_PTP				BIT_ULL(32)
> +	#define BNXT_FW_CAP_THRESHOLD_TEMP_SUPPORTED	BIT_ULL(33)
>  
>  	u32			fw_dbg_cap;
>  
> @@ -2185,7 +2186,13 @@ struct bnxt {
>  	struct bnxt_tc_info	*tc_info;
>  	struct list_head	tc_indr_block_list;
>  	struct dentry		*debugfs_pdev;
> +#ifdef CONFIG_BNXT_HWMON
>  	struct device		*hwmon_dev;
> +	u8			warn_thresh_temp;
> +	u8			crit_thresh_temp;
> +	u8			fatal_thresh_temp;
> +	u8			shutdown_thresh_temp;
> +#endif
>  	enum board_idx		board_idx;
>  };
>  
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c
> index 20381b7b1d78..f5affac1169a 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c
> @@ -34,6 +34,15 @@ static int bnxt_hwrm_temp_query(struct bnxt *bp, u8 *temp)
>  
>  	if (temp)
>  		*temp = resp->temp;
> +
> +	if (resp->flags & TEMP_MONITOR_QUERY_RESP_FLAGS_THRESHOLD_VALUES_AVAILABLE) {
> +		if (!temp)
> +			bp->fw_cap |= BNXT_FW_CAP_THRESHOLD_TEMP_SUPPORTED;

The if statement seems unnecessary. If the flag was not set
during initialization, the limit attributes won't be visible anyway,
so it doesn't make a difference if it is set now or not.

> +		bp->warn_thresh_temp = resp->warn_threshold;
> +		bp->crit_thresh_temp = resp->critical_threshold;
> +		bp->fatal_thresh_temp = resp->fatal_threshold;
> +		bp->shutdown_thresh_temp = resp->shutdown_threshold;

Are those temperatures expected to change during runtime ? If not it might
make sense to only execute the entire if condition if temp == NULL to
avoid unnecessary reassignments whenever the temperature is read.

> +	}
>  err:
>  	hwrm_req_drop(bp, req);
>  	return rc;
> @@ -42,12 +51,30 @@ static int bnxt_hwrm_temp_query(struct bnxt *bp, u8 *temp)
>  static umode_t bnxt_hwmon_is_visible(const void *_data, enum hwmon_sensor_types type,
>  				     u32 attr, int channel)
>  {
> +	const struct bnxt *bp = _data;
> +
>  	if (type != hwmon_temp)
>  		return 0;
>  
>  	switch (attr) {
>  	case hwmon_temp_input:
>  		return 0444;
> +	case hwmon_temp_lcrit:
> +	case hwmon_temp_crit:
> +	case hwmon_temp_emergency:
> +	case hwmon_temp_lcrit_alarm:
> +	case hwmon_temp_crit_alarm:
> +	case hwmon_temp_emergency_alarm:
> +		if (~bp->fw_cap & BNXT_FW_CAP_THRESHOLD_TEMP_SUPPORTED)

Seems to me that
		if (!(bp->fw_cap & BNXT_FW_CAP_THRESHOLD_TEMP_SUPPORTED))
would be much easier to understand.

> +			return 0;
> +		return 0444;
> +	/* Max temperature setting in NVM is optional */
> +	case hwmon_temp_max:
> +	case hwmon_temp_max_alarm:
> +		if (~bp->fw_cap & BNXT_FW_CAP_THRESHOLD_TEMP_SUPPORTED ||
> +		    !bp->shutdown_thresh_temp)
> +			return 0;

Wrong use of the 'max' attribute. More on that below.

> +		return 0444;
>  	default:
>  		return 0;
>  	}
> @@ -66,6 +93,38 @@ static int bnxt_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32
>  		if (!rc)
>  			*val = temp * 1000;
>  		return rc;
> +	case hwmon_temp_lcrit:
> +		*val = bp->warn_thresh_temp * 1000;
> +		return 0;
> +	case hwmon_temp_crit:
> +		*val = bp->crit_thresh_temp * 1000;
> +		return 0;
> +	case hwmon_temp_emergency:
> +		*val = bp->fatal_thresh_temp * 1000;
> +		return 0;
> +	case hwmon_temp_max:
> +		*val = bp->shutdown_thresh_temp * 1000;
> +		return 0;
> +	case hwmon_temp_lcrit_alarm:
> +		rc = bnxt_hwrm_temp_query(bp, &temp);
> +		if (!rc)
> +			*val = temp >= bp->warn_thresh_temp;

That is wrong. lcrit is the _lower_ critical temperature, ie the
temperature is critically low. This is not a "high temperature"
alarm.

> +		return rc;
> +	case hwmon_temp_crit_alarm:
> +		rc = bnxt_hwrm_temp_query(bp, &temp);
> +		if (!rc)
> +			*val = temp >= bp->crit_thresh_temp;
> +		return rc;
> +	case hwmon_temp_emergency_alarm:
> +		rc = bnxt_hwrm_temp_query(bp, &temp);
> +		if (!rc)
> +			*val = temp >= bp->fatal_thresh_temp;
> +		return rc;
> +	case hwmon_temp_max_alarm:
> +		rc = bnxt_hwrm_temp_query(bp, &temp);
> +		if (!rc)
> +			*val = temp >= bp->shutdown_thresh_temp;

Hmm, that isn't really the purpose of alarm attributes. The expectation
would be that the chip sets alarm flags and the driver reports it.
I guess there is some value in having it, so I won't object.

Anyway, the ordering is wrong. max_alarm should be the lowest
alarm level, followed by crit and emergency. So
		max_alarm -> temp >= bp->warn_thresh_temp
		crit_alarm -> temp >= bp->crit_thresh_temp
		emergency_alarm -> temp >= bp->fatal_thresh_temp
				or temp >= bp->shutdown_thresh_temp

There are only three levels of upper temperature alarms.
Abusing lcrit as 4th upper alarm is most definitely wrong.

> +		return rc;
>  	default:
>  		return -EOPNOTSUPP;
>  	}
> @@ -73,7 +132,11 @@ static int bnxt_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32
>  
>  static const struct hwmon_channel_info *bnxt_hwmon_info[] = {
>  	HWMON_CHANNEL_INFO(temp,
> -			   HWMON_T_INPUT),
> +			   HWMON_T_INPUT |
> +			   HWMON_T_MAX | HWMON_T_LCRIT |
> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY |
> +			   HWMON_T_CRIT_ALARM | HWMON_T_LCRIT_ALARM |
> +			   HWMON_T_MAX_ALARM | HWMON_T_EMERGENCY_ALARM),
>  	NULL
>  };
>  
> @@ -97,13 +160,11 @@ void bnxt_hwmon_uninit(struct bnxt *bp)
>  
>  void bnxt_hwmon_init(struct bnxt *bp)
>  {
> -	struct hwrm_temp_monitor_query_input *req;
>  	struct pci_dev *pdev = bp->pdev;
>  	int rc;
>  
> -	rc = hwrm_req_init(bp, req, HWRM_TEMP_MONITOR_QUERY);
> -	if (!rc)
> -		rc = hwrm_req_send_silent(bp, req);
> +	/* temp1_xxx is only sensor, ensure not registered if it will fail */
> +	rc = bnxt_hwrm_temp_query(bp, NULL);

Ah, that is the reason for the check in bnxt_hwrm_temp_query().
The check in that function should really be added here, not in the
previous patch.

>  	if (rc == -EACCES || rc == -EOPNOTSUPP) {
>  		bnxt_hwmon_uninit(bp);
>  		return;
> -- 
> 2.30.1
>
Kalesh Anakkur Purayil Aug. 16, 2023, 10:28 a.m. UTC | #2
Thank you Guenter for the review and the suggestions.

Please see my response inline.

On Tue, Aug 15, 2023 at 8:35 PM Guenter Roeck <linux@roeck-us.net> wrote:

> On Mon, Aug 14, 2023 at 09:56:57PM -0700, Michael Chan wrote:
> > From: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
> >
> > HWRM_TEMP_MONITOR_QUERY response now indicates various
> > threshold temperatures. Expose these threshold temperatures
> > through the hwmon sysfs.
> > Also, provide temp1_max_alarm through which the user can check
> > whether the threshold temperature has been reached or not.
> >
> > Example:
> > cat /sys/class/hwmon/hwmon3/temp1_input
> > 75000
> > cat /sys/class/hwmon/hwmon3/temp1_max
> > 105000
> > cat /sys/class/hwmon/hwmon3/temp1_max_alarm
> > 0
> >
> > Cc: Jean Delvare <jdelvare@suse.com>
> > Cc: Guenter Roeck <linux@roeck-us.net>
> > Cc: linux-hwmon@vger.kernel.org
> > Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
> > Signed-off-by: Michael Chan <michael.chan@broadcom.com>
> > ---
> >  drivers/net/ethernet/broadcom/bnxt/bnxt.h     |  7 ++
> >  .../net/ethernet/broadcom/bnxt/bnxt_hwmon.c   | 71 +++++++++++++++++--
> >  2 files changed, 73 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > index 84cbcfa61bc1..43a07d84f815 100644
> > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > @@ -2013,6 +2013,7 @@ struct bnxt {
> >       #define BNXT_FW_CAP_RING_MONITOR                BIT_ULL(30)
> >       #define BNXT_FW_CAP_DBG_QCAPS                   BIT_ULL(31)
> >       #define BNXT_FW_CAP_PTP                         BIT_ULL(32)
> > +     #define BNXT_FW_CAP_THRESHOLD_TEMP_SUPPORTED    BIT_ULL(33)
> >
> >       u32                     fw_dbg_cap;
> >
> > @@ -2185,7 +2186,13 @@ struct bnxt {
> >       struct bnxt_tc_info     *tc_info;
> >       struct list_head        tc_indr_block_list;
> >       struct dentry           *debugfs_pdev;
> > +#ifdef CONFIG_BNXT_HWMON
> >       struct device           *hwmon_dev;
> > +     u8                      warn_thresh_temp;
> > +     u8                      crit_thresh_temp;
> > +     u8                      fatal_thresh_temp;
> > +     u8                      shutdown_thresh_temp;
> > +#endif
> >       enum board_idx          board_idx;
> >  };
> >
> > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c
> b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c
> > index 20381b7b1d78..f5affac1169a 100644
> > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c
> > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c
> > @@ -34,6 +34,15 @@ static int bnxt_hwrm_temp_query(struct bnxt *bp, u8
> *temp)
> >
> >       if (temp)
> >               *temp = resp->temp;
> > +
> > +     if (resp->flags &
> TEMP_MONITOR_QUERY_RESP_FLAGS_THRESHOLD_VALUES_AVAILABLE) {
> > +             if (!temp)
> > +                     bp->fw_cap |= BNXT_FW_CAP_THRESHOLD_TEMP_SUPPORTED;
>
> The if statement seems unnecessary. If the flag was not set
> during initialization, the limit attributes won't be visible anyway,
> so it doesn't make a difference if it is set now or not.
>
> > +             bp->warn_thresh_temp = resp->warn_threshold;
> > +             bp->crit_thresh_temp = resp->critical_threshold;
> > +             bp->fatal_thresh_temp = resp->fatal_threshold;
> > +             bp->shutdown_thresh_temp = resp->shutdown_threshold;
>
> Are those temperatures expected to change during runtime ? If not it might
> make sense to only execute the entire if condition if temp == NULL to
> avoid unnecessary reassignments whenever the temperature is read.
>
> > +     }
> >  err:
> >       hwrm_req_drop(bp, req);
> >       return rc;
> > @@ -42,12 +51,30 @@ static int bnxt_hwrm_temp_query(struct bnxt *bp, u8
> *temp)
> >  static umode_t bnxt_hwmon_is_visible(const void *_data, enum
> hwmon_sensor_types type,
> >                                    u32 attr, int channel)
> >  {
> > +     const struct bnxt *bp = _data;
> > +
> >       if (type != hwmon_temp)
> >               return 0;
> >
> >       switch (attr) {
> >       case hwmon_temp_input:
> >               return 0444;
> > +     case hwmon_temp_lcrit:
> > +     case hwmon_temp_crit:
> > +     case hwmon_temp_emergency:
> > +     case hwmon_temp_lcrit_alarm:
> > +     case hwmon_temp_crit_alarm:
> > +     case hwmon_temp_emergency_alarm:
> > +             if (~bp->fw_cap & BNXT_FW_CAP_THRESHOLD_TEMP_SUPPORTED)
>
> Seems to me that
>                 if (!(bp->fw_cap & BNXT_FW_CAP_THRESHOLD_TEMP_SUPPORTED))
> would be much easier to understand.
>
> > +                     return 0;
> > +             return 0444;
> > +     /* Max temperature setting in NVM is optional */
> > +     case hwmon_temp_max:
> > +     case hwmon_temp_max_alarm:
> > +             if (~bp->fw_cap & BNXT_FW_CAP_THRESHOLD_TEMP_SUPPORTED ||
> > +                 !bp->shutdown_thresh_temp)
> > +                     return 0;
>
> Wrong use of the 'max' attribute. More on that below.
>
> > +             return 0444;
> >       default:
> >               return 0;
> >       }
> > @@ -66,6 +93,38 @@ static int bnxt_hwmon_read(struct device *dev, enum
> hwmon_sensor_types type, u32
> >               if (!rc)
> >                       *val = temp * 1000;
> >               return rc;
> > +     case hwmon_temp_lcrit:
> > +             *val = bp->warn_thresh_temp * 1000;
> > +             return 0;
> > +     case hwmon_temp_crit:
> > +             *val = bp->crit_thresh_temp * 1000;
> > +             return 0;
> > +     case hwmon_temp_emergency:
> > +             *val = bp->fatal_thresh_temp * 1000;
> > +             return 0;
> > +     case hwmon_temp_max:
> > +             *val = bp->shutdown_thresh_temp * 1000;
> > +             return 0;
> > +     case hwmon_temp_lcrit_alarm:
> > +             rc = bnxt_hwrm_temp_query(bp, &temp);
> > +             if (!rc)
> > +                     *val = temp >= bp->warn_thresh_temp;
>
> That is wrong. lcrit is the _lower_ critical temperature, ie the
> temperature is critically low. This is not a "high temperature"
> alarm.
>
> > +             return rc;
> > +     case hwmon_temp_crit_alarm:
> > +             rc = bnxt_hwrm_temp_query(bp, &temp);
> > +             if (!rc)
> > +                     *val = temp >= bp->crit_thresh_temp;
> > +             return rc;
> > +     case hwmon_temp_emergency_alarm:
> > +             rc = bnxt_hwrm_temp_query(bp, &temp);
> > +             if (!rc)
> > +                     *val = temp >= bp->fatal_thresh_temp;
> > +             return rc;
> > +     case hwmon_temp_max_alarm:
> > +             rc = bnxt_hwrm_temp_query(bp, &temp);
> > +             if (!rc)
> > +                     *val = temp >= bp->shutdown_thresh_temp;
>
> Hmm, that isn't really the purpose of alarm attributes. The expectation
> would be that the chip sets alarm flags and the driver reports it.
> I guess there is some value in having it, so I won't object.
>
> Anyway, the ordering is wrong. max_alarm should be the lowest
> alarm level, followed by crit and emergency. So
>                 max_alarm -> temp >= bp->warn_thresh_temp
>                 crit_alarm -> temp >= bp->crit_thresh_temp
>                 emergency_alarm -> temp >= bp->fatal_thresh_temp
>                                 or temp >= bp->shutdown_thresh_temp
>
> There are only three levels of upper temperature alarms.
> Abusing lcrit as 4th upper alarm is most definitely wrong.
>
[Kalesh]: Thank you for the clarification.
bnxt_en driver wants to expose 4 threshold temperatures to the user through
hwmon sysfs.
1. warning threshold temperature
2. critical threshold temperature
3. fatal threshold temperature
4. shutdown threshold temperature

I will use the following mapping:

hwmon_temp_max : warning threshold temperature
hwmon_temp_crit : critical threshold temperature
hwmon_temp_emergency : fatal threshold temperature

hwmon_temp_max_alarm : temp >= bp->warn_thresh_temp
hwmon_temp_crit_alarm : temp >= bp->crit_thresh_temp
hwmon_temp_emergency_alarm : temp >= bp->fatal_thresh_temp

Is it OK to map the shutdown threshold temperature to "hwmon_temp_fault"?
If not, can you please suggest an alternative?

Regards,
Kalesh


>
> > +             return rc;
> >       default:
> >               return -EOPNOTSUPP;
> >       }
> > @@ -73,7 +132,11 @@ static int bnxt_hwmon_read(struct device *dev, enum
> hwmon_sensor_types type, u32
> >
> >  static const struct hwmon_channel_info *bnxt_hwmon_info[] = {
> >       HWMON_CHANNEL_INFO(temp,
> > -                        HWMON_T_INPUT),
> > +                        HWMON_T_INPUT |
> > +                        HWMON_T_MAX | HWMON_T_LCRIT |
> > +                        HWMON_T_CRIT | HWMON_T_EMERGENCY |
> > +                        HWMON_T_CRIT_ALARM | HWMON_T_LCRIT_ALARM |
> > +                        HWMON_T_MAX_ALARM | HWMON_T_EMERGENCY_ALARM),
> >       NULL
> >  };
> >
> > @@ -97,13 +160,11 @@ void bnxt_hwmon_uninit(struct bnxt *bp)
> >
> >  void bnxt_hwmon_init(struct bnxt *bp)
> >  {
> > -     struct hwrm_temp_monitor_query_input *req;
> >       struct pci_dev *pdev = bp->pdev;
> >       int rc;
> >
> > -     rc = hwrm_req_init(bp, req, HWRM_TEMP_MONITOR_QUERY);
> > -     if (!rc)
> > -             rc = hwrm_req_send_silent(bp, req);
> > +     /* temp1_xxx is only sensor, ensure not registered if it will fail
> */
> > +     rc = bnxt_hwrm_temp_query(bp, NULL);
>
> Ah, that is the reason for the check in bnxt_hwrm_temp_query().
> The check in that function should really be added here, not in the
> previous patch.
>
> >       if (rc == -EACCES || rc == -EOPNOTSUPP) {
> >               bnxt_hwmon_uninit(bp);
> >               return;
> > --
> > 2.30.1
> >
>
>
>
Guenter Roeck Aug. 16, 2023, 12:13 p.m. UTC | #3
On Wed, Aug 16, 2023 at 03:58:34PM +0530, Kalesh Anakkur Purayil wrote:
> Thank you Guenter for the review and the suggestions.
> 
> Please see my response inline.
> 
> On Tue, Aug 15, 2023 at 8:35 PM Guenter Roeck <linux@roeck-us.net> wrote:
> 
[ ... ]

> >
> > Hmm, that isn't really the purpose of alarm attributes. The expectation
> > would be that the chip sets alarm flags and the driver reports it.
> > I guess there is some value in having it, so I won't object.
> >
> > Anyway, the ordering is wrong. max_alarm should be the lowest
> > alarm level, followed by crit and emergency. So
> >                 max_alarm -> temp >= bp->warn_thresh_temp
> >                 crit_alarm -> temp >= bp->crit_thresh_temp
> >                 emergency_alarm -> temp >= bp->fatal_thresh_temp
> >                                 or temp >= bp->shutdown_thresh_temp
> >
> > There are only three levels of upper temperature alarms.
> > Abusing lcrit as 4th upper alarm is most definitely wrong.
> >
> [Kalesh]: Thank you for the clarification.
> bnxt_en driver wants to expose 4 threshold temperatures to the user through
> hwmon sysfs.
> 1. warning threshold temperature
> 2. critical threshold temperature
> 3. fatal threshold temperature
> 4. shutdown threshold temperature
> 
> I will use the following mapping:
> 
> hwmon_temp_max : warning threshold temperature
> hwmon_temp_crit : critical threshold temperature
> hwmon_temp_emergency : fatal threshold temperature
> 
> hwmon_temp_max_alarm : temp >= bp->warn_thresh_temp
> hwmon_temp_crit_alarm : temp >= bp->crit_thresh_temp
> hwmon_temp_emergency_alarm : temp >= bp->fatal_thresh_temp
> 
> Is it OK to map the shutdown threshold temperature to "hwmon_temp_fault"?

That is a flag, not a temperature, and it is intended to signal
a problem ith the sensor.

> If not, can you please suggest an alternative?
> 

The only one I can think of is to add non-standard attributes
such as temp1_shutdown and temp1_shutdown_alarm.

Guenter
Kalesh Anakkur Purayil Aug. 16, 2023, 4:12 p.m. UTC | #4
On Wed, Aug 16, 2023 at 5:43 PM Guenter Roeck <linux@roeck-us.net> wrote:

> On Wed, Aug 16, 2023 at 03:58:34PM +0530, Kalesh Anakkur Purayil wrote:
> > Thank you Guenter for the review and the suggestions.
> >
> > Please see my response inline.
> >
> > On Tue, Aug 15, 2023 at 8:35 PM Guenter Roeck <linux@roeck-us.net>
> wrote:
> >
> [ ... ]
>
> > >
> > > Hmm, that isn't really the purpose of alarm attributes. The expectation
> > > would be that the chip sets alarm flags and the driver reports it.
> > > I guess there is some value in having it, so I won't object.
> > >
> > > Anyway, the ordering is wrong. max_alarm should be the lowest
> > > alarm level, followed by crit and emergency. So
> > >                 max_alarm -> temp >= bp->warn_thresh_temp
> > >                 crit_alarm -> temp >= bp->crit_thresh_temp
> > >                 emergency_alarm -> temp >= bp->fatal_thresh_temp
> > >                                 or temp >= bp->shutdown_thresh_temp
> > >
> > > There are only three levels of upper temperature alarms.
> > > Abusing lcrit as 4th upper alarm is most definitely wrong.
> > >
> > [Kalesh]: Thank you for the clarification.
> > bnxt_en driver wants to expose 4 threshold temperatures to the user
> through
> > hwmon sysfs.
> > 1. warning threshold temperature
> > 2. critical threshold temperature
> > 3. fatal threshold temperature
> > 4. shutdown threshold temperature
> >
> > I will use the following mapping:
> >
> > hwmon_temp_max : warning threshold temperature
> > hwmon_temp_crit : critical threshold temperature
> > hwmon_temp_emergency : fatal threshold temperature
> >
> > hwmon_temp_max_alarm : temp >= bp->warn_thresh_temp
> > hwmon_temp_crit_alarm : temp >= bp->crit_thresh_temp
> > hwmon_temp_emergency_alarm : temp >= bp->fatal_thresh_temp
> >
> > Is it OK to map the shutdown threshold temperature to "hwmon_temp_fault"?
>
> That is a flag, not a temperature, and it is intended to signal
> a problem ith the sensor.
>
> > If not, can you please suggest an alternative?
> >
>
> The only one I can think of is to add non-standard attributes
> such as temp1_shutdown and temp1_shutdown_alarm.
>
[Kalesh]: Sorry, I don't quite get this part. I was looking at the kernel
hwmon code, but could not find any reference.

Can we add new attributes "shutdown" and "shutdown_alarm" for tempX? For
example:

#define HWMON_T_SHUTDOWN BIT(hwmon_temp_shutdown)

>
> Guenter
>
Guenter Roeck Aug. 16, 2023, 7:25 p.m. UTC | #5
On Wed, Aug 16, 2023 at 09:42:17PM +0530, Kalesh Anakkur Purayil wrote:
> On Wed, Aug 16, 2023 at 5:43 PM Guenter Roeck <linux@roeck-us.net> wrote:
> 
> > On Wed, Aug 16, 2023 at 03:58:34PM +0530, Kalesh Anakkur Purayil wrote:
> > > Thank you Guenter for the review and the suggestions.
> > >
> > > Please see my response inline.
> > >
> > > On Tue, Aug 15, 2023 at 8:35 PM Guenter Roeck <linux@roeck-us.net>
> > wrote:
> > >
> > [ ... ]
> >
> > > >
> > > > Hmm, that isn't really the purpose of alarm attributes. The expectation
> > > > would be that the chip sets alarm flags and the driver reports it.
> > > > I guess there is some value in having it, so I won't object.
> > > >
> > > > Anyway, the ordering is wrong. max_alarm should be the lowest
> > > > alarm level, followed by crit and emergency. So
> > > >                 max_alarm -> temp >= bp->warn_thresh_temp
> > > >                 crit_alarm -> temp >= bp->crit_thresh_temp
> > > >                 emergency_alarm -> temp >= bp->fatal_thresh_temp
> > > >                                 or temp >= bp->shutdown_thresh_temp
> > > >
> > > > There are only three levels of upper temperature alarms.
> > > > Abusing lcrit as 4th upper alarm is most definitely wrong.
> > > >
> > > [Kalesh]: Thank you for the clarification.
> > > bnxt_en driver wants to expose 4 threshold temperatures to the user
> > through
> > > hwmon sysfs.
> > > 1. warning threshold temperature
> > > 2. critical threshold temperature
> > > 3. fatal threshold temperature
> > > 4. shutdown threshold temperature
> > >
> > > I will use the following mapping:
> > >
> > > hwmon_temp_max : warning threshold temperature
> > > hwmon_temp_crit : critical threshold temperature
> > > hwmon_temp_emergency : fatal threshold temperature
> > >
> > > hwmon_temp_max_alarm : temp >= bp->warn_thresh_temp
> > > hwmon_temp_crit_alarm : temp >= bp->crit_thresh_temp
> > > hwmon_temp_emergency_alarm : temp >= bp->fatal_thresh_temp
> > >
> > > Is it OK to map the shutdown threshold temperature to "hwmon_temp_fault"?
> >
> > That is a flag, not a temperature, and it is intended to signal
> > a problem ith the sensor.
> >
> > > If not, can you please suggest an alternative?
> > >
> >
> > The only one I can think of is to add non-standard attributes
> > such as temp1_shutdown and temp1_shutdown_alarm.
> >
> [Kalesh]: Sorry, I don't quite get this part. I was looking at the kernel
> hwmon code, but could not find any reference.
> 

It would be non-standard attributes, so, correct, there is no reference.

> Can we add new attributes "shutdown" and "shutdown_alarm" for tempX? For
> example:
> 
> #define HWMON_T_SHUTDOWN BIT(hwmon_temp_shutdown)
> 

Not for a single driver. You can implement the sysfs attributes
directly in the driver and pass an extra attribute group to the
hwmon core when registering the hwmon device.

Guenter
Michael Chan Aug. 17, 2023, 6:28 a.m. UTC | #6
On Wed, Aug 16, 2023 at 12:25 PM Guenter Roeck <linux@roeck-us.net> wrote:
>
> On Wed, Aug 16, 2023 at 09:42:17PM +0530, Kalesh Anakkur Purayil wrote:
> > [Kalesh]: Sorry, I don't quite get this part. I was looking at the kernel
> > hwmon code, but could not find any reference.
> >
>
> It would be non-standard attributes, so, correct, there is no reference.
>
> > Can we add new attributes "shutdown" and "shutdown_alarm" for tempX? For
> > example:
> >
> > #define HWMON_T_SHUTDOWN BIT(hwmon_temp_shutdown)
> >
>
> Not for a single driver. You can implement the sysfs attributes
> directly in the driver and pass an extra attribute group to the
> hwmon core when registering the hwmon device.

Thanks for the review.  I will drop these hwmon patches for now and
respin the others for v2.  Kalesh will rework these hwmon patches.
Guenter Roeck Aug. 17, 2023, 1:44 p.m. UTC | #7
On Wed, Aug 16, 2023 at 11:28:07PM -0700, Michael Chan wrote:
> On Wed, Aug 16, 2023 at 12:25 PM Guenter Roeck <linux@roeck-us.net> wrote:
> >
> > On Wed, Aug 16, 2023 at 09:42:17PM +0530, Kalesh Anakkur Purayil wrote:
> > > [Kalesh]: Sorry, I don't quite get this part. I was looking at the kernel
> > > hwmon code, but could not find any reference.
> > >
> >
> > It would be non-standard attributes, so, correct, there is no reference.
> >
> > > Can we add new attributes "shutdown" and "shutdown_alarm" for tempX? For
> > > example:
> > >
> > > #define HWMON_T_SHUTDOWN BIT(hwmon_temp_shutdown)
> > >
> >
> > Not for a single driver. You can implement the sysfs attributes
> > directly in the driver and pass an extra attribute group to the
> > hwmon core when registering the hwmon device.
> 
> Thanks for the review.  I will drop these hwmon patches for now and
> respin the others for v2.  Kalesh will rework these hwmon patches.

Alternatively you could keep the standard attributes and add the
shutdown attributes later.

Guenter
diff mbox series

Patch

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 84cbcfa61bc1..43a07d84f815 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -2013,6 +2013,7 @@  struct bnxt {
 	#define BNXT_FW_CAP_RING_MONITOR		BIT_ULL(30)
 	#define BNXT_FW_CAP_DBG_QCAPS			BIT_ULL(31)
 	#define BNXT_FW_CAP_PTP				BIT_ULL(32)
+	#define BNXT_FW_CAP_THRESHOLD_TEMP_SUPPORTED	BIT_ULL(33)
 
 	u32			fw_dbg_cap;
 
@@ -2185,7 +2186,13 @@  struct bnxt {
 	struct bnxt_tc_info	*tc_info;
 	struct list_head	tc_indr_block_list;
 	struct dentry		*debugfs_pdev;
+#ifdef CONFIG_BNXT_HWMON
 	struct device		*hwmon_dev;
+	u8			warn_thresh_temp;
+	u8			crit_thresh_temp;
+	u8			fatal_thresh_temp;
+	u8			shutdown_thresh_temp;
+#endif
 	enum board_idx		board_idx;
 };
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c
index 20381b7b1d78..f5affac1169a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c
@@ -34,6 +34,15 @@  static int bnxt_hwrm_temp_query(struct bnxt *bp, u8 *temp)
 
 	if (temp)
 		*temp = resp->temp;
+
+	if (resp->flags & TEMP_MONITOR_QUERY_RESP_FLAGS_THRESHOLD_VALUES_AVAILABLE) {
+		if (!temp)
+			bp->fw_cap |= BNXT_FW_CAP_THRESHOLD_TEMP_SUPPORTED;
+		bp->warn_thresh_temp = resp->warn_threshold;
+		bp->crit_thresh_temp = resp->critical_threshold;
+		bp->fatal_thresh_temp = resp->fatal_threshold;
+		bp->shutdown_thresh_temp = resp->shutdown_threshold;
+	}
 err:
 	hwrm_req_drop(bp, req);
 	return rc;
@@ -42,12 +51,30 @@  static int bnxt_hwrm_temp_query(struct bnxt *bp, u8 *temp)
 static umode_t bnxt_hwmon_is_visible(const void *_data, enum hwmon_sensor_types type,
 				     u32 attr, int channel)
 {
+	const struct bnxt *bp = _data;
+
 	if (type != hwmon_temp)
 		return 0;
 
 	switch (attr) {
 	case hwmon_temp_input:
 		return 0444;
+	case hwmon_temp_lcrit:
+	case hwmon_temp_crit:
+	case hwmon_temp_emergency:
+	case hwmon_temp_lcrit_alarm:
+	case hwmon_temp_crit_alarm:
+	case hwmon_temp_emergency_alarm:
+		if (~bp->fw_cap & BNXT_FW_CAP_THRESHOLD_TEMP_SUPPORTED)
+			return 0;
+		return 0444;
+	/* Max temperature setting in NVM is optional */
+	case hwmon_temp_max:
+	case hwmon_temp_max_alarm:
+		if (~bp->fw_cap & BNXT_FW_CAP_THRESHOLD_TEMP_SUPPORTED ||
+		    !bp->shutdown_thresh_temp)
+			return 0;
+		return 0444;
 	default:
 		return 0;
 	}
@@ -66,6 +93,38 @@  static int bnxt_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32
 		if (!rc)
 			*val = temp * 1000;
 		return rc;
+	case hwmon_temp_lcrit:
+		*val = bp->warn_thresh_temp * 1000;
+		return 0;
+	case hwmon_temp_crit:
+		*val = bp->crit_thresh_temp * 1000;
+		return 0;
+	case hwmon_temp_emergency:
+		*val = bp->fatal_thresh_temp * 1000;
+		return 0;
+	case hwmon_temp_max:
+		*val = bp->shutdown_thresh_temp * 1000;
+		return 0;
+	case hwmon_temp_lcrit_alarm:
+		rc = bnxt_hwrm_temp_query(bp, &temp);
+		if (!rc)
+			*val = temp >= bp->warn_thresh_temp;
+		return rc;
+	case hwmon_temp_crit_alarm:
+		rc = bnxt_hwrm_temp_query(bp, &temp);
+		if (!rc)
+			*val = temp >= bp->crit_thresh_temp;
+		return rc;
+	case hwmon_temp_emergency_alarm:
+		rc = bnxt_hwrm_temp_query(bp, &temp);
+		if (!rc)
+			*val = temp >= bp->fatal_thresh_temp;
+		return rc;
+	case hwmon_temp_max_alarm:
+		rc = bnxt_hwrm_temp_query(bp, &temp);
+		if (!rc)
+			*val = temp >= bp->shutdown_thresh_temp;
+		return rc;
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -73,7 +132,11 @@  static int bnxt_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32
 
 static const struct hwmon_channel_info *bnxt_hwmon_info[] = {
 	HWMON_CHANNEL_INFO(temp,
-			   HWMON_T_INPUT),
+			   HWMON_T_INPUT |
+			   HWMON_T_MAX | HWMON_T_LCRIT |
+			   HWMON_T_CRIT | HWMON_T_EMERGENCY |
+			   HWMON_T_CRIT_ALARM | HWMON_T_LCRIT_ALARM |
+			   HWMON_T_MAX_ALARM | HWMON_T_EMERGENCY_ALARM),
 	NULL
 };
 
@@ -97,13 +160,11 @@  void bnxt_hwmon_uninit(struct bnxt *bp)
 
 void bnxt_hwmon_init(struct bnxt *bp)
 {
-	struct hwrm_temp_monitor_query_input *req;
 	struct pci_dev *pdev = bp->pdev;
 	int rc;
 
-	rc = hwrm_req_init(bp, req, HWRM_TEMP_MONITOR_QUERY);
-	if (!rc)
-		rc = hwrm_req_send_silent(bp, req);
+	/* temp1_xxx is only sensor, ensure not registered if it will fail */
+	rc = bnxt_hwrm_temp_query(bp, NULL);
 	if (rc == -EACCES || rc == -EOPNOTSUPP) {
 		bnxt_hwmon_uninit(bp);
 		return;