diff mbox series

[ndctl,2/2,v2,RESEND] libndctl/msft: Improve "smart" state reporting

Message ID 20230113172732.1122643-2-mav@ixsystems.com (mailing list archive)
State Accepted
Commit a88bdcfb4202c73aadfee6f83c5502eb5121cbd9
Headers show
Series [ndctl,1/2,v2,RESEND] libndctl/msft: Cleanup the code | expand

Commit Message

Alexander Motin Jan. 13, 2023, 5:27 p.m. UTC
Previous code reported "smart" state based on number of bits
set in the module health field.  But actually any single bit
set there already means critical failure.  Rework the logic
according to specification, properly reporting non-critical
state in case of warning threshold reached, critical in case
of any module health bit set or error threshold reached and
fatal if NVDIMM exhausted its life time.  In attempt to
report the cause of failure in absence of better methods,
report reached thresholds as more or less matching alarms.

Signed-off-by: Alexander Motin <mav@ixsystems.com>
---
 ndctl/lib/msft.c | 55 ++++++++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 25 deletions(-)

Comments

Verma, Vishal L Feb. 15, 2023, 5:06 a.m. UTC | #1
[Dropping Lijun from CC due to a bounce]

On Fri, 2023-01-13 at 12:27 -0500, Alexander Motin wrote:
> Previous code reported "smart" state based on number of bits
> set in the module health field.  But actually any single bit
> set there already means critical failure.  Rework the logic
> according to specification, properly reporting non-critical
> state in case of warning threshold reached, critical in case
> of any module health bit set or error threshold reached and
> fatal if NVDIMM exhausted its life time.  In attempt to
> report the cause of failure in absence of better methods,
> report reached thresholds as more or less matching alarms.

This looks like unnecessarily aggressive wrapping. Commit messages are
typically wrapped at 72 chars.

Other than that this looks fine.

> 
> Signed-off-by: Alexander Motin <mav@ixsystems.com>
> ---
>  ndctl/lib/msft.c | 55 ++++++++++++++++++++++++++----------------------
>  1 file changed, 30 insertions(+), 25 deletions(-)
> 
> diff --git a/ndctl/lib/msft.c b/ndctl/lib/msft.c
> index efc7fe1..8f66c97 100644
> --- a/ndctl/lib/msft.c
> +++ b/ndctl/lib/msft.c
> @@ -114,26 +114,32 @@ static unsigned int msft_cmd_smart_get_flags(struct ndctl_cmd *cmd)
>  
>         /* below health data can be retrieved via MSFT _DSM function 11 */
>         return ND_SMART_HEALTH_VALID | ND_SMART_TEMP_VALID |
> -           ND_SMART_USED_VALID;
> +           ND_SMART_USED_VALID | ND_SMART_ALARM_VALID;
>  }
>  
> -static unsigned int num_set_bit_health(__u16 num)
> +static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
>  {
> -       int i;
> -       __u16 n = num & 0x7FFF;
> -       unsigned int count = 0;
> +       unsigned int health = 0;
> +       int rc;
>  
> -       for (i = 0; i < 15; i++)
> -               if (!!(n & (1 << i)))
> -                       count++;
> +       rc = msft_smart_valid(cmd);
> +       if (rc < 0) {
> +               errno = -rc;
> +               return UINT_MAX;
> +       }
>  
> -       return count;
> +       if (CMD_MSFT_SMART(cmd)->nvm_lifetime == 0)
> +               health |= ND_SMART_FATAL_HEALTH;
> +       if (CMD_MSFT_SMART(cmd)->health != 0 ||
> +           CMD_MSFT_SMART(cmd)->err_thresh_stat != 0)
> +               health |= ND_SMART_CRITICAL_HEALTH;
> +       if (CMD_MSFT_SMART(cmd)->warn_thresh_stat != 0)
> +               health |= ND_SMART_NON_CRITICAL_HEALTH;
> +       return health;
>  }
>  
> -static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
> +static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
>  {
> -       unsigned int health;
> -       unsigned int num;
>         int rc;
>  
>         rc = msft_smart_valid(cmd);
> @@ -142,21 +148,13 @@ static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
>                 return UINT_MAX;
>         }
>  
> -       num = num_set_bit_health(CMD_MSFT_SMART(cmd)->health);
> -       if (num == 0)
> -               health = 0;
> -       else if (num < 2)
> -               health = ND_SMART_NON_CRITICAL_HEALTH;
> -       else if (num < 3)
> -               health = ND_SMART_CRITICAL_HEALTH;
> -       else
> -               health = ND_SMART_FATAL_HEALTH;
> -
> -       return health;
> +       return CMD_MSFT_SMART(cmd)->temp * 16;
>  }
>  
> -static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
> +static unsigned int msft_cmd_smart_get_alarm_flags(struct ndctl_cmd *cmd)
>  {
> +       __u8 stat;
> +       unsigned int flags = 0;
>         int rc;
>  
>         rc = msft_smart_valid(cmd);
> @@ -165,7 +163,13 @@ static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
>                 return UINT_MAX;
>         }
>  
> -       return CMD_MSFT_SMART(cmd)->temp * 16;
> +       stat = CMD_MSFT_SMART(cmd)->err_thresh_stat |
> +           CMD_MSFT_SMART(cmd)->warn_thresh_stat;
> +       if (stat & 3) /* NVM_LIFETIME/ES_LIFETIME */
> +               flags |= ND_SMART_SPARE_TRIP;
> +       if (stat & 4) /* ES_TEMP */
> +               flags |= ND_SMART_CTEMP_TRIP;
> +       return flags;
>  }
>  
>  static unsigned int msft_cmd_smart_get_life_used(struct ndctl_cmd *cmd)
> @@ -209,6 +213,7 @@ struct ndctl_dimm_ops * const msft_dimm_ops = &(struct ndctl_dimm_ops) {
>         .smart_get_flags = msft_cmd_smart_get_flags,
>         .smart_get_health = msft_cmd_smart_get_health,
>         .smart_get_media_temperature = msft_cmd_smart_get_media_temperature,
> +       .smart_get_alarm_flags = msft_cmd_smart_get_alarm_flags,
>         .smart_get_life_used = msft_cmd_smart_get_life_used,
>         .xlat_firmware_status = msft_cmd_xlat_firmware_status,
>  };
diff mbox series

Patch

diff --git a/ndctl/lib/msft.c b/ndctl/lib/msft.c
index efc7fe1..8f66c97 100644
--- a/ndctl/lib/msft.c
+++ b/ndctl/lib/msft.c
@@ -114,26 +114,32 @@  static unsigned int msft_cmd_smart_get_flags(struct ndctl_cmd *cmd)
 
 	/* below health data can be retrieved via MSFT _DSM function 11 */
 	return ND_SMART_HEALTH_VALID | ND_SMART_TEMP_VALID |
-	    ND_SMART_USED_VALID;
+	    ND_SMART_USED_VALID | ND_SMART_ALARM_VALID;
 }
 
-static unsigned int num_set_bit_health(__u16 num)
+static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
 {
-	int i;
-	__u16 n = num & 0x7FFF;
-	unsigned int count = 0;
+	unsigned int health = 0;
+	int rc;
 
-	for (i = 0; i < 15; i++)
-		if (!!(n & (1 << i)))
-			count++;
+	rc = msft_smart_valid(cmd);
+	if (rc < 0) {
+		errno = -rc;
+		return UINT_MAX;
+	}
 
-	return count;
+	if (CMD_MSFT_SMART(cmd)->nvm_lifetime == 0)
+		health |= ND_SMART_FATAL_HEALTH;
+	if (CMD_MSFT_SMART(cmd)->health != 0 ||
+	    CMD_MSFT_SMART(cmd)->err_thresh_stat != 0)
+		health |= ND_SMART_CRITICAL_HEALTH;
+	if (CMD_MSFT_SMART(cmd)->warn_thresh_stat != 0)
+		health |= ND_SMART_NON_CRITICAL_HEALTH;
+	return health;
 }
 
-static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
+static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
 {
-	unsigned int health;
-	unsigned int num;
 	int rc;
 
 	rc = msft_smart_valid(cmd);
@@ -142,21 +148,13 @@  static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
 		return UINT_MAX;
 	}
 
-	num = num_set_bit_health(CMD_MSFT_SMART(cmd)->health);
-	if (num == 0)
-		health = 0;
-	else if (num < 2)
-		health = ND_SMART_NON_CRITICAL_HEALTH;
-	else if (num < 3)
-		health = ND_SMART_CRITICAL_HEALTH;
-	else
-		health = ND_SMART_FATAL_HEALTH;
-
-	return health;
+	return CMD_MSFT_SMART(cmd)->temp * 16;
 }
 
-static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
+static unsigned int msft_cmd_smart_get_alarm_flags(struct ndctl_cmd *cmd)
 {
+	__u8 stat;
+	unsigned int flags = 0;
 	int rc;
 
 	rc = msft_smart_valid(cmd);
@@ -165,7 +163,13 @@  static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
 		return UINT_MAX;
 	}
 
-	return CMD_MSFT_SMART(cmd)->temp * 16;
+	stat = CMD_MSFT_SMART(cmd)->err_thresh_stat |
+	    CMD_MSFT_SMART(cmd)->warn_thresh_stat;
+	if (stat & 3) /* NVM_LIFETIME/ES_LIFETIME */
+		flags |= ND_SMART_SPARE_TRIP;
+	if (stat & 4) /* ES_TEMP */
+		flags |= ND_SMART_CTEMP_TRIP;
+	return flags;
 }
 
 static unsigned int msft_cmd_smart_get_life_used(struct ndctl_cmd *cmd)
@@ -209,6 +213,7 @@  struct ndctl_dimm_ops * const msft_dimm_ops = &(struct ndctl_dimm_ops) {
 	.smart_get_flags = msft_cmd_smart_get_flags,
 	.smart_get_health = msft_cmd_smart_get_health,
 	.smart_get_media_temperature = msft_cmd_smart_get_media_temperature,
+	.smart_get_alarm_flags = msft_cmd_smart_get_alarm_flags,
 	.smart_get_life_used = msft_cmd_smart_get_life_used,
 	.xlat_firmware_status = msft_cmd_xlat_firmware_status,
 };