diff mbox series

[v3,3/4] hw/block/nvme: add smart_critical_warning property

Message ID 20210114072251.334304-4-pizhenwei@bytedance.com (mailing list archive)
State New, archived
Headers show
Series support NVMe smart critical warning injection | expand

Commit Message

zhenwei pi Jan. 14, 2021, 7:22 a.m. UTC
Physical NVMe disks very rarely hit a hardware critical warning
condition, which makes it hard to write & test a monitor agent
service.

For debugging purposes, add a new 'smart_critical_warning' property
to emulate this situation.

The original version of this change was implemented by adding a fixed
property which could be initialized from the QEMU command line. As
suggested by Philippe & Klaus, it was reworked into the current version.

Test with this patch:
1, change smart_critical_warning property for a running VM:
 #virsh qemu-monitor-command nvme-upstream '{ "execute": "qom-set",
  "arguments": { "path": "/machine/peripheral-anon/device[0]",
  "property": "smart_critical_warning", "value":16 } }'
2, run smartctl in guest
 #smartctl -H -l error /dev/nvme0n1

  === START OF SMART DATA SECTION ===
  SMART overall-health self-assessment test result: FAILED!
  - volatile memory backup device has failed

Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
---
 hw/block/nvme.c | 40 ++++++++++++++++++++++++++++++++++++++++
 hw/block/nvme.h |  1 +
 2 files changed, 41 insertions(+)

Comments

Klaus Jensen Jan. 14, 2021, 8:29 a.m. UTC | #1
On Jan 14 15:22, zhenwei pi wrote:
> There is a very low probability that hitting physical NVMe disk
> hardware critical warning case, it's hard to write & test a monitor
> agent service.
> 
> For debugging purposes, add a new 'smart_critical_warning' property
> to emulate this situation.
> 
> The orignal version of this change is implemented by adding a fixed
> property which could be initialized by QEMU command line. Suggested
> by Philippe & Klaus, rework like current version.
> 
> Test with this patch:
> 1, change smart_critical_warning property for a running VM:
>  #virsh qemu-monitor-command nvme-upstream '{ "execute": "qom-set",
>   "arguments": { "path": "/machine/peripheral-anon/device[0]",
>   "property": "smart_critical_warning", "value":16 } }'
> 2, run smartctl in guest
>  #smartctl -H -l error /dev/nvme0n1
> 
>   === START OF SMART DATA SECTION ===
>   SMART overall-health self-assessment test result: FAILED!
>   - volatile memory backup device has failed
> 
> Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
> ---
>  hw/block/nvme.c | 40 ++++++++++++++++++++++++++++++++++++++++
>  hw/block/nvme.h |  1 +
>  2 files changed, 41 insertions(+)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index f361103bb4..ce9a9c9023 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1214,6 +1214,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
>      }
>  
>      trans_len = MIN(sizeof(smart) - off, buf_len);
> +    smart.critical_warning = n->smart_critical_warning;
>  
>      smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
>                                                          1000));
> @@ -2826,6 +2827,41 @@ static Property nvme_props[] = {
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> +
> +static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
> +                                   void *opaque, Error **errp)
> +{
> +    NvmeCtrl *s = NVME(obj);
> +    uint8_t value = s->smart_critical_warning;
> +
> +    visit_type_uint8(v, name, &value, errp);
> +}
> +
> +static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
> +                                   void *opaque, Error **errp)
> +{
> +    NvmeCtrl *s = NVME(obj);
> +    uint8_t value, cap = 0;
> +    uint64_t pmr_cap = CAP_PMR_MASK;
> +
> +    if (!visit_type_uint8(v, name, &value, errp)) {
> +        return;
> +    }
> +
> +    cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
> +          | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
> +    if (s->bar.cap & (pmr_cap << CAP_PMR_SHIFT)) {
> +        cap |= NVME_SMART_PMR_UNRELIABLE;
> +    }

Looks like an NVME_CAP_PMRS(cap) macro is missing in
include/block/nvme.h. I have added it in another PMR series under
review, but you can add it here as well instead of manually doing the
shift and check.

> +
> +    if ((value & cap) != value) {
> +        error_setg(errp, "unsupported smart critical warning value");
> +        return;
> +    }
> +
> +    s->smart_critical_warning = value;
> +}
> +
>  static const VMStateDescription nvme_vmstate = {
>      .name = "nvme",
>      .unmigratable = 1,
> @@ -2856,6 +2892,10 @@ static void nvme_instance_init(Object *obj)
>                                        "bootindex", "/namespace@1,0",
>                                        DEVICE(obj));
>      }
> +
> +    object_property_add(obj, "smart_critical_warning", "uint8",
> +                        nvme_get_smart_warning,
> +                        nvme_set_smart_warning, NULL, NULL);
>  }
>  
>  static const TypeInfo nvme_info = {
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index e080a2318a..64e3497244 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -139,6 +139,7 @@ typedef struct NvmeCtrl {
>      uint64_t    timestamp_set_qemu_clock_ms;    /* QEMU clock time */
>      uint64_t    starttime_ms;
>      uint16_t    temperature;
> +    uint8_t     smart_critical_warning;
>  
>      HostMemoryBackend *pmrdev;
>  
> -- 
> 2.25.1
> 
>
Philippe Mathieu-Daudé Jan. 14, 2021, 3:55 p.m. UTC | #2
On 1/14/21 8:22 AM, zhenwei pi wrote:
> There is a very low probability that hitting physical NVMe disk
> hardware critical warning case, it's hard to write & test a monitor
> agent service.
> 
> For debugging purposes, add a new 'smart_critical_warning' property
> to emulate this situation.
> 
> The orignal version of this change is implemented by adding a fixed
> property which could be initialized by QEMU command line. Suggested
> by Philippe & Klaus, rework like current version.
> 
> Test with this patch:
> 1, change smart_critical_warning property for a running VM:
>  #virsh qemu-monitor-command nvme-upstream '{ "execute": "qom-set",
>   "arguments": { "path": "/machine/peripheral-anon/device[0]",
>   "property": "smart_critical_warning", "value":16 } }'
> 2, run smartctl in guest
>  #smartctl -H -l error /dev/nvme0n1
> 
>   === START OF SMART DATA SECTION ===
>   SMART overall-health self-assessment test result: FAILED!
>   - volatile memory backup device has failed
> 
> Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
> ---
>  hw/block/nvme.c | 40 ++++++++++++++++++++++++++++++++++++++++
>  hw/block/nvme.h |  1 +
>  2 files changed, 41 insertions(+)
...

> +static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
> +                                   void *opaque, Error **errp)
> +{
> +    NvmeCtrl *s = NVME(obj);
> +    uint8_t value, cap = 0;
> +    uint64_t pmr_cap = CAP_PMR_MASK;
> +
> +    if (!visit_type_uint8(v, name, &value, errp)) {
> +        return;
> +    }
> +
> +    cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
> +          | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
> +    if (s->bar.cap & (pmr_cap << CAP_PMR_SHIFT)) {
> +        cap |= NVME_SMART_PMR_UNRELIABLE;
> +    }
> +
> +    if ((value & cap) != value) {
> +        error_setg(errp, "unsupported smart critical warning value");

More useful:

           error_setg(errp,
                      "unsupported smart critical warning bits: 0x%x",
                      value & ~cap);

Regardless:
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>

Thanks!
Keith Busch Jan. 14, 2021, 10:23 p.m. UTC | #3
On Thu, Jan 14, 2021 at 03:22:50PM +0800, zhenwei pi wrote:
> +static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
> +                                   void *opaque, Error **errp)
> +{
> +    NvmeCtrl *s = NVME(obj);

With only one exception, all variables of type 'NvmeCtrl' in this
program are called 'n', so let's keep that consistency please.
Otherwise, this looks fine.

> +    uint8_t value = s->smart_critical_warning;
> +
> +    visit_type_uint8(v, name, &value, errp);
> +}
> +
> +static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
> +                                   void *opaque, Error **errp)
> +{
> +    NvmeCtrl *s = NVME(obj);
diff mbox series

Patch

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f361103bb4..ce9a9c9023 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1214,6 +1214,7 @@  static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
     }
 
     trans_len = MIN(sizeof(smart) - off, buf_len);
+    smart.critical_warning = n->smart_critical_warning;
 
     smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
                                                         1000));
@@ -2826,6 +2827,41 @@  static Property nvme_props[] = {
     DEFINE_PROP_END_OF_LIST(),
 };
 
+
+static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
+                                   void *opaque, Error **errp)
+{
+    NvmeCtrl *s = NVME(obj);
+    uint8_t value = s->smart_critical_warning;
+
+    visit_type_uint8(v, name, &value, errp);
+}
+
+static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
+                                   void *opaque, Error **errp)
+{
+    NvmeCtrl *s = NVME(obj);
+    uint8_t value, cap = 0;
+    uint64_t pmr_cap = CAP_PMR_MASK;
+
+    if (!visit_type_uint8(v, name, &value, errp)) {
+        return;
+    }
+
+    cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
+          | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
+    if (s->bar.cap & (pmr_cap << CAP_PMR_SHIFT)) {
+        cap |= NVME_SMART_PMR_UNRELIABLE;
+    }
+
+    if ((value & cap) != value) {
+        error_setg(errp, "unsupported smart critical warning value");
+        return;
+    }
+
+    s->smart_critical_warning = value;
+}
+
 static const VMStateDescription nvme_vmstate = {
     .name = "nvme",
     .unmigratable = 1,
@@ -2856,6 +2892,10 @@  static void nvme_instance_init(Object *obj)
                                       "bootindex", "/namespace@1,0",
                                       DEVICE(obj));
     }
+
+    object_property_add(obj, "smart_critical_warning", "uint8",
+                        nvme_get_smart_warning,
+                        nvme_set_smart_warning, NULL, NULL);
 }
 
 static const TypeInfo nvme_info = {
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index e080a2318a..64e3497244 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -139,6 +139,7 @@  typedef struct NvmeCtrl {
     uint64_t    timestamp_set_qemu_clock_ms;    /* QEMU clock time */
     uint64_t    starttime_ms;
     uint16_t    temperature;
+    uint8_t     smart_critical_warning;
 
     HostMemoryBackend *pmrdev;