diff mbox series

[v3,2/2] nvme: Add physical writes/reads from OCP log

Message ID 20221116171455.3401086-3-j.granados@samsung.com (mailing list archive)
State New, archived
Headers show
Series Add OCP extended log to nvme QEMU | expand

Commit Message

Joel Granados Nov. 16, 2022, 5:14 p.m. UTC
In order to evaluate write amplification factor (WAF) within the storage
stack it is important to know the number of bytes written to the
controller. The existing SMART log value of Data Units Written is too
coarse (given in units of 500 Kb) and so we add the SMART health
information extended from the OCP specification (given in units of bytes)

We add a controller argument (ocp) that toggles on/off the SMART log
extended structure.  To accommodate different vendor specific specifications
like OCP, we add a multiplexing function (nvme_vendor_specific_log) which
will route to the different log functions based on arguments and log ids.
We only return the OCP extended SMART log when the command is 0xC0 and ocp
has been turned on in the args.

Though we add the whole nvme SMART log extended structure, we only populate
the physical_media_units_{read,written}, log_page_version and
log_page_uuid.

Signed-off-by: Joel Granados <j.granados@samsung.com>
---
 docs/system/devices/nvme.rst |  7 +++++
 hw/nvme/ctrl.c               | 55 ++++++++++++++++++++++++++++++++++++
 hw/nvme/nvme.h               |  1 +
 include/block/nvme.h         | 36 +++++++++++++++++++++++
 4 files changed, 99 insertions(+)

Comments

Klaus Jensen Nov. 17, 2022, 7:30 a.m. UTC | #1
On Nov 16 18:14, Joel Granados wrote:
> In order to evaluate write amplification factor (WAF) within the storage
> stack it is important to know the number of bytes written to the
> controller. The existing SMART log value of Data Units Written is too
> coarse (given in units of 500 Kb) and so we add the SMART health
> information extended from the OCP specification (given in units of bytes)
> 
> We add a controller argument (ocp) that toggles on/off the SMART log
> extended structure.  To accommodate different vendor specific specifications
> like OCP, we add a multiplexing function (nvme_vendor_specific_log) which
> will route to the different log functions based on arguments and log ids.
> We only return the OCP extended SMART log when the command is 0xC0 and ocp
> has been turned on in the args.
> 
> Though we add the whole nvme SMART log extended structure, we only populate
> the physical_media_units_{read,written}, log_page_version and
> log_page_uuid.
> 
> Signed-off-by: Joel Granados <j.granados@samsung.com>
> ---
>  docs/system/devices/nvme.rst |  7 +++++
>  hw/nvme/ctrl.c               | 55 ++++++++++++++++++++++++++++++++++++
>  hw/nvme/nvme.h               |  1 +
>  include/block/nvme.h         | 36 +++++++++++++++++++++++
>  4 files changed, 99 insertions(+)
> 
> diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst
> index 30f841ef62..1cc5e52c00 100644
> --- a/docs/system/devices/nvme.rst
> +++ b/docs/system/devices/nvme.rst
> @@ -53,6 +53,13 @@ parameters.
>    Vendor ID. Set this to ``on`` to revert to the unallocated Intel ID
>    previously used.
>  
> +``ocp`` (default: ``off``)
> +  The Open Compute Project defines the Datacenter NVMe SSD Specification that
> +  sits on top of NVMe. It describes additional commands and NVMe behaviors
> +  specific for the Datacenter. When this option is ``on`` OCP features such as
> +  the SMART / Health information extended log become available in the
> +  controller.
> +
>  Additional Namespaces
>  ---------------------
>  
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index bf291f7ffe..c7215a4ed1 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -4455,6 +4455,41 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
>      stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
>  }
>  
> +static uint16_t nvme_ocp_extended_smart_info(NvmeCtrl *n, uint8_t rae,
> +                                             uint32_t buf_len, uint64_t off,
> +                                             NvmeRequest *req)
> +{
> +    NvmeNamespace *ns = NULL;
> +    NvmeSmartLogExtended smart_l = { 0 };
> +    struct nvme_stats stats = { 0 };
> +    uint32_t trans_len;
> +
> +    if (off >= sizeof(smart_l)) {
> +        return NVME_INVALID_FIELD | NVME_DNR;
> +    }
> +
> +    /* accumulate all stats from all namespaces */
> +    for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
> +        ns = nvme_ns(n, i);
> +        if (ns) {
> +            nvme_set_blk_stats(ns, &stats);
> +        }
> +    }
> +
> +    smart_l.physical_media_units_written[0] = cpu_to_le32(stats.units_written);
> +    smart_l.physical_media_units_read[0] = cpu_to_le32(stats.units_read);

These are uint64s, so should be cpu_to_le64().

> +    smart_l.log_page_version = 0x0003;
> +    smart_l.log_page_uuid[0] = 0xA4F2BFEA2810AFC5;
> +    smart_l.log_page_uuid[1] = 0xAFD514C97C6F4F9C;

Technically the field is called the "Log Page GUID", not the UUID.
Perhaps this is a bit of Microsoft leaking in, or it is to differentiate
it from the UUID Index functionality, who knows.

It looks like you byte swapped the two 64 bit parts, but not the
individual bytes. It's super confusing when the spec just says "shall be
set to VALUE". Is that VALUE already in little endian? Sigh.

Anyway, I think it is fair to assume that, so just make
log_page_uuid/guid a uint8_t 16-array and do something like:

	static const uint8_t uuid[16] = {
		0xAF, 0xD5, 0x14, 0xC9, 0x7C, 0x6F, 0x4F, 0x9C,
		0xA4, 0xF2, 0xBF, 0xEA, 0x28, 0x10, 0xAF, 0xC5,
	};

	memcpy(smart_l.log_page_guid, uuid, sizeof(smart_l.log_page_guid));
Joel Granados Nov. 17, 2022, 3:20 p.m. UTC | #2
On Thu, Nov 17, 2022 at 08:30:46AM +0100, Klaus Jensen wrote:
> On Nov 16 18:14, Joel Granados wrote:
> > In order to evaluate write amplification factor (WAF) within the storage
> > stack it is important to know the number of bytes written to the
> > controller. The existing SMART log value of Data Units Written is too
> > coarse (given in units of 500 Kb) and so we add the SMART health
> > information extended from the OCP specification (given in units of bytes)
> > 
> > We add a controller argument (ocp) that toggles on/off the SMART log
> > extended structure.  To accommodate different vendor specific specifications
> > like OCP, we add a multiplexing function (nvme_vendor_specific_log) which
> > will route to the different log functions based on arguments and log ids.
> > We only return the OCP extended SMART log when the command is 0xC0 and ocp
> > has been turned on in the args.
> > 
> > Though we add the whole nvme SMART log extended structure, we only populate
> > the physical_media_units_{read,written}, log_page_version and
> > log_page_uuid.
> > 
> > Signed-off-by: Joel Granados <j.granados@samsung.com>
> > ---
> >  docs/system/devices/nvme.rst |  7 +++++
> >  hw/nvme/ctrl.c               | 55 ++++++++++++++++++++++++++++++++++++
> >  hw/nvme/nvme.h               |  1 +
> >  include/block/nvme.h         | 36 +++++++++++++++++++++++
> >  4 files changed, 99 insertions(+)
> > 
> > diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst
> > index 30f841ef62..1cc5e52c00 100644
> > --- a/docs/system/devices/nvme.rst
> > +++ b/docs/system/devices/nvme.rst
> > @@ -53,6 +53,13 @@ parameters.
> >    Vendor ID. Set this to ``on`` to revert to the unallocated Intel ID
> >    previously used.
> >  
> > +``ocp`` (default: ``off``)
> > +  The Open Compute Project defines the Datacenter NVMe SSD Specification that
> > +  sits on top of NVMe. It describes additional commands and NVMe behaviors
> > +  specific for the Datacenter. When this option is ``on`` OCP features such as
> > +  the SMART / Health information extended log become available in the
> > +  controller.
> > +
> >  Additional Namespaces
> >  ---------------------
> >  
> > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> > index bf291f7ffe..c7215a4ed1 100644
> > --- a/hw/nvme/ctrl.c
> > +++ b/hw/nvme/ctrl.c
> > @@ -4455,6 +4455,41 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
> >      stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
> >  }
> >  
> > +static uint16_t nvme_ocp_extended_smart_info(NvmeCtrl *n, uint8_t rae,
> > +                                             uint32_t buf_len, uint64_t off,
> > +                                             NvmeRequest *req)
> > +{
> > +    NvmeNamespace *ns = NULL;
> > +    NvmeSmartLogExtended smart_l = { 0 };
> > +    struct nvme_stats stats = { 0 };
> > +    uint32_t trans_len;
> > +
> > +    if (off >= sizeof(smart_l)) {
> > +        return NVME_INVALID_FIELD | NVME_DNR;
> > +    }
> > +
> > +    /* accumulate all stats from all namespaces */
> > +    for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
> > +        ns = nvme_ns(n, i);
> > +        if (ns) {
> > +            nvme_set_blk_stats(ns, &stats);
> > +        }
> > +    }
> > +
> > +    smart_l.physical_media_units_written[0] = cpu_to_le32(stats.units_written);
> > +    smart_l.physical_media_units_read[0] = cpu_to_le32(stats.units_read);
> 
> These are uint64s, so should be cpu_to_le64().
Good catch

> 
> > +    smart_l.log_page_version = 0x0003;
> > +    smart_l.log_page_uuid[0] = 0xA4F2BFEA2810AFC5;
> > +    smart_l.log_page_uuid[1] = 0xAFD514C97C6F4F9C;
> 
> Technically the field is called the "Log Page GUID", not the UUID.
Yep, that was all me. My brain just automatically completed UUID. Sorry
about that.
> Perhaps this is a bit of Microsoft leaking in, or it is to differentiate
> it from the UUID Index functionality, who knows.
> 
> It looks like you byte swapped the two 64 bit parts, but not the
> individual bytes. It's super confusing when the spec just says "shall be
Individual bytes are also swapped. I actually tested this with nvme cli
and it successfully checks these 128 bytes. I got inspired by nvme-cli
(plugins/ocp/ocp-nvme.c) where the opc number appears.

> set to VALUE". Is that VALUE already in little endian? Sigh.
The spec says "All values in logs shall be little endian format". Which
still does not say if the value that appears in the pdf is in little or
big endian. Yes a bit confusing, see my final comment

> 
> Anyway, I think it is fair to assume that, so just make
> log_page_uuid/guid a uint8_t 16-array and do something like:
> 
> 	static const uint8_t uuid[16] = {
> 		0xAF, 0xD5, 0x14, 0xC9, 0x7C, 0x6F, 0x4F, 0x9C,
> 		0xA4, 0xF2, 0xBF, 0xEA, 0x28, 0x10, 0xAF, 0xC5,
> 	};

If I use this order the nvme-cli command will fail.
The order should be this one to be consistent with nvme-cli:

{
0xC5, 0xAF, 0x10, 0x28, 0xEA, 0xBF, 0xF2, 0xA4,
0x9C, 0x4F, 0x6F, 0x7C, 0xC9, 0x14, 0xD5, 0xAF
}

This means that nvme-cli interpreted the value that appeared on the spec
pdf as big endian and then changed it to little endian following the
spec. Another thing that could have been done is to interpret what appears in
the PDF as little endian just push it through without swapping it.
Is there something in the spec that can give clarity as to what endianess
the values in the pdf should be by default?

I see two options here:
1. We go with the interpretation of nvme-cli (big endian in pdf, little
   endian in code)
2. or we change nvme-cli (little endian in pdf, little endian in code)

What do you think?


> 
> 	memcpy(smart_l.log_page_guid, uuid, sizeof(smart_l.log_page_guid));
diff mbox series

Patch

diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst
index 30f841ef62..1cc5e52c00 100644
--- a/docs/system/devices/nvme.rst
+++ b/docs/system/devices/nvme.rst
@@ -53,6 +53,13 @@  parameters.
   Vendor ID. Set this to ``on`` to revert to the unallocated Intel ID
   previously used.
 
+``ocp`` (default: ``off``)
+  The Open Compute Project defines the Datacenter NVMe SSD Specification that
+  sits on top of NVMe. It describes additional commands and NVMe behaviors
+  specific for the Datacenter. When this option is ``on`` OCP features such as
+  the SMART / Health information extended log become available in the
+  controller.
+
 Additional Namespaces
 ---------------------
 
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index bf291f7ffe..c7215a4ed1 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4455,6 +4455,41 @@  static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
     stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
 }
 
+static uint16_t nvme_ocp_extended_smart_info(NvmeCtrl *n, uint8_t rae,
+                                             uint32_t buf_len, uint64_t off,
+                                             NvmeRequest *req)
+{
+    NvmeNamespace *ns = NULL;
+    NvmeSmartLogExtended smart_l = { 0 };
+    struct nvme_stats stats = { 0 };
+    uint32_t trans_len;
+
+    if (off >= sizeof(smart_l)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    /* accumulate all stats from all namespaces */
+    for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
+        ns = nvme_ns(n, i);
+        if (ns) {
+            nvme_set_blk_stats(ns, &stats);
+        }
+    }
+
+    smart_l.physical_media_units_written[0] = cpu_to_le32(stats.units_written);
+    smart_l.physical_media_units_read[0] = cpu_to_le32(stats.units_read);
+    smart_l.log_page_version = 0x0003;
+    smart_l.log_page_uuid[0] = 0xA4F2BFEA2810AFC5;
+    smart_l.log_page_uuid[1] = 0xAFD514C97C6F4F9C;
+
+    if (!rae) {
+        nvme_clear_events(n, NVME_AER_TYPE_SMART);
+    }
+
+    trans_len = MIN(sizeof(smart_l) - off, buf_len);
+    return nvme_c2h(n, (uint8_t *) &smart_l + off, trans_len, req);
+}
+
 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                 uint64_t off, NvmeRequest *req)
 {
@@ -4642,6 +4677,23 @@  static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
     return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
 }
 
+static uint16_t nvme_vendor_specific_log(NvmeCtrl *n, uint8_t rae,
+                                         uint32_t buf_len, uint64_t off,
+                                         NvmeRequest *req, uint8_t lid)
+{
+    switch (lid) {
+    case 0xc0:
+        if (n->params.ocp) {
+            return nvme_ocp_extended_smart_info(n, rae, buf_len, off, req);
+        }
+        break;
+        /* add a case for each additional vendor specific log id */
+    }
+
+    trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
+    return NVME_INVALID_FIELD | NVME_DNR;
+}
+
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeCmd *cmd = &req->cmd;
@@ -4683,6 +4735,8 @@  static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
         return nvme_error_info(n, rae, len, off, req);
     case NVME_LOG_SMART_INFO:
         return nvme_smart_info(n, rae, len, off, req);
+    case NVME_LOG_VENDOR_START...NVME_LOG_VENDOR_END:
+        return nvme_vendor_specific_log(n, rae, len, off, req, lid);
     case NVME_LOG_FW_SLOT_INFO:
         return nvme_fw_log_info(n, len, off, req);
     case NVME_LOG_CHANGED_NSLIST:
@@ -7685,6 +7739,7 @@  static Property nvme_props[] = {
                       params.sriov_max_vi_per_vf, 0),
     DEFINE_PROP_UINT8("sriov_max_vq_per_vf", NvmeCtrl,
                       params.sriov_max_vq_per_vf, 0),
+    DEFINE_PROP_BOOL("ocp", NvmeCtrl, params.ocp, false),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 79f5c281c2..d7f486f795 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -427,6 +427,7 @@  typedef struct NvmeParams {
     uint16_t sriov_vi_flexible;
     uint8_t  sriov_max_vq_per_vf;
     uint8_t  sriov_max_vi_per_vf;
+    bool     ocp;
 } NvmeParams;
 
 typedef struct NvmeCtrl {
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 8027b7126b..a9041604d9 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -978,6 +978,40 @@  typedef struct QEMU_PACKED NvmeSmartLog {
     uint8_t     reserved2[320];
 } NvmeSmartLog;
 
+typedef struct QEMU_PACKED NvmeSmartLogExtended {
+    uint64_t    physical_media_units_written[2];
+    uint64_t    physical_media_units_read[2];
+    uint64_t    bad_user_blocks;
+    uint64_t    bad_system_nand_blocks;
+    uint64_t    xor_recovery_count;
+    uint64_t    uncorrectable_read_error_count;
+    uint64_t    soft_ecc_error_count;
+    uint64_t    end2end_correction_counts;
+    uint8_t     system_data_percent_used;
+    uint8_t     refresh_counts[7];
+    uint64_t    user_data_erase_counts;
+    uint16_t    thermal_throttling_stat_and_count;
+    uint16_t    dssd_spec_version[3];
+    uint64_t    pcie_correctable_error_count;
+    uint32_t    incomplete_shutdowns;
+    uint32_t    rsvd116;
+    uint8_t     percent_free_blocks;
+    uint8_t     rsvd121[7];
+    uint16_t    capacity_health;
+    uint8_t     nvme_errata_ver;
+    uint8_t     rsvd131[5];
+    uint64_t    unaligned_io;
+    uint64_t    security_ver_num;
+    uint64_t    total_nuse;
+    uint64_t    plp_start_count[2];
+    uint64_t    endurance_estimate[2];
+    uint64_t    pcie_retraining_count;
+    uint64_t    power_state_change_count;
+    uint8_t     rsvd208[286];
+    uint16_t    log_page_version;
+    uint64_t    log_page_uuid[2];
+} NvmeSmartLogExtended;
+
 #define NVME_SMART_WARN_MAX     6
 enum NvmeSmartWarn {
     NVME_SMART_SPARE                  = 1 << 0,
@@ -1010,6 +1044,8 @@  enum NvmeLogIdentifier {
     NVME_LOG_FW_SLOT_INFO   = 0x03,
     NVME_LOG_CHANGED_NSLIST = 0x04,
     NVME_LOG_CMD_EFFECTS    = 0x05,
+    NVME_LOG_VENDOR_START   = 0xc0,
+    NVME_LOG_VENDOR_END     = 0xff,
 };
 
 typedef struct QEMU_PACKED NvmePSD {