diff mbox series

[ndctl,4/5] cxl/list: add --media-errors option to cxl list

Message ID 762edeab529125d3048cf13721360b1a07260531.1668133294.git.alison.schofield@intel.com
State New, archived
Headers show
Series Support poison list retrieval | expand

Commit Message

Alison Schofield Nov. 11, 2022, 3:20 a.m. UTC
From: Alison Schofield <alison.schofield@intel.com>

The --media-errors option to 'cxl list' retrieves poison lists
from memory devices (supporting the capability) and displays
the returned media-error records in the cxl list json. This
option can apply to memdevs or regions.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
---
 Documentation/cxl/cxl-list.txt | 64 ++++++++++++++++++++++++++++++++++
 cxl/filter.c                   |  2 ++
 cxl/filter.h                   |  1 +
 cxl/list.c                     |  2 ++
 4 files changed, 69 insertions(+)

Comments

Jonathan Cameron Nov. 16, 2022, 1:03 p.m. UTC | #1
On Thu, 10 Nov 2022 19:20:07 -0800
alison.schofield@intel.com wrote:

> From: Alison Schofield <alison.schofield@intel.com>
> 
> The --media-errors option to 'cxl list' retrieves poison lists
> from memory devices (supporting the capability) and displays
> the returned media-error records in the cxl list json. This
> option can apply to memdevs or regions.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
>  Documentation/cxl/cxl-list.txt | 64 ++++++++++++++++++++++++++++++++++
>  cxl/filter.c                   |  2 ++
>  cxl/filter.h                   |  1 +
>  cxl/list.c                     |  2 ++
>  4 files changed, 69 insertions(+)
> 
> diff --git a/Documentation/cxl/cxl-list.txt b/Documentation/cxl/cxl-list.txt
> index 14a2b4bb5c2a..24a0cf97cef2 100644
> --- a/Documentation/cxl/cxl-list.txt
> +++ b/Documentation/cxl/cxl-list.txt
> @@ -344,6 +344,70 @@ OPTIONS
>  --region::
>  	Specify CXL region device name(s), or device id(s), to filter the listing.
>  
> +-a::
> +--media-errors::
> +	Include media-error information. The poison list is retrieved
> +	from the device(s) and media error records are added to the
> +	listing. This option applies to memdevs and regions where
> +	devices support the poison list capability.

I'm not sure media errors is a good name.  The poison doesn't have to originate
in the device.  Given we are logging poison with "external" as the source
those definitely don't come from the device and may have nothing to do
with 'media' as such.

Why not just call it poison?

> +
> +----
> +# cxl list -m mem11 --media-errors
> +[
> +  {
> +    "memdev":"mem11",
> +    "pmem_size":268435456,
> +    "ram_size":0,
> +    "serial":0,
> +    "host":"0000:37:00.0",
> +    "media_errors":{
> +      "nr_media_errors":1,
> +      "media_error_records":[
> +        {
> +          "dpa":0,
> +          "length":64,
> +          "source":"Internal",
> +          "flags":"",
> +          "overflow_time":0
> +        }
> +      ]
> +    }
> +  }
> +]
> +# cxl list -r region5 --media-errors
> +[
> +  {
> +    "region":"region5",
> +    "resource":1035623989248,
> +    "size":2147483648,
> +    "interleave_ways":2,
> +    "interleave_granularity":4096,
> +    "decode_state":"commit",
> +    "media_errors":{
> +      "nr_media_errors":2,
> +      "media_error_records":[
> +        {
> +          "memdev":"mem2",
> +          "dpa":0,
> +          "length":64,
> +          "source":"Internal",
> +          "flags":"",
> +          "overflow_time":0
> +        },
> +        {
> +          "memdev":"mem5",
> +          "dpa":0,
> +          "length":512,
> +          "source":"Vendor",
> +          "flags":"",
> +          "overflow_time":0
> +        }
> +      ]
> +    }
> +  }
> +]
> +----
> +
>  -v::
>  --verbose::
>  	Increase verbosity of the output. This can be specified
> diff --git a/cxl/filter.c b/cxl/filter.c
> index 56c659965891..fe6c29148fb4 100644
> --- a/cxl/filter.c
> +++ b/cxl/filter.c
> @@ -686,6 +686,8 @@ static unsigned long params_to_flags(struct cxl_filter_params *param)
>  		flags |= UTIL_JSON_TARGETS;
>  	if (param->partition)
>  		flags |= UTIL_JSON_PARTITION;
> +	if (param->media_errors)
> +		flags |= UTIL_JSON_MEDIA_ERRORS;
>  	return flags;
>  }
>  
> diff --git a/cxl/filter.h b/cxl/filter.h
> index 256df49c3d0c..a92295fe2511 100644
> --- a/cxl/filter.h
> +++ b/cxl/filter.h
> @@ -26,6 +26,7 @@ struct cxl_filter_params {
>  	bool human;
>  	bool health;
>  	bool partition;
> +	bool media_errors;
>  	int verbose;
>  	struct log_ctx ctx;
>  };
> diff --git a/cxl/list.c b/cxl/list.c
> index 8c48fbbaaec3..df2ae5a3fec0 100644
> --- a/cxl/list.c
> +++ b/cxl/list.c
> @@ -52,6 +52,8 @@ static const struct option options[] = {
>  		    "include memory device health information"),
>  	OPT_BOOLEAN('I', "partition", &param.partition,
>  		    "include memory device partition information"),
> +	OPT_BOOLEAN('a', "media-errors", &param.media_errors,
> +		    "include media error information "),
>  	OPT_INCR('v', "verbose", &param.verbose,
>  		 "increase output detail"),
>  #ifdef ENABLE_DEBUG
Alison Schofield Nov. 17, 2022, 11:42 p.m. UTC | #2
On Wed, Nov 16, 2022 at 01:03:45PM +0000, Jonathan Cameron wrote:
> On Thu, 10 Nov 2022 19:20:07 -0800
> alison.schofield@intel.com wrote:
> 
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > The --media-errors option to 'cxl list' retrieves poison lists
> > from memory devices (supporting the capability) and displays
> > the returned media-error records in the cxl list json. This
> > option can apply to memdevs or regions.
> > 
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > ---
> >  Documentation/cxl/cxl-list.txt | 64 ++++++++++++++++++++++++++++++++++
> >  cxl/filter.c                   |  2 ++
> >  cxl/filter.h                   |  1 +
> >  cxl/list.c                     |  2 ++
> >  4 files changed, 69 insertions(+)
> > 
> > diff --git a/Documentation/cxl/cxl-list.txt b/Documentation/cxl/cxl-list.txt
> > index 14a2b4bb5c2a..24a0cf97cef2 100644
> > --- a/Documentation/cxl/cxl-list.txt
> > +++ b/Documentation/cxl/cxl-list.txt
> > @@ -344,6 +344,70 @@ OPTIONS
> >  --region::
> >  	Specify CXL region device name(s), or device id(s), to filter the listing.
> >  
> > +-a::
> > +--media-errors::
> > +	Include media-error information. The poison list is retrieved
> > +	from the device(s) and media error records are added to the
> > +	listing. This option applies to memdevs and regions where
> > +	devices support the poison list capability.
> 
> I'm not sure media errors is a good name.  The poison doesn't have to originate
> in the device.  Given we are logging poison with "external" as the source
> those definitely don't come from the device and may have nothing to do
> with 'media' as such.
> 
> Why not just call it poison?
> 
--media-errors probably originated from ndctl tool which used
that same option name, but it fits in with the CXL Spec language.

The CXL Spec calls the records returned from the 'Get Poison List'
command Media Error Records. It refers to poison as media errors.
So, here, in a command that lists things - the thing(s) being listed
is(are) 'media error record(s)'. 

I see what you're saying about 'External' source. Does that mean
an 'External' source caused an actual media error?

So, that 'Why not poison?' answer. I'm easily swayed either way.
Would you suggest:
> > +
> > +----
> > +# cxl list -m mem11 --media-errors

cxl list -m mem1 --poison

> > +    "media_errors":{
> > +      "nr_media_errors":1,
> > +      "media_error_records":[

and rename the fields above:
	"poison_errors"
	"nr_poison_errors"
	"poison_error_records"
Jonathan Cameron Nov. 21, 2022, 10:57 a.m. UTC | #3
On Thu, 17 Nov 2022 15:42:43 -0800
Alison Schofield <alison.schofield@intel.com> wrote:

> On Wed, Nov 16, 2022 at 01:03:45PM +0000, Jonathan Cameron wrote:
> > On Thu, 10 Nov 2022 19:20:07 -0800
> > alison.schofield@intel.com wrote:
> >   
> > > From: Alison Schofield <alison.schofield@intel.com>
> > > 
> > > The --media-errors option to 'cxl list' retrieves poison lists
> > > from memory devices (supporting the capability) and displays
> > > the returned media-error records in the cxl list json. This
> > > option can apply to memdevs or regions.
> > > 
> > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > > ---
> > >  Documentation/cxl/cxl-list.txt | 64 ++++++++++++++++++++++++++++++++++
> > >  cxl/filter.c                   |  2 ++
> > >  cxl/filter.h                   |  1 +
> > >  cxl/list.c                     |  2 ++
> > >  4 files changed, 69 insertions(+)
> > > 
> > > diff --git a/Documentation/cxl/cxl-list.txt b/Documentation/cxl/cxl-list.txt
> > > index 14a2b4bb5c2a..24a0cf97cef2 100644
> > > --- a/Documentation/cxl/cxl-list.txt
> > > +++ b/Documentation/cxl/cxl-list.txt
> > > @@ -344,6 +344,70 @@ OPTIONS
> > >  --region::
> > >  	Specify CXL region device name(s), or device id(s), to filter the listing.
> > >  
> > > +-a::
> > > +--media-errors::
> > > +	Include media-error information. The poison list is retrieved
> > > +	from the device(s) and media error records are added to the
> > > +	listing. This option applies to memdevs and regions where
> > > +	devices support the poison list capability.  
> > 
> > I'm not sure media errors is a good name.  The poison doesn't have to originate
> > in the device.  Given we are logging poison with "external" as the source
> > those definitely don't come from the device and may have nothing to do
> > with 'media' as such.
> > 
> > Why not just call it poison?
> >   
> --media-errors probably originated from ndctl tool which used
> that same option name, but it fits in with the CXL Spec language.
> 
> The CXL Spec calls the records returned from the 'Get Poison List'
> command Media Error Records. It refers to poison as media errors.
> So, here, in a command that lists things - the thing(s) being listed
> is(are) 'media error record(s)'. 
> 
> I see what you're saying about 'External' source. Does that mean
> an 'External' source caused an actual media error?

Hmm. I suspect this all evolved.  An External source need not have
anything to do with media (could be corruption in some random cache
or on interconnect or even that a link collapsed potentially).

Ah well, I'm fine with any naming you prefer.  No idea if the NVDIMM
equivalent has a the same issue with externally generated poison.

> 
> So, that 'Why not poison?' answer. I'm easily swayed either way.
> Would you suggest:
> > > +
> > > +----
> > > +# cxl list -m mem11 --media-errors  
> 
> cxl list -m mem1 --poison
> 
> > > +    "media_errors":{
> > > +      "nr_media_errors":1,
> > > +      "media_error_records":[  
> 
> and rename the fields above:
> 	"poison_errors"
> 	"nr_poison_errors"
> 	"poison_error_records"
> 
> 
That works for me, but if it's going to confuse people familiar with
other similar cases, then I don't mind the original naming that much.

Jonathan
diff mbox series

Patch

diff --git a/Documentation/cxl/cxl-list.txt b/Documentation/cxl/cxl-list.txt
index 14a2b4bb5c2a..24a0cf97cef2 100644
--- a/Documentation/cxl/cxl-list.txt
+++ b/Documentation/cxl/cxl-list.txt
@@ -344,6 +344,70 @@  OPTIONS
 --region::
 	Specify CXL region device name(s), or device id(s), to filter the listing.
 
+-a::
+--media-errors::
+	Include media-error information. The poison list is retrieved
+	from the device(s) and media error records are added to the
+	listing. This option applies to memdevs and regions where
+	devices support the poison list capability.
+
+----
+# cxl list -m mem11 --media-errors
+[
+  {
+    "memdev":"mem11",
+    "pmem_size":268435456,
+    "ram_size":0,
+    "serial":0,
+    "host":"0000:37:00.0",
+    "media_errors":{
+      "nr_media_errors":1,
+      "media_error_records":[
+        {
+          "dpa":0,
+          "length":64,
+          "source":"Internal",
+          "flags":"",
+          "overflow_time":0
+        }
+      ]
+    }
+  }
+]
+# cxl list -r region5 --media-errors
+[
+  {
+    "region":"region5",
+    "resource":1035623989248,
+    "size":2147483648,
+    "interleave_ways":2,
+    "interleave_granularity":4096,
+    "decode_state":"commit",
+    "media_errors":{
+      "nr_media_errors":2,
+      "media_error_records":[
+        {
+          "memdev":"mem2",
+          "dpa":0,
+          "length":64,
+          "source":"Internal",
+          "flags":"",
+          "overflow_time":0
+        },
+        {
+          "memdev":"mem5",
+          "dpa":0,
+          "length":512,
+          "source":"Vendor",
+          "flags":"",
+          "overflow_time":0
+        }
+      ]
+    }
+  }
+]
+----
+
 -v::
 --verbose::
 	Increase verbosity of the output. This can be specified
diff --git a/cxl/filter.c b/cxl/filter.c
index 56c659965891..fe6c29148fb4 100644
--- a/cxl/filter.c
+++ b/cxl/filter.c
@@ -686,6 +686,8 @@  static unsigned long params_to_flags(struct cxl_filter_params *param)
 		flags |= UTIL_JSON_TARGETS;
 	if (param->partition)
 		flags |= UTIL_JSON_PARTITION;
+	if (param->media_errors)
+		flags |= UTIL_JSON_MEDIA_ERRORS;
 	return flags;
 }
 
diff --git a/cxl/filter.h b/cxl/filter.h
index 256df49c3d0c..a92295fe2511 100644
--- a/cxl/filter.h
+++ b/cxl/filter.h
@@ -26,6 +26,7 @@  struct cxl_filter_params {
 	bool human;
 	bool health;
 	bool partition;
+	bool media_errors;
 	int verbose;
 	struct log_ctx ctx;
 };
diff --git a/cxl/list.c b/cxl/list.c
index 8c48fbbaaec3..df2ae5a3fec0 100644
--- a/cxl/list.c
+++ b/cxl/list.c
@@ -52,6 +52,8 @@  static const struct option options[] = {
 		    "include memory device health information"),
 	OPT_BOOLEAN('I', "partition", &param.partition,
 		    "include memory device partition information"),
+	OPT_BOOLEAN('a', "media-errors", &param.media_errors,
+		    "include media error information "),
 	OPT_INCR('v', "verbose", &param.verbose,
 		 "increase output detail"),
 #ifdef ENABLE_DEBUG