diff mbox series

[ndctl,v11,1/7] libcxl: add interfaces for GET_POISON_LIST mailbox commands

Message ID c43e12c5bafca30d3194ebb11d9817b9a05eaad0.1710386468.git.alison.schofield@intel.com (mailing list archive)
State Superseded
Headers show
Series Support poison list retrieval | expand

Commit Message

Alison Schofield March 14, 2024, 4:05 a.m. UTC
From: Alison Schofield <alison.schofield@intel.com>

CXL devices maintain a list of locations that are poisoned or result
in poison if the addresses are accessed by the host.

Per the spec (CXL 3.1 8.2.9.9.4.1), the device returns the Poison
List as a set of  Media Error Records that include the source of the
error, the starting device physical address and length.

Trigger the retrieval of the poison list by writing to the memory
device sysfs attribute: trigger_poison_list. The CXL driver only
offers triggering per memdev, so the trigger by region interface
offered here is a convenience API that triggers a poison list
retrieval for each memdev contributing to a region.

int cxl_memdev_trigger_poison_list(struct cxl_memdev *memdev);
int cxl_region_trigger_poison_list(struct cxl_region *region);

The resulting poison records are logged as kernel trace events
named 'cxl_poison'.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
---
 cxl/lib/libcxl.c   | 47 ++++++++++++++++++++++++++++++++++++++++++++++
 cxl/lib/libcxl.sym |  2 ++
 cxl/libcxl.h       |  2 ++
 3 files changed, 51 insertions(+)

Comments

fan March 18, 2024, 5:51 p.m. UTC | #1
On Wed, Mar 13, 2024 at 09:05:17PM -0700, alison.schofield@intel.com wrote:
> From: Alison Schofield <alison.schofield@intel.com>
> 
> CXL devices maintain a list of locations that are poisoned or result
> in poison if the addresses are accessed by the host.
> 
> Per the spec (CXL 3.1 8.2.9.9.4.1), the device returns the Poison
> List as a set of  Media Error Records that include the source of the
> error, the starting device physical address and length.
> 
> Trigger the retrieval of the poison list by writing to the memory
> device sysfs attribute: trigger_poison_list. The CXL driver only
> offers triggering per memdev, so the trigger by region interface
> offered here is a convenience API that triggers a poison list
> retrieval for each memdev contributing to a region.
> 
> int cxl_memdev_trigger_poison_list(struct cxl_memdev *memdev);
> int cxl_region_trigger_poison_list(struct cxl_region *region);
> 
> The resulting poison records are logged as kernel trace events
> named 'cxl_poison'.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> ---
>  cxl/lib/libcxl.c   | 47 ++++++++++++++++++++++++++++++++++++++++++++++
>  cxl/lib/libcxl.sym |  2 ++
>  cxl/libcxl.h       |  2 ++
>  3 files changed, 51 insertions(+)
> 
> diff --git a/cxl/lib/libcxl.c b/cxl/lib/libcxl.c
> index ff27cdf7c44a..73db8f15c704 100644
> --- a/cxl/lib/libcxl.c
> +++ b/cxl/lib/libcxl.c
> @@ -1761,6 +1761,53 @@ CXL_EXPORT int cxl_memdev_disable_invalidate(struct cxl_memdev *memdev)
>  	return 0;
>  }
>  
> +CXL_EXPORT int cxl_memdev_trigger_poison_list(struct cxl_memdev *memdev)
> +{
> +	struct cxl_ctx *ctx = cxl_memdev_get_ctx(memdev);
> +	char *path = memdev->dev_buf;
> +	int len = memdev->buf_len, rc;
> +
> +	if (snprintf(path, len, "%s/trigger_poison_list",
> +		     memdev->dev_path) >= len) {
> +		err(ctx, "%s: buffer too small\n",
> +		    cxl_memdev_get_devname(memdev));
> +		return -ENXIO;
> +	}
> +	rc = sysfs_write_attr(ctx, path, "1\n");
> +	if (rc < 0) {
> +		fprintf(stderr,
> +			"%s: Failed write sysfs attr trigger_poison_list\n",
> +			cxl_memdev_get_devname(memdev));

Should we use err() instead of fprintf here? 

Fan

> +		return rc;
> +	}
> +	return 0;
> +}
> +
> +CXL_EXPORT int cxl_region_trigger_poison_list(struct cxl_region *region)
> +{
> +	struct cxl_memdev_mapping *mapping;
> +	int rc;
> +
> +	cxl_mapping_foreach(region, mapping) {
> +		struct cxl_decoder *decoder;
> +		struct cxl_memdev *memdev;
> +
> +		decoder = cxl_mapping_get_decoder(mapping);
> +		if (!decoder)
> +			continue;
> +
> +		memdev = cxl_decoder_get_memdev(decoder);
> +		if (!memdev)
> +			continue;
> +
> +		rc = cxl_memdev_trigger_poison_list(memdev);
> +		if (rc)
> +			return rc;
> +	}
> +
> +	return 0;
> +}
> +
>  CXL_EXPORT int cxl_memdev_enable(struct cxl_memdev *memdev)
>  {
>  	struct cxl_ctx *ctx = cxl_memdev_get_ctx(memdev);
> diff --git a/cxl/lib/libcxl.sym b/cxl/lib/libcxl.sym
> index de2cd84b2960..3f709c60db3d 100644
> --- a/cxl/lib/libcxl.sym
> +++ b/cxl/lib/libcxl.sym
> @@ -280,4 +280,6 @@ global:
>  	cxl_memdev_get_pmem_qos_class;
>  	cxl_memdev_get_ram_qos_class;
>  	cxl_region_qos_class_mismatch;
> +	cxl_memdev_trigger_poison_list;
> +	cxl_region_trigger_poison_list;
>  } LIBCXL_6;
> diff --git a/cxl/libcxl.h b/cxl/libcxl.h
> index a6af3fb04693..29165043ca3f 100644
> --- a/cxl/libcxl.h
> +++ b/cxl/libcxl.h
> @@ -467,6 +467,8 @@ enum cxl_setpartition_mode {
>  
>  int cxl_cmd_partition_set_mode(struct cxl_cmd *cmd,
>  		enum cxl_setpartition_mode mode);
> +int cxl_memdev_trigger_poison_list(struct cxl_memdev *memdev);
> +int cxl_region_trigger_poison_list(struct cxl_region *region);
>  
>  int cxl_cmd_alert_config_set_life_used_prog_warn_threshold(struct cxl_cmd *cmd,
>  							   int threshold);
> -- 
> 2.37.3
>
Alison Schofield March 18, 2024, 8:11 p.m. UTC | #2
On Mon, Mar 18, 2024 at 10:51:13AM -0700, fan wrote:
> On Wed, Mar 13, 2024 at 09:05:17PM -0700, alison.schofield@intel.com wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > CXL devices maintain a list of locations that are poisoned or result
> > in poison if the addresses are accessed by the host.
> > 
> > Per the spec (CXL 3.1 8.2.9.9.4.1), the device returns the Poison
> > List as a set of  Media Error Records that include the source of the
> > error, the starting device physical address and length.
> > 
> > Trigger the retrieval of the poison list by writing to the memory
> > device sysfs attribute: trigger_poison_list. The CXL driver only
> > offers triggering per memdev, so the trigger by region interface
> > offered here is a convenience API that triggers a poison list
> > retrieval for each memdev contributing to a region.
> > 
> > int cxl_memdev_trigger_poison_list(struct cxl_memdev *memdev);
> > int cxl_region_trigger_poison_list(struct cxl_region *region);
> > 
> > The resulting poison records are logged as kernel trace events
> > named 'cxl_poison'.
> > 
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> > ---
> >  cxl/lib/libcxl.c   | 47 ++++++++++++++++++++++++++++++++++++++++++++++
> >  cxl/lib/libcxl.sym |  2 ++
> >  cxl/libcxl.h       |  2 ++
> >  3 files changed, 51 insertions(+)
> > 
> > diff --git a/cxl/lib/libcxl.c b/cxl/lib/libcxl.c
> > index ff27cdf7c44a..73db8f15c704 100644
> > --- a/cxl/lib/libcxl.c
> > +++ b/cxl/lib/libcxl.c
> > @@ -1761,6 +1761,53 @@ CXL_EXPORT int cxl_memdev_disable_invalidate(struct cxl_memdev *memdev)
> >  	return 0;
> >  }
> >  
> > +CXL_EXPORT int cxl_memdev_trigger_poison_list(struct cxl_memdev *memdev)
> > +{
> > +	struct cxl_ctx *ctx = cxl_memdev_get_ctx(memdev);
> > +	char *path = memdev->dev_buf;
> > +	int len = memdev->buf_len, rc;
> > +
> > +	if (snprintf(path, len, "%s/trigger_poison_list",
> > +		     memdev->dev_path) >= len) {
> > +		err(ctx, "%s: buffer too small\n",
> > +		    cxl_memdev_get_devname(memdev));
> > +		return -ENXIO;
> > +	}
> > +	rc = sysfs_write_attr(ctx, path, "1\n");
> > +	if (rc < 0) {
> > +		fprintf(stderr,
> > +			"%s: Failed write sysfs attr trigger_poison_list\n",
> > +			cxl_memdev_get_devname(memdev));
> 
> Should we use err() instead of fprintf here? 

Thanks Fan,

How about this?

- use fprintf if access() fails, ie device doesn't support poison list,
- use err() for failure to actually read the poison list on a device with
  support

Alison


> 
> Fan
> 
> > +		return rc;
> > +	}
> > +	return 0;
> > +}
> > +
> > +CXL_EXPORT int cxl_region_trigger_poison_list(struct cxl_region *region)
> > +{
> > +	struct cxl_memdev_mapping *mapping;
> > +	int rc;
> > +
> > +	cxl_mapping_foreach(region, mapping) {
> > +		struct cxl_decoder *decoder;
> > +		struct cxl_memdev *memdev;
> > +
> > +		decoder = cxl_mapping_get_decoder(mapping);
> > +		if (!decoder)
> > +			continue;
> > +
> > +		memdev = cxl_decoder_get_memdev(decoder);
> > +		if (!memdev)
> > +			continue;
> > +
> > +		rc = cxl_memdev_trigger_poison_list(memdev);
> > +		if (rc)
> > +			return rc;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> >  CXL_EXPORT int cxl_memdev_enable(struct cxl_memdev *memdev)
> >  {
> >  	struct cxl_ctx *ctx = cxl_memdev_get_ctx(memdev);
> > diff --git a/cxl/lib/libcxl.sym b/cxl/lib/libcxl.sym
> > index de2cd84b2960..3f709c60db3d 100644
> > --- a/cxl/lib/libcxl.sym
> > +++ b/cxl/lib/libcxl.sym
> > @@ -280,4 +280,6 @@ global:
> >  	cxl_memdev_get_pmem_qos_class;
> >  	cxl_memdev_get_ram_qos_class;
> >  	cxl_region_qos_class_mismatch;
> > +	cxl_memdev_trigger_poison_list;
> > +	cxl_region_trigger_poison_list;
> >  } LIBCXL_6;
> > diff --git a/cxl/libcxl.h b/cxl/libcxl.h
> > index a6af3fb04693..29165043ca3f 100644
> > --- a/cxl/libcxl.h
> > +++ b/cxl/libcxl.h
> > @@ -467,6 +467,8 @@ enum cxl_setpartition_mode {
> >  
> >  int cxl_cmd_partition_set_mode(struct cxl_cmd *cmd,
> >  		enum cxl_setpartition_mode mode);
> > +int cxl_memdev_trigger_poison_list(struct cxl_memdev *memdev);
> > +int cxl_region_trigger_poison_list(struct cxl_region *region);
> >  
> >  int cxl_cmd_alert_config_set_life_used_prog_warn_threshold(struct cxl_cmd *cmd,
> >  							   int threshold);
> > -- 
> > 2.37.3
> >
Dan Williams March 18, 2024, 9:01 p.m. UTC | #3
Alison Schofield wrote:
> On Mon, Mar 18, 2024 at 10:51:13AM -0700, fan wrote:
> > On Wed, Mar 13, 2024 at 09:05:17PM -0700, alison.schofield@intel.com wrote:
> > > From: Alison Schofield <alison.schofield@intel.com>
> > > 
> > > CXL devices maintain a list of locations that are poisoned or result
> > > in poison if the addresses are accessed by the host.
> > > 
> > > Per the spec (CXL 3.1 8.2.9.9.4.1), the device returns the Poison
> > > List as a set of  Media Error Records that include the source of the
> > > error, the starting device physical address and length.
> > > 
> > > Trigger the retrieval of the poison list by writing to the memory
> > > device sysfs attribute: trigger_poison_list. The CXL driver only
> > > offers triggering per memdev, so the trigger by region interface
> > > offered here is a convenience API that triggers a poison list
> > > retrieval for each memdev contributing to a region.
> > > 
> > > int cxl_memdev_trigger_poison_list(struct cxl_memdev *memdev);
> > > int cxl_region_trigger_poison_list(struct cxl_region *region);
> > > 
> > > The resulting poison records are logged as kernel trace events
> > > named 'cxl_poison'.
> > > 
> > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > > Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> > > ---
> > >  cxl/lib/libcxl.c   | 47 ++++++++++++++++++++++++++++++++++++++++++++++
> > >  cxl/lib/libcxl.sym |  2 ++
> > >  cxl/libcxl.h       |  2 ++
> > >  3 files changed, 51 insertions(+)
> > > 
> > > diff --git a/cxl/lib/libcxl.c b/cxl/lib/libcxl.c
> > > index ff27cdf7c44a..73db8f15c704 100644
> > > --- a/cxl/lib/libcxl.c
> > > +++ b/cxl/lib/libcxl.c
> > > @@ -1761,6 +1761,53 @@ CXL_EXPORT int cxl_memdev_disable_invalidate(struct cxl_memdev *memdev)
> > >  	return 0;
> > >  }
> > >  
> > > +CXL_EXPORT int cxl_memdev_trigger_poison_list(struct cxl_memdev *memdev)
> > > +{
> > > +	struct cxl_ctx *ctx = cxl_memdev_get_ctx(memdev);
> > > +	char *path = memdev->dev_buf;
> > > +	int len = memdev->buf_len, rc;
> > > +
> > > +	if (snprintf(path, len, "%s/trigger_poison_list",
> > > +		     memdev->dev_path) >= len) {
> > > +		err(ctx, "%s: buffer too small\n",
> > > +		    cxl_memdev_get_devname(memdev));
> > > +		return -ENXIO;
> > > +	}
> > > +	rc = sysfs_write_attr(ctx, path, "1\n");
> > > +	if (rc < 0) {
> > > +		fprintf(stderr,
> > > +			"%s: Failed write sysfs attr trigger_poison_list\n",
> > > +			cxl_memdev_get_devname(memdev));
> > 
> > Should we use err() instead of fprintf here? 
> 
> Thanks Fan,
> 
> How about this?
> 
> - use fprintf if access() fails, ie device doesn't support poison list,
> - use err() for failure to actually read the poison list on a device with
>   support

Why? There is no raw usage of fprintf in any of the libraries (ndctl,
daxctl, cxl) to date. If someone builds the library without logging then
it should not chat on stderr at all, and if someone redirects logging to
syslog then it also should emit messages only there and not stderr.
Alison Schofield March 19, 2024, 4:43 p.m. UTC | #4
On Mon, Mar 18, 2024 at 02:01:38PM -0700, Dan Williams wrote:
> Alison Schofield wrote:
> > On Mon, Mar 18, 2024 at 10:51:13AM -0700, fan wrote:
> > > On Wed, Mar 13, 2024 at 09:05:17PM -0700, alison.schofield@intel.com wrote:
> > > > From: Alison Schofield <alison.schofield@intel.com>
> > > > 
> > > > CXL devices maintain a list of locations that are poisoned or result
> > > > in poison if the addresses are accessed by the host.
> > > > 
> > > > Per the spec (CXL 3.1 8.2.9.9.4.1), the device returns the Poison
> > > > List as a set of  Media Error Records that include the source of the
> > > > error, the starting device physical address and length.
> > > > 
> > > > Trigger the retrieval of the poison list by writing to the memory
> > > > device sysfs attribute: trigger_poison_list. The CXL driver only
> > > > offers triggering per memdev, so the trigger by region interface
> > > > offered here is a convenience API that triggers a poison list
> > > > retrieval for each memdev contributing to a region.
> > > > 
> > > > int cxl_memdev_trigger_poison_list(struct cxl_memdev *memdev);
> > > > int cxl_region_trigger_poison_list(struct cxl_region *region);
> > > > 
> > > > The resulting poison records are logged as kernel trace events
> > > > named 'cxl_poison'.
> > > > 
> > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > > > Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> > > > ---
> > > >  cxl/lib/libcxl.c   | 47 ++++++++++++++++++++++++++++++++++++++++++++++
> > > >  cxl/lib/libcxl.sym |  2 ++
> > > >  cxl/libcxl.h       |  2 ++
> > > >  3 files changed, 51 insertions(+)
> > > > 
> > > > diff --git a/cxl/lib/libcxl.c b/cxl/lib/libcxl.c
> > > > index ff27cdf7c44a..73db8f15c704 100644
> > > > --- a/cxl/lib/libcxl.c
> > > > +++ b/cxl/lib/libcxl.c
> > > > @@ -1761,6 +1761,53 @@ CXL_EXPORT int cxl_memdev_disable_invalidate(struct cxl_memdev *memdev)
> > > >  	return 0;
> > > >  }
> > > >  
> > > > +CXL_EXPORT int cxl_memdev_trigger_poison_list(struct cxl_memdev *memdev)
> > > > +{
> > > > +	struct cxl_ctx *ctx = cxl_memdev_get_ctx(memdev);
> > > > +	char *path = memdev->dev_buf;
> > > > +	int len = memdev->buf_len, rc;
> > > > +
> > > > +	if (snprintf(path, len, "%s/trigger_poison_list",
> > > > +		     memdev->dev_path) >= len) {
> > > > +		err(ctx, "%s: buffer too small\n",
> > > > +		    cxl_memdev_get_devname(memdev));
> > > > +		return -ENXIO;
> > > > +	}
> > > > +	rc = sysfs_write_attr(ctx, path, "1\n");
> > > > +	if (rc < 0) {
> > > > +		fprintf(stderr,
> > > > +			"%s: Failed write sysfs attr trigger_poison_list\n",
> > > > +			cxl_memdev_get_devname(memdev));
> > > 
> > > Should we use err() instead of fprintf here? 
> > 
> > Thanks Fan,
> > 
> > How about this?
> > 
> > - use fprintf if access() fails, ie device doesn't support poison list,
> > - use err() for failure to actually read the poison list on a device with
> >   support
> 
> Why? There is no raw usage of fprintf in any of the libraries (ndctl,
> daxctl, cxl) to date. If someone builds the library without logging then
> it should not chat on stderr at all, and if someone redirects logging to
> syslog then it also should emit messages only there and not stderr.

Why indeed :(

I'll remove the fprintf() and only use err() for both cases: device
doesn't support feature, or failure to read list.
diff mbox series

Patch

diff --git a/cxl/lib/libcxl.c b/cxl/lib/libcxl.c
index ff27cdf7c44a..73db8f15c704 100644
--- a/cxl/lib/libcxl.c
+++ b/cxl/lib/libcxl.c
@@ -1761,6 +1761,53 @@  CXL_EXPORT int cxl_memdev_disable_invalidate(struct cxl_memdev *memdev)
 	return 0;
 }
 
+CXL_EXPORT int cxl_memdev_trigger_poison_list(struct cxl_memdev *memdev)
+{
+	struct cxl_ctx *ctx = cxl_memdev_get_ctx(memdev);
+	char *path = memdev->dev_buf;
+	int len = memdev->buf_len, rc;
+
+	if (snprintf(path, len, "%s/trigger_poison_list",
+		     memdev->dev_path) >= len) {
+		err(ctx, "%s: buffer too small\n",
+		    cxl_memdev_get_devname(memdev));
+		return -ENXIO;
+	}
+	rc = sysfs_write_attr(ctx, path, "1\n");
+	if (rc < 0) {
+		fprintf(stderr,
+			"%s: Failed write sysfs attr trigger_poison_list\n",
+			cxl_memdev_get_devname(memdev));
+		return rc;
+	}
+	return 0;
+}
+
+CXL_EXPORT int cxl_region_trigger_poison_list(struct cxl_region *region)
+{
+	struct cxl_memdev_mapping *mapping;
+	int rc;
+
+	cxl_mapping_foreach(region, mapping) {
+		struct cxl_decoder *decoder;
+		struct cxl_memdev *memdev;
+
+		decoder = cxl_mapping_get_decoder(mapping);
+		if (!decoder)
+			continue;
+
+		memdev = cxl_decoder_get_memdev(decoder);
+		if (!memdev)
+			continue;
+
+		rc = cxl_memdev_trigger_poison_list(memdev);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
 CXL_EXPORT int cxl_memdev_enable(struct cxl_memdev *memdev)
 {
 	struct cxl_ctx *ctx = cxl_memdev_get_ctx(memdev);
diff --git a/cxl/lib/libcxl.sym b/cxl/lib/libcxl.sym
index de2cd84b2960..3f709c60db3d 100644
--- a/cxl/lib/libcxl.sym
+++ b/cxl/lib/libcxl.sym
@@ -280,4 +280,6 @@  global:
 	cxl_memdev_get_pmem_qos_class;
 	cxl_memdev_get_ram_qos_class;
 	cxl_region_qos_class_mismatch;
+	cxl_memdev_trigger_poison_list;
+	cxl_region_trigger_poison_list;
 } LIBCXL_6;
diff --git a/cxl/libcxl.h b/cxl/libcxl.h
index a6af3fb04693..29165043ca3f 100644
--- a/cxl/libcxl.h
+++ b/cxl/libcxl.h
@@ -467,6 +467,8 @@  enum cxl_setpartition_mode {
 
 int cxl_cmd_partition_set_mode(struct cxl_cmd *cmd,
 		enum cxl_setpartition_mode mode);
+int cxl_memdev_trigger_poison_list(struct cxl_memdev *memdev);
+int cxl_region_trigger_poison_list(struct cxl_region *region);
 
 int cxl_cmd_alert_config_set_life_used_prog_warn_threshold(struct cxl_cmd *cmd,
 							   int threshold);