diff mbox series

[ndctl,v5,3/5] cxl/list: collect and parse the poison list records

Message ID bf65d11d6388bcdce2e6dc35064edf4094c0c5a8.1700615159.git.alison.schofield@intel.com
State Superseded
Commit 84101f4787f8b5d4d76af266d57ae0d07e2fcf86
Headers show
Series Support poison list retrieval | expand

Commit Message

Alison Schofield Nov. 22, 2023, 1:22 a.m. UTC
From: Alison Schofield <alison.schofield@intel.com>

Poison list records are logged as events in the kernel tracing
subsystem. To prepare the poison list for cxl list, enable tracing,
trigger the poison list read, and parse the generated cxl_poison
events into a json representation.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
---
 cxl/json.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 211 insertions(+)

Comments

Dan Williams Dec. 7, 2023, 4:39 a.m. UTC | #1
alison.schofield@ wrote:
> From: Alison Schofield <alison.schofield@intel.com>
> 
> Poison list records are logged as events in the kernel tracing
> subsystem. To prepare the poison list for cxl list, enable tracing,
> trigger the poison list read, and parse the generated cxl_poison
> events into a json representation.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
>  cxl/json.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 211 insertions(+)
> 
> diff --git a/cxl/json.c b/cxl/json.c
> index 7678d02020b6..6fb17582a1cb 100644
> --- a/cxl/json.c
> +++ b/cxl/json.c
> @@ -2,15 +2,19 @@
>  // Copyright (C) 2015-2021 Intel Corporation. All rights reserved.
>  #include <limits.h>
>  #include <util/json.h>
> +#include <util/bitmap.h>
>  #include <uuid/uuid.h>
>  #include <cxl/libcxl.h>
>  #include <json-c/json.h>
>  #include <json-c/printbuf.h>
>  #include <ccan/short_types/short_types.h>
> +#include <traceevent/event-parse.h>
> +#include <tracefs/tracefs.h>
>  
>  #include "filter.h"
>  #include "json.h"
>  #include "../daxctl/json.h"
> +#include "event_trace.h"
>  
>  #define CXL_FW_VERSION_STR_LEN	16
>  #define CXL_FW_MAX_SLOTS	4
> @@ -571,6 +575,201 @@ err_jobj:
>  	return NULL;
>  }
>  
> +/* CXL Spec 3.1 Table 8-140 Media Error Record */
> +#define CXL_POISON_SOURCE_UNKNOWN 0
> +#define CXL_POISON_SOURCE_EXTERNAL 1
> +#define CXL_POISON_SOURCE_INTERNAL 2
> +#define CXL_POISON_SOURCE_INJECTED 3
> +#define CXL_POISON_SOURCE_VENDOR 7
> +
> +/* CXL Spec 3.1 Table 8-139 Get Poison List Output Payload */
> +#define CXL_POISON_FLAG_MORE BIT(0)
> +#define CXL_POISON_FLAG_OVERFLOW BIT(1)
> +#define CXL_POISON_FLAG_SCANNING BIT(2)
> +
> +static struct json_object *
> +util_cxl_poison_events_to_json(struct tracefs_instance *inst,
> +			       const char *region_name, unsigned long flags)
> +{
> +	struct json_object *jerrors, *jpoison, *jobj = NULL;
> +	struct jlist_node *jnode, *next;
> +	struct event_ctx ectx = {
> +		.event_name = "cxl_poison",
> +		.event_pid = getpid(),
> +		.system = "cxl",
> +	};
> +	int rc, count = 0;
> +
> +	list_head_init(&ectx.jlist_head);
> +	rc = cxl_parse_events(inst, &ectx);

This pattern really feels like it wants a cxl_for_each_event() -style
helper rather than require the end caller to open code list usage.
Basically cxl_parse_events() is a helper that should stay local to
cxl/monitor.c. This new cxl_for_each_event() would become used
internally by cxl_parse_events() and let
util_cxl_poison_events_to_json() do its own per-event iteration.

> +	if (rc < 0) {
> +		fprintf(stderr, "Failed to parse events: %d\n", rc);
> +		return NULL;
> +	}
> +	/* Add nr_records:0 to json */
> +	if (list_empty(&ectx.jlist_head))
> +		goto out;
> +
> +	jerrors = json_object_new_array();
> +	if (!jerrors)
> +		return NULL;
> +
> +	list_for_each_safe(&ectx.jlist_head, jnode, next, list) {
> +		struct json_object *jp, *jval;
> +		int source, pflags = 0;
> +		u64 addr, len;
> +
> +		jp = json_object_new_object();
> +		if (!jp)
> +			return NULL;
> +
> +		/* Skip records not in this region when listing by region */
> +		if (json_object_object_get_ex(jnode->jobj, "region", &jval)) {
> +			const char *name;

So we're building a json_object internal to cxl_parse_events() only to
turn around and extract details out of that object that tell us this
event was not of interest, or to create yet another json object?

I think this implementation has a chance to be significantly less
complicated if the event list can be iterated directly without this
temporary json_object parsing.
Alison Schofield Dec. 13, 2023, 2:13 a.m. UTC | #2
On Wed, Dec 06, 2023 at 08:39:07PM -0800, Dan Williams wrote:
> alison.schofield@ wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> > 

snip

> > +	struct json_object *jerrors, *jpoison, *jobj = NULL;
> > +	struct jlist_node *jnode, *next;
> > +	struct event_ctx ectx = {
> > +		.event_name = "cxl_poison",
> > +		.event_pid = getpid(),
> > +		.system = "cxl",
> > +	};
> > +	int rc, count = 0;
> > +
> > +	list_head_init(&ectx.jlist_head);
> > +	rc = cxl_parse_events(inst, &ectx);
> 
> This pattern really feels like it wants a cxl_for_each_event() -style
> helper rather than require the end caller to open code list usage.
> Basically cxl_parse_events() is a helper that should stay local to
> cxl/monitor.c. This new cxl_for_each_event() would become used
> internally by cxl_parse_events() and let
> util_cxl_poison_events_to_json() do its own per-event iteration.
> 

snip & concat'ing your next comment:

> So we're building a json_object internal to cxl_parse_events() only to
> turn around and extract details out of that object that tell us this
> event was not of interest, or to create yet another json object?
> 
> I think this implementation has a chance to be significantly less
> complicated if the event list can be iterated directly without this
> temporary json_object parsing.

DaveJ actually already implemented a method to include a 'private'
parsing function in the event_ctx structure. I didn't use it, but
rather used the generic cxl_event_to_json helper, and then
parsed that all over again to refine for poison list output.

I think reorganizing with a private event_ctx->parse_event will
streamline as you suggest.

Alison
diff mbox series

Patch

diff --git a/cxl/json.c b/cxl/json.c
index 7678d02020b6..6fb17582a1cb 100644
--- a/cxl/json.c
+++ b/cxl/json.c
@@ -2,15 +2,19 @@ 
 // Copyright (C) 2015-2021 Intel Corporation. All rights reserved.
 #include <limits.h>
 #include <util/json.h>
+#include <util/bitmap.h>
 #include <uuid/uuid.h>
 #include <cxl/libcxl.h>
 #include <json-c/json.h>
 #include <json-c/printbuf.h>
 #include <ccan/short_types/short_types.h>
+#include <traceevent/event-parse.h>
+#include <tracefs/tracefs.h>
 
 #include "filter.h"
 #include "json.h"
 #include "../daxctl/json.h"
+#include "event_trace.h"
 
 #define CXL_FW_VERSION_STR_LEN	16
 #define CXL_FW_MAX_SLOTS	4
@@ -571,6 +575,201 @@  err_jobj:
 	return NULL;
 }
 
+/* CXL Spec 3.1 Table 8-140 Media Error Record */
+#define CXL_POISON_SOURCE_UNKNOWN 0
+#define CXL_POISON_SOURCE_EXTERNAL 1
+#define CXL_POISON_SOURCE_INTERNAL 2
+#define CXL_POISON_SOURCE_INJECTED 3
+#define CXL_POISON_SOURCE_VENDOR 7
+
+/* CXL Spec 3.1 Table 8-139 Get Poison List Output Payload */
+#define CXL_POISON_FLAG_MORE BIT(0)
+#define CXL_POISON_FLAG_OVERFLOW BIT(1)
+#define CXL_POISON_FLAG_SCANNING BIT(2)
+
+static struct json_object *
+util_cxl_poison_events_to_json(struct tracefs_instance *inst,
+			       const char *region_name, unsigned long flags)
+{
+	struct json_object *jerrors, *jpoison, *jobj = NULL;
+	struct jlist_node *jnode, *next;
+	struct event_ctx ectx = {
+		.event_name = "cxl_poison",
+		.event_pid = getpid(),
+		.system = "cxl",
+	};
+	int rc, count = 0;
+
+	list_head_init(&ectx.jlist_head);
+	rc = cxl_parse_events(inst, &ectx);
+	if (rc < 0) {
+		fprintf(stderr, "Failed to parse events: %d\n", rc);
+		return NULL;
+	}
+	/* Add nr_records:0 to json */
+	if (list_empty(&ectx.jlist_head))
+		goto out;
+
+	jerrors = json_object_new_array();
+	if (!jerrors)
+		return NULL;
+
+	list_for_each_safe(&ectx.jlist_head, jnode, next, list) {
+		struct json_object *jp, *jval;
+		int source, pflags = 0;
+		u64 addr, len;
+
+		jp = json_object_new_object();
+		if (!jp)
+			return NULL;
+
+		/* Skip records not in this region when listing by region */
+		if (json_object_object_get_ex(jnode->jobj, "region", &jval)) {
+			const char *name;
+
+			name = json_object_get_string(jval);
+			if ((region_name) && (strcmp(region_name, name) != 0))
+				continue;
+
+			if (strlen(name))
+				json_object_object_add(jp, "region", jval);
+		}
+
+		/* Memdev name is only needed when listing by region */
+		if (region_name) {
+			if (json_object_object_get_ex(jnode->jobj, "memdev",
+						      &jval))
+				json_object_object_add(jp, "memdev", jval);
+		}
+
+		/*
+		 * When listing by memdev, region names and valid HPAs
+		 * will appear if the poisoned address is part of a region.
+		 * Pick up those valid region names and HPAs and ignore
+		 * any empties and invalids.
+		 */
+
+		if (json_object_object_get_ex(jnode->jobj, "hpa", &jval)) {
+			addr = json_object_get_uint64(jval);
+			if (addr != ULLONG_MAX) {
+				jobj = util_json_object_hex(addr, flags);
+				json_object_object_add(jp, "hpa", jobj);
+			}
+		}
+		if (json_object_object_get_ex(jnode->jobj, "dpa", &jval)) {
+			addr = json_object_get_int64(jval);
+			jobj = util_json_object_hex(addr, flags);
+			json_object_object_add(jp, "dpa", jobj);
+		}
+		if (json_object_object_get_ex(jnode->jobj, "dpa_length", &jval)) {
+			len = json_object_get_int64(jval);
+			jobj = util_json_object_size(len, flags);
+			json_object_object_add(jp, "dpa_length", jobj);
+		}
+		if (json_object_object_get_ex(jnode->jobj, "source", &jval)) {
+			source = json_object_get_int(jval);
+			switch (source) {
+			case CXL_POISON_SOURCE_UNKNOWN:
+				jobj = json_object_new_string("Unknown");
+				break;
+			case CXL_POISON_SOURCE_EXTERNAL:
+				jobj = json_object_new_string("External");
+				break;
+			case CXL_POISON_SOURCE_INTERNAL:
+				jobj = json_object_new_string("Internal");
+				break;
+			case CXL_POISON_SOURCE_INJECTED:
+				jobj = json_object_new_string("Injected");
+				break;
+			case CXL_POISON_SOURCE_VENDOR:
+				jobj = json_object_new_string("Vendor");
+				break;
+			default:
+				jobj = json_object_new_string("Reserved");
+			}
+			json_object_object_add(jp, "source", jobj);
+		}
+		if (json_object_object_get_ex(jnode->jobj, "flags", &jval))
+			pflags = json_object_get_int(jval);
+
+		if (pflags) {
+			char flag_str[32] = { '\0' };
+
+			if (pflags & CXL_POISON_FLAG_MORE)
+				strcat(flag_str, "More,");
+			if (pflags & CXL_POISON_FLAG_OVERFLOW)
+				strcat(flag_str, "Overflow,");
+			if (pflags & CXL_POISON_FLAG_SCANNING)
+				strcat(flag_str, "Scanning,");
+			jobj = json_object_new_string(flag_str);
+			if (jobj)
+				json_object_object_add(jp, "flags", jobj);
+		}
+		if (json_object_object_get_ex(jnode->jobj, "overflow_t", &jval))
+			json_object_object_add(jp, "overflow_time", jval);
+
+		json_object_array_add(jerrors, jp);
+		count++;
+	} /* list_for_each_safe */
+
+out:
+	jpoison = json_object_new_object();
+	if (!jpoison)
+		return NULL;
+
+	/* Always include the count. If count is zero, no records follow. */
+	jobj = json_object_new_int(count);
+	if (jobj)
+		json_object_object_add(jpoison, "nr_records", jobj);
+	if (count)
+		json_object_object_add(jpoison, "records", jerrors);
+
+	return jpoison;
+}
+
+static struct json_object *
+util_cxl_poison_list_to_json(struct cxl_region *region,
+			     struct cxl_memdev *memdev,
+			     unsigned long flags)
+{
+	struct json_object *jpoison = NULL;
+	struct tracefs_instance *inst;
+	const char *region_name;
+	int rc;
+
+	inst = tracefs_instance_create("cxl list");
+	if (!inst) {
+		fprintf(stderr, "tracefs_instance_create() failed\n");
+		return NULL;
+	}
+
+	rc = cxl_event_tracing_enable(inst, "cxl", "cxl_poison");
+	if (rc < 0) {
+		fprintf(stderr, "Failed to enable trace: %d\n", rc);
+		goto err_free;
+	}
+
+	if (region)
+		rc = cxl_region_trigger_poison_list(region);
+	else
+		rc = cxl_memdev_trigger_poison_list(memdev);
+	if (rc)
+		goto err_free;
+
+	rc = cxl_event_tracing_disable(inst);
+	if (rc < 0) {
+		fprintf(stderr, "Failed to disable trace: %d\n", rc);
+		goto err_free;
+	}
+
+	region_name = region ? cxl_region_get_devname(region) : NULL;
+	jpoison = util_cxl_poison_events_to_json(inst, region_name, flags);
+
+err_free:
+	tracefs_instance_free(inst);
+	return jpoison;
+}
+
 struct json_object *util_cxl_memdev_to_json(struct cxl_memdev *memdev,
 		unsigned long flags)
 {
@@ -649,6 +848,12 @@  struct json_object *util_cxl_memdev_to_json(struct cxl_memdev *memdev,
 			json_object_object_add(jdev, "firmware", jobj);
 	}
 
+	if (flags & UTIL_JSON_MEDIA_ERRORS) {
+		jobj = util_cxl_poison_list_to_json(NULL, memdev, flags);
+		if (jobj)
+			json_object_object_add(jdev, "poison", jobj);
+	}
+
 	json_object_set_userdata(jdev, memdev, NULL);
 	return jdev;
 }
@@ -987,6 +1192,12 @@  struct json_object *util_cxl_region_to_json(struct cxl_region *region,
 			json_object_object_add(jregion, "state", jobj);
 	}
 
+	if (flags & UTIL_JSON_MEDIA_ERRORS) {
+		jobj = util_cxl_poison_list_to_json(region, NULL, flags);
+		if (jobj)
+			json_object_object_add(jregion, "poison", jobj);
+	}
+
 	util_cxl_mappings_append_json(jregion, region, flags);
 
 	if (flags & UTIL_JSON_DAX) {