Message ID | 80f60f513ced74bc5eed7d0082faf74783fa41d7.1696196382.git.alison.schofield@intel.com |
---|---|
State | Superseded |
Headers | show |
Series | Support poison list retrieval | expand |
On Sun, 2023-10-01 at 15:31 -0700, alison.schofield@intel.com wrote: > From: Alison Schofield <alison.schofield@intel.com> > > Poison list records are logged as events in the kernel tracing > subsystem. To prepare the poison list for cxl list, enable tracing, > trigger the poison list read, and parse the generated cxl_poison > events into a json representation. > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > --- > cxl/json.c | 208 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > util/json.h | 1 + > 2 files changed, 209 insertions(+) > > diff --git a/cxl/json.c b/cxl/json.c > index 7678d02020b6..36db73de4f8f 100644 > --- a/cxl/json.c > +++ b/cxl/json.c > @@ -2,15 +2,19 @@ > // Copyright (C) 2015-2021 Intel Corporation. All rights reserved. > #include <limits.h> > #include <util/json.h> > +#include <util/bitmap.h> > #include <uuid/uuid.h> > #include <cxl/libcxl.h> > #include <json-c/json.h> > #include <json-c/printbuf.h> > #include <ccan/short_types/short_types.h> > +#include <traceevent/event-parse.h> > +#include <tracefs/tracefs.h> > > #include "filter.h" > #include "json.h" > #include "../daxctl/json.h" > +#include "event_trace.h" > > #define CXL_FW_VERSION_STR_LEN 16 > #define CXL_FW_MAX_SLOTS 4 > @@ -571,6 +575,190 @@ err_jobj: > return NULL; > } > > +/* CXL 8.2.9.5.4.1 Get Poison List: Poison Source */ These usually have a spec version too - "CXL 3.0 8.2.9... " > +#define CXL_POISON_SOURCE_UNKNOWN 0 > +#define CXL_POISON_SOURCE_EXTERNAL 1 > +#define CXL_POISON_SOURCE_INTERNAL 2 > +#define CXL_POISON_SOURCE_INJECTED 3 > +#define CXL_POISON_SOURCE_VENDOR 7 > + > +/* CXL 8.2.9.5.4.1 Get Poison List: Payload out flags */ Same thing here. > +#define CXL_POISON_FLAG_MORE BIT(0) > +#define CXL_POISON_FLAG_OVERFLOW BIT(1) > +#define CXL_POISON_FLAG_SCANNING BIT(2) > + > +static struct json_object * > +util_cxl_poison_events_to_json(struct tracefs_instance *inst, bool is_region, > + unsigned long flags) > +{ > + struct json_object *jerrors, *jmedia, *jobj = NULL; Since everything else is now 'poison', might be good to also s/jmedia/jpoison/ everywhere. > + struct jlist_node *jnode, *next; > + struct event_ctx ectx = { > + .event_name = "cxl_poison", > + .event_pid = getpid(), > + .system = "cxl", > + }; > + int rc, count = 0; > + > + list_head_init(&ectx.jlist_head); > + rc = cxl_parse_events(inst, &ectx); > + if (rc < 0) { > + fprintf(stderr, "Failed to parse events: %d\n", rc); > + return NULL; > + } > + /* Add nr_poison_records:0 to json */ > + if (list_empty(&ectx.jlist_head)) > + goto out; > + > + jerrors = json_object_new_array(); > + if (!jerrors) > + return NULL; > + > + list_for_each_safe(&ectx.jlist_head, jnode, next, list) { > + struct json_object *jval = NULL; > + struct json_object *jp = NULL; Are the NULL assignments needed? At least for @jp, it is unconditionally assigned below, and isn't used before that. I suspect json-c probably doesn't care about what's in @jval either before writing it. > + int source, pflags; > + u64 addr, len; > + > + jp = json_object_new_object(); > + if (!jp) > + return NULL; > + > + if (is_region) { > + /* Add the memdev name in a by region list */ > + if (json_object_object_get_ex(jnode->jobj, "memdev", > + &jval)) > + json_object_object_add(jp, "memdev", jval); > + } > + > + /* > + * When listing is by memdev, region names and valid HPAs > + * will appear if the poison address is part of a region. > + * Pick up those valid region names and HPAs but ignore the > + * empties and invalids. > + */ > + > + /* Only add non NULL region names */ > + if (json_object_object_get_ex(jnode->jobj, "region", &jval)) { > + if (strlen(json_object_get_string(jval)) != 0) > + json_object_object_add(jp, "region", jval); > + } > + /* Only display valid HPAs */ > + if (json_object_object_get_ex(jnode->jobj, "hpa", &jval)) { > + addr = json_object_get_uint64(jval); > + if (addr != ULLONG_MAX) { > + jobj = util_json_object_hex(addr, flags); > + json_object_object_add(jp, "hpa", jobj); > + } > + } > + if (json_object_object_get_ex(jnode->jobj, "dpa", &jval)) { > + addr = json_object_get_int64(jval); > + jobj = util_json_object_hex(addr, flags); > + json_object_object_add(jp, "dpa", jobj); > + } > + if (json_object_object_get_ex(jnode->jobj, "dpa_length", &jval)) { > + len = json_object_get_int64(jval); > + jobj = util_json_object_size(len, flags); > + json_object_object_add(jp, "dpa_length", jobj); > + } > + if (json_object_object_get_ex(jnode->jobj, "source", &jval)) { > + source = json_object_get_int(jval); > + if (source == CXL_POISON_SOURCE_UNKNOWN) > + jobj = json_object_new_string("Unknown"); > + else if (source == CXL_POISON_SOURCE_EXTERNAL) > + jobj = json_object_new_string("External"); > + else if (source == CXL_POISON_SOURCE_INTERNAL) > + jobj = json_object_new_string("Internal"); > + else if (source == CXL_POISON_SOURCE_INJECTED) > + jobj = json_object_new_string("Injected"); > + else if (source == CXL_POISON_SOURCE_VENDOR) > + jobj = json_object_new_string("Vendor"); > + else > + jobj = json_object_new_string("Reserved"); Minor nit, but maybe 'switch (source) ...' would look a bit cleaner? > + json_object_object_add(jp, "source", jobj); > + } > + if (json_object_object_get_ex(jnode->jobj, "flags", &jval)) { > + char flag_str[32] = { '\0' }; > + > + pflags = json_object_get_int(jval); > + if (pflags & CXL_POISON_FLAG_MORE) > + strcat(flag_str, "More,"); > + if (pflags & CXL_POISON_FLAG_OVERFLOW) > + strcat(flag_str, "Overflow,"); > + if (pflags & CXL_POISON_FLAG_SCANNING) > + strcat(flag_str, "Scanning,"); > + jobj = json_object_new_string(flag_str); > + if (jobj) > + json_object_object_add(jp, "flags", jobj); > + } > + if (json_object_object_get_ex(jnode->jobj, "overflow_t", &jval)) > + json_object_object_add(jp, "overflow_time", jval); > + > + json_object_array_add(jerrors, jp); > + count++; > + } /* list_for_each_safe */ > + > +out: > + jmedia = json_object_new_object(); > + if (!jmedia) > + return NULL; > + > + /* Always include the count. If count is zero, no records follow. */ > + jobj = json_object_new_int(count); > + if (jobj) > + json_object_object_add(jmedia, "nr_poison_records", jobj); > + if (count) > + json_object_object_add(jmedia, "poison_records", jerrors); Since these are already nested under a 'poison' JSON object, I'm tempted to say these can just be 'nr_records' and 'records' respectively. > + > + return jmedia; > +} > + > +struct cxl_poison_ctx { > + void *dev; > + bool is_region; > +}; This structure is a bit awkward - what do you think about creating different wrappers for the memdev and region case - util_cxl_memdev_poison_list_to_json(), and util_cxl_region_poison_list_to_json() that are called respectively by util_cxl_{memdev,region}_to_json(), and internally they can call: util_cxl_poison_list_to_json(NULL, memdev, flags), or util_cxl_poison_list_to_json(region, NULL, flags) For the next level down, i.e. poison_events_to_json, the @is_region bool passed in directly is fine as it doesn't need the memdev or region objects passed in via void *. > + > +static struct json_object * > +util_cxl_poison_list_to_json(struct cxl_poison_ctx *pctx, > + unsigned long flags) > +{ > + struct json_object *jmedia = NULL; > + struct tracefs_instance *inst; > + int rc; > + > + inst = tracefs_instance_create("cxl list"); > + if (!inst) { > + fprintf(stderr, "tracefs_instance_create() failed\n"); > + return NULL; > + } > + > + rc = cxl_event_tracing_enable(inst, "cxl", "cxl_poison"); > + if (rc < 0) { > + fprintf(stderr, "Failed to enable trace: %d\n", rc); > + goto err_free; > + } > + > + if (pctx->is_region) > + rc = cxl_region_trigger_poison_list(pctx->dev); > + else > + rc = cxl_memdev_trigger_poison_list(pctx->dev); > + if (rc) { > + fprintf(stderr, "Failed write of sysfs attribute: %d\n", rc); This would be incorrect if the memdev trigger reported an ENOMEM, and then this reported a sysfs write failure. It should at least be something like 'failed to trigger poison" - but since the memdev trigger helper has prints for every failure case, maybe this can just be omitted? > + goto err_free; > + } > + > + rc = cxl_event_tracing_disable(inst); > + if (rc < 0) { > + fprintf(stderr, "Failed to disable trace: %d\n", rc); > + goto err_free; > + } > + > + jmedia = util_cxl_poison_events_to_json(inst, pctx->is_region, flags); > +err_free: > + tracefs_instance_free(inst); > + return jmedia; > +} > + > struct json_object *util_cxl_memdev_to_json(struct cxl_memdev *memdev, > unsigned long flags) > { > @@ -649,6 +837,16 @@ struct json_object *util_cxl_memdev_to_json(struct cxl_memdev *memdev, > json_object_object_add(jdev, "firmware", jobj); > } > > + if (flags & UTIL_JSON_POISON_LIST) { > + struct cxl_poison_ctx pctx = { > + .dev = memdev, > + .is_region = false, > + }; > + jobj = util_cxl_poison_list_to_json(&pctx, flags); > + if (jobj) > + json_object_object_add(jdev, "poison", jobj); > + } > + > json_object_set_userdata(jdev, memdev, NULL); > return jdev; > } > @@ -987,6 +1185,16 @@ struct json_object *util_cxl_region_to_json(struct cxl_region *region, > json_object_object_add(jregion, "state", jobj); > } > > + if (flags & UTIL_JSON_POISON_LIST) { > + struct cxl_poison_ctx pectx = { > + .dev = region, > + .is_region = true, > + }; > + jobj = util_cxl_poison_list_to_json(&pectx, flags); > + if (jobj) > + json_object_object_add(jregion, "poison", jobj); > + } > + > util_cxl_mappings_append_json(jregion, region, flags); > > if (flags & UTIL_JSON_DAX) { > diff --git a/util/json.h b/util/json.h > index ea370df4d1b7..3ae4074a95c3 100644 > --- a/util/json.h > +++ b/util/json.h > @@ -21,6 +21,7 @@ enum util_json_flags { > UTIL_JSON_TARGETS = (1 << 11), > UTIL_JSON_PARTITION = (1 << 12), > UTIL_JSON_ALERT_CONFIG = (1 << 13), > + UTIL_JSON_POISON_LIST = (1 << 14), There's already a UTIL_JSON_MEDIA_ERRORS, can we just reuse that (in spite of the name :)) > }; > > void util_display_json_array(FILE *f_out, struct json_object *jarray,
On Wed, Nov 15, 2023 at 02:09:15AM -0800, Vishal Verma wrote: > On Sun, 2023-10-01 at 15:31 -0700, alison.schofield@intel.com wrote: snip > > > > +/* CXL 8.2.9.5.4.1 Get Poison List: Poison Source */ > > These usually have a spec version too - "CXL 3.0 8.2.9... " Got it. Fixed up the two like this and also a commit log reference. Moved them all to the now released CXL 3.1 spec. > > +static struct json_object * > > +util_cxl_poison_events_to_json(struct tracefs_instance *inst, bool is_region, > > + unsigned long flags) > > +{ > > + struct json_object *jerrors, *jmedia, *jobj = NULL; > > Since everything else is now 'poison', might be good to also > s/jmedia/jpoison/ everywhere. Done. > > + > > + list_for_each_safe(&ectx.jlist_head, jnode, next, list) { > > + struct json_object *jval = NULL; > > + struct json_object *jp = NULL; > > Are the NULL assignments needed? At least for @jp, it is > unconditionally assigned below, and isn't used before that. I suspect > json-c probably doesn't care about what's in @jval either before > writing it. > jp init, obvious not init needed. For jval init, I guess I worried about garbage in jval, but the way the trace event is built, it's not a concern. Removed both those inits. snip > > + if (json_object_object_get_ex(jnode->jobj, "source", &jval)) { > > + source = json_object_get_int(jval); > > + if (source == CXL_POISON_SOURCE_UNKNOWN) > > + jobj = json_object_new_string("Unknown"); > > + else if (source == CXL_POISON_SOURCE_EXTERNAL) > > + jobj = json_object_new_string("External"); > > + else if (source == CXL_POISON_SOURCE_INTERNAL) > > + jobj = json_object_new_string("Internal"); > > + else if (source == CXL_POISON_SOURCE_INJECTED) > > + jobj = json_object_new_string("Injected"); > > + else if (source == CXL_POISON_SOURCE_VENDOR) > > + jobj = json_object_new_string("Vendor"); > > + else > > + jobj = json_object_new_string("Reserved"); > > Minor nit, but maybe 'switch (source) ...' would look a bit cleaner? Done. > > > + > > + /* Always include the count. If count is zero, no records follow. */ > > + jobj = json_object_new_int(count); > > + if (jobj) > > + json_object_object_add(jmedia, "nr_poison_records", jobj); > > + if (count) > > + json_object_object_add(jmedia, "poison_records", jerrors); > > Since these are already nested under a 'poison' JSON object, I'm > tempted to say these can just be 'nr_records' and 'records' > respectively. > Done. > > + > > + return jmedia; > > +} > > + > > +struct cxl_poison_ctx { > > + void *dev; > > + bool is_region; > > +}; > > This structure is a bit awkward - what do you think about creating > different wrappers for the memdev and region case - > > util_cxl_memdev_poison_list_to_json(), and > util_cxl_region_poison_list_to_json() that are called respectively by > > util_cxl_{memdev,region}_to_json(), and internally they can call: > > util_cxl_poison_list_to_json(NULL, memdev, flags), or > util_cxl_poison_list_to_json(region, NULL, flags) > > For the next level down, i.e. poison_events_to_json, the @is_region > bool passed in directly is fine as it doesn't need the memdev or region > objects passed in via void *. Thanks! I did something like this but didn't actually add wrappers. Please look in v3 and let me know. > snip > > + if (pctx->is_region) > > + rc = cxl_region_trigger_poison_list(pctx->dev); > > + else > > + rc = cxl_memdev_trigger_poison_list(pctx->dev); > > + if (rc) { > > + fprintf(stderr, "Failed write of sysfs attribute: %d\n", rc); > > This would be incorrect if the memdev trigger reported an ENOMEM, and > then this reported a sysfs write failure. > > It should at least be something like 'failed to trigger poison" - but > since the memdev trigger helper has prints for every failure case, > maybe this can just be omitted? Removed it. > snip > > diff --git a/util/json.h b/util/json.h > > index ea370df4d1b7..3ae4074a95c3 100644 > > --- a/util/json.h > > +++ b/util/json.h > > @@ -21,6 +21,7 @@ enum util_json_flags { > > UTIL_JSON_TARGETS = (1 << 11), > > UTIL_JSON_PARTITION = (1 << 12), > > UTIL_JSON_ALERT_CONFIG = (1 << 13), > > + UTIL_JSON_POISON_LIST = (1 << 14), > > There's already a UTIL_JSON_MEDIA_ERRORS, can we just reuse that (in > spite of the name :)) Since it's not visible to user, changed it. Thanks for the review Vishal! >
diff --git a/cxl/json.c b/cxl/json.c index 7678d02020b6..36db73de4f8f 100644 --- a/cxl/json.c +++ b/cxl/json.c @@ -2,15 +2,19 @@ // Copyright (C) 2015-2021 Intel Corporation. All rights reserved. #include <limits.h> #include <util/json.h> +#include <util/bitmap.h> #include <uuid/uuid.h> #include <cxl/libcxl.h> #include <json-c/json.h> #include <json-c/printbuf.h> #include <ccan/short_types/short_types.h> +#include <traceevent/event-parse.h> +#include <tracefs/tracefs.h> #include "filter.h" #include "json.h" #include "../daxctl/json.h" +#include "event_trace.h" #define CXL_FW_VERSION_STR_LEN 16 #define CXL_FW_MAX_SLOTS 4 @@ -571,6 +575,190 @@ err_jobj: return NULL; } +/* CXL 8.2.9.5.4.1 Get Poison List: Poison Source */ +#define CXL_POISON_SOURCE_UNKNOWN 0 +#define CXL_POISON_SOURCE_EXTERNAL 1 +#define CXL_POISON_SOURCE_INTERNAL 2 +#define CXL_POISON_SOURCE_INJECTED 3 +#define CXL_POISON_SOURCE_VENDOR 7 + +/* CXL 8.2.9.5.4.1 Get Poison List: Payload out flags */ +#define CXL_POISON_FLAG_MORE BIT(0) +#define CXL_POISON_FLAG_OVERFLOW BIT(1) +#define CXL_POISON_FLAG_SCANNING BIT(2) + +static struct json_object * +util_cxl_poison_events_to_json(struct tracefs_instance *inst, bool is_region, + unsigned long flags) +{ + struct json_object *jerrors, *jmedia, *jobj = NULL; + struct jlist_node *jnode, *next; + struct event_ctx ectx = { + .event_name = "cxl_poison", + .event_pid = getpid(), + .system = "cxl", + }; + int rc, count = 0; + + list_head_init(&ectx.jlist_head); + rc = cxl_parse_events(inst, &ectx); + if (rc < 0) { + fprintf(stderr, "Failed to parse events: %d\n", rc); + return NULL; + } + /* Add nr_poison_records:0 to json */ + if (list_empty(&ectx.jlist_head)) + goto out; + + jerrors = json_object_new_array(); + if (!jerrors) + return NULL; + + list_for_each_safe(&ectx.jlist_head, jnode, next, list) { + struct json_object *jval = NULL; + struct json_object *jp = NULL; + int source, pflags; + u64 addr, len; + + jp = json_object_new_object(); + if (!jp) + return NULL; + + if (is_region) { + /* Add the memdev name in a by region list */ + if (json_object_object_get_ex(jnode->jobj, "memdev", + &jval)) + json_object_object_add(jp, "memdev", jval); + } + + /* + * When listing is by memdev, region names and valid HPAs + * will appear if the poison address is part of a region. + * Pick up those valid region names and HPAs but ignore the + * empties and invalids. + */ + + /* Only add non NULL region names */ + if (json_object_object_get_ex(jnode->jobj, "region", &jval)) { + if (strlen(json_object_get_string(jval)) != 0) + json_object_object_add(jp, "region", jval); + } + /* Only display valid HPAs */ + if (json_object_object_get_ex(jnode->jobj, "hpa", &jval)) { + addr = json_object_get_uint64(jval); + if (addr != ULLONG_MAX) { + jobj = util_json_object_hex(addr, flags); + json_object_object_add(jp, "hpa", jobj); + } + } + if (json_object_object_get_ex(jnode->jobj, "dpa", &jval)) { + addr = json_object_get_int64(jval); + jobj = util_json_object_hex(addr, flags); + json_object_object_add(jp, "dpa", jobj); + } + if (json_object_object_get_ex(jnode->jobj, "dpa_length", &jval)) { + len = json_object_get_int64(jval); + jobj = util_json_object_size(len, flags); + json_object_object_add(jp, "dpa_length", jobj); + } + if (json_object_object_get_ex(jnode->jobj, "source", &jval)) { + source = json_object_get_int(jval); + if (source == CXL_POISON_SOURCE_UNKNOWN) + jobj = json_object_new_string("Unknown"); + else if (source == CXL_POISON_SOURCE_EXTERNAL) + jobj = json_object_new_string("External"); + else if (source == CXL_POISON_SOURCE_INTERNAL) + jobj = json_object_new_string("Internal"); + else if (source == CXL_POISON_SOURCE_INJECTED) + jobj = json_object_new_string("Injected"); + else if (source == CXL_POISON_SOURCE_VENDOR) + jobj = json_object_new_string("Vendor"); + else + jobj = json_object_new_string("Reserved"); + json_object_object_add(jp, "source", jobj); + } + if (json_object_object_get_ex(jnode->jobj, "flags", &jval)) { + char flag_str[32] = { '\0' }; + + pflags = json_object_get_int(jval); + if (pflags & CXL_POISON_FLAG_MORE) + strcat(flag_str, "More,"); + if (pflags & CXL_POISON_FLAG_OVERFLOW) + strcat(flag_str, "Overflow,"); + if (pflags & CXL_POISON_FLAG_SCANNING) + strcat(flag_str, "Scanning,"); + jobj = json_object_new_string(flag_str); + if (jobj) + json_object_object_add(jp, "flags", jobj); + } + if (json_object_object_get_ex(jnode->jobj, "overflow_t", &jval)) + json_object_object_add(jp, "overflow_time", jval); + + json_object_array_add(jerrors, jp); + count++; + } /* list_for_each_safe */ + +out: + jmedia = json_object_new_object(); + if (!jmedia) + return NULL; + + /* Always include the count. If count is zero, no records follow. */ + jobj = json_object_new_int(count); + if (jobj) + json_object_object_add(jmedia, "nr_poison_records", jobj); + if (count) + json_object_object_add(jmedia, "poison_records", jerrors); + + return jmedia; +} + +struct cxl_poison_ctx { + void *dev; + bool is_region; +}; + +static struct json_object * +util_cxl_poison_list_to_json(struct cxl_poison_ctx *pctx, + unsigned long flags) +{ + struct json_object *jmedia = NULL; + struct tracefs_instance *inst; + int rc; + + inst = tracefs_instance_create("cxl list"); + if (!inst) { + fprintf(stderr, "tracefs_instance_create() failed\n"); + return NULL; + } + + rc = cxl_event_tracing_enable(inst, "cxl", "cxl_poison"); + if (rc < 0) { + fprintf(stderr, "Failed to enable trace: %d\n", rc); + goto err_free; + } + + if (pctx->is_region) + rc = cxl_region_trigger_poison_list(pctx->dev); + else + rc = cxl_memdev_trigger_poison_list(pctx->dev); + if (rc) { + fprintf(stderr, "Failed write of sysfs attribute: %d\n", rc); + goto err_free; + } + + rc = cxl_event_tracing_disable(inst); + if (rc < 0) { + fprintf(stderr, "Failed to disable trace: %d\n", rc); + goto err_free; + } + + jmedia = util_cxl_poison_events_to_json(inst, pctx->is_region, flags); +err_free: + tracefs_instance_free(inst); + return jmedia; +} + struct json_object *util_cxl_memdev_to_json(struct cxl_memdev *memdev, unsigned long flags) { @@ -649,6 +837,16 @@ struct json_object *util_cxl_memdev_to_json(struct cxl_memdev *memdev, json_object_object_add(jdev, "firmware", jobj); } + if (flags & UTIL_JSON_POISON_LIST) { + struct cxl_poison_ctx pctx = { + .dev = memdev, + .is_region = false, + }; + jobj = util_cxl_poison_list_to_json(&pctx, flags); + if (jobj) + json_object_object_add(jdev, "poison", jobj); + } + json_object_set_userdata(jdev, memdev, NULL); return jdev; } @@ -987,6 +1185,16 @@ struct json_object *util_cxl_region_to_json(struct cxl_region *region, json_object_object_add(jregion, "state", jobj); } + if (flags & UTIL_JSON_POISON_LIST) { + struct cxl_poison_ctx pectx = { + .dev = region, + .is_region = true, + }; + jobj = util_cxl_poison_list_to_json(&pectx, flags); + if (jobj) + json_object_object_add(jregion, "poison", jobj); + } + util_cxl_mappings_append_json(jregion, region, flags); if (flags & UTIL_JSON_DAX) { diff --git a/util/json.h b/util/json.h index ea370df4d1b7..3ae4074a95c3 100644 --- a/util/json.h +++ b/util/json.h @@ -21,6 +21,7 @@ enum util_json_flags { UTIL_JSON_TARGETS = (1 << 11), UTIL_JSON_PARTITION = (1 << 12), UTIL_JSON_ALERT_CONFIG = (1 << 13), + UTIL_JSON_POISON_LIST = (1 << 14), }; void util_display_json_array(FILE *f_out, struct json_object *jarray,