diff mbox series

[RFC,1/9] cxl/mem: Implement Get Event Records command

Message ID 20220813053243.757363-2-ira.weiny@intel.com
State New, archived
Headers show
Series CXL: Read and clear event logs | expand

Commit Message

Ira Weiny Aug. 13, 2022, 5:32 a.m. UTC
From: Ira Weiny <ira.weiny@intel.com>

Event records are defined for CXL devices.  Each record is reported in
one event log.  Devices are required to support the storage of at least
one event record in each event log type.

Devices track event log overflow by incrementing a counter and tracking
the time of the first and last overflow event seen.

Software queries events via the Get Event Record mailbox command; CXL
v3.0 section 8.2.9.2.2.

Issue the Get Event Record mailbox command on driver load.  Trace each
record found, as well as any overflow conditions.  Only 1 event is
requested for each query.  Optimization of multiple record queries is
deferred.

This patch traces a raw event record only and leaves the specific event
record types to subsequent patches.

NOTE: checkpatch is not completely happy with the tracing part of this
patch but AFAICT it is correct.  I'm open to suggestions if I've done
something wrong.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
 MAINTAINERS                       |   1 +
 drivers/cxl/core/mbox.c           |  60 ++++++++++++++
 drivers/cxl/cxlmem.h              |  66 ++++++++++++++++
 include/trace/events/cxl-events.h | 127 ++++++++++++++++++++++++++++++
 include/uapi/linux/cxl_mem.h      |   1 +
 5 files changed, 255 insertions(+)
 create mode 100644 include/trace/events/cxl-events.h

Comments

Steven Rostedt Aug. 16, 2022, 4:39 p.m. UTC | #1
On Fri, 12 Aug 2022 22:32:35 -0700
ira.weiny@intel.com wrote:

> From: Ira Weiny <ira.weiny@intel.com>
> 
> Event records are defined for CXL devices.  Each record is reported in
> one event log.  Devices are required to support the storage of at least
> one event record in each event log type.
> 
> Devices track event log overflow by incrementing a counter and tracking
> the time of the first and last overflow event seen.
> 
> Software queries events via the Get Event Record mailbox command; CXL
> v3.0 section 8.2.9.2.2.
> 
> Issue the Get Event Record mailbox command on driver load.  Trace each
> record found, as well as any overflow conditions.  Only 1 event is
> requested for each query.  Optimization of multiple record queries is
> deferred.
> 
> This patch traces a raw event record only and leaves the specific event
> record types to subsequent patches.
> 
> NOTE: checkpatch is not completely happy with the tracing part of this
> patch but AFAICT it is correct.  I'm open to suggestions if I've done
> something wrong.

The include/trace/events/*.h files are all broken according to
checkpatch.pl ;-) Don't worry about the formatting there. I need to update
that script to detect that it's looking at TRACE_EVENT() that has different
rules than normal macros.

> 
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> ---
>  MAINTAINERS                       |   1 +
>  drivers/cxl/core/mbox.c           |  60 ++++++++++++++
>  drivers/cxl/cxlmem.h              |  66 ++++++++++++++++
>  include/trace/events/cxl-events.h | 127 ++++++++++++++++++++++++++++++
>  include/uapi/linux/cxl_mem.h      |   1 +
>  5 files changed, 255 insertions(+)
>  create mode 100644 include/trace/events/cxl-events.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 54fa6e2059de..1cb9cec31009 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -5014,6 +5014,7 @@ M:	Dan Williams <dan.j.williams@intel.com>
>  L:	linux-cxl@vger.kernel.org
>  S:	Maintained
>  F:	drivers/cxl/
> +F:	include/trace/events/cxl*.h
>  F:	include/uapi/linux/cxl_mem.h
>  
>  CONEXANT ACCESSRUNNER USB DRIVER
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 16176b9278b4..2cceed8608dc 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -7,6 +7,9 @@
>  #include <cxlmem.h>
>  #include <cxl.h>
>  
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/cxl-events.h>
> +
>  #include "core.h"
>  
>  static bool cxl_raw_allow_all;
> @@ -48,6 +51,7 @@ static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = {
>  	CXL_CMD(RAW, CXL_VARIABLE_PAYLOAD, CXL_VARIABLE_PAYLOAD, 0),
>  #endif
>  	CXL_CMD(GET_SUPPORTED_LOGS, 0, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE),
> +	CXL_CMD(GET_EVENT_RECORD, 1, CXL_VARIABLE_PAYLOAD, 0),
>  	CXL_CMD(GET_FW_INFO, 0, 0x50, 0),
>  	CXL_CMD(GET_PARTITION_INFO, 0, 0x20, 0),
>  	CXL_CMD(GET_LSA, 0x8, CXL_VARIABLE_PAYLOAD, 0),
> @@ -704,6 +708,62 @@ int cxl_enumerate_cmds(struct cxl_dev_state *cxlds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
>  
> +static int cxl_mem_get_records_log(struct cxl_dev_state *cxlds,
> +				   enum cxl_event_log_type type)
> +{
> +	struct cxl_get_event_payload payload;
> +
> +	do {
> +		u8 log_type = type;
> +		u16 record_count;
> +		int rc;
> +
> +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_EVENT_RECORD,
> +				       &log_type, sizeof(log_type),
> +				       &payload, sizeof(payload));
> +		if (rc)
> +			return rc;
> +
> +		record_count = le16_to_cpu(payload.record_count);
> +		if (record_count > 0)
> +			trace_cxl_event(dev_name(cxlds->dev), type,
> +					&payload.record);
> +
> +		if (payload.flags & CXL_GET_EVENT_FLAG_OVERFLOW)
> +			trace_cxl_event_overflow(dev_name(cxlds->dev), type,
> +						 &payload);

If you want to avoid the compare operations when the tracepoints are not
enabled, you can add:

		if (trace_cxl_event_enabled()) {
			if (record_count > 0)
				trace_cxl_event(dev_name(cxlds->dev), type,
						&payload.record);
		}

		if (trace_cxl_event_overflow_enabled()) {
			if (payload.flags & CXL_GET_EVENT_FLAG_OVERFLOW)
				trace_cxl_event_overflow(dev_name(cxlds->dev), type,
							 &payload);
		}

Those "<tracepoint>_enabled()" functions are static branches. Which means
when not enabled it's a nop that skips this code, and when either is
enabled, it turns into a jump to the contents of the if block.


> +
> +	} while (payload.flags & CXL_GET_EVENT_FLAG_MORE_RECORDS);
> +
> +	return 0;
> +}
> +
> +/**
> + * cxl_mem_get_event_records - Get Event Records from the device
> + * @cxlds: The device data for the operation
> + *
> + * Retrieve all event records available on the device and report them as trace
> + * events.
> + *
> + * See CXL v3.0 @8.2.9.2.2 Get Event Records
> + */
> +void cxl_mem_get_event_records(struct cxl_dev_state *cxlds)
> +{
> +	struct device *dev = cxlds->dev;
> +	enum cxl_event_log_type log_type;
> +
> +	for (log_type = CXL_EVENT_TYPE_INFO;
> +	     log_type < CXL_EVENT_TYPE_MAX; log_type++) {
> +		int rc;
> +
> +		rc = cxl_mem_get_records_log(cxlds, log_type);
> +		if (rc)
> +			dev_err(dev, "Failed to query %s Event Logs : %d",
> +				cxl_event_log_type_str(log_type), rc);
> +	}
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_event_records, CXL);
> +
>  /**
>   * cxl_mem_get_partition_info - Get partition info
>   * @cxlds: The device data for the operation
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 88e3a8e54b6a..f83634f3bc8d 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -4,6 +4,7 @@
>  #define __CXL_MEM_H__
>  #include <uapi/linux/cxl_mem.h>
>  #include <linux/cdev.h>
> +#include <linux/uuid.h>
>  #include "cxl.h"
>  
>  /* CXL 2.0 8.2.8.5.1.1 Memory Device Status Register */
> @@ -253,6 +254,7 @@ struct cxl_dev_state {
>  enum cxl_opcode {
>  	CXL_MBOX_OP_INVALID		= 0x0000,
>  	CXL_MBOX_OP_RAW			= CXL_MBOX_OP_INVALID,
> +	CXL_MBOX_OP_GET_EVENT_RECORD	= 0x0100,
>  	CXL_MBOX_OP_GET_FW_INFO		= 0x0200,
>  	CXL_MBOX_OP_ACTIVATE_FW		= 0x0202,
>  	CXL_MBOX_OP_GET_SUPPORTED_LOGS	= 0x0400,
> @@ -322,6 +324,69 @@ struct cxl_mbox_identify {
>  	u8 qos_telemetry_caps;
>  } __packed;
>  
> +/*
> + * Common Event Record Format
> + * CXL v3.0 section 8.2.9.2.1; Table 8-42
> + */
> +struct cxl_event_record_hdr {
> +	uuid_t id;
> +	__le32 flags_length;
> +	__le16 handle;
> +	__le16 related_handle;
> +	__le64 timestamp;
> +	__le64 reserved1;
> +	__le64 reserved2;
> +} __packed;
> +
> +#define EVENT_RECORD_DATA_LENGTH 0x50
> +struct cxl_event_record_raw {
> +	struct cxl_event_record_hdr hdr;
> +	u8 data[EVENT_RECORD_DATA_LENGTH];
> +} __packed;
> +
> +/*
> + * Get Event Records output payload
> + * CXL v3.0 section 8.2.9.2.2; Table 8-50
> + *
> + * Space given for 1 record
> + */
> +#define CXL_GET_EVENT_FLAG_OVERFLOW		BIT(0)
> +#define CXL_GET_EVENT_FLAG_MORE_RECORDS	BIT(1)
> +struct cxl_get_event_payload {
> +	u8 flags;
> +	u8 reserved1;
> +	__le16 overflow_err_count;
> +	__le64 first_overflow_timestamp;
> +	__le64 last_overflow_timestamp;
> +	__le16 record_count;
> +	u8 reserved2[0xa];
> +	struct cxl_event_record_raw record;
> +} __packed;
> +
> +enum cxl_event_log_type {
> +	CXL_EVENT_TYPE_INFO = 0x00,
> +	CXL_EVENT_TYPE_WARN,
> +	CXL_EVENT_TYPE_FAIL,
> +	CXL_EVENT_TYPE_FATAL,
> +	CXL_EVENT_TYPE_MAX
> +};
> +static inline char *cxl_event_log_type_str(enum cxl_event_log_type type)
> +{
> +	switch (type) {
> +	case CXL_EVENT_TYPE_INFO:
> +		return "Informational";
> +	case CXL_EVENT_TYPE_WARN:
> +		return "Warning";
> +	case CXL_EVENT_TYPE_FAIL:
> +		return "Failure";
> +	case CXL_EVENT_TYPE_FATAL:
> +		return "Fatal";
> +	default:
> +		break;
> +	}
> +	return "<unknown>";
> +}
> +
>  struct cxl_mbox_get_partition_info {
>  	__le64 active_volatile_cap;
>  	__le64 active_persistent_cap;
> @@ -381,6 +446,7 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
>  struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
>  void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
>  void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
> +void cxl_mem_get_event_records(struct cxl_dev_state *cxlds);
>  #ifdef CONFIG_CXL_SUSPEND
>  void cxl_mem_active_inc(void);
>  void cxl_mem_active_dec(void);
> diff --git a/include/trace/events/cxl-events.h b/include/trace/events/cxl-events.h
> new file mode 100644
> index 000000000000..f4baeae66cf3
> --- /dev/null
> +++ b/include/trace/events/cxl-events.h
> @@ -0,0 +1,127 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM cxl_events
> +
> +#if !defined(_CXL_TRACE_EVENTS_H) ||  defined(TRACE_HEADER_MULTI_READ)
> +#define _CXL_TRACE_EVENTS_H
> +
> +#include <linux/tracepoint.h>
> +
> +#define EVENT_LOGS					\
> +	EM(CXL_EVENT_TYPE_INFO,		"Info")		\
> +	EM(CXL_EVENT_TYPE_WARN,		"Warning")	\
> +	EM(CXL_EVENT_TYPE_FAIL,		"Failure")	\
> +	EM(CXL_EVENT_TYPE_FATAL,	"Fatal")	\
> +	EMe(CXL_EVENT_TYPE_MAX,		"<undefined>")
> +
> +/*
> + * First define the enums in the above macros to be exported to userspace via
> + * TRACE_DEFINE_ENUM().
> + */
> +#undef EM
> +#undef EMe
> +#define EM(a, b)	TRACE_DEFINE_ENUM(a);
> +#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
> +
> +EVENT_LOGS
> +#define show_log_type(type) __print_symbolic(type, EVENT_LOGS)
> +
> +/*
> + * Now redefine the EM and EMe macros to map the enums to the strings that will
> + * be printed in the output
> + */
> +#undef EM
> +#undef EMe
> +#define EM(a, b)        {a, b},
> +#define EMe(a, b)       {a, b}
> +
> +TRACE_EVENT(cxl_event_overflow,
> +
> +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> +		 struct cxl_get_event_payload *payload),
> +
> +	TP_ARGS(dev_name, log, payload),
> +
> +	TP_STRUCT__entry(
> +		__string(dev_name, dev_name)
> +		__field(int, log)
> +		__field(u16, count)
> +		__field(u64, first)
> +		__field(u64, last)

Because you have a dynamic string, you will save some bytes in the ring
buffer if you have:

		__string(dev_name, dev_name)
		__field(int, log)
		__field(u64, first)
		__field(u64, last)
		__field(u16, count)


> +	),
> +
> +	TP_fast_assign(
> +		__assign_str(dev_name, dev_name);
> +		__entry->log = log;
> +		__entry->count = le16_to_cpu(payload->overflow_err_count);
> +		__entry->first = le64_to_cpu(payload->first_overflow_timestamp);
> +		__entry->last = le64_to_cpu(payload->last_overflow_timestamp);
> +	),
> +
> +	TP_printk("%s: EVENT LOG %s OVERFLOW %u records from %llu to %llu",
> +		__get_str(dev_name), show_log_type(__entry->log),
> +		__entry->count, __entry->first, __entry->last)
> +
> +);
> +
> +/*
> + * Common Event Record Format
> + * CXL v2.0 section 8.2.9.1.1; Table 153
> + */
> +#define CXL_EVENT_RECORD_FLAG_PERMANENT		BIT(2)
> +#define CXL_EVENT_RECORD_FLAG_MAINT_NEEDED	BIT(3)
> +#define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED	BIT(4)
> +#define CXL_EVENT_RECORD_FLAG_HW_REPLACE	BIT(5)
> +#define show_hdr_flags(flags)	__print_flags(flags, " | ",			   \
> +	{ CXL_EVENT_RECORD_FLAG_PERMANENT,	"Permanent Condition"		}, \
> +	{ CXL_EVENT_RECORD_FLAG_MAINT_NEEDED,	"Maintanance Needed"		}, \
> +	{ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,	"Performance Degraded"		}, \
> +	{ CXL_EVENT_RECORD_FLAG_HW_REPLACE,	"Hardware Replacement Needed"	}  \
> +)
> +
> +TRACE_EVENT(cxl_event,
> +
> +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> +		 struct cxl_event_record_raw *rec),
> +
> +	TP_ARGS(dev_name, log, rec),
> +
> +	TP_STRUCT__entry(
> +		__string(dev_name, dev_name)
> +		__field(int, log)
> +		__array(u8, id, UUID_SIZE)
> +		__field(u32, flags)
> +		__field(u16, handle)
> +		__field(u16, related_handle)
> +		__field(u64, timestamp)
> +		__array(u8, data, EVENT_RECORD_DATA_LENGTH)
> +		__field(u8, length)

The above looks good.

> +	),
> +
> +	TP_fast_assign(
> +		__assign_str(dev_name, dev_name);
> +		memcpy(__entry->id, &rec->hdr.id, UUID_SIZE);
> +		__entry->log = log;
> +		__entry->flags = le32_to_cpu(rec->hdr.flags_length) >> 8;
> +		__entry->length = le32_to_cpu(rec->hdr.flags_length) & 0xFF;
> +		__entry->handle = le16_to_cpu(rec->hdr.handle);
> +		__entry->related_handle = le16_to_cpu(rec->hdr.related_handle);
> +		__entry->timestamp = le64_to_cpu(rec->hdr.timestamp);

I wonder if I should add le64_to_cpu() and le32_to_cpu() to the functions
that libtraceevent can parse, and then we could move that logic to the
TP_printk(). That is, out of the fast path.

-- Steve

> +		memcpy(__entry->data, &rec->data, EVENT_RECORD_DATA_LENGTH);
> +	),
> +
> +	TP_printk("%s: %s time=%llu id=%pUl handle=%x related_handle=%x hdr_flags='%s' " \
> +		  ": %s",
> +		__get_str(dev_name), show_log_type(__entry->log),
> +		__entry->timestamp, __entry->id, __entry->handle,
> +		__entry->related_handle, show_hdr_flags(__entry->flags),
> +		__print_hex(__entry->data, EVENT_RECORD_DATA_LENGTH)
> +		)
> +);
> +
> +#endif /* _CXL_TRACE_EVENTS_H */
> +
> +/* This part must be outside protection */
> +#undef TRACE_INCLUDE_FILE
> +#define TRACE_INCLUDE_FILE cxl-events
> +#include <trace/define_trace.h>
> diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h
> index c71021a2a9ed..70459be5bdd4 100644
> --- a/include/uapi/linux/cxl_mem.h
> +++ b/include/uapi/linux/cxl_mem.h
> @@ -24,6 +24,7 @@
>  	___C(IDENTIFY, "Identify Command"),                               \
>  	___C(RAW, "Raw device command"),                                  \
>  	___C(GET_SUPPORTED_LOGS, "Get Supported Logs"),                   \
> +	___C(GET_EVENT_RECORD, "Get Event Record"),                       \
>  	___C(GET_FW_INFO, "Get FW Info"),                                 \
>  	___C(GET_PARTITION_INFO, "Get Partition Information"),            \
>  	___C(GET_LSA, "Get Label Storage Area"),                          \
Steven Rostedt Aug. 16, 2022, 4:41 p.m. UTC | #2
On Tue, 16 Aug 2022 12:39:58 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> > +		record_count = le16_to_cpu(payload.record_count);
> > +		if (record_count > 0)
> > +			trace_cxl_event(dev_name(cxlds->dev), type,
> > +					&payload.record);
> > +
> > +		if (payload.flags & CXL_GET_EVENT_FLAG_OVERFLOW)
> > +			trace_cxl_event_overflow(dev_name(cxlds->dev), type,
> > +						 &payload);  
> 
> If you want to avoid the compare operations when the tracepoints are not
> enabled, you can add:
> 
> 		if (trace_cxl_event_enabled()) {
> 			if (record_count > 0)
> 				trace_cxl_event(dev_name(cxlds->dev), type,
> 						&payload.record);
> 		}
> 
> 		if (trace_cxl_event_overflow_enabled()) {
> 			if (payload.flags & CXL_GET_EVENT_FLAG_OVERFLOW)
> 				trace_cxl_event_overflow(dev_name(cxlds->dev), type,
> 							 &payload);
> 		}
> 
> Those "<tracepoint>_enabled()" functions are static branches. Which means
> when not enabled it's a nop that skips this code, and when either is
> enabled, it turns into a jump to the contents of the if block.

Ignore this suggestion. I see in the second patch you add more logic to the
if condition. Only use this suggestion if the logic is only for when the
tracepoint is enabled.

-- Steve
Ira Weiny Aug. 16, 2022, 11:11 p.m. UTC | #3
On Tue, Aug 16, 2022 at 12:41:28PM -0400, Steven Rostedt wrote:
> On Tue, 16 Aug 2022 12:39:58 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > > +		record_count = le16_to_cpu(payload.record_count);
> > > +		if (record_count > 0)
> > > +			trace_cxl_event(dev_name(cxlds->dev), type,
> > > +					&payload.record);
> > > +
> > > +		if (payload.flags & CXL_GET_EVENT_FLAG_OVERFLOW)
> > > +			trace_cxl_event_overflow(dev_name(cxlds->dev), type,
> > > +						 &payload);  
> > 
> > If you want to avoid the compare operations when the tracepoints are not
> > enabled, you can add:
> > 
> > 		if (trace_cxl_event_enabled()) {
> > 			if (record_count > 0)
> > 				trace_cxl_event(dev_name(cxlds->dev), type,
> > 						&payload.record);
> > 		}
> > 
> > 		if (trace_cxl_event_overflow_enabled()) {
> > 			if (payload.flags & CXL_GET_EVENT_FLAG_OVERFLOW)
> > 				trace_cxl_event_overflow(dev_name(cxlds->dev), type,
> > 							 &payload);
> > 		}
> > 
> > Those "<tracepoint>_enabled()" functions are static branches. Which means
> > when not enabled it's a nop that skips this code, and when either is
> > enabled, it turns into a jump to the contents of the if block.
> 
> Ignore this suggestion. I see in the second patch you add more logic to the
> if condition.

Correct.

> Only use this suggestion if the logic is only for when the
> tracepoint is enabled.

This could apply to the overflow trace.  I think it is more likely that either
all of these traces will be enabled or none.

So other than doing:

	if (trace_cxl_event_enabled() ||
	    trace_cxl_event_overflow_enabled() ||
	    trace_cxl_gen_media_event() ||
	    <every event defined>
	    ...) {

Is there a way to know if 

	/sys/kernel/tracing/events/cxl_events/enable

is != 0?

I feel like the real optimization will be to shut this entire functionality
down if no one is listening.

Ira
Ira Weiny Aug. 16, 2022, 11:35 p.m. UTC | #4
On Tue, Aug 16, 2022 at 12:39:58PM -0400, Steven Rostedt wrote:
> On Fri, 12 Aug 2022 22:32:35 -0700
> ira.weiny@intel.com wrote:
> 
> > From: Ira Weiny <ira.weiny@intel.com>
> > 
> > Event records are defined for CXL devices.  Each record is reported in
> > one event log.  Devices are required to support the storage of at least
> > one event record in each event log type.
> > 
> > Devices track event log overflow by incrementing a counter and tracking
> > the time of the first and last overflow event seen.
> > 
> > Software queries events via the Get Event Record mailbox command; CXL
> > v3.0 section 8.2.9.2.2.
> > 
> > Issue the Get Event Record mailbox command on driver load.  Trace each
> > record found, as well as any overflow conditions.  Only 1 event is
> > requested for each query.  Optimization of multiple record queries is
> > deferred.
> > 
> > This patch traces a raw event record only and leaves the specific event
> > record types to subsequent patches.
> > 
> > NOTE: checkpatch is not completely happy with the tracing part of this
> > patch but AFAICT it is correct.  I'm open to suggestions if I've done
> > something wrong.
> 
> The include/trace/events/*.h files are all broken according to
> checkpatch.pl ;-) Don't worry about the formatting there. I need to update
> that script to detect that it's looking at TRACE_EVENT() that has different
> rules than normal macros.

Thanks!

[snip]

> > +
> > +/*
> > + * Now redefine the EM and EMe macros to map the enums to the strings that will
> > + * be printed in the output
> > + */
> > +#undef EM
> > +#undef EMe
> > +#define EM(a, b)        {a, b},
> > +#define EMe(a, b)       {a, b}
> > +
> > +TRACE_EVENT(cxl_event_overflow,
> > +
> > +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> > +		 struct cxl_get_event_payload *payload),
> > +
> > +	TP_ARGS(dev_name, log, payload),
> > +
> > +	TP_STRUCT__entry(
> > +		__string(dev_name, dev_name)
> > +		__field(int, log)
> > +		__field(u16, count)
> > +		__field(u64, first)
> > +		__field(u64, last)
> 
> Because you have a dynamic string, you will save some bytes in the ring
> buffer if you have:
> 
> 		__string(dev_name, dev_name)
> 		__field(int, log)
> 		__field(u64, first)
> 		__field(u64, last)
> 		__field(u16, count)

Thanks I missed this one.  I was trying to pack things better but I missed this
one.

> 
> 
> > +	),
> > +
> > +	TP_fast_assign(
> > +		__assign_str(dev_name, dev_name);
> > +		__entry->log = log;
> > +		__entry->count = le16_to_cpu(payload->overflow_err_count);
> > +		__entry->first = le64_to_cpu(payload->first_overflow_timestamp);
> > +		__entry->last = le64_to_cpu(payload->last_overflow_timestamp);
> > +	),
> > +
> > +	TP_printk("%s: EVENT LOG %s OVERFLOW %u records from %llu to %llu",
> > +		__get_str(dev_name), show_log_type(__entry->log),
> > +		__entry->count, __entry->first, __entry->last)
> > +
> > +);
> > +
> > +/*
> > + * Common Event Record Format
> > + * CXL v2.0 section 8.2.9.1.1; Table 153
> > + */
> > +#define CXL_EVENT_RECORD_FLAG_PERMANENT		BIT(2)
> > +#define CXL_EVENT_RECORD_FLAG_MAINT_NEEDED	BIT(3)
> > +#define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED	BIT(4)
> > +#define CXL_EVENT_RECORD_FLAG_HW_REPLACE	BIT(5)
> > +#define show_hdr_flags(flags)	__print_flags(flags, " | ",			   \
> > +	{ CXL_EVENT_RECORD_FLAG_PERMANENT,	"Permanent Condition"		}, \
> > +	{ CXL_EVENT_RECORD_FLAG_MAINT_NEEDED,	"Maintanance Needed"		}, \
> > +	{ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,	"Performance Degraded"		}, \
> > +	{ CXL_EVENT_RECORD_FLAG_HW_REPLACE,	"Hardware Replacement Needed"	}  \
> > +)
> > +
> > +TRACE_EVENT(cxl_event,
> > +
> > +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> > +		 struct cxl_event_record_raw *rec),
> > +
> > +	TP_ARGS(dev_name, log, rec),
> > +
> > +	TP_STRUCT__entry(
> > +		__string(dev_name, dev_name)
> > +		__field(int, log)
> > +		__array(u8, id, UUID_SIZE)
> > +		__field(u32, flags)
> > +		__field(u16, handle)
> > +		__field(u16, related_handle)
> > +		__field(u64, timestamp)
> > +		__array(u8, data, EVENT_RECORD_DATA_LENGTH)
> > +		__field(u8, length)
> 
> The above looks good.
> 
> > +	),
> > +
> > +	TP_fast_assign(
> > +		__assign_str(dev_name, dev_name);
> > +		memcpy(__entry->id, &rec->hdr.id, UUID_SIZE);
> > +		__entry->log = log;
> > +		__entry->flags = le32_to_cpu(rec->hdr.flags_length) >> 8;
> > +		__entry->length = le32_to_cpu(rec->hdr.flags_length) & 0xFF;
> > +		__entry->handle = le16_to_cpu(rec->hdr.handle);
> > +		__entry->related_handle = le16_to_cpu(rec->hdr.related_handle);
> > +		__entry->timestamp = le64_to_cpu(rec->hdr.timestamp);
> 
> I wonder if I should add le64_to_cpu() and le32_to_cpu() to the functions
> that libtraceevent can parse, and then we could move that logic to the
> TP_printk(). That is, out of the fast path.

I would not do it for this series.  I don't see performance on these traces
being an issue.  The built in logging the trace mechanism provides (space
considerations) as well as user API to gain access to the data is what we are
leveraging more.

What I would really like (and was looking for the time to enhance) would be a
way to create 'sub-class' like events.

In this case each of the traces comes with a common header which is part of the
generic cxl_event trace point.

It would be nice if we could define something like:

DECLARE_EVENT_BASE_CLASS(event_header,

	TP_PROTO(const char *dev_name, enum cxl_event_log_type log_type,
		 struct cxl_event_record_hdr hdr),
	...

	TP_base_printk(<print header fields>),
);

Then somehow use that header in the 

TRACE_EVENT(cxl_event,

	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
		 struct cxl_event_record_raw *rec),

	TP_SUB_CLASS(dev_name, log, (struct cxl_event_record_hdr *)rec),

	TP_fast_assign(
		call_base_assign(rec),
		...
	),

	TP_printk(<automatically print header fields>
		  <print raw data>),
	...
);

TRACE_EVENT(cxl_gen_media_event,

	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
		 struct cxl_evt_gen_media *rec),

	TP_SUB_CLASS(dev_name, log, (struct cxl_event_record_hdr *)rec),

	TP_fast_assign(
		call_base_assign(rec),
		...
	),

	TP_printk(<automatically print header fields>
		  <print media event fields>),
	...
);

<etc>

Does that make sense?  I've no idea how this could be done.  But I think the
real work would be in the printk merging between the base class (header prints)
and the rest of the record.  I _think_ that the fast assign could just call the
assign defined in the base class somehow.

Am I way off base thinking this is possible?

Thanks for the review!
Ira
Dave Jiang Aug. 17, 2022, 10:54 p.m. UTC | #5
On 8/12/2022 10:32 PM, ira.weiny@intel.com wrote:
> From: Ira Weiny <ira.weiny@intel.com>
>
> Event records are defined for CXL devices.  Each record is reported in
> one event log.  Devices are required to support the storage of at least
> one event record in each event log type.
>
> Devices track event log overflow by incrementing a counter and tracking
> the time of the first and last overflow event seen.
>
> Software queries events via the Get Event Record mailbox command; CXL
> v3.0 section 8.2.9.2.2.
>
> Issue the Get Event Record mailbox command on driver load.  Trace each
> record found, as well as any overflow conditions.  Only 1 event is
> requested for each query.  Optimization of multiple record queries is
> deferred.
>
> This patch traces a raw event record only and leaves the specific event
> record types to subsequent patches.
>
> NOTE: checkpatch is not completely happy with the tracing part of this
> patch but AFAICT it is correct.  I'm open to suggestions if I've done
> something wrong.
>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> ---
>   MAINTAINERS                       |   1 +
>   drivers/cxl/core/mbox.c           |  60 ++++++++++++++
>   drivers/cxl/cxlmem.h              |  66 ++++++++++++++++
>   include/trace/events/cxl-events.h | 127 ++++++++++++++++++++++++++++++
>   include/uapi/linux/cxl_mem.h      |   1 +
>   5 files changed, 255 insertions(+)
>   create mode 100644 include/trace/events/cxl-events.h
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 54fa6e2059de..1cb9cec31009 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -5014,6 +5014,7 @@ M:	Dan Williams <dan.j.williams@intel.com>
>   L:	linux-cxl@vger.kernel.org
>   S:	Maintained
>   F:	drivers/cxl/
> +F:	include/trace/events/cxl*.h
>   F:	include/uapi/linux/cxl_mem.h
>   
>   CONEXANT ACCESSRUNNER USB DRIVER
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 16176b9278b4..2cceed8608dc 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -7,6 +7,9 @@
>   #include <cxlmem.h>
>   #include <cxl.h>
>   
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/cxl-events.h>
> +
>   #include "core.h"
>   
>   static bool cxl_raw_allow_all;
> @@ -48,6 +51,7 @@ static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = {
>   	CXL_CMD(RAW, CXL_VARIABLE_PAYLOAD, CXL_VARIABLE_PAYLOAD, 0),
>   #endif
>   	CXL_CMD(GET_SUPPORTED_LOGS, 0, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE),
> +	CXL_CMD(GET_EVENT_RECORD, 1, CXL_VARIABLE_PAYLOAD, 0),
>   	CXL_CMD(GET_FW_INFO, 0, 0x50, 0),
>   	CXL_CMD(GET_PARTITION_INFO, 0, 0x20, 0),
>   	CXL_CMD(GET_LSA, 0x8, CXL_VARIABLE_PAYLOAD, 0),
> @@ -704,6 +708,62 @@ int cxl_enumerate_cmds(struct cxl_dev_state *cxlds)
>   }
>   EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
>   
> +static int cxl_mem_get_records_log(struct cxl_dev_state *cxlds,
> +				   enum cxl_event_log_type type)
> +{
> +	struct cxl_get_event_payload payload;
> +
> +	do {
> +		u8 log_type = type;
> +		u16 record_count;
> +		int rc;
> +
> +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_EVENT_RECORD,
> +				       &log_type, sizeof(log_type),
> +				       &payload, sizeof(payload));
> +		if (rc)
> +			return rc;
> +
> +		record_count = le16_to_cpu(payload.record_count);
> +		if (record_count > 0)
> +			trace_cxl_event(dev_name(cxlds->dev), type,
> +					&payload.record);
> +
> +		if (payload.flags & CXL_GET_EVENT_FLAG_OVERFLOW)
> +			trace_cxl_event_overflow(dev_name(cxlds->dev), type,
> +						 &payload);
> +
> +	} while (payload.flags & CXL_GET_EVENT_FLAG_MORE_RECORDS);
> +
> +	return 0;
> +}
> +
> +/**
> + * cxl_mem_get_event_records - Get Event Records from the device
> + * @cxlds: The device data for the operation
> + *
> + * Retrieve all event records available on the device and report them as trace
> + * events.
> + *
> + * See CXL v3.0 @8.2.9.2.2 Get Event Records
> + */
> +void cxl_mem_get_event_records(struct cxl_dev_state *cxlds)
> +{
> +	struct device *dev = cxlds->dev;
> +	enum cxl_event_log_type log_type;
> +
> +	for (log_type = CXL_EVENT_TYPE_INFO;
> +	     log_type < CXL_EVENT_TYPE_MAX; log_type++) {
> +		int rc;
> +
> +		rc = cxl_mem_get_records_log(cxlds, log_type);
> +		if (rc)
> +			dev_err(dev, "Failed to query %s Event Logs : %d",
> +				cxl_event_log_type_str(log_type), rc);
> +	}
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_event_records, CXL);
> +
>   /**
>    * cxl_mem_get_partition_info - Get partition info
>    * @cxlds: The device data for the operation
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 88e3a8e54b6a..f83634f3bc8d 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -4,6 +4,7 @@
>   #define __CXL_MEM_H__
>   #include <uapi/linux/cxl_mem.h>
>   #include <linux/cdev.h>
> +#include <linux/uuid.h>
>   #include "cxl.h"
>   
>   /* CXL 2.0 8.2.8.5.1.1 Memory Device Status Register */
> @@ -253,6 +254,7 @@ struct cxl_dev_state {
>   enum cxl_opcode {
>   	CXL_MBOX_OP_INVALID		= 0x0000,
>   	CXL_MBOX_OP_RAW			= CXL_MBOX_OP_INVALID,
> +	CXL_MBOX_OP_GET_EVENT_RECORD	= 0x0100,
>   	CXL_MBOX_OP_GET_FW_INFO		= 0x0200,
>   	CXL_MBOX_OP_ACTIVATE_FW		= 0x0202,
>   	CXL_MBOX_OP_GET_SUPPORTED_LOGS	= 0x0400,
> @@ -322,6 +324,69 @@ struct cxl_mbox_identify {
>   	u8 qos_telemetry_caps;
>   } __packed;
>   
> +/*
> + * Common Event Record Format
> + * CXL v3.0 section 8.2.9.2.1; Table 8-42
> + */
> +struct cxl_event_record_hdr {
> +	uuid_t id;
> +	__le32 flags_length;
> +	__le16 handle;
> +	__le16 related_handle;
> +	__le64 timestamp;
> +	__le64 reserved1;
> +	__le64 reserved2;
> +} __packed;
> +
> +#define EVENT_RECORD_DATA_LENGTH 0x50
> +struct cxl_event_record_raw {
> +	struct cxl_event_record_hdr hdr;
> +	u8 data[EVENT_RECORD_DATA_LENGTH];
> +} __packed;
> +
> +/*
> + * Get Event Records output payload
> + * CXL v3.0 section 8.2.9.2.2; Table 8-50
> + *
> + * Space given for 1 record
> + */
> +#define CXL_GET_EVENT_FLAG_OVERFLOW		BIT(0)
> +#define CXL_GET_EVENT_FLAG_MORE_RECORDS	BIT(1)
> +struct cxl_get_event_payload {
> +	u8 flags;
> +	u8 reserved1;
> +	__le16 overflow_err_count;
> +	__le64 first_overflow_timestamp;
> +	__le64 last_overflow_timestamp;
> +	__le16 record_count;
> +	u8 reserved2[0xa];
> +	struct cxl_event_record_raw record;
> +} __packed;
> +
> +enum cxl_event_log_type {
> +	CXL_EVENT_TYPE_INFO = 0x00,
> +	CXL_EVENT_TYPE_WARN,
> +	CXL_EVENT_TYPE_FAIL,
> +	CXL_EVENT_TYPE_FATAL,
> +	CXL_EVENT_TYPE_MAX
> +};
> +static inline char *cxl_event_log_type_str(enum cxl_event_log_type type)
> +{
> +	switch (type) {
> +	case CXL_EVENT_TYPE_INFO:
> +		return "Informational";
> +	case CXL_EVENT_TYPE_WARN:
> +		return "Warning";
> +	case CXL_EVENT_TYPE_FAIL:
> +		return "Failure";
> +	case CXL_EVENT_TYPE_FATAL:
> +		return "Fatal";
> +	default:
> +		break;
> +	}
> +	return "<unknown>";
> +}
> +
>   struct cxl_mbox_get_partition_info {
>   	__le64 active_volatile_cap;
>   	__le64 active_persistent_cap;
> @@ -381,6 +446,7 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
>   struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
>   void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
>   void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
> +void cxl_mem_get_event_records(struct cxl_dev_state *cxlds);
>   #ifdef CONFIG_CXL_SUSPEND
>   void cxl_mem_active_inc(void);
>   void cxl_mem_active_dec(void);
> diff --git a/include/trace/events/cxl-events.h b/include/trace/events/cxl-events.h
> new file mode 100644
> index 000000000000..f4baeae66cf3
> --- /dev/null
> +++ b/include/trace/events/cxl-events.h
> @@ -0,0 +1,127 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM cxl_events
> +
> +#if !defined(_CXL_TRACE_EVENTS_H) ||  defined(TRACE_HEADER_MULTI_READ)
> +#define _CXL_TRACE_EVENTS_H
> +
> +#include <linux/tracepoint.h>
> +
> +#define EVENT_LOGS					\
> +	EM(CXL_EVENT_TYPE_INFO,		"Info")		\
> +	EM(CXL_EVENT_TYPE_WARN,		"Warning")	\
> +	EM(CXL_EVENT_TYPE_FAIL,		"Failure")	\
> +	EM(CXL_EVENT_TYPE_FATAL,	"Fatal")	\
> +	EMe(CXL_EVENT_TYPE_MAX,		"<undefined>")
> +
> +/*
> + * First define the enums in the above macros to be exported to userspace via
> + * TRACE_DEFINE_ENUM().
> + */
> +#undef EM
> +#undef EMe
> +#define EM(a, b)	TRACE_DEFINE_ENUM(a);
> +#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
> +
> +EVENT_LOGS
> +#define show_log_type(type) __print_symbolic(type, EVENT_LOGS)
> +
> +/*
> + * Now redefine the EM and EMe macros to map the enums to the strings that will
> + * be printed in the output
> + */
> +#undef EM
> +#undef EMe
> +#define EM(a, b)        {a, b},
> +#define EMe(a, b)       {a, b}
> +
> +TRACE_EVENT(cxl_event_overflow,

Kind of a general comment for the event names. Maybe just "overflow" 
instead of "cxl_event_overflow" since it shows up in sysfs under the 
cxl_events directory and becomes redundant?

- Dave

> +
> +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> +		 struct cxl_get_event_payload *payload),
> +
> +	TP_ARGS(dev_name, log, payload),
> +
> +	TP_STRUCT__entry(
> +		__string(dev_name, dev_name)
> +		__field(int, log)
> +		__field(u16, count)
> +		__field(u64, first)
> +		__field(u64, last)
> +	),
> +
> +	TP_fast_assign(
> +		__assign_str(dev_name, dev_name);
> +		__entry->log = log;
> +		__entry->count = le16_to_cpu(payload->overflow_err_count);
> +		__entry->first = le64_to_cpu(payload->first_overflow_timestamp);
> +		__entry->last = le64_to_cpu(payload->last_overflow_timestamp);
> +	),
> +
> +	TP_printk("%s: EVENT LOG %s OVERFLOW %u records from %llu to %llu",
> +		__get_str(dev_name), show_log_type(__entry->log),
> +		__entry->count, __entry->first, __entry->last)
> +
> +);
> +
> +/*
> + * Common Event Record Format
> + * CXL v2.0 section 8.2.9.1.1; Table 153
> + */
> +#define CXL_EVENT_RECORD_FLAG_PERMANENT		BIT(2)
> +#define CXL_EVENT_RECORD_FLAG_MAINT_NEEDED	BIT(3)
> +#define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED	BIT(4)
> +#define CXL_EVENT_RECORD_FLAG_HW_REPLACE	BIT(5)
> +#define show_hdr_flags(flags)	__print_flags(flags, " | ",			   \
> +	{ CXL_EVENT_RECORD_FLAG_PERMANENT,	"Permanent Condition"		}, \
> +	{ CXL_EVENT_RECORD_FLAG_MAINT_NEEDED,	"Maintanance Needed"		}, \
> +	{ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,	"Performance Degraded"		}, \
> +	{ CXL_EVENT_RECORD_FLAG_HW_REPLACE,	"Hardware Replacement Needed"	}  \
> +)
> +
> +TRACE_EVENT(cxl_event,
> +
> +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> +		 struct cxl_event_record_raw *rec),
> +
> +	TP_ARGS(dev_name, log, rec),
> +
> +	TP_STRUCT__entry(
> +		__string(dev_name, dev_name)
> +		__field(int, log)
> +		__array(u8, id, UUID_SIZE)
> +		__field(u32, flags)
> +		__field(u16, handle)
> +		__field(u16, related_handle)
> +		__field(u64, timestamp)
> +		__array(u8, data, EVENT_RECORD_DATA_LENGTH)
> +		__field(u8, length)
> +	),
> +
> +	TP_fast_assign(
> +		__assign_str(dev_name, dev_name);
> +		memcpy(__entry->id, &rec->hdr.id, UUID_SIZE);
> +		__entry->log = log;
> +		__entry->flags = le32_to_cpu(rec->hdr.flags_length) >> 8;
> +		__entry->length = le32_to_cpu(rec->hdr.flags_length) & 0xFF;
> +		__entry->handle = le16_to_cpu(rec->hdr.handle);
> +		__entry->related_handle = le16_to_cpu(rec->hdr.related_handle);
> +		__entry->timestamp = le64_to_cpu(rec->hdr.timestamp);
> +		memcpy(__entry->data, &rec->data, EVENT_RECORD_DATA_LENGTH);
> +	),
> +
> +	TP_printk("%s: %s time=%llu id=%pUl handle=%x related_handle=%x hdr_flags='%s' " \
> +		  ": %s",
> +		__get_str(dev_name), show_log_type(__entry->log),
> +		__entry->timestamp, __entry->id, __entry->handle,
> +		__entry->related_handle, show_hdr_flags(__entry->flags),
> +		__print_hex(__entry->data, EVENT_RECORD_DATA_LENGTH)
> +		)
> +);
> +
> +#endif /* _CXL_TRACE_EVENTS_H */
> +
> +/* This part must be outside protection */
> +#undef TRACE_INCLUDE_FILE
> +#define TRACE_INCLUDE_FILE cxl-events
> +#include <trace/define_trace.h>
> diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h
> index c71021a2a9ed..70459be5bdd4 100644
> --- a/include/uapi/linux/cxl_mem.h
> +++ b/include/uapi/linux/cxl_mem.h
> @@ -24,6 +24,7 @@
>   	___C(IDENTIFY, "Identify Command"),                               \
>   	___C(RAW, "Raw device command"),                                  \
>   	___C(GET_SUPPORTED_LOGS, "Get Supported Logs"),                   \
> +	___C(GET_EVENT_RECORD, "Get Event Record"),                       \
>   	___C(GET_FW_INFO, "Get FW Info"),                                 \
>   	___C(GET_PARTITION_INFO, "Get Partition Information"),            \
>   	___C(GET_LSA, "Get Label Storage Area"),                          \
Jonathan Cameron Aug. 24, 2022, 3:50 p.m. UTC | #6
On Fri, 12 Aug 2022 22:32:35 -0700
ira.weiny@intel.com wrote:

> From: Ira Weiny <ira.weiny@intel.com>
> 
> Event records are defined for CXL devices.  Each record is reported in
> one event log.  Devices are required to support the storage of at least
> one event record in each event log type.
Hi Ira,

Someone went and slipped in a new field in CXL r3.0.  Might be easier
just to add it now?

A few other comments inline.

> 
> Devices track event log overflow by incrementing a counter and tracking
> the time of the first and last overflow event seen.
> 
> Software queries events via the Get Event Record mailbox command; CXL
> v3.0 section 8.2.9.2.2.
rev3.0

You reference 3.0 but use 2.0 definitions below (I'm guessing this crossed
with spec release).

> 
> Issue the Get Event Record mailbox command on driver load.  Trace each
> record found, as well as any overflow conditions.  Only 1 event is
> requested for each query.  Optimization of multiple record queries is
> deferred.
I'd be tempted to make it easier by using a variable sized fail element and
an allocation, but fair enough that can come later.

> 
> This patch traces a raw event record only and leaves the specific event
> record types to subsequent patches.
> 
> NOTE: checkpatch is not completely happy with the tracing part of this
> patch but AFAICT it is correct.  I'm open to suggestions if I've done
> something wrong.
> 
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>


> ---
>  MAINTAINERS                       |   1 +
>  drivers/cxl/core/mbox.c           |  60 ++++++++++++++
>  drivers/cxl/cxlmem.h              |  66 ++++++++++++++++
>  include/trace/events/cxl-events.h | 127 ++++++++++++++++++++++++++++++
>  include/uapi/linux/cxl_mem.h      |   1 +
>  5 files changed, 255 insertions(+)
>  create mode 100644 include/trace/events/cxl-events.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 54fa6e2059de..1cb9cec31009 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -5014,6 +5014,7 @@ M:	Dan Williams <dan.j.williams@intel.com>
>  L:	linux-cxl@vger.kernel.org
>  S:	Maintained
>  F:	drivers/cxl/
> +F:	include/trace/events/cxl*.h
>  F:	include/uapi/linux/cxl_mem.h
>  
>  CONEXANT ACCESSRUNNER USB DRIVER
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 16176b9278b4..2cceed8608dc 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -7,6 +7,9 @@
>  #include <cxlmem.h>
>  #include <cxl.h>
>  
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/cxl-events.h>
> +
>  #include "core.h"
>  
>  static bool cxl_raw_allow_all;
> @@ -48,6 +51,7 @@ static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = {
>  	CXL_CMD(RAW, CXL_VARIABLE_PAYLOAD, CXL_VARIABLE_PAYLOAD, 0),
>  #endif
>  	CXL_CMD(GET_SUPPORTED_LOGS, 0, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE),
> +	CXL_CMD(GET_EVENT_RECORD, 1, CXL_VARIABLE_PAYLOAD, 0),
>  	CXL_CMD(GET_FW_INFO, 0, 0x50, 0),
>  	CXL_CMD(GET_PARTITION_INFO, 0, 0x20, 0),
>  	CXL_CMD(GET_LSA, 0x8, CXL_VARIABLE_PAYLOAD, 0),
> @@ -704,6 +708,62 @@ int cxl_enumerate_cmds(struct cxl_dev_state *cxlds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
>  
> +static int cxl_mem_get_records_log(struct cxl_dev_state *cxlds,
> +				   enum cxl_event_log_type type)
> +{
> +	struct cxl_get_event_payload payload;
> +
> +	do {
> +		u8 log_type = type;
> +		u16 record_count;
> +		int rc;
> +
> +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_EVENT_RECORD,
> +				       &log_type, sizeof(log_type),
> +				       &payload, sizeof(payload));
> +		if (rc)
> +			return rc;
> +
> +		record_count = le16_to_cpu(payload.record_count);
> +		if (record_count > 0)

If it is anything other than 1 you have a problem..  So fornow
I would check for that.

> +			trace_cxl_event(dev_name(cxlds->dev), type,
> +					&payload.record);
> +
> +		if (payload.flags & CXL_GET_EVENT_FLAG_OVERFLOW)
> +			trace_cxl_event_overflow(dev_name(cxlds->dev), type,
> +						 &payload);
> +
> +	} while (payload.flags & CXL_GET_EVENT_FLAG_MORE_RECORDS);
> +
> +	return 0;
> +}
> +
> +/**
> + * cxl_mem_get_event_records - Get Event Records from the device
> + * @cxlds: The device data for the operation
> + *
> + * Retrieve all event records available on the device and report them as trace
> + * events.
> + *
> + * See CXL v3.0 @8.2.9.2.2 Get Event Records
> + */
> +void cxl_mem_get_event_records(struct cxl_dev_state *cxlds)
> +{
> +	struct device *dev = cxlds->dev;
> +	enum cxl_event_log_type log_type;
> +
> +	for (log_type = CXL_EVENT_TYPE_INFO;
> +	     log_type < CXL_EVENT_TYPE_MAX; log_type++) {
> +		int rc;
> +
> +		rc = cxl_mem_get_records_log(cxlds, log_type);
> +		if (rc)
> +			dev_err(dev, "Failed to query %s Event Logs : %d",
> +				cxl_event_log_type_str(log_type), rc);
> +	}
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_event_records, CXL);
> +
>  /**
>   * cxl_mem_get_partition_info - Get partition info
>   * @cxlds: The device data for the operation
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 88e3a8e54b6a..f83634f3bc8d 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -4,6 +4,7 @@
>  #define __CXL_MEM_H__
>  #include <uapi/linux/cxl_mem.h>
>  #include <linux/cdev.h>
> +#include <linux/uuid.h>
>  #include "cxl.h"
>  
>  /* CXL 2.0 8.2.8.5.1.1 Memory Device Status Register */
> @@ -253,6 +254,7 @@ struct cxl_dev_state {
>  enum cxl_opcode {
>  	CXL_MBOX_OP_INVALID		= 0x0000,
>  	CXL_MBOX_OP_RAW			= CXL_MBOX_OP_INVALID,
> +	CXL_MBOX_OP_GET_EVENT_RECORD	= 0x0100,
>  	CXL_MBOX_OP_GET_FW_INFO		= 0x0200,
>  	CXL_MBOX_OP_ACTIVATE_FW		= 0x0202,
>  	CXL_MBOX_OP_GET_SUPPORTED_LOGS	= 0x0400,
> @@ -322,6 +324,69 @@ struct cxl_mbox_identify {
>  	u8 qos_telemetry_caps;
>  } __packed;
>  
> +/*
> + * Common Event Record Format
> + * CXL v3.0 section 8.2.9.2.1; Table 8-42
> + */
> +struct cxl_event_record_hdr {
> +	uuid_t id;
> +	__le32 flags_length;

Can you split this into a u8 and
u8[3] then use the get_unaligned_le24 accessor
where appropriate? Oh for 24bit types ;)

> +	__le16 handle;
> +	__le16 related_handle;
> +	__le64 timestamp;
> +	__le64 reserved1;

As below. Maintenance op from CXL 3.0?  Seems easy
to add now rather than needing a change later.

> +	__le64 reserved2;
> +} __packed;
> +
> +#define EVENT_RECORD_DATA_LENGTH 0x50
> +struct cxl_event_record_raw {
> +	struct cxl_event_record_hdr hdr;
> +	u8 data[EVENT_RECORD_DATA_LENGTH];
> +} __packed;
> +
> +/*
> + * Get Event Records output payload
> + * CXL v3.0 section 8.2.9.2.2; Table 8-50

r3.0 :) (just drop the v and go with 3.0 would be my preference).

> + *
> + * Space given for 1 record
> + */
> +#define CXL_GET_EVENT_FLAG_OVERFLOW		BIT(0)
> +#define CXL_GET_EVENT_FLAG_MORE_RECORDS	BIT(1)
> +struct cxl_get_event_payload {
> +	u8 flags;
> +	u8 reserved1;
> +	__le16 overflow_err_count;
> +	__le64 first_overflow_timestamp;
> +	__le64 last_overflow_timestamp;
> +	__le16 record_count;
> +	u8 reserved2[0xa];
> +	struct cxl_event_record_raw record;
> +} __packed;
> +
> +enum cxl_event_log_type {
> +	CXL_EVENT_TYPE_INFO = 0x00,
> +	CXL_EVENT_TYPE_WARN,
> +	CXL_EVENT_TYPE_FAIL,
> +	CXL_EVENT_TYPE_FATAL,

Worth putting Dynamic capacity in now? Up to you.

> +	CXL_EVENT_TYPE_MAX
> +};

Blank line for readability.

> +static inline char *cxl_event_log_type_str(enum cxl_event_log_type type)
> +{
> +	switch (type) {
> +	case CXL_EVENT_TYPE_INFO:
> +		return "Informational";
> +	case CXL_EVENT_TYPE_WARN:
> +		return "Warning";
> +	case CXL_EVENT_TYPE_FAIL:
> +		return "Failure";
> +	case CXL_EVENT_TYPE_FATAL:
> +		return "Fatal";
> +	default:
> +		break;
> +	}
> +	return "<unknown>";
> +}
> +
>  struct cxl_mbox_get_partition_info {
>  	__le64 active_volatile_cap;
>  	__le64 active_persistent_cap;
> @@ -381,6 +446,7 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
>  struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
>  void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
>  void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
> +void cxl_mem_get_event_records(struct cxl_dev_state *cxlds);
>  #ifdef CONFIG_CXL_SUSPEND
>  void cxl_mem_active_inc(void);
>  void cxl_mem_active_dec(void);
> diff --git a/include/trace/events/cxl-events.h b/include/trace/events/cxl-events.h
> new file mode 100644
> index 000000000000..f4baeae66cf3
> --- /dev/null
> +++ b/include/trace/events/cxl-events.h
> @@ -0,0 +1,127 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM cxl_events
> +
> +#if !defined(_CXL_TRACE_EVENTS_H) ||  defined(TRACE_HEADER_MULTI_READ)
> +#define _CXL_TRACE_EVENTS_H
> +
> +#include <linux/tracepoint.h>
> +
> +#define EVENT_LOGS					\
> +	EM(CXL_EVENT_TYPE_INFO,		"Info")		\
> +	EM(CXL_EVENT_TYPE_WARN,		"Warning")	\
> +	EM(CXL_EVENT_TYPE_FAIL,		"Failure")	\
> +	EM(CXL_EVENT_TYPE_FATAL,	"Fatal")	\
> +	EMe(CXL_EVENT_TYPE_MAX,		"<undefined>")

Hmm. 4 is defined in CXL 3.0, but I'd assume we won't use tracepoints for
dynamic capacity events so I guess it doesn't matter.

> +
> +/*
> + * First define the enums in the above macros to be exported to userspace via
> + * TRACE_DEFINE_ENUM().
> + */
> +#undef EM
> +#undef EMe
> +#define EM(a, b)	TRACE_DEFINE_ENUM(a);
> +#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
> +
> +EVENT_LOGS
> +#define show_log_type(type) __print_symbolic(type, EVENT_LOGS)
> +
> +/*
> + * Now redefine the EM and EMe macros to map the enums to the strings that will
> + * be printed in the output
> + */
> +#undef EM
> +#undef EMe
> +#define EM(a, b)        {a, b},
> +#define EMe(a, b)       {a, b}
> +
> +TRACE_EVENT(cxl_event_overflow,
> +
> +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> +		 struct cxl_get_event_payload *payload),
> +
> +	TP_ARGS(dev_name, log, payload),
> +
> +	TP_STRUCT__entry(
> +		__string(dev_name, dev_name)
> +		__field(int, log)
> +		__field(u16, count)
> +		__field(u64, first)
> +		__field(u64, last)
> +	),
> +
> +	TP_fast_assign(
> +		__assign_str(dev_name, dev_name);
> +		__entry->log = log;
> +		__entry->count = le16_to_cpu(payload->overflow_err_count);
> +		__entry->first = le64_to_cpu(payload->first_overflow_timestamp);
> +		__entry->last = le64_to_cpu(payload->last_overflow_timestamp);
> +	),
> +
> +	TP_printk("%s: EVENT LOG %s OVERFLOW %u records from %llu to %llu",
> +		__get_str(dev_name), show_log_type(__entry->log),
> +		__entry->count, __entry->first, __entry->last)
> +
> +);
> +
> +/*
> + * Common Event Record Format
> + * CXL v2.0 section 8.2.9.1.1; Table 153
> + */
> +#define CXL_EVENT_RECORD_FLAG_PERMANENT		BIT(2)
> +#define CXL_EVENT_RECORD_FLAG_MAINT_NEEDED	BIT(3)
> +#define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED	BIT(4)
> +#define CXL_EVENT_RECORD_FLAG_HW_REPLACE	BIT(5)
> +#define show_hdr_flags(flags)	__print_flags(flags, " | ",			   \
> +	{ CXL_EVENT_RECORD_FLAG_PERMANENT,	"Permanent Condition"		}, \
> +	{ CXL_EVENT_RECORD_FLAG_MAINT_NEEDED,	"Maintanance Needed"		}, \

Maintenance

> +	{ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,	"Performance Degraded"		}, \
> +	{ CXL_EVENT_RECORD_FLAG_HW_REPLACE,	"Hardware Replacement Needed"	}  \
> +)
> +
> +TRACE_EVENT(cxl_event,
> +
> +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> +		 struct cxl_event_record_raw *rec),
> +
> +	TP_ARGS(dev_name, log, rec),
> +
> +	TP_STRUCT__entry(
> +		__string(dev_name, dev_name)
> +		__field(int, log)
> +		__array(u8, id, UUID_SIZE)
> +		__field(u32, flags)
> +		__field(u16, handle)
> +		__field(u16, related_handle)
> +		__field(u64, timestamp)
> +		__array(u8, data, EVENT_RECORD_DATA_LENGTH)
> +		__field(u8, length)

Do we want the maintenance operation class added in Table 8-42 from CXL 3.0?
(only noticed because I happen to have that spec revision open rather than 2.0).

> +	),
> +
> +	TP_fast_assign(
> +		__assign_str(dev_name, dev_name);
> +		memcpy(__entry->id, &rec->hdr.id, UUID_SIZE);
> +		__entry->log = log;
> +		__entry->flags = le32_to_cpu(rec->hdr.flags_length) >> 8;
> +		__entry->length = le32_to_cpu(rec->hdr.flags_length) & 0xFF;
> +		__entry->handle = le16_to_cpu(rec->hdr.handle);
> +		__entry->related_handle = le16_to_cpu(rec->hdr.related_handle);
> +		__entry->timestamp = le64_to_cpu(rec->hdr.timestamp);
> +		memcpy(__entry->data, &rec->data, EVENT_RECORD_DATA_LENGTH);
> +	),
> +
> +	TP_printk("%s: %s time=%llu id=%pUl handle=%x related_handle=%x hdr_flags='%s' " \
> +		  ": %s",
> +		__get_str(dev_name), show_log_type(__entry->log),
> +		__entry->timestamp, __entry->id, __entry->handle,
> +		__entry->related_handle, show_hdr_flags(__entry->flags),
> +		__print_hex(__entry->data, EVENT_RECORD_DATA_LENGTH)
> +		)
> +);
> +
> +#endif /* _CXL_TRACE_EVENTS_H */
> +
> +/* This part must be outside protection */
> +#undef TRACE_INCLUDE_FILE
> +#define TRACE_INCLUDE_FILE cxl-events
> +#include <trace/define_trace.h>
Ira Weiny Sept. 7, 2022, 4:28 a.m. UTC | #7
On Wed, Aug 24, 2022 at 04:50:58PM +0100, Jonathan Cameron wrote:
> On Fri, 12 Aug 2022 22:32:35 -0700
> ira.weiny@intel.com wrote:
> 
> > From: Ira Weiny <ira.weiny@intel.com>
> > 
> > Event records are defined for CXL devices.  Each record is reported in
> > one event log.  Devices are required to support the storage of at least
> > one event record in each event log type.
> Hi Ira,
> 
> Someone went and slipped in a new field in CXL r3.0.  Might be easier
> just to add it now?

Yea I did not notice the difference below.  I thought 3.0 only added records in
this area which I was going to leave to additional patches.  Thanks for
catching this.

> 
> A few other comments inline.
> 
> > 
> > Devices track event log overflow by incrementing a counter and tracking
> > the time of the first and last overflow event seen.
> > 
> > Software queries events via the Get Event Record mailbox command; CXL
> > v3.0 section 8.2.9.2.2.
> rev3.0
> 
> You reference 3.0 but use 2.0 definitions below (I'm guessing this crossed
> with spec release).

It did.  :-/

> 
> > 
> > Issue the Get Event Record mailbox command on driver load.  Trace each
> > record found, as well as any overflow conditions.  Only 1 event is
> > requested for each query.  Optimization of multiple record queries is
> > deferred.
> I'd be tempted to make it easier by using a variable sized fail element and
> an allocation, but fair enough that can come later.

Yea I don't see this as a performant path.  So I'm struggling to justify the
additional complexity of reading more than 1 event at a time.  It is not super
difficult of course but I think my time is better spent elsewhere.

> 
> > 
> > This patch traces a raw event record only and leaves the specific event
> > record types to subsequent patches.
> > 
> > NOTE: checkpatch is not completely happy with the tracing part of this
> > patch but AFAICT it is correct.  I'm open to suggestions if I've done
> > something wrong.
> > 
> > Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> 
> 
> > ---
> >  MAINTAINERS                       |   1 +
> >  drivers/cxl/core/mbox.c           |  60 ++++++++++++++
> >  drivers/cxl/cxlmem.h              |  66 ++++++++++++++++
> >  include/trace/events/cxl-events.h | 127 ++++++++++++++++++++++++++++++
> >  include/uapi/linux/cxl_mem.h      |   1 +
> >  5 files changed, 255 insertions(+)
> >  create mode 100644 include/trace/events/cxl-events.h
> > 
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index 54fa6e2059de..1cb9cec31009 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -5014,6 +5014,7 @@ M:	Dan Williams <dan.j.williams@intel.com>
> >  L:	linux-cxl@vger.kernel.org
> >  S:	Maintained
> >  F:	drivers/cxl/
> > +F:	include/trace/events/cxl*.h
> >  F:	include/uapi/linux/cxl_mem.h
> >  
> >  CONEXANT ACCESSRUNNER USB DRIVER
> > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> > index 16176b9278b4..2cceed8608dc 100644
> > --- a/drivers/cxl/core/mbox.c
> > +++ b/drivers/cxl/core/mbox.c
> > @@ -7,6 +7,9 @@
> >  #include <cxlmem.h>
> >  #include <cxl.h>
> >  
> > +#define CREATE_TRACE_POINTS
> > +#include <trace/events/cxl-events.h>
> > +
> >  #include "core.h"
> >  
> >  static bool cxl_raw_allow_all;
> > @@ -48,6 +51,7 @@ static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = {
> >  	CXL_CMD(RAW, CXL_VARIABLE_PAYLOAD, CXL_VARIABLE_PAYLOAD, 0),
> >  #endif
> >  	CXL_CMD(GET_SUPPORTED_LOGS, 0, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE),
> > +	CXL_CMD(GET_EVENT_RECORD, 1, CXL_VARIABLE_PAYLOAD, 0),
> >  	CXL_CMD(GET_FW_INFO, 0, 0x50, 0),
> >  	CXL_CMD(GET_PARTITION_INFO, 0, 0x20, 0),
> >  	CXL_CMD(GET_LSA, 0x8, CXL_VARIABLE_PAYLOAD, 0),
> > @@ -704,6 +708,62 @@ int cxl_enumerate_cmds(struct cxl_dev_state *cxlds)
> >  }
> >  EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
> >  
> > +static int cxl_mem_get_records_log(struct cxl_dev_state *cxlds,
> > +				   enum cxl_event_log_type type)
> > +{
> > +	struct cxl_get_event_payload payload;
> > +
> > +	do {
> > +		u8 log_type = type;
> > +		u16 record_count;
> > +		int rc;
> > +
> > +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_EVENT_RECORD,
> > +				       &log_type, sizeof(log_type),
> > +				       &payload, sizeof(payload));
> > +		if (rc)
> > +			return rc;
> > +
> > +		record_count = le16_to_cpu(payload.record_count);
> > +		if (record_count > 0)
> 
> If it is anything other than 1 you have a problem..  So fornow
> I would check for that.

I assume you mean if there are any records at all?  For the next version I've
checked for 1 here but 0 is also valid if there are no records to return.  So
!= 1 is not an error.

(Currently all logs are checked when the event records are queried and some may
be empty.  I don't plan on trying to distinguish the various interrupts.)

> 
> > +			trace_cxl_event(dev_name(cxlds->dev), type,
> > +					&payload.record);
> > +
> > +		if (payload.flags & CXL_GET_EVENT_FLAG_OVERFLOW)
> > +			trace_cxl_event_overflow(dev_name(cxlds->dev), type,
> > +						 &payload);
> > +
> > +	} while (payload.flags & CXL_GET_EVENT_FLAG_MORE_RECORDS);
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * cxl_mem_get_event_records - Get Event Records from the device
> > + * @cxlds: The device data for the operation
> > + *
> > + * Retrieve all event records available on the device and report them as trace
> > + * events.
> > + *
> > + * See CXL v3.0 @8.2.9.2.2 Get Event Records
> > + */
> > +void cxl_mem_get_event_records(struct cxl_dev_state *cxlds)
> > +{
> > +	struct device *dev = cxlds->dev;
> > +	enum cxl_event_log_type log_type;
> > +
> > +	for (log_type = CXL_EVENT_TYPE_INFO;
> > +	     log_type < CXL_EVENT_TYPE_MAX; log_type++) {
> > +		int rc;
> > +
> > +		rc = cxl_mem_get_records_log(cxlds, log_type);
> > +		if (rc)
> > +			dev_err(dev, "Failed to query %s Event Logs : %d",
> > +				cxl_event_log_type_str(log_type), rc);
> > +	}
> > +}
> > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_event_records, CXL);
> > +
> >  /**
> >   * cxl_mem_get_partition_info - Get partition info
> >   * @cxlds: The device data for the operation
> > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > index 88e3a8e54b6a..f83634f3bc8d 100644
> > --- a/drivers/cxl/cxlmem.h
> > +++ b/drivers/cxl/cxlmem.h
> > @@ -4,6 +4,7 @@
> >  #define __CXL_MEM_H__
> >  #include <uapi/linux/cxl_mem.h>
> >  #include <linux/cdev.h>
> > +#include <linux/uuid.h>
> >  #include "cxl.h"
> >  
> >  /* CXL 2.0 8.2.8.5.1.1 Memory Device Status Register */
> > @@ -253,6 +254,7 @@ struct cxl_dev_state {
> >  enum cxl_opcode {
> >  	CXL_MBOX_OP_INVALID		= 0x0000,
> >  	CXL_MBOX_OP_RAW			= CXL_MBOX_OP_INVALID,
> > +	CXL_MBOX_OP_GET_EVENT_RECORD	= 0x0100,
> >  	CXL_MBOX_OP_GET_FW_INFO		= 0x0200,
> >  	CXL_MBOX_OP_ACTIVATE_FW		= 0x0202,
> >  	CXL_MBOX_OP_GET_SUPPORTED_LOGS	= 0x0400,
> > @@ -322,6 +324,69 @@ struct cxl_mbox_identify {
> >  	u8 qos_telemetry_caps;
> >  } __packed;
> >  
> > +/*
> > + * Common Event Record Format
> > + * CXL v3.0 section 8.2.9.2.1; Table 8-42
> > + */
> > +struct cxl_event_record_hdr {
> > +	uuid_t id;
> > +	__le32 flags_length;
> 
> Can you split this into a u8 and
> u8[3] then use the get_unaligned_le24 accessor
> where appropriate? Oh for 24bit types ;)

Sure!  Another function I did not know about.

So the following should be correct ordering?

...
	uuid_t id;
	u8 length;
	u8 flags[3];
	__le16 handle;
...

There are other records which may work better this way too.

> 
> > +	__le16 handle;
> > +	__le16 related_handle;
> > +	__le64 timestamp;
> > +	__le64 reserved1;
> 
> As below. Maintenance op from CXL 3.0?  Seems easy
> to add now rather than needing a change later.

Yes I see it now.  Added.

> 
> > +	__le64 reserved2;
> > +} __packed;
> > +
> > +#define EVENT_RECORD_DATA_LENGTH 0x50
> > +struct cxl_event_record_raw {
> > +	struct cxl_event_record_hdr hdr;
> > +	u8 data[EVENT_RECORD_DATA_LENGTH];
> > +} __packed;
> > +
> > +/*
> > + * Get Event Records output payload
> > + * CXL v3.0 section 8.2.9.2.2; Table 8-50
> 
> r3.0 :) (just drop the v and go with 3.0 would be my preference).

Can do.

> 
> > + *
> > + * Space given for 1 record
> > + */
> > +#define CXL_GET_EVENT_FLAG_OVERFLOW		BIT(0)
> > +#define CXL_GET_EVENT_FLAG_MORE_RECORDS	BIT(1)
> > +struct cxl_get_event_payload {
> > +	u8 flags;
> > +	u8 reserved1;
> > +	__le16 overflow_err_count;
> > +	__le64 first_overflow_timestamp;
> > +	__le64 last_overflow_timestamp;
> > +	__le16 record_count;
> > +	u8 reserved2[0xa];
> > +	struct cxl_event_record_raw record;
> > +} __packed;
> > +
> > +enum cxl_event_log_type {
> > +	CXL_EVENT_TYPE_INFO = 0x00,
> > +	CXL_EVENT_TYPE_WARN,
> > +	CXL_EVENT_TYPE_FAIL,
> > +	CXL_EVENT_TYPE_FATAL,
> 
> Worth putting Dynamic capacity in now? Up to you.

Might as well.

> 
> > +	CXL_EVENT_TYPE_MAX
> > +};
> 
> Blank line for readability.

Done.

> 
> > +static inline char *cxl_event_log_type_str(enum cxl_event_log_type type)
> > +{
> > +	switch (type) {
> > +	case CXL_EVENT_TYPE_INFO:
> > +		return "Informational";
> > +	case CXL_EVENT_TYPE_WARN:
> > +		return "Warning";
> > +	case CXL_EVENT_TYPE_FAIL:
> > +		return "Failure";
> > +	case CXL_EVENT_TYPE_FATAL:
> > +		return "Fatal";
> > +	default:
> > +		break;
> > +	}
> > +	return "<unknown>";
> > +}
> > +
> >  struct cxl_mbox_get_partition_info {
> >  	__le64 active_volatile_cap;
> >  	__le64 active_persistent_cap;
> > @@ -381,6 +446,7 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
> >  struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
> >  void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
> >  void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
> > +void cxl_mem_get_event_records(struct cxl_dev_state *cxlds);
> >  #ifdef CONFIG_CXL_SUSPEND
> >  void cxl_mem_active_inc(void);
> >  void cxl_mem_active_dec(void);
> > diff --git a/include/trace/events/cxl-events.h b/include/trace/events/cxl-events.h
> > new file mode 100644
> > index 000000000000..f4baeae66cf3
> > --- /dev/null
> > +++ b/include/trace/events/cxl-events.h
> > @@ -0,0 +1,127 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#undef TRACE_SYSTEM
> > +#define TRACE_SYSTEM cxl_events
> > +
> > +#if !defined(_CXL_TRACE_EVENTS_H) ||  defined(TRACE_HEADER_MULTI_READ)
> > +#define _CXL_TRACE_EVENTS_H
> > +
> > +#include <linux/tracepoint.h>
> > +
> > +#define EVENT_LOGS					\
> > +	EM(CXL_EVENT_TYPE_INFO,		"Info")		\
> > +	EM(CXL_EVENT_TYPE_WARN,		"Warning")	\
> > +	EM(CXL_EVENT_TYPE_FAIL,		"Failure")	\
> > +	EM(CXL_EVENT_TYPE_FATAL,	"Fatal")	\
> > +	EMe(CXL_EVENT_TYPE_MAX,		"<undefined>")
> 
> Hmm. 4 is defined in CXL 3.0, but I'd assume we won't use tracepoints for
> dynamic capacity events so I guess it doesn't matter.

I'm not sure why you would say that.  I anticipate some user space daemon
requiring these events to set things up.

> 
> > +
> > +/*
> > + * First define the enums in the above macros to be exported to userspace via
> > + * TRACE_DEFINE_ENUM().
> > + */
> > +#undef EM
> > +#undef EMe
> > +#define EM(a, b)	TRACE_DEFINE_ENUM(a);
> > +#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
> > +
> > +EVENT_LOGS
> > +#define show_log_type(type) __print_symbolic(type, EVENT_LOGS)
> > +
> > +/*
> > + * Now redefine the EM and EMe macros to map the enums to the strings that will
> > + * be printed in the output
> > + */
> > +#undef EM
> > +#undef EMe
> > +#define EM(a, b)        {a, b},
> > +#define EMe(a, b)       {a, b}
> > +
> > +TRACE_EVENT(cxl_event_overflow,
> > +
> > +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> > +		 struct cxl_get_event_payload *payload),
> > +
> > +	TP_ARGS(dev_name, log, payload),
> > +
> > +	TP_STRUCT__entry(
> > +		__string(dev_name, dev_name)
> > +		__field(int, log)
> > +		__field(u16, count)
> > +		__field(u64, first)
> > +		__field(u64, last)
> > +	),
> > +
> > +	TP_fast_assign(
> > +		__assign_str(dev_name, dev_name);
> > +		__entry->log = log;
> > +		__entry->count = le16_to_cpu(payload->overflow_err_count);
> > +		__entry->first = le64_to_cpu(payload->first_overflow_timestamp);
> > +		__entry->last = le64_to_cpu(payload->last_overflow_timestamp);
> > +	),
> > +
> > +	TP_printk("%s: EVENT LOG %s OVERFLOW %u records from %llu to %llu",
> > +		__get_str(dev_name), show_log_type(__entry->log),
> > +		__entry->count, __entry->first, __entry->last)
> > +
> > +);
> > +
> > +/*
> > + * Common Event Record Format
> > + * CXL v2.0 section 8.2.9.1.1; Table 153
> > + */
> > +#define CXL_EVENT_RECORD_FLAG_PERMANENT		BIT(2)
> > +#define CXL_EVENT_RECORD_FLAG_MAINT_NEEDED	BIT(3)
> > +#define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED	BIT(4)
> > +#define CXL_EVENT_RECORD_FLAG_HW_REPLACE	BIT(5)
> > +#define show_hdr_flags(flags)	__print_flags(flags, " | ",			   \
> > +	{ CXL_EVENT_RECORD_FLAG_PERMANENT,	"Permanent Condition"		}, \
> > +	{ CXL_EVENT_RECORD_FLAG_MAINT_NEEDED,	"Maintanance Needed"		}, \
> 
> Maintenance

Thanks done.

> 
> > +	{ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,	"Performance Degraded"		}, \
> > +	{ CXL_EVENT_RECORD_FLAG_HW_REPLACE,	"Hardware Replacement Needed"	}  \
> > +)
> > +
> > +TRACE_EVENT(cxl_event,
> > +
> > +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> > +		 struct cxl_event_record_raw *rec),
> > +
> > +	TP_ARGS(dev_name, log, rec),
> > +
> > +	TP_STRUCT__entry(
> > +		__string(dev_name, dev_name)
> > +		__field(int, log)
> > +		__array(u8, id, UUID_SIZE)
> > +		__field(u32, flags)
> > +		__field(u16, handle)
> > +		__field(u16, related_handle)
> > +		__field(u64, timestamp)
> > +		__array(u8, data, EVENT_RECORD_DATA_LENGTH)
> > +		__field(u8, length)
> 
> Do we want the maintenance operation class added in Table 8-42 from CXL 3.0?
> (only noticed because I happen to have that spec revision open rather than 2.0).

Yes done.

There is some discussion with Dan regarding not decoding anything and letting
user space take care of it all.  I think this shows a valid reason Dan
suggested this.

But for now I have added the field.

Thanks for the review,
Ira

> 
> > +	),
> > +
> > +	TP_fast_assign(
> > +		__assign_str(dev_name, dev_name);
> > +		memcpy(__entry->id, &rec->hdr.id, UUID_SIZE);
> > +		__entry->log = log;
> > +		__entry->flags = le32_to_cpu(rec->hdr.flags_length) >> 8;
> > +		__entry->length = le32_to_cpu(rec->hdr.flags_length) & 0xFF;
> > +		__entry->handle = le16_to_cpu(rec->hdr.handle);
> > +		__entry->related_handle = le16_to_cpu(rec->hdr.related_handle);
> > +		__entry->timestamp = le64_to_cpu(rec->hdr.timestamp);
> > +		memcpy(__entry->data, &rec->data, EVENT_RECORD_DATA_LENGTH);
> > +	),
> > +
> > +	TP_printk("%s: %s time=%llu id=%pUl handle=%x related_handle=%x hdr_flags='%s' " \
> > +		  ": %s",
> > +		__get_str(dev_name), show_log_type(__entry->log),
> > +		__entry->timestamp, __entry->id, __entry->handle,
> > +		__entry->related_handle, show_hdr_flags(__entry->flags),
> > +		__print_hex(__entry->data, EVENT_RECORD_DATA_LENGTH)
> > +		)
> > +);
> > +
> > +#endif /* _CXL_TRACE_EVENTS_H */
> > +
> > +/* This part must be outside protection */
> > +#undef TRACE_INCLUDE_FILE
> > +#define TRACE_INCLUDE_FILE cxl-events
> > +#include <trace/define_trace.h>
Ira Weiny Sept. 7, 2022, 4:53 a.m. UTC | #8
On Wed, Aug 17, 2022 at 03:54:15PM -0700, Jiang, Dave wrote:
> 

[snip]

> > +
> > +/*
> > + * Now redefine the EM and EMe macros to map the enums to the strings that will
> > + * be printed in the output
> > + */
> > +#undef EM
> > +#undef EMe
> > +#define EM(a, b)        {a, b},
> > +#define EMe(a, b)       {a, b}
> > +
> > +TRACE_EVENT(cxl_event_overflow,
> 
> Kind of a general comment for the event names. Maybe just "overflow" instead
> of "cxl_event_overflow" since it shows up in sysfs under the cxl_events
> directory and becomes redundant?

It is redundant for sure...  The problem is that trace_overflow() is pretty
generic but I think it will work.

Ira

> 
> - Dave
>
Jonathan Cameron Sept. 8, 2022, 12:52 p.m. UTC | #9
> > > +static int cxl_mem_get_records_log(struct cxl_dev_state *cxlds,
> > > +				   enum cxl_event_log_type type)
> > > +{
> > > +	struct cxl_get_event_payload payload;
> > > +
> > > +	do {
> > > +		u8 log_type = type;
> > > +		u16 record_count;
> > > +		int rc;
> > > +
> > > +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_EVENT_RECORD,
> > > +				       &log_type, sizeof(log_type),
> > > +				       &payload, sizeof(payload));
> > > +		if (rc)
> > > +			return rc;
> > > +
> > > +		record_count = le16_to_cpu(payload.record_count);
> > > +		if (record_count > 0)  
> > 
> > If it is anything other than 1 you have a problem..  So fornow
> > I would check for that.  
> 
> I assume you mean if there are any records at all?  For the next version I've
> checked for 1 here but 0 is also valid if there are no records to return.  So
> != 1 is not an error.

Yes, I must meant if (record_count == 1)
for this case..

> 
> (Currently all logs are checked when the event records are queried and some may
> be empty.  I don't plan on trying to distinguish the various interrupts.)
> 
> >   
> > > +			trace_cxl_event(dev_name(cxlds->dev), type,
> > > +					&payload.record);
> > > +
> > > +		if (payload.flags & CXL_GET_EVENT_FLAG_OVERFLOW)
> > > +			trace_cxl_event_overflow(dev_name(cxlds->dev), type,
> > > +						 &payload);
> > > +
> > > +	} while (payload.flags & CXL_GET_EVENT_FLAG_MORE_RECORDS);
> > > +
> > > +	return 0;
> > > +}
> > > +

> > >   * cxl_mem_get_partition_info - Get partition info
> > >   * @cxlds: The device data for the operation
> > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > > index 88e3a8e54b6a..f83634f3bc8d 100644
> > > --- a/drivers/cxl/cxlmem.h
> > > +++ b/drivers/cxl/cxlmem.h
> > > @@ -4,6 +4,7 @@
> > >  #define __CXL_MEM_H__
> > >  #include <uapi/linux/cxl_mem.h>
> > >  #include <linux/cdev.h>
> > > +#include <linux/uuid.h>
> > >  #include "cxl.h"
> > >  
> > >  /* CXL 2.0 8.2.8.5.1.1 Memory Device Status Register */
> > > @@ -253,6 +254,7 @@ struct cxl_dev_state {
> > >  enum cxl_opcode {
> > >  	CXL_MBOX_OP_INVALID		= 0x0000,
> > >  	CXL_MBOX_OP_RAW			= CXL_MBOX_OP_INVALID,
> > > +	CXL_MBOX_OP_GET_EVENT_RECORD	= 0x0100,
> > >  	CXL_MBOX_OP_GET_FW_INFO		= 0x0200,
> > >  	CXL_MBOX_OP_ACTIVATE_FW		= 0x0202,
> > >  	CXL_MBOX_OP_GET_SUPPORTED_LOGS	= 0x0400,
> > > @@ -322,6 +324,69 @@ struct cxl_mbox_identify {
> > >  	u8 qos_telemetry_caps;
> > >  } __packed;
> > >  
> > > +/*
> > > + * Common Event Record Format
> > > + * CXL v3.0 section 8.2.9.2.1; Table 8-42
> > > + */
> > > +struct cxl_event_record_hdr {
> > > +	uuid_t id;
> > > +	__le32 flags_length;  
> > 
> > Can you split this into a u8 and
> > u8[3] then use the get_unaligned_le24 accessor
> > where appropriate? Oh for 24bit types ;)  
> 
> Sure!  Another function I did not know about.
> 
> So the following should be correct ordering?
> 
> ...
> 	uuid_t id;
> 	u8 length;
> 	u8 flags[3];
> 	__le16 handle;
> ...
> 
Looks good.

> There are other records which may work better this way too.
> 
> >   
> > > +	__le16 handle;
> > > +	__le16 related_handle;
> > > +	__le64 timestamp;
> > > +	__le64 reserved1;  
> > 
> > As below. Maintenance op from CXL 3.0?  Seems easy
> > to add now rather than needing a change later.  
> 
> Yes I see it now.  Added.
> 
> >   
> > > +	__le64 reserved2;
> > > +} __packed;
> > > +
> > > +#define EVENT_RECORD_DATA_LENGTH 0x50
> > > +struct cxl_event_record_raw {
> > > +	struct cxl_event_record_hdr hdr;
> > > +	u8 data[EVENT_RECORD_DATA_LENGTH];
> > > +} __packed;
> > > +
> > > +/*
> > > + * Get Event Records output payload
> > > + * CXL v3.0 section 8.2.9.2.2; Table 8-50  
> > 
> > r3.0 :) (just drop the v and go with 3.0 would be my preference).  
> 
> Can do.
> 
> >   
> > > + *
> > > + * Space given for 1 record
> > > + */
> > > +#define CXL_GET_EVENT_FLAG_OVERFLOW		BIT(0)
> > > +#define CXL_GET_EVENT_FLAG_MORE_RECORDS	BIT(1)
> > > +struct cxl_get_event_payload {
> > > +	u8 flags;
> > > +	u8 reserved1;
> > > +	__le16 overflow_err_count;
> > > +	__le64 first_overflow_timestamp;
> > > +	__le64 last_overflow_timestamp;
> > > +	__le16 record_count;
> > > +	u8 reserved2[0xa];
> > > +	struct cxl_event_record_raw record;
> > > +} __packed;
> > > +
> > > +enum cxl_event_log_type {
> > > +	CXL_EVENT_TYPE_INFO = 0x00,
> > > +	CXL_EVENT_TYPE_WARN,
> > > +	CXL_EVENT_TYPE_FAIL,
> > > +	CXL_EVENT_TYPE_FATAL,  
> > 
> > Worth putting Dynamic capacity in now? Up to you.  
> 
> Might as well.
> 
> >   
> > > +	CXL_EVENT_TYPE_MAX
> > > +};  
> > 
> > Blank line for readability.  
> 
> Done.
> 
> >   
> > > +static inline char *cxl_event_log_type_str(enum cxl_event_log_type type)
> > > +{
> > > +	switch (type) {
> > > +	case CXL_EVENT_TYPE_INFO:
> > > +		return "Informational";
> > > +	case CXL_EVENT_TYPE_WARN:
> > > +		return "Warning";
> > > +	case CXL_EVENT_TYPE_FAIL:
> > > +		return "Failure";
> > > +	case CXL_EVENT_TYPE_FATAL:
> > > +		return "Fatal";
> > > +	default:
> > > +		break;
> > > +	}
> > > +	return "<unknown>";
> > > +}
> > > +
> > >  struct cxl_mbox_get_partition_info {
> > >  	__le64 active_volatile_cap;
> > >  	__le64 active_persistent_cap;
> > > @@ -381,6 +446,7 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
> > >  struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
> > >  void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
> > >  void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
> > > +void cxl_mem_get_event_records(struct cxl_dev_state *cxlds);
> > >  #ifdef CONFIG_CXL_SUSPEND
> > >  void cxl_mem_active_inc(void);
> > >  void cxl_mem_active_dec(void);
> > > diff --git a/include/trace/events/cxl-events.h b/include/trace/events/cxl-events.h
> > > new file mode 100644
> > > index 000000000000..f4baeae66cf3
> > > --- /dev/null
> > > +++ b/include/trace/events/cxl-events.h
> > > @@ -0,0 +1,127 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > +#undef TRACE_SYSTEM
> > > +#define TRACE_SYSTEM cxl_events
> > > +
> > > +#if !defined(_CXL_TRACE_EVENTS_H) ||  defined(TRACE_HEADER_MULTI_READ)
> > > +#define _CXL_TRACE_EVENTS_H
> > > +
> > > +#include <linux/tracepoint.h>
> > > +
> > > +#define EVENT_LOGS					\
> > > +	EM(CXL_EVENT_TYPE_INFO,		"Info")		\
> > > +	EM(CXL_EVENT_TYPE_WARN,		"Warning")	\
> > > +	EM(CXL_EVENT_TYPE_FAIL,		"Failure")	\
> > > +	EM(CXL_EVENT_TYPE_FATAL,	"Fatal")	\
> > > +	EMe(CXL_EVENT_TYPE_MAX,		"<undefined>")  
> > 
> > Hmm. 4 is defined in CXL 3.0, but I'd assume we won't use tracepoints for
> > dynamic capacity events so I guess it doesn't matter.  
> 
> I'm not sure why you would say that.  I anticipate some user space daemon
> requiring these events to set things up.

Certainly a possible solution. I'd kind of expect a more hand shake based approach
than a tracepoint.  Guess we'll see :)


> >   
> > > +	{ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,	"Performance Degraded"		}, \
> > > +	{ CXL_EVENT_RECORD_FLAG_HW_REPLACE,	"Hardware Replacement Needed"	}  \
> > > +)
> > > +
> > > +TRACE_EVENT(cxl_event,
> > > +
> > > +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> > > +		 struct cxl_event_record_raw *rec),
> > > +
> > > +	TP_ARGS(dev_name, log, rec),
> > > +
> > > +	TP_STRUCT__entry(
> > > +		__string(dev_name, dev_name)
> > > +		__field(int, log)
> > > +		__array(u8, id, UUID_SIZE)
> > > +		__field(u32, flags)
> > > +		__field(u16, handle)
> > > +		__field(u16, related_handle)
> > > +		__field(u64, timestamp)
> > > +		__array(u8, data, EVENT_RECORD_DATA_LENGTH)
> > > +		__field(u8, length)  
> > 
> > Do we want the maintenance operation class added in Table 8-42 from CXL 3.0?
> > (only noticed because I happen to have that spec revision open rather than 2.0).  
> 
> Yes done.
> 
> There is some discussion with Dan regarding not decoding anything and letting
> user space take care of it all.  I think this shows a valid reason Dan
> suggested this.

I like being able to print tracepoints with out userspace tools.
This also enforces structure and stability of interface which I like.

Maybe a raw tracepoint or variable length trailing buffer to pass
on what we don't understand?

Jonathan
Ira Weiny Sept. 9, 2022, 8:53 p.m. UTC | #10
On Thu, Sep 08, 2022 at 01:52:40PM +0100, Jonathan Cameron wrote:
> 

[snip]

> > > > diff --git a/include/trace/events/cxl-events.h b/include/trace/events/cxl-events.h
> > > > new file mode 100644
> > > > index 000000000000..f4baeae66cf3
> > > > --- /dev/null
> > > > +++ b/include/trace/events/cxl-events.h
> > > > @@ -0,0 +1,127 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > +#undef TRACE_SYSTEM
> > > > +#define TRACE_SYSTEM cxl_events
> > > > +
> > > > +#if !defined(_CXL_TRACE_EVENTS_H) ||  defined(TRACE_HEADER_MULTI_READ)
> > > > +#define _CXL_TRACE_EVENTS_H
> > > > +
> > > > +#include <linux/tracepoint.h>
> > > > +
> > > > +#define EVENT_LOGS					\
> > > > +	EM(CXL_EVENT_TYPE_INFO,		"Info")		\
> > > > +	EM(CXL_EVENT_TYPE_WARN,		"Warning")	\
> > > > +	EM(CXL_EVENT_TYPE_FAIL,		"Failure")	\
> > > > +	EM(CXL_EVENT_TYPE_FATAL,	"Fatal")	\
> > > > +	EMe(CXL_EVENT_TYPE_MAX,		"<undefined>")  
> > > 
> > > Hmm. 4 is defined in CXL 3.0, but I'd assume we won't use tracepoints for
> > > dynamic capacity events so I guess it doesn't matter.  
> > 
> > I'm not sure why you would say that.  I anticipate some user space daemon
> > requiring these events to set things up.
> 
> Certainly a possible solution. I'd kind of expect a more hand shake based approach
> than a tracepoint.  Guess we'll see :)

Yea I think we should wait an see.

> 
> 
> > >   
> > > > +	{ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,	"Performance Degraded"		}, \
> > > > +	{ CXL_EVENT_RECORD_FLAG_HW_REPLACE,	"Hardware Replacement Needed"	}  \
> > > > +)
> > > > +
> > > > +TRACE_EVENT(cxl_event,
> > > > +
> > > > +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> > > > +		 struct cxl_event_record_raw *rec),
> > > > +
> > > > +	TP_ARGS(dev_name, log, rec),
> > > > +
> > > > +	TP_STRUCT__entry(
> > > > +		__string(dev_name, dev_name)
> > > > +		__field(int, log)
> > > > +		__array(u8, id, UUID_SIZE)
> > > > +		__field(u32, flags)
> > > > +		__field(u16, handle)
> > > > +		__field(u16, related_handle)
> > > > +		__field(u64, timestamp)
> > > > +		__array(u8, data, EVENT_RECORD_DATA_LENGTH)
> > > > +		__field(u8, length)  
> > > 
> > > Do we want the maintenance operation class added in Table 8-42 from CXL 3.0?
> > > (only noticed because I happen to have that spec revision open rather than 2.0).  
> > 
> > Yes done.
> > 
> > There is some discussion with Dan regarding not decoding anything and letting
> > user space take care of it all.  I think this shows a valid reason Dan
> > suggested this.
> 
> I like being able to print tracepoints with out userspace tools.
> This also enforces structure and stability of interface which I like.

I tend to agree with you.

> 
> Maybe a raw tracepoint or variable length trailing buffer to pass
> on what we don't understand?

I've already realized that we need to print all reserved fields for this
reason.  If there is something the kernel does not understand user space can
just figure it out on it's own.

Sound reasonable?

Ira

> 
> Jonathan
> 
>
Jonathan Cameron Sept. 20, 2022, 3:49 p.m. UTC | #11
On Fri, 9 Sep 2022 13:53:55 -0700
Ira Weiny <ira.weiny@intel.com> wrote:

> On Thu, Sep 08, 2022 at 01:52:40PM +0100, Jonathan Cameron wrote:
> >   
> 
> [snip]
> 
> > > > > diff --git a/include/trace/events/cxl-events.h b/include/trace/events/cxl-events.h
> > > > > new file mode 100644
> > > > > index 000000000000..f4baeae66cf3
> > > > > --- /dev/null
> > > > > +++ b/include/trace/events/cxl-events.h
> > > > > @@ -0,0 +1,127 @@
> > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > +#undef TRACE_SYSTEM
> > > > > +#define TRACE_SYSTEM cxl_events
> > > > > +
> > > > > +#if !defined(_CXL_TRACE_EVENTS_H) ||  defined(TRACE_HEADER_MULTI_READ)
> > > > > +#define _CXL_TRACE_EVENTS_H
> > > > > +
> > > > > +#include <linux/tracepoint.h>
> > > > > +
> > > > > +#define EVENT_LOGS					\
> > > > > +	EM(CXL_EVENT_TYPE_INFO,		"Info")		\
> > > > > +	EM(CXL_EVENT_TYPE_WARN,		"Warning")	\
> > > > > +	EM(CXL_EVENT_TYPE_FAIL,		"Failure")	\
> > > > > +	EM(CXL_EVENT_TYPE_FATAL,	"Fatal")	\
> > > > > +	EMe(CXL_EVENT_TYPE_MAX,		"<undefined>")    
> > > > 
> > > > Hmm. 4 is defined in CXL 3.0, but I'd assume we won't use tracepoints for
> > > > dynamic capacity events so I guess it doesn't matter.    
> > > 
> > > I'm not sure why you would say that.  I anticipate some user space daemon
> > > requiring these events to set things up.  
> > 
> > Certainly a possible solution. I'd kind of expect a more hand shake based approach
> > than a tracepoint.  Guess we'll see :)  
> 
> Yea I think we should wait an see.
> 
> > 
> >   
> > > >     
> > > > > +	{ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,	"Performance Degraded"		}, \
> > > > > +	{ CXL_EVENT_RECORD_FLAG_HW_REPLACE,	"Hardware Replacement Needed"	}  \
> > > > > +)
> > > > > +
> > > > > +TRACE_EVENT(cxl_event,
> > > > > +
> > > > > +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> > > > > +		 struct cxl_event_record_raw *rec),
> > > > > +
> > > > > +	TP_ARGS(dev_name, log, rec),
> > > > > +
> > > > > +	TP_STRUCT__entry(
> > > > > +		__string(dev_name, dev_name)
> > > > > +		__field(int, log)
> > > > > +		__array(u8, id, UUID_SIZE)
> > > > > +		__field(u32, flags)
> > > > > +		__field(u16, handle)
> > > > > +		__field(u16, related_handle)
> > > > > +		__field(u64, timestamp)
> > > > > +		__array(u8, data, EVENT_RECORD_DATA_LENGTH)
> > > > > +		__field(u8, length)    
> > > > 
> > > > Do we want the maintenance operation class added in Table 8-42 from CXL 3.0?
> > > > (only noticed because I happen to have that spec revision open rather than 2.0).    
> > > 
> > > Yes done.
> > > 
> > > There is some discussion with Dan regarding not decoding anything and letting
> > > user space take care of it all.  I think this shows a valid reason Dan
> > > suggested this.  
> > 
> > I like being able to print tracepoints with out userspace tools.
> > This also enforces structure and stability of interface which I like.  
> 
> I tend to agree with you.
> 
> > 
> > Maybe a raw tracepoint or variable length trailing buffer to pass
> > on what we don't understand?  
> 
> I've already realized that we need to print all reserved fields for this
> reason.  If there is something the kernel does not understand user space can
> just figure it out on it's own.
> 
> Sound reasonable?

Hmm. Printing reserved fields would be unusual.  Not sure what is done for similar
cases elsewhere, CPER records etc...

We could just print a raw array of the whole event as well as decode version, but
that means logging most of the fields twice...

Not nice either.

I'm a bit inclined to say we should maybe just ignore stuff we don't know about or
is there a version number we can use to decide between decoded vs decoded as much as
possible + raw log?

Jonathan

> 
> Ira
> 
> > 
> > Jonathan
> > 
> >
Dave Jiang Sept. 20, 2022, 8:23 p.m. UTC | #12
On 9/20/2022 8:49 AM, Jonathan Cameron wrote:
> On Fri, 9 Sep 2022 13:53:55 -0700
> Ira Weiny <ira.weiny@intel.com> wrote:
>
>> On Thu, Sep 08, 2022 at 01:52:40PM +0100, Jonathan Cameron wrote:
>>>    
>> [snip]
>>
>>>>>> diff --git a/include/trace/events/cxl-events.h b/include/trace/events/cxl-events.h
>>>>>> new file mode 100644
>>>>>> index 000000000000..f4baeae66cf3
>>>>>> --- /dev/null
>>>>>> +++ b/include/trace/events/cxl-events.h
>>>>>> @@ -0,0 +1,127 @@
>>>>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>>>>> +#undef TRACE_SYSTEM
>>>>>> +#define TRACE_SYSTEM cxl_events
>>>>>> +
>>>>>> +#if !defined(_CXL_TRACE_EVENTS_H) ||  defined(TRACE_HEADER_MULTI_READ)
>>>>>> +#define _CXL_TRACE_EVENTS_H
>>>>>> +
>>>>>> +#include <linux/tracepoint.h>
>>>>>> +
>>>>>> +#define EVENT_LOGS					\
>>>>>> +	EM(CXL_EVENT_TYPE_INFO,		"Info")		\
>>>>>> +	EM(CXL_EVENT_TYPE_WARN,		"Warning")	\
>>>>>> +	EM(CXL_EVENT_TYPE_FAIL,		"Failure")	\
>>>>>> +	EM(CXL_EVENT_TYPE_FATAL,	"Fatal")	\
>>>>>> +	EMe(CXL_EVENT_TYPE_MAX,		"<undefined>")
>>>>> Hmm. 4 is defined in CXL 3.0, but I'd assume we won't use tracepoints for
>>>>> dynamic capacity events so I guess it doesn't matter.
>>>> I'm not sure why you would say that.  I anticipate some user space daemon
>>>> requiring these events to set things up.
>>> Certainly a possible solution. I'd kind of expect a more hand shake based approach
>>> than a tracepoint.  Guess we'll see :)
>> Yea I think we should wait an see.
>>
>>>    
>>>>>      
>>>>>> +	{ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,	"Performance Degraded"		}, \
>>>>>> +	{ CXL_EVENT_RECORD_FLAG_HW_REPLACE,	"Hardware Replacement Needed"	}  \
>>>>>> +)
>>>>>> +
>>>>>> +TRACE_EVENT(cxl_event,
>>>>>> +
>>>>>> +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
>>>>>> +		 struct cxl_event_record_raw *rec),
>>>>>> +
>>>>>> +	TP_ARGS(dev_name, log, rec),
>>>>>> +
>>>>>> +	TP_STRUCT__entry(
>>>>>> +		__string(dev_name, dev_name)
>>>>>> +		__field(int, log)
>>>>>> +		__array(u8, id, UUID_SIZE)
>>>>>> +		__field(u32, flags)
>>>>>> +		__field(u16, handle)
>>>>>> +		__field(u16, related_handle)
>>>>>> +		__field(u64, timestamp)
>>>>>> +		__array(u8, data, EVENT_RECORD_DATA_LENGTH)
>>>>>> +		__field(u8, length)
>>>>> Do we want the maintenance operation class added in Table 8-42 from CXL 3.0?
>>>>> (only noticed because I happen to have that spec revision open rather than 2.0).
>>>> Yes done.
>>>>
>>>> There is some discussion with Dan regarding not decoding anything and letting
>>>> user space take care of it all.  I think this shows a valid reason Dan
>>>> suggested this.
>>> I like being able to print tracepoints with out userspace tools.
>>> This also enforces structure and stability of interface which I like.
>> I tend to agree with you.
>>
>>> Maybe a raw tracepoint or variable length trailing buffer to pass
>>> on what we don't understand?
>> I've already realized that we need to print all reserved fields for this
>> reason.  If there is something the kernel does not understand user space can
>> just figure it out on it's own.
>>
>> Sound reasonable?
> Hmm. Printing reserved fields would be unusual.  Not sure what is done for similar
> cases elsewhere, CPER records etc...
>
> We could just print a raw array of the whole event as well as decode version, but
> that means logging most of the fields twice...
>
> Not nice either.
>
> I'm a bit inclined to say we should maybe just ignore stuff we don't know about or
> is there a version number we can use to decide between decoded vs decoded as much as
> possible + raw log?

libtraceevent can pull the trace event data structure fields directly. 
So the raw data can be pulled directly from the kernel. And what gets 
printed to the trace buffer can be decoded data constructed from those 
fields by the kernel code. So with that you can have access both.

>
> Jonathan
>
>> Ira
>>
>>> Jonathan
>>>
>>>
Ira Weiny Sept. 20, 2022, 10:10 p.m. UTC | #13
On Tue, Sep 20, 2022 at 01:23:29PM -0700, Jiang, Dave wrote:
> 
> On 9/20/2022 8:49 AM, Jonathan Cameron wrote:
> > On Fri, 9 Sep 2022 13:53:55 -0700
> > Ira Weiny <ira.weiny@intel.com> wrote:
> > 
> > > On Thu, Sep 08, 2022 at 01:52:40PM +0100, Jonathan Cameron wrote:
> > > [snip]
> > > 
> > > > > > > diff --git a/include/trace/events/cxl-events.h b/include/trace/events/cxl-events.h
> > > > > > > new file mode 100644
> > > > > > > index 000000000000..f4baeae66cf3
> > > > > > > --- /dev/null
> > > > > > > +++ b/include/trace/events/cxl-events.h
> > > > > > > @@ -0,0 +1,127 @@
> > > > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > > > +#undef TRACE_SYSTEM
> > > > > > > +#define TRACE_SYSTEM cxl_events
> > > > > > > +
> > > > > > > +#if !defined(_CXL_TRACE_EVENTS_H) ||  defined(TRACE_HEADER_MULTI_READ)
> > > > > > > +#define _CXL_TRACE_EVENTS_H
> > > > > > > +
> > > > > > > +#include <linux/tracepoint.h>
> > > > > > > +
> > > > > > > +#define EVENT_LOGS					\
> > > > > > > +	EM(CXL_EVENT_TYPE_INFO,		"Info")		\
> > > > > > > +	EM(CXL_EVENT_TYPE_WARN,		"Warning")	\
> > > > > > > +	EM(CXL_EVENT_TYPE_FAIL,		"Failure")	\
> > > > > > > +	EM(CXL_EVENT_TYPE_FATAL,	"Fatal")	\
> > > > > > > +	EMe(CXL_EVENT_TYPE_MAX,		"<undefined>")
> > > > > > Hmm. 4 is defined in CXL 3.0, but I'd assume we won't use tracepoints for
> > > > > > dynamic capacity events so I guess it doesn't matter.
> > > > > I'm not sure why you would say that.  I anticipate some user space daemon
> > > > > requiring these events to set things up.
> > > > Certainly a possible solution. I'd kind of expect a more hand shake based approach
> > > > than a tracepoint.  Guess we'll see :)
> > > Yea I think we should wait an see.
> > > 
> > > > > > > +	{ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,	"Performance Degraded"		}, \
> > > > > > > +	{ CXL_EVENT_RECORD_FLAG_HW_REPLACE,	"Hardware Replacement Needed"	}  \
> > > > > > > +)
> > > > > > > +
> > > > > > > +TRACE_EVENT(cxl_event,
> > > > > > > +
> > > > > > > +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> > > > > > > +		 struct cxl_event_record_raw *rec),
> > > > > > > +
> > > > > > > +	TP_ARGS(dev_name, log, rec),
> > > > > > > +
> > > > > > > +	TP_STRUCT__entry(
> > > > > > > +		__string(dev_name, dev_name)
> > > > > > > +		__field(int, log)
> > > > > > > +		__array(u8, id, UUID_SIZE)
> > > > > > > +		__field(u32, flags)
> > > > > > > +		__field(u16, handle)
> > > > > > > +		__field(u16, related_handle)
> > > > > > > +		__field(u64, timestamp)
> > > > > > > +		__array(u8, data, EVENT_RECORD_DATA_LENGTH)
> > > > > > > +		__field(u8, length)
> > > > > > Do we want the maintenance operation class added in Table 8-42 from CXL 3.0?
> > > > > > (only noticed because I happen to have that spec revision open rather than 2.0).
> > > > > Yes done.
> > > > > 
> > > > > There is some discussion with Dan regarding not decoding anything and letting
> > > > > user space take care of it all.  I think this shows a valid reason Dan
> > > > > suggested this.
> > > > I like being able to print tracepoints with out userspace tools.
> > > > This also enforces structure and stability of interface which I like.
> > > I tend to agree with you.
> > > 
> > > > Maybe a raw tracepoint or variable length trailing buffer to pass
> > > > on what we don't understand?
> > > I've already realized that we need to print all reserved fields for this
> > > reason.  If there is something the kernel does not understand user space can
> > > just figure it out on it's own.
> > > 
> > > Sound reasonable?
> > Hmm. Printing reserved fields would be unusual.  Not sure what is done for similar
> > cases elsewhere, CPER records etc...
> > 
> > We could just print a raw array of the whole event as well as decode version, but
> > that means logging most of the fields twice...
> > 
> > Not nice either.
> > 
> > I'm a bit inclined to say we should maybe just ignore stuff we don't know about or
> > is there a version number we can use to decide between decoded vs decoded as much as
> > possible + raw log?

I'm not a fan of loging the raw + decoded versions.

> 
> libtraceevent can pull the trace event data structure fields directly. So
> the raw data can be pulled directly from the kernel.

This raw data needs to be in a field though.  If the kernel does not save the
reserved fields in the TP_fast_assign() then the data won't be in a field to
access.

>
> And what gets printed
> to the trace buffer can be decoded data constructed from those fields by the
> kernel code. So with that you can have access both.
> 

Fast assigning the entire buffer + decoded versions will roughly double the
trace event size.

Thinking through this a bit more there is a sticking point.

The difficulty will be ensuring that any new field names are documented such
that when user space starts to look at them they can determine if that data
appears as a new field or as part of a reserved field.

For example if user space needs to access data in the reserved data now it can
simply decode it.  However, when that data becomes a field it no longer is part
of the reserved data.  So what user space would need to do is look for the
field first (ie know the field name) and then if it does not appear extract it
from the reserved data.

I'm now wondering if I've wasted my time decoding anything since the kernel
does not need to know anything about these fields.  Because the above scenario
means that user space may get ugly over time.

That said I don't think it will present any incompatibilities.  So perhaps we
are ok?

Ira
Jonathan Cameron Sept. 21, 2022, 4:36 p.m. UTC | #14
On Tue, 20 Sep 2022 15:10:26 -0700
Ira Weiny <ira.weiny@intel.com> wrote:

> On Tue, Sep 20, 2022 at 01:23:29PM -0700, Jiang, Dave wrote:
> > 
> > On 9/20/2022 8:49 AM, Jonathan Cameron wrote:  
> > > On Fri, 9 Sep 2022 13:53:55 -0700
> > > Ira Weiny <ira.weiny@intel.com> wrote:
> > >   
> > > > On Thu, Sep 08, 2022 at 01:52:40PM +0100, Jonathan Cameron wrote:
> > > > [snip]
> > > >   
> > > > > > > > diff --git a/include/trace/events/cxl-events.h b/include/trace/events/cxl-events.h
> > > > > > > > new file mode 100644
> > > > > > > > index 000000000000..f4baeae66cf3
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/include/trace/events/cxl-events.h
> > > > > > > > @@ -0,0 +1,127 @@
> > > > > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > > > > +#undef TRACE_SYSTEM
> > > > > > > > +#define TRACE_SYSTEM cxl_events
> > > > > > > > +
> > > > > > > > +#if !defined(_CXL_TRACE_EVENTS_H) ||  defined(TRACE_HEADER_MULTI_READ)
> > > > > > > > +#define _CXL_TRACE_EVENTS_H
> > > > > > > > +
> > > > > > > > +#include <linux/tracepoint.h>
> > > > > > > > +
> > > > > > > > +#define EVENT_LOGS					\
> > > > > > > > +	EM(CXL_EVENT_TYPE_INFO,		"Info")		\
> > > > > > > > +	EM(CXL_EVENT_TYPE_WARN,		"Warning")	\
> > > > > > > > +	EM(CXL_EVENT_TYPE_FAIL,		"Failure")	\
> > > > > > > > +	EM(CXL_EVENT_TYPE_FATAL,	"Fatal")	\
> > > > > > > > +	EMe(CXL_EVENT_TYPE_MAX,		"<undefined>")  
> > > > > > > Hmm. 4 is defined in CXL 3.0, but I'd assume we won't use tracepoints for
> > > > > > > dynamic capacity events so I guess it doesn't matter.  
> > > > > > I'm not sure why you would say that.  I anticipate some user space daemon
> > > > > > requiring these events to set things up.  
> > > > > Certainly a possible solution. I'd kind of expect a more hand shake based approach
> > > > > than a tracepoint.  Guess we'll see :)  
> > > > Yea I think we should wait an see.
> > > >   
> > > > > > > > +	{ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,	"Performance Degraded"		}, \
> > > > > > > > +	{ CXL_EVENT_RECORD_FLAG_HW_REPLACE,	"Hardware Replacement Needed"	}  \
> > > > > > > > +)
> > > > > > > > +
> > > > > > > > +TRACE_EVENT(cxl_event,
> > > > > > > > +
> > > > > > > > +	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
> > > > > > > > +		 struct cxl_event_record_raw *rec),
> > > > > > > > +
> > > > > > > > +	TP_ARGS(dev_name, log, rec),
> > > > > > > > +
> > > > > > > > +	TP_STRUCT__entry(
> > > > > > > > +		__string(dev_name, dev_name)
> > > > > > > > +		__field(int, log)
> > > > > > > > +		__array(u8, id, UUID_SIZE)
> > > > > > > > +		__field(u32, flags)
> > > > > > > > +		__field(u16, handle)
> > > > > > > > +		__field(u16, related_handle)
> > > > > > > > +		__field(u64, timestamp)
> > > > > > > > +		__array(u8, data, EVENT_RECORD_DATA_LENGTH)
> > > > > > > > +		__field(u8, length)  
> > > > > > > Do we want the maintenance operation class added in Table 8-42 from CXL 3.0?
> > > > > > > (only noticed because I happen to have that spec revision open rather than 2.0).  
> > > > > > Yes done.
> > > > > > 
> > > > > > There is some discussion with Dan regarding not decoding anything and letting
> > > > > > user space take care of it all.  I think this shows a valid reason Dan
> > > > > > suggested this.  
> > > > > I like being able to print tracepoints with out userspace tools.
> > > > > This also enforces structure and stability of interface which I like.  
> > > > I tend to agree with you.
> > > >   
> > > > > Maybe a raw tracepoint or variable length trailing buffer to pass
> > > > > on what we don't understand?  
> > > > I've already realized that we need to print all reserved fields for this
> > > > reason.  If there is something the kernel does not understand user space can
> > > > just figure it out on it's own.
> > > > 
> > > > Sound reasonable?  
> > > Hmm. Printing reserved fields would be unusual.  Not sure what is done for similar
> > > cases elsewhere, CPER records etc...
> > > 
> > > We could just print a raw array of the whole event as well as decode version, but
> > > that means logging most of the fields twice...
> > > 
> > > Not nice either.
> > > 
> > > I'm a bit inclined to say we should maybe just ignore stuff we don't know about or
> > > is there a version number we can use to decide between decoded vs decoded as much as
> > > possible + raw log?  
> 
> I'm not a fan of loging the raw + decoded versions.
> 
> > 
> > libtraceevent can pull the trace event data structure fields directly. So
> > the raw data can be pulled directly from the kernel.  
> 
> This raw data needs to be in a field though.  If the kernel does not save the
> reserved fields in the TP_fast_assign() then the data won't be in a field to
> access.
> 
> >
> > And what gets printed
> > to the trace buffer can be decoded data constructed from those fields by the
> > kernel code. So with that you can have access both.
> >   
> 
> Fast assigning the entire buffer + decoded versions will roughly double the
> trace event size.
> 
> Thinking through this a bit more there is a sticking point.
> 
> The difficulty will be ensuring that any new field names are documented such
> that when user space starts to look at them they can determine if that data
> appears as a new field or as part of a reserved field.
> 
> For example if user space needs to access data in the reserved data now it can
> simply decode it.  However, when that data becomes a field it no longer is part
> of the reserved data.  So what user space would need to do is look for the
> field first (ie know the field name) and then if it does not appear extract it
> from the reserved data.
> 
> I'm now wondering if I've wasted my time decoding anything since the kernel
> does not need to know anything about these fields.  Because the above scenario
> means that user space may get ugly over time.
> 
> That said I don't think it will present any incompatibilities.  So perhaps we
> are ok?

I favor decoding current record in kernel and packing it appropriately.
If that means we don't provide some new data from a future version then such
is life - the kernel needs upgrading.  That information is unlikely to be
crucial - it's probably just more detail.

Jonathan

> 
> Ira
Ira Weiny Sept. 22, 2022, 4:16 a.m. UTC | #15
On Wed, Sep 21, 2022 at 05:36:42PM +0100, Jonathan Cameron wrote:
> On Tue, 20 Sep 2022 15:10:26 -0700
> Ira Weiny <ira.weiny@intel.com> wrote:
> 
> > On Tue, Sep 20, 2022 at 01:23:29PM -0700, Jiang, Dave wrote:

[snip]

> > >
> > > And what gets printed
> > > to the trace buffer can be decoded data constructed from those fields by the
> > > kernel code. So with that you can have access both.
> > >   
> > 
> > Fast assigning the entire buffer + decoded versions will roughly double the
> > trace event size.
> > 
> > Thinking through this a bit more there is a sticking point.
> > 
> > The difficulty will be ensuring that any new field names are documented such
> > that when user space starts to look at them they can determine if that data
> > appears as a new field or as part of a reserved field.
> > 
> > For example if user space needs to access data in the reserved data now it can
> > simply decode it.  However, when that data becomes a field it no longer is part
> > of the reserved data.  So what user space would need to do is look for the
> > field first (ie know the field name) and then if it does not appear extract it
> > from the reserved data.
> > 
> > I'm now wondering if I've wasted my time decoding anything since the kernel
> > does not need to know anything about these fields.  Because the above scenario
> > means that user space may get ugly over time.
> > 
> > That said I don't think it will present any incompatibilities.  So perhaps we
> > are ok?
> 
> I favor decoding current record in kernel and packing it appropriately.
> If that means we don't provide some new data from a future version then such
> is life - the kernel needs upgrading.  That information is unlikely to be
> crucial - it's probably just more detail.

Dave, Dan, and I discussed this further today.  Dan expressed the same opinion.
So I'm going to remove all the reserved fields from the next version.

Thanks,
Ira
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 54fa6e2059de..1cb9cec31009 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5014,6 +5014,7 @@  M:	Dan Williams <dan.j.williams@intel.com>
 L:	linux-cxl@vger.kernel.org
 S:	Maintained
 F:	drivers/cxl/
+F:	include/trace/events/cxl*.h
 F:	include/uapi/linux/cxl_mem.h
 
 CONEXANT ACCESSRUNNER USB DRIVER
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 16176b9278b4..2cceed8608dc 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -7,6 +7,9 @@ 
 #include <cxlmem.h>
 #include <cxl.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/cxl-events.h>
+
 #include "core.h"
 
 static bool cxl_raw_allow_all;
@@ -48,6 +51,7 @@  static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = {
 	CXL_CMD(RAW, CXL_VARIABLE_PAYLOAD, CXL_VARIABLE_PAYLOAD, 0),
 #endif
 	CXL_CMD(GET_SUPPORTED_LOGS, 0, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE),
+	CXL_CMD(GET_EVENT_RECORD, 1, CXL_VARIABLE_PAYLOAD, 0),
 	CXL_CMD(GET_FW_INFO, 0, 0x50, 0),
 	CXL_CMD(GET_PARTITION_INFO, 0, 0x20, 0),
 	CXL_CMD(GET_LSA, 0x8, CXL_VARIABLE_PAYLOAD, 0),
@@ -704,6 +708,62 @@  int cxl_enumerate_cmds(struct cxl_dev_state *cxlds)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
 
+static int cxl_mem_get_records_log(struct cxl_dev_state *cxlds,
+				   enum cxl_event_log_type type)
+{
+	struct cxl_get_event_payload payload;
+
+	do {
+		u8 log_type = type;
+		u16 record_count;
+		int rc;
+
+		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_EVENT_RECORD,
+				       &log_type, sizeof(log_type),
+				       &payload, sizeof(payload));
+		if (rc)
+			return rc;
+
+		record_count = le16_to_cpu(payload.record_count);
+		if (record_count > 0)
+			trace_cxl_event(dev_name(cxlds->dev), type,
+					&payload.record);
+
+		if (payload.flags & CXL_GET_EVENT_FLAG_OVERFLOW)
+			trace_cxl_event_overflow(dev_name(cxlds->dev), type,
+						 &payload);
+
+	} while (payload.flags & CXL_GET_EVENT_FLAG_MORE_RECORDS);
+
+	return 0;
+}
+
+/**
+ * cxl_mem_get_event_records - Get Event Records from the device
+ * @cxlds: The device data for the operation
+ *
+ * Retrieve all event records available on the device and report them as trace
+ * events.
+ *
+ * See CXL v3.0 @8.2.9.2.2 Get Event Records
+ */
+void cxl_mem_get_event_records(struct cxl_dev_state *cxlds)
+{
+	struct device *dev = cxlds->dev;
+	enum cxl_event_log_type log_type;
+
+	for (log_type = CXL_EVENT_TYPE_INFO;
+	     log_type < CXL_EVENT_TYPE_MAX; log_type++) {
+		int rc;
+
+		rc = cxl_mem_get_records_log(cxlds, log_type);
+		if (rc)
+			dev_err(dev, "Failed to query %s Event Logs : %d",
+				cxl_event_log_type_str(log_type), rc);
+	}
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_get_event_records, CXL);
+
 /**
  * cxl_mem_get_partition_info - Get partition info
  * @cxlds: The device data for the operation
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 88e3a8e54b6a..f83634f3bc8d 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -4,6 +4,7 @@ 
 #define __CXL_MEM_H__
 #include <uapi/linux/cxl_mem.h>
 #include <linux/cdev.h>
+#include <linux/uuid.h>
 #include "cxl.h"
 
 /* CXL 2.0 8.2.8.5.1.1 Memory Device Status Register */
@@ -253,6 +254,7 @@  struct cxl_dev_state {
 enum cxl_opcode {
 	CXL_MBOX_OP_INVALID		= 0x0000,
 	CXL_MBOX_OP_RAW			= CXL_MBOX_OP_INVALID,
+	CXL_MBOX_OP_GET_EVENT_RECORD	= 0x0100,
 	CXL_MBOX_OP_GET_FW_INFO		= 0x0200,
 	CXL_MBOX_OP_ACTIVATE_FW		= 0x0202,
 	CXL_MBOX_OP_GET_SUPPORTED_LOGS	= 0x0400,
@@ -322,6 +324,69 @@  struct cxl_mbox_identify {
 	u8 qos_telemetry_caps;
 } __packed;
 
+/*
+ * Common Event Record Format
+ * CXL v3.0 section 8.2.9.2.1; Table 8-42
+ */
+struct cxl_event_record_hdr {
+	uuid_t id;
+	__le32 flags_length;
+	__le16 handle;
+	__le16 related_handle;
+	__le64 timestamp;
+	__le64 reserved1;
+	__le64 reserved2;
+} __packed;
+
+#define EVENT_RECORD_DATA_LENGTH 0x50
+struct cxl_event_record_raw {
+	struct cxl_event_record_hdr hdr;
+	u8 data[EVENT_RECORD_DATA_LENGTH];
+} __packed;
+
+/*
+ * Get Event Records output payload
+ * CXL v3.0 section 8.2.9.2.2; Table 8-50
+ *
+ * Space given for 1 record
+ */
+#define CXL_GET_EVENT_FLAG_OVERFLOW		BIT(0)
+#define CXL_GET_EVENT_FLAG_MORE_RECORDS	BIT(1)
+struct cxl_get_event_payload {
+	u8 flags;
+	u8 reserved1;
+	__le16 overflow_err_count;
+	__le64 first_overflow_timestamp;
+	__le64 last_overflow_timestamp;
+	__le16 record_count;
+	u8 reserved2[0xa];
+	struct cxl_event_record_raw record;
+} __packed;
+
+enum cxl_event_log_type {
+	CXL_EVENT_TYPE_INFO = 0x00,
+	CXL_EVENT_TYPE_WARN,
+	CXL_EVENT_TYPE_FAIL,
+	CXL_EVENT_TYPE_FATAL,
+	CXL_EVENT_TYPE_MAX
+};
+static inline char *cxl_event_log_type_str(enum cxl_event_log_type type)
+{
+	switch (type) {
+	case CXL_EVENT_TYPE_INFO:
+		return "Informational";
+	case CXL_EVENT_TYPE_WARN:
+		return "Warning";
+	case CXL_EVENT_TYPE_FAIL:
+		return "Failure";
+	case CXL_EVENT_TYPE_FATAL:
+		return "Fatal";
+	default:
+		break;
+	}
+	return "<unknown>";
+}
+
 struct cxl_mbox_get_partition_info {
 	__le64 active_volatile_cap;
 	__le64 active_persistent_cap;
@@ -381,6 +446,7 @@  int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
 struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
 void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
 void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
+void cxl_mem_get_event_records(struct cxl_dev_state *cxlds);
 #ifdef CONFIG_CXL_SUSPEND
 void cxl_mem_active_inc(void);
 void cxl_mem_active_dec(void);
diff --git a/include/trace/events/cxl-events.h b/include/trace/events/cxl-events.h
new file mode 100644
index 000000000000..f4baeae66cf3
--- /dev/null
+++ b/include/trace/events/cxl-events.h
@@ -0,0 +1,127 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cxl_events
+
+#if !defined(_CXL_TRACE_EVENTS_H) ||  defined(TRACE_HEADER_MULTI_READ)
+#define _CXL_TRACE_EVENTS_H
+
+#include <linux/tracepoint.h>
+
+#define EVENT_LOGS					\
+	EM(CXL_EVENT_TYPE_INFO,		"Info")		\
+	EM(CXL_EVENT_TYPE_WARN,		"Warning")	\
+	EM(CXL_EVENT_TYPE_FAIL,		"Failure")	\
+	EM(CXL_EVENT_TYPE_FATAL,	"Fatal")	\
+	EMe(CXL_EVENT_TYPE_MAX,		"<undefined>")
+
+/*
+ * First define the enums in the above macros to be exported to userspace via
+ * TRACE_DEFINE_ENUM().
+ */
+#undef EM
+#undef EMe
+#define EM(a, b)	TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
+
+EVENT_LOGS
+#define show_log_type(type) __print_symbolic(type, EVENT_LOGS)
+
+/*
+ * Now redefine the EM and EMe macros to map the enums to the strings that will
+ * be printed in the output
+ */
+#undef EM
+#undef EMe
+#define EM(a, b)        {a, b},
+#define EMe(a, b)       {a, b}
+
+TRACE_EVENT(cxl_event_overflow,
+
+	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
+		 struct cxl_get_event_payload *payload),
+
+	TP_ARGS(dev_name, log, payload),
+
+	TP_STRUCT__entry(
+		__string(dev_name, dev_name)
+		__field(int, log)
+		__field(u16, count)
+		__field(u64, first)
+		__field(u64, last)
+	),
+
+	TP_fast_assign(
+		__assign_str(dev_name, dev_name);
+		__entry->log = log;
+		__entry->count = le16_to_cpu(payload->overflow_err_count);
+		__entry->first = le64_to_cpu(payload->first_overflow_timestamp);
+		__entry->last = le64_to_cpu(payload->last_overflow_timestamp);
+	),
+
+	TP_printk("%s: EVENT LOG %s OVERFLOW %u records from %llu to %llu",
+		__get_str(dev_name), show_log_type(__entry->log),
+		__entry->count, __entry->first, __entry->last)
+
+);
+
+/*
+ * Common Event Record Format
+ * CXL v2.0 section 8.2.9.1.1; Table 153
+ */
+#define CXL_EVENT_RECORD_FLAG_PERMANENT		BIT(2)
+#define CXL_EVENT_RECORD_FLAG_MAINT_NEEDED	BIT(3)
+#define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED	BIT(4)
+#define CXL_EVENT_RECORD_FLAG_HW_REPLACE	BIT(5)
+#define show_hdr_flags(flags)	__print_flags(flags, " | ",			   \
+	{ CXL_EVENT_RECORD_FLAG_PERMANENT,	"Permanent Condition"		}, \
+	{ CXL_EVENT_RECORD_FLAG_MAINT_NEEDED,	"Maintanance Needed"		}, \
+	{ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,	"Performance Degraded"		}, \
+	{ CXL_EVENT_RECORD_FLAG_HW_REPLACE,	"Hardware Replacement Needed"	}  \
+)
+
+TRACE_EVENT(cxl_event,
+
+	TP_PROTO(const char *dev_name, enum cxl_event_log_type log,
+		 struct cxl_event_record_raw *rec),
+
+	TP_ARGS(dev_name, log, rec),
+
+	TP_STRUCT__entry(
+		__string(dev_name, dev_name)
+		__field(int, log)
+		__array(u8, id, UUID_SIZE)
+		__field(u32, flags)
+		__field(u16, handle)
+		__field(u16, related_handle)
+		__field(u64, timestamp)
+		__array(u8, data, EVENT_RECORD_DATA_LENGTH)
+		__field(u8, length)
+	),
+
+	TP_fast_assign(
+		__assign_str(dev_name, dev_name);
+		memcpy(__entry->id, &rec->hdr.id, UUID_SIZE);
+		__entry->log = log;
+		__entry->flags = le32_to_cpu(rec->hdr.flags_length) >> 8;
+		__entry->length = le32_to_cpu(rec->hdr.flags_length) & 0xFF;
+		__entry->handle = le16_to_cpu(rec->hdr.handle);
+		__entry->related_handle = le16_to_cpu(rec->hdr.related_handle);
+		__entry->timestamp = le64_to_cpu(rec->hdr.timestamp);
+		memcpy(__entry->data, &rec->data, EVENT_RECORD_DATA_LENGTH);
+	),
+
+	TP_printk("%s: %s time=%llu id=%pUl handle=%x related_handle=%x hdr_flags='%s' " \
+		  ": %s",
+		__get_str(dev_name), show_log_type(__entry->log),
+		__entry->timestamp, __entry->id, __entry->handle,
+		__entry->related_handle, show_hdr_flags(__entry->flags),
+		__print_hex(__entry->data, EVENT_RECORD_DATA_LENGTH)
+		)
+);
+
+#endif /* _CXL_TRACE_EVENTS_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE cxl-events
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h
index c71021a2a9ed..70459be5bdd4 100644
--- a/include/uapi/linux/cxl_mem.h
+++ b/include/uapi/linux/cxl_mem.h
@@ -24,6 +24,7 @@ 
 	___C(IDENTIFY, "Identify Command"),                               \
 	___C(RAW, "Raw device command"),                                  \
 	___C(GET_SUPPORTED_LOGS, "Get Supported Logs"),                   \
+	___C(GET_EVENT_RECORD, "Get Event Record"),                       \
 	___C(GET_FW_INFO, "Get FW Info"),                                 \
 	___C(GET_PARTITION_INFO, "Get Partition Information"),            \
 	___C(GET_LSA, "Get Label Storage Area"),                          \