diff mbox series

[v6,4/4] cxl/core: Add region info to cxl_general_media and cxl_dram events

Message ID dd8d708b7a7ebfb64a27020a5eb338091336b34d.1714496730.git.alison.schofield@intel.com
State Accepted
Commit 6aec00139d3a83e2394d4bcb0084e872b4036e8f
Headers show
Series Add DPA->HPA translation to dram & general_media events | expand

Commit Message

Alison Schofield April 30, 2024, 5:28 p.m. UTC
From: Alison Schofield <alison.schofield@intel.com>

User space may need to know which region, if any, maps the DPAs
(device physical addresses) reported in a cxl_general_media or
cxl_dram event. Since the mapping can change, the kernel provides
this information at the time the event occurs. This informs user
space that at event <timestamp> this <region> mapped this <DPA>
to this <HPA>.

Add the same region info that is included in the cxl_poison trace
event: the DPA->HPA translation, region name, and region uuid.

The new fields are inserted in the trace event and no existing
fields are modified. If the DPA is not mapped, user will see:
hpa=ULLONG_MAX, region="", and uuid=0

This work must be protected by dpa_rwsem & region_rwsem since
it is looking up region mappings.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huwei.com>
---
 drivers/cxl/core/mbox.c   | 36 ++++++++++++++++++++++++++------
 drivers/cxl/core/trace.h  | 44 +++++++++++++++++++++++++++++++--------
 include/linux/cxl-event.h | 10 +++++++++
 3 files changed, 75 insertions(+), 15 deletions(-)

Comments

Dan Williams May 3, 2024, 6:05 p.m. UTC | #1
alison.schofield@ wrote:
> From: Alison Schofield <alison.schofield@intel.com>
> 
> User space may need to know which region, if any, maps the DPAs
> (device physical addresses) reported in a cxl_general_media or
> cxl_dram event. Since the mapping can change, the kernel provides
> this information at the time the event occurs. This informs user
> space that at event <timestamp> this <region> mapped this <DPA>
> to this <HPA>.
> 
> Add the same region info that is included in the cxl_poison trace
> event: the DPA->HPA translation, region name, and region uuid.
> 
> The new fields are inserted in the trace event and no existing
> fields are modified. If the DPA is not mapped, user will see:
> hpa=ULLONG_MAX, region="", and uuid=0
> 
> This work must be protected by dpa_rwsem & region_rwsem since
> it is looking up region mappings.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> Reviewed-by: Dan Williams <dan.j.williams@intel.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huwei.com>
[..]
> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> index 03fa6d50d46f..5342755777cc 100644
> --- a/include/linux/cxl-event.h
> +++ b/include/linux/cxl-event.h
> @@ -91,11 +91,21 @@ struct cxl_event_mem_module {
>  	u8 reserved[0x3d];
>  } __packed;
>  
> +/*
> + * General Media or DRAM Event Common Fields
> + * - provides common access to phys_addr
> + */
> +struct cxl_event_common {
> +	struct cxl_event_record_hdr hdr;
> +	__le64 phys_addr;
> +} __packed;
> +
>  union cxl_event {
>  	struct cxl_event_generic generic;
>  	struct cxl_event_gen_media gen_media;
>  	struct cxl_event_dram dram;
>  	struct cxl_event_mem_module mem_module;
> +	struct cxl_event_common common;

I think we should rename this. As I was doing one last once-over my
brain went "wait, there's no physical address in the Common Event
Record".

So at a minimum call this "media_hdr" to match the style of
cxl_event_record_hdr.

Now I say "minimum" because 'struct cxl_event_gen_media' and 'struct
cxl_event_dram' should be including that struct in their definition to
make it impossible that they have different definitions. I.e. I am not a
fan of creating a new 'struct cxl_event_media_hdr' that is only consumed
by 'union cxl_event'.

The "maximum" would be to combine these definitions into one common
flexible format to maximize shared definitions, something like the
below, but the thrash it would cause is probably on the same order as
just adding a 'struct cxl_event_media_hdr media_hdr' member to existing
'struct cxl_event_gen_media' and 'struct cxl_event_dram', fixup all the
users and call it a day,

struct cxl_event_media {
        struct cxl_event_record_hdr hdr;
        struct_group_tagged(cxl_event_media_hdr, media_hdr,
                __le64 phys_addr,
                u8 descriptor,
                u8 type,
                u8 transaction_type,
                u8 validity_flags[2],
                u8 channel,
                u8 rank,
        };
        union { 
                struct_group(general,
                        u8 device[3],
                        u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE],
                        u8 gen_reserved[46],
                );
                struct_group(media,
                        u8 nibble_mask[3],
                        u8 bank_group,
                        u8 bank,
                        u8 row[3],
                        u8 column[2],
                        u8 correction_mask[CXL_EVENT_DER_CORRECTION_MASK_SIZE],
                        u8 dram_reserved[0x17],
                );
        };
} __packed;
Dan Williams May 3, 2024, 6:08 p.m. UTC | #2
Dan Williams wrote:
[..]
> The "maximum" would be to combine these definitions into one common
> flexible format to maximize shared definitions, something like the
> below, but the thrash it would cause is probably on the same order as
> just adding a 'struct cxl_event_media_hdr media_hdr' member to existing
> 'struct cxl_event_gen_media' and 'struct cxl_event_dram', fixup all the
> users and call it a day,
> 
> struct cxl_event_media {
>         struct cxl_event_record_hdr hdr;
>         struct_group_tagged(cxl_event_media_hdr, media_hdr,
>                 __le64 phys_addr,
>                 u8 descriptor,
>                 u8 type,
>                 u8 transaction_type,
>                 u8 validity_flags[2],
>                 u8 channel,
>                 u8 rank,
>         };
>         union { 
>                 struct_group(general,
>                         u8 device[3],
>                         u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE],
>                         u8 gen_reserved[46],
>                 );
>                 struct_group(media,

...should be "dram", but you get the idea.

>                         u8 nibble_mask[3],
>                         u8 bank_group,
>                         u8 bank,
>                         u8 row[3],
>                         u8 column[2],
>                         u8 correction_mask[CXL_EVENT_DER_CORRECTION_MASK_SIZE],
>                         u8 dram_reserved[0x17],
>                 );
>         };
> } __packed;
Alison Schofield May 8, 2024, 4:54 p.m. UTC | #3
On Fri, May 03, 2024 at 11:05:29AM -0700, Dan Williams wrote:
> alison.schofield@ wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > User space may need to know which region, if any, maps the DPAs
> > (device physical addresses) reported in a cxl_general_media or
> > cxl_dram event. Since the mapping can change, the kernel provides
> > this information at the time the event occurs. This informs user
> > space that at event <timestamp> this <region> mapped this <DPA>
> > to this <HPA>.
> > 
> > Add the same region info that is included in the cxl_poison trace
> > event: the DPA->HPA translation, region name, and region uuid.
> > 
> > The new fields are inserted in the trace event and no existing
> > fields are modified. If the DPA is not mapped, user will see:
> > hpa=ULLONG_MAX, region="", and uuid=0
> > 
> > This work must be protected by dpa_rwsem & region_rwsem since
> > it is looking up region mappings.
> > 
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > Reviewed-by: Dan Williams <dan.j.williams@intel.com>
> > Reviewed-by: Ira Weiny <ira.weiny@intel.com>
> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huwei.com>
> [..]
> > diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
> > index 03fa6d50d46f..5342755777cc 100644
> > --- a/include/linux/cxl-event.h
> > +++ b/include/linux/cxl-event.h
> > @@ -91,11 +91,21 @@ struct cxl_event_mem_module {
> >  	u8 reserved[0x3d];
> >  } __packed;
> >  
> > +/*
> > + * General Media or DRAM Event Common Fields
> > + * - provides common access to phys_addr
> > + */
> > +struct cxl_event_common {
> > +	struct cxl_event_record_hdr hdr;
> > +	__le64 phys_addr;
> > +} __packed;
> > +
> >  union cxl_event {
> >  	struct cxl_event_generic generic;
> >  	struct cxl_event_gen_media gen_media;
> >  	struct cxl_event_dram dram;
> >  	struct cxl_event_mem_module mem_module;
> > +	struct cxl_event_common common;
> 
> I think we should rename this. As I was doing one last once-over my
> brain went "wait, there's no physical address in the Common Event
> Record".
> 
> So at a minimum call this "media_hdr" to match the style of
> cxl_event_record_hdr.
> 
> Now I say "minimum" because 'struct cxl_event_gen_media' and 'struct
> cxl_event_dram' should be including that struct in their definition to
> make it impossible that they have different definitions. I.e. I am not a
> fan of creating a new 'struct cxl_event_media_hdr' that is only consumed
> by 'union cxl_event'.
> 
> The "maximum" would be to combine these definitions into one common
> flexible format to maximize shared definitions, something like the
> below, but the thrash it would cause is probably on the same order as
> just adding a 'struct cxl_event_media_hdr media_hdr' member to existing
> 'struct cxl_event_gen_media' and 'struct cxl_event_dram', fixup all the
> users and call it a day,

I've added this to my queue to fixup. 
Thanks for the review.

-- Alison

> 
> struct cxl_event_media {
>         struct cxl_event_record_hdr hdr;
>         struct_group_tagged(cxl_event_media_hdr, media_hdr,
>                 __le64 phys_addr,
>                 u8 descriptor,
>                 u8 type,
>                 u8 transaction_type,
>                 u8 validity_flags[2],
>                 u8 channel,
>                 u8 rank,
>         };
>         union { 
>                 struct_group(general,
>                         u8 device[3],
>                         u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE],
>                         u8 gen_reserved[46],
>                 );
>                 struct_group(media,
>                         u8 nibble_mask[3],
>                         u8 bank_group,
>                         u8 bank,
>                         u8 row[3],
>                         u8 column[2],
>                         u8 correction_mask[CXL_EVENT_DER_CORRECTION_MASK_SIZE],
>                         u8 dram_reserved[0x17],
>                 );
>         };
> } __packed;
diff mbox series

Patch

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 9adda4795eb7..df0fc2a4570f 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -842,14 +842,38 @@  void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
 			    enum cxl_event_type event_type,
 			    const uuid_t *uuid, union cxl_event *evt)
 {
-	if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
-		trace_cxl_general_media(cxlmd, type, &evt->gen_media);
-	else if (event_type == CXL_CPER_EVENT_DRAM)
-		trace_cxl_dram(cxlmd, type, &evt->dram);
-	else if (event_type == CXL_CPER_EVENT_MEM_MODULE)
+	if (event_type == CXL_CPER_EVENT_MEM_MODULE) {
 		trace_cxl_memory_module(cxlmd, type, &evt->mem_module);
-	else
+		return;
+	}
+	if (event_type == CXL_CPER_EVENT_GENERIC) {
 		trace_cxl_generic_event(cxlmd, type, uuid, &evt->generic);
+		return;
+	}
+
+	if (trace_cxl_general_media_enabled() || trace_cxl_dram_enabled()) {
+		u64 dpa, hpa = ULLONG_MAX;
+		struct cxl_region *cxlr;
+
+		/*
+		 * These trace points are annotated with HPA and region
+		 * translations. Take topology mutation locks and lookup
+		 * { HPA, REGION } from { DPA, MEMDEV } in the event record.
+		 */
+		guard(rwsem_read)(&cxl_region_rwsem);
+		guard(rwsem_read)(&cxl_dpa_rwsem);
+
+		dpa = le64_to_cpu(evt->common.phys_addr) & CXL_DPA_MASK;
+		cxlr = cxl_dpa_to_region(cxlmd, dpa);
+		if (cxlr)
+			hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
+
+		if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
+			trace_cxl_general_media(cxlmd, type, cxlr, hpa,
+						&evt->gen_media);
+		else if (event_type == CXL_CPER_EVENT_DRAM)
+			trace_cxl_dram(cxlmd, type, cxlr, hpa, &evt->dram);
+	}
 }
 EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, CXL);
 
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index e303e618aa05..07a0394b1d99 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -316,9 +316,9 @@  TRACE_EVENT(cxl_generic_event,
 TRACE_EVENT(cxl_general_media,
 
 	TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
-		 struct cxl_event_gen_media *rec),
+		 struct cxl_region *cxlr, u64 hpa, struct cxl_event_gen_media *rec),
 
-	TP_ARGS(cxlmd, log, rec),
+	TP_ARGS(cxlmd, log, cxlr, hpa, rec),
 
 	TP_STRUCT__entry(
 		CXL_EVT_TP_entry
@@ -330,10 +330,13 @@  TRACE_EVENT(cxl_general_media,
 		__field(u8, channel)
 		__field(u32, device)
 		__array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE)
-		__field(u16, validity_flags)
 		/* Following are out of order to pack trace record */
+		__field(u64, hpa)
+		__field_struct(uuid_t, region_uuid)
+		__field(u16, validity_flags)
 		__field(u8, rank)
 		__field(u8, dpa_flags)
+		__string(region_name, cxlr ? dev_name(&cxlr->dev) : "")
 	),
 
 	TP_fast_assign(
@@ -354,18 +357,28 @@  TRACE_EVENT(cxl_general_media,
 		memcpy(__entry->comp_id, &rec->component_id,
 			CXL_EVENT_GEN_MED_COMP_ID_SIZE);
 		__entry->validity_flags = get_unaligned_le16(&rec->validity_flags);
+		__entry->hpa = hpa;
+		if (cxlr) {
+			__assign_str(region_name, dev_name(&cxlr->dev));
+			uuid_copy(&__entry->region_uuid, &cxlr->params.uuid);
+		} else {
+			__assign_str(region_name, "");
+			uuid_copy(&__entry->region_uuid, &uuid_null);
+		}
 	),
 
 	CXL_EVT_TP_printk("dpa=%llx dpa_flags='%s' " \
 		"descriptor='%s' type='%s' transaction_type='%s' channel=%u rank=%u " \
-		"device=%x comp_id=%s validity_flags='%s'",
+		"device=%x comp_id=%s validity_flags='%s' " \
+		"hpa=%llx region=%s region_uuid=%pUb",
 		__entry->dpa, show_dpa_flags(__entry->dpa_flags),
 		show_event_desc_flags(__entry->descriptor),
 		show_mem_event_type(__entry->type),
 		show_trans_type(__entry->transaction_type),
 		__entry->channel, __entry->rank, __entry->device,
 		__print_hex(__entry->comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE),
-		show_valid_flags(__entry->validity_flags)
+		show_valid_flags(__entry->validity_flags),
+		__entry->hpa, __get_str(region_name), &__entry->region_uuid
 	)
 );
 
@@ -400,9 +413,9 @@  TRACE_EVENT(cxl_general_media,
 TRACE_EVENT(cxl_dram,
 
 	TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
-		 struct cxl_event_dram *rec),
+		 struct cxl_region *cxlr, u64 hpa, struct cxl_event_dram *rec),
 
-	TP_ARGS(cxlmd, log, rec),
+	TP_ARGS(cxlmd, log, cxlr, hpa, rec),
 
 	TP_STRUCT__entry(
 		CXL_EVT_TP_entry
@@ -417,10 +430,13 @@  TRACE_EVENT(cxl_dram,
 		__field(u32, nibble_mask)
 		__field(u32, row)
 		__array(u8, cor_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE)
+		__field(u64, hpa)
+		__field_struct(uuid_t, region_uuid)
 		__field(u8, rank)	/* Out of order to pack trace record */
 		__field(u8, bank_group)	/* Out of order to pack trace record */
 		__field(u8, bank)	/* Out of order to pack trace record */
 		__field(u8, dpa_flags)	/* Out of order to pack trace record */
+		__string(region_name, cxlr ? dev_name(&cxlr->dev) : "")
 	),
 
 	TP_fast_assign(
@@ -444,12 +460,21 @@  TRACE_EVENT(cxl_dram,
 		__entry->column = get_unaligned_le16(rec->column);
 		memcpy(__entry->cor_mask, &rec->correction_mask,
 			CXL_EVENT_DER_CORRECTION_MASK_SIZE);
+		__entry->hpa = hpa;
+		if (cxlr) {
+			__assign_str(region_name, dev_name(&cxlr->dev));
+			uuid_copy(&__entry->region_uuid, &cxlr->params.uuid);
+		} else {
+			__assign_str(region_name, "");
+			uuid_copy(&__entry->region_uuid, &uuid_null);
+		}
 	),
 
 	CXL_EVT_TP_printk("dpa=%llx dpa_flags='%s' descriptor='%s' type='%s' " \
 		"transaction_type='%s' channel=%u rank=%u nibble_mask=%x " \
 		"bank_group=%u bank=%u row=%u column=%u cor_mask=%s " \
-		"validity_flags='%s'",
+		"validity_flags='%s' " \
+		"hpa=%llx region=%s region_uuid=%pUb",
 		__entry->dpa, show_dpa_flags(__entry->dpa_flags),
 		show_event_desc_flags(__entry->descriptor),
 		show_mem_event_type(__entry->type),
@@ -458,7 +483,8 @@  TRACE_EVENT(cxl_dram,
 		__entry->bank_group, __entry->bank,
 		__entry->row, __entry->column,
 		__print_hex(__entry->cor_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE),
-		show_dram_valid_flags(__entry->validity_flags)
+		show_dram_valid_flags(__entry->validity_flags),
+		__entry->hpa, __get_str(region_name), &__entry->region_uuid
 	)
 );
 
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 03fa6d50d46f..5342755777cc 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -91,11 +91,21 @@  struct cxl_event_mem_module {
 	u8 reserved[0x3d];
 } __packed;
 
+/*
+ * General Media or DRAM Event Common Fields
+ * - provides common access to phys_addr
+ */
+struct cxl_event_common {
+	struct cxl_event_record_hdr hdr;
+	__le64 phys_addr;
+} __packed;
+
 union cxl_event {
 	struct cxl_event_generic generic;
 	struct cxl_event_gen_media gen_media;
 	struct cxl_event_dram dram;
 	struct cxl_event_mem_module mem_module;
+	struct cxl_event_common common;
 } __packed;
 
 /*