Message ID | 20250107143852.3692571-15-terry.bowman@amd.com |
---|---|
State | New |
Headers | show |
Series | Enable CXL PCIe port protocol error handling and logging |
On Tue, 7 Jan 2025 08:38:50 -0600 Terry Bowman <terry.bowman@amd.com> wrote: > The CXL drivers use kernel trace functions for logging endpoint and > Restricted CXL host (RCH) Downstream Port RAS errors. Similar functionality > is required for CXL Root Ports, CXL Downstream Switch Ports, and CXL > Upstream Switch Ports. > > Introduce trace logging functions for both RAS correctable and > uncorrectable errors specific to CXL PCIe Ports. Additionally, update > the CXL Port Protocol Error handlers to invoke these new trace functions. > > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > Reviewed-by: Alejandro Lucero <alucerop@amd.com> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> An example print in commit message would help understand what the tracepoints look like. Few more things inline following on from earlier comments. Jonathan > --- > drivers/cxl/core/pci.c | 17 +++++++++++---- > drivers/cxl/core/trace.h | 47 ++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 60 insertions(+), 4 deletions(-) > > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c > index 411834f7efe0..3e87fe54a1a2 100644 > --- a/drivers/cxl/core/pci.c > +++ b/drivers/cxl/core/pci.c > @@ -663,10 +663,15 @@ static void __cxl_handle_cor_ras(struct device *dev, > > addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; > status = readl(addr); > - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { > - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); > + if (!(status & CXL_RAS_CORRECTABLE_STATUS_MASK)) > + return; > + > + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); > + > + if (is_cxl_memdev(dev)) As below. Drag to earlier patch. > trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status); > - } > + else and perhaps check it's a port mostly for documentation purposes. 
> + trace_cxl_port_aer_correctable_error(dev, status); > } > > static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds) > @@ -724,7 +729,11 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base) > } > > header_log_copy(ras_base, hl); > - trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); > + if (is_cxl_memdev(dev)) As mentioned above, drag this if to the earlier patch. > + trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); > + else For documentation purposes mostly I'd be tempted to have an is_cxl_port() check before calling the following. > + trace_cxl_port_aer_uncorrectable_error(dev, status, fe, hl); > + > writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); > > return true; > diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h > index 8389a94adb1a..681e415ac8f5 100644 > --- a/drivers/cxl/core/trace.h > +++ b/drivers/cxl/core/trace.h > @@ -48,6 +48,34 @@ > { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \ > ) > > +TRACE_EVENT(cxl_port_aer_uncorrectable_error, > + TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl), > + TP_ARGS(dev, status, fe, hl), > + TP_STRUCT__entry( > + __string(devname, dev_name(dev)) > + __string(host, dev_name(dev->parent)) What is host in this case? Perhaps a comment. > + __field(u32, status) > + __field(u32, first_error) > + __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) > + ), > + TP_fast_assign( > + __assign_str(devname); > + __assign_str(host); > + __entry->status = status; > + __entry->first_error = fe; > + /* > + * Embed the 512B headerlog data for user app retrieval and > + * parsing, but no need to print this in the trace buffer. 
> + */ > + memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE); > + ), > + TP_printk("device=%s host=%s status: '%s' first_error: '%s'", > + __get_str(devname), __get_str(host), > + show_uc_errs(__entry->status), > + show_uc_errs(__entry->first_error) > + ) > +); > + > TRACE_EVENT(cxl_aer_uncorrectable_error, > TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl), > TP_ARGS(cxlmd, status, fe, hl), > @@ -96,6 +124,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error, > { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \ > ) > > +TRACE_EVENT(cxl_port_aer_correctable_error, > + TP_PROTO(struct device *dev, u32 status), > + TP_ARGS(dev, status), > + TP_STRUCT__entry( > + __string(devname, dev_name(dev)) > + __string(host, dev_name(dev->parent)) > + __field(u32, status) > + ), > + TP_fast_assign( > + __assign_str(devname); > + __assign_str(host); > + __entry->status = status; > + ), > + TP_printk("device=%s host=%s status='%s'", > + __get_str(devname), __get_str(host), > + show_ce_errs(__entry->status) > + ) > +); > + > TRACE_EVENT(cxl_aer_correctable_error, > TP_PROTO(const struct cxl_memdev *cxlmd, u32 status), > TP_ARGS(cxlmd, status),
On 1/14/2025 5:49 AM, Jonathan Cameron wrote: > On Tue, 7 Jan 2025 08:38:50 -0600 > Terry Bowman <terry.bowman@amd.com> wrote: > >> The CXL drivers use kernel trace functions for logging endpoint and >> Restricted CXL host (RCH) Downstream Port RAS errors. Similar functionality >> is required for CXL Root Ports, CXL Downstream Switch Ports, and CXL >> Upstream Switch Ports. >> >> Introduce trace logging functions for both RAS correctable and >> uncorrectable errors specific to CXL PCIe Ports. Additionally, update >> the CXL Port Protocol Error handlers to invoke these new trace functions. >> >> Signed-off-by: Terry Bowman <terry.bowman@amd.com> >> Reviewed-by: Alejandro Lucero <alucerop@amd.com> >> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > An example print in commit message would help understand what the tracepoints > look like. > > Few more things inline following on from earlier comments. > > Jonathan >> --- >> drivers/cxl/core/pci.c | 17 +++++++++++---- >> drivers/cxl/core/trace.h | 47 ++++++++++++++++++++++++++++++++++++++++ >> 2 files changed, 60 insertions(+), 4 deletions(-) >> >> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c >> index 411834f7efe0..3e87fe54a1a2 100644 >> --- a/drivers/cxl/core/pci.c >> +++ b/drivers/cxl/core/pci.c >> @@ -663,10 +663,15 @@ static void __cxl_handle_cor_ras(struct device *dev, >> >> addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; >> status = readl(addr); >> - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { >> - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); >> + if (!(status & CXL_RAS_CORRECTABLE_STATUS_MASK)) >> + return; >> + >> + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); >> + >> + if (is_cxl_memdev(dev)) > As below. Drag to earlier patch. Ok >> trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status); >> - } >> + else > and perhaps check it's a port mostly for documentation purposes. 
> Ok >> + trace_cxl_port_aer_correctable_error(dev, status); >> } >> >> static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds) >> @@ -724,7 +729,11 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base) >> } >> >> header_log_copy(ras_base, hl); >> - trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); >> + if (is_cxl_memdev(dev)) > As mentioned above, drag this if to the earlier patch. Ok >> + trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); >> + else > For documentation purposes mostly I'd be tempted to have an is_cxl_port() check > before calling the following. > >> + trace_cxl_port_aer_uncorrectable_error(dev, status, fe, hl); >> + >> writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); >> >> return true; >> diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h >> index 8389a94adb1a..681e415ac8f5 100644 >> --- a/drivers/cxl/core/trace.h >> +++ b/drivers/cxl/core/trace.h >> @@ -48,6 +48,34 @@ >> { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \ >> ) >> >> +TRACE_EVENT(cxl_port_aer_uncorrectable_error, >> + TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl), >> + TP_ARGS(dev, status, fe, hl), >> + TP_STRUCT__entry( >> + __string(devname, dev_name(dev)) >> + __string(host, dev_name(dev->parent)) > What is host in this case? Perhaps a comment. host is a string initialized with value from dev_name(dev->parent). What kind of comment would you like to see here? Regards, Terry >> + __field(u32, status) >> + __field(u32, first_error) >> + __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) >> + ), >> + TP_fast_assign( >> + __assign_str(devname); >> + __assign_str(host); >> + __entry->status = status; >> + __entry->first_error = fe; >> + /* >> + * Embed the 512B headerlog data for user app retrieval and >> + * parsing, but no need to print this in the trace buffer. 
>> + */ >> + memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE); >> + ), >> + TP_printk("device=%s host=%s status: '%s' first_error: '%s'", >> + __get_str(devname), __get_str(host), >> + show_uc_errs(__entry->status), >> + show_uc_errs(__entry->first_error) >> + ) >> +); >> + >> TRACE_EVENT(cxl_aer_uncorrectable_error, >> TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl), >> TP_ARGS(cxlmd, status, fe, hl), >> @@ -96,6 +124,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error, >> { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \ >> ) >> >> +TRACE_EVENT(cxl_port_aer_correctable_error, >> + TP_PROTO(struct device *dev, u32 status), >> + TP_ARGS(dev, status), >> + TP_STRUCT__entry( >> + __string(devname, dev_name(dev)) >> + __string(host, dev_name(dev->parent)) >> + __field(u32, status) >> + ), >> + TP_fast_assign( >> + __assign_str(devname); >> + __assign_str(host); >> + __entry->status = status; >> + ), >> + TP_printk("device=%s host=%s status='%s'", >> + __get_str(devname), __get_str(host), >> + show_ce_errs(__entry->status) >> + ) >> +); >> + >> TRACE_EVENT(cxl_aer_correctable_error, >> TP_PROTO(const struct cxl_memdev *cxlmd, u32 status), >> TP_ARGS(cxlmd, status),
Terry Bowman wrote:
> The CXL drivers use kernel trace functions for logging endpoint and
> Restricted CXL host (RCH) Downstream Port RAS errors. Similar functionality
> is required for CXL Root Ports, CXL Downstream Switch Ports, and CXL
> Upstream Switch Ports.
>
> Introduce trace logging functions for both RAS correctable and
> uncorrectable errors specific to CXL PCIe Ports. Additionally, update
> the CXL Port Protocol Error handlers to invoke these new trace functions.
>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Reviewed-by: Alejandro Lucero <alucerop@amd.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> ---

Reviewed-by: Ira Weiny <ira.weiny@intel.com>

[snip]
> >> diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
> >> index 8389a94adb1a..681e415ac8f5 100644
> >> --- a/drivers/cxl/core/trace.h
> >> +++ b/drivers/cxl/core/trace.h
> >> @@ -48,6 +48,34 @@
> >> { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
> >> )
> >>
> >> +TRACE_EVENT(cxl_port_aer_uncorrectable_error,
> >> + TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
> >> + TP_ARGS(dev, status, fe, hl),
> >> + TP_STRUCT__entry(
> >> + __string(devname, dev_name(dev))
> >> + __string(host, dev_name(dev->parent))
> > What is host in this case? Perhaps a comment.
> host is a string initialized with value from dev_name(dev->parent). What
> kind of comment would you like to see here?

What is that parent in practice? A port, an EP, a PCI device?

>
> Regards,
> Terry
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 411834f7efe0..3e87fe54a1a2 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -663,10 +663,15 @@ static void __cxl_handle_cor_ras(struct device *dev, addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; status = readl(addr); - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); + if (!(status & CXL_RAS_CORRECTABLE_STATUS_MASK)) + return; + + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); + + if (is_cxl_memdev(dev)) trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status); - } + else + trace_cxl_port_aer_correctable_error(dev, status); } static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds) @@ -724,7 +729,11 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base) } header_log_copy(ras_base, hl); - trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); + if (is_cxl_memdev(dev)) + trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); + else + trace_cxl_port_aer_uncorrectable_error(dev, status, fe, hl); + writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); return true; diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index 8389a94adb1a..681e415ac8f5 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -48,6 +48,34 @@ { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \ ) +TRACE_EVENT(cxl_port_aer_uncorrectable_error, + TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl), + TP_ARGS(dev, status, fe, hl), + TP_STRUCT__entry( + __string(devname, dev_name(dev)) + __string(host, dev_name(dev->parent)) + __field(u32, status) + __field(u32, first_error) + __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) + ), + TP_fast_assign( + __assign_str(devname); + __assign_str(host); + __entry->status = status; + __entry->first_error = fe; + /* + * Embed the 512B headerlog data for user app retrieval and + * parsing, but no need to print 
this in the trace buffer. + */ + memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE); + ), + TP_printk("device=%s host=%s status: '%s' first_error: '%s'", + __get_str(devname), __get_str(host), + show_uc_errs(__entry->status), + show_uc_errs(__entry->first_error) + ) +); + TRACE_EVENT(cxl_aer_uncorrectable_error, TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl), TP_ARGS(cxlmd, status, fe, hl), @@ -96,6 +124,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error, { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \ ) +TRACE_EVENT(cxl_port_aer_correctable_error, + TP_PROTO(struct device *dev, u32 status), + TP_ARGS(dev, status), + TP_STRUCT__entry( + __string(devname, dev_name(dev)) + __string(host, dev_name(dev->parent)) + __field(u32, status) + ), + TP_fast_assign( + __assign_str(devname); + __assign_str(host); + __entry->status = status; + ), + TP_printk("device=%s host=%s status='%s'", + __get_str(devname), __get_str(host), + show_ce_errs(__entry->status) + ) +); + TRACE_EVENT(cxl_aer_correctable_error, TP_PROTO(const struct cxl_memdev *cxlmd, u32 status), TP_ARGS(cxlmd, status),