Message ID | 20250211192444.2292833-11-terry.bowman@amd.com (mailing list archive) |
---|---|
State | Handled Elsewhere |
Headers | show |
Series | Enable CXL PCIe port protocol error handling and logging | expand |
On 2/11/25 12:24 PM, Terry Bowman wrote: > The CXL RAS handlers do not currently log if the RAS registers are > unmapped. This is needed in order to help debug CXL error handling. Update > the CXL driver to log a warning message if the RAS register block is > unmapped. > > Also, add type check before processing EP or RCH DP. > > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > Reviewed-by: Ira Weiny <ira.weiny@intel.com> > Reviewed-by: Gregory Price <gourry@gourry.net> Reviewed-by: Dave Jiang <dave.jiang@intel.com> > --- > drivers/cxl/core/pci.c | 20 ++++++++++++++------ > 1 file changed, 14 insertions(+), 6 deletions(-) > > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c > index 69bb030aa8e1..af809e7cbe3b 100644 > --- a/drivers/cxl/core/pci.c > +++ b/drivers/cxl/core/pci.c > @@ -658,15 +658,19 @@ static void __cxl_handle_cor_ras(struct device *dev, > void __iomem *addr; > u32 status; > > - if (!ras_base) > + if (!ras_base) { > + dev_warn_once(dev, "CXL RAS register block is not mapped"); > return; > + } > > addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; > status = readl(addr); > - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { > - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); > + if (!(status & CXL_RAS_CORRECTABLE_STATUS_MASK)) > + return; > + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); > + > + if (is_cxl_memdev(dev)) > trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status); > - } > } > > static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds) > @@ -702,8 +706,10 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base) > u32 status; > u32 fe; > > - if (!ras_base) > + if (!ras_base) { > + dev_warn_once(dev, "CXL RAS register block is not mapped"); > return false; > + } > > addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; > status = readl(addr); > @@ -722,7 +728,9 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base) > } > > header_log_copy(ras_base, hl); > - trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); > + if (is_cxl_memdev(dev)) > + trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); > + > writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); > > return true;
Terry Bowman wrote: > The CXL RAS handlers do not currently log if the RAS registers are > unmapped. This is needed in order to help debug CXL error handling. Update > the CXL driver to log a warning message if the RAS register block is > unmapped. > > Also, add type check before processing EP or RCH DP. > > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > Reviewed-by: Ira Weiny <ira.weiny@intel.com> > Reviewed-by: Gregory Price <gourry@gourry.net> > --- > drivers/cxl/core/pci.c | 20 ++++++++++++++------ > 1 file changed, 14 insertions(+), 6 deletions(-) > > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c > index 69bb030aa8e1..af809e7cbe3b 100644 > --- a/drivers/cxl/core/pci.c > +++ b/drivers/cxl/core/pci.c > @@ -658,15 +658,19 @@ static void __cxl_handle_cor_ras(struct device *dev, > void __iomem *addr; > u32 status; > > - if (!ras_base) > + if (!ras_base) { > + dev_warn_once(dev, "CXL RAS register block is not mapped"); > return; > + } > > addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; > status = readl(addr); > - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { > - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); > + if (!(status & CXL_RAS_CORRECTABLE_STATUS_MASK)) > + return; > + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); > + > + if (is_cxl_memdev(dev)) > trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status); I think trace_cxl_aer_correctable_error() should always fire and this should somehow be unified with the CPER record trace-event for protocol errors. The only usage of @memdev in this trace is retrieving the device serial number. If the device is not a memdev then print zero for the serial number, or something like that. In the end RAS daemon should only need to enable one trace event to get protocol errors and header logs from ports or endpoints, either natively, or via CPER. > - } > } > > static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds) > @@ -702,8 +706,10 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base) > u32 status; > u32 fe; > > - if (!ras_base) > + if (!ras_base) { > + dev_warn_once(dev, "CXL RAS register block is not mapped"); Is this a "never can happen" print? It seems like an oversight in an upper layer to get this far down error reporting without the registers mapped. Like maybe this is a bug in a driver that should crash, or the driver should not be registering broken error handlers?
On 2/12/2025 4:59 PM, Dan Williams wrote: > Terry Bowman wrote: >> The CXL RAS handlers do not currently log if the RAS registers are >> unmapped. This is needed in order to help debug CXL error handling. Update >> the CXL driver to log a warning message if the RAS register block is >> unmapped. >> >> Also, add type check before processing EP or RCH DP. >> >> Signed-off-by: Terry Bowman <terry.bowman@amd.com> >> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> >> Reviewed-by: Ira Weiny <ira.weiny@intel.com> >> Reviewed-by: Gregory Price <gourry@gourry.net> >> --- >> drivers/cxl/core/pci.c | 20 ++++++++++++++------ >> 1 file changed, 14 insertions(+), 6 deletions(-) >> >> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c >> index 69bb030aa8e1..af809e7cbe3b 100644 >> --- a/drivers/cxl/core/pci.c >> +++ b/drivers/cxl/core/pci.c >> @@ -658,15 +658,19 @@ static void __cxl_handle_cor_ras(struct device *dev, >> void __iomem *addr; >> u32 status; >> >> - if (!ras_base) >> + if (!ras_base) { >> + dev_warn_once(dev, "CXL RAS register block is not mapped"); >> return; >> + } >> >> addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; >> status = readl(addr); >> - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { >> - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); >> + if (!(status & CXL_RAS_CORRECTABLE_STATUS_MASK)) >> + return; >> + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); >> + >> + if (is_cxl_memdev(dev)) >> trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status); > I think trace_cxl_aer_correctable_error() should always fire and this > should somehow be unified with the CPER record trace-event for protocol > errors. > > The only usage of @memdev in this trace is retrieving the device serial > number. If the device is not a memdev then print zero for the serial > number, or something like that. > > In the end RAS daemon should only need to enable one trace event to get > protocol errors and header logs from ports or endpoints, either > natively, or via CPER. > That would be: we use 'struct *device' instead of 'struct *cxl_memdev' and pass serial# in as a parameter (0 in non-EP cases)? >> - } >> } >> >> static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds) >> @@ -702,8 +706,10 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base) >> u32 status; >> u32 fe; >> >> - if (!ras_base) >> + if (!ras_base) { >> + dev_warn_once(dev, "CXL RAS register block is not mapped"); > Is this a "never can happen" print? It seems like an oversight in an > upper layer to get this far down error reporting without the registers > mapped. > > Like maybe this is a bug in a driver that should crash, or the driver > should not be registering broken error handlers? Correct. The error handler assignment and enablement is gated by RAS mapping in cxl_uport_init_ras_reporting() and cxl_dport_init_ras_reporting(). Terry
On Wed, 12 Feb 2025 18:08:13 -0600 "Bowman, Terry" <terry.bowman@amd.com> wrote: > On 2/12/2025 4:59 PM, Dan Williams wrote: > > Terry Bowman wrote: > >> The CXL RAS handlers do not currently log if the RAS registers are > >> unmapped. This is needed in order to help debug CXL error handling. Update > >> the CXL driver to log a warning message if the RAS register block is > >> unmapped. > >> > >> Also, add type check before processing EP or RCH DP. > >> > >> Signed-off-by: Terry Bowman <terry.bowman@amd.com> > >> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > >> Reviewed-by: Ira Weiny <ira.weiny@intel.com> > >> Reviewed-by: Gregory Price <gourry@gourry.net> > >> --- > >> drivers/cxl/core/pci.c | 20 ++++++++++++++------ > >> 1 file changed, 14 insertions(+), 6 deletions(-) > >> > >> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c > >> index 69bb030aa8e1..af809e7cbe3b 100644 > >> --- a/drivers/cxl/core/pci.c > >> +++ b/drivers/cxl/core/pci.c > >> @@ -658,15 +658,19 @@ static void __cxl_handle_cor_ras(struct device *dev, > >> void __iomem *addr; > >> u32 status; > >> > >> - if (!ras_base) > >> + if (!ras_base) { > >> + dev_warn_once(dev, "CXL RAS register block is not mapped"); > >> return; > >> + } > >> > >> addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; > >> status = readl(addr); > >> - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { > >> - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); > >> + if (!(status & CXL_RAS_CORRECTABLE_STATUS_MASK)) > >> + return; > >> + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); > >> + > >> + if (is_cxl_memdev(dev)) > >> trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status); > > I think trace_cxl_aer_correctable_error() should always fire and this > > should somehow be unified with the CPER record trace-event for protocol > > errors. > > > > The only usage of @memdev in this trace is retrieving the device serial > > number. If the device is not a memdev then print zero for the serial > > number, or something like that. > > > > In the end RAS daemon should only need to enable one trace event to get > > protocol errors and header logs from ports or endpoints, either > > natively, or via CPER. > > > That would be: we use 'struct *device' instead of 'struct *cxl_memdev' > and pass serial# in as a parameter (0 in non-EP cases)? For a USP may well have a serial number cap. Might be worth getting it? slightly nasty thing here will be change of memdev=%s to devname=%s or similar. Meh. It's a TP_printk() I'm not sure anyone will care. Need to be careful with the tracepoint itself though and the rasdaemon etc handling. https://github.com/mchehab/rasdaemon/blob/master/ras-cxl-handler.c#L351 We may have to just add another field. > > >> - } > >> } > >> > >> static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds) > >> @@ -702,8 +706,10 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base) > >> u32 status; > >> u32 fe; > >> > >> - if (!ras_base) > >> + if (!ras_base) { > >> + dev_warn_once(dev, "CXL RAS register block is not mapped"); > > Is this a "never can happen" print? It seems like an oversight in an > > upper layer to get this far down error reporting without the registers > > mapped. > > > > Like maybe this is a bug in a driver that should crash, or the driver > > should not be registering broken error handlers? > Correct. The error handler assignment and enablement is gated by RAS mapping > in cxl_uport_init_ras_reporting() and cxl_dport_init_ras_reporting(). > > Terry > > > > >
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 69bb030aa8e1..af809e7cbe3b 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -658,15 +658,19 @@ static void __cxl_handle_cor_ras(struct device *dev, void __iomem *addr; u32 status; - if (!ras_base) + if (!ras_base) { + dev_warn_once(dev, "CXL RAS register block is not mapped"); return; + } addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; status = readl(addr); - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); + if (!(status & CXL_RAS_CORRECTABLE_STATUS_MASK)) + return; + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); + + if (is_cxl_memdev(dev)) trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status); - } } static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds) @@ -702,8 +706,10 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base) u32 status; u32 fe; - if (!ras_base) + if (!ras_base) { + dev_warn_once(dev, "CXL RAS register block is not mapped"); return false; + } addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; status = readl(addr); @@ -722,7 +728,9 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base) } header_log_copy(ras_base, hl); - trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); + if (is_cxl_memdev(dev)) + trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); + writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); return true;