diff mbox series

[RFC,1/2] cxl: RAS: Multiple header recording support

Message ID 20230113154058.16227-2-Jonathan.Cameron@huawei.com
State New, archived
Headers show
Series CXL UE RAS Multiple Header Logging support | expand

Commit Message

Jonathan Cameron Jan. 13, 2023, 3:40 p.m. UTC
Similar to PCIe, CXL devices may support logging multiple headers
corresponding to multiple errors as reported via the CXL RAS capability.

Unlike PCIe, in CXL there is no Multiple Header Recording Enable bit
and the CXL r3.0 specification is sparse on details. As such, the
kernel should allow for any reasonable interpretation including
endpoints for which the capability bit is set that behave as per
the PCIe equivalent definitions (with assumption that the missing
'enable bit' is set). Note that behaving as if Multiple Headers
are being logged is also valid behavior when they are not, so this
approach should be safe with all sensible specification interpretations.

By repeatedly attempting to clear the single bit corresponding to the
reported First Error (this may take multiple attempts if the hardware tracks
multiple records of the same type), the additional header logs may be obtained.

Note that each trace record reports only the First Error (FE) in the status
field. We could instead report all set status bits, as is done when the
Multiple Header Recording capability is absent, but that seemed less
intuitive to me.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/cxl/core/pci.c | 17 ++++++++++++-----
 drivers/cxl/cxl.h      |  1 +
 2 files changed, 13 insertions(+), 5 deletions(-)

Comments

Dave Jiang Jan. 17, 2023, 6:05 p.m. UTC | #1
On 1/13/23 8:40 AM, Jonathan Cameron wrote:
> Similar to PCIe, CXL devices may support logging multiple headers
> corresponding to multiple errors as reported via the CXL RAS capability.
> 
> Unlike PCIe, in CXL there is no Multiple Header Recording Enable bit
> and the CXL r3.0 specification is sparse on details. As such, the
> kernel should allow for any reasonable interpretation including
> endpoints for which the capability bit is set that behave as per
> the PCIe equivalent definitions (with assumption that the missing
> 'enable bit' is set). Note that behaving as if Multiple Headers
> are being logged is also valid behavior when they are not so this
> approach should be safe with all sensible specification interpretations.
> 
> By repeatedly attempting to clear a single bit corresponding to the reported
> First Error (may need multiple goes if multiple records of same type
> are tracked by the hardware) the additional header logs may be obtained.
> 
> Note that each trace record only records the FE in the status.
> We could record them all as done without Multi header recording
> capability but that seemed less intuitive to me.
> 
> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

Looks reasonable

Reviewed-by: Dave Jiang <dave.jiang@intel.com>

> ---
>   drivers/cxl/core/pci.c | 17 ++++++++++++-----
>   drivers/cxl/cxl.h      |  1 +
>   2 files changed, 13 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 184ead6a2796..6fd311e313c6 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -673,10 +673,13 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
>   	void __iomem *addr;
>   	u32 status;
>   	u32 fe;
> +	bool mh;
>   
>   	if (!cxlds->regs.ras)
>   		return false;
>   
> +next_record:
> +	mh = false;
>   	addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
>   	status = readl(addr);
>   	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
> @@ -684,11 +687,13 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
>   
>   	/* If multiple errors, log header points to first error from ctrl reg */
>   	if (hweight32(status) > 1) {
> -		void __iomem *rcc_addr =
> -			cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET;
> -
> -		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
> -				   readl(rcc_addr)));
> +		u32 capctrl = readl(cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET);
> +		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, capctrl));
> +		if (FIELD_GET(CXL_RAS_CAP_CONTROL_MH_REC_CAP, capctrl)) {
> +			mh = true;
> +			/* Report and clear only first error */
> +			status = fe;
> +		}
>   	} else {
>   		fe = status;
>   	}
> @@ -696,6 +701,8 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
>   	header_log_copy(cxlds, hl);
>   	trace_cxl_aer_uncorrectable_error(dev, status, fe, hl);
>   	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
> +	if (mh)
> +		goto next_record;
>   
>   	return true;
>   }
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index aa3af3bb73b2..ee31a99073c2 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -138,6 +138,7 @@ static inline int ways_to_eiw(unsigned int ways, u8 *eiw)
>   #define   CXL_RAS_CORRECTABLE_MASK_MASK GENMASK(6, 0)
>   #define CXL_RAS_CAP_CONTROL_OFFSET 0x14
>   #define CXL_RAS_CAP_CONTROL_FE_MASK GENMASK(5, 0)
> +#define CXL_RAS_CAP_CONTROL_MH_REC_CAP BIT(9)
>   #define CXL_RAS_HEADER_LOG_OFFSET 0x18
>   #define CXL_RAS_CAPABILITY_LENGTH 0x58
>   #define CXL_HEADERLOG_SIZE SZ_512
diff mbox series

Patch

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 184ead6a2796..6fd311e313c6 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -673,10 +673,13 @@  static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
 	void __iomem *addr;
 	u32 status;
 	u32 fe;
+	bool mh;
 
 	if (!cxlds->regs.ras)
 		return false;
 
+next_record:
+	mh = false;
 	addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
 	status = readl(addr);
 	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
@@ -684,11 +687,13 @@  static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
 
 	/* If multiple errors, log header points to first error from ctrl reg */
 	if (hweight32(status) > 1) {
-		void __iomem *rcc_addr =
-			cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET;
-
-		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
-				   readl(rcc_addr)));
+		u32 capctrl = readl(cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET);
+		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, capctrl));
+		if (FIELD_GET(CXL_RAS_CAP_CONTROL_MH_REC_CAP, capctrl)) {
+			mh = true;
+			/* Report and clear only first error */
+			status = fe;
+		}
 	} else {
 		fe = status;
 	}
@@ -696,6 +701,8 @@  static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
 	header_log_copy(cxlds, hl);
 	trace_cxl_aer_uncorrectable_error(dev, status, fe, hl);
 	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
+	if (mh)
+		goto next_record;
 
 	return true;
 }
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index aa3af3bb73b2..ee31a99073c2 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -138,6 +138,7 @@  static inline int ways_to_eiw(unsigned int ways, u8 *eiw)
 #define   CXL_RAS_CORRECTABLE_MASK_MASK GENMASK(6, 0)
 #define CXL_RAS_CAP_CONTROL_OFFSET 0x14
 #define CXL_RAS_CAP_CONTROL_FE_MASK GENMASK(5, 0)
+#define CXL_RAS_CAP_CONTROL_MH_REC_CAP BIT(9)
 #define CXL_RAS_HEADER_LOG_OFFSET 0x18
 #define CXL_RAS_CAPABILITY_LENGTH 0x58
 #define CXL_HEADERLOG_SIZE SZ_512