diff mbox series

[v6,11/11] cxl/pci: Add callback to log AER correctable error

Message ID 166985287203.2871899.13605149073500556137.stgit@djiang5-desk3.ch.intel.com
State Accepted
Commit 6155ccc9ddf6642056f1c00c2851d1938d27a7f2
Headers show
Series None | expand

Commit Message

Dave Jiang Dec. 1, 2022, 12:02 a.m. UTC
Add AER error handler callback to read the RAS capability structure
correctable error (CE) status register for the CXL device. Log the
error as a trace event and clear the error. For CXL devices, the driver
also needs to write back to the status register to clear the
unmasked correctable errors.

See CXL spec rev3.0 8.2.4.16 for RAS capability structure CE Status
Register.

Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---

v6:
- Update commit log to point to RAS capability structure. (Bjorn)
- Change cxl_correctable_error_logging() to cxl_cor_error_detected().
  (Bjorn)

 drivers/cxl/pci.c |   20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

Comments

Bowman, Terry Dec. 7, 2022, 8:04 p.m. UTC | #1
Hi Dave,

On 11/30/22 18:02, Dave Jiang wrote:
> Add AER error handler callback to read the RAS capability structure
> correctable error (CE) status register for the CXL device. Log the
> error as a trace event and clear the error. For CXL devices, the driver
> also needs to write back to the status register to clear the
> unmasked correctable errors.
> 
> See CXL spec rev3.0 8.2.4.16 for RAS capability structure CE Status
> Register.
> 
> Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> ---
> 
> v6:
> - Update commit log to point to RAS capability structure. (Bjorn)
> - Change cxl_correctable_error_logging() to cxl_cor_error_detected().
>   (Bjorn)
> 
>  drivers/cxl/pci.c |   20 ++++++++++++++++++++
>  1 file changed, 20 insertions(+)
> 
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 11f842df9807..02342830b612 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -622,10 +622,30 @@ static void cxl_error_resume(struct pci_dev *pdev)
>  		 dev->driver ? "successful" : "failed");
>  }
>  
> +static void cxl_cor_error_detected(struct pci_dev *pdev)
> +{
> +	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +	struct cxl_memdev *cxlmd = cxlds->cxlmd;
> +	struct device *dev = &cxlmd->dev;
> +	void __iomem *addr;
> +	u32 status;
> +
> +	if (!cxlds->regs.ras)
> +		return;
> +
> +	addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
> +	status = le32_to_cpu(readl(addr));
> +	if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
> +		writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
> +		trace_cxl_aer_correctable_error(dev_name(dev), status);
> +	}
> +}
> +

This will log PCI AER CEs only if there is also a RAS CE. My understanding
(could be the problem) is AER CE's are normally reported. Will this be inconsistent
with other error AER CE handling?

Regards,
Terry 

>  static const struct pci_error_handlers cxl_error_handlers = {
>  	.error_detected	= cxl_error_detected,
>  	.slot_reset	= cxl_slot_reset,
>  	.resume		= cxl_error_resume,
> +	.cor_error_detected	= cxl_cor_error_detected,
>  };
>  
>  static struct pci_driver cxl_pci_driver = {
> 
>
Bjorn Helgaas Dec. 7, 2022, 8:29 p.m. UTC | #2
Hi Terry,

On Wed, Dec 07, 2022 at 02:04:17PM -0600, Terry Bowman wrote:
> On 11/30/22 18:02, Dave Jiang wrote:
> > Add AER error handler callback to read the RAS capability structure
> > correctable error (CE) status register for the CXL device. Log the
> > error as a trace event and clear the error. For CXL devices, the driver
> > also needs to write back to the status register to clear the
> > unmasked correctable errors.
> > 
> > See CXL spec rev3.0 8.2.4.16 for RAS capability structure CE Status
> > Register.
> > 
> > Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> > ---
> > 
> > v6:
> > - Update commit log to point to RAS capability structure. (Bjorn)
> > - Change cxl_correctable_error_logging() to cxl_cor_error_detected().
> >   (Bjorn)
> > 
> >  drivers/cxl/pci.c |   20 ++++++++++++++++++++
> >  1 file changed, 20 insertions(+)
> > 
> > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > index 11f842df9807..02342830b612 100644
> > --- a/drivers/cxl/pci.c
> > +++ b/drivers/cxl/pci.c
> > @@ -622,10 +622,30 @@ static void cxl_error_resume(struct pci_dev *pdev)
> >  		 dev->driver ? "successful" : "failed");
> >  }
> >  
> > +static void cxl_cor_error_detected(struct pci_dev *pdev)
> > +{
> > +	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> > +	struct cxl_memdev *cxlmd = cxlds->cxlmd;
> > +	struct device *dev = &cxlmd->dev;
> > +	void __iomem *addr;
> > +	u32 status;
> > +
> > +	if (!cxlds->regs.ras)
> > +		return;
> > +
> > +	addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
> > +	status = le32_to_cpu(readl(addr));
> > +	if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
> > +		writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
> > +		trace_cxl_aer_correctable_error(dev_name(dev), status);
> > +	}
> > +}
> > +
> 
> This will log PCI AER CEs only if there is also a RAS CE. My
> understanding (could be the problem) is AER CE's are normally
> reported. Will this be inconsistent with other error AER CE
> handling?

I can't quite parse this, so let me ramble and see if we accidentally
converge on some understanding :)

cxl_cor_error_detected() is the .cor_error_detected handler, which is
called by the AER code in the PCI core.  So IIUC, we'll only get here
if that PCI core AER code is invoked via an AER interrupt, AER
polling, or an event from the ACPI APEI framework.

So I would expect "this will only log CXL RAS CEs if there is a PCI
AER CE", which is the opposite of what you said.  But I have no idea
at all about how CXL RAS CEs work.

It looks like aer_enable_rootport() sets PCI_ERR_ROOT_CMD_COR_EN, so I
would expect that AER CEs normally cause interrupts and would be
discovered that way.

> >  static const struct pci_error_handlers cxl_error_handlers = {
> >  	.error_detected	= cxl_error_detected,
> >  	.slot_reset	= cxl_slot_reset,
> >  	.resume		= cxl_error_resume,
> > +	.cor_error_detected	= cxl_cor_error_detected,
> >  };
> >  
> >  static struct pci_driver cxl_pci_driver = {
> > 
> > 
>
Bowman, Terry Dec. 7, 2022, 8:54 p.m. UTC | #3
Hi Bjorn,

On 12/7/22 14:29, Bjorn Helgaas wrote:
> Hi Terry,
> 
> On Wed, Dec 07, 2022 at 02:04:17PM -0600, Terry Bowman wrote:
>> On 11/30/22 18:02, Dave Jiang wrote:
>>> Add AER error handler callback to read the RAS capability structure
>>> correctable error (CE) status register for the CXL device. Log the
>>> error as a trace event and clear the error. For CXL devices, the driver
>>> also needs to write back to the status register to clear the
>>> unmasked correctable errors.
>>>
>>> See CXL spec rev3.0 8.2.4.16 for RAS capability structure CE Status
>>> Register.
>>>
>>> Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>>> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>>> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>>> ---
>>>
>>> v6:
>>> - Update commit log to point to RAS capability structure. (Bjorn)
>>> - Change cxl_correctable_error_logging() to cxl_cor_error_detected().
>>>   (Bjorn)
>>>
>>>  drivers/cxl/pci.c |   20 ++++++++++++++++++++
>>>  1 file changed, 20 insertions(+)
>>>
>>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
>>> index 11f842df9807..02342830b612 100644
>>> --- a/drivers/cxl/pci.c
>>> +++ b/drivers/cxl/pci.c
>>> @@ -622,10 +622,30 @@ static void cxl_error_resume(struct pci_dev *pdev)
>>>  		 dev->driver ? "successful" : "failed");
>>>  }
>>>  
>>> +static void cxl_cor_error_detected(struct pci_dev *pdev)
>>> +{
>>> +	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>>> +	struct cxl_memdev *cxlmd = cxlds->cxlmd;
>>> +	struct device *dev = &cxlmd->dev;
>>> +	void __iomem *addr;
>>> +	u32 status;
>>> +
>>> +	if (!cxlds->regs.ras)
>>> +		return;
>>> +
>>> +	addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
>>> +	status = le32_to_cpu(readl(addr));
>>> +	if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
>>> +		writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
>>> +		trace_cxl_aer_correctable_error(dev_name(dev), status);
>>> +	}
>>> +}
>>> +
>>
>> This will log PCI AER CEs only if there is also a RAS CE. My
>> understanding (could be the problem) is AER CE's are normally
>> reported. Will this be inconsistent with other error AER CE
>> handling?
> 
> I can't quite parse this, so let me ramble and see if we accidentally
> converge on some understanding :)
> 
> cxl_cor_error_detected() is the .cor_error_detected handler, which is
> called by the AER code in the PCI core.  So IIUC, we'll only get here
> if that PCI core AER code is invoked via an AER interrupt, AER
> polling, or an event from the ACPI APEI framework.
> 
> So I would expect "this will only log CXL RAS CEs if there is a PCI
> AER CE", which is the opposite of what you said.  But I have no idea
> at all about how CXL RAS CEs work.
> 
> It looks like aer_enable_rootport() sets PCI_ERR_ROOT_CMD_COR_EN, so I
> would expect that AER CEs normally cause interrupts and would be
> discovered that way.
> 

Thanks for the details. I realized after I sent the email that 
cxl_aer_uncorrectable_error() and cxl_aer_correctable_error() trace commands 
are logging the RAS registers.

Regards,
Terry

>>>  static const struct pci_error_handlers cxl_error_handlers = {
>>>  	.error_detected	= cxl_error_detected,
>>>  	.slot_reset	= cxl_slot_reset,
>>>  	.resume		= cxl_error_resume,
>>> +	.cor_error_detected	= cxl_cor_error_detected,
>>>  };
>>>  
>>>  static struct pci_driver cxl_pci_driver = {
>>>
>>>
>>
diff mbox series

Patch

diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 11f842df9807..02342830b612 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -622,10 +622,30 @@  static void cxl_error_resume(struct pci_dev *pdev)
 		 dev->driver ? "successful" : "failed");
 }
 
+static void cxl_cor_error_detected(struct pci_dev *pdev)
+{
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+	struct cxl_memdev *cxlmd = cxlds->cxlmd;
+	struct device *dev = &cxlmd->dev;
+	void __iomem *addr;
+	u32 status;
+
+	if (!cxlds->regs.ras)
+		return;
+
+	addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
+	status = le32_to_cpu(readl(addr));
+	if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
+		writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
+		trace_cxl_aer_correctable_error(dev_name(dev), status);
+	}
+}
+
 static const struct pci_error_handlers cxl_error_handlers = {
 	.error_detected	= cxl_error_detected,
 	.slot_reset	= cxl_slot_reset,
 	.resume		= cxl_error_resume,
+	.cor_error_detected	= cxl_cor_error_detected,
 };
 
 static struct pci_driver cxl_pci_driver = {