diff mbox series

[v3,6/6] PCI/AER: Unmask RCEC internal errors to enable RCH downstream port error handling

Message ID 20230411180302.2678736-7-terry.bowman@amd.com
State Superseded
Headers show
Series cxl/pci: Add support for RCH RAS error handling | expand

Commit Message

Bowman, Terry April 11, 2023, 6:03 p.m. UTC
From: Robert Richter <rrichter@amd.com>

RCEC AER corrected and uncorrectable internal errors (CIE/UIE) are
disabled by default. [1][2] Enable them to receive CXL downstream port
errors of a Restricted CXL Host (RCH).

[1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors
[2] PCIe Base Spec 6.0, 7.8.4.3 Uncorrectable Error Mask Register,
    7.8.4.6 Correctable Error Mask Register

Co-developed-by: Terry Bowman <terry.bowman@amd.com>
Signed-off-by: Robert Richter <rrichter@amd.com>
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-pci@vger.kernel.org
---
 drivers/pci/pcie/aer.c | 73 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

Comments

Bjorn Helgaas April 12, 2023, 9:29 p.m. UTC | #1
On Tue, Apr 11, 2023 at 01:03:02PM -0500, Terry Bowman wrote:
> From: Robert Richter <rrichter@amd.com>
> 
> RCEC AER corrected and uncorrectable internal errors (CIE/UIE) are
> disabled by default.

"Disabled by default" just means "the power-up state of CIE/UIC is
that they are masked", right?  It doesn't mean that Linux normally
masks them.

> [1][2] Enable them to receive CXL downstream port
> errors of a Restricted CXL Host (RCH).
> 
> [1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors
> [2] PCIe Base Spec 6.0, 7.8.4.3 Uncorrectable Error Mask Register,
>     7.8.4.6 Correctable Error Mask Register
> 
> Co-developed-by: Terry Bowman <terry.bowman@amd.com>
> Signed-off-by: Robert Richter <rrichter@amd.com>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Cc: "Oliver O'Halloran" <oohall@gmail.com>
> Cc: Bjorn Helgaas <bhelgaas@google.com>
> Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: linux-pci@vger.kernel.org
> ---
>  drivers/pci/pcie/aer.c | 73 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 73 insertions(+)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 171a08fd8ebd..3973c731e11d 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -1000,7 +1000,79 @@ static void cxl_handle_error(struct pci_dev *dev, struct aer_err_info *info)
>  		pcie_walk_rcec(dev, cxl_handle_error_iter, info);
>  }
>  
> +static bool cxl_error_is_native(struct pci_dev *dev)
> +{
> +	struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
> +
> +	if (pcie_ports_native)
> +		return true;
> +
> +	return host->native_aer && host->native_cxl_error;
> +}
> +
> +static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
> +{
> +	int *handles_cxl = data;
> +
> +	*handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev);
> +
> +	return *handles_cxl;
> +}
> +
> +static bool handles_cxl_errors(struct pci_dev *rcec)
> +{
> +	int handles_cxl = 0;
> +
> +	if (!rcec->aer_cap)
> +		return false;
> +
> +	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC)
> +		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
> +
> +	return !!handles_cxl;
> +}
> +
> +static int __cxl_unmask_internal_errors(struct pci_dev *rcec)
> +{
> +	int aer, rc;
> +	u32 mask;
> +
> +	/*
> +	 * Internal errors are masked by default, unmask RCEC's here
> +	 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
> +	 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
> +	 */

Unmasking internal errors doesn't have anything specific to do with
CXL, so I don't think it should have "cxl" in the function name.
Maybe something like "pci_aer_unmask_internal_errors()".

This also has nothing special to do with RCECs, so I think we should
refer to the device as "dev" as is typical in this file.

I think this needs to check pcie_aer_is_native() as is done by
pci_aer_clear_nonfatal_status() and other functions that write the AER
Capability.

With the exception of this function, this patch looks like all CXL
code that maybe could be with other CXL code.  Would require making
pcie_walk_rcec() available outside drivers/pci, I guess.

> +	aer = rcec->aer_cap;
> +	rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask);
> +	if (rc)
> +		return rc;
> +	mask &= ~PCI_ERR_UNC_INTN;
> +	rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask);
> +	if (rc)
> +		return rc;
> +
> +	rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask);
> +	if (rc)
> +		return rc;
> +	mask &= ~PCI_ERR_COR_INTERNAL;
> +	rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask);
> +
> +	return rc;
> +}
> +
> +static void cxl_unmask_internal_errors(struct pci_dev *rcec)
> +{
> +	if (!handles_cxl_errors(rcec))
> +		return;
> +
> +	if (__cxl_unmask_internal_errors(rcec))
> +		dev_err(&rcec->dev, "cxl: Failed to unmask internal errors");
> +	else
> +		dev_dbg(&rcec->dev, "cxl: Internal errors unmasked");
> +}
> +
>  #else
> +static inline void cxl_unmask_internal_errors(struct pci_dev *dev) { }
>  static inline void cxl_handle_error(struct pci_dev *dev,
>  				    struct aer_err_info *info) { }
>  #endif
> @@ -1397,6 +1469,7 @@ static int aer_probe(struct pcie_device *dev)
>  		return status;
>  	}
>  
> +	cxl_unmask_internal_errors(port);
>  	aer_enable_rootport(rpc);
>  	pci_info(port, "enabled with IRQ %d\n", dev->irq);
>  	return 0;
> -- 
> 2.34.1
>
Robert Richter April 13, 2023, 1:38 p.m. UTC | #2
On 12.04.23 16:29:01, Bjorn Helgaas wrote:
> On Tue, Apr 11, 2023 at 01:03:02PM -0500, Terry Bowman wrote:
> > From: Robert Richter <rrichter@amd.com>
> > 
> > RCEC AER corrected and uncorrectable internal errors (CIE/UIE) are
> > disabled by default.
> 
> "Disabled by default" just means "the power-up state of CIE/UIC is
> that they are masked", right?  It doesn't mean that Linux normally
> masks them.

Yes, will change the wording here.

> > [1][2] Enable them to receive CXL downstream port
> > errors of a Restricted CXL Host (RCH).
> > 
> > [1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors
> > [2] PCIe Base Spec 6.0, 7.8.4.3 Uncorrectable Error Mask Register,
> >     7.8.4.6 Correctable Error Mask Register
> > 
> > Co-developed-by: Terry Bowman <terry.bowman@amd.com>
> > Signed-off-by: Robert Richter <rrichter@amd.com>
> > Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> > Cc: "Oliver O'Halloran" <oohall@gmail.com>
> > Cc: Bjorn Helgaas <bhelgaas@google.com>
> > Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
> > Cc: linuxppc-dev@lists.ozlabs.org
> > Cc: linux-pci@vger.kernel.org
> > ---
> >  drivers/pci/pcie/aer.c | 73 ++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 73 insertions(+)
> > 
> > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> > index 171a08fd8ebd..3973c731e11d 100644
> > --- a/drivers/pci/pcie/aer.c
> > +++ b/drivers/pci/pcie/aer.c
> > @@ -1000,7 +1000,79 @@ static void cxl_handle_error(struct pci_dev *dev, struct aer_err_info *info)
> >  		pcie_walk_rcec(dev, cxl_handle_error_iter, info);
> >  }
> >  
> > +static bool cxl_error_is_native(struct pci_dev *dev)
> > +{
> > +	struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
> > +
> > +	if (pcie_ports_native)
> > +		return true;
> > +
> > +	return host->native_aer && host->native_cxl_error;
> > +}
> > +
> > +static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
> > +{
> > +	int *handles_cxl = data;
> > +
> > +	*handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev);
> > +
> > +	return *handles_cxl;
> > +}
> > +
> > +static bool handles_cxl_errors(struct pci_dev *rcec)
> > +{
> > +	int handles_cxl = 0;
> > +
> > +	if (!rcec->aer_cap)
> > +		return false;
> > +
> > +	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC)
> > +		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
> > +
> > +	return !!handles_cxl;
> > +}
> > +
> > +static int __cxl_unmask_internal_errors(struct pci_dev *rcec)
> > +{
> > +	int aer, rc;
> > +	u32 mask;
> > +
> > +	/*
> > +	 * Internal errors are masked by default, unmask RCEC's here
> > +	 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
> > +	 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
> > +	 */
> 
> Unmasking internal errors doesn't have anything specific to do with
> CXL, so I don't think it should have "cxl" in the function name.
> Maybe something like "pci_aer_unmask_internal_errors()".

Since it is static I renamed it to aer_unmask_internal_errors() and
also moved it to the beginning of the #ifdef block for easier later
reuse.

> 
> This also has nothing special to do with RCECs, so I think we should
> refer to the device as "dev" as is typical in this file.

Changed.

> 
> I think this needs to check pcie_aer_is_native() as is done by
> pci_aer_clear_nonfatal_status() and other functions that write the AER
> Capability.

Also added the check to aer_unmask_internal_errors(). There was a
check for native_* in handles_cxl_errors() already, but only for the
pci devs of the RCEC. I added a check of the RCEC there too.

> 
> With the exception of this function, this patch looks like all CXL
> code that maybe could be with other CXL code.  Would require making
> pcie_walk_rcec() available outside drivers/pci, I guess.

Even this is CXL code, it implements AER support and fits better here
around AER code. Export of pcie_walk_rcec() (and others?) is not the
main issue here. CXL drivers can come as modules and would need to
register a hook at the aer handler.  This would add even more
complexity here. In contrast, current solution just adds two functions
for enablement and handling which are empty stubs if code is disabled.

I could move that code to aer_cxl.c similar to aer_inject.c. Since the
CXL part is small compared to the remaining aer code I left it in
aer.c. Also, it is guarded by #ifdef which additionally encapsulates
it.

> 
> > +	aer = rcec->aer_cap;
> > +	rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask);
> > +	if (rc)
> > +		return rc;
> > +	mask &= ~PCI_ERR_UNC_INTN;
> > +	rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask);
> > +	if (rc)
> > +		return rc;
> > +
> > +	rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask);
> > +	if (rc)
> > +		return rc;
> > +	mask &= ~PCI_ERR_COR_INTERNAL;
> > +	rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask);
> > +
> > +	return rc;
> > +}
> > +
> > +static void cxl_unmask_internal_errors(struct pci_dev *rcec)

Also renaming this to cxl_enable_rcec() to more generalize the
function.

> > +{
> > +	if (!handles_cxl_errors(rcec))
> > +		return;
> > +
> > +	if (__cxl_unmask_internal_errors(rcec))
> > +		dev_err(&rcec->dev, "cxl: Failed to unmask internal errors");
> > +	else
> > +		dev_dbg(&rcec->dev, "cxl: Internal errors unmasked");

I am going to change this to a pci_info() for alignment with other
messages around:

[   14.200265] pcieport 0000:40:00.3: PME: Signaling with IRQ 44
[   14.213925] pcieport 0000:40:00.3: AER: cxl: Internal errors unmasked
[   14.228413] pcieport 0000:40:00.3: AER: enabled with IRQ 44

Plus, using pci_err() instead of dev_err().

> > +}
> > +
> >  #else
> > +static inline void cxl_unmask_internal_errors(struct pci_dev *dev) { }
> >  static inline void cxl_handle_error(struct pci_dev *dev,
> >  				    struct aer_err_info *info) { }
> >  #endif
> > @@ -1397,6 +1469,7 @@ static int aer_probe(struct pcie_device *dev)
> >  		return status;
> >  	}
> >  
> > +	cxl_unmask_internal_errors(port);
> >  	aer_enable_rootport(rpc);
> >  	pci_info(port, "enabled with IRQ %d\n", dev->irq);
> >  	return 0;
> > -- 
> > 2.34.1
> > 

Thanks for review,

-Robert
Jonathan Cameron April 13, 2023, 5:01 p.m. UTC | #3
On Wed, 12 Apr 2023 16:29:01 -0500
Bjorn Helgaas <helgaas@kernel.org> wrote:

> On Tue, Apr 11, 2023 at 01:03:02PM -0500, Terry Bowman wrote:
> > From: Robert Richter <rrichter@amd.com>
> > 
> > RCEC AER corrected and uncorrectable internal errors (CIE/UIE) are
> > disabled by default.  
> 
> "Disabled by default" just means "the power-up state of CIE/UIC is
> that they are masked", right?  It doesn't mean that Linux normally
> masks them.
> 
> > [1][2] Enable them to receive CXL downstream port
> > errors of a Restricted CXL Host (RCH).
> > 
> > [1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors
> > [2] PCIe Base Spec 6.0, 7.8.4.3 Uncorrectable Error Mask Register,
> >     7.8.4.6 Correctable Error Mask Register
> > 
> > Co-developed-by: Terry Bowman <terry.bowman@amd.com>
> > Signed-off-by: Robert Richter <rrichter@amd.com>
> > Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> > Cc: "Oliver O'Halloran" <oohall@gmail.com>
> > Cc: Bjorn Helgaas <bhelgaas@google.com>
> > Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
> > Cc: linuxppc-dev@lists.ozlabs.org
> > Cc: linux-pci@vger.kernel.org
> > ---
> >  drivers/pci/pcie/aer.c | 73 ++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 73 insertions(+)
> > 
> > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> > index 171a08fd8ebd..3973c731e11d 100644
> > --- a/drivers/pci/pcie/aer.c
> > +++ b/drivers/pci/pcie/aer.c
> > @@ -1000,7 +1000,79 @@ static void cxl_handle_error(struct pci_dev *dev, struct aer_err_info *info)
> >  		pcie_walk_rcec(dev, cxl_handle_error_iter, info);
> >  }
> >  
> > +static bool cxl_error_is_native(struct pci_dev *dev)
> > +{
> > +	struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
> > +
> > +	if (pcie_ports_native)
> > +		return true;
> > +
> > +	return host->native_aer && host->native_cxl_error;
> > +}
> > +
> > +static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
> > +{
> > +	int *handles_cxl = data;
> > +
> > +	*handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev);
> > +
> > +	return *handles_cxl;
> > +}
> > +
> > +static bool handles_cxl_errors(struct pci_dev *rcec)
> > +{
> > +	int handles_cxl = 0;
> > +
> > +	if (!rcec->aer_cap)
> > +		return false;
> > +
> > +	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC)
> > +		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
> > +
> > +	return !!handles_cxl;
> > +}
> > +
> > +static int __cxl_unmask_internal_errors(struct pci_dev *rcec)
> > +{
> > +	int aer, rc;
> > +	u32 mask;
> > +
> > +	/*
> > +	 * Internal errors are masked by default, unmask RCEC's here
> > +	 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
> > +	 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
> > +	 */  
> 
> Unmasking internal errors doesn't have anything specific to do with
> CXL, so I don't think it should have "cxl" in the function name.
> Maybe something like "pci_aer_unmask_internal_errors()".

This reminds me.  Not sure we resolved earlier discussion on changing
the system wide policy to turn these on 
https://lore.kernel.org/linux-cxl/20221229172731.GA611562@bhelgaas/
which needs pretty much the same thing.

Ira, I think you were picking this one up?
https://lore.kernel.org/linux-cxl/63e5fb533f304_13244829412@iweiny-mobl.notmuch/

Thanks,

Jonathan


> 
> This also has nothing special to do with RCECs, so I think we should
> refer to the device as "dev" as is typical in this file.
> 
> I think this needs to check pcie_aer_is_native() as is done by
> pci_aer_clear_nonfatal_status() and other functions that write the AER
> Capability.
> 
> With the exception of this function, this patch looks like all CXL
> code that maybe could be with other CXL code.  Would require making
> pcie_walk_rcec() available outside drivers/pci, I guess.
> 
> > +	aer = rcec->aer_cap;
> > +	rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask);
> > +	if (rc)
> > +		return rc;
> > +	mask &= ~PCI_ERR_UNC_INTN;
> > +	rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask);
> > +	if (rc)
> > +		return rc;
> > +
> > +	rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask);
> > +	if (rc)
> > +		return rc;
> > +	mask &= ~PCI_ERR_COR_INTERNAL;
> > +	rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask);
> > +
> > +	return rc;
> > +}
> > +
> > +static void cxl_unmask_internal_errors(struct pci_dev *rcec)
> > +{
> > +	if (!handles_cxl_errors(rcec))
> > +		return;
> > +
> > +	if (__cxl_unmask_internal_errors(rcec))
> > +		dev_err(&rcec->dev, "cxl: Failed to unmask internal errors");
> > +	else
> > +		dev_dbg(&rcec->dev, "cxl: Internal errors unmasked");
> > +}
> > +
> >  #else
> > +static inline void cxl_unmask_internal_errors(struct pci_dev *dev) { }
> >  static inline void cxl_handle_error(struct pci_dev *dev,
> >  				    struct aer_err_info *info) { }
> >  #endif
> > @@ -1397,6 +1469,7 @@ static int aer_probe(struct pcie_device *dev)
> >  		return status;
> >  	}
> >  
> > +	cxl_unmask_internal_errors(port);
> >  	aer_enable_rootport(rpc);
> >  	pci_info(port, "enabled with IRQ %d\n", dev->irq);
> >  	return 0;
> > -- 
> > 2.34.1
> >
Jonathan Cameron April 13, 2023, 5:05 p.m. UTC | #4
On Thu, 13 Apr 2023 15:38:07 +0200
Robert Richter <rrichter@amd.com> wrote:

> On 12.04.23 16:29:01, Bjorn Helgaas wrote:
> > On Tue, Apr 11, 2023 at 01:03:02PM -0500, Terry Bowman wrote:  
> > > From: Robert Richter <rrichter@amd.com>
> > > 
> > > RCEC AER corrected and uncorrectable internal errors (CIE/UIE) are
> > > disabled by default.  
> > 
> > "Disabled by default" just means "the power-up state of CIE/UIC is
> > that they are masked", right?  It doesn't mean that Linux normally
> > masks them.  
> 
> Yes, will change the wording here.
> 
> > > [1][2] Enable them to receive CXL downstream port
> > > errors of a Restricted CXL Host (RCH).
> > > 
> > > [1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors
> > > [2] PCIe Base Spec 6.0, 7.8.4.3 Uncorrectable Error Mask Register,
> > >     7.8.4.6 Correctable Error Mask Register
> > > 
> > > Co-developed-by: Terry Bowman <terry.bowman@amd.com>
> > > Signed-off-by: Robert Richter <rrichter@amd.com>
> > > Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> > > Cc: "Oliver O'Halloran" <oohall@gmail.com>
> > > Cc: Bjorn Helgaas <bhelgaas@google.com>
> > > Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
> > > Cc: linuxppc-dev@lists.ozlabs.org
> > > Cc: linux-pci@vger.kernel.org
> > > ---
> > >  drivers/pci/pcie/aer.c | 73 ++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 73 insertions(+)
> > > 
> > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> > > index 171a08fd8ebd..3973c731e11d 100644
> > > --- a/drivers/pci/pcie/aer.c
> > > +++ b/drivers/pci/pcie/aer.c
> > > @@ -1000,7 +1000,79 @@ static void cxl_handle_error(struct pci_dev *dev, struct aer_err_info *info)
> > >  		pcie_walk_rcec(dev, cxl_handle_error_iter, info);
> > >  }
> > >  
> > > +static bool cxl_error_is_native(struct pci_dev *dev)
> > > +{
> > > +	struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
> > > +
> > > +	if (pcie_ports_native)
> > > +		return true;
> > > +
> > > +	return host->native_aer && host->native_cxl_error;
> > > +}
> > > +
> > > +static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
> > > +{
> > > +	int *handles_cxl = data;
> > > +
> > > +	*handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev);
> > > +
> > > +	return *handles_cxl;
> > > +}
> > > +
> > > +static bool handles_cxl_errors(struct pci_dev *rcec)
> > > +{
> > > +	int handles_cxl = 0;
> > > +
> > > +	if (!rcec->aer_cap)
> > > +		return false;
> > > +
> > > +	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC)
> > > +		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
> > > +
> > > +	return !!handles_cxl;
> > > +}
> > > +
> > > +static int __cxl_unmask_internal_errors(struct pci_dev *rcec)
> > > +{
> > > +	int aer, rc;
> > > +	u32 mask;
> > > +
> > > +	/*
> > > +	 * Internal errors are masked by default, unmask RCEC's here
> > > +	 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
> > > +	 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
> > > +	 */  
> > 
> > Unmasking internal errors doesn't have anything specific to do with
> > CXL, so I don't think it should have "cxl" in the function name.
> > Maybe something like "pci_aer_unmask_internal_errors()".  
> 
> Since it is static I renamed it to aer_unmask_internal_errors() and
> also moved it to the beginning of the #ifdef block for easier later
> reuse.
> 
> > 
> > This also has nothing special to do with RCECs, so I think we should
> > refer to the device as "dev" as is typical in this file.  
> 
> Changed.
> 
> > 
> > I think this needs to check pcie_aer_is_native() as is done by
> > pci_aer_clear_nonfatal_status() and other functions that write the AER
> > Capability.  
> 
> Also added the check to aer_unmask_internal_errors(). There was a
> check for native_* in handles_cxl_errors() already, but only for the
> pci devs of the RCEC. I added a check of the RCEC there too.
> 
> > 
> > With the exception of this function, this patch looks like all CXL
> > code that maybe could be with other CXL code.  Would require making
> > pcie_walk_rcec() available outside drivers/pci, I guess.  
> 
> Even this is CXL code, it implements AER support and fits better here
> around AER code. Export of pcie_walk_rcec() (and others?) is not the
> main issue here. CXL drivers can come as modules and would need to
> register a hook at the aer handler.  This would add even more
> complexity here. In contrast, current solution just adds two functions
> for enablement and handling which are empty stubs if code is disabled.
> 
> I could move that code to aer_cxl.c similar to aer_inject.c. Since the
> CXL part is small compared to the remaining aer code I left it in
> aer.c. Also, it is guarded by #ifdef which additionally encapsulates
> it.
> 

To throw another option in there (what Bjorn suggested IIRC for the more
general case..) 

Just enable internal errors always.  No need to know if they are CXL
or something else.

There will/might be fallout and it will be fun.

Jonathan

> >   
> > > +	aer = rcec->aer_cap;
> > > +	rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask);
> > > +	if (rc)
> > > +		return rc;
> > > +	mask &= ~PCI_ERR_UNC_INTN;
> > > +	rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask);
> > > +	if (rc)
> > > +		return rc;
> > > +
> > > +	rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask);
> > > +	if (rc)
> > > +		return rc;
> > > +	mask &= ~PCI_ERR_COR_INTERNAL;
> > > +	rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask);
> > > +
> > > +	return rc;
> > > +}
> > > +
> > > +static void cxl_unmask_internal_errors(struct pci_dev *rcec)  
> 
> Also renaming this to cxl_enable_rcec() to more generalize the
> function.
> 
> > > +{
> > > +	if (!handles_cxl_errors(rcec))
> > > +		return;
> > > +
> > > +	if (__cxl_unmask_internal_errors(rcec))
> > > +		dev_err(&rcec->dev, "cxl: Failed to unmask internal errors");
> > > +	else
> > > +		dev_dbg(&rcec->dev, "cxl: Internal errors unmasked");  
> 
> I am going to change this to a pci_info() for alignment with other
> messages around:
> 
> [   14.200265] pcieport 0000:40:00.3: PME: Signaling with IRQ 44
> [   14.213925] pcieport 0000:40:00.3: AER: cxl: Internal errors unmasked
> [   14.228413] pcieport 0000:40:00.3: AER: enabled with IRQ 44
> 
> Plus, using pci_err() instead of dev_err().
> 
> > > +}
> > > +
> > >  #else
> > > +static inline void cxl_unmask_internal_errors(struct pci_dev *dev) { }
> > >  static inline void cxl_handle_error(struct pci_dev *dev,
> > >  				    struct aer_err_info *info) { }
> > >  #endif
> > > @@ -1397,6 +1469,7 @@ static int aer_probe(struct pcie_device *dev)
> > >  		return status;
> > >  	}
> > >  
> > > +	cxl_unmask_internal_errors(port);
> > >  	aer_enable_rootport(rpc);
> > >  	pci_info(port, "enabled with IRQ %d\n", dev->irq);
> > >  	return 0;
> > > -- 
> > > 2.34.1
> > >   
> 
> Thanks for review,
> 
> -Robert
Ira Weiny April 13, 2023, 10:52 p.m. UTC | #5
Jonathan Cameron wrote:
> On Wed, 12 Apr 2023 16:29:01 -0500
> Bjorn Helgaas <helgaas@kernel.org> wrote:
> 
> > On Tue, Apr 11, 2023 at 01:03:02PM -0500, Terry Bowman wrote:
> > > From: Robert Richter <rrichter@amd.com>
> > > 
> > > RCEC AER corrected and uncorrectable internal errors (CIE/UIE) are
> > > disabled by default.  
> > 
> > "Disabled by default" just means "the power-up state of CIE/UIC is
> > that they are masked", right?  It doesn't mean that Linux normally
> > masks them.
> > 
> > > [1][2] Enable them to receive CXL downstream port
> > > errors of a Restricted CXL Host (RCH).
> > > 
> > > [1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors
> > > [2] PCIe Base Spec 6.0, 7.8.4.3 Uncorrectable Error Mask Register,
> > >     7.8.4.6 Correctable Error Mask Register
> > > 
> > > Co-developed-by: Terry Bowman <terry.bowman@amd.com>
> > > Signed-off-by: Robert Richter <rrichter@amd.com>
> > > Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> > > Cc: "Oliver O'Halloran" <oohall@gmail.com>
> > > Cc: Bjorn Helgaas <bhelgaas@google.com>
> > > Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
> > > Cc: linuxppc-dev@lists.ozlabs.org
> > > Cc: linux-pci@vger.kernel.org
> > > ---
> > >  drivers/pci/pcie/aer.c | 73 ++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 73 insertions(+)
> > > 
> > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> > > index 171a08fd8ebd..3973c731e11d 100644
> > > --- a/drivers/pci/pcie/aer.c
> > > +++ b/drivers/pci/pcie/aer.c
> > > @@ -1000,7 +1000,79 @@ static void cxl_handle_error(struct pci_dev *dev, struct aer_err_info *info)
> > >  		pcie_walk_rcec(dev, cxl_handle_error_iter, info);
> > >  }
> > >  
> > > +static bool cxl_error_is_native(struct pci_dev *dev)
> > > +{
> > > +	struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
> > > +
> > > +	if (pcie_ports_native)
> > > +		return true;
> > > +
> > > +	return host->native_aer && host->native_cxl_error;
> > > +}
> > > +
> > > +static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
> > > +{
> > > +	int *handles_cxl = data;
> > > +
> > > +	*handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev);
> > > +
> > > +	return *handles_cxl;
> > > +}
> > > +
> > > +static bool handles_cxl_errors(struct pci_dev *rcec)
> > > +{
> > > +	int handles_cxl = 0;
> > > +
> > > +	if (!rcec->aer_cap)
> > > +		return false;
> > > +
> > > +	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC)
> > > +		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
> > > +
> > > +	return !!handles_cxl;
> > > +}
> > > +
> > > +static int __cxl_unmask_internal_errors(struct pci_dev *rcec)
> > > +{
> > > +	int aer, rc;
> > > +	u32 mask;
> > > +
> > > +	/*
> > > +	 * Internal errors are masked by default, unmask RCEC's here
> > > +	 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
> > > +	 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
> > > +	 */  
> > 
> > Unmasking internal errors doesn't have anything specific to do with
> > CXL, so I don't think it should have "cxl" in the function name.
> > Maybe something like "pci_aer_unmask_internal_errors()".
> 
> This reminds me.  Not sure we resolved earlier discussion on changing
> the system wide policy to turn these on 
> https://lore.kernel.org/linux-cxl/20221229172731.GA611562@bhelgaas/
> which needs pretty much the same thing.
> 
> Ira, I think you were picking this one up?
> https://lore.kernel.org/linux-cxl/63e5fb533f304_13244829412@iweiny-mobl.notmuch/

After this discussion I posted an RFC to enable those errors.

https://lore.kernel.org/all/20230209-cxl-pci-aer-v1-1-f9a817fa4016@intel.com/

Unfortunately the prevailing opinion was that this was unsafe.  And no one
piped up with a reason to pursue the alternative of a pci core call to enable
them as needed.

So I abandoned the work.

I think the direction things where headed was to have a call like:

int pci_enable_pci_internal_errors(struct pci_dev *dev)
{
	int pos_cap_err;
	u32 reg;

	if (!pcie_aer_is_native(dev))
		return -EIO;

	pos_cap_err = dev->aer_cap;

	/* Unmask correctable and uncorrectable (non-fatal) internal errors */
	pci_read_config_dword(dev, pos_cap_err + PCI_ERR_COR_MASK, &reg);
	reg &= ~PCI_ERR_COR_INTERNAL;
	pci_write_config_dword(dev, pos_cap_err + PCI_ERR_COR_MASK, reg);
	
	pci_read_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_SEVER, &reg);
	reg &= ~PCI_ERR_UNC_INTN;
	pci_write_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_SEVER, reg);
	
	pci_read_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_MASK, &reg);
	reg &= ~PCI_ERR_UNC_INTN;
	pci_write_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_MASK, reg);

	return 0;
}

... and call this from the cxl code where it is needed.

Is this an acceptable direction?  Terry is welcome to steal the above from my
patch and throw it into the PCI core.

Looking at the current state of things I think cxl_pci_ras_unmask() may
actually be broken now without calling something like the above.  For that I
dropped the ball.

Ira
Robert Richter April 14, 2023, 11:21 a.m. UTC | #6
On 13.04.23 15:52:36, Ira Weiny wrote:
> Jonathan Cameron wrote:
> > On Wed, 12 Apr 2023 16:29:01 -0500
> > Bjorn Helgaas <helgaas@kernel.org> wrote:
> > 
> > > On Tue, Apr 11, 2023 at 01:03:02PM -0500, Terry Bowman wrote:
> > > > From: Robert Richter <rrichter@amd.com>
> > > > 

> > > > +static int __cxl_unmask_internal_errors(struct pci_dev *rcec)
> > > > +{
> > > > +	int aer, rc;
> > > > +	u32 mask;
> > > > +
> > > > +	/*
> > > > +	 * Internal errors are masked by default, unmask RCEC's here
> > > > +	 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
> > > > +	 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
> > > > +	 */  
> > > 
> > > Unmasking internal errors doesn't have anything specific to do with
> > > CXL, so I don't think it should have "cxl" in the function name.
> > > Maybe something like "pci_aer_unmask_internal_errors()".
> > 
> > This reminds me.  Not sure we resolved earlier discussion on changing
> > the system wide policy to turn these on 
> > https://lore.kernel.org/linux-cxl/20221229172731.GA611562@bhelgaas/
> > which needs pretty much the same thing.
> > 
> > Ira, I think you were picking this one up?
> > https://lore.kernel.org/linux-cxl/63e5fb533f304_13244829412@iweiny-mobl.notmuch/
> 
> After this discussion I posted an RFC to enable those errors.
> 
> https://lore.kernel.org/all/20230209-cxl-pci-aer-v1-1-f9a817fa4016@intel.com/
> 
> Unfortunately the prevailing opinion was that this was unsafe.  And no one
> piped up with a reason to pursue the alternative of a pci core call to enable
> them as needed.
> 
> So I abandoned the work.
> 
> I think the direction things where headed was to have a call like:
> 
> int pci_enable_pci_internal_errors(struct pci_dev *dev)
> {
> 	int pos_cap_err;
> 	u32 reg;
> 
> 	if (!pcie_aer_is_native(dev))
> 		return -EIO;
> 
> 	pos_cap_err = dev->aer_cap;
> 
> 	/* Unmask correctable and uncorrectable (non-fatal) internal errors */
> 	pci_read_config_dword(dev, pos_cap_err + PCI_ERR_COR_MASK, &reg);
> 	reg &= ~PCI_ERR_COR_INTERNAL;
> 	pci_write_config_dword(dev, pos_cap_err + PCI_ERR_COR_MASK, reg);
> 	
> 	pci_read_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_SEVER, &reg);
> 	reg &= ~PCI_ERR_UNC_INTN;
> 	pci_write_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_SEVER, reg);
> 	
> 	pci_read_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_MASK, &reg);
> 	reg &= ~PCI_ERR_UNC_INTN;
> 	pci_write_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_MASK, reg);
> 
> 	return 0;
> }
> 
> ... and call this from the cxl code where it is needed.

The version I have ready after addressing Bjorn's comments is pretty
much the same, apart from error checking of the read/writes.

From your patch proposed you will need it in aer.c too and we do not
need to export it.

This patch only enables it for (CXL) RCECs. You might want to extend
this for CXL endpoints (and ports?) then.

> 
> Is this an acceptable direction?  Terry is welcome to steal the above from my
> patch and throw it into the PCI core.
> 
> Looking at the current state of things I think cxl_pci_ras_unmask() may
> actually be broken now without calling something like the above.  For that I
> dropped the ball.

Thanks,

-Robert

> 
> Ira
Jonathan Cameron April 14, 2023, 11:55 a.m. UTC | #7
On Fri, 14 Apr 2023 13:21:37 +0200
Robert Richter <rrichter@amd.com> wrote:

> On 13.04.23 15:52:36, Ira Weiny wrote:
> > Jonathan Cameron wrote:  
> > > On Wed, 12 Apr 2023 16:29:01 -0500
> > > Bjorn Helgaas <helgaas@kernel.org> wrote:
> > >   
> > > > On Tue, Apr 11, 2023 at 01:03:02PM -0500, Terry Bowman wrote:  
> > > > > From: Robert Richter <rrichter@amd.com>
> > > > >   
> 
> > > > > +static int __cxl_unmask_internal_errors(struct pci_dev *rcec)
> > > > > +{
> > > > > +	int aer, rc;
> > > > > +	u32 mask;
> > > > > +
> > > > > +	/*
> > > > > +	 * Internal errors are masked by default, unmask RCEC's here
> > > > > +	 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
> > > > > +	 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
> > > > > +	 */    
> > > > 
> > > > Unmasking internal errors doesn't have anything specific to do with
> > > > CXL, so I don't think it should have "cxl" in the function name.
> > > > Maybe something like "pci_aer_unmask_internal_errors()".  
> > > 
> > > This reminds me.  Not sure we resolved earlier discussion on changing
> > > the system wide policy to turn these on 
> > > https://lore.kernel.org/linux-cxl/20221229172731.GA611562@bhelgaas/
> > > which needs pretty much the same thing.
> > > 
> > > Ira, I think you were picking this one up?
> > > https://lore.kernel.org/linux-cxl/63e5fb533f304_13244829412@iweiny-mobl.notmuch/  
> > 
> > After this discussion I posted an RFC to enable those errors.
> > 
> > https://lore.kernel.org/all/20230209-cxl-pci-aer-v1-1-f9a817fa4016@intel.com/
> > 

Ah. I'd forgotten that thread. Thanks!

> > Unfortunately the prevailing opinion was that this was unsafe.  And no one
> > piped up with a reason to pursue the alternative of a pci core call to enable
> > them as needed.
> > 
> > So I abandoned the work.
> > 
> > I think the direction things where headed was to have a call like:
> > 
> > int pci_enable_pci_internal_errors(struct pci_dev *dev)
> > {
> > 	int pos_cap_err;
> > 	u32 reg;
> > 
> > 	if (!pcie_aer_is_native(dev))
> > 		return -EIO;
> > 
> > 	pos_cap_err = dev->aer_cap;
> > 
> > 	/* Unmask correctable and uncorrectable (non-fatal) internal errors */
> > 	pci_read_config_dword(dev, pos_cap_err + PCI_ERR_COR_MASK, &reg);
> > 	reg &= ~PCI_ERR_COR_INTERNAL;
> > 	pci_write_config_dword(dev, pos_cap_err + PCI_ERR_COR_MASK, reg);
> > 	
> > 	pci_read_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_SEVER, &reg);
> > 	reg &= ~PCI_ERR_UNC_INTN;
> > 	pci_write_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_SEVER, reg);
> > 	
> > 	pci_read_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_MASK, &reg);
> > 	reg &= ~PCI_ERR_UNC_INTN;
> > 	pci_write_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_MASK, reg);
> > 
> > 	return 0;
> > }
> > 
> > ... and call this from the cxl code where it is needed.  
> 
> The version I have ready after addressing Bjorn's comments is pretty
> much the same, apart from error checking of the read/writes.
> 
> From your patch proposed you will need it in aer.c too and we do not
> need to export it.

I think for the other components we'll want to call it from cxl_pci_ras_unmask()
so an export needed.

I also wonder if a more generic function would be better as seems likely
similar code will be needed for errors other than this pair.


> 
> This patch only enables it for (CXL) RCECs. You might want to extend
> this for CXL endpoints (and ports?) then.

Definitely.  We have the same limitation you are seeing.  No errors
without turning this on.

Jonathan



> 
> > 
> > Is this an acceptable direction?  Terry is welcome to steal the above from my
> > patch and throw it into the PCI core.
> > 
> > Looking at the current state of things I think cxl_pci_ras_unmask() may
> > actually be broken now without calling something like the above.  For that I
> > dropped the ball.  
> 
> Thanks,
> 
> -Robert
> 
> > 
> > Ira
Robert Richter April 14, 2023, 11:58 a.m. UTC | #8
On 13.04.23 18:05:08, Jonathan Cameron wrote:
> On Thu, 13 Apr 2023 15:38:07 +0200
> Robert Richter <rrichter@amd.com> wrote:
> 
> > On 12.04.23 16:29:01, Bjorn Helgaas wrote:
> > > On Tue, Apr 11, 2023 at 01:03:02PM -0500, Terry Bowman wrote:  

> > > With the exception of this function, this patch looks like all CXL
> > > code that maybe could be with other CXL code.  Would require making
> > > pcie_walk_rcec() available outside drivers/pci, I guess.  
> > 
> > Even this is CXL code, it implements AER support and fits better here
> > around AER code. Export of pcie_walk_rcec() (and others?) is not the
> > main issue here. CXL drivers can come as modules and would need to
> > register a hook at the aer handler.  This would add even more
> > complexity here. In contrast, current solution just adds two functions
> > for enablement and handling which are empty stubs if code is disabled.
> > 
> > I could move that code to aer_cxl.c similar to aer_inject.c. Since the
> > CXL part is small compared to the remaining aer code I left it in
> > aer.c. Also, it is guarded by #ifdef which additionally encapsulates
> > it.
> > 
> 
> To throw another option in there (what Bjorn suggested IIRC for the more
> general case..) 
> 
> Just enable internal errors always.  No need to know if they are CXL
> or something else.
> 
> There will/might be fallout and it will be fun.

I left the fun part to others. :-)

If some PCI root port goes crazy it tears down the whole system, would
avoid that.

Since internal error are implementation specific, I would only enable
them once a handler exists. What's why enablement is limited to CXL
RCECs only.

-Robert
Robert Richter April 14, 2023, 2:47 p.m. UTC | #9
On 14.04.23 12:55:43, Jonathan Cameron wrote:
> On Fri, 14 Apr 2023 13:21:37 +0200
> Robert Richter <rrichter@amd.com> wrote:

> > The version I have ready after addressing Bjorn's comments is pretty
> > much the same, apart from error checking of the read/writes.
> > 
> > From your patch proposed you will need it in aer.c too and we do not
> > need to export it.
> 
> I think for the other components we'll want to call it from cxl_pci_ras_unmask()
> so an export needed.
> 
> I also wonder if a more generic function would be better as seems likely
> similar code will be needed for errors other than this pair.

There are only a few masked by default, but not only internals. Will
consider that and also make it easy to export later once needed.

Thanks,

-Robert
Bjorn Helgaas April 14, 2023, 9:49 p.m. UTC | #10
On Thu, Apr 13, 2023 at 03:38:07PM +0200, Robert Richter wrote:
> On 12.04.23 16:29:01, Bjorn Helgaas wrote:
> > On Tue, Apr 11, 2023 at 01:03:02PM -0500, Terry Bowman wrote:
> > > From: Robert Richter <rrichter@amd.com>
> > > 
> > > RCEC AER corrected and uncorrectable internal errors (CIE/UIE) are
> > > disabled by default.

> > > +static void cxl_unmask_internal_errors(struct pci_dev *rcec)
> 
> Also renaming this to cxl_enable_rcec() to more generalize the
> function.

I didn't follow this.  "cxl_enable_rcec" doesn't say anything about
"unmasking" or "internal errors", which seems like the whole point.
And the function doesn't actually *enable* and RCEC.

> > > +{
> > > +	if (!handles_cxl_errors(rcec))
> > > +		return;
> > > +
> > > +	if (__cxl_unmask_internal_errors(rcec))
> > > +		dev_err(&rcec->dev, "cxl: Failed to unmask internal errors");
> > > +	else
> > > +		dev_dbg(&rcec->dev, "cxl: Internal errors unmasked");
> 
> I am going to change this to a pci_info() for alignment with other
> messages around:
> 
> [   14.200265] pcieport 0000:40:00.3: PME: Signaling with IRQ 44
> [   14.213925] pcieport 0000:40:00.3: AER: cxl: Internal errors unmasked
> [   14.228413] pcieport 0000:40:00.3: AER: enabled with IRQ 44
> 
> Plus, using pci_err() instead of dev_err().

Thanks for that!

Bjorn
Dan Williams April 18, 2023, 2:37 a.m. UTC | #11
Terry Bowman wrote:
> From: Robert Richter <rrichter@amd.com>
> 
> RCEC AER corrected and uncorrectable internal errors (CIE/UIE) are
> disabled by default. [1][2] Enable them to receive CXL downstream port
> errors of a Restricted CXL Host (RCH).
> 
> [1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors
> [2] PCIe Base Spec 6.0, 7.8.4.3 Uncorrectable Error Mask Register,
>     7.8.4.6 Correctable Error Mask Register

My comment on patch5 to make CXL link details a first class property of a
'struct pci_dev':

http://lore.kernel.org/r/643debf5af445_1b66294f4@dwillia2-xfh.jf.intel.com.notmuch/

...also applies here.

Other than that nothing more from me on this one beyond what Bjorn and
Jonathan have said. I do agree with Robert about being cautious about
only enabling this for CXL devices for now and not all internal errors
for all AER capable devices globally. The rationale being that CXL
devices are a new link on top of PCIe and abuse/reuse internal errors
when they are conceptually functionally equivalent to PCIe link errors.
diff mbox series

Patch

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 171a08fd8ebd..3973c731e11d 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1000,7 +1000,79 @@  static void cxl_handle_error(struct pci_dev *dev, struct aer_err_info *info)
 		pcie_walk_rcec(dev, cxl_handle_error_iter, info);
 }
 
+static bool cxl_error_is_native(struct pci_dev *dev)
+{
+	struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
+
+	if (pcie_ports_native)
+		return true;
+
+	return host->native_aer && host->native_cxl_error;
+}
+
+static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
+{
+	int *handles_cxl = data;
+
+	*handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev);
+
+	return *handles_cxl;
+}
+
+static bool handles_cxl_errors(struct pci_dev *rcec)
+{
+	int handles_cxl = 0;
+
+	if (!rcec->aer_cap)
+		return false;
+
+	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC)
+		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
+
+	return !!handles_cxl;
+}
+
+static int __cxl_unmask_internal_errors(struct pci_dev *rcec)
+{
+	int aer, rc;
+	u32 mask;
+
+	/*
+	 * Internal errors are masked by default, unmask RCEC's here
+	 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
+	 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
+	 */
+	aer = rcec->aer_cap;
+	rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask);
+	if (rc)
+		return rc;
+	mask &= ~PCI_ERR_UNC_INTN;
+	rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask);
+	if (rc)
+		return rc;
+
+	rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask);
+	if (rc)
+		return rc;
+	mask &= ~PCI_ERR_COR_INTERNAL;
+	rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask);
+
+	return rc;
+}
+
+static void cxl_unmask_internal_errors(struct pci_dev *rcec)
+{
+	if (!handles_cxl_errors(rcec))
+		return;
+
+	if (__cxl_unmask_internal_errors(rcec))
+		dev_err(&rcec->dev, "cxl: Failed to unmask internal errors");
+	else
+		dev_dbg(&rcec->dev, "cxl: Internal errors unmasked");
+}
+
 #else
+static inline void cxl_unmask_internal_errors(struct pci_dev *dev) { }
 static inline void cxl_handle_error(struct pci_dev *dev,
 				    struct aer_err_info *info) { }
 #endif
@@ -1397,6 +1469,7 @@  static int aer_probe(struct pcie_device *dev)
 		return status;
 	}
 
+	cxl_unmask_internal_errors(port);
 	aer_enable_rootport(rpc);
 	pci_info(port, "enabled with IRQ %d\n", dev->irq);
 	return 0;