diff mbox series

[v4,14/15] cxl/pci: Add support to assign and clear pci_driver::cxl_err_handlers

Message ID 20241211234002.3728674-15-terry.bowman@amd.com (mailing list archive)
State New
Delegated to: Bjorn Helgaas
Headers show
Series Enable CXL PCIe Port protocol error handling and logging | expand

Commit Message

Bowman, Terry Dec. 11, 2024, 11:40 p.m. UTC
pci_driver::cxl_err_handlers are not currently assigned handler callbacks.
The handlers can't be set in the pci_driver static definition because the
CXL PCIe Port devices are bound to the portdrv driver which is not CXL
driver aware.

Add cxl_assign_port_error_handlers() in the cxl_core module. This
function will assign the default handlers for a CXL PCIe Port device.

When the CXL Port (cxl_port or cxl_dport) is destroyed the device's
pci_driver::cxl_err_handlers must be set to NULL indicating they should no
longer be used.

Create cxl_clear_port_error_handlers() and register it to be called
when the CXL Port device (cxl_port or cxl_dport) is destroyed.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
 drivers/cxl/core/pci.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

Comments

Li Ming Dec. 12, 2024, 2:31 a.m. UTC | #1
On 12/12/2024 7:40 AM, Terry Bowman wrote:
> pci_driver::cxl_err_handlers are not currently assigned handler callbacks.
> The handlers can't be set in the pci_driver static definition because the
> CXL PCIe Port devices are bound to the portdrv driver which is not CXL
> driver aware.
>
> Add cxl_assign_port_error_handlers() in the cxl_core module. This
> function will assign the default handlers for a CXL PCIe Port device.
>
> When the CXL Port (cxl_port or cxl_dport) is destroyed the device's
> pci_driver::cxl_err_handlers must be set to NULL indicating they should no
> longer be used.
>
> Create cxl_clear_port_error_handlers() and register it to be called
> when the CXL Port device (cxl_port or cxl_dport) is destroyed.
>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> ---
>  drivers/cxl/core/pci.c | 40 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 40 insertions(+)
>
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 3294ad5ff28f..9734a4c55b29 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -841,8 +841,38 @@ static bool cxl_port_error_detected(struct pci_dev *pdev)
>  	return __cxl_handle_ras(&pdev->dev, ras_base);
>  }
>  
> +static const struct cxl_error_handlers cxl_port_error_handlers = {
> +	.error_detected	= cxl_port_error_detected,
> +	.cor_error_detected = cxl_port_cor_error_detected,
> +};
> +
> +static void cxl_assign_port_error_handlers(struct pci_dev *pdev)
> +{
> +	struct pci_driver *pdrv;
> +
> +	if (!pdev || !pdev->driver)
> +		return;
> +
> +	pdrv = pdev->driver;
> +	pdrv->cxl_err_handler = &cxl_port_error_handlers;
> +}
> +
> +static void cxl_clear_port_error_handlers(void *data)
> +{
> +	struct pci_dev *pdev = data;
> +	struct pci_driver *pdrv;
> +
> +	if (!pdev || !pdev->driver)
> +		return;
> +
> +	pdrv = pdev->driver;
> +	pdrv->cxl_err_handler = NULL;
> +}
> +
>  void cxl_uport_init_ras_reporting(struct cxl_port *port)
>  {
> +	struct pci_dev *pdev = to_pci_dev(port->uport_dev);
> +
>  	/* uport may have more than 1 downstream EP. Check if already mapped. */
>  	if (port->uport_regs.ras)
>  		return;
> @@ -853,6 +883,9 @@ void cxl_uport_init_ras_reporting(struct cxl_port *port)
>  		dev_err(&port->dev, "Failed to map RAS capability.\n");
>  		return;
>  	}
> +
> +	cxl_assign_port_error_handlers(pdev);
> +	devm_add_action_or_reset(port->uport_dev, cxl_clear_port_error_handlers, pdev);

I think the first parameter of devm_add_action_or_reset() should be 'port->dev' rather than 'port->uport_dev'.

'port->uport_dev' is 'pci_dev->dev' which will be destroyed on pci side, 'port->dev' will be destroyed on cxl side.

>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, CXL);
>  
> @@ -864,6 +897,7 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
>  {
>  	struct device *dport_dev = dport->dport_dev;
>  	struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport_dev);
> +	struct pci_dev *pdev = to_pci_dev(dport_dev);
>  
>  	dport->reg_map.host = dport_dev;
>  	if (dport->rch && host_bridge->native_aer) {
> @@ -880,6 +914,12 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
>  		dev_err(dport_dev, "Failed to map RAS capability.\n");
>  		return;
>  	}
> +
> +	if (dport->rch)
> +		return;
> +
> +	cxl_assign_port_error_handlers(pdev);
> +	devm_add_action_or_reset(dport_dev, cxl_clear_port_error_handlers, pdev);

Same as above, should use 'port->dev'.

please correct me if I am wrong.


Ming

>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, CXL);
>
Bowman, Terry Dec. 17, 2024, 2:39 p.m. UTC | #2
On 12/11/2024 8:31 PM, Li Ming wrote:
> On 12/12/2024 7:40 AM, Terry Bowman wrote:
>> pci_driver::cxl_err_handlers are not currently assigned handler callbacks.
>> The handlers can't be set in the pci_driver static definition because the
>> CXL PCIe Port devices are bound to the portdrv driver which is not CXL
>> driver aware.
>>
>> Add cxl_assign_port_error_handlers() in the cxl_core module. This
>> function will assign the default handlers for a CXL PCIe Port device.
>>
>> When the CXL Port (cxl_port or cxl_dport) is destroyed the device's
>> pci_driver::cxl_err_handlers must be set to NULL indicating they should no
>> longer be used.
>>
>> Create cxl_clear_port_error_handlers() and register it to be called
>> when the CXL Port device (cxl_port or cxl_dport) is destroyed.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> ---
>>  drivers/cxl/core/pci.c | 40 ++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 40 insertions(+)
>>
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index 3294ad5ff28f..9734a4c55b29 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -841,8 +841,38 @@ static bool cxl_port_error_detected(struct pci_dev *pdev)
>>  	return __cxl_handle_ras(&pdev->dev, ras_base);
>>  }
>>  
>> +static const struct cxl_error_handlers cxl_port_error_handlers = {
>> +	.error_detected	= cxl_port_error_detected,
>> +	.cor_error_detected = cxl_port_cor_error_detected,
>> +};
>> +
>> +static void cxl_assign_port_error_handlers(struct pci_dev *pdev)
>> +{
>> +	struct pci_driver *pdrv;
>> +
>> +	if (!pdev || !pdev->driver)
>> +		return;
>> +
>> +	pdrv = pdev->driver;
>> +	pdrv->cxl_err_handler = &cxl_port_error_handlers;
>> +}
>> +
>> +static void cxl_clear_port_error_handlers(void *data)
>> +{
>> +	struct pci_dev *pdev = data;
>> +	struct pci_driver *pdrv;
>> +
>> +	if (!pdev || !pdev->driver)
>> +		return;
>> +
>> +	pdrv = pdev->driver;
>> +	pdrv->cxl_err_handler = NULL;
>> +}
>> +
>>  void cxl_uport_init_ras_reporting(struct cxl_port *port)
>>  {
>> +	struct pci_dev *pdev = to_pci_dev(port->uport_dev);
>> +
>>  	/* uport may have more than 1 downstream EP. Check if already mapped. */
>>  	if (port->uport_regs.ras)
>>  		return;
>> @@ -853,6 +883,9 @@ void cxl_uport_init_ras_reporting(struct cxl_port *port)
>>  		dev_err(&port->dev, "Failed to map RAS capability.\n");
>>  		return;
>>  	}
>> +
>> +	cxl_assign_port_error_handlers(pdev);
>> +	devm_add_action_or_reset(port->uport_dev, cxl_clear_port_error_handlers, pdev);
> I think the first parameter of devm_add_action_or_reset() should be 'port->dev' rather than 'port->uport_dev'.
>
> 'port->uport_dev' is 'pci_dev->dev' which will be destroyed on pci side, 'port->dev' will be destroyed on cxl side.

Hi Ming,

Indeed, this needs to be changed for dport and uport as you also indicated in your other email.

This is necessary in case the CXL drivers are uninstalled. Making this change will de-register 
the RAS error handler callbacks so they are not potentially used to handle Port CXL errors 
after the CXL drivers are removed.

Thanks Ming.

-Terry

>>  }
>>  EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, CXL);
>>  
>> @@ -864,6 +897,7 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
>>  {
>>  	struct device *dport_dev = dport->dport_dev;
>>  	struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport_dev);
>> +	struct pci_dev *pdev = to_pci_dev(dport_dev);
>>  
>>  	dport->reg_map.host = dport_dev;
>>  	if (dport->rch && host_bridge->native_aer) {
>> @@ -880,6 +914,12 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
>>  		dev_err(dport_dev, "Failed to map RAS capability.\n");
>>  		return;
>>  	}
>> +
>> +	if (dport->rch)
>> +		return;
>> +
>> +	cxl_assign_port_error_handlers(pdev);
>> +	devm_add_action_or_reset(dport_dev, cxl_clear_port_error_handlers, pdev);
> Same as above, should use 'port->dev'.
>
> please correct me if I am wrong.
>
>
> Ming
>
>>  }
>>  EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, CXL);
>>  
>
Jonathan Cameron Dec. 24, 2024, 6:50 p.m. UTC | #3
On Wed, 11 Dec 2024 17:40:01 -0600
Terry Bowman <terry.bowman@amd.com> wrote:

> pci_driver::cxl_err_handlers are not currently assigned handler callbacks.
> The handlers can't be set in the pci_driver static definition because the
> CXL PCIe Port devices are bound to the portdrv driver which is not CXL
> driver aware.
> 
> Add cxl_assign_port_error_handlers() in the cxl_core module. This
> function will assign the default handlers for a CXL PCIe Port device.
> 
> When the CXL Port (cxl_port or cxl_dport) is destroyed the device's
> pci_driver::cxl_err_handlers must be set to NULL indicating they should no
> longer be used.
> 
> Create cxl_clear_port_error_handlers() and register it to be called
> when the CXL Port device (cxl_port or cxl_dport) is destroyed.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> ---
>  drivers/cxl/core/pci.c | 40 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 40 insertions(+)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 3294ad5ff28f..9734a4c55b29 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -841,8 +841,38 @@ static bool cxl_port_error_detected(struct pci_dev *pdev)
>  	return __cxl_handle_ras(&pdev->dev, ras_base);
>  }
>  
> +static const struct cxl_error_handlers cxl_port_error_handlers = {
> +	.error_detected	= cxl_port_error_detected,
> +	.cor_error_detected = cxl_port_cor_error_detected,
> +};
> +
> +static void cxl_assign_port_error_handlers(struct pci_dev *pdev)
> +{
> +	struct pci_driver *pdrv;
> +
> +	if (!pdev || !pdev->driver)
> +		return;
> +
> +	pdrv = pdev->driver;

What stops a race here?  It's fiddly to remove that driver but
it can be done.  At least I think we are messing withe portdrv
but this is such a fiddly stack I'm not 100% sure.

> +	pdrv->cxl_err_handler = &cxl_port_error_handlers;
> +}
> +
> +static void cxl_clear_port_error_handlers(void *data)
> +{
> +	struct pci_dev *pdev = data;
> +	struct pci_driver *pdrv;
> +
> +	if (!pdev || !pdev->driver)
> +		return;
> +
> +	pdrv = pdev->driver;
Likewise. Smells like a possible race.

> +	pdrv->cxl_err_handler = NULL;
> +}
> +
>  void cxl_uport_init_ras_reporting(struct cxl_port *port)
>  {
> +	struct pci_dev *pdev = to_pci_dev(port->uport_dev);
> +
>  	/* uport may have more than 1 downstream EP. Check if already mapped. */
>  	if (port->uport_regs.ras)
>  		return;
> @@ -853,6 +883,9 @@ void cxl_uport_init_ras_reporting(struct cxl_port *port)
>  		dev_err(&port->dev, "Failed to map RAS capability.\n");
>  		return;
>  	}
> +
> +	cxl_assign_port_error_handlers(pdev);
> +	devm_add_action_or_reset(port->uport_dev, cxl_clear_port_error_handlers, pdev);
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, CXL);
>  
> @@ -864,6 +897,7 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
>  {
>  	struct device *dport_dev = dport->dport_dev;
>  	struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport_dev);
> +	struct pci_dev *pdev = to_pci_dev(dport_dev);
>  
>  	dport->reg_map.host = dport_dev;
>  	if (dport->rch && host_bridge->native_aer) {
> @@ -880,6 +914,12 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
>  		dev_err(dport_dev, "Failed to map RAS capability.\n");
>  		return;
>  	}
> +
> +	if (dport->rch)
> +		return;
> +
> +	cxl_assign_port_error_handlers(pdev);
> +	devm_add_action_or_reset(dport_dev, cxl_clear_port_error_handlers, pdev);
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, CXL);
>
Bowman, Terry Dec. 26, 2024, 5:07 p.m. UTC | #4
On 12/24/2024 12:50 PM, Jonathan Cameron wrote:
> On Wed, 11 Dec 2024 17:40:01 -0600
> Terry Bowman <terry.bowman@amd.com> wrote:
>
>> pci_driver::cxl_err_handlers are not currently assigned handler callbacks.
>> The handlers can't be set in the pci_driver static definition because the
>> CXL PCIe Port devices are bound to the portdrv driver which is not CXL
>> driver aware.
>>
>> Add cxl_assign_port_error_handlers() in the cxl_core module. This
>> function will assign the default handlers for a CXL PCIe Port device.
>>
>> When the CXL Port (cxl_port or cxl_dport) is destroyed the device's
>> pci_driver::cxl_err_handlers must be set to NULL indicating they should no
>> longer be used.
>>
>> Create cxl_clear_port_error_handlers() and register it to be called
>> when the CXL Port device (cxl_port or cxl_dport) is destroyed.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> ---
>>  drivers/cxl/core/pci.c | 40 ++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 40 insertions(+)
>>
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index 3294ad5ff28f..9734a4c55b29 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -841,8 +841,38 @@ static bool cxl_port_error_detected(struct pci_dev *pdev)
>>  	return __cxl_handle_ras(&pdev->dev, ras_base);
>>  }
>>  
>> +static const struct cxl_error_handlers cxl_port_error_handlers = {
>> +	.error_detected	= cxl_port_error_detected,
>> +	.cor_error_detected = cxl_port_cor_error_detected,
>> +};
>> +
>> +static void cxl_assign_port_error_handlers(struct pci_dev *pdev)
>> +{
>> +	struct pci_driver *pdrv;
>> +
>> +	if (!pdev || !pdev->driver)
>> +		return;
>> +
>> +	pdrv = pdev->driver;
> What stops a race here?  It's fiddly to remove that driver but
> it can be done.  At least I think we are messing withe portdrv
> but this is such a fiddly stack I'm not 100% sure.
>
>> +	pdrv->cxl_err_handler = &cxl_port_error_handlers;
>> +}
>> +
>> +static void cxl_clear_port_error_handlers(void *data)
>> +{
>> +	struct pci_dev *pdev = data;
>> +	struct pci_driver *pdrv;
>> +
>> +	if (!pdev || !pdev->driver)
>> +		return;
>> +
>> +	pdrv = pdev->driver;
> Likewise. Smells like a possible race.
>
>> +	pdrv->cxl_err_handler = NULL;
>> +}
>> +

I can add a get_device()/put_device() for both cxl_clear_port_error_handlers() and cxl_assign_port_error_handlers() to prevent operating on a recently destroyed pci_dev. Is that sufficient? Regards, Terry
>>  void cxl_uport_init_ras_reporting(struct cxl_port *port)
>>  {
>> +	struct pci_dev *pdev = to_pci_dev(port->uport_dev);
>> +
>>  	/* uport may have more than 1 downstream EP. Check if already mapped. */
>>  	if (port->uport_regs.ras)
>>  		return;
>> @@ -853,6 +883,9 @@ void cxl_uport_init_ras_reporting(struct cxl_port *port)
>>  		dev_err(&port->dev, "Failed to map RAS capability.\n");
>>  		return;
>>  	}
>> +
>> +	cxl_assign_port_error_handlers(pdev);
>> +	devm_add_action_or_reset(port->uport_dev, cxl_clear_port_error_handlers, pdev);
>>  }
>>  EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, CXL);
>>  
>> @@ -864,6 +897,7 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
>>  {
>>  	struct device *dport_dev = dport->dport_dev;
>>  	struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport_dev);
>> +	struct pci_dev *pdev = to_pci_dev(dport_dev);
>>  
>>  	dport->reg_map.host = dport_dev;
>>  	if (dport->rch && host_bridge->native_aer) {
>> @@ -880,6 +914,12 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
>>  		dev_err(dport_dev, "Failed to map RAS capability.\n");
>>  		return;
>>  	}
>> +
>> +	if (dport->rch)
>> +		return;
>> +
>> +	cxl_assign_port_error_handlers(pdev);
>> +	devm_add_action_or_reset(dport_dev, cxl_clear_port_error_handlers, pdev);
>>  }
>>  EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, CXL);
>>
diff mbox series

Patch

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 3294ad5ff28f..9734a4c55b29 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -841,8 +841,38 @@  static bool cxl_port_error_detected(struct pci_dev *pdev)
 	return __cxl_handle_ras(&pdev->dev, ras_base);
 }
 
+static const struct cxl_error_handlers cxl_port_error_handlers = {
+	.error_detected	= cxl_port_error_detected,
+	.cor_error_detected = cxl_port_cor_error_detected,
+};
+
+static void cxl_assign_port_error_handlers(struct pci_dev *pdev)
+{
+	struct pci_driver *pdrv;
+
+	if (!pdev || !pdev->driver)
+		return;
+
+	pdrv = pdev->driver;
+	pdrv->cxl_err_handler = &cxl_port_error_handlers;
+}
+
+static void cxl_clear_port_error_handlers(void *data)
+{
+	struct pci_dev *pdev = data;
+	struct pci_driver *pdrv;
+
+	if (!pdev || !pdev->driver)
+		return;
+
+	pdrv = pdev->driver;
+	pdrv->cxl_err_handler = NULL;
+}
+
 void cxl_uport_init_ras_reporting(struct cxl_port *port)
 {
+	struct pci_dev *pdev = to_pci_dev(port->uport_dev);
+
 	/* uport may have more than 1 downstream EP. Check if already mapped. */
 	if (port->uport_regs.ras)
 		return;
@@ -853,6 +883,9 @@  void cxl_uport_init_ras_reporting(struct cxl_port *port)
 		dev_err(&port->dev, "Failed to map RAS capability.\n");
 		return;
 	}
+
+	cxl_assign_port_error_handlers(pdev);
+	devm_add_action_or_reset(port->uport_dev, cxl_clear_port_error_handlers, pdev);
 }
 EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, CXL);
 
@@ -864,6 +897,7 @@  void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
 {
 	struct device *dport_dev = dport->dport_dev;
 	struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport_dev);
+	struct pci_dev *pdev = to_pci_dev(dport_dev);
 
 	dport->reg_map.host = dport_dev;
 	if (dport->rch && host_bridge->native_aer) {
@@ -880,6 +914,12 @@  void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
 		dev_err(dport_dev, "Failed to map RAS capability.\n");
 		return;
 	}
+
+	if (dport->rch)
+		return;
+
+	cxl_assign_port_error_handlers(pdev);
+	devm_add_action_or_reset(dport_dev, cxl_clear_port_error_handlers, pdev);
 }
 EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, CXL);