Message ID | 20250211192444.2292833-16-terry.bowman@amd.com (mailing list archive) |
---|---|
State | Handled Elsewhere |
Headers | show |
Series | Enable CXL PCIe port protocol error handling and logging | expand |
On 2/11/25 12:24 PM, Terry Bowman wrote: > pci_driver::cxl_err_handlers are not currently assigned handler callbacks. > The handlers can't be set in the pci_driver static definition because the > CXL PCIe Port devices are bound to the portdrv driver which is not CXL > driver aware. > > Add cxl_assign_port_error_handlers() in the cxl_core module. This > function will assign the default handlers for a CXL PCIe Port device. > > When the CXL Port (cxl_port or cxl_dport) is destroyed the device's > pci_driver::cxl_err_handlers must be set to NULL indicating they should no > longer be used. > > Create cxl_clear_port_error_handlers() and register it to be called > when the CXL Port device (cxl_port or cxl_dport) is destroyed. > > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > Reviewed-by: Ira Weiny <ira.weiny@intel.com> > Reviewed-by: Gregory Price <gourry@gourry.net> Reviewed-by: Dave Jiang <dave.jiang@intel.com> > --- > drivers/cxl/core/pci.c | 59 ++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 57 insertions(+), 2 deletions(-) > > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c > index f154dcf6dfda..03ae21a944e0 100644 > --- a/drivers/cxl/core/pci.c > +++ b/drivers/cxl/core/pci.c > @@ -860,8 +860,39 @@ static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev) > return __cxl_handle_ras(dev, &pdev->dev, ras_base); > } > > +static const struct cxl_error_handlers cxl_port_error_handlers = { > + .error_detected = cxl_port_error_detected, > + .cor_error_detected = cxl_port_cor_error_detected, > +}; > + > +static void cxl_assign_port_error_handlers(struct pci_dev *pdev) > +{ > + struct pci_driver *pdrv; > + > + if (!pdev || !pdev->driver || !get_device(&pdev->dev)) > + return; > + > + pdrv = pdev->driver; > + pdrv->cxl_err_handler = &cxl_port_error_handlers; > + put_device(&pdev->dev); > +} > + > +static void cxl_clear_port_error_handlers(void *data) > +{ > + struct 
pci_dev *pdev = data; > + struct pci_driver *pdrv; > + > + if (!pdev || !pdev->driver || !get_device(&pdev->dev)) > + return; > + > + pdrv = pdev->driver; > + pdrv->cxl_err_handler = NULL; > + put_device(&pdev->dev); > +} > + > void cxl_uport_init_ras_reporting(struct cxl_port *port) > { > + struct pci_dev *pdev = to_pci_dev(port->uport_dev); > > /* uport may have more than 1 downstream EP. Check if already mapped. */ > mutex_lock(&ras_init_mutex); > @@ -872,9 +903,15 @@ void cxl_uport_init_ras_reporting(struct cxl_port *port) > > port->reg_map.host = &port->dev; > if (cxl_map_component_regs(&port->reg_map, &port->uport_regs, > - BIT(CXL_CM_CAP_CAP_ID_RAS))) > + BIT(CXL_CM_CAP_CAP_ID_RAS))) { > dev_err(&port->dev, "Failed to map RAS capability\n"); > + mutex_unlock(&ras_init_mutex); > + return; > + } > mutex_unlock(&ras_init_mutex); > + > + cxl_assign_port_error_handlers(pdev); > + devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev); > } > EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, "CXL"); > > @@ -886,6 +923,8 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport) > { > struct device *dport_dev = dport->dport_dev; > struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport_dev); > + struct pci_dev *pdev = to_pci_dev(dport_dev); > + struct cxl_port *port; > > dport->reg_map.host = dport_dev; > if (dport->rch && host_bridge->native_aer) { > @@ -901,9 +940,25 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport) > } > > if (cxl_map_component_regs(&dport->reg_map, &dport->regs.component, > - BIT(CXL_CM_CAP_CAP_ID_RAS))) > + BIT(CXL_CM_CAP_CAP_ID_RAS))) { > dev_err(dport_dev, "Failed to map RAS capability\n"); > + mutex_unlock(&ras_init_mutex); > + return; > + } > mutex_unlock(&ras_init_mutex); > + > + if (dport->rch) > + return; > + > + port = find_cxl_port(dport_dev, NULL); > + if (!port) { > + dev_err(dport_dev, "Failed to find upstream port\n"); > + return; > + } > + > + cxl_assign_port_error_handlers(pdev); > + 
devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev); > + put_device(&port->dev); > } > EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); >
Terry Bowman wrote: > pci_driver::cxl_err_handlers are not currently assigned handler callbacks. > The handlers can't be set in the pci_driver static definition because the > CXL PCIe Port devices are bound to the portdrv driver which is not CXL > driver aware. > > Add cxl_assign_port_error_handlers() in the cxl_core module. This > function will assign the default handlers for a CXL PCIe Port device. > > When the CXL Port (cxl_port or cxl_dport) is destroyed the device's > pci_driver::cxl_err_handlers must be set to NULL indicating they should no > longer be used. > > Create cxl_clear_port_error_handlers() and register it to be called > when the CXL Port device (cxl_port or cxl_dport) is destroyed. This is another complication that naturally goes away when cxl_error_handlers are instances that get attached to 'struct cxl_driver' instances rather than 'struct pci_driver' instances. > > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > Reviewed-by: Ira Weiny <ira.weiny@intel.com> > Reviewed-by: Gregory Price <gourry@gourry.net> > --- > drivers/cxl/core/pci.c | 59 ++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 57 insertions(+), 2 deletions(-) > > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c > index f154dcf6dfda..03ae21a944e0 100644 > --- a/drivers/cxl/core/pci.c > +++ b/drivers/cxl/core/pci.c > @@ -860,8 +860,39 @@ static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev) > return __cxl_handle_ras(dev, &pdev->dev, ras_base); > } > > +static const struct cxl_error_handlers cxl_port_error_handlers = { > + .error_detected = cxl_port_error_detected, > + .cor_error_detected = cxl_port_cor_error_detected, > +}; > + > +static void cxl_assign_port_error_handlers(struct pci_dev *pdev) > +{ > + struct pci_driver *pdrv; > + > + if (!pdev || !pdev->driver || !get_device(&pdev->dev)) > + return; > + > + pdrv = pdev->driver; > + pdrv->cxl_err_handler = &cxl_port_error_handlers;
Nothing is holding the @pdev device_lock(), so @pdev->driver may go NULL immediately after reading it. Also, it is possible for a 'struct cxl_port' to exist even though its uport_dev (pci_dev) is not attached to a driver. This would seem to result in unpredictable behavior from one kernel to the next as the PCIe portdrv situation evolves. Lastly, I do not like the precedent of not being able to read a 'struct pci_driver' template and be assured that it captures all possible error handlers, or even worse, this unceremoniously overrides a PCI driver that thinks it knows what the CXL error handlers should be.
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index f154dcf6dfda..03ae21a944e0 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -860,8 +860,39 @@ static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev) return __cxl_handle_ras(dev, &pdev->dev, ras_base); } +static const struct cxl_error_handlers cxl_port_error_handlers = { + .error_detected = cxl_port_error_detected, + .cor_error_detected = cxl_port_cor_error_detected, +}; + +static void cxl_assign_port_error_handlers(struct pci_dev *pdev) +{ + struct pci_driver *pdrv; + + if (!pdev || !pdev->driver || !get_device(&pdev->dev)) + return; + + pdrv = pdev->driver; + pdrv->cxl_err_handler = &cxl_port_error_handlers; + put_device(&pdev->dev); +} + +static void cxl_clear_port_error_handlers(void *data) +{ + struct pci_dev *pdev = data; + struct pci_driver *pdrv; + + if (!pdev || !pdev->driver || !get_device(&pdev->dev)) + return; + + pdrv = pdev->driver; + pdrv->cxl_err_handler = NULL; + put_device(&pdev->dev); +} + void cxl_uport_init_ras_reporting(struct cxl_port *port) { + struct pci_dev *pdev = to_pci_dev(port->uport_dev); /* uport may have more than 1 downstream EP. Check if already mapped. 
*/ mutex_lock(&ras_init_mutex); @@ -872,9 +903,15 @@ void cxl_uport_init_ras_reporting(struct cxl_port *port) port->reg_map.host = &port->dev; if (cxl_map_component_regs(&port->reg_map, &port->uport_regs, - BIT(CXL_CM_CAP_CAP_ID_RAS))) + BIT(CXL_CM_CAP_CAP_ID_RAS))) { dev_err(&port->dev, "Failed to map RAS capability\n"); + mutex_unlock(&ras_init_mutex); + return; + } mutex_unlock(&ras_init_mutex); + + cxl_assign_port_error_handlers(pdev); + devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev); } EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, "CXL"); @@ -886,6 +923,8 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport) { struct device *dport_dev = dport->dport_dev; struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport_dev); + struct pci_dev *pdev = to_pci_dev(dport_dev); + struct cxl_port *port; dport->reg_map.host = dport_dev; if (dport->rch && host_bridge->native_aer) { @@ -901,9 +940,25 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport) } if (cxl_map_component_regs(&dport->reg_map, &dport->regs.component, - BIT(CXL_CM_CAP_CAP_ID_RAS))) + BIT(CXL_CM_CAP_CAP_ID_RAS))) { dev_err(dport_dev, "Failed to map RAS capability\n"); + mutex_unlock(&ras_init_mutex); + return; + } mutex_unlock(&ras_init_mutex); + + if (dport->rch) + return; + + port = find_cxl_port(dport_dev, NULL); + if (!port) { + dev_err(dport_dev, "Failed to find upstream port\n"); + return; + } + + cxl_assign_port_error_handlers(pdev); + devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev); + put_device(&port->dev); } EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");