diff mbox series

[06/16] PCI/ERR: Remove devices on recovery failure

Message ID 20180831212639.10196-7-keith.busch@intel.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show
Series PCI, error handling and hot plug | expand

Commit Message

Keith Busch Aug. 31, 2018, 9:26 p.m. UTC
This patch removes devices connected through a bus that can't recover
from an error.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/pci/pcie/err.c | 42 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 4 deletions(-)

Comments

Sinan Kaya Aug. 31, 2018, 10:26 p.m. UTC | #1
On 8/31/2018 2:26 PM, Keith Busch wrote:
> +static void pcie_disconnect_device(struct pci_dev *dev)
> +{
> +	struct pci_bus *bus = dev->subordinate;
> +	struct pci_dev *child, *tmp;
> +
> +	broadcast_error_message(dev, PCI_ERS_RESULT_DISCONNECT,
> +				"disconnect", report_disconnect);
> +	pci_lock_rescan_remove();
> +	list_for_each_entry_safe(child, tmp, &bus->devices, bus_list)
> +		pci_stop_and_remove_bus_device(child);
> +
> +	pci_bridge_secondary_bus_reset(dev);

Series looks great so far. Please check the return value of
pci_bridge_secondary_bus_reset().
diff mbox series

Patch

diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 44c55f7ceb39..45f574954fd6 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -166,6 +166,15 @@  static int report_resume(struct pci_dev *dev, void *data)
 	return 0;
 }
 
+static int report_disconnect(struct pci_dev *dev, void *data)
+{
+	device_lock(&dev->dev);	/* held across the state change and uevent */
+	pci_dev_set_disconnected(dev, NULL);	/* flag @dev disconnected */
+	pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);	/* ERS uevent */
+	device_unlock(&dev->dev);
+	return 0;	/* always 0; @data is unused */
+}
+
 /**
  * default_reset_link - default reset function
  * @dev: pointer to pci_dev data structure
@@ -271,6 +280,34 @@  static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
 	return result_data.result;
 }
 
+/**
+ * pcie_disconnect_device - remove devices below a bridge after failed recovery
+ * @dev: bridge whose subordinate bus experienced the unrecoverable error
+ *
+ * Called when error handling ends with PCI_ERS_RESULT_DISCONNECT, i.e. it has
+ * irrevocably failed. Broadcasts the disconnect through report_disconnect(),
+ * then ungracefully stops and removes every device on @dev's subordinate bus
+ * while holding the rescan/remove lock. The secondary bus is then reset, and
+ * the bus is re-enumerated only if the reset succeeded and the link is active.
+ */
+static void pcie_disconnect_device(struct pci_dev *dev)
+{
+	struct pci_bus *bus = dev->subordinate;
+	struct pci_dev *child, *tmp;
+
+	broadcast_error_message(dev, PCI_ERS_RESULT_DISCONNECT,
+				"disconnect", report_disconnect);
+	pci_lock_rescan_remove();
+	list_for_each_entry_safe(child, tmp, &bus->devices, bus_list)
+		pci_stop_and_remove_bus_device(child);
+
+	/* Re-enumerate only if the reset succeeded and the link is back up */
+	if (!pci_bridge_secondary_bus_reset(dev) &&
+	    pcie_wait_for_link(dev, true))
+		pci_rescan_bus(bus);
+	pci_unlock_rescan_remove();
+}
+
 static void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
 			     u32 service)
 {
@@ -313,12 +350,9 @@  static void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
 
 	pci_info(dev, "AER: Device recovery successful\n");
 	return;
-
 failed:
-	pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
-
-	/* TODO: Should kernel panic here? */
 	pci_info(dev, "AER: Device recovery failed\n");
+	pcie_disconnect_device(dev);
 }
 
 void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)