diff mbox series

[PATCHv3,05/10] PCI/ERR: Handle fatal error recovery

Message ID 20180918235702.26573-6-keith.busch@intel.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show
Series PCI error handling | expand

Commit Message

Keith Busch Sept. 18, 2018, 11:56 p.m. UTC
We don't need to be paranoid about the topology changing while handling an
error. If the device has changed in a hotplug capable slot, we can rely
on the presence detection handling to react to a changing topology. This
patch restores the fatal error handling behavior that existed before
merging DPC with AER with 7e9084b3674 ("PCI/AER: Handle ERR_FATAL with
removal and re-enumeration of devices").

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/pci/pci.h      |  4 +--
 drivers/pci/pcie/aer.c | 12 +++++---
 drivers/pci/pcie/dpc.c |  4 +--
 drivers/pci/pcie/err.c | 75 ++++----------------------------------------------
 4 files changed, 18 insertions(+), 77 deletions(-)

Comments

Bjorn Helgaas Sept. 19, 2018, 3:46 p.m. UTC | #1
On Tue, Sep 18, 2018 at 05:56:57PM -0600, Keith Busch wrote:
> We don't need to be paranoid about the topology changing while handling an
> error. If the device has changed in a hotplug capable slot, we can rely
> on the presence detection handling to react to a changing topology. This
> patch restores the fatal error handling behavior that existed before
> merging DPC with AER with 7e9084b3674 ("PCI/AER: Handle ERR_FATAL with
> removal and re-enumeration of devices").

7e9084b3674 updated Documentation/PCI/pci-error-recovery.txt; doesn't
this require another update to keep it matching the code?

> Signed-off-by: Keith Busch <keith.busch@intel.com>
> ---
>  drivers/pci/pci.h      |  4 +--
>  drivers/pci/pcie/aer.c | 12 +++++---
>  drivers/pci/pcie/dpc.c |  4 +--
>  drivers/pci/pcie/err.c | 75 ++++----------------------------------------------
>  4 files changed, 18 insertions(+), 77 deletions(-)
> 
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 21bfa30db18d..5a96978d3403 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -425,8 +425,8 @@ static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev)
>  #endif
>  
>  /* PCI error reporting and recovery */
> -void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service);
> -void pcie_do_nonfatal_recovery(struct pci_dev *dev);
> +void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
> +		      u32 service);
>  
>  bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
>  #ifdef CONFIG_PCIEASPM
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index b4d14acee66d..6b59a23568f8 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -1010,9 +1010,11 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
>  					info->status);
>  		pci_aer_clear_device_status(dev);
>  	} else if (info->severity == AER_NONFATAL)
> -		pcie_do_nonfatal_recovery(dev);
> +		pcie_do_recovery(dev, pci_channel_io_normal,
> +				 PCIE_PORT_SERVICE_AER);
>  	else if (info->severity == AER_FATAL)
> -		pcie_do_fatal_recovery(dev, PCIE_PORT_SERVICE_AER);
> +		pcie_do_recovery(dev, pci_channel_io_frozen,
> +				 PCIE_PORT_SERVICE_AER);
>  	pci_dev_put(dev);
>  }
>  
> @@ -1048,9 +1050,11 @@ static void aer_recover_work_func(struct work_struct *work)
>  		}
>  		cper_print_aer(pdev, entry.severity, entry.regs);
>  		if (entry.severity == AER_NONFATAL)
> -			pcie_do_nonfatal_recovery(pdev);
> +			pcie_do_recovery(pdev, pci_channel_io_normal,
> +					 PCIE_PORT_SERVICE_AER);
>  		else if (entry.severity == AER_FATAL)
> -			pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_AER);
> +			pcie_do_recovery(pdev, pci_channel_io_frozen,
> +					 PCIE_PORT_SERVICE_AER);
>  		pci_dev_put(pdev);
>  	}
>  }
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index aacfb50eccfc..1ed07db8ea7d 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -169,7 +169,7 @@ static irqreturn_t dpc_handler(int irq, void *context)
>  
>  	reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN) >> 1;
>  	ext_reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT) >> 5;
> -	dev_warn(dev, "DPC %s detected, remove downstream devices\n",
> +	dev_warn(dev, "DPC %s detected\n",
>  		 (reason == 0) ? "unmasked uncorrectable error" :
>  		 (reason == 1) ? "ERR_NONFATAL" :
>  		 (reason == 2) ? "ERR_FATAL" :
> @@ -186,7 +186,7 @@ static irqreturn_t dpc_handler(int irq, void *context)
>  	}
>  
>  	/* We configure DPC so it only triggers on ERR_FATAL */
> -	pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_DPC);
> +	pcie_do_recovery(pdev, pci_channel_io_frozen, PCIE_PORT_SERVICE_DPC);
>  
>  	return IRQ_HANDLED;
>  }
> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
> index 62ab665f0f03..644f3f725ef0 100644
> --- a/drivers/pci/pcie/err.c
> +++ b/drivers/pci/pcie/err.c
> @@ -271,83 +271,20 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
>  	return result_data.result;
>  }
>  
> -/**
> - * pcie_do_fatal_recovery - handle fatal error recovery process
> - * @dev: pointer to a pci_dev data structure of agent detecting an error
> - *
> - * Invoked when an error is fatal. Once being invoked, removes the devices
> - * beneath this AER agent, followed by reset link e.g. secondary bus reset
> - * followed by re-enumeration of devices.
> - */
> -void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
> -{
> -	struct pci_dev *udev;
> -	struct pci_bus *parent;
> -	struct pci_dev *pdev, *temp;
> -	pci_ers_result_t result;
> -
> -	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
> -		udev = dev;
> -	else
> -		udev = dev->bus->self;
> -
> -	parent = udev->subordinate;
> -	pci_walk_bus(parent, pci_dev_set_disconnected, NULL);
> -
> -	pci_lock_rescan_remove();
> -	pci_dev_get(dev);
> -	list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
> -					 bus_list) {
> -		pci_stop_and_remove_bus_device(pdev);
> -	}
> -
> -	result = reset_link(udev, service);
> -
> -	if ((service == PCIE_PORT_SERVICE_AER) &&
> -	    (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
> -		/*
> -		 * If the error is reported by a bridge, we think this error
> -		 * is related to the downstream link of the bridge, so we
> -		 * do error recovery on all subordinates of the bridge instead
> -		 * of the bridge and clear the error status of the bridge.
> -		 */
> -		pci_aer_clear_fatal_status(dev);
> -		pci_aer_clear_device_status(dev);
> -	}
> -
> -	if (result == PCI_ERS_RESULT_RECOVERED) {
> -		if (pcie_wait_for_link(udev, true))
> -			pci_rescan_bus(udev->bus);
> -		pci_info(dev, "Device recovery from fatal error successful\n");
> -	} else {
> -		pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
> -		pci_info(dev, "Device recovery from fatal error failed\n");
> -	}
> -
> -	pci_dev_put(dev);
> -	pci_unlock_rescan_remove();
> -}
> -
> -/**
> - * pcie_do_nonfatal_recovery - handle nonfatal error recovery process
> - * @dev: pointer to a pci_dev data structure of agent detecting an error
> - *
> - * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
> - * error detected message to all downstream drivers within a hierarchy in
> - * question and return the returned code.
> - */
> -void pcie_do_nonfatal_recovery(struct pci_dev *dev)
> +void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
> +		      u32 service)
>  {
>  	pci_ers_result_t status;
> -	enum pci_channel_state state;
> -
> -	state = pci_channel_io_normal;
>  
>  	status = broadcast_error_message(dev,
>  			state,
>  			"error_detected",
>  			report_error_detected);
>  
> +	if (state == pci_channel_io_frozen &&
> +	    reset_link(dev, service) != PCI_ERS_RESULT_RECOVERED)
> +		goto failed;
> +
>  	if (status == PCI_ERS_RESULT_CAN_RECOVER)
>  		status = broadcast_error_message(dev,
>  				state,
> -- 
> 2.14.4
>
Keith Busch Sept. 19, 2018, 3:52 p.m. UTC | #2
On Wed, Sep 19, 2018 at 10:46:51AM -0500, Bjorn Helgaas wrote:
> On Tue, Sep 18, 2018 at 05:56:57PM -0600, Keith Busch wrote:
> > We don't need to be paranoid about the topology changing while handling an
> > error. If the device has changed in a hotplug capable slot, we can rely
> > on the presence detection handling to react to a changing topology. This
> > patch restores the fatal error handling behavior that existed before
> > merging DPC with AER with 7e9084b3674 ("PCI/AER: Handle ERR_FATAL with
> > removal and re-enumeration of devices").
> 
> 7e9084b3674 updated Documentation/PCI/pci-error-recovery.txt; doesn't
> this require another update to keep it matching the code?

Indeed, I will fix that.
diff mbox series

Patch

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 21bfa30db18d..5a96978d3403 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -425,8 +425,8 @@  static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev)
 #endif
 
 /* PCI error reporting and recovery */
-void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service);
-void pcie_do_nonfatal_recovery(struct pci_dev *dev);
+void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
+		      u32 service);
 
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index b4d14acee66d..6b59a23568f8 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1010,9 +1010,11 @@  static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
 					info->status);
 		pci_aer_clear_device_status(dev);
 	} else if (info->severity == AER_NONFATAL)
-		pcie_do_nonfatal_recovery(dev);
+		pcie_do_recovery(dev, pci_channel_io_normal,
+				 PCIE_PORT_SERVICE_AER);
 	else if (info->severity == AER_FATAL)
-		pcie_do_fatal_recovery(dev, PCIE_PORT_SERVICE_AER);
+		pcie_do_recovery(dev, pci_channel_io_frozen,
+				 PCIE_PORT_SERVICE_AER);
 	pci_dev_put(dev);
 }
 
@@ -1048,9 +1050,11 @@  static void aer_recover_work_func(struct work_struct *work)
 		}
 		cper_print_aer(pdev, entry.severity, entry.regs);
 		if (entry.severity == AER_NONFATAL)
-			pcie_do_nonfatal_recovery(pdev);
+			pcie_do_recovery(pdev, pci_channel_io_normal,
+					 PCIE_PORT_SERVICE_AER);
 		else if (entry.severity == AER_FATAL)
-			pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_AER);
+			pcie_do_recovery(pdev, pci_channel_io_frozen,
+					 PCIE_PORT_SERVICE_AER);
 		pci_dev_put(pdev);
 	}
 }
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index aacfb50eccfc..1ed07db8ea7d 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -169,7 +169,7 @@  static irqreturn_t dpc_handler(int irq, void *context)
 
 	reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN) >> 1;
 	ext_reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT) >> 5;
-	dev_warn(dev, "DPC %s detected, remove downstream devices\n",
+	dev_warn(dev, "DPC %s detected\n",
 		 (reason == 0) ? "unmasked uncorrectable error" :
 		 (reason == 1) ? "ERR_NONFATAL" :
 		 (reason == 2) ? "ERR_FATAL" :
@@ -186,7 +186,7 @@  static irqreturn_t dpc_handler(int irq, void *context)
 	}
 
 	/* We configure DPC so it only triggers on ERR_FATAL */
-	pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_DPC);
+	pcie_do_recovery(pdev, pci_channel_io_frozen, PCIE_PORT_SERVICE_DPC);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 62ab665f0f03..644f3f725ef0 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -271,83 +271,20 @@  static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
 	return result_data.result;
 }
 
-/**
- * pcie_do_fatal_recovery - handle fatal error recovery process
- * @dev: pointer to a pci_dev data structure of agent detecting an error
- *
- * Invoked when an error is fatal. Once being invoked, removes the devices
- * beneath this AER agent, followed by reset link e.g. secondary bus reset
- * followed by re-enumeration of devices.
- */
-void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
-{
-	struct pci_dev *udev;
-	struct pci_bus *parent;
-	struct pci_dev *pdev, *temp;
-	pci_ers_result_t result;
-
-	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
-		udev = dev;
-	else
-		udev = dev->bus->self;
-
-	parent = udev->subordinate;
-	pci_walk_bus(parent, pci_dev_set_disconnected, NULL);
-
-	pci_lock_rescan_remove();
-	pci_dev_get(dev);
-	list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
-					 bus_list) {
-		pci_stop_and_remove_bus_device(pdev);
-	}
-
-	result = reset_link(udev, service);
-
-	if ((service == PCIE_PORT_SERVICE_AER) &&
-	    (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
-		/*
-		 * If the error is reported by a bridge, we think this error
-		 * is related to the downstream link of the bridge, so we
-		 * do error recovery on all subordinates of the bridge instead
-		 * of the bridge and clear the error status of the bridge.
-		 */
-		pci_aer_clear_fatal_status(dev);
-		pci_aer_clear_device_status(dev);
-	}
-
-	if (result == PCI_ERS_RESULT_RECOVERED) {
-		if (pcie_wait_for_link(udev, true))
-			pci_rescan_bus(udev->bus);
-		pci_info(dev, "Device recovery from fatal error successful\n");
-	} else {
-		pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
-		pci_info(dev, "Device recovery from fatal error failed\n");
-	}
-
-	pci_dev_put(dev);
-	pci_unlock_rescan_remove();
-}
-
-/**
- * pcie_do_nonfatal_recovery - handle nonfatal error recovery process
- * @dev: pointer to a pci_dev data structure of agent detecting an error
- *
- * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
- * error detected message to all downstream drivers within a hierarchy in
- * question and return the returned code.
- */
-void pcie_do_nonfatal_recovery(struct pci_dev *dev)
+void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
+		      u32 service)
 {
 	pci_ers_result_t status;
-	enum pci_channel_state state;
-
-	state = pci_channel_io_normal;
 
 	status = broadcast_error_message(dev,
 			state,
 			"error_detected",
 			report_error_detected);
 
+	if (state == pci_channel_io_frozen &&
+	    reset_link(dev, service) != PCI_ERS_RESULT_RECOVERED)
+		goto failed;
+
 	if (status == PCI_ERS_RESULT_CAN_RECOVER)
 		status = broadcast_error_message(dev,
 				state,