diff mbox

[12/12] pcie, aer: report all error before recovery

Message ID 4AA4C17B.6010105@jp.fujitsu.com (mailing list archive)
State Accepted, archived
Headers show

Commit Message

Hidetoshi Seto Sept. 7, 2009, 8:16 a.m. UTC
This patch is required not to lost error records by action invoked on
error recovery, such as slot reset etc.

Following sample (real machine + dummy record injected by aer-inject)
shows that record of 28:00.1 could not be retrieved by recovery of 28:00.0:

- Before:

pcieport-driver 0000:00:02.0: AER: Multiple Uncorrected (Non-Fatal) error received: id=2801
e1000e 0000:28:00.0: PCIE Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, id=2800(Receiver ID)
e1000e 0000:28:00.0:   device [8086:1096] error status/mask=00001000/00100000
e1000e 0000:28:00.0:    [12] Poisoned TLP           (First)
e1000e 0000:28:00.0:   TLP Header: 00000000 00000001 00000002 00000003
e1000e 0000:28:00.0: broadcast error_detected message
e1000e 0000:28:00.0: broadcast slot_reset message
e1000e 0000:28:00.0: setting latency timer to 64
e1000e 0000:28:00.0: restoring config space at offset 0x1 (was 0x100547, writing 0x100147)
e1000e 0000:28:00.0: PME# disabled
e1000e 0000:28:00.0: PME# disabled
e1000e 0000:28:00.1: setting latency timer to 64
e1000e 0000:28:00.1: restoring config space at offset 0x1 (was 0x100547, writing 0x100147)
e1000e 0000:28:00.1: PME# disabled
e1000e 0000:28:00.1: PME# disabled
e1000e 0000:28:00.0: broadcast resume message
e1000e 0000:28:00.0: AER driver successfully recovered
e1000e: eth0 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX/TX

- After:

pcieport-driver 0000:00:02.0: AER: Multiple Uncorrected (Non-Fatal) error received: id=2801
e1000e 0000:28:00.0: PCIE Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, id=2800(Receiver ID)
e1000e 0000:28:00.0:   device [8086:1096] error status/mask=00001000/00100000
e1000e 0000:28:00.0:    [12] Poisoned TLP           (First)
e1000e 0000:28:00.0:   TLP Header: 00000000 00000001 00000002 00000003
e1000e 0000:28:00.1: PCIE Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, id=2801(Receiver ID)
e1000e 0000:28:00.1:   device [8086:1096] error status/mask=00081000/00100000
e1000e 0000:28:00.1:    [12] Poisoned TLP           (First)
e1000e 0000:28:00.1:    [19] ECRC
e1000e 0000:28:00.1:   TLP Header: 00000000 00000001 00000002 00000003
e1000e 0000:28:00.1:   Error of this Agent(2801) is reported first
e1000e 0000:28:00.0: broadcast error_detected message
e1000e 0000:28:00.0: broadcast slot_reset message
e1000e 0000:28:00.0: setting latency timer to 64
e1000e 0000:28:00.0: restoring config space at offset 0x1 (was 0x100547, writing 0x100147)
e1000e 0000:28:00.0: PME# disabled
e1000e 0000:28:00.0: PME# disabled
e1000e 0000:28:00.1: setting latency timer to 64
e1000e 0000:28:00.1: restoring config space at offset 0x1 (was 0x100547, writing 0x100147)
e1000e 0000:28:00.1: PME# disabled
e1000e 0000:28:00.1: PME# disabled
e1000e 0000:28:00.0: broadcast resume message
e1000e 0000:28:00.0: AER driver successfully recovered
e1000e: eth0 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX/TX

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 drivers/pci/pcie/aer/aerdrv.h      |    2 --
 drivers/pci/pcie/aer/aerdrv_core.c |   29 ++++++++++++++++++-----------
 2 files changed, 18 insertions(+), 13 deletions(-)
diff mbox

Patch

diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index f9979eb..bd833ea 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -29,8 +29,6 @@ 
 #define ERR_COR_ID(d)			(d & 0xffff)
 #define ERR_UNCOR_ID(d)			(d >> 16)
 
-#define AER_SUCCESS			0
-#define AER_UNSUCCESS			1
 #define AER_ERROR_SOURCES_MAX		100
 
 #define AER_LOG_TLP_MASKS		(PCI_ERR_UNC_POISON_TLP|	\
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index d9185cd..9f5ccbe 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -696,6 +696,13 @@  static struct aer_err_source *get_e_source(struct aer_rpc *rpc)
 	return e_source;
 }
 
+/**
+ * get_device_error_info - read error status from dev and store it to info
+ * @dev: pointer to the device expected to have a error record
+ * @info: pointer to structure to store the error record
+ *
+ * Return 1 on success, 0 on error.
+ */
 static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
 {
 	int pos, temp;
@@ -707,7 +714,7 @@  static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
 
 	/* The device might not support AER */
 	if (!pos)
-		return AER_SUCCESS;
+		return 1;
 
 	if (info->severity == AER_CORRECTABLE) {
 		pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS,
@@ -715,7 +722,7 @@  static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
 		pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK,
 			&info->mask);
 		if (!(info->status & ~info->mask))
-			return AER_UNSUCCESS;
+			return 0;
 	} else if (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE ||
 		info->severity == AER_NONFATAL) {
 
@@ -725,7 +732,7 @@  static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
 		pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK,
 			&info->mask);
 		if (!(info->status & ~info->mask))
-			return AER_UNSUCCESS;
+			return 0;
 
 		/* Get First Error Pointer */
 		pci_read_config_dword(dev, pos + PCI_ERR_CAP, &temp);
@@ -744,7 +751,7 @@  static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
 		}
 	}
 
-	return AER_SUCCESS;
+	return 1;
 }
 
 static inline void aer_process_err_devices(struct pcie_device *p_device,
@@ -758,14 +765,14 @@  static inline void aer_process_err_devices(struct pcie_device *p_device,
 				e_info->id);
 	}
 
+	/* Report all before handle them, not to lost records by reset etc. */
 	for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
-		if (get_device_error_info(e_info->dev[i], e_info) ==
-				AER_SUCCESS) {
+		if (get_device_error_info(e_info->dev[i], e_info))
 			aer_print_error(e_info->dev[i], e_info);
-			handle_error_source(p_device,
-					e_info->dev[i],
-					e_info);
-		}
+	}
+	for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
+		if (get_device_error_info(e_info->dev[i], e_info))
+			handle_error_source(p_device, e_info->dev[i], e_info);
 	}
 }
 
@@ -870,5 +877,5 @@  int aer_init(struct pcie_device *dev)
 	if (aer_osc_setup(dev) && !forceload)
 		return -ENXIO;
 
-	return AER_SUCCESS;
+	return 0;
 }