diff mbox series

[net] ice: Fix ice module unload

Message ID 20230523173033.3577110-1-anthony.l.nguyen@intel.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net] ice: Fix ice module unload | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/cc_maintainers success CCed 6 of 6 maintainers
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 43 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Tony Nguyen May 23, 2023, 5:30 p.m. UTC
From: Jakub Buchocki <jakubx.buchocki@intel.com>

Clearing interrupt scheme before PFR reset, during the removal routine,
could cause the hardware errors and possibly lead to system reboot, as
the PF reset can cause the interrupt to be generated.
Move clearing interrupt scheme from device deinitialization subprocedure,
and call it directly in particular routines. In ice_remove(), call the
ice_clear_interrupt_scheme() after the PFR is complete and all pending
transactions are done.

Error example:
[   75.229328] ice 0000:ca:00.1: Failed to read Tx Scheduler Tree - User Selection data from flash
[   77.571315] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1
[   77.571418] {1}[Hardware Error]: event severity: recoverable
[   77.571459] {1}[Hardware Error]:  Error 0, type: recoverable
[   77.571500] {1}[Hardware Error]:   section_type: PCIe error
[   77.571540] {1}[Hardware Error]:   port_type: 4, root port
[   77.571580] {1}[Hardware Error]:   version: 3.0
[   77.571615] {1}[Hardware Error]:   command: 0x0547, status: 0x4010
[   77.571661] {1}[Hardware Error]:   device_id: 0000:c9:02.0
[   77.571703] {1}[Hardware Error]:   slot: 25
[   77.571736] {1}[Hardware Error]:   secondary_bus: 0xca
[   77.571773] {1}[Hardware Error]:   vendor_id: 0x8086, device_id: 0x347a
[   77.571821] {1}[Hardware Error]:   class_code: 060400
[   77.571858] {1}[Hardware Error]:   bridge: secondary_status: 0x2800, control: 0x0013
[   77.572490] pcieport 0000:c9:02.0: AER: aer_status: 0x00200000, aer_mask: 0x00100020
[   77.572870] pcieport 0000:c9:02.0:    [21] ACSViol                (First)
[   77.573222] pcieport 0000:c9:02.0: AER: aer_layer=Transaction Layer, aer_agent=Receiver ID
[   77.573554] pcieport 0000:c9:02.0: AER: aer_uncor_severity: 0x00463010
[   77.691273] {2}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1
[   77.691738] {2}[Hardware Error]: event severity: recoverable
[   77.691971] {2}[Hardware Error]:  Error 0, type: recoverable
[   77.692192] {2}[Hardware Error]:   section_type: PCIe error
[   77.692403] {2}[Hardware Error]:   port_type: 4, root port
[   77.692616] {2}[Hardware Error]:   version: 3.0
[   77.692825] {2}[Hardware Error]:   command: 0x0547, status: 0x4010
[   77.693032] {2}[Hardware Error]:   device_id: 0000:c9:02.0
[   77.693238] {2}[Hardware Error]:   slot: 25
[   77.693440] {2}[Hardware Error]:   secondary_bus: 0xca
[   77.693641] {2}[Hardware Error]:   vendor_id: 0x8086, device_id: 0x347a
[   77.693853] {2}[Hardware Error]:   class_code: 060400
[   77.694054] {2}[Hardware Error]:   bridge: secondary_status: 0x0800, control: 0x0013
[   77.719115] pci 0000:ca:00.1: AER: can't recover (no error_detected callback)
[   77.719140] pcieport 0000:c9:02.0: AER: device recovery failed
[   77.719216] pcieport 0000:c9:02.0: AER: aer_status: 0x00200000, aer_mask: 0x00100020
[   77.719390] pcieport 0000:c9:02.0:    [21] ACSViol                (First)
[   77.719557] pcieport 0000:c9:02.0: AER: aer_layer=Transaction Layer, aer_agent=Receiver ID
[   77.719723] pcieport 0000:c9:02.0: AER: aer_uncor_severity: 0x00463010

Fixes: 5b246e533d01 ("ice: split probe into smaller functions")
Signed-off-by: Jakub Buchocki <jakubx.buchocki@intel.com>
Reviewed-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Tested-by: Arpana Arland <arpanax.arland@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_main.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

Comments

Jiri Pirko May 24, 2023, 11:45 a.m. UTC | #1
Tue, May 23, 2023 at 07:30:33PM CEST, anthony.l.nguyen@intel.com wrote:
>From: Jakub Buchocki <jakubx.buchocki@intel.com>
>
>Clearing interrupt scheme before PFR reset, during the removal routine,
>could cause the hardware errors and possibly lead to system reboot, as
>the PF reset can cause the interrupt to be generated.
>Move clearing interrupt scheme from device deinitialization subprocedure,
>and call it directly in particular routines. In ice_remove(), call the
>ice_clear_interrupt_scheme() after the PFR is complete and all pending
>transactions are done.
>
>Error example:
>[   75.229328] ice 0000:ca:00.1: Failed to read Tx Scheduler Tree - User Selection data from flash
>[   77.571315] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1
>[   77.571418] {1}[Hardware Error]: event severity: recoverable
>[   77.571459] {1}[Hardware Error]:  Error 0, type: recoverable
>[   77.571500] {1}[Hardware Error]:   section_type: PCIe error
>[   77.571540] {1}[Hardware Error]:   port_type: 4, root port
>[   77.571580] {1}[Hardware Error]:   version: 3.0
>[   77.571615] {1}[Hardware Error]:   command: 0x0547, status: 0x4010
>[   77.571661] {1}[Hardware Error]:   device_id: 0000:c9:02.0
>[   77.571703] {1}[Hardware Error]:   slot: 25
>[   77.571736] {1}[Hardware Error]:   secondary_bus: 0xca
>[   77.571773] {1}[Hardware Error]:   vendor_id: 0x8086, device_id: 0x347a
>[   77.571821] {1}[Hardware Error]:   class_code: 060400
>[   77.571858] {1}[Hardware Error]:   bridge: secondary_status: 0x2800, control: 0x0013
>[   77.572490] pcieport 0000:c9:02.0: AER: aer_status: 0x00200000, aer_mask: 0x00100020
>[   77.572870] pcieport 0000:c9:02.0:    [21] ACSViol                (First)
>[   77.573222] pcieport 0000:c9:02.0: AER: aer_layer=Transaction Layer, aer_agent=Receiver ID
>[   77.573554] pcieport 0000:c9:02.0: AER: aer_uncor_severity: 0x00463010
>[   77.691273] {2}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1
>[   77.691738] {2}[Hardware Error]: event severity: recoverable
>[   77.691971] {2}[Hardware Error]:  Error 0, type: recoverable
>[   77.692192] {2}[Hardware Error]:   section_type: PCIe error
>[   77.692403] {2}[Hardware Error]:   port_type: 4, root port
>[   77.692616] {2}[Hardware Error]:   version: 3.0
>[   77.692825] {2}[Hardware Error]:   command: 0x0547, status: 0x4010
>[   77.693032] {2}[Hardware Error]:   device_id: 0000:c9:02.0
>[   77.693238] {2}[Hardware Error]:   slot: 25
>[   77.693440] {2}[Hardware Error]:   secondary_bus: 0xca
>[   77.693641] {2}[Hardware Error]:   vendor_id: 0x8086, device_id: 0x347a
>[   77.693853] {2}[Hardware Error]:   class_code: 060400
>[   77.694054] {2}[Hardware Error]:   bridge: secondary_status: 0x0800, control: 0x0013
>[   77.719115] pci 0000:ca:00.1: AER: can't recover (no error_detected callback)
>[   77.719140] pcieport 0000:c9:02.0: AER: device recovery failed
>[   77.719216] pcieport 0000:c9:02.0: AER: aer_status: 0x00200000, aer_mask: 0x00100020
>[   77.719390] pcieport 0000:c9:02.0:    [21] ACSViol                (First)
>[   77.719557] pcieport 0000:c9:02.0: AER: aer_layer=Transaction Layer, aer_agent=Receiver ID
>[   77.719723] pcieport 0000:c9:02.0: AER: aer_uncor_severity: 0x00463010
>
>Fixes: 5b246e533d01 ("ice: split probe into smaller functions")
>Signed-off-by: Jakub Buchocki <jakubx.buchocki@intel.com>
>Reviewed-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
>Tested-by: Arpana Arland <arpanax.arland@intel.com> (A Contingent worker at Intel)
>Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
>---
> drivers/net/ethernet/intel/ice/ice_main.c | 7 ++++++-
> 1 file changed, 6 insertions(+), 1 deletion(-)
>
>diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
>index a1f7c8edc22f..5052250b147e 100644
>--- a/drivers/net/ethernet/intel/ice/ice_main.c
>+++ b/drivers/net/ethernet/intel/ice/ice_main.c
>@@ -4802,7 +4802,6 @@ static int ice_init_dev(struct ice_pf *pf)
> static void ice_deinit_dev(struct ice_pf *pf)
> {
> 	ice_free_irq_msix_misc(pf);
>-	ice_clear_interrupt_scheme(pf);
> 	ice_deinit_pf(pf);
> 	ice_deinit_hw(&pf->hw);
> }
>@@ -5071,6 +5070,7 @@ static int ice_init(struct ice_pf *pf)
> 	ice_dealloc_vsis(pf);
> err_alloc_vsis:
> 	ice_deinit_dev(pf);
>+	ice_clear_interrupt_scheme(pf);

Can't you maintain the same order of calling
ice_clear_interrupt_scheme() and ice_deinit_pf()?

> 	return err;
> }
> 
>@@ -5098,6 +5098,8 @@ int ice_load(struct ice_pf *pf)
> 	if (err)
> 		return err;

Don't you need pci_wait_for_pending_transaction() here as well?

Btw, why can't you do reset in ice_unload to follow the same patterns as
probe/remove?


> 
>+	ice_clear_interrupt_scheme(pf);
>+
> 	err = ice_init_dev(pf);
> 	if (err)
> 		return err;
>@@ -5132,6 +5134,7 @@ int ice_load(struct ice_pf *pf)
> 	ice_vsi_decfg(ice_get_main_vsi(pf));
> err_vsi_cfg:
> 	ice_deinit_dev(pf);
>+	ice_clear_interrupt_scheme(pf);
> 	return err;
> }
> 
>@@ -5251,6 +5254,7 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
> 	ice_deinit_eth(pf);
> err_init_eth:
> 	ice_deinit(pf);
>+	ice_clear_interrupt_scheme(pf);
> err_init:
> 	pci_disable_device(pdev);
> 	return err;
>@@ -5360,6 +5364,7 @@ static void ice_remove(struct pci_dev *pdev)
> 	 */
> 	ice_reset(&pf->hw, ICE_RESET_PFR);
> 	pci_wait_for_pending_transaction(pdev);
>+	ice_clear_interrupt_scheme(pf);
> 	pci_disable_device(pdev);
> }
> 
>-- 
>2.38.1
>
>
Buchocki, JakubX May 30, 2023, 3:11 p.m. UTC | #2
On 5/24/2023 1:45 PM, Jiri Pirko wrote:
> Tue, May 23, 2023 at 07:30:33PM CEST, anthony.l.nguyen@intel.com wrote:
>> From: Jakub Buchocki <jakubx.buchocki@intel.com>
>>
>> Clearing interrupt scheme before PFR reset, during the removal routine,
>> could cause the hardware errors and possibly lead to system reboot, as
>> the PF reset can cause the interrupt to be generated.
>> Move clearing interrupt scheme from device deinitialization subprocedure,
>> and call it directly in particular routines. In ice_remove(), call the
>> ice_clear_interrupt_scheme() after the PFR is complete and all pending
>> transactions are done.
>>
>> Error example:
>> [   75.229328] ice 0000:ca:00.1: Failed to read Tx Scheduler Tree - User Selection data from flash
>> [   77.571315] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1
>> [   77.571418] {1}[Hardware Error]: event severity: recoverable
>> [   77.571459] {1}[Hardware Error]:  Error 0, type: recoverable
>> [   77.571500] {1}[Hardware Error]:   section_type: PCIe error
>> [   77.571540] {1}[Hardware Error]:   port_type: 4, root port
>> [   77.571580] {1}[Hardware Error]:   version: 3.0
>> [   77.571615] {1}[Hardware Error]:   command: 0x0547, status: 0x4010
>> [   77.571661] {1}[Hardware Error]:   device_id: 0000:c9:02.0
>> [   77.571703] {1}[Hardware Error]:   slot: 25
>> [   77.571736] {1}[Hardware Error]:   secondary_bus: 0xca
>> [   77.571773] {1}[Hardware Error]:   vendor_id: 0x8086, device_id: 0x347a
>> [   77.571821] {1}[Hardware Error]:   class_code: 060400
>> [   77.571858] {1}[Hardware Error]:   bridge: secondary_status: 0x2800, control: 0x0013
>> [   77.572490] pcieport 0000:c9:02.0: AER: aer_status: 0x00200000, aer_mask: 0x00100020
>> [   77.572870] pcieport 0000:c9:02.0:    [21] ACSViol                (First)
>> [   77.573222] pcieport 0000:c9:02.0: AER: aer_layer=Transaction Layer, aer_agent=Receiver ID
>> [   77.573554] pcieport 0000:c9:02.0: AER: aer_uncor_severity: 0x00463010
>> [   77.691273] {2}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1
>> [   77.691738] {2}[Hardware Error]: event severity: recoverable
>> [   77.691971] {2}[Hardware Error]:  Error 0, type: recoverable
>> [   77.692192] {2}[Hardware Error]:   section_type: PCIe error
>> [   77.692403] {2}[Hardware Error]:   port_type: 4, root port
>> [   77.692616] {2}[Hardware Error]:   version: 3.0
>> [   77.692825] {2}[Hardware Error]:   command: 0x0547, status: 0x4010
>> [   77.693032] {2}[Hardware Error]:   device_id: 0000:c9:02.0
>> [   77.693238] {2}[Hardware Error]:   slot: 25
>> [   77.693440] {2}[Hardware Error]:   secondary_bus: 0xca
>> [   77.693641] {2}[Hardware Error]:   vendor_id: 0x8086, device_id: 0x347a
>> [   77.693853] {2}[Hardware Error]:   class_code: 060400
>> [   77.694054] {2}[Hardware Error]:   bridge: secondary_status: 0x0800, control: 0x0013
>> [   77.719115] pci 0000:ca:00.1: AER: can't recover (no error_detected callback)
>> [   77.719140] pcieport 0000:c9:02.0: AER: device recovery failed
>> [   77.719216] pcieport 0000:c9:02.0: AER: aer_status: 0x00200000, aer_mask: 0x00100020
>> [   77.719390] pcieport 0000:c9:02.0:    [21] ACSViol                (First)
>> [   77.719557] pcieport 0000:c9:02.0: AER: aer_layer=Transaction Layer, aer_agent=Receiver ID
>> [   77.719723] pcieport 0000:c9:02.0: AER: aer_uncor_severity: 0x00463010
>>
>> Fixes: 5b246e533d01 ("ice: split probe into smaller functions")
>> Signed-off-by: Jakub Buchocki <jakubx.buchocki@intel.com>
>> Reviewed-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
>> Tested-by: Arpana Arland <arpanax.arland@intel.com> (A Contingent worker at Intel)
>> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
>> ---
>> drivers/net/ethernet/intel/ice/ice_main.c | 7 ++++++-
>> 1 file changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
>> index a1f7c8edc22f..5052250b147e 100644
>> --- a/drivers/net/ethernet/intel/ice/ice_main.c
>> +++ b/drivers/net/ethernet/intel/ice/ice_main.c
>> @@ -4802,7 +4802,6 @@ static int ice_init_dev(struct ice_pf *pf)
>> static void ice_deinit_dev(struct ice_pf *pf)
>> {
>> 	ice_free_irq_msix_misc(pf);
>> -	ice_clear_interrupt_scheme(pf);
>> 	ice_deinit_pf(pf);
>> 	ice_deinit_hw(&pf->hw);
>> }
>> @@ -5071,6 +5070,7 @@ static int ice_init(struct ice_pf *pf)
>> 	ice_dealloc_vsis(pf);
>> err_alloc_vsis:
>> 	ice_deinit_dev(pf);
>> +	ice_clear_interrupt_scheme(pf);
> 
> Can't you maintain the same order of calling
> ice_clear_interrupt_scheme() and ice_deinit_pf()?
> 

Sorry, for the delayed response.
We tried to maintain the same order and move the PFR reset and
pci_wait_for_pending_transaction() to ice_deinit_dev().
The code looked cleaner but it made the unloading process much longer (~6s/PF).
At this point, to keep the order of calls we would have to break the
ice_deinit_dev() back to individual calls and it doesn't seem to be
that beneficial.

>> 	return err;
>> }
>>
>> @@ -5098,6 +5098,8 @@ int ice_load(struct ice_pf *pf)
>> 	if (err)
>> 		return err;
> 
> Don't you need pci_wait_for_pending_transaction() here as well?
> 
> Btw, why can't you do reset in ice_unload to follow the same patterns as
> probe/remove?
> 
> 

This patch was limited to the changes required to fix the issue, targeted for net.
Your ask is accurate and it might be a good change, but I would prefer to work
on it in another series (for net-next), so it can be properly tested.

>>
>> +	ice_clear_interrupt_scheme(pf);
>> +
>> 	err = ice_init_dev(pf);
>> 	if (err)
>> 		return err;
>> @@ -5132,6 +5134,7 @@ int ice_load(struct ice_pf *pf)
>> 	ice_vsi_decfg(ice_get_main_vsi(pf));
>> err_vsi_cfg:
>> 	ice_deinit_dev(pf);
>> +	ice_clear_interrupt_scheme(pf);
>> 	return err;
>> }
>>
>> @@ -5251,6 +5254,7 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
>> 	ice_deinit_eth(pf);
>> err_init_eth:
>> 	ice_deinit(pf);
>> +	ice_clear_interrupt_scheme(pf);
>> err_init:
>> 	pci_disable_device(pdev);
>> 	return err;
>> @@ -5360,6 +5364,7 @@ static void ice_remove(struct pci_dev *pdev)
>> 	 */
>> 	ice_reset(&pf->hw, ICE_RESET_PFR);
>> 	pci_wait_for_pending_transaction(pdev);
>> +	ice_clear_interrupt_scheme(pf);
>> 	pci_disable_device(pdev);
>> }
>>
>> -- 
>> 2.38.1
>>
>>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index a1f7c8edc22f..5052250b147e 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -4802,7 +4802,6 @@  static int ice_init_dev(struct ice_pf *pf)
 static void ice_deinit_dev(struct ice_pf *pf)
 {
 	ice_free_irq_msix_misc(pf);
-	ice_clear_interrupt_scheme(pf);
 	ice_deinit_pf(pf);
 	ice_deinit_hw(&pf->hw);
 }
@@ -5071,6 +5070,7 @@  static int ice_init(struct ice_pf *pf)
 	ice_dealloc_vsis(pf);
 err_alloc_vsis:
 	ice_deinit_dev(pf);
+	ice_clear_interrupt_scheme(pf);
 	return err;
 }
 
@@ -5098,6 +5098,8 @@  int ice_load(struct ice_pf *pf)
 	if (err)
 		return err;
 
+	ice_clear_interrupt_scheme(pf);
+
 	err = ice_init_dev(pf);
 	if (err)
 		return err;
@@ -5132,6 +5134,7 @@  int ice_load(struct ice_pf *pf)
 	ice_vsi_decfg(ice_get_main_vsi(pf));
 err_vsi_cfg:
 	ice_deinit_dev(pf);
+	ice_clear_interrupt_scheme(pf);
 	return err;
 }
 
@@ -5251,6 +5254,7 @@  ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
 	ice_deinit_eth(pf);
 err_init_eth:
 	ice_deinit(pf);
+	ice_clear_interrupt_scheme(pf);
 err_init:
 	pci_disable_device(pdev);
 	return err;
@@ -5360,6 +5364,7 @@  static void ice_remove(struct pci_dev *pdev)
 	 */
 	ice_reset(&pf->hw, ICE_RESET_PFR);
 	pci_wait_for_pending_transaction(pdev);
+	ice_clear_interrupt_scheme(pf);
 	pci_disable_device(pdev);
 }