diff mbox series

bnx2x: Fix error recovering in switch configuration

Message ID 20220825200029.4143670-1-thinhtr@linux.vnet.ibm.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series bnx2x: Fix error recovering in switch configuration | expand

Checks

Context Check Description
netdev/tree_selection success Guessed tree name to be net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix warning Target tree name not specified in the subject
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 102 this patch: 102
netdev/cc_maintainers warning 2 maintainers not CCed: edumazet@google.com pabeni@redhat.com
netdev/build_clang success Errors and warnings before: 2 this patch: 2
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 102 this patch: 102
netdev/checkpatch warning WARNING: line length of 89 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 10 this patch: 10
netdev/source_inline success Was 0 now: 0

Commit Message

Thinh Tran Aug. 25, 2022, 8 p.m. UTC
As the BCM57810 and other I/O adapters are connected
through a PCIe switch, the bnx2x driver causes unexpected
system hang/crash while handling PCIe switch errors, if 
its error handler is called after other drivers' handlers.

In this case, after numbers of bnx2x_tx_timout(), the
bnx2x_nic_unload() is  called, frees up resources and
calls bnx2x_napi_disable(). Then when EEH calls its
error handler, the bnx2x_io_error_detected() and
bnx2x_io_slot_reset() also calling bnx2x_napi_disable()
and freeing the resources.

This patch will:
- reduce the numbers of bnx2x_panic_dump() while in
  bnx2x_tx_timeout(), avoid filling up dmesg buffer.
- use checking new napi_enable flags to prevent calling 
  disable again which causing system hangs.
- cheking if fp->page_pool already freed avoid system
  crash.

Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  4 ++++
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 24 ++++++++++++++++++-
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 +++
 3 files changed, 30 insertions(+), 1 deletion(-)

Comments

Jakub Kicinski Aug. 27, 2022, 1:44 a.m. UTC | #1
On Thu, 25 Aug 2022 20:00:29 +0000 Thinh Tran wrote:
> As the BCM57810 and other I/O adapters are connected
> through a PCIe switch, the bnx2x driver causes unexpected
> system hang/crash while handling PCIe switch errors, if 
> its error handler is called after other drivers' handlers.
> 
> In this case, after numbers of bnx2x_tx_timout(), the
> bnx2x_nic_unload() is  called, frees up resources and
> calls bnx2x_napi_disable(). Then when EEH calls its
> error handler, the bnx2x_io_error_detected() and
> bnx2x_io_slot_reset() also calling bnx2x_napi_disable()
> and freeing the resources.
> 
> This patch will:
> - reduce the numbers of bnx2x_panic_dump() while in
>   bnx2x_tx_timeout(), avoid filling up dmesg buffer.
> - use checking new napi_enable flags to prevent calling 
>   disable again which causing system hangs.
> - cheking if fp->page_pool already freed avoid system
>   crash.

> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> index 712b5595bc39..bb8d91f44642 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> @@ -1860,37 +1860,49 @@ static int bnx2x_setup_irqs(struct bnx2x *bp)
>  static void bnx2x_napi_enable_cnic(struct bnx2x *bp)
>  {
>  	int i;
> +	if (bp->cnic_napi_enable)

empty line between variables and code, pls

> +		return;
>  
>  	for_each_rx_queue_cnic(bp, i) {
>  		napi_enable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->cnic_napi_enable = true;

The concept of not calling napi_enable() / disable()
feels a little wrong. It's the state of the driver,
not the NAPI that's the problem so perhaps you could
a appropriately named bool for that (IDK, maybe 
nic_stopped) and prevent coming into the NAPI handling
functions completely?

Is all other code in the driver on the path in question 
really idempotent?

>  }
>  
>  static void bnx2x_napi_enable(struct bnx2x *bp)
>  {
>  	int i;
> +	if (bp->napi_enable)
> +		return;
>  
>  	for_each_eth_queue(bp, i) {
>  		napi_enable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->napi_enable = true;
>  }
>  
>  static void bnx2x_napi_disable_cnic(struct bnx2x *bp)
>  {
>  	int i;
> +	if (!bp->cnic_napi_enable)
> +		return;
>  
>  	for_each_rx_queue_cnic(bp, i) {
>  		napi_disable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->cnic_napi_enable = false;
>  }
>  
>  static void bnx2x_napi_disable(struct bnx2x *bp)
>  {
>  	int i;
> +	if (!bp->napi_enable)
> +		return;
>  
>  	for_each_eth_queue(bp, i) {
>  		napi_disable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->napi_enable = false;
>  }
>  
>  void bnx2x_netif_start(struct bnx2x *bp)
> @@ -2554,6 +2566,7 @@ int bnx2x_load_cnic(struct bnx2x *bp)
>  	}
>  
>  	/* Add all CNIC NAPI objects */
> +	bp->cnic_napi_enable = false;
>  	bnx2x_add_all_napi_cnic(bp);
>  	DP(NETIF_MSG_IFUP, "cnic napi added\n");
>  	bnx2x_napi_enable_cnic(bp);
> @@ -2701,7 +2714,9 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
>  	 */
>  	bnx2x_setup_tc(bp->dev, bp->max_cos);
>  
> +	bp->tx_timeout_cnt = 0;
>  	/* Add all NAPI objects */
> +	bp->napi_enable = false;
>  	bnx2x_add_all_napi(bp);
>  	DP(NETIF_MSG_IFUP, "napi added\n");
>  	bnx2x_napi_enable(bp);
> @@ -4982,7 +4997,14 @@ void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
>  	 */
>  	if (!bp->panic)
>  #ifndef BNX2X_STOP_ON_ERROR
> -		bnx2x_panic_dump(bp, false);
> +	{
> +		if (++bp->tx_timeout_cnt > 3) {
> +			bnx2x_panic_dump(bp, false);
> +			bp->tx_timeout_cnt = 0;
> +		} else {
> +			netdev_err(bp->dev, "TX timeout %d times\n", bp->tx_timeout_cnt);
> +		}
> +	}

Please put this code in a helper function so that the oddly looking
brackets are not needed.

>  #else
>  		bnx2x_panic();
>  #endif
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
> index d8b1824c334d..7e1d38a2c7ec 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
> @@ -1018,6 +1018,9 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
>  	if (fp->mode == TPA_MODE_DISABLED)
>  		return;
>  
> +	if (!fp->page_pool.page)
> +		return;

See, another thing that's not idempotent. Better to bail higher up,
in the callee.

>  	for (i = 0; i < last; i++)
>  		bnx2x_free_rx_sge(bp, fp, i);
>
Manish Chopra Aug. 30, 2022, 9:19 a.m. UTC | #2
> -----Original Message-----
> From: Thinh Tran <thinhtr@linux.vnet.ibm.com>
> Sent: Friday, August 26, 2022 1:30 AM
> To: kuba@kernel.org
> Cc: netdev@vger.kernel.org; Ariel Elior <aelior@marvell.com>;
> davem@davemloft.net; Manish Chopra <manishc@marvell.com>; Sudarsana
> Reddy Kalluru <skalluru@marvell.com>; Thinh Tran
> <thinhtr@linux.vnet.ibm.com>
> Subject: [EXT] [PATCH] bnx2x: Fix error recovering in switch configuration
> 
> External Email
> 
> ----------------------------------------------------------------------
> As the BCM57810 and other I/O adapters are connected through a PCIe
> switch, the bnx2x driver causes unexpected system hang/crash while handling
> PCIe switch errors, if its error handler is called after other drivers' handlers.
> 
> In this case, after numbers of bnx2x_tx_timout(), the
> bnx2x_nic_unload() is  called, frees up resources and calls
> bnx2x_napi_disable(). Then when EEH calls its error handler, the
> bnx2x_io_error_detected() and
> bnx2x_io_slot_reset() also calling bnx2x_napi_disable() and freeing the
> resources.
> 
> This patch will:
> - reduce the numbers of bnx2x_panic_dump() while in
>   bnx2x_tx_timeout(), avoid filling up dmesg buffer.
> - use checking new napi_enable flags to prevent calling
>   disable again which causing system hangs.
> - cheking if fp->page_pool already freed avoid system
>   crash.
> 
> Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
> ---
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  4 ++++
>  .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 24 ++++++++++++++++++-
>  .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 +++
>  3 files changed, 30 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> index dd5945c4bfec..7fa23d47907a 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> @@ -1509,6 +1509,10 @@ struct bnx2x {
>  	bool			cnic_loaded;
>  	struct cnic_eth_dev	*(*cnic_probe)(struct net_device *);
> 
> +	bool			napi_enable;
> +	bool			cnic_napi_enable;
> +	int			tx_timeout_cnt;
> +
>  	/* Flag that indicates that we can start looking for FCoE L2 queue
>  	 * completions in the default status block.
>  	 */
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> index 712b5595bc39..bb8d91f44642 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> @@ -1860,37 +1860,49 @@ static int bnx2x_setup_irqs(struct bnx2x *bp)
> static void bnx2x_napi_enable_cnic(struct bnx2x *bp)  {
>  	int i;
> +	if (bp->cnic_napi_enable)
> +		return;
> 
>  	for_each_rx_queue_cnic(bp, i) {
>  		napi_enable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->cnic_napi_enable = true;
>  }
> 
>  static void bnx2x_napi_enable(struct bnx2x *bp)  {
>  	int i;
> +	if (bp->napi_enable)
> +		return;
> 
>  	for_each_eth_queue(bp, i) {
>  		napi_enable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->napi_enable = true;
>  }
> 
>  static void bnx2x_napi_disable_cnic(struct bnx2x *bp)  {
>  	int i;
> +	if (!bp->cnic_napi_enable)
> +		return;
> 
>  	for_each_rx_queue_cnic(bp, i) {
>  		napi_disable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->cnic_napi_enable = false;
>  }
> 
>  static void bnx2x_napi_disable(struct bnx2x *bp)  {
>  	int i;
> +	if (!bp->napi_enable)
> +		return;
> 
>  	for_each_eth_queue(bp, i) {
>  		napi_disable(&bnx2x_fp(bp, i, napi));
>  	}
> +	bp->napi_enable = false;
>  }
> 
>  void bnx2x_netif_start(struct bnx2x *bp) @@ -2554,6 +2566,7 @@ int
> bnx2x_load_cnic(struct bnx2x *bp)
>  	}
> 
>  	/* Add all CNIC NAPI objects */
> +	bp->cnic_napi_enable = false;
>  	bnx2x_add_all_napi_cnic(bp);
>  	DP(NETIF_MSG_IFUP, "cnic napi added\n");
>  	bnx2x_napi_enable_cnic(bp);
> @@ -2701,7 +2714,9 @@ int bnx2x_nic_load(struct bnx2x *bp, int
> load_mode)
>  	 */
>  	bnx2x_setup_tc(bp->dev, bp->max_cos);
> 
> +	bp->tx_timeout_cnt = 0;
>  	/* Add all NAPI objects */
> +	bp->napi_enable = false;
>  	bnx2x_add_all_napi(bp);
>  	DP(NETIF_MSG_IFUP, "napi added\n");
>  	bnx2x_napi_enable(bp);
> @@ -4982,7 +4997,14 @@ void bnx2x_tx_timeout(struct net_device *dev,
> unsigned int txqueue)
>  	 */
>  	if (!bp->panic)
>  #ifndef BNX2X_STOP_ON_ERROR
> -		bnx2x_panic_dump(bp, false);
> +	{
> +		if (++bp->tx_timeout_cnt > 3) {
> +			bnx2x_panic_dump(bp, false);
> +			bp->tx_timeout_cnt = 0;
> +		} else {
> +			netdev_err(bp->dev, "TX timeout %d times\n", bp-
> >tx_timeout_cnt);
> +		}
> +	

Hello Trinh,
bnx2x_panic_dump() dumps quite of some useful debug (fastpath, hw related) info to look at in the logs,
I think they should be dumped at very first occurrence of tx timeout rather than waiting on next few tx timeout
event (which may not even occur in some cases). One another way to prevent it could be by turning off the
carrier (netif_carrier_off()) at very first place in bnx2x_tx_timeout() to prevent such repeated occurrences of tx timeout on the netdev.

Thanks,
Manish
Thinh Tran Aug. 30, 2022, 4:15 p.m. UTC | #3
Thanks for reviewing it.

On 8/26/2022 8:44 PM, Jakub Kicinski wrote:
> On Thu, 25 Aug 2022 20:00:29 +0000 Thinh Tran wrote:
>> As the BCM57810 and other I/O adapters are connected
>> through a PCIe switch, the bnx2x driver causes unexpected
>> system hang/crash while handling PCIe switch errors, if
>> its error handler is called after other drivers' handlers.
>>
>> In this case, after numbers of bnx2x_tx_timout(), the
>> bnx2x_nic_unload() is  called, frees up resources and
>> calls bnx2x_napi_disable(). Then when EEH calls its
>> error handler, the bnx2x_io_error_detected() and
>> bnx2x_io_slot_reset() also calling bnx2x_napi_disable()
>> and freeing the resources.
>>
>> This patch will:
>> - reduce the numbers of bnx2x_panic_dump() while in
>>    bnx2x_tx_timeout(), avoid filling up dmesg buffer.
>> - use checking new napi_enable flags to prevent calling
>>    disable again which causing system hangs.
>> - cheking if fp->page_pool already freed avoid system
>>    crash.
> 
>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
>> index 712b5595bc39..bb8d91f44642 100644
>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
>> @@ -1860,37 +1860,49 @@ static int bnx2x_setup_irqs(struct bnx2x *bp)
>>   static void bnx2x_napi_enable_cnic(struct bnx2x *bp)
>>   {
>>   	int i;
>> +	if (bp->cnic_napi_enable)
> 
> empty line between variables and code, pls
> 
Will correct it.

>> +		return;
>>   
>>   	for_each_rx_queue_cnic(bp, i) {
>>   		napi_enable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->cnic_napi_enable = true;
> 
> The concept of not calling napi_enable() / disable()
> feels a little wrong. It's the state of the driver,
> not the NAPI that's the problem so perhaps you could
> a appropriately named bool for that (IDK, maybe
> nic_stopped) and prevent coming into the NAPI handling
> functions completely?
> > Is all other code in the driver on the path in question
> really idempotent?
> 

For the states of the driver, it already has bnx2x_netif_start()
and bnx2x_netif_stop() to handle NAPI functions.
But the bnx2x_nic_load() directly calls bnx2x_napi_enable() and 
_disable() in case of errors, and is being called from various functions 
while the driver in different states.

I will work with the suggestion.


>>   }
>>   
>>   static void bnx2x_napi_enable(struct bnx2x *bp)
>>   {
>>   	int i;
>> +	if (bp->napi_enable)
>> +		return;
>>   
>>   	for_each_eth_queue(bp, i) {
>>   		napi_enable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->napi_enable = true;
>>   }
>>   
>>   static void bnx2x_napi_disable_cnic(struct bnx2x *bp)
>>   {
>>   	int i;
>> +	if (!bp->cnic_napi_enable)
>> +		return;
>>   
>>   	for_each_rx_queue_cnic(bp, i) {
>>   		napi_disable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->cnic_napi_enable = false;
>>   }
>>   
>>   static void bnx2x_napi_disable(struct bnx2x *bp)
>>   {
>>   	int i;
>> +	if (!bp->napi_enable)
>> +		return;
>>   
>>   	for_each_eth_queue(bp, i) {
>>   		napi_disable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->napi_enable = false;
>>   }
>>   
>>   void bnx2x_netif_start(struct bnx2x *bp)
>> @@ -2554,6 +2566,7 @@ int bnx2x_load_cnic(struct bnx2x *bp)
>>   	}
>>   
>>   	/* Add all CNIC NAPI objects */
>> +	bp->cnic_napi_enable = false;
>>   	bnx2x_add_all_napi_cnic(bp);
>>   	DP(NETIF_MSG_IFUP, "cnic napi added\n");
>>   	bnx2x_napi_enable_cnic(bp);
>> @@ -2701,7 +2714,9 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
>>   	 */
>>   	bnx2x_setup_tc(bp->dev, bp->max_cos);
>>   
>> +	bp->tx_timeout_cnt = 0;
>>   	/* Add all NAPI objects */
>> +	bp->napi_enable = false;
>>   	bnx2x_add_all_napi(bp);
>>   	DP(NETIF_MSG_IFUP, "napi added\n");
>>   	bnx2x_napi_enable(bp);
>> @@ -4982,7 +4997,14 @@ void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
>>   	 */
>>   	if (!bp->panic)
>>   #ifndef BNX2X_STOP_ON_ERROR
>> -		bnx2x_panic_dump(bp, false);
>> +	{
>> +		if (++bp->tx_timeout_cnt > 3) {
>> +			bnx2x_panic_dump(bp, false);
>> +			bp->tx_timeout_cnt = 0;
>> +		} else {
>> +			netdev_err(bp->dev, "TX timeout %d times\n", bp->tx_timeout_cnt);
>> +		}
>> +	}
> 
> Please put this code in a helper function so that the oddly looking
> brackets are not needed.
> 
>>   #else
>>   		bnx2x_panic();
>>   #endif
>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
>> index d8b1824c334d..7e1d38a2c7ec 100644
>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
>> @@ -1018,6 +1018,9 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
>>   	if (fp->mode == TPA_MODE_DISABLED)
>>   		return;
>>   
>> +	if (!fp->page_pool.page)
>> +		return;
> 
> See, another thing that's not idempotent. Better to bail higher up,
> in the callee.
>

Will correct it.

>>   	for (i = 0; i < last; i++)
>>   		bnx2x_free_rx_sge(bp, fp, i);
>>   
> 


Thinh Tran
Thinh Tran Aug. 30, 2022, 9:01 p.m. UTC | #4
On 8/30/2022 4:19 AM, Manish Chopra wrote:
>> -----Original Message-----
>> From: Thinh Tran <thinhtr@linux.vnet.ibm.com>
>> Sent: Friday, August 26, 2022 1:30 AM
>> To: kuba@kernel.org
>> Cc: netdev@vger.kernel.org; Ariel Elior <aelior@marvell.com>;
>> davem@davemloft.net; Manish Chopra <manishc@marvell.com>; Sudarsana
>> Reddy Kalluru <skalluru@marvell.com>; Thinh Tran
>> <thinhtr@linux.vnet.ibm.com>
>> Subject: [EXT] [PATCH] bnx2x: Fix error recovering in switch configuration
>>
>> External Email
>>
>> ----------------------------------------------------------------------
>> As the BCM57810 and other I/O adapters are connected through a PCIe
>> switch, the bnx2x driver causes unexpected system hang/crash while handling
>> PCIe switch errors, if its error handler is called after other drivers' handlers.
>>
>> In this case, after numbers of bnx2x_tx_timout(), the
>> bnx2x_nic_unload() is  called, frees up resources and calls
>> bnx2x_napi_disable(). Then when EEH calls its error handler, the
>> bnx2x_io_error_detected() and
>> bnx2x_io_slot_reset() also calling bnx2x_napi_disable() and freeing the
>> resources.
>>
>> This patch will:
>> - reduce the numbers of bnx2x_panic_dump() while in
>>    bnx2x_tx_timeout(), avoid filling up dmesg buffer.
>> - use checking new napi_enable flags to prevent calling
>>    disable again which causing system hangs.
>> - cheking if fp->page_pool already freed avoid system
>>    crash.
>>
>> Signed-off-by: Thinh Tran <thinhtr@linux.vnet.ibm.com>
>> ---
>>   drivers/net/ethernet/broadcom/bnx2x/bnx2x.h   |  4 ++++
>>   .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   | 24 ++++++++++++++++++-
>>   .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 +++
>>   3 files changed, 30 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>> index dd5945c4bfec..7fa23d47907a 100644
>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
>> @@ -1509,6 +1509,10 @@ struct bnx2x {
>>   	bool			cnic_loaded;
>>   	struct cnic_eth_dev	*(*cnic_probe)(struct net_device *);
>>
>> +	bool			napi_enable;
>> +	bool			cnic_napi_enable;
>> +	int			tx_timeout_cnt;
>> +
>>   	/* Flag that indicates that we can start looking for FCoE L2 queue
>>   	 * completions in the default status block.
>>   	 */
>> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
>> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
>> index 712b5595bc39..bb8d91f44642 100644
>> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
>> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
>> @@ -1860,37 +1860,49 @@ static int bnx2x_setup_irqs(struct bnx2x *bp)
>> static void bnx2x_napi_enable_cnic(struct bnx2x *bp)  {
>>   	int i;
>> +	if (bp->cnic_napi_enable)
>> +		return;
>>
>>   	for_each_rx_queue_cnic(bp, i) {
>>   		napi_enable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->cnic_napi_enable = true;
>>   }
>>
>>   static void bnx2x_napi_enable(struct bnx2x *bp)  {
>>   	int i;
>> +	if (bp->napi_enable)
>> +		return;
>>
>>   	for_each_eth_queue(bp, i) {
>>   		napi_enable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->napi_enable = true;
>>   }
>>
>>   static void bnx2x_napi_disable_cnic(struct bnx2x *bp)  {
>>   	int i;
>> +	if (!bp->cnic_napi_enable)
>> +		return;
>>
>>   	for_each_rx_queue_cnic(bp, i) {
>>   		napi_disable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->cnic_napi_enable = false;
>>   }
>>
>>   static void bnx2x_napi_disable(struct bnx2x *bp)  {
>>   	int i;
>> +	if (!bp->napi_enable)
>> +		return;
>>
>>   	for_each_eth_queue(bp, i) {
>>   		napi_disable(&bnx2x_fp(bp, i, napi));
>>   	}
>> +	bp->napi_enable = false;
>>   }
>>
>>   void bnx2x_netif_start(struct bnx2x *bp) @@ -2554,6 +2566,7 @@ int
>> bnx2x_load_cnic(struct bnx2x *bp)
>>   	}
>>
>>   	/* Add all CNIC NAPI objects */
>> +	bp->cnic_napi_enable = false;
>>   	bnx2x_add_all_napi_cnic(bp);
>>   	DP(NETIF_MSG_IFUP, "cnic napi added\n");
>>   	bnx2x_napi_enable_cnic(bp);
>> @@ -2701,7 +2714,9 @@ int bnx2x_nic_load(struct bnx2x *bp, int
>> load_mode)
>>   	 */
>>   	bnx2x_setup_tc(bp->dev, bp->max_cos);
>>
>> +	bp->tx_timeout_cnt = 0;
>>   	/* Add all NAPI objects */
>> +	bp->napi_enable = false;
>>   	bnx2x_add_all_napi(bp);
>>   	DP(NETIF_MSG_IFUP, "napi added\n");
>>   	bnx2x_napi_enable(bp);
>> @@ -4982,7 +4997,14 @@ void bnx2x_tx_timeout(struct net_device *dev,
>> unsigned int txqueue)
>>   	 */
>>   	if (!bp->panic)
>>   #ifndef BNX2X_STOP_ON_ERROR
>> -		bnx2x_panic_dump(bp, false);
>> +	{
>> +		if (++bp->tx_timeout_cnt > 3) {
>> +			bnx2x_panic_dump(bp, false);
>> +			bp->tx_timeout_cnt = 0;
>> +		} else {
>> +			netdev_err(bp->dev, "TX timeout %d times\n", bp-
>>> tx_timeout_cnt);
>> +		}
>> +	
> 
> Hello Trinh,
> bnx2x_panic_dump() dumps quite of some useful debug (fastpath, hw related) info to look at in the logs,
> I think they should be dumped at very first occurrence of tx timeout rather than waiting on next few tx timeout
> event (which may not even occur in some cases). One another way to prevent it could be by turning off the
> carrier (netif_carrier_off()) at very first place in bnx2x_tx_timeout() to prevent such repeated occurrences of tx timeout on the netdev.
> 
> Thanks,
> Manish

will do

Thanks,
Thinh Tran
diff mbox series

Patch

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index dd5945c4bfec..7fa23d47907a 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -1509,6 +1509,10 @@  struct bnx2x {
 	bool			cnic_loaded;
 	struct cnic_eth_dev	*(*cnic_probe)(struct net_device *);
 
+	bool			napi_enable;
+	bool			cnic_napi_enable;
+	int			tx_timeout_cnt;
+
 	/* Flag that indicates that we can start looking for FCoE L2 queue
 	 * completions in the default status block.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 712b5595bc39..bb8d91f44642 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -1860,37 +1860,49 @@  static int bnx2x_setup_irqs(struct bnx2x *bp)
 static void bnx2x_napi_enable_cnic(struct bnx2x *bp)
 {
 	int i;
+	if (bp->cnic_napi_enable)
+		return;
 
 	for_each_rx_queue_cnic(bp, i) {
 		napi_enable(&bnx2x_fp(bp, i, napi));
 	}
+	bp->cnic_napi_enable = true;
 }
 
 static void bnx2x_napi_enable(struct bnx2x *bp)
 {
 	int i;
+	if (bp->napi_enable)
+		return;
 
 	for_each_eth_queue(bp, i) {
 		napi_enable(&bnx2x_fp(bp, i, napi));
 	}
+	bp->napi_enable = true;
 }
 
 static void bnx2x_napi_disable_cnic(struct bnx2x *bp)
 {
 	int i;
+	if (!bp->cnic_napi_enable)
+		return;
 
 	for_each_rx_queue_cnic(bp, i) {
 		napi_disable(&bnx2x_fp(bp, i, napi));
 	}
+	bp->cnic_napi_enable = false;
 }
 
 static void bnx2x_napi_disable(struct bnx2x *bp)
 {
 	int i;
+	if (!bp->napi_enable)
+		return;
 
 	for_each_eth_queue(bp, i) {
 		napi_disable(&bnx2x_fp(bp, i, napi));
 	}
+	bp->napi_enable = false;
 }
 
 void bnx2x_netif_start(struct bnx2x *bp)
@@ -2554,6 +2566,7 @@  int bnx2x_load_cnic(struct bnx2x *bp)
 	}
 
 	/* Add all CNIC NAPI objects */
+	bp->cnic_napi_enable = false;
 	bnx2x_add_all_napi_cnic(bp);
 	DP(NETIF_MSG_IFUP, "cnic napi added\n");
 	bnx2x_napi_enable_cnic(bp);
@@ -2701,7 +2714,9 @@  int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 	 */
 	bnx2x_setup_tc(bp->dev, bp->max_cos);
 
+	bp->tx_timeout_cnt = 0;
 	/* Add all NAPI objects */
+	bp->napi_enable = false;
 	bnx2x_add_all_napi(bp);
 	DP(NETIF_MSG_IFUP, "napi added\n");
 	bnx2x_napi_enable(bp);
@@ -4982,7 +4997,14 @@  void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue)
 	 */
 	if (!bp->panic)
 #ifndef BNX2X_STOP_ON_ERROR
-		bnx2x_panic_dump(bp, false);
+	{
+		if (++bp->tx_timeout_cnt > 3) {
+			bnx2x_panic_dump(bp, false);
+			bp->tx_timeout_cnt = 0;
+		} else {
+			netdev_err(bp->dev, "TX timeout %d times\n", bp->tx_timeout_cnt);
+		}
+	}
 #else
 		bnx2x_panic();
 #endif
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index d8b1824c334d..7e1d38a2c7ec 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -1018,6 +1018,9 @@  static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 	if (fp->mode == TPA_MODE_DISABLED)
 		return;
 
+	if (!fp->page_pool.page)
+		return;
+
 	for (i = 0; i < last; i++)
 		bnx2x_free_rx_sge(bp, fp, i);