diff mbox series

[RFC] bonding: rate-limit bonding driver inspect messages

Message ID 20240215172554.4211-1-praveen.kannoju@oracle.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [RFC] bonding: rate-limit bonding driver inspect messages | expand

Checks

Context Check Description
netdev/series_format warning Single patches do not need cover letters; Target tree name not specified in the subject
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 989 this patch: 989
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 6 of 6 maintainers
netdev/build_clang success Errors and warnings before: 1006 this patch: 1006
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1006 this patch: 1006
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Praveen Kumar Kannoju Feb. 15, 2024, 5:25 p.m. UTC
Rate limit bond driver log messages, to prevent a log flood in a run-away
situation, e.g couldn't get rtnl lock. Message flood leads to instability
of system and loss of other crucial messages.

v2: Use exising net_ratelimit() instead of introducing new rate-limit
parameter.

Signed-off-by: Praveen Kumar Kannoju <praveen.kannoju@oracle.com>
---
 drivers/net/bonding/bond_main.c | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

Comments

Hangbin Liu Feb. 16, 2024, 9:03 a.m. UTC | #1
On Thu, Feb 15, 2024 at 10:55:54PM +0530, Praveen Kumar Kannoju wrote:
> Rate limit bond driver log messages, to prevent a log flood in a run-away
> situation, e.g couldn't get rtnl lock. Message flood leads to instability
> of system and loss of other crucial messages.

Hi Praveen,

The patch looks good to me. But would you please help explain why these
slave_info() are chosen under net_ratelimit?

Thanks
Hangbin
> 
> v2: Use exising net_ratelimit() instead of introducing new rate-limit
> parameter.
> 
> Signed-off-by: Praveen Kumar Kannoju <praveen.kannoju@oracle.com>
> ---
>  drivers/net/bonding/bond_main.c | 36 ++++++++++++++++++++----------------
>  1 file changed, 20 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 4e0600c..e92eba1 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -2610,12 +2610,13 @@ static int bond_miimon_inspect(struct bonding *bond)
>  			commit++;
>  			slave->delay = bond->params.downdelay;
>  			if (slave->delay) {
> -				slave_info(bond->dev, slave->dev, "link status down for %sinterface, disabling it in %d ms\n",
> -					   (BOND_MODE(bond) ==
> -					    BOND_MODE_ACTIVEBACKUP) ?
> -					    (bond_is_active_slave(slave) ?
> -					     "active " : "backup ") : "",
> -					   bond->params.downdelay * bond->params.miimon);
> +				if (net_ratelimit())
> +					slave_info(bond->dev, slave->dev, "link status down for %sinterface, disabling it in %d ms\n",
> +						   (BOND_MODE(bond) ==
> +						   BOND_MODE_ACTIVEBACKUP) ?
> +						   (bond_is_active_slave(slave) ?
> +						   "active " : "backup ") : "",
> +						   bond->params.downdelay * bond->params.miimon);
>  			}
>  			fallthrough;
>  		case BOND_LINK_FAIL:
> @@ -2623,9 +2624,10 @@ static int bond_miimon_inspect(struct bonding *bond)
>  				/* recovered before downdelay expired */
>  				bond_propose_link_state(slave, BOND_LINK_UP);
>  				slave->last_link_up = jiffies;
> -				slave_info(bond->dev, slave->dev, "link status up again after %d ms\n",
> -					   (bond->params.downdelay - slave->delay) *
> -					   bond->params.miimon);
> +				if (net_ratelimit())
> +					slave_info(bond->dev, slave->dev, "link status up again after %d ms\n",
> +						   (bond->params.downdelay - slave->delay) *
> +						   bond->params.miimon);
>  				commit++;
>  				continue;
>  			}
> @@ -2648,18 +2650,20 @@ static int bond_miimon_inspect(struct bonding *bond)
>  			slave->delay = bond->params.updelay;
>  
>  			if (slave->delay) {
> -				slave_info(bond->dev, slave->dev, "link status up, enabling it in %d ms\n",
> -					   ignore_updelay ? 0 :
> -					   bond->params.updelay *
> -					   bond->params.miimon);
> +				if (net_ratelimit())
> +					slave_info(bond->dev, slave->dev, "link status up, enabling it in %d ms\n",
> +						   ignore_updelay ? 0 :
> +						   bond->params.updelay *
> +						   bond->params.miimon);
>  			}
>  			fallthrough;
>  		case BOND_LINK_BACK:
>  			if (!link_state) {
>  				bond_propose_link_state(slave, BOND_LINK_DOWN);
> -				slave_info(bond->dev, slave->dev, "link status down again after %d ms\n",
> -					   (bond->params.updelay - slave->delay) *
> -					   bond->params.miimon);
> +				if (net_ratelimit())
> +					slave_info(bond->dev, slave->dev, "link status down again after %d ms\n",
> +						   (bond->params.updelay - slave->delay) *
> +						   bond->params.miimon);
>  				commit++;
>  				continue;
>  			}
> -- 
> 1.8.3.1
>
Praveen Kumar Kannoju Feb. 17, 2024, 12:39 p.m. UTC | #2
> -----Original Message-----
> From: Hangbin Liu <liuhangbin@gmail.com>
> Sent: 16 February 2024 02:33 PM
> To: Praveen Kannoju <praveen.kannoju@oracle.com>
> Cc: j.vosburgh@gmail.com; andy@greyhouse.net; davem@davemloft.net; edumazet@google.com; kuba@kernel.org;
> pabeni@redhat.com; netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Rajesh Sivaramasubramaniom
> <rajesh.sivaramasubramaniom@oracle.com>; Rama Nichanamatlu <rama.nichanamatlu@oracle.com>; Manjunath Patil
> <manjunath.b.patil@oracle.com>
> Subject: Re: [PATCH RFC] bonding: rate-limit bonding driver inspect messages
> 
> On Thu, Feb 15, 2024 at 10:55:54PM +0530, Praveen Kumar Kannoju wrote:
> > Rate limit bond driver log messages, to prevent a log flood in a
> > run-away situation, e.g couldn't get rtnl lock. Message flood leads to
> > instability of system and loss of other crucial messages.
> 
> Hi Praveen,
> 
> The patch looks good to me. But would you please help explain why these
> slave_info() are chosen under net_ratelimit?
> 
> Thanks
> Hangbin

Thank you, Hangbin.

The routine bond_mii_monitor() periodically inspects the slave carrier state in order to detect for state changes, on a state change internally records it and does the state change action.

Parked-to-Parked state changes goes through transient state. As an example for Up to Down, BOND_LINK_UP to BOND_LINK_DOWN, is thru BOND_LINK_FAIL.  In order to attain next parked state or transient state bond needs rtnl mutex. If in a situation it cannot get it, a state change wouldn't happen.  In order to achieve a state change as quickly as possible  bond_mii_monitor() reschedules itself to come around after 1 msec. And every single come around reinspects the link and sees a state change compared to its internally recorded, which in reality internal state could be not changed earlier as failed to get rtnl lock, and throws again log indicating it sees a state change. If attaining rtnl mutex take long say hypothetical 5 secs, then bond logs 5000 state change message. 1 message at every 1 msec. 

And in production environments we have seen bond taking long to achieve a state as someone else holding rtnl. Many processes do get rtnl lock.  As an example we can see eth drivers. They hold rtnl mutex for the entire duration while performing a fault recovery. There are many such scenarios.

This patch doesn't change -how- bond functions. It only simply limits this kind of log flood. 

-
Praveen.
> >
> > v2: Use exising net_ratelimit() instead of introducing new rate-limit
> > parameter.
> >
> > Signed-off-by: Praveen Kumar Kannoju <praveen.kannoju@oracle.com>
> > ---
> >  drivers/net/bonding/bond_main.c | 36
> > ++++++++++++++++++++----------------
> >  1 file changed, 20 insertions(+), 16 deletions(-)
> >
> > diff --git a/drivers/net/bonding/bond_main.c
> > b/drivers/net/bonding/bond_main.c index 4e0600c..e92eba1 100644
> > --- a/drivers/net/bonding/bond_main.c
> > +++ b/drivers/net/bonding/bond_main.c
> > @@ -2610,12 +2610,13 @@ static int bond_miimon_inspect(struct bonding *bond)
> >  			commit++;
> >  			slave->delay = bond->params.downdelay;
> >  			if (slave->delay) {
> > -				slave_info(bond->dev, slave->dev, "link status down for %sinterface, disabling it in %d ms\n",
> > -					   (BOND_MODE(bond) ==
> > -					    BOND_MODE_ACTIVEBACKUP) ?
> > -					    (bond_is_active_slave(slave) ?
> > -					     "active " : "backup ") : "",
> > -					   bond->params.downdelay * bond->params.miimon);
> > +				if (net_ratelimit())
> > +					slave_info(bond->dev, slave->dev, "link status down for %sinterface, disabling it in %d ms\n",
> > +						   (BOND_MODE(bond) ==
> > +						   BOND_MODE_ACTIVEBACKUP) ?
> > +						   (bond_is_active_slave(slave) ?
> > +						   "active " : "backup ") : "",
> > +						   bond->params.downdelay * bond->params.miimon);
> >  			}
> >  			fallthrough;
> >  		case BOND_LINK_FAIL:
> > @@ -2623,9 +2624,10 @@ static int bond_miimon_inspect(struct bonding *bond)
> >  				/* recovered before downdelay expired */
> >  				bond_propose_link_state(slave, BOND_LINK_UP);
> >  				slave->last_link_up = jiffies;
> > -				slave_info(bond->dev, slave->dev, "link status up again after %d ms\n",
> > -					   (bond->params.downdelay - slave->delay) *
> > -					   bond->params.miimon);
> > +				if (net_ratelimit())
> > +					slave_info(bond->dev, slave->dev, "link status up again after %d ms\n",
> > +						   (bond->params.downdelay - slave->delay) *
> > +						   bond->params.miimon);
> >  				commit++;
> >  				continue;
> >  			}
> > @@ -2648,18 +2650,20 @@ static int bond_miimon_inspect(struct bonding *bond)
> >  			slave->delay = bond->params.updelay;
> >
> >  			if (slave->delay) {
> > -				slave_info(bond->dev, slave->dev, "link status up, enabling it in %d ms\n",
> > -					   ignore_updelay ? 0 :
> > -					   bond->params.updelay *
> > -					   bond->params.miimon);
> > +				if (net_ratelimit())
> > +					slave_info(bond->dev, slave->dev, "link status up, enabling it in %d ms\n",
> > +						   ignore_updelay ? 0 :
> > +						   bond->params.updelay *
> > +						   bond->params.miimon);
> >  			}
> >  			fallthrough;
> >  		case BOND_LINK_BACK:
> >  			if (!link_state) {
> >  				bond_propose_link_state(slave, BOND_LINK_DOWN);
> > -				slave_info(bond->dev, slave->dev, "link status down again after %d ms\n",
> > -					   (bond->params.updelay - slave->delay) *
> > -					   bond->params.miimon);
> > +				if (net_ratelimit())
> > +					slave_info(bond->dev, slave->dev, "link status down again after %d ms\n",
> > +						   (bond->params.updelay - slave->delay) *
> > +						   bond->params.miimon);
> >  				commit++;
> >  				continue;
> >  			}
> > --
> > 1.8.3.1
> >
Hangbin Liu Feb. 18, 2024, 3:09 a.m. UTC | #3
On Sat, Feb 17, 2024 at 12:39:44PM +0000, Praveen Kannoju wrote:
> > -----Original Message-----
> > From: Hangbin Liu <liuhangbin@gmail.com>
> > Sent: 16 February 2024 02:33 PM
> > To: Praveen Kannoju <praveen.kannoju@oracle.com>
> > Cc: j.vosburgh@gmail.com; andy@greyhouse.net; davem@davemloft.net; edumazet@google.com; kuba@kernel.org;
> > pabeni@redhat.com; netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Rajesh Sivaramasubramaniom
> > <rajesh.sivaramasubramaniom@oracle.com>; Rama Nichanamatlu <rama.nichanamatlu@oracle.com>; Manjunath Patil
> > <manjunath.b.patil@oracle.com>
> > Subject: Re: [PATCH RFC] bonding: rate-limit bonding driver inspect messages
> > 
> > On Thu, Feb 15, 2024 at 10:55:54PM +0530, Praveen Kumar Kannoju wrote:
> > > Rate limit bond driver log messages, to prevent a log flood in a
> > > run-away situation, e.g couldn't get rtnl lock. Message flood leads to
> > > instability of system and loss of other crucial messages.
> > 
> > Hi Praveen,
> > 
> > The patch looks good to me. But would you please help explain why these
> > slave_info() are chosen under net_ratelimit?
> > 
> > Thanks
> > Hangbin
> 
> Thank you, Hangbin.
> 
> The routine bond_mii_monitor() periodically inspects the slave carrier state in order to detect for state changes, on a state change internally records it and does the state change action.
> 
> Parked-to-Parked state changes goes through transient state. As an example for Up to Down, BOND_LINK_UP to BOND_LINK_DOWN, is thru BOND_LINK_FAIL.  In order to attain next parked state or transient state bond needs rtnl mutex. If in a situation it cannot get it, a state change wouldn't happen.  In order to achieve a state change as quickly as possible  bond_mii_monitor() reschedules itself to come around after 1 msec. 

I think a large miimon downdelay/updelay setting could reduce this.

> And every single come around reinspects the link and sees a state change compared to its internally recorded, which in reality internal state could be not changed earlier as failed to get rtnl lock, and throws again log indicating it sees a state change. If attaining rtnl mutex take long say hypothetical 5 secs, then bond logs 5000 state change message. 1 message at every 1 msec. 

Anyway, setting the rate limit do reduce the message flood. Would you please
summarise the paragraph and add it in commit description when post the formal
patch?

thanks
Hangbin
Praveen Kumar Kannoju Feb. 19, 2024, 11:35 a.m. UTC | #4
> -----Original Message-----
> From: Hangbin Liu <liuhangbin@gmail.com>
> Sent: 18 February 2024 08:39 AM
> To: Praveen Kannoju <praveen.kannoju@oracle.com>
> Cc: j.vosburgh@gmail.com; andy@greyhouse.net; davem@davemloft.net; edumazet@google.com; kuba@kernel.org;
> pabeni@redhat.com; netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Rajesh Sivaramasubramaniom
> <rajesh.sivaramasubramaniom@oracle.com>; Rama Nichanamatlu <rama.nichanamatlu@oracle.com>; Manjunath Patil
> <manjunath.b.patil@oracle.com>
> Subject: Re: [PATCH RFC] bonding: rate-limit bonding driver inspect messages
> 
> On Sat, Feb 17, 2024 at 12:39:44PM +0000, Praveen Kannoju wrote:
> > > -----Original Message-----
> > > From: Hangbin Liu <liuhangbin@gmail.com>
> > > Sent: 16 February 2024 02:33 PM
> > > To: Praveen Kannoju <praveen.kannoju@oracle.com>
> > > Cc: j.vosburgh@gmail.com; andy@greyhouse.net; davem@davemloft.net;
> > > edumazet@google.com; kuba@kernel.org; pabeni@redhat.com;
> > > netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Rajesh
> > > Sivaramasubramaniom <rajesh.sivaramasubramaniom@oracle.com>; Rama
> > > Nichanamatlu <rama.nichanamatlu@oracle.com>; Manjunath Patil
> > > <manjunath.b.patil@oracle.com>
> > > Subject: Re: [PATCH RFC] bonding: rate-limit bonding driver inspect
> > > messages
> > >
> > > On Thu, Feb 15, 2024 at 10:55:54PM +0530, Praveen Kumar Kannoju wrote:
> > > > Rate limit bond driver log messages, to prevent a log flood in a
> > > > run-away situation, e.g couldn't get rtnl lock. Message flood
> > > > leads to instability of system and loss of other crucial messages.
> > >
> > > Hi Praveen,
> > >
> > > The patch looks good to me. But would you please help explain why
> > > these
> > > slave_info() are chosen under net_ratelimit?
> > >
> > > Thanks
> > > Hangbin
> >
> > Thank you, Hangbin.
> >
> > The routine bond_mii_monitor() periodically inspects the slave carrier state in order to detect for state changes, on a state change
> internally records it and does the state change action.
> >
> > Parked-to-Parked state changes goes through transient state. As an example for Up to Down, BOND_LINK_UP to
> BOND_LINK_DOWN, is thru BOND_LINK_FAIL.  In order to attain next parked state or transient state bond needs rtnl mutex. If in a
> situation it cannot get it, a state change wouldn't happen.  In order to achieve a state change as quickly as possible
> bond_mii_monitor() reschedules itself to come around after 1 msec.
> 
> I think a large miimon downdelay/updelay setting could reduce this.
> 
> > And every single come around reinspects the link and sees a state change compared to its internally recorded, which in reality
> internal state could be not changed earlier as failed to get rtnl lock, and throws again log indicating it sees a state change. If attaining
> rtnl mutex take long say hypothetical 5 secs, then bond logs 5000 state change message. 1 message at every 1 msec.
> 
> Anyway, setting the rate limit do reduce the message flood. Would you please summarise the paragraph and add it in commit
> description when post the formal patch?
> 
> thanks
> Hangbin

Thank you very much, Hangbin.

I've added the summary on why we intend to rate-limit the messages in the commit, and re-sent the formal patch.

-
Praveen.
diff mbox series

Patch

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 4e0600c..e92eba1 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2610,12 +2610,13 @@  static int bond_miimon_inspect(struct bonding *bond)
 			commit++;
 			slave->delay = bond->params.downdelay;
 			if (slave->delay) {
-				slave_info(bond->dev, slave->dev, "link status down for %sinterface, disabling it in %d ms\n",
-					   (BOND_MODE(bond) ==
-					    BOND_MODE_ACTIVEBACKUP) ?
-					    (bond_is_active_slave(slave) ?
-					     "active " : "backup ") : "",
-					   bond->params.downdelay * bond->params.miimon);
+				if (net_ratelimit())
+					slave_info(bond->dev, slave->dev, "link status down for %sinterface, disabling it in %d ms\n",
+						   (BOND_MODE(bond) ==
+						   BOND_MODE_ACTIVEBACKUP) ?
+						   (bond_is_active_slave(slave) ?
+						   "active " : "backup ") : "",
+						   bond->params.downdelay * bond->params.miimon);
 			}
 			fallthrough;
 		case BOND_LINK_FAIL:
@@ -2623,9 +2624,10 @@  static int bond_miimon_inspect(struct bonding *bond)
 				/* recovered before downdelay expired */
 				bond_propose_link_state(slave, BOND_LINK_UP);
 				slave->last_link_up = jiffies;
-				slave_info(bond->dev, slave->dev, "link status up again after %d ms\n",
-					   (bond->params.downdelay - slave->delay) *
-					   bond->params.miimon);
+				if (net_ratelimit())
+					slave_info(bond->dev, slave->dev, "link status up again after %d ms\n",
+						   (bond->params.downdelay - slave->delay) *
+						   bond->params.miimon);
 				commit++;
 				continue;
 			}
@@ -2648,18 +2650,20 @@  static int bond_miimon_inspect(struct bonding *bond)
 			slave->delay = bond->params.updelay;
 
 			if (slave->delay) {
-				slave_info(bond->dev, slave->dev, "link status up, enabling it in %d ms\n",
-					   ignore_updelay ? 0 :
-					   bond->params.updelay *
-					   bond->params.miimon);
+				if (net_ratelimit())
+					slave_info(bond->dev, slave->dev, "link status up, enabling it in %d ms\n",
+						   ignore_updelay ? 0 :
+						   bond->params.updelay *
+						   bond->params.miimon);
 			}
 			fallthrough;
 		case BOND_LINK_BACK:
 			if (!link_state) {
 				bond_propose_link_state(slave, BOND_LINK_DOWN);
-				slave_info(bond->dev, slave->dev, "link status down again after %d ms\n",
-					   (bond->params.updelay - slave->delay) *
-					   bond->params.miimon);
+				if (net_ratelimit())
+					slave_info(bond->dev, slave->dev, "link status down again after %d ms\n",
+						   (bond->params.updelay - slave->delay) *
+						   bond->params.miimon);
 				commit++;
 				continue;
 			}