diff mbox series

[RFC,net,1/2] nexthop: Do not flush blackhole nexthops when loopback goes down

Message ID 20210228142613.1642938-2-idosch@idosch.org (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series nexthop: Do not flush blackhole nexthops when loopback goes down | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net
netdev/subject_prefix success Link
netdev/cc_maintainers fail 1 blamed authors not CCed: dsahern@kernel.org; 2 maintainers not CCed: dsahern@kernel.org yoshfuji@linux-ipv6.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 2 this patch: 2
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 31 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 2 this patch: 2
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Ido Schimmel Feb. 28, 2021, 2:26 p.m. UTC
From: Ido Schimmel <idosch@nvidia.com>

As far as user space is concerned, blackhole nexthops do not have a
nexthop device and therefore should not be affected by the
administrative or carrier state of any netdev.

However, when the loopback netdev goes down all the blackhole nexthops
are flushed. This happens because internally the kernel associates
blackhole nexthops with the loopback netdev.

This behavior is both confusing to those not familiar with kernel
internals and also diverges from the legacy API where blackhole IPv4
routes are not flushed when the loopback netdev goes down:

 # ip route add blackhole 198.51.100.0/24
 # ip link set dev lo down
 # ip route show 198.51.100.0/24
 blackhole 198.51.100.0/24

Blackhole IPv6 routes are flushed, but at least user space knows that
they are associated with the loopback netdev:

 # ip -6 route show 2001:db8:1::/64
 blackhole 2001:db8:1::/64 dev lo metric 1024 pref medium

Fix this by only flushing blackhole nexthops when the loopback netdev is
unregistered.

Fixes: ab84be7e54fc ("net: Initial nexthop code")
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reported-by: Donald Sharp <sharpd@nvidia.com>
---
 net/ipv4/nexthop.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

Comments

David Ahern Feb. 28, 2021, 11:40 p.m. UTC | #1
On 2/28/21 7:26 AM, Ido Schimmel wrote:
> From: Ido Schimmel <idosch@nvidia.com>
> 
> As far as user space is concerned, blackhole nexthops do not have a
> nexthop device and therefore should not be affected by the
> administrative or carrier state of any netdev.
> 
> However, when the loopback netdev goes down all the blackhole nexthops
> are flushed. This happens because internally the kernel associates
> blackhole nexthops with the loopback netdev.

That was not intended, so definitely a bug.

> 
> This behavior is both confusing to those not familiar with kernel
> internals and also diverges from the legacy API where blackhole IPv4
> routes are not flushed when the loopback netdev goes down:
> 
>  # ip route add blackhole 198.51.100.0/24
>  # ip link set dev lo down
>  # ip route show 198.51.100.0/24
>  blackhole 198.51.100.0/24
> 
> Blackhole IPv6 routes are flushed, but at least user space knows that
> they are associated with the loopback netdev:
> 
>  # ip -6 route show 2001:db8:1::/64
>  blackhole 2001:db8:1::/64 dev lo metric 1024 pref medium
> 
> Fix this by only flushing blackhole nexthops when the loopback netdev is
> unregistered.
> 
> Fixes: ab84be7e54fc ("net: Initial nexthop code")
> Signed-off-by: Ido Schimmel <idosch@nvidia.com>
> Reported-by: Donald Sharp <sharpd@nvidia.com>
> ---
>  net/ipv4/nexthop.c | 10 +++++++---
>  1 file changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
> index f1c6cbdb9e43..743777bce179 100644
> --- a/net/ipv4/nexthop.c
> +++ b/net/ipv4/nexthop.c
> @@ -1399,7 +1399,7 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh,
>  
>  /* rtnl */
>  /* remove all nexthops tied to a device being deleted */
> -static void nexthop_flush_dev(struct net_device *dev)
> +static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
>  {
>  	unsigned int hash = nh_dev_hashfn(dev->ifindex);
>  	struct net *net = dev_net(dev);
> @@ -1411,6 +1411,10 @@ static void nexthop_flush_dev(struct net_device *dev)
>  		if (nhi->fib_nhc.nhc_dev != dev)
>  			continue;
>  
> +		if (nhi->reject_nh &&
> +		    (event == NETDEV_DOWN || event == NETDEV_CHANGE))
> +			continue;
> +
>  		remove_nexthop(net, nhi->nh_parent, NULL);
>  	}
>  }
> @@ -2189,11 +2193,11 @@ static int nh_netdev_event(struct notifier_block *this,
>  	switch (event) {
>  	case NETDEV_DOWN:
>  	case NETDEV_UNREGISTER:
> -		nexthop_flush_dev(dev);
> +		nexthop_flush_dev(dev, event);
>  		break;
>  	case NETDEV_CHANGE:
>  		if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
> -			nexthop_flush_dev(dev);
> +			nexthop_flush_dev(dev, event);
>  		break;
>  	case NETDEV_CHANGEMTU:
>  		info_ext = ptr;
> 

LGTM. I suggest submitting without the RFC.

Reviewed-by: David Ahern <dsahern@gmail.com>
Ido Schimmel March 1, 2021, 8:27 a.m. UTC | #2
On Sun, Feb 28, 2021 at 04:40:13PM -0700, David Ahern wrote:
> LGTM. I suggest submitting without the RFC.
> 
> Reviewed-by: David Ahern <dsahern@gmail.com>

Thanks, David. Will let it go through regression and submit later this
week.
diff mbox series

Patch

diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index f1c6cbdb9e43..743777bce179 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1399,7 +1399,7 @@  static int insert_nexthop(struct net *net, struct nexthop *new_nh,
 
 /* rtnl */
 /* remove all nexthops tied to a device being deleted */
-static void nexthop_flush_dev(struct net_device *dev)
+static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
 {
 	unsigned int hash = nh_dev_hashfn(dev->ifindex);
 	struct net *net = dev_net(dev);
@@ -1411,6 +1411,10 @@  static void nexthop_flush_dev(struct net_device *dev)
 		if (nhi->fib_nhc.nhc_dev != dev)
 			continue;
 
+		if (nhi->reject_nh &&
+		    (event == NETDEV_DOWN || event == NETDEV_CHANGE))
+			continue;
+
 		remove_nexthop(net, nhi->nh_parent, NULL);
 	}
 }
@@ -2189,11 +2193,11 @@  static int nh_netdev_event(struct notifier_block *this,
 	switch (event) {
 	case NETDEV_DOWN:
 	case NETDEV_UNREGISTER:
-		nexthop_flush_dev(dev);
+		nexthop_flush_dev(dev, event);
 		break;
 	case NETDEV_CHANGE:
 		if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
-			nexthop_flush_dev(dev);
+			nexthop_flush_dev(dev, event);
 		break;
 	case NETDEV_CHANGEMTU:
 		info_ext = ptr;