Message ID | 20211121152453.2580051-4-razor@blackwall.org (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | net: nexthop: fix refcount issues when replacing groups | expand |
On Sun, Nov 21, 2021 at 05:24:53PM +0200, Nikolay Aleksandrov wrote: > From: Nikolay Aleksandrov <nikolay@nvidia.com> > > The new selftest runs a sequence which causes circular refcount > dependency between deleted objects which cannot be released and results > in a netdevice refcount imbalance. > > Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com> > --- > tools/testing/selftests/net/fib_nexthops.sh | 56 +++++++++++++++++++++ > 1 file changed, 56 insertions(+) > > diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh > index b5a69ad191b0..48d88a36ae27 100755 > --- a/tools/testing/selftests/net/fib_nexthops.sh > +++ b/tools/testing/selftests/net/fib_nexthops.sh > @@ -629,6 +629,59 @@ ipv6_fcnal() > log_test $? 0 "Nexthops removed on admin down" > } > > +ipv6_grp_refs() > +{ > + run_cmd "$IP link set dev veth1 up" > + run_cmd "$IP link add veth1.10 link veth1 up type vlan id 10" > + run_cmd "$IP link add veth1.20 link veth1 up type vlan id 20" > + run_cmd "$IP -6 addr add 2001:db8:91::1/64 dev veth1.10" > + run_cmd "$IP -6 addr add 2001:db8:92::1/64 dev veth1.20" > + run_cmd "$IP -6 neigh add 2001:db8:91::2 lladdr 00:11:22:33:44:55 dev veth1.10" > + run_cmd "$IP -6 neigh add 2001:db8:92::2 lladdr 00:11:22:33:44:55 dev veth1.20" > + run_cmd "$IP nexthop add id 100 via 2001:db8:91::2 dev veth1.10" > + run_cmd "$IP nexthop add id 101 via 2001:db8:92::2 dev veth1.20" > + run_cmd "$IP nexthop add id 102 group 100" > + run_cmd "$IP route add 2001:db8:101::1/128 nhid 102" > + > + # create per-cpu dsts through nh 100 > + run_cmd "ip netns exec me mausezahn -6 veth1.10 -B 2001:db8:101::1 -A 2001:db8:91::1 -c 5 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1" I see that other test cases in this file that are using mausezahn check that it exists. See ipv4_torture() for example > + > + # remove nh 100 from the group to delete the route potentially leaving > + # a stale per-cpu dst Not sure I understand the comment. 
Maybe: "Remove nh 100 from the group. If the bug described in the previous commit is not fixed, the nexthop continues to cache a per-CPU dst entry that holds a reference on the IPv6 route." ? > + run_cmd "$IP nexthop replace id 102 group 101" > + run_cmd "$IP route del 2001:db8:101::1/128" > + > + # add both nexthops to the group so a reference is taken on them > + run_cmd "$IP nexthop replace id 102 group 100/101" > + > + # if the bug exists at this point we have an unlinked IPv6 route I would mention that by "the bug" you are referring to the bug described in previous commit > + # (but not freed due to stale dst) with a reference over the group > + # so we delete the group which will again only unlink it due to the > + # route reference > + run_cmd "$IP nexthop del id 102" > + > + # delete the nexthop with stale dst, since we have an unlinked > + # group with a ref to it and an unlinked IPv6 route with ref to the > + # group, the nh will only be unlinked and not freed so the stale dst > + # remains forever and we get a net device refcount imbalance > + run_cmd "$IP nexthop del id 100" > + > + # if the bug exists this command will hang because the net device > + # cannot be removed > + timeout -s KILL 5 ip netns exec me ip link del veth1.10 >/dev/null 2>&1 > + > + # we can't cleanup if the command is hung trying to delete the netdev > + if [ $? -eq 137 ]; then > + return 1 > + fi > + > + # cleanup > + run_cmd "$IP link del veth1.20" > + run_cmd "$IP nexthop flush" > + > + return 0 > +} > + > ipv6_grp_fcnal() > { > local rc > @@ -734,6 +787,9 @@ ipv6_grp_fcnal() > > run_cmd "$IP nexthop add id 108 group 31/24" > log_test $? 2 "Nexthop group can not have a blackhole and another nexthop" > + > + ipv6_grp_refs > + log_test $? 0 "Nexthop group replace refcounts" > } > > ipv6_res_grp_fcnal() > -- > 2.31.1 >
On 21/11/2021 19:53, Ido Schimmel wrote: > On Sun, Nov 21, 2021 at 05:24:53PM +0200, Nikolay Aleksandrov wrote: >> From: Nikolay Aleksandrov <nikolay@nvidia.com> >> >> The new selftest runs a sequence which causes circular refcount >> dependency between deleted objects which cannot be released and results >> in a netdevice refcount imbalance. >> >> Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com> >> --- >> tools/testing/selftests/net/fib_nexthops.sh | 56 +++++++++++++++++++++ >> 1 file changed, 56 insertions(+) >> >> diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh >> index b5a69ad191b0..48d88a36ae27 100755 >> --- a/tools/testing/selftests/net/fib_nexthops.sh >> +++ b/tools/testing/selftests/net/fib_nexthops.sh >> @@ -629,6 +629,59 @@ ipv6_fcnal() >> log_test $? 0 "Nexthops removed on admin down" >> } >> >> +ipv6_grp_refs() >> +{ >> + run_cmd "$IP link set dev veth1 up" >> + run_cmd "$IP link add veth1.10 link veth1 up type vlan id 10" >> + run_cmd "$IP link add veth1.20 link veth1 up type vlan id 20" >> + run_cmd "$IP -6 addr add 2001:db8:91::1/64 dev veth1.10" >> + run_cmd "$IP -6 addr add 2001:db8:92::1/64 dev veth1.20" >> + run_cmd "$IP -6 neigh add 2001:db8:91::2 lladdr 00:11:22:33:44:55 dev veth1.10" >> + run_cmd "$IP -6 neigh add 2001:db8:92::2 lladdr 00:11:22:33:44:55 dev veth1.20" >> + run_cmd "$IP nexthop add id 100 via 2001:db8:91::2 dev veth1.10" >> + run_cmd "$IP nexthop add id 101 via 2001:db8:92::2 dev veth1.20" >> + run_cmd "$IP nexthop add id 102 group 100" >> + run_cmd "$IP route add 2001:db8:101::1/128 nhid 102" >> + >> + # create per-cpu dsts through nh 100 >> + run_cmd "ip netns exec me mausezahn -6 veth1.10 -B 2001:db8:101::1 -A 2001:db8:91::1 -c 5 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1" > > I see that other test cases in this file that are using mausezahn check > that it exists. 
See ipv4_torture() for example > Indeed, I'll adjust the test >> + >> + # remove nh 100 from the group to delete the route potentially leaving >> + # a stale per-cpu dst > > Not sure I understand the comment. Maybe: > > "Remove nh 100 from the group. If the bug described in the previous > commit is not fixed, the nexthop continues to cache a per-CPU dst entry > that holds a reference on the IPv6 route." > > ? > Yes, that is the stale per-cpu dst. >> + run_cmd "$IP nexthop replace id 102 group 101" >> + run_cmd "$IP route del 2001:db8:101::1/128" >> + >> + # add both nexthops to the group so a reference is taken on them >> + run_cmd "$IP nexthop replace id 102 group 100/101" >> + >> + # if the bug exists at this point we have an unlinked IPv6 route > > I would mention that by "the bug" you are referring to the bug described > in previous commit > since there is no commit id yet, I can give a brief description only; I may refer to it by subject though >> + # (but not freed due to stale dst) with a reference over the group >> + # so we delete the group which will again only unlink it due to the >> + # route reference >> + run_cmd "$IP nexthop del id 102" >> + >> + # delete the nexthop with stale dst, since we have an unlinked >> + # group with a ref to it and an unlinked IPv6 route with ref to the >> + # group, the nh will only be unlinked and not freed so the stale dst >> + # remains forever and we get a net device refcount imbalance >> + run_cmd "$IP nexthop del id 100" >> + >> + # if the bug exists this command will hang because the net device >> + # cannot be removed >> + timeout -s KILL 5 ip netns exec me ip link del veth1.10 >/dev/null 2>&1 >> + >> + # we can't cleanup if the command is hung trying to delete the netdev >> + if [ $? 
-eq 137 ]; then >> + return 1 >> + fi >> + >> + # cleanup >> + run_cmd "$IP link del veth1.20" >> + run_cmd "$IP nexthop flush" >> + >> + return 0 >> +} >> + >> ipv6_grp_fcnal() >> { >> local rc >> @@ -734,6 +787,9 @@ ipv6_grp_fcnal() >> >> run_cmd "$IP nexthop add id 108 group 31/24" >> log_test $? 2 "Nexthop group can not have a blackhole and another nexthop" >> + >> + ipv6_grp_refs >> + log_test $? 0 "Nexthop group replace refcounts" >> } >> >> ipv6_res_grp_fcnal() >> -- >> 2.31.1 >>
diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index b5a69ad191b0..48d88a36ae27 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -629,6 +629,59 @@ ipv6_fcnal() log_test $? 0 "Nexthops removed on admin down" } +ipv6_grp_refs() +{ + run_cmd "$IP link set dev veth1 up" + run_cmd "$IP link add veth1.10 link veth1 up type vlan id 10" + run_cmd "$IP link add veth1.20 link veth1 up type vlan id 20" + run_cmd "$IP -6 addr add 2001:db8:91::1/64 dev veth1.10" + run_cmd "$IP -6 addr add 2001:db8:92::1/64 dev veth1.20" + run_cmd "$IP -6 neigh add 2001:db8:91::2 lladdr 00:11:22:33:44:55 dev veth1.10" + run_cmd "$IP -6 neigh add 2001:db8:92::2 lladdr 00:11:22:33:44:55 dev veth1.20" + run_cmd "$IP nexthop add id 100 via 2001:db8:91::2 dev veth1.10" + run_cmd "$IP nexthop add id 101 via 2001:db8:92::2 dev veth1.20" + run_cmd "$IP nexthop add id 102 group 100" + run_cmd "$IP route add 2001:db8:101::1/128 nhid 102" + + # create per-cpu dsts through nh 100 + run_cmd "ip netns exec me mausezahn -6 veth1.10 -B 2001:db8:101::1 -A 2001:db8:91::1 -c 5 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1" + + # remove nh 100 from the group to delete the route potentially leaving + # a stale per-cpu dst + run_cmd "$IP nexthop replace id 102 group 101" + run_cmd "$IP route del 2001:db8:101::1/128" + + # add both nexthops to the group so a reference is taken on them + run_cmd "$IP nexthop replace id 102 group 100/101" + + # if the bug exists at this point we have an unlinked IPv6 route + # (but not freed due to stale dst) with a reference over the group + # so we delete the group which will again only unlink it due to the + # route reference + run_cmd "$IP nexthop del id 102" + + # delete the nexthop with stale dst, since we have an unlinked + # group with a ref to it and an unlinked IPv6 route with ref to the + # group, the nh will only be unlinked and not freed so the stale dst + # 
remains forever and we get a net device refcount imbalance + run_cmd "$IP nexthop del id 100" + + # if the bug exists this command will hang because the net device + # cannot be removed + timeout -s KILL 5 ip netns exec me ip link del veth1.10 >/dev/null 2>&1 + + # we can't cleanup if the command is hung trying to delete the netdev + if [ $? -eq 137 ]; then + return 1 + fi + + # cleanup + run_cmd "$IP link del veth1.20" + run_cmd "$IP nexthop flush" + + return 0 +} + ipv6_grp_fcnal() { local rc @@ -734,6 +787,9 @@ ipv6_grp_fcnal() run_cmd "$IP nexthop add id 108 group 31/24" log_test $? 2 "Nexthop group can not have a blackhole and another nexthop" + + ipv6_grp_refs + log_test $? 0 "Nexthop group replace refcounts" } ipv6_res_grp_fcnal()