diff mbox series

[PATCHv3,net-next,4/4] selftests: bonding: use slowwait instead of hard code sleep

Message ID 20240202023754.932930-5-liuhangbin@gmail.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series selftests: bonding: use slowwait when waiting | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/build_tools success Errors and warnings before: 1 this patch: 0
netdev/cc_maintainers success CCed 0 of 0 maintainers
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success net selftest script(s) already in Makefile
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch warning WARNING: line length of 84 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-02-03--09-00 (tests: 719)

Commit Message

Hangbin Liu Feb. 2, 2024, 2:37 a.m. UTC
Use slowwait instead of hard code sleep for bonding tests.

In function setup_prepare(), the client_create() will be called after
server_create(). So I think there is no need to sleep in server_create()
and remove it.

For lab_lib.sh, remove bonding module may affect other running bonding tests.
And some test env may buildin bond which can't be removed. The bonding
link should be removed by lag_reset_network() or netns delete.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
 .../drivers/net/bonding/bond-lladdr-target.sh | 21 ++++++++++++++++---
 .../drivers/net/bonding/bond_macvlan.sh       |  5 ++---
 .../drivers/net/bonding/bond_topo_2d1c.sh     |  6 +++---
 .../selftests/drivers/net/bonding/lag_lib.sh  |  7 +++----
 4 files changed, 26 insertions(+), 13 deletions(-)

Comments

Jakub Kicinski Feb. 3, 2024, 5:41 p.m. UTC | #1
On Fri,  2 Feb 2024 10:37:54 +0800 Hangbin Liu wrote:
> diff --git a/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh b/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh
> index b609fb6231f4..acd3ebed3e20 100755
> --- a/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh
> +++ b/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh
> @@ -58,7 +58,7 @@ macvlan_over_bond()
>  	ip -n ${m2_ns} addr add ${m2_ip4}/24 dev macv0
>  	ip -n ${m2_ns} addr add ${m2_ip6}/24 dev macv0
>  
> -	sleep 2
> +	slowwait 2 ip netns exec ${c_ns} ping ${s_ip4} -c 1 -W 0.1 &> /dev/null
>  
>  	check_connection "${c_ns}" "${s_ip4}" "IPv4: client->server"
>  	check_connection "${c_ns}" "${s_ip6}" "IPv6: client->server"
> @@ -69,8 +69,7 @@ macvlan_over_bond()
>  	check_connection "${m1_ns}" "${m2_ip4}" "IPv4: macvlan_1->macvlan_2"
>  	check_connection "${m1_ns}" "${m2_ip6}" "IPv6: macvlan_1->macvlan_2"
>  
> -
> -	sleep 5
> +	slowwait 5 ip netns exec ${s_ns} ping ${c_ip4} -c 1 -W 0.1 &> /dev/null
>  
>  	check_connection "${s_ns}" "${c_ip4}" "IPv4: server->client"
>  	check_connection "${s_ns}" "${c_ip6}" "IPv6: server->client"

This makes the bond_macvlan.sh test flaky:

https://netdev.bots.linux.dev/contest.html?test=bond-macvlan-sh

I repro'd it and the ping in check_connection() fails - neigh resolution
fails. I guess we need to insert more of the slowwaits?

Reverting this patch from the pending patch tree fixes it. The runner
has no KVM support, and runs a VM with 64 CPUs. If I lower the number
of CPUs to 4 the test passes. I added the note that some flakiness may
be caused by high CPU count:

https://github.com/linux-netdev/nipa/wiki/How-to-run-netdev-selftests-CI-style#tips
Hangbin Liu Feb. 4, 2024, 8:31 a.m. UTC | #2
On Sat, Feb 03, 2024 at 09:41:51AM -0800, Jakub Kicinski wrote:
> On Fri,  2 Feb 2024 10:37:54 +0800 Hangbin Liu wrote:
> > diff --git a/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh b/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh
> > index b609fb6231f4..acd3ebed3e20 100755
> > --- a/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh
> > +++ b/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh
> > @@ -58,7 +58,7 @@ macvlan_over_bond()
> >  	ip -n ${m2_ns} addr add ${m2_ip4}/24 dev macv0
> >  	ip -n ${m2_ns} addr add ${m2_ip6}/24 dev macv0
> >  
> > -	sleep 2
> > +	slowwait 2 ip netns exec ${c_ns} ping ${s_ip4} -c 1 -W 0.1 &> /dev/null
> >  
> >  	check_connection "${c_ns}" "${s_ip4}" "IPv4: client->server"
> >  	check_connection "${c_ns}" "${s_ip6}" "IPv6: client->server"
> > @@ -69,8 +69,7 @@ macvlan_over_bond()
> >  	check_connection "${m1_ns}" "${m2_ip4}" "IPv4: macvlan_1->macvlan_2"
> >  	check_connection "${m1_ns}" "${m2_ip6}" "IPv6: macvlan_1->macvlan_2"
> >  
> > -
> > -	sleep 5
> > +	slowwait 5 ip netns exec ${s_ns} ping ${c_ip4} -c 1 -W 0.1 &> /dev/null
> >  
> >  	check_connection "${s_ns}" "${c_ip4}" "IPv4: server->client"
> >  	check_connection "${s_ns}" "${c_ip6}" "IPv6: server->client"
> 
> This makes the bond_macvlan.sh test flaky:
> 
> https://netdev.bots.linux.dev/contest.html?test=bond-macvlan-sh

Hi Jakub,

Thanks for the report.

> 
> I repro'd it and the ping in check_connection() fails - neigh resolution
> fails. I guess we need to insert more of the slowwaits?
> 
> Reverting this patch from the pending patch tree fixes it. The runner
> has no KVM support, and runs a VM with 64 CPUs. If I lower the number
> of CPUs to 4 the test passes. I added the note that some flakiness may
> be caused by high CPU count:
> 
> https://github.com/linux-netdev/nipa/wiki/How-to-run-netdev-selftests-CI-style#tips

Sadly, I can't reproduce it with an Intel(R) Xeon(R) CPU E5-2650 v3 @ 2.30GHz,
which has 20 Cores and 40 Processors. From your logs[1][2][3], all the tests
failed when ping from client to macvlan_2. e.g.

# TEST: balance-tlb: IPv4: client->macvlan_1                          [ OK ]
# TEST: balance-tlb: IPv6: client->macvlan_1                          [ OK ]
# TEST: balance-tlb: IPv4: client->macvlan_2                          [FAIL]
# ping failed
# TEST: balance-tlb: IPv6: client->macvlan_2                          [ OK ]

Or

# TEST: balance-alb: IPv4: client->macvlan_1                          [ OK ]
# TEST: balance-alb: IPv6: client->macvlan_1                          [ OK ]
# TEST: balance-alb: IPv4: client->macvlan_2                          [FAIL]
# ping failed
# TEST: balance-alb: IPv6: client->macvlan_2                          [ OK ]

Let us checking the client to macvlan2 connection via slowwait and see
if it works.

[1] https://netdev-2.bots.linux.dev/vmksft-bonding/results/449541/2-bond-macvlan-sh/stdout
[2] https://netdev-2.bots.linux.dev/vmksft-bonding/results/449361/4-bond-macvlan-sh/stdout
[3] https://netdev-2.bots.linux.dev/vmksft-bonding/results/449001/4-bond-macvlan-sh/stdout

Thanks
Hangbin
diff mbox series

Patch

diff --git a/tools/testing/selftests/drivers/net/bonding/bond-lladdr-target.sh b/tools/testing/selftests/drivers/net/bonding/bond-lladdr-target.sh
index 89af402fabbe..78d3e0fe6604 100755
--- a/tools/testing/selftests/drivers/net/bonding/bond-lladdr-target.sh
+++ b/tools/testing/selftests/drivers/net/bonding/bond-lladdr-target.sh
@@ -17,6 +17,11 @@ 
 #  +----------------+
 #
 # We use veths instead of physical interfaces
+REQUIRE_MZ=no
+NUM_NETIFS=0
+lib_dir=$(dirname "$0")
+source "$lib_dir"/../../../net/forwarding/lib.sh
+
 sw="sw-$(mktemp -u XXXXXX)"
 host="ns-$(mktemp -u XXXXXX)"
 
@@ -26,6 +31,16 @@  cleanup()
 	ip netns del $host
 }
 
+wait_lladdr_dad()
+{
+	$@ | grep fe80 | grep -qv tentative
+}
+
+wait_bond_up()
+{
+	$@ | grep -q 'state UP'
+}
+
 trap cleanup 0 1 2
 
 ip netns add $sw
@@ -37,8 +52,8 @@  ip -n $host link add veth1 type veth peer name veth1 netns $sw
 ip -n $sw link add br0 type bridge
 ip -n $sw link set br0 up
 sw_lladdr=$(ip -n $sw addr show br0 | awk '/fe80/{print $2}' | cut -d'/' -f1)
-# sleep some time to make sure bridge lladdr pass DAD
-sleep 2
+# wait some time to make sure bridge lladdr pass DAD
+slowwait 2 wait_lladdr_dad ip -n $sw addr show br0
 
 ip -n $host link add bond0 type bond mode 1 ns_ip6_target ${sw_lladdr} \
 	arp_validate 3 arp_interval 1000
@@ -53,7 +68,7 @@  ip -n $sw link set veth1 master br0
 ip -n $sw link set veth0 up
 ip -n $sw link set veth1 up
 
-sleep 5
+slowwait 5 wait_bond_up ip -n $host link show bond0
 
 rc=0
 if ip -n $host link show bond0 | grep -q LOWER_UP; then
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh b/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh
index b609fb6231f4..acd3ebed3e20 100755
--- a/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh
+++ b/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh
@@ -58,7 +58,7 @@  macvlan_over_bond()
 	ip -n ${m2_ns} addr add ${m2_ip4}/24 dev macv0
 	ip -n ${m2_ns} addr add ${m2_ip6}/24 dev macv0
 
-	sleep 2
+	slowwait 2 ip netns exec ${c_ns} ping ${s_ip4} -c 1 -W 0.1 &> /dev/null
 
 	check_connection "${c_ns}" "${s_ip4}" "IPv4: client->server"
 	check_connection "${c_ns}" "${s_ip6}" "IPv6: client->server"
@@ -69,8 +69,7 @@  macvlan_over_bond()
 	check_connection "${m1_ns}" "${m2_ip4}" "IPv4: macvlan_1->macvlan_2"
 	check_connection "${m1_ns}" "${m2_ip6}" "IPv6: macvlan_1->macvlan_2"
 
-
-	sleep 5
+	slowwait 5 ip netns exec ${s_ns} ping ${c_ip4} -c 1 -W 0.1 &> /dev/null
 
 	check_connection "${s_ns}" "${c_ip4}" "IPv4: server->client"
 	check_connection "${s_ns}" "${c_ip6}" "IPv6: server->client"
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh
index 0eb7edfb584c..195ef83cfbf1 100644
--- a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh
+++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh
@@ -73,7 +73,6 @@  server_create()
 	ip -n ${s_ns} link set bond0 up
 	ip -n ${s_ns} addr add ${s_ip4}/24 dev bond0
 	ip -n ${s_ns} addr add ${s_ip6}/24 dev bond0
-	sleep 2
 }
 
 # Reset bond with new mode and options
@@ -96,7 +95,8 @@  bond_reset()
 	ip -n ${s_ns} link set bond0 up
 	ip -n ${s_ns} addr add ${s_ip4}/24 dev bond0
 	ip -n ${s_ns} addr add ${s_ip6}/24 dev bond0
-	sleep 2
+	# Wait for IPv6 address ready as it needs DAD
+	slowwait 2 ip netns exec ${s_ns} ping6 ${c_ip6} -c 1 -W 0.1 &> /dev/null
 }
 
 server_destroy()
@@ -150,7 +150,7 @@  bond_check_connection()
 {
 	local msg=${1:-"check connection"}
 
-	sleep 2
+	slowwait 2 ip netns exec ${s_ns} ping ${c_ip4} -c 1 -W 0.1 &> /dev/null
 	ip netns exec ${s_ns} ping ${c_ip4} -c5 -i 0.1 &>/dev/null
 	check_err $? "${msg}: ping failed"
 	ip netns exec ${s_ns} ping6 ${c_ip6} -c5 -i 0.1 &>/dev/null
diff --git a/tools/testing/selftests/drivers/net/bonding/lag_lib.sh b/tools/testing/selftests/drivers/net/bonding/lag_lib.sh
index dbdd736a41d3..bf9bcd1b5ec0 100644
--- a/tools/testing/selftests/drivers/net/bonding/lag_lib.sh
+++ b/tools/testing/selftests/drivers/net/bonding/lag_lib.sh
@@ -107,13 +107,12 @@  lag_setup2x2()
 	NAMESPACES="${namespaces}"
 }
 
-# cleanup all lag related namespaces and remove the bonding module
+# cleanup all lag related namespaces
 lag_cleanup()
 {
 	for n in ${NAMESPACES}; do
 		ip netns delete ${n} >/dev/null 2>&1 || true
 	done
-	modprobe -r bonding
 }
 
 SWITCH="lag_node1"
@@ -159,7 +158,7 @@  test_bond_recovery()
 	create_bond $@
 
 	# verify connectivity
-	ip netns exec ${CLIENT} ping ${SWITCHIP} -c 2 >/dev/null 2>&1
+	slowwait 2 ip netns exec ${CLIENT} ping ${SWITCHIP} -c 2 -W 0.1 &> /dev/null
 	check_err $? "No connectivity"
 
 	# force the links of the bond down
@@ -169,7 +168,7 @@  test_bond_recovery()
 	ip netns exec ${SWITCH} ip link set eth1 down
 
 	# re-verify connectivity
-	ip netns exec ${CLIENT} ping ${SWITCHIP} -c 2 >/dev/null 2>&1
+	slowwait 2 ip netns exec ${CLIENT} ping ${SWITCHIP} -c 2 -W 0.1 &> /dev/null
 
 	local rc=$?
 	check_err $rc "Bond failed to recover"