diff mbox series

[v2,bpf-next,1/2] selftests/bpf: Fix the flaky tc_redirect_dtime test

Message ID 20240120060518.3604920-1-martin.lau@linux.dev (mailing list archive)
State Accepted
Commit 177f1d083a19af58f4b1206d299ed73689249fd8
Delegated to: BPF
Headers show
Series [v2,bpf-next,1/2] selftests/bpf: Fix the flaky tc_redirect_dtime test | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-47 success Logs for x86_64-llvm-18 / veristat
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success SINGLE THREAD; Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/build_tools success Errors and warnings before: 1 this patch: 0
netdev/cc_maintainers success CCed 0 of 0 maintainers
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch warning WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc

Commit Message

Martin KaFai Lau Jan. 20, 2024, 6:05 a.m. UTC
From: Martin KaFai Lau <martin.lau@kernel.org>

BPF CI has been reporting the tc_redirect_dtime test failing
from time to time:

test_inet_dtime:PASS:setns src 0 nsec
(network_helpers.c:253: errno: No route to host) Failed to connect to server
close_netns:PASS:setns 0 nsec
test_inet_dtime:FAIL:connect_to_fd unexpected connect_to_fd: actual -1 < expected 0
test_tcp_clear_dtime:PASS:tcp ip6 clear dtime ingress_fwdns_p100 0 nsec

The connect_to_fd failure (EHOSTUNREACH) is from the
test_tcp_clear_dtime() test and it is the very first IPv6 traffic
after setting up all the links, addresses, and routes.

The symptom is this first connect() is always slow. In my setup, it
could take ~3s.

After some tracing and tcpdump, the slowness is mostly spent in
the neighbor solicitation in the "ns_fwd" namespace while
the "ns_src" and "ns_dst" are fine.

I forced the kernel to drop the neighbor solicitation messages.
I can then reproduce EHOSTUNREACH. What actually happen could be:
- the neighbor advertisement came back a little slow.
- the "ns_fwd" namespace concluded a neighbor discovery failure
  and triggered the ndisc_error_report() => ip6_link_failure() =>
  icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0)
- the client's connect() reports EHOSTUNREACH after receiving
  the ICMPV6_DEST_UNREACH message.

The neigh table of both "ns_src" and "ns_dst" namespace has already
been manually populated but not the "ns_fwd" namespace. This patch
fixes it by manually populating the neigh table also in the "ns_fwd"
namespace.

Although the namespace configuration part had been existed before
the tc_redirect_dtime test, still Fixes-tagging the patch when
the tc_redirect_dtime test was added since it is the only test
hitting it so far.

Fixes: c803475fd8dd ("bpf: selftests: test skb->tstamp in redirect_neigh")
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/tc_redirect.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

Comments

Yonghong Song Jan. 23, 2024, 5:02 a.m. UTC | #1
On 1/19/24 10:05 PM, Martin KaFai Lau wrote:
> From: Martin KaFai Lau <martin.lau@kernel.org>
>
> BPF CI has been reporting the tc_redirect_dtime test failing
> from time to time:
>
> test_inet_dtime:PASS:setns src 0 nsec
> (network_helpers.c:253: errno: No route to host) Failed to connect to server
> close_netns:PASS:setns 0 nsec
> test_inet_dtime:FAIL:connect_to_fd unexpected connect_to_fd: actual -1 < expected 0
> test_tcp_clear_dtime:PASS:tcp ip6 clear dtime ingress_fwdns_p100 0 nsec
>
> The connect_to_fd failure (EHOSTUNREACH) is from the
> test_tcp_clear_dtime() test and it is the very first IPv6 traffic
> after setting up all the links, addresses, and routes.
>
> The symptom is this first connect() is always slow. In my setup, it
> could take ~3s.
>
> After some tracing and tcpdump, the slowness is mostly spent in
> the neighbor solicitation in the "ns_fwd" namespace while
> the "ns_src" and "ns_dst" are fine.
>
> I forced the kernel to drop the neighbor solicitation messages.
> I can then reproduce EHOSTUNREACH. What actually happen could be:
> - the neighbor advertisement came back a little slow.
> - the "ns_fwd" namespace concluded a neighbor discovery failure
>    and triggered the ndisc_error_report() => ip6_link_failure() =>
>    icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0)
> - the client's connect() reports EHOSTUNREACH after receiving
>    the ICMPV6_DEST_UNREACH message.
>
> The neigh table of both "ns_src" and "ns_dst" namespace has already
> been manually populated but not the "ns_fwd" namespace. This patch

I tried one experiment which comments out manual population of neigh mac address
like below:

diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
index 518f143c5b0f..feb477366393 100644
--- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
@@ -241,14 +241,14 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result)
         SYS(fail, "ip route add " IP4_DST "/32 dev src scope global");
         SYS(fail, "ip route add " IP4_NET "/16 dev src scope global");
         SYS(fail, "ip route add " IP6_DST "/128 dev src scope global");
-
+#if 0
         if (result->dev_mode == MODE_VETH) {
                 SYS(fail, "ip neigh add " IP4_DST " dev src lladdr %s",
                     src_fwd_addr);
                 SYS(fail, "ip neigh add " IP6_DST " dev src lladdr %s",
                     src_fwd_addr);
         }
-
+#endif
         close_netns(nstoken);
  
         /** setup in 'fwd' namespace */
@@ -284,12 +284,12 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result)
         SYS(fail, "ip route add " IP4_SRC "/32 dev dst scope global");
         SYS(fail, "ip route add " IP4_NET "/16 dev dst scope global");
         SYS(fail, "ip route add " IP6_SRC "/128 dev dst scope global");
-
+#if 0
         if (result->dev_mode == MODE_VETH) {
                 SYS(fail, "ip neigh add " IP4_SRC " dev dst lladdr " MAC_DST_FWD);
                 SYS(fail, "ip neigh add " IP6_SRC " dev dst lladdr " MAC_DST_FWD);
         }
-
+#endif
         close_netns(nstoken);
  
         return 0;

And I can 100% reliably trigger the following error:

$ ./test_progs -t tc_redirect
...
All error logs:
test_tc_redirect:PASS:pthread_create 0 nsec
netns_setup_namespaces:PASS:ip netns add ns_src 0 nsec
netns_setup_namespaces:PASS:ip netns add ns_fwd 0 nsec
netns_setup_namespaces:PASS:ip netns add ns_dst 0 nsec
test_tc_redirect_run_tests:PASS:setup namespaces 0 nsec
...
test_tcp:PASS:setns dst 0 nsec
test_tcp:PASS:listen 0 nsec
close_netns:PASS:setns 0 nsec
open_netns:PASS:malloc token 0 nsec
open_netns:PASS:open /proc/self/ns/net 0 nsec
open_netns:PASS:open netns fd 0 nsec
open_netns:PASS:setns 0 nsec
test_tcp:PASS:setns src 0 nsec
(network_helpers.c:253: errno: Operation now in progress) Failed to connect to server
test_tcp:FAIL:connect_to_fd unexpected connect_to_fd: actual -1 < expected 0
close_netns:PASS:setns 0 nsec
test_ping:FAIL:ip netns exec ns_src ping -i 0.2 -c 3 -w 10 -q 172.16.2.100 > /dev/null unexpected error: 256 (errno 115)
...
open_netns:PASS:setns 0 nsec
test_tcp:PASS:setns src 0 nsec
(network_helpers.c:253: errno: Operation now in progress) Failed to connect to server
test_tcp:FAIL:connect_to_fd unexpected connect_to_fd: actual -1 < expected 0
close_netns:PASS:setns 0 nsec
test_ping:FAIL:ip netns exec ns_src ping6 -i 0.2 -c 3 -w 10 -q 0::2:dead:beef:cafe > /dev/null unexpected error: 256 (errno 115)
close_netns:PASS:setns 0 nsec

The error message:
   (network_helpers.c:253: errno: Operation now in progress) Failed to connect to server
not exactly the same but very similar to the failure observed in CI:
   (network_helpers.c:253: errno: No route to host) Failed to connect to server

The above error 'Operation now in progress' may indicate some slowness in the kernel
which caused the test failure, with or without my above hack.

> fixes it by manually populating the neigh table also in the "ns_fwd"
> namespace.
>
> Although the namespace configuration part had been existed before
> the tc_redirect_dtime test, still Fixes-tagging the patch when
> the tc_redirect_dtime test was added since it is the only test
> hitting it so far.
>
> Fixes: c803475fd8dd ("bpf: selftests: test skb->tstamp in redirect_neigh")
> Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
> ---
>   tools/testing/selftests/bpf/prog_tests/tc_redirect.c | 11 +++++++++++
>   1 file changed, 11 insertions(+)
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
> index 518f143c5b0f..610887157fd8 100644
> --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
> +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
> @@ -188,6 +188,7 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result)
>   {
>   	struct nstoken *nstoken = NULL;
>   	char src_fwd_addr[IFADDR_STR_LEN+1] = {};
> +	char src_addr[IFADDR_STR_LEN + 1] = {};
>   	int err;
>   
>   	if (result->dev_mode == MODE_VETH) {
> @@ -208,6 +209,9 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result)
>   	if (get_ifaddr("src_fwd", src_fwd_addr))
>   		goto fail;
>   
> +	if (get_ifaddr("src", src_addr))
> +		goto fail;
> +
>   	result->ifindex_src = if_nametoindex("src");
>   	if (!ASSERT_GT(result->ifindex_src, 0, "ifindex_src"))
>   		goto fail;
> @@ -270,6 +274,13 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result)
>   	SYS(fail, "ip route add " IP4_DST "/32 dev dst_fwd scope global");
>   	SYS(fail, "ip route add " IP6_DST "/128 dev dst_fwd scope global");
>   
> +	if (result->dev_mode == MODE_VETH) {
> +		SYS(fail, "ip neigh add " IP4_SRC " dev src_fwd lladdr %s", src_addr);
> +		SYS(fail, "ip neigh add " IP6_SRC " dev src_fwd lladdr %s", src_addr);
> +		SYS(fail, "ip neigh add " IP4_DST " dev dst_fwd lladdr %s", MAC_DST);
> +		SYS(fail, "ip neigh add " IP6_DST " dev dst_fwd lladdr %s", MAC_DST);
> +	}
> +
>   	close_netns(nstoken);
>   
>   	/** setup in 'dst' namespace */
patchwork-bot+netdevbpf@kernel.org Jan. 24, 2024, 6:20 p.m. UTC | #2
Hello:

This series was applied to bpf/bpf-next.git (master)
by Andrii Nakryiko <andrii@kernel.org>:

On Fri, 19 Jan 2024 22:05:17 -0800 you wrote:
> From: Martin KaFai Lau <martin.lau@kernel.org>
> 
> BPF CI has been reporting the tc_redirect_dtime test failing
> from time to time:
> 
> test_inet_dtime:PASS:setns src 0 nsec
> (network_helpers.c:253: errno: No route to host) Failed to connect to server
> close_netns:PASS:setns 0 nsec
> test_inet_dtime:FAIL:connect_to_fd unexpected connect_to_fd: actual -1 < expected 0
> test_tcp_clear_dtime:PASS:tcp ip6 clear dtime ingress_fwdns_p100 0 nsec
> 
> [...]

Here is the summary with links:
  - [v2,bpf-next,1/2] selftests/bpf: Fix the flaky tc_redirect_dtime test
    https://git.kernel.org/bpf/bpf-next/c/177f1d083a19
  - [v2,bpf-next,2/2] selftests/bpf: Wait for the netstamp_needed_key static key to be turned on
    https://git.kernel.org/bpf/bpf-next/c/ce6f6cffaeaa

You are awesome, thank you!
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
index 518f143c5b0f..610887157fd8 100644
--- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
@@ -188,6 +188,7 @@  static int netns_setup_links_and_routes(struct netns_setup_result *result)
 {
 	struct nstoken *nstoken = NULL;
 	char src_fwd_addr[IFADDR_STR_LEN+1] = {};
+	char src_addr[IFADDR_STR_LEN + 1] = {};
 	int err;
 
 	if (result->dev_mode == MODE_VETH) {
@@ -208,6 +209,9 @@  static int netns_setup_links_and_routes(struct netns_setup_result *result)
 	if (get_ifaddr("src_fwd", src_fwd_addr))
 		goto fail;
 
+	if (get_ifaddr("src", src_addr))
+		goto fail;
+
 	result->ifindex_src = if_nametoindex("src");
 	if (!ASSERT_GT(result->ifindex_src, 0, "ifindex_src"))
 		goto fail;
@@ -270,6 +274,13 @@  static int netns_setup_links_and_routes(struct netns_setup_result *result)
 	SYS(fail, "ip route add " IP4_DST "/32 dev dst_fwd scope global");
 	SYS(fail, "ip route add " IP6_DST "/128 dev dst_fwd scope global");
 
+	if (result->dev_mode == MODE_VETH) {
+		SYS(fail, "ip neigh add " IP4_SRC " dev src_fwd lladdr %s", src_addr);
+		SYS(fail, "ip neigh add " IP6_SRC " dev src_fwd lladdr %s", src_addr);
+		SYS(fail, "ip neigh add " IP4_DST " dev dst_fwd lladdr %s", MAC_DST);
+		SYS(fail, "ip neigh add " IP6_DST " dev dst_fwd lladdr %s", MAC_DST);
+	}
+
 	close_netns(nstoken);
 
 	/** setup in 'dst' namespace */