diff mbox series

[net-next,2/3] selftests: fix OOM problem in msg_zerocopy selftest

Message ID 20240409205300.1346681-3-zijianzhang@bytedance.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series net: socket sendmsg MSG_ZEROCOPY_UARG | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 17 this patch: 17
netdev/build_tools success Errors and warnings before: 5 this patch: 5
netdev/cc_maintainers warning 3 maintainers not CCed: pabeni@redhat.com linux-kselftest@vger.kernel.org shuah@kernel.org
netdev/build_clang fail Errors and warnings before: 18 this patch: 18
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 17 this patch: 17
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 63 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Zijian Zhang April 9, 2024, 8:52 p.m. UTC
From: Zijian Zhang <zijianzhang@bytedance.com>

In selftests/net/msg_zerocopy.c, it has a while loop keeps calling sendmsg
on a socket, and it will recv the completion notifications when the socket
is not writable. Typically, it will start the receiving process after
around 30+ sendmsgs.

However, because of the commit dfa2f0483360
("tcp: get rid of sysctl_tcp_adv_win_scale"), the sender is always writable
and does not get any chance to run recv notifications. The selftest always
exits with OUT_OF_MEMORY because the memory used by opt_skb exceeds
the core.sysctl_optmem_max. We introduce "cfg_notification_limit" to
force sender to receive notifications after some number of sendmsgs. Plus,
in the selftest, we need to update skb_orphan_frags_rx to be the same as
skb_orphan_frags. In this case, for some reason, notifications do not
come in order now. We introduce "cfg_notification_order_check" to
possibly ignore the checking for order.

Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
Signed-off-by: Xiaochun Lu <xiaochun.lu@bytedance.com>
---
 tools/testing/selftests/net/msg_zerocopy.c | 24 ++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

Comments

Willem de Bruijn April 9, 2024, 9:25 p.m. UTC | #1
zijianzhang@ wrote:
> From: Zijian Zhang <zijianzhang@bytedance.com>
> 
> In selftests/net/msg_zerocopy.c, it has a while loop keeps calling sendmsg
> on a socket, and it will recv the completion notifications when the socket
> is not writable. Typically, it will start the receiving process after
> around 30+ sendmsgs.
> 
> However, because of the commit dfa2f0483360
> ("tcp: get rid of sysctl_tcp_adv_win_scale"), the sender is always writable
> and does not get any chance to run recv notifications. The selftest always
> exits with OUT_OF_MEMORY because the memory used by opt_skb exceeds
> the core.sysctl_optmem_max. We introduce "cfg_notification_limit" to
> force sender to receive notifications after some number of sendmsgs.

No need for a new option. Existing test automation will not enable
that.

I have not observed this behavior in tests (so I wonder what is
different about the setups). But it is fine to unconditionally force
a call to do_recv_completions every few sends.

> Plus,
> in the selftest, we need to update skb_orphan_frags_rx to be the same as
> skb_orphan_frags.

To be able to test over loopback, I suppose?

> In this case, for some reason, notifications do not
> come in order now. We introduce "cfg_notification_order_check" to
> possibly ignore the checking for order.

Were you testing UDP?

I don't think this is needed. I wonder what you were doing to see
enough of these events to want to suppress the log output.
 
> Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
> Signed-off-by: Xiaochun Lu <xiaochun.lu@bytedance.com>
> ---
>  tools/testing/selftests/net/msg_zerocopy.c | 24 ++++++++++++++++++----
>  1 file changed, 20 insertions(+), 4 deletions(-)
> 
> diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c
> index bdc03a2097e8..8e595216a0af 100644
> --- a/tools/testing/selftests/net/msg_zerocopy.c
> +++ b/tools/testing/selftests/net/msg_zerocopy.c
> @@ -85,6 +85,8 @@ static bool cfg_rx;
>  static int  cfg_runtime_ms	= 4200;
>  static int  cfg_verbose;
>  static int  cfg_waittime_ms	= 500;
> +static bool cfg_notification_order_check;
> +static int  cfg_notification_limit = 32;
>  static bool cfg_zerocopy;
>  
>  static socklen_t cfg_alen;
> @@ -435,7 +437,7 @@ static bool do_recv_completion(int fd, int domain)
>  	/* Detect notification gaps. These should not happen often, if at all.
>  	 * Gaps can occur due to drops, reordering and retransmissions.
>  	 */
> -	if (lo != next_completion)
> +	if (cfg_notification_order_check && lo != next_completion)
>  		fprintf(stderr, "gap: %u..%u does not append to %u\n",
>  			lo, hi, next_completion);
>  	next_completion = hi + 1;
> @@ -489,7 +491,7 @@ static void do_tx(int domain, int type, int protocol)
>  		struct iphdr iph;
>  	} nh;
>  	uint64_t tstop;
> -	int fd;
> +	int fd, sendmsg_counter = 0;
>  
>  	fd = do_setup_tx(domain, type, protocol);
>  
> @@ -548,10 +550,18 @@ static void do_tx(int domain, int type, int protocol)
>  			do_sendmsg_corked(fd, &msg);
>  		else
>  			do_sendmsg(fd, &msg, cfg_zerocopy, domain);
> +		sendmsg_counter++;
> +
> +		if (sendmsg_counter == cfg_notification_limit && cfg_zerocopy) {
> +			do_recv_completions(fd, domain);
> +			sendmsg_counter = 0;
> +		}
>  
>  		while (!do_poll(fd, POLLOUT)) {
> -			if (cfg_zerocopy)
> +			if (cfg_zerocopy) {
>  				do_recv_completions(fd, domain);
> +				sendmsg_counter = 0;
> +			}
>  		}
>  
>  	} while (gettimeofday_ms() < tstop);
> @@ -708,7 +718,7 @@ static void parse_opts(int argc, char **argv)
>  
>  	cfg_payload_len = max_payload_len;
>  
> -	while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vz")) != -1) {
> +	while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vzol:")) != -1) {
>  		switch (c) {
>  		case '4':
>  			if (cfg_family != PF_UNSPEC)
> @@ -760,6 +770,12 @@ static void parse_opts(int argc, char **argv)
>  		case 'z':
>  			cfg_zerocopy = true;
>  			break;
> +		case 'o':
> +			cfg_notification_order_check = true;
> +			break;
> +		case 'l':
> +			cfg_notification_limit = strtoul(optarg, NULL, 0);
> +			break;
>  		}
>  	}
>  
> -- 
> 2.20.1
>
Eric Dumazet April 9, 2024, 9:30 p.m. UTC | #2
On Tue, Apr 9, 2024 at 11:25 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> zijianzhang@ wrote:
> > From: Zijian Zhang <zijianzhang@bytedance.com>
> >
> > In selftests/net/msg_zerocopy.c, it has a while loop keeps calling sendmsg
> > on a socket, and it will recv the completion notifications when the socket
> > is not writable. Typically, it will start the receiving process after
> > around 30+ sendmsgs.
> >
> > However, because of the commit dfa2f0483360
> > ("tcp: get rid of sysctl_tcp_adv_win_scale"), the sender is always writable
> > and does not get any chance to run recv notifications. The selftest always
> > exits with OUT_OF_MEMORY because the memory used by opt_skb exceeds
> > the core.sysctl_optmem_max. We introduce "cfg_notification_limit" to
> > force sender to receive notifications after some number of sendmsgs.
>
> No need for a new option. Existing test automation will not enable
> that.
>
> I have not observed this behavior in tests (so I wonder what is
> different about the setups). But it is fine to unconditionally force
> a call to do_recv_completions every few sends.

Maybe their kernel does not have yet :

commit 4944566706b27918ca15eda913889db296792415    net: increase
optmem_max default value

???

>
> > Plus,
> > in the selftest, we need to update skb_orphan_frags_rx to be the same as
> > skb_orphan_frags.
>
> To be able to test over loopback, I suppose?
>
> > In this case, for some reason, notifications do not
> > come in order now. We introduce "cfg_notification_order_check" to
> > possibly ignore the checking for order.
>
> Were you testing UDP?
>
> I don't think this is needed. I wonder what you were doing to see
> enough of these events to want to suppress the log output.
Zijian Zhang April 9, 2024, 11:57 p.m. UTC | #3
Firstly, thanks for your time and quick reply!

On 4/9/24 2:30 PM, Eric Dumazet wrote:
> On Tue, Apr 9, 2024 at 11:25 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>>
>> zijianzhang@ wrote:
>>> From: Zijian Zhang <zijianzhang@bytedance.com>
>>>
>>> In selftests/net/msg_zerocopy.c, it has a while loop keeps calling sendmsg
>>> on a socket, and it will recv the completion notifications when the socket
>>> is not writable. Typically, it will start the receiving process after
>>> around 30+ sendmsgs.
>>>
>>> However, because of the commit dfa2f0483360
>>> ("tcp: get rid of sysctl_tcp_adv_win_scale"), the sender is always writable
>>> and does not get any chance to run recv notifications. The selftest always
>>> exits with OUT_OF_MEMORY because the memory used by opt_skb exceeds
>>> the core.sysctl_optmem_max. We introduce "cfg_notification_limit" to
>>> force sender to receive notifications after some number of sendmsgs.
>>
>> No need for a new option. Existing test automation will not enable
>> that.
>>
>> I have not observed this behavior in tests (so I wonder what is
>> different about the setups). But it is fine to unconditionally force
>> a call to do_recv_completions every few sends.
> 
> Maybe their kernel does not have yet :
> 
> commit 4944566706b27918ca15eda913889db296792415    net: increase
> optmem_max default value
> 
> ???
> 

I did the selftest on a qemu vm with linux repo v6.8-rc3 kernel.
It should have the commit 4944566706b2 ("net: increase optmem_max
default value")

"
qemu-system-x86_64 \
     -enable-kvm \
     -nographic \
     -drive file=$HOME/guest.qcow2,if=virtio \
     -device vfio-pci,host=3b:00.2,multifunction=on \
     -m 32G \
     -smp 16 \
     -kernel $HOME/linux-master/arch/x86/boot/bzImage \
     -append 'root=/dev/vda1 console=ttyS0 earlyprintk=ttyS0 
net.ifnames=0 biosdevname=0'
"

I did it again just now with a clean image, and there was no problem...
Unfortunately, I did not save the image I tested before, I will give you
more information about my network configuration if I could restore it.

Thus, it is not a BUG, but a problem due to my custom conf, sorry about 
this, I will delete this patch in the next version.

>>
>>> Plus,
>>> in the selftest, we need to update skb_orphan_frags_rx to be the same as
>>> skb_orphan_frags.
>>
>> To be able to test over loopback, I suppose?
>>

Yes.

>>> In this case, for some reason, notifications do not
>>> come in order now. We introduce "cfg_notification_order_check" to
>>> possibly ignore the checking for order.
>>
>> Were you testing UDP?
>>
>> I don't think this is needed. I wonder what you were doing to see
>> enough of these events to want to suppress the log output.

I tested again on both TCP and UDP just now, and it happened to both of 
them. For tcp test, too many printfs will delay the sending and thus 
affect the throughput.

ipv4 tcp -z -t 1
gap: 277..277 does not append to 276
gap: 276..276 does not append to 278
gap: 278..1112 does not append to 277
gap: 1114..1114 does not append to 1113
gap: 1113..1113 does not append to 1115
gap: 1115..2330 does not append to 1114
gap: 2332..2332 does not append to 2331
gap: 2331..2331 does not append to 2333
gap: 2333..2559 does not append to 2332
gap: 2562..2562 does not append to 2560
...
gap: 25841..25841 does not append to 25843
gap: 25843..25997 does not append to 25842

...

ipv6 udp -z -t 1
gap: 11632..11687 does not append to 11625
gap: 11625..11631 does not append to 11688
gap: 11688..54662 does not append to 11632
Willem de Bruijn April 10, 2024, 11:06 p.m. UTC | #4
> >>> In this case, for some reason, notifications do not
> >>> come in order now. We introduce "cfg_notification_order_check" to
> >>> possibly ignore the checking for order.
> >>
> >> Were you testing UDP?
> >>
> >> I don't think this is needed. I wonder what you were doing to see
> >> enough of these events to want to suppress the log output.
> 
> I tested again on both TCP and UDP just now, and it happened to both of 
> them. For tcp test, too many printfs will delay the sending and thus 
> affect the throughput.
> 
> ipv4 tcp -z -t 1
> gap: 277..277 does not append to 276

There is something wrong here. 277 clearly appends to 276

> gap: 276..276 does not append to 278

This would be an actual reordering. But the above line already
indicates that 276 is the next expected value.

> gap: 278..1112 does not append to 277
> gap: 1114..1114 does not append to 1113
> gap: 1113..1113 does not append to 1115
> gap: 1115..2330 does not append to 1114
> gap: 2332..2332 does not append to 2331
> gap: 2331..2331 does not append to 2333
> gap: 2333..2559 does not append to 2332
> gap: 2562..2562 does not append to 2560
> ...
> gap: 25841..25841 does not append to 25843
> gap: 25843..25997 does not append to 25842
> 
> ...
> 
> ipv6 udp -z -t 1
> gap: 11632..11687 does not append to 11625
> gap: 11625..11631 does not append to 11688
> gap: 11688..54662 does not append to 11632

If you ran this on a kernel with a variety of changes, please repeat
this on a clean kernel with no other changes besides the
skb_orphan_frags_rx loopback change.

It this is a real issue, I don't mind moving this behind cfg_verbose.
And prefer that approach over adding a new flag.

But I have never seen this before, and this kind of reordering is rare
with UDP and should not happen with TCP except for really edge cases:
the uarg is released only when both the skb was delivered and the ACK
response was received to free the clone on the retransmit queue.
Zijian Zhang April 12, 2024, 12:26 a.m. UTC | #5
On 4/10/24 4:06 PM, Willem de Bruijn wrote:
>>>>> In this case, for some reason, notifications do not
>>>>> come in order now. We introduce "cfg_notification_order_check" to
>>>>> possibly ignore the checking for order.
>>>>
>>>> Were you testing UDP?
>>>>
>>>> I don't think this is needed. I wonder what you were doing to see
>>>> enough of these events to want to suppress the log output.
>>
>> I tested again on both TCP and UDP just now, and it happened to both of
>> them. For tcp test, too many printfs will delay the sending and thus
>> affect the throughput.
>>
>> ipv4 tcp -z -t 1
>> gap: 277..277 does not append to 276
> 
> There is something wrong here. 277 clearly appends to 276
> 

```
if (lo != next_completion)
     fprintf(stderr, "gap: %u..%u does not append to %u\n",
         lo, hi, next_completion);
```

According to the code, it expects the lo to be 276, but it's 277.

>> gap: 276..276 does not append to 278
> 
> This would be an actual reordering. But the above line already
> indicates that 276 is the next expected value.
> 
>> gap: 278..1112 does not append to 277
>> gap: 1114..1114 does not append to 1113
>> gap: 1113..1113 does not append to 1115
>> gap: 1115..2330 does not append to 1114
>> gap: 2332..2332 does not append to 2331
>> gap: 2331..2331 does not append to 2333
>> gap: 2333..2559 does not append to 2332
>> gap: 2562..2562 does not append to 2560
>> ...
>> gap: 25841..25841 does not append to 25843
>> gap: 25843..25997 does not append to 25842
>>
>> ...
>>
>> ipv6 udp -z -t 1
>> gap: 11632..11687 does not append to 11625
>> gap: 11625..11631 does not append to 11688
>> gap: 11688..54662 does not append to 11632
> 
> If you ran this on a kernel with a variety of changes, please repeat
> this on a clean kernel with no other changes besides the
> skb_orphan_frags_rx loopback change.
> 
> It this is a real issue, I don't mind moving this behind cfg_verbose.
> And prefer that approach over adding a new flag.
> 
> But I have never seen this before, and this kind of reordering is rare
> with UDP and should not happen with TCP except for really edge cases:
> the uarg is released only when both the skb was delivered and the ACK
> response was received to free the clone on the retransmit queue.

I found the set up where I encountered the OOM problem in msg_zerocopy
selftest. I did it on a clean kernel vm booted by qemu, 
dfa2f0483360("tcp: get rid of sysctl_tcp_adv_win_scale") with only
skb_orphan_frags_rx change.

Then, I do `make olddefconfig` and turn on some configurations for
virtualization like VIRTIO_FS, VIRTIO_NET and some confs like 9P_FS
to share folders. Let's call it config, here was the result I got,
```
./msg_zerocopy.sh
ipv4 tcp -z -t 1
./msg_zerocopy: send: No buffer space available
rx=564 (70 MB)
```

Since the TCP socket is always writable, the do_poll always return True.
There is no any chance for `do_recv_completions` to run.
```
while (!do_poll(fd, POLLOUT)) {
     if (cfg_zerocopy)
         do_recv_completions(fd, domain);
     }
```
Finally, the size of sendmsg zerocopy notification skbs exceeds the 
opt_mem limit. I got "No buffer space available".


However, if I change the config by
```
  DEBUG_IRQFLAGS n -> y
  DEBUG_LOCK_ALLOC n -> y
  DEBUG_MUTEXES n -> y
  DEBUG_RT_MUTEXES n -> y
  DEBUG_RWSEMS n -> y
  DEBUG_SPINLOCK n -> y
  DEBUG_WW_MUTEX_SLOWPATH n -> y
  PROVE_LOCKING n -> y
+DEBUG_LOCKDEP y
+LOCKDEP y
+LOCKDEP_BITS 15
+LOCKDEP_CHAINS_BITS 16
+LOCKDEP_CIRCULAR_QUEUE_BITS 12
+LOCKDEP_STACK_TRACE_BITS 19
+LOCKDEP_STACK_TRACE_HASH_BITS 14
+PREEMPTIRQ_TRACEPOINTS y
+PROVE_RAW_LOCK_NESTING n
+PROVE_RCU y
+TRACE_IRQFLAGS y
+TRACE_IRQFLAGS_NMI y
```

Let's call it config-debug, the selftest works well with reordered
notifications.
```
ipv4 tcp -z -t 1
gap: 2117..2117 does not append to 2115
gap: 2115..2116 does not append to 2118
gap: 2118..3144 does not append to 2117
gap: 3146..3146 does not append to 3145
gap: 3145..3145 does not append to 3147
gap: 3147..3768 does not append to 3146
...
gap: 34935..34935 does not append to 34937
gap: 34938..36409 does not append to 34936

rx=36097 (2272 MB)
missing notifications: 36410 < 36412
tx=36412 (2272 MB) txc=36410 zc=y
```
For exact config to compile the kernel, please see
https://github.com/Sm0ckingBird/config


I also did selftest on 63c8778d9149("Merge branch 
'net-mana-fix-doorbell-access-for-receive-queues'"), the parent of 
dfa2f0483360("tcp: get rid of sysctl_tcp_adv_win_scale")

with config, selftest works well.
```
ipv4 tcp -z -t 1
missing notifications: 223181 < 223188
tx=223188 (13927 MB) txc=223181 zc=y
rx=111592 (13927 MB)
```

with config-debug, selftest works well with reordered notifications
```
ipv4 tcp -z -t 1
...
gap: 30397..30404 does not append to 30389
gap: 30435..30442 does not append to 30427
gap: 30427..30434 does not append to 30443
gap: 30443..30450 does not append to 30435
gap: 30473..30480 does not append to 30465
gap: 30465..30472 does not append to 30481
gap: 30481..30488 does not append to 30473
tx=30491 (1902 MB) txc=30491 zc=y
rx=15245 (1902 MB)
```

Not sure about the exact reason for this OOM problem, and why
turning on DEBUG_LOCKDEP and PROVE_RAW_LOCK_NESTING can solve
the problem with reordered notifications... If you have any thoughts or
comments, please feel free to share them with us.

If the problem does exist, I guess we can force `do_recv_completions`
after some number of sendmsgs and move "gap: ..." after cfg_verbose?
Willem de Bruijn April 12, 2024, 3:36 p.m. UTC | #6
Zijian Zhang wrote:
> On 4/10/24 4:06 PM, Willem de Bruijn wrote:
> >>>>> In this case, for some reason, notifications do not
> >>>>> come in order now. We introduce "cfg_notification_order_check" to
> >>>>> possibly ignore the checking for order.
> >>>>
> >>>> Were you testing UDP?
> >>>>
> >>>> I don't think this is needed. I wonder what you were doing to see
> >>>> enough of these events to want to suppress the log output.
> >>
> >> I tested again on both TCP and UDP just now, and it happened to both of
> >> them. For tcp test, too many printfs will delay the sending and thus
> >> affect the throughput.
> >>
> >> ipv4 tcp -z -t 1
> >> gap: 277..277 does not append to 276
> > 
> > There is something wrong here. 277 clearly appends to 276
> > 
> 
> ```
> if (lo != next_completion)
>      fprintf(stderr, "gap: %u..%u does not append to %u\n",
>          lo, hi, next_completion);
> ```
> 
> According to the code, it expects the lo to be 276, but it's 277.

Ack. I should have phrased that message better.
 
> > If you ran this on a kernel with a variety of changes, please repeat
> > this on a clean kernel with no other changes besides the
> > skb_orphan_frags_rx loopback change.
> > 
> > It this is a real issue, I don't mind moving this behind cfg_verbose.
> > And prefer that approach over adding a new flag.
> > 
> > But I have never seen this before, and this kind of reordering is rare
> > with UDP and should not happen with TCP except for really edge cases:
> > the uarg is released only when both the skb was delivered and the ACK
> > response was received to free the clone on the retransmit queue.
> 
> I found the set up where I encountered the OOM problem in msg_zerocopy
> selftest. I did it on a clean kernel vm booted by qemu, 
> dfa2f0483360("tcp: get rid of sysctl_tcp_adv_win_scale") with only
> skb_orphan_frags_rx change.
> 
> Then, I do `make olddefconfig` and turn on some configurations for
> virtualization like VIRTIO_FS, VIRTIO_NET and some confs like 9P_FS
> to share folders. Let's call it config, here was the result I got,
> ```
> ./msg_zerocopy.sh
> ipv4 tcp -z -t 1
> ./msg_zerocopy: send: No buffer space available
> rx=564 (70 MB)
> ```
> 
> Since the TCP socket is always writable, the do_poll always return True.
> There is no any chance for `do_recv_completions` to run.
> ```
> while (!do_poll(fd, POLLOUT)) {
>      if (cfg_zerocopy)
>          do_recv_completions(fd, domain);
>      }
> ```
> Finally, the size of sendmsg zerocopy notification skbs exceeds the 
> opt_mem limit. I got "No buffer space available".
> 
> 
> However, if I change the config by
> ```
>   DEBUG_IRQFLAGS n -> y
>   DEBUG_LOCK_ALLOC n -> y
>   DEBUG_MUTEXES n -> y
>   DEBUG_RT_MUTEXES n -> y
>   DEBUG_RWSEMS n -> y
>   DEBUG_SPINLOCK n -> y
>   DEBUG_WW_MUTEX_SLOWPATH n -> y
>   PROVE_LOCKING n -> y
> +DEBUG_LOCKDEP y
> +LOCKDEP y
> +LOCKDEP_BITS 15
> +LOCKDEP_CHAINS_BITS 16
> +LOCKDEP_CIRCULAR_QUEUE_BITS 12
> +LOCKDEP_STACK_TRACE_BITS 19
> +LOCKDEP_STACK_TRACE_HASH_BITS 14
> +PREEMPTIRQ_TRACEPOINTS y
> +PROVE_RAW_LOCK_NESTING n
> +PROVE_RCU y
> +TRACE_IRQFLAGS y
> +TRACE_IRQFLAGS_NMI y
> ```
> 
> Let's call it config-debug, the selftest works well with reordered
> notifications.
> ```
> ipv4 tcp -z -t 1
> gap: 2117..2117 does not append to 2115
> gap: 2115..2116 does not append to 2118
> gap: 2118..3144 does not append to 2117
> gap: 3146..3146 does not append to 3145
> gap: 3145..3145 does not append to 3147
> gap: 3147..3768 does not append to 3146
> ...
> gap: 34935..34935 does not append to 34937
> gap: 34938..36409 does not append to 34936
> 
> rx=36097 (2272 MB)
> missing notifications: 36410 < 36412
> tx=36412 (2272 MB) txc=36410 zc=y
> ```
> For exact config to compile the kernel, please see
> https://github.com/Sm0ckingBird/config

Thanks for sharing the system configs. I'm quite surprised at these
reorderings *over loopback* with these debug settings, and no weird
qdiscs that would explain it. Can you see whether you see drops and
retransmits?

> 
> I also did selftest on 63c8778d9149("Merge branch 
> 'net-mana-fix-doorbell-access-for-receive-queues'"), the parent of 
> dfa2f0483360("tcp: get rid of sysctl_tcp_adv_win_scale")
> 
> with config, selftest works well.
> ```
> ipv4 tcp -z -t 1
> missing notifications: 223181 < 223188
> tx=223188 (13927 MB) txc=223181 zc=y
> rx=111592 (13927 MB)
> ```
> 
> with config-debug, selftest works well with reordered notifications
> ```
> ipv4 tcp -z -t 1
> ...
> gap: 30397..30404 does not append to 30389
> gap: 30435..30442 does not append to 30427
> gap: 30427..30434 does not append to 30443
> gap: 30443..30450 does not append to 30435
> gap: 30473..30480 does not append to 30465
> gap: 30465..30472 does not append to 30481
> gap: 30481..30488 does not append to 30473
> tx=30491 (1902 MB) txc=30491 zc=y
> rx=15245 (1902 MB)
> ```
> 
> Not sure about the exact reason for this OOM problem, and why
> turning on DEBUG_LOCKDEP and PROVE_RAW_LOCK_NESTING can solve
> the problem with reordered notifications...

The debug config causes the reordering notifications, right? But
solves the OOM.

> If you have any thoughts or
> comments, please feel free to share them with us.
> 
> If the problem does exist, I guess we can force `do_recv_completions`
> after some number of sendmsgs and move "gap: ..." after cfg_verbose?

I do want to understand the issue better. But not sure when I'll find
the time.

Both sound reasonable to me, yes.
Zijian Zhang April 12, 2024, 6:58 p.m. UTC | #7
On 4/12/24 8:36 AM, Willem de Bruijn wrote:
> Zijian Zhang wrote:
>> On 4/10/24 4:06 PM, Willem de Bruijn wrote:
>>>>>>> In this case, for some reason, notifications do not
>>>>>>> come in order now. We introduce "cfg_notification_order_check" to
>>>>>>> possibly ignore the checking for order.
>>>>>>
>>>>>> Were you testing UDP?
>>>>>>
>>>>>> I don't think this is needed. I wonder what you were doing to see
>>>>>> enough of these events to want to suppress the log output.
>>>>
>>>> I tested again on both TCP and UDP just now, and it happened to both of
>>>> them. For tcp test, too many printfs will delay the sending and thus
>>>> affect the throughput.
>>>>
>>>> ipv4 tcp -z -t 1
>>>> gap: 277..277 does not append to 276
>>>
>>> There is something wrong here. 277 clearly appends to 276
>>>
>>
>> ```
>> if (lo != next_completion)
>>       fprintf(stderr, "gap: %u..%u does not append to %u\n",
>>           lo, hi, next_completion);
>> ```
>>
>> According to the code, it expects the lo to be 276, but it's 277.
> 
> Ack. I should have phrased that message better.
>   
>>> If you ran this on a kernel with a variety of changes, please repeat
>>> this on a clean kernel with no other changes besides the
>>> skb_orphan_frags_rx loopback change.
>>>
>>> It this is a real issue, I don't mind moving this behind cfg_verbose.
>>> And prefer that approach over adding a new flag.
>>>
>>> But I have never seen this before, and this kind of reordering is rare
>>> with UDP and should not happen with TCP except for really edge cases:
>>> the uarg is released only when both the skb was delivered and the ACK
>>> response was received to free the clone on the retransmit queue.
>>
>> I found the set up where I encountered the OOM problem in msg_zerocopy
>> selftest. I did it on a clean kernel vm booted by qemu,
>> dfa2f0483360("tcp: get rid of sysctl_tcp_adv_win_scale") with only
>> skb_orphan_frags_rx change.
>>
>> Then, I do `make olddefconfig` and turn on some configurations for
>> virtualization like VIRTIO_FS, VIRTIO_NET and some confs like 9P_FS
>> to share folders. Let's call it config, here was the result I got,
>> ```
>> ./msg_zerocopy.sh
>> ipv4 tcp -z -t 1
>> ./msg_zerocopy: send: No buffer space available
>> rx=564 (70 MB)
>> ```
>>
>> Since the TCP socket is always writable, the do_poll always return True.
>> There is no any chance for `do_recv_completions` to run.
>> ```
>> while (!do_poll(fd, POLLOUT)) {
>>       if (cfg_zerocopy)
>>           do_recv_completions(fd, domain);
>>       }
>> ```
>> Finally, the size of sendmsg zerocopy notification skbs exceeds the
>> opt_mem limit. I got "No buffer space available".
>>
>>
>> However, if I change the config by
>> ```
>>    DEBUG_IRQFLAGS n -> y
>>    DEBUG_LOCK_ALLOC n -> y
>>    DEBUG_MUTEXES n -> y
>>    DEBUG_RT_MUTEXES n -> y
>>    DEBUG_RWSEMS n -> y
>>    DEBUG_SPINLOCK n -> y
>>    DEBUG_WW_MUTEX_SLOWPATH n -> y
>>    PROVE_LOCKING n -> y
>> +DEBUG_LOCKDEP y
>> +LOCKDEP y
>> +LOCKDEP_BITS 15
>> +LOCKDEP_CHAINS_BITS 16
>> +LOCKDEP_CIRCULAR_QUEUE_BITS 12
>> +LOCKDEP_STACK_TRACE_BITS 19
>> +LOCKDEP_STACK_TRACE_HASH_BITS 14
>> +PREEMPTIRQ_TRACEPOINTS y
>> +PROVE_RAW_LOCK_NESTING n
>> +PROVE_RCU y
>> +TRACE_IRQFLAGS y
>> +TRACE_IRQFLAGS_NMI y
>> ```
>>
>> Let's call it config-debug, the selftest works well with reordered
>> notifications.
>> ```
>> ipv4 tcp -z -t 1
>> gap: 2117..2117 does not append to 2115
>> gap: 2115..2116 does not append to 2118
>> gap: 2118..3144 does not append to 2117
>> gap: 3146..3146 does not append to 3145
>> gap: 3145..3145 does not append to 3147
>> gap: 3147..3768 does not append to 3146
>> ...
>> gap: 34935..34935 does not append to 34937
>> gap: 34938..36409 does not append to 34936
>>
>> rx=36097 (2272 MB)
>> missing notifications: 36410 < 36412
>> tx=36412 (2272 MB) txc=36410 zc=y
>> ```
>> For exact config to compile the kernel, please see
>> https://github.com/Sm0ckingBird/config
> 
> Thanks for sharing the system configs. I'm quite surprised at these
> reorderings *over loopback* with these debug settings, and no weird
> qdiscs that would explain it. Can you see whether you see drops and
> retransmits?
> 

No drops and retransmits are observed.

```
ip netns exec ns-djROUw1 netstat -s

Tcp:
     1 active connection openings
     0 passive connection openings
     0 failed connection attempts
     1 connection resets received
     0 connections established
     16158 segments received
     32311 segments sent out
     0 segments retransmitted
     0 bad segments received
     0 resets sent

ip netns exec ns-djROUw1 ip -s link show veth0

2: veth0@if2: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 65535 qdisc noqueue 
state UP mode DEFAULT group default qlen 1000
     link/ether 02:02:02:02:02:02 brd ff:ff:ff:ff:ff:ff link-netns 
ns-djROUw2
     RX: bytes  packets  errors  dropped overrun mcast
     1067646    16173    0       0       0       0
     TX: bytes  packets  errors  dropped carrier collsns
     2116207634 32325    0       0       0       0
```

>>
>> I also did selftest on 63c8778d9149("Merge branch
>> 'net-mana-fix-doorbell-access-for-receive-queues'"), the parent of
>> dfa2f0483360("tcp: get rid of sysctl_tcp_adv_win_scale")
>>
>> with config, selftest works well.
>> ```
>> ipv4 tcp -z -t 1
>> missing notifications: 223181 < 223188
>> tx=223188 (13927 MB) txc=223181 zc=y
>> rx=111592 (13927 MB)
>> ```
>>
>> with config-debug, selftest works well with reordered notifications
>> ```
>> ipv4 tcp -z -t 1
>> ...
>> gap: 30397..30404 does not append to 30389
>> gap: 30435..30442 does not append to 30427
>> gap: 30427..30434 does not append to 30443
>> gap: 30443..30450 does not append to 30435
>> gap: 30473..30480 does not append to 30465
>> gap: 30465..30472 does not append to 30481
>> gap: 30481..30488 does not append to 30473
>> tx=30491 (1902 MB) txc=30491 zc=y
>> rx=15245 (1902 MB)
>> ```
>>
>> Not sure about the exact reason for this OOM problem, and why
>> turning on DEBUG_LOCKDEP and PROVE_RAW_LOCK_NESTING can solve
>> the problem with reordered notifications...
> 
> The debug config causes the reordering notifications, right? But
> solves the OOM.
> 

With config, OOM is introduced by dfa2f0483360("tcp: get rid of
sysctl_tcp_adv_win_scale"). And, it can be solved by config-debug,
but reordering notifications are observed.

With config, there is no OOM in 63c8778d9149("Merge branch
'net-mana-fix-doorbell-access-for-receive-queues'"), everything works
well. But with config-debug, reordering notifications are observed.

>> If you have any thoughts or
>> comments, please feel free to share them with us.
>>
>> If the problem does exist, I guess we can force `do_recv_completions`
>> after some number of sendmsgs and move "gap: ..." after cfg_verbose?
> 
> I do want to understand the issue better. But not sure when I'll find
> the time.
> 

I also want to understand this, I will look into it when I find time :)

> Both sound reasonable to me, yes.
Thanks for the review and suggestions, I will update in the next iteration.
diff mbox series

Patch

diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c
index bdc03a2097e8..8e595216a0af 100644
--- a/tools/testing/selftests/net/msg_zerocopy.c
+++ b/tools/testing/selftests/net/msg_zerocopy.c
@@ -85,6 +85,8 @@  static bool cfg_rx;
 static int  cfg_runtime_ms	= 4200;
 static int  cfg_verbose;
 static int  cfg_waittime_ms	= 500;
+static bool cfg_notification_order_check;
+static int  cfg_notification_limit = 32;
 static bool cfg_zerocopy;
 
 static socklen_t cfg_alen;
@@ -435,7 +437,7 @@  static bool do_recv_completion(int fd, int domain)
 	/* Detect notification gaps. These should not happen often, if at all.
 	 * Gaps can occur due to drops, reordering and retransmissions.
 	 */
-	if (lo != next_completion)
+	if (cfg_notification_order_check && lo != next_completion)
 		fprintf(stderr, "gap: %u..%u does not append to %u\n",
 			lo, hi, next_completion);
 	next_completion = hi + 1;
@@ -489,7 +491,7 @@  static void do_tx(int domain, int type, int protocol)
 		struct iphdr iph;
 	} nh;
 	uint64_t tstop;
-	int fd;
+	int fd, sendmsg_counter = 0;
 
 	fd = do_setup_tx(domain, type, protocol);
 
@@ -548,10 +550,18 @@  static void do_tx(int domain, int type, int protocol)
 			do_sendmsg_corked(fd, &msg);
 		else
 			do_sendmsg(fd, &msg, cfg_zerocopy, domain);
+		sendmsg_counter++;
+
+		if (sendmsg_counter == cfg_notification_limit && cfg_zerocopy) {
+			do_recv_completions(fd, domain);
+			sendmsg_counter = 0;
+		}
 
 		while (!do_poll(fd, POLLOUT)) {
-			if (cfg_zerocopy)
+			if (cfg_zerocopy) {
 				do_recv_completions(fd, domain);
+				sendmsg_counter = 0;
+			}
 		}
 
 	} while (gettimeofday_ms() < tstop);
@@ -708,7 +718,7 @@  static void parse_opts(int argc, char **argv)
 
 	cfg_payload_len = max_payload_len;
 
-	while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vz")) != -1) {
+	while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vzol:")) != -1) {
 		switch (c) {
 		case '4':
 			if (cfg_family != PF_UNSPEC)
@@ -760,6 +770,12 @@  static void parse_opts(int argc, char **argv)
 		case 'z':
 			cfg_zerocopy = true;
 			break;
+		case 'o':
+			cfg_notification_order_check = true;
+			break;
+		case 'l':
+			cfg_notification_limit = strtoul(optarg, NULL, 0);
+			break;
 		}
 	}