
selftests: net: udpgso_bench_tx: Introduce exponential back-off retries

Message ID 20230127181625.286546-1-andrei.gherzan@canonical.com (mailing list archive)
State New
Series selftests: net: udpgso_bench_tx: Introduce exponential back-off retries

Commit Message

Andrei Gherzan Jan. 27, 2023, 6:16 p.m. UTC
The tx and rx test programs are used in a couple of test scripts, including
"udpgro_bench.sh". Taking this as an example, when the rx and tx programs
are started one right after the other, there is a chance that the rx one is
not yet ready to accept socket connections. This race can fail the test with
at least one of the following errors:

./udpgso_bench_tx: connect: Connection refused
./udpgso_bench_tx: sendmsg: Connection refused
./udpgso_bench_tx: write: Connection refused

This change addresses the race by adding routines that retry the socket
operations with an exponential back-off, starting at 100ms and capped at 2s.

Fixes: 3a687bef148d ("selftests: udp gso benchmark")
Signed-off-by: Andrei Gherzan <andrei.gherzan@canonical.com>
---
 tools/testing/selftests/net/udpgso_bench_tx.c | 57 +++++++++++++------
 1 file changed, 41 insertions(+), 16 deletions(-)

Comments

Willem de Bruijn Jan. 27, 2023, 10:03 p.m. UTC | #1
On Fri, Jan 27, 2023 at 1:16 PM Andrei Gherzan
<andrei.gherzan@canonical.com> wrote:
>
> The tx and rx test programs are used in a couple of test scripts including
> "udpgro_bench.sh". Taking this as an example, when the rx/tx programs
> are invoked subsequently, there is a chance that the rx one is not ready to
> accept socket connections. This racing bug could fail the test with at
> least one of the following:
>
> ./udpgso_bench_tx: connect: Connection refused
> ./udpgso_bench_tx: sendmsg: Connection refused
> ./udpgso_bench_tx: write: Connection refused
>
> This change addresses this by adding routines that retry the socket
> operations with an exponential back off algorithm from 100ms to 2s.
>
> Fixes: 3a687bef148d ("selftests: udp gso benchmark")
> Signed-off-by: Andrei Gherzan <andrei.gherzan@canonical.com>

Synchronizing the two processes is indeed tricky.

Perhaps more robust is opening an initial TCP connection, with
SO_RCVTIMEO to bound the waiting time. That covers all tests in one
go.
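
One possible shape of that handshake, as a hedged sketch: assume the receiver
also opens a TCP control listener before its data sockets and writes a single
byte once it is ready, and that the sender calls a helper like the one below.
All names here are illustrative and not part of the current selftest; the
control connect itself is not retried in this sketch.

#include <errno.h>
#include <error.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <unistd.h>

/* Wait for the receiver's "ready" byte, bounding the blocking read with
 * SO_RCVTIMEO so the sender can never hang forever. */
static void wait_for_receiver(const struct sockaddr *ctrl_addr, socklen_t alen)
{
	struct timeval tv = { .tv_sec = 5 };	/* overall wait bound */
	char byte;
	int fd;

	fd = socket(ctrl_addr->sa_family, SOCK_STREAM, 0);
	if (fd == -1)
		error(1, errno, "socket ctrl");
	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
		error(1, errno, "setsockopt SO_RCVTIMEO");
	if (connect(fd, ctrl_addr, alen))
		error(1, errno, "connect ctrl");
	if (read(fd, &byte, 1) != 1)	/* receiver signals readiness */
		error(1, errno, "read ctrl");
	close(fd);
}

The TCP handshake plus the bounded read gives the sender a single
synchronization point that the subsequent TCP and UDP tests can rely on.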

> ---
>  tools/testing/selftests/net/udpgso_bench_tx.c | 57 +++++++++++++------
>  1 file changed, 41 insertions(+), 16 deletions(-)
>
> diff --git a/tools/testing/selftests/net/udpgso_bench_tx.c b/tools/testing/selftests/net/udpgso_bench_tx.c
> index f1fdaa270291..4dea9ee7eb46 100644
> --- a/tools/testing/selftests/net/udpgso_bench_tx.c
> +++ b/tools/testing/selftests/net/udpgso_bench_tx.c
> @@ -53,6 +53,9 @@
>
>  #define NUM_PKT                100
>
> +#define MAX_DELAY      2000000
> +#define INIT_DELAY     100000
> +
>  static bool    cfg_cache_trash;
>  static int     cfg_cpu         = -1;
>  static int     cfg_connected   = true;
> @@ -257,13 +260,18 @@ static void flush_errqueue(int fd, const bool do_poll)
>  static int send_tcp(int fd, char *data)
>  {
>         int ret, done = 0, count = 0;
> +       useconds_t delay = INIT_DELAY;
>
>         while (done < cfg_payload_len) {
> -               ret = send(fd, data + done, cfg_payload_len - done,
> -                          cfg_zerocopy ? MSG_ZEROCOPY : 0);
> -               if (ret == -1)
> -                       error(1, errno, "write");
> -
> +               delay = INIT_DELAY;
> +               while ((ret = send(fd, data + done, cfg_payload_len - done,
> +                               cfg_zerocopy ? MSG_ZEROCOPY : 0)) == -1) {
> +                       usleep(delay);
> +                       if (delay < MAX_DELAY)
> +                               delay *= 2;
> +                       else
> +                               error(1, errno, "write");
> +               }
>                 done += ret;
>                 count++;
>         }

send_tcp should not be affected, as connect will by then already have
succeeded. Also, TCP being a reliable protocol, it will retry internally
after send returns with success.

> @@ -274,17 +282,23 @@ static int send_tcp(int fd, char *data)
>  static int send_udp(int fd, char *data)
>  {
>         int ret, total_len, len, count = 0;
> +       useconds_t delay = INIT_DELAY;
>
>         total_len = cfg_payload_len;
>
>         while (total_len) {
>                 len = total_len < cfg_mss ? total_len : cfg_mss;
>
> -               ret = sendto(fd, data, len, cfg_zerocopy ? MSG_ZEROCOPY : 0,
> -                            cfg_connected ? NULL : (void *)&cfg_dst_addr,
> -                            cfg_connected ? 0 : cfg_alen);
> -               if (ret == -1)
> -                       error(1, errno, "write");
> +               delay = INIT_DELAY;
> +               while ((ret = sendto(fd, data, len, cfg_zerocopy ? MSG_ZEROCOPY : 0,
> +                               cfg_connected ? NULL : (void *)&cfg_dst_addr,
> +                               cfg_connected ? 0 : cfg_alen)) == -1) {

This should ideally retry only on the expected errno. An unreliable
datagram sendto will initially succeed; it only fails with an error later,
after reception of an ICMP destination unreachable response.
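
A hedged sketch of that suggestion applied to the sendto() loop above:
retry only on ECONNREFUSED (the errno surfaced once the ICMP port
unreachable comes back from the not-yet-ready receiver) and fail
immediately on anything else, keeping the same INIT_DELAY/MAX_DELAY
back-off introduced by this patch:

		delay = INIT_DELAY;
		while ((ret = sendto(fd, data, len, cfg_zerocopy ? MSG_ZEROCOPY : 0,
				cfg_connected ? NULL : (void *)&cfg_dst_addr,
				cfg_connected ? 0 : cfg_alen)) == -1) {
			if (errno != ECONNREFUSED || delay >= MAX_DELAY)
				error(1, errno, "write");
			usleep(delay);
			delay *= 2;
		}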

> +                       usleep(delay);
> +                       if (delay < MAX_DELAY)
> +                               delay *= 2;
> +                       else
> +                               error(1, errno, "write");
> +               }
>                 if (ret != len)
>                         error(1, errno, "write: %uB != %uB\n", ret, len);
>
> @@ -378,6 +392,7 @@ static int send_udp_segment(int fd, char *data)
>         struct iovec iov = {0};
>         size_t msg_controllen;
>         struct cmsghdr *cmsg;
> +       useconds_t delay = INIT_DELAY;
>         int ret;
>
>         iov.iov_base = data;
> @@ -401,9 +416,13 @@ static int send_udp_segment(int fd, char *data)
>         msg.msg_name = (void *)&cfg_dst_addr;
>         msg.msg_namelen = cfg_alen;
>
> -       ret = sendmsg(fd, &msg, cfg_zerocopy ? MSG_ZEROCOPY : 0);
> -       if (ret == -1)
> -               error(1, errno, "sendmsg");
> +       while ((ret = sendmsg(fd, &msg, cfg_zerocopy ? MSG_ZEROCOPY : 0)) == -1) {
> +               usleep(delay);
> +               if (delay < MAX_DELAY)
> +                       delay *= 2;
> +               else
> +                       error(1, errno, "sendmsg");
> +       }
>         if (ret != iov.iov_len)
>                 error(1, 0, "sendmsg: %u != %llu\n", ret,
>                         (unsigned long long)iov.iov_len);
> @@ -616,6 +635,7 @@ int main(int argc, char **argv)
>  {
>         unsigned long num_msgs, num_sends;
>         unsigned long tnow, treport, tstop;
> +       useconds_t delay = INIT_DELAY;
>         int fd, i, val, ret;
>
>         parse_opts(argc, argv);
> @@ -648,9 +668,14 @@ int main(int argc, char **argv)
>                 }
>         }
>
> -       if (cfg_connected &&
> -           connect(fd, (void *)&cfg_dst_addr, cfg_alen))
> -               error(1, errno, "connect");
> +       if (cfg_connected)
> +               while (connect(fd, (void *)&cfg_dst_addr, cfg_alen)) {
> +                       usleep(delay);
> +                       if (delay < MAX_DELAY)
> +                               delay *= 2;
> +                       else
> +                               error(1, errno, "connect");
> +               }
>
>         if (cfg_segment)
>                 set_pmtu_discover(fd, cfg_family == PF_INET);
> --
> 2.34.1
>
Paolo Abeni Jan. 30, 2023, 8:26 a.m. UTC | #2
On Fri, 2023-01-27 at 17:03 -0500, Willem de Bruijn wrote:
> On Fri, Jan 27, 2023 at 1:16 PM Andrei Gherzan
> <andrei.gherzan@canonical.com> wrote:
> > 
> > The tx and rx test programs are used in a couple of test scripts including
> > "udpgro_bench.sh". Taking this as an example, when the rx/tx programs
> > are invoked subsequently, there is a chance that the rx one is not ready to
> > accept socket connections. This racing bug could fail the test with at
> > least one of the following:
> > 
> > ./udpgso_bench_tx: connect: Connection refused
> > ./udpgso_bench_tx: sendmsg: Connection refused
> > ./udpgso_bench_tx: write: Connection refused
> > 
> > This change addresses this by adding routines that retry the socket
> > operations with an exponential back off algorithm from 100ms to 2s.
> > 
> > Fixes: 3a687bef148d ("selftests: udp gso benchmark")
> > Signed-off-by: Andrei Gherzan <andrei.gherzan@canonical.com>
> 
> Synchronizing the two processes is indeed tricky.
> 
> Perhaps more robust is opening an initial TCP connection, with
> SO_RCVTIMEO to bound the waiting time. That covers all tests in one
> go.

Another option would be waiting for the listener (tcp) / receiver (udp)
socket to show up in 'ss' output before firing up the client - much like
what the mptcp self-tests are doing.

Cheers,

Paolo
Andrei Gherzan Jan. 30, 2023, 12:51 p.m. UTC | #3
On 23/01/30 09:26AM, Paolo Abeni wrote:
> On Fri, 2023-01-27 at 17:03 -0500, Willem de Bruijn wrote:
> > On Fri, Jan 27, 2023 at 1:16 PM Andrei Gherzan
> > <andrei.gherzan@canonical.com> wrote:
> > > 
> > > The tx and rx test programs are used in a couple of test scripts including
> > > "udpgro_bench.sh". Taking this as an example, when the rx/tx programs
> > > are invoked subsequently, there is a chance that the rx one is not ready to
> > > accept socket connections. This racing bug could fail the test with at
> > > least one of the following:
> > > 
> > > ./udpgso_bench_tx: connect: Connection refused
> > > ./udpgso_bench_tx: sendmsg: Connection refused
> > > ./udpgso_bench_tx: write: Connection refused
> > > 
> > > This change addresses this by adding routines that retry the socket
> > > operations with an exponential back off algorithm from 100ms to 2s.
> > > 
> > > Fixes: 3a687bef148d ("selftests: udp gso benchmark")
> > > Signed-off-by: Andrei Gherzan <andrei.gherzan@canonical.com>
> > 
> > Synchronizing the two processes is indeed tricky.
> > 
> > Perhaps more robust is opening an initial TCP connection, with
> > SO_RCVTIMEO to bound the waiting time. That covers all tests in one
> > go.
> 
> Another option would be waiting for the listener(tcp)/receiver(udp)
> socket to show up in 'ss' output before firing-up the client - quite
> alike what mptcp self-tests are doing.

I like this idea. I have tested it and it works as expected with the
exception of:

./udpgso_bench_tx: sendmsg: No buffer space available

Any ideas on how to handle this? I could retry and that works.
Willem de Bruijn Jan. 30, 2023, 1:35 p.m. UTC | #4
On Mon, Jan 30, 2023 at 7:51 AM Andrei Gherzan
<andrei.gherzan@canonical.com> wrote:
>
> On 23/01/30 09:26AM, Paolo Abeni wrote:
> > On Fri, 2023-01-27 at 17:03 -0500, Willem de Bruijn wrote:
> > > On Fri, Jan 27, 2023 at 1:16 PM Andrei Gherzan
> > > <andrei.gherzan@canonical.com> wrote:
> > > >
> > > > The tx and rx test programs are used in a couple of test scripts including
> > > > "udpgro_bench.sh". Taking this as an example, when the rx/tx programs
> > > > are invoked subsequently, there is a chance that the rx one is not ready to
> > > > accept socket connections. This racing bug could fail the test with at
> > > > least one of the following:
> > > >
> > > > ./udpgso_bench_tx: connect: Connection refused
> > > > ./udpgso_bench_tx: sendmsg: Connection refused
> > > > ./udpgso_bench_tx: write: Connection refused
> > > >
> > > > This change addresses this by adding routines that retry the socket
> > > > operations with an exponential back off algorithm from 100ms to 2s.
> > > >
> > > > Fixes: 3a687bef148d ("selftests: udp gso benchmark")
> > > > Signed-off-by: Andrei Gherzan <andrei.gherzan@canonical.com>
> > >
> > > Synchronizing the two processes is indeed tricky.
> > >
> > > Perhaps more robust is opening an initial TCP connection, with
> > > SO_RCVTIMEO to bound the waiting time. That covers all tests in one
> > > go.
> >
> > Another option would be waiting for the listener(tcp)/receiver(udp)
> > socket to show up in 'ss' output before firing-up the client - quite
> > alike what mptcp self-tests are doing.
>
> I like this idea. I have tested it and it works as expected with the
> exeception of:
>
> ./udpgso_bench_tx: sendmsg: No buffer space available
>
> Any ideas on how to handle this? I could retry and that works.

This happens (also) without the zerocopy flag, right?

It might mean reaching the sndbuf limit, which can be adjusted with
SO_SNDBUF (or SO_SNDBUFFORCE if CAP_NET_ADMIN). Though I would not
expect this test to bump up against that limit.
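
For reference, raising the send buffer is a single setsockopt; a hedged
sketch with a purely illustrative value (SO_SNDBUFFORCE additionally
requires CAP_NET_ADMIN and ignores net.core.wmem_max):

	int sndbuf = 4 * 1024 * 1024;	/* illustrative value only */

	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)))
		error(1, errno, "setsockopt SO_SNDBUF");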

A few zerocopy specific reasons are captured in
https://www.kernel.org/doc/html/latest/networking/msg_zerocopy.html#transmission.
Andrei Gherzan Jan. 30, 2023, 2:28 p.m. UTC | #5
On 23/01/30 08:35AM, Willem de Bruijn wrote:
> On Mon, Jan 30, 2023 at 7:51 AM Andrei Gherzan
> <andrei.gherzan@canonical.com> wrote:
> >
> > On 23/01/30 09:26AM, Paolo Abeni wrote:
> > > On Fri, 2023-01-27 at 17:03 -0500, Willem de Bruijn wrote:
> > > > On Fri, Jan 27, 2023 at 1:16 PM Andrei Gherzan
> > > > <andrei.gherzan@canonical.com> wrote:
> > > > >
> > > > > The tx and rx test programs are used in a couple of test scripts including
> > > > > "udpgro_bench.sh". Taking this as an example, when the rx/tx programs
> > > > > are invoked subsequently, there is a chance that the rx one is not ready to
> > > > > accept socket connections. This racing bug could fail the test with at
> > > > > least one of the following:
> > > > >
> > > > > ./udpgso_bench_tx: connect: Connection refused
> > > > > ./udpgso_bench_tx: sendmsg: Connection refused
> > > > > ./udpgso_bench_tx: write: Connection refused
> > > > >
> > > > > This change addresses this by adding routines that retry the socket
> > > > > operations with an exponential back off algorithm from 100ms to 2s.
> > > > >
> > > > > Fixes: 3a687bef148d ("selftests: udp gso benchmark")
> > > > > Signed-off-by: Andrei Gherzan <andrei.gherzan@canonical.com>
> > > >
> > > > Synchronizing the two processes is indeed tricky.
> > > >
> > > > Perhaps more robust is opening an initial TCP connection, with
> > > > SO_RCVTIMEO to bound the waiting time. That covers all tests in one
> > > > go.
> > >
> > > Another option would be waiting for the listener(tcp)/receiver(udp)
> > > socket to show up in 'ss' output before firing-up the client - quite
> > > alike what mptcp self-tests are doing.
> >
> > I like this idea. I have tested it and it works as expected with the
> > exeception of:
> >
> > ./udpgso_bench_tx: sendmsg: No buffer space available
> >
> > Any ideas on how to handle this? I could retry and that works.
> 
> This happens (also) without the zerocopy flag, right? That
> 
> It might mean reaching the sndbuf limit, which can be adjusted with
> SO_SNDBUF (or SO_SNDBUFFORCE if CAP_NET_ADMIN). Though I would not
> expect this test to bump up against that limit.
> 
> A few zerocopy specific reasons are captured in
> https://www.kernel.org/doc/html/latest/networking/msg_zerocopy.html#transmission.

I have dug a bit more into this, and it does look like your hint was in
the right direction. The failures I'm seeing are only with the zerocopy
flag.

From the reasons in the doc above, and given that I've reproduced it with
unlimited locked pages and the failures are transient, that leaves the
optmem limit. Bumping the value I have by default (20480) to (2048000)
made the sendmsg succeed as expected. On the other hand, the tests started
to fail with something like:

./udpgso_bench_tx: Unexpected number of Zerocopy completions:    774783
expected    773707 received

Also, this audit failure is transient, just like the buffer limit one.
Willem de Bruijn Jan. 30, 2023, 4:03 p.m. UTC | #6
On Mon, Jan 30, 2023 at 9:28 AM Andrei Gherzan
<andrei.gherzan@canonical.com> wrote:
>
> On 23/01/30 08:35AM, Willem de Bruijn wrote:
> > On Mon, Jan 30, 2023 at 7:51 AM Andrei Gherzan
> > <andrei.gherzan@canonical.com> wrote:
> > >
> > > On 23/01/30 09:26AM, Paolo Abeni wrote:
> > > > On Fri, 2023-01-27 at 17:03 -0500, Willem de Bruijn wrote:
> > > > > On Fri, Jan 27, 2023 at 1:16 PM Andrei Gherzan
> > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > >
> > > > > > The tx and rx test programs are used in a couple of test scripts including
> > > > > > "udpgro_bench.sh". Taking this as an example, when the rx/tx programs
> > > > > > are invoked subsequently, there is a chance that the rx one is not ready to
> > > > > > accept socket connections. This racing bug could fail the test with at
> > > > > > least one of the following:
> > > > > >
> > > > > > ./udpgso_bench_tx: connect: Connection refused
> > > > > > ./udpgso_bench_tx: sendmsg: Connection refused
> > > > > > ./udpgso_bench_tx: write: Connection refused
> > > > > >
> > > > > > This change addresses this by adding routines that retry the socket
> > > > > > operations with an exponential back off algorithm from 100ms to 2s.
> > > > > >
> > > > > > Fixes: 3a687bef148d ("selftests: udp gso benchmark")
> > > > > > Signed-off-by: Andrei Gherzan <andrei.gherzan@canonical.com>
> > > > >
> > > > > Synchronizing the two processes is indeed tricky.
> > > > >
> > > > > Perhaps more robust is opening an initial TCP connection, with
> > > > > SO_RCVTIMEO to bound the waiting time. That covers all tests in one
> > > > > go.
> > > >
> > > > Another option would be waiting for the listener(tcp)/receiver(udp)
> > > > socket to show up in 'ss' output before firing-up the client - quite
> > > > alike what mptcp self-tests are doing.
> > >
> > > I like this idea. I have tested it and it works as expected with the
> > > exeception of:
> > >
> > > ./udpgso_bench_tx: sendmsg: No buffer space available
> > >
> > > Any ideas on how to handle this? I could retry and that works.
> >
> > This happens (also) without the zerocopy flag, right? That
> >
> > It might mean reaching the sndbuf limit, which can be adjusted with
> > SO_SNDBUF (or SO_SNDBUFFORCE if CAP_NET_ADMIN). Though I would not
> > expect this test to bump up against that limit.
> >
> > A few zerocopy specific reasons are captured in
> > https://www.kernel.org/doc/html/latest/networking/msg_zerocopy.html#transmission.
>
> I have dug a bit more into this, and it does look like your hint was in
> the right direction. The fails I'm seeing are only with the zerocopy
> flag.
>
> From the reasons (doc) above I can only assume optmem limit as I've
> reproduced it with unlimited locked pages and the fails are transient.
> That leaves optmem limit. Bumping the value I have by default (20480) to
> (2048000) made the sendmsg succeed as expected. On the other hand, the
> tests started to fail with something like:
>
> ./udpgso_bench_tx: Unexpected number of Zerocopy completions:    774783
> expected    773707 received

More zerocopy completions than number of sends. I have not seen this before.

The completions are ranges of IDs, one per send call for datagram sockets.

Even with segmentation offload, the counter increases per call, not per segment.
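
For reference, this is roughly how those completion ranges are read off the
socket error queue; an illustrative helper in the spirit of msg_zerocopy.rst,
not the exact code in udpgso_bench_tx.c (the helper name and the completions
counter are hypothetical):

#include <linux/errqueue.h>
#include <sys/socket.h>

/* Drain MSG_ERRQUEUE once and count completed send calls. Each
 * SO_EE_ORIGIN_ZEROCOPY notification carries an inclusive range of
 * send IDs in ee_info..ee_data, one ID per send call. */
static void count_zerocopy_completions(int fd, unsigned long *completions)
{
	char control[CMSG_SPACE(sizeof(struct sock_extended_err))] = {0};
	struct msghdr msg = { .msg_control = control,
			      .msg_controllen = sizeof(control) };
	struct sock_extended_err *serr;
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) == -1)
		return;		/* nothing queued right now */

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		serr = (void *)CMSG_DATA(cm);
		if (serr->ee_errno != 0 ||
		    serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
			continue;
		*completions += serr->ee_data - serr->ee_info + 1;
	}
}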

Do you experience this without any other changes to udpgso_bench_tx.c?
Or are there perhaps additional sendmsg calls somewhere (during
initial sync) that are not accounted for in num_sends?

> Also, this audit fail is transient as with the buffer limit one.
Andrei Gherzan Jan. 30, 2023, 4:15 p.m. UTC | #7
On 23/01/30 11:03AM, Willem de Bruijn wrote:
> On Mon, Jan 30, 2023 at 9:28 AM Andrei Gherzan
> <andrei.gherzan@canonical.com> wrote:
> >
> > On 23/01/30 08:35AM, Willem de Bruijn wrote:
> > > On Mon, Jan 30, 2023 at 7:51 AM Andrei Gherzan
> > > <andrei.gherzan@canonical.com> wrote:
> > > >
> > > > On 23/01/30 09:26AM, Paolo Abeni wrote:
> > > > > On Fri, 2023-01-27 at 17:03 -0500, Willem de Bruijn wrote:
> > > > > > On Fri, Jan 27, 2023 at 1:16 PM Andrei Gherzan
> > > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > > >
> > > > > > > The tx and rx test programs are used in a couple of test scripts including
> > > > > > > "udpgro_bench.sh". Taking this as an example, when the rx/tx programs
> > > > > > > are invoked subsequently, there is a chance that the rx one is not ready to
> > > > > > > accept socket connections. This racing bug could fail the test with at
> > > > > > > least one of the following:
> > > > > > >
> > > > > > > ./udpgso_bench_tx: connect: Connection refused
> > > > > > > ./udpgso_bench_tx: sendmsg: Connection refused
> > > > > > > ./udpgso_bench_tx: write: Connection refused
> > > > > > >
> > > > > > > This change addresses this by adding routines that retry the socket
> > > > > > > operations with an exponential back off algorithm from 100ms to 2s.
> > > > > > >
> > > > > > > Fixes: 3a687bef148d ("selftests: udp gso benchmark")
> > > > > > > Signed-off-by: Andrei Gherzan <andrei.gherzan@canonical.com>
> > > > > >
> > > > > > Synchronizing the two processes is indeed tricky.
> > > > > >
> > > > > > Perhaps more robust is opening an initial TCP connection, with
> > > > > > SO_RCVTIMEO to bound the waiting time. That covers all tests in one
> > > > > > go.
> > > > >
> > > > > Another option would be waiting for the listener(tcp)/receiver(udp)
> > > > > socket to show up in 'ss' output before firing-up the client - quite
> > > > > alike what mptcp self-tests are doing.
> > > >
> > > > I like this idea. I have tested it and it works as expected with the
> > > > exeception of:
> > > >
> > > > ./udpgso_bench_tx: sendmsg: No buffer space available
> > > >
> > > > Any ideas on how to handle this? I could retry and that works.
> > >
> > > This happens (also) without the zerocopy flag, right? That
> > >
> > > It might mean reaching the sndbuf limit, which can be adjusted with
> > > SO_SNDBUF (or SO_SNDBUFFORCE if CAP_NET_ADMIN). Though I would not
> > > expect this test to bump up against that limit.
> > >
> > > A few zerocopy specific reasons are captured in
> > > https://www.kernel.org/doc/html/latest/networking/msg_zerocopy.html#transmission.
> >
> > I have dug a bit more into this, and it does look like your hint was in
> > the right direction. The fails I'm seeing are only with the zerocopy
> > flag.
> >
> > From the reasons (doc) above I can only assume optmem limit as I've
> > reproduced it with unlimited locked pages and the fails are transient.
> > That leaves optmem limit. Bumping the value I have by default (20480) to
> > (2048000) made the sendmsg succeed as expected. On the other hand, the
> > tests started to fail with something like:
> >
> > ./udpgso_bench_tx: Unexpected number of Zerocopy completions:    774783
> > expected    773707 received
> 
> More zerocopy completions than number of sends. I have not seen this before.
> 
> The completions are ranges of IDs, one per send call for datagram sockets.
> 
> Even with segmentation offload, the counter increases per call, not per segment.
> 
> Do you experience this without any other changes to udpgso_bench_tx.c.
> Or are there perhaps additional sendmsg calls somewhere (during
> initial sync) that are not accounted to num_sends?

Indeed, that looks off. No, I have run into this without any changes in
the tests (besides the retry routine in the shell script that waits for
rx to come up).

As an additional data point, this was only seen on the IPv6 tests. I've
never been able to replicate it on the IPv4 run.
Andrei Gherzan Jan. 30, 2023, 4:23 p.m. UTC | #8
On 23/01/30 04:15PM, Andrei Gherzan wrote:
> On 23/01/30 11:03AM, Willem de Bruijn wrote:
> > On Mon, Jan 30, 2023 at 9:28 AM Andrei Gherzan
> > <andrei.gherzan@canonical.com> wrote:
> > >
> > > On 23/01/30 08:35AM, Willem de Bruijn wrote:
> > > > On Mon, Jan 30, 2023 at 7:51 AM Andrei Gherzan
> > > > <andrei.gherzan@canonical.com> wrote:
> > > > >
> > > > > On 23/01/30 09:26AM, Paolo Abeni wrote:
> > > > > > On Fri, 2023-01-27 at 17:03 -0500, Willem de Bruijn wrote:
> > > > > > > On Fri, Jan 27, 2023 at 1:16 PM Andrei Gherzan
> > > > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > > > >
> > > > > > > > The tx and rx test programs are used in a couple of test scripts including
> > > > > > > > "udpgro_bench.sh". Taking this as an example, when the rx/tx programs
> > > > > > > > are invoked subsequently, there is a chance that the rx one is not ready to
> > > > > > > > accept socket connections. This racing bug could fail the test with at
> > > > > > > > least one of the following:
> > > > > > > >
> > > > > > > > ./udpgso_bench_tx: connect: Connection refused
> > > > > > > > ./udpgso_bench_tx: sendmsg: Connection refused
> > > > > > > > ./udpgso_bench_tx: write: Connection refused
> > > > > > > >
> > > > > > > > This change addresses this by adding routines that retry the socket
> > > > > > > > operations with an exponential back off algorithm from 100ms to 2s.
> > > > > > > >
> > > > > > > > Fixes: 3a687bef148d ("selftests: udp gso benchmark")
> > > > > > > > Signed-off-by: Andrei Gherzan <andrei.gherzan@canonical.com>
> > > > > > >
> > > > > > > Synchronizing the two processes is indeed tricky.
> > > > > > >
> > > > > > > Perhaps more robust is opening an initial TCP connection, with
> > > > > > > SO_RCVTIMEO to bound the waiting time. That covers all tests in one
> > > > > > > go.
> > > > > >
> > > > > > Another option would be waiting for the listener(tcp)/receiver(udp)
> > > > > > socket to show up in 'ss' output before firing-up the client - quite
> > > > > > alike what mptcp self-tests are doing.
> > > > >
> > > > > I like this idea. I have tested it and it works as expected with the
> > > > > exeception of:
> > > > >
> > > > > ./udpgso_bench_tx: sendmsg: No buffer space available
> > > > >
> > > > > Any ideas on how to handle this? I could retry and that works.
> > > >
> > > > This happens (also) without the zerocopy flag, right? That
> > > >
> > > > It might mean reaching the sndbuf limit, which can be adjusted with
> > > > SO_SNDBUF (or SO_SNDBUFFORCE if CAP_NET_ADMIN). Though I would not
> > > > expect this test to bump up against that limit.
> > > >
> > > > A few zerocopy specific reasons are captured in
> > > > https://www.kernel.org/doc/html/latest/networking/msg_zerocopy.html#transmission.
> > >
> > > I have dug a bit more into this, and it does look like your hint was in
> > > the right direction. The fails I'm seeing are only with the zerocopy
> > > flag.
> > >
> > > From the reasons (doc) above I can only assume optmem limit as I've
> > > reproduced it with unlimited locked pages and the fails are transient.
> > > That leaves optmem limit. Bumping the value I have by default (20480) to
> > > (2048000) made the sendmsg succeed as expected. On the other hand, the
> > > tests started to fail with something like:
> > >
> > > ./udpgso_bench_tx: Unexpected number of Zerocopy completions:    774783
> > > expected    773707 received
> > 
> > More zerocopy completions than number of sends. I have not seen this before.
> > 
> > The completions are ranges of IDs, one per send call for datagram sockets.
> > 
> > Even with segmentation offload, the counter increases per call, not per segment.
> > 
> > Do you experience this without any other changes to udpgso_bench_tx.c.
> > Or are there perhaps additional sendmsg calls somewhere (during
> > initial sync) that are not accounted to num_sends?
> 
> Indeed, that looks off. No, I have run into this without any changes in
> the tests (besides the retry routine in the shell script that waits for
> rx to come up). Also, as a data point.

Actually wait. I don't think that is the case here. "expected" is the
number of sends. In this case we sent 1076 more messages than
completions. Am I missing something obvious?

> 
> As an additional data point, this was only seen on the IPv6 tests. I've
> never been able to replicate it on the IPv4 run.

I was too quick to send that, and it is not correct. I have managed to
reproduce it on both IPv4 and IPv6.
Willem de Bruijn Jan. 30, 2023, 4:29 p.m. UTC | #9
On Mon, Jan 30, 2023 at 11:23 AM Andrei Gherzan
<andrei.gherzan@canonical.com> wrote:
>
> On 23/01/30 04:15PM, Andrei Gherzan wrote:
> > On 23/01/30 11:03AM, Willem de Bruijn wrote:
> > > On Mon, Jan 30, 2023 at 9:28 AM Andrei Gherzan
> > > <andrei.gherzan@canonical.com> wrote:
> > > >
> > > > On 23/01/30 08:35AM, Willem de Bruijn wrote:
> > > > > On Mon, Jan 30, 2023 at 7:51 AM Andrei Gherzan
> > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > >
> > > > > > On 23/01/30 09:26AM, Paolo Abeni wrote:
> > > > > > > On Fri, 2023-01-27 at 17:03 -0500, Willem de Bruijn wrote:
> > > > > > > > On Fri, Jan 27, 2023 at 1:16 PM Andrei Gherzan
> > > > > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > > > > >
> > > > > > > > > The tx and rx test programs are used in a couple of test scripts including
> > > > > > > > > "udpgro_bench.sh". Taking this as an example, when the rx/tx programs
> > > > > > > > > are invoked subsequently, there is a chance that the rx one is not ready to
> > > > > > > > > accept socket connections. This racing bug could fail the test with at
> > > > > > > > > least one of the following:
> > > > > > > > >
> > > > > > > > > ./udpgso_bench_tx: connect: Connection refused
> > > > > > > > > ./udpgso_bench_tx: sendmsg: Connection refused
> > > > > > > > > ./udpgso_bench_tx: write: Connection refused
> > > > > > > > >
> > > > > > > > > This change addresses this by adding routines that retry the socket
> > > > > > > > > operations with an exponential back off algorithm from 100ms to 2s.
> > > > > > > > >
> > > > > > > > > Fixes: 3a687bef148d ("selftests: udp gso benchmark")
> > > > > > > > > Signed-off-by: Andrei Gherzan <andrei.gherzan@canonical.com>
> > > > > > > >
> > > > > > > > Synchronizing the two processes is indeed tricky.
> > > > > > > >
> > > > > > > > Perhaps more robust is opening an initial TCP connection, with
> > > > > > > > SO_RCVTIMEO to bound the waiting time. That covers all tests in one
> > > > > > > > go.
> > > > > > >
> > > > > > > Another option would be waiting for the listener(tcp)/receiver(udp)
> > > > > > > socket to show up in 'ss' output before firing-up the client - quite
> > > > > > > alike what mptcp self-tests are doing.
> > > > > >
> > > > > > I like this idea. I have tested it and it works as expected with the
> > > > > > exeception of:
> > > > > >
> > > > > > ./udpgso_bench_tx: sendmsg: No buffer space available
> > > > > >
> > > > > > Any ideas on how to handle this? I could retry and that works.
> > > > >
> > > > > This happens (also) without the zerocopy flag, right? That
> > > > >
> > > > > It might mean reaching the sndbuf limit, which can be adjusted with
> > > > > SO_SNDBUF (or SO_SNDBUFFORCE if CAP_NET_ADMIN). Though I would not
> > > > > expect this test to bump up against that limit.
> > > > >
> > > > > A few zerocopy specific reasons are captured in
> > > > > https://www.kernel.org/doc/html/latest/networking/msg_zerocopy.html#transmission.
> > > >
> > > > I have dug a bit more into this, and it does look like your hint was in
> > > > the right direction. The fails I'm seeing are only with the zerocopy
> > > > flag.
> > > >
> > > > From the reasons (doc) above I can only assume optmem limit as I've
> > > > reproduced it with unlimited locked pages and the fails are transient.
> > > > That leaves optmem limit. Bumping the value I have by default (20480) to
> > > > (2048000) made the sendmsg succeed as expected. On the other hand, the
> > > > tests started to fail with something like:
> > > >
> > > > ./udpgso_bench_tx: Unexpected number of Zerocopy completions:    774783
> > > > expected    773707 received
> > >
> > > More zerocopy completions than number of sends. I have not seen this before.
> > >
> > > The completions are ranges of IDs, one per send call for datagram sockets.
> > >
> > > Even with segmentation offload, the counter increases per call, not per segment.
> > >
> > > Do you experience this without any other changes to udpgso_bench_tx.c.
> > > Or are there perhaps additional sendmsg calls somewhere (during
> > > initial sync) that are not accounted to num_sends?
> >
> > Indeed, that looks off. No, I have run into this without any changes in
> > the tests (besides the retry routine in the shell script that waits for
> > rx to come up). Also, as a data point.
>
> Actually wait. I don't think that is the case here. "expected" is the
> number of sends. In this case we sent 1076 more messages than
> completions. Am I missing something obvious?

Oh indeed.

Receiving fewer completions than transmissions is more likely.

This should be the result of datagrams still being somewhere in the
system. In a qdisc, or waiting for the network interface to return a
completion notification, say.

Does this remain if you add a longer wait before the final flush_errqueue?

Or, really, the right fix is to keep polling there until the two are
equal, up to some timeout. Currently flush_errqueue calls poll only
once.


>
> >
> > As an additional data point, this was only seen on the IPv6 tests. I've
> > never been able to replicate it on the IPv4 run.
>
> I was also fast to send this but it is not correct. I managed to
> reproduce it on both IPv4 and IPv6.
>
> --
> Andrei Gherzan
Andrei Gherzan Jan. 30, 2023, 5:31 p.m. UTC | #10
On 23/01/30 11:29AM, Willem de Bruijn wrote:
> On Mon, Jan 30, 2023 at 11:23 AM Andrei Gherzan
> <andrei.gherzan@canonical.com> wrote:
> >
> > On 23/01/30 04:15PM, Andrei Gherzan wrote:
> > > On 23/01/30 11:03AM, Willem de Bruijn wrote:
> > > > On Mon, Jan 30, 2023 at 9:28 AM Andrei Gherzan
> > > > <andrei.gherzan@canonical.com> wrote:
> > > > >
> > > > > On 23/01/30 08:35AM, Willem de Bruijn wrote:
> > > > > > On Mon, Jan 30, 2023 at 7:51 AM Andrei Gherzan
> > > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > > >
> > > > > > > On 23/01/30 09:26AM, Paolo Abeni wrote:
> > > > > > > > On Fri, 2023-01-27 at 17:03 -0500, Willem de Bruijn wrote:
> > > > > > > > > On Fri, Jan 27, 2023 at 1:16 PM Andrei Gherzan
> > > > > > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > > > > > >
> > > > > > > > > > The tx and rx test programs are used in a couple of test scripts including
> > > > > > > > > > "udpgro_bench.sh". Taking this as an example, when the rx/tx programs
> > > > > > > > > > are invoked subsequently, there is a chance that the rx one is not ready to
> > > > > > > > > > accept socket connections. This racing bug could fail the test with at
> > > > > > > > > > least one of the following:
> > > > > > > > > >
> > > > > > > > > > ./udpgso_bench_tx: connect: Connection refused
> > > > > > > > > > ./udpgso_bench_tx: sendmsg: Connection refused
> > > > > > > > > > ./udpgso_bench_tx: write: Connection refused
> > > > > > > > > >
> > > > > > > > > > This change addresses this by adding routines that retry the socket
> > > > > > > > > > operations with an exponential back off algorithm from 100ms to 2s.
> > > > > > > > > >
> > > > > > > > > > Fixes: 3a687bef148d ("selftests: udp gso benchmark")
> > > > > > > > > > Signed-off-by: Andrei Gherzan <andrei.gherzan@canonical.com>
> > > > > > > > >
> > > > > > > > > Synchronizing the two processes is indeed tricky.
> > > > > > > > >
> > > > > > > > > Perhaps more robust is opening an initial TCP connection, with
> > > > > > > > > SO_RCVTIMEO to bound the waiting time. That covers all tests in one
> > > > > > > > > go.
> > > > > > > >
> > > > > > > > Another option would be waiting for the listener(tcp)/receiver(udp)
> > > > > > > > socket to show up in 'ss' output before firing-up the client - quite
> > > > > > > > alike what mptcp self-tests are doing.
> > > > > > >
> > > > > > > I like this idea. I have tested it and it works as expected with the
> > > > > > > exeception of:
> > > > > > >
> > > > > > > ./udpgso_bench_tx: sendmsg: No buffer space available
> > > > > > >
> > > > > > > Any ideas on how to handle this? I could retry and that works.
> > > > > >
> > > > > > This happens (also) without the zerocopy flag, right? That
> > > > > >
> > > > > > It might mean reaching the sndbuf limit, which can be adjusted with
> > > > > > SO_SNDBUF (or SO_SNDBUFFORCE if CAP_NET_ADMIN). Though I would not
> > > > > > expect this test to bump up against that limit.
> > > > > >
> > > > > > A few zerocopy specific reasons are captured in
> > > > > > https://www.kernel.org/doc/html/latest/networking/msg_zerocopy.html#transmission.
> > > > >
> > > > > I have dug a bit more into this, and it does look like your hint was in
> > > > > the right direction. The fails I'm seeing are only with the zerocopy
> > > > > flag.
> > > > >
> > > > > From the reasons (doc) above I can only assume optmem limit as I've
> > > > > reproduced it with unlimited locked pages and the fails are transient.
> > > > > That leaves optmem limit. Bumping the value I have by default (20480) to
> > > > > (2048000) made the sendmsg succeed as expected. On the other hand, the
> > > > > tests started to fail with something like:
> > > > >
> > > > > ./udpgso_bench_tx: Unexpected number of Zerocopy completions:    774783
> > > > > expected    773707 received
> > > >
> > > > More zerocopy completions than number of sends. I have not seen this before.
> > > >
> > > > The completions are ranges of IDs, one per send call for datagram sockets.
> > > >
> > > > Even with segmentation offload, the counter increases per call, not per segment.
> > > >
> > > > Do you experience this without any other changes to udpgso_bench_tx.c.
> > > > Or are there perhaps additional sendmsg calls somewhere (during
> > > > initial sync) that are not accounted to num_sends?
> > >
> > > Indeed, that looks off. No, I have run into this without any changes in
> > > the tests (besides the retry routine in the shell script that waits for
> > > rx to come up). Also, as a data point.
> >
> > Actually wait. I don't think that is the case here. "expected" is the
> > number of sends. In this case we sent 1076 more messages than
> > completions. Am I missing something obvious?
> 
> Oh indeed.
> 
> Receiving fewer completions than transmission is more likely.

Exactly, yes.

> This should be the result of datagrams still being somewhere in the
> system. In a qdisc, or waiting for the network interface to return a
> completion notification, say.
> 
> Does this remain if adding a longer wait before the final flush_errqueue?

Yes and no, but not reliably unless I go overboard.

> Or, really, the right fix is to keep polling there until the two are
> equal, up to some timeout. Currently flush_errqueue calls poll only
> once.

That makes sense. I have implemented a retry and it has been running for a
good while now.

-               flush_errqueue(fd, true);
+               while (true) {
+                       flush_errqueue(fd, true);
+                       if ((stat_zcopies == num_sends) || (delay >= MAX_DELAY))
+                               break;
+                       usleep(delay);
+                       delay *= 2;
+               }

What do you think?
Willem de Bruijn Jan. 30, 2023, 5:35 p.m. UTC | #11
On Mon, Jan 30, 2023 at 12:31 PM Andrei Gherzan
<andrei.gherzan@canonical.com> wrote:
>
> On 23/01/30 11:29AM, Willem de Bruijn wrote:
> > On Mon, Jan 30, 2023 at 11:23 AM Andrei Gherzan
> > <andrei.gherzan@canonical.com> wrote:
> > >
> > > On 23/01/30 04:15PM, Andrei Gherzan wrote:
> > > > On 23/01/30 11:03AM, Willem de Bruijn wrote:
> > > > > On Mon, Jan 30, 2023 at 9:28 AM Andrei Gherzan
> > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > >
> > > > > > On 23/01/30 08:35AM, Willem de Bruijn wrote:
> > > > > > > On Mon, Jan 30, 2023 at 7:51 AM Andrei Gherzan
> > > > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > > > >
> > > > > > > > On 23/01/30 09:26AM, Paolo Abeni wrote:
> > > > > > > > > On Fri, 2023-01-27 at 17:03 -0500, Willem de Bruijn wrote:
> > > > > > > > > > On Fri, Jan 27, 2023 at 1:16 PM Andrei Gherzan
> > > > > > > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > The tx and rx test programs are used in a couple of test scripts including
> > > > > > > > > > > "udpgro_bench.sh". Taking this as an example, when the rx/tx programs
> > > > > > > > > > > are invoked subsequently, there is a chance that the rx one is not ready to
> > > > > > > > > > > accept socket connections. This racing bug could fail the test with at
> > > > > > > > > > > least one of the following:
> > > > > > > > > > >
> > > > > > > > > > > ./udpgso_bench_tx: connect: Connection refused
> > > > > > > > > > > ./udpgso_bench_tx: sendmsg: Connection refused
> > > > > > > > > > > ./udpgso_bench_tx: write: Connection refused
> > > > > > > > > > >
> > > > > > > > > > > This change addresses this by adding routines that retry the socket
> > > > > > > > > > > operations with an exponential back off algorithm from 100ms to 2s.
> > > > > > > > > > >
> > > > > > > > > > > Fixes: 3a687bef148d ("selftests: udp gso benchmark")
> > > > > > > > > > > Signed-off-by: Andrei Gherzan <andrei.gherzan@canonical.com>
> > > > > > > > > >
> > > > > > > > > > Synchronizing the two processes is indeed tricky.
> > > > > > > > > >
> > > > > > > > > > Perhaps more robust is opening an initial TCP connection, with
> > > > > > > > > > SO_RCVTIMEO to bound the waiting time. That covers all tests in one
> > > > > > > > > > go.
> > > > > > > > >
> > > > > > > > > Another option would be waiting for the listener(tcp)/receiver(udp)
> > > > > > > > > socket to show up in 'ss' output before firing-up the client - quite
> > > > > > > > > alike what mptcp self-tests are doing.
> > > > > > > >
> > > > > > > > I like this idea. I have tested it and it works as expected with the
> > > > > > > > exeception of:
> > > > > > > >
> > > > > > > > ./udpgso_bench_tx: sendmsg: No buffer space available
> > > > > > > >
> > > > > > > > Any ideas on how to handle this? I could retry and that works.
> > > > > > >
> > > > > > > This happens (also) without the zerocopy flag, right? That
> > > > > > >
> > > > > > > It might mean reaching the sndbuf limit, which can be adjusted with
> > > > > > > SO_SNDBUF (or SO_SNDBUFFORCE if CAP_NET_ADMIN). Though I would not
> > > > > > > expect this test to bump up against that limit.
> > > > > > >
> > > > > > > A few zerocopy specific reasons are captured in
> > > > > > > https://www.kernel.org/doc/html/latest/networking/msg_zerocopy.html#transmission.
> > > > > >
> > > > > > I have dug a bit more into this, and it does look like your hint was in
> > > > > > the right direction. The fails I'm seeing are only with the zerocopy
> > > > > > flag.
> > > > > >
> > > > > > From the reasons (doc) above I can only assume optmem limit as I've
> > > > > > reproduced it with unlimited locked pages and the fails are transient.
> > > > > > That leaves optmem limit. Bumping the value I have by default (20480) to
> > > > > > (2048000) made the sendmsg succeed as expected. On the other hand, the
> > > > > > tests started to fail with something like:
> > > > > >
> > > > > > ./udpgso_bench_tx: Unexpected number of Zerocopy completions:    774783
> > > > > > expected    773707 received
> > > > >
> > > > > More zerocopy completions than number of sends. I have not seen this before.
> > > > >
> > > > > The completions are ranges of IDs, one per send call for datagram sockets.
> > > > >
> > > > > Even with segmentation offload, the counter increases per call, not per segment.
> > > > >
> > > > > Do you experience this without any other changes to udpgso_bench_tx.c.
> > > > > Or are there perhaps additional sendmsg calls somewhere (during
> > > > > initial sync) that are not accounted to num_sends?
> > > >
> > > > Indeed, that looks off. No, I have run into this without any changes in
> > > > the tests (besides the retry routine in the shell script that waits for
> > > > rx to come up). Also, as a data point.
> > >
> > > Actually wait. I don't think that is the case here. "expected" is the
> > > number of sends. In this case we sent 1076 more messages than
> > > completions. Am I missing something obvious?
> >
> > Oh indeed.
> >
> > Receiving fewer completions than transmission is more likely.
>
> Exactly, yes.
>
> > This should be the result of datagrams still being somewhere in the
> > system. In a qdisc, or waiting for the network interface to return a
> > completion notification, say.
> >
> > Does this remain if adding a longer wait before the final flush_errqueue?
>
> Yes and no. But not realiably unless I go overboard.
>
> > Or, really, the right fix is to keep polling there until the two are
> > equal, up to some timeout. Currently flush_errqueue calls poll only
> > once.
>
> That makes sense. I have implemented a retry and this ran for a good
> while now.
>
> -               flush_errqueue(fd, true);
> +               while (true) {
> +                       flush_errqueue(fd, true);
> +                       if ((stat_zcopies == num_sends) || (delay >= MAX_DELAY))
> +                               break;
> +                       usleep(delay);
> +                       delay *= 2;
> +               }
>
> What do you think?

Thanks for running experiments.

We can avoid the unconditional sleep, as the poll() inside
flush_errqueue already takes a timeout.

One option is to use start_time = clock_gettime(..) or gettimeofday
before poll, and restart poll until either the exit condition or
timeout is reached, with timeout = orig_time - elapsed_time.
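
A hedged sketch of that shape, reusing the gettimeofday_ms() helper and the
stat_zcopies counter that appear in the draft further down the thread
(the function name and the POLL_LOOP_TIMEOUT_MS budget are hypothetical):

/* Poll the error queue until all completions have arrived or the overall
 * time budget is spent; each poll() gets only the remaining budget. */
static void flush_errqueue_until(int fd, unsigned long num_sends)
{
	unsigned long tstop = gettimeofday_ms() + POLL_LOOP_TIMEOUT_MS;
	struct pollfd fds = { .fd = fd };
	long remaining;

	while (stat_zcopies != num_sends) {
		remaining = tstop - gettimeofday_ms();
		if (remaining <= 0)
			break;			/* overall budget spent */
		if (poll(&fds, 1, remaining) < 0)
			error(1, errno, "poll");
		flush_errqueue(fd, false);	/* drain without re-polling */
	}
}

The unconditional usleep() goes away because poll() itself blocks for the
remaining time whenever no completion is pending.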
Andrei Gherzan Jan. 30, 2023, 6:24 p.m. UTC | #12
On 23/01/30 12:35PM, Willem de Bruijn wrote:
> On Mon, Jan 30, 2023 at 12:31 PM Andrei Gherzan
> <andrei.gherzan@canonical.com> wrote:
> >
> > On 23/01/30 11:29AM, Willem de Bruijn wrote:
> > > On Mon, Jan 30, 2023 at 11:23 AM Andrei Gherzan
> > > <andrei.gherzan@canonical.com> wrote:
> > > >
> > > > On 23/01/30 04:15PM, Andrei Gherzan wrote:
> > > > > On 23/01/30 11:03AM, Willem de Bruijn wrote:
> > > > > > On Mon, Jan 30, 2023 at 9:28 AM Andrei Gherzan
> > > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > > >
> > > > > > > On 23/01/30 08:35AM, Willem de Bruijn wrote:
> > > > > > > > On Mon, Jan 30, 2023 at 7:51 AM Andrei Gherzan
> > > > > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > > > > >
> > > > > > > > > On 23/01/30 09:26AM, Paolo Abeni wrote:
> > > > > > > > > > On Fri, 2023-01-27 at 17:03 -0500, Willem de Bruijn wrote:
> > > > > > > > > > > On Fri, Jan 27, 2023 at 1:16 PM Andrei Gherzan
> > > > > > > > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > The tx and rx test programs are used in a couple of test scripts including
> > > > > > > > > > > > "udpgro_bench.sh". Taking this as an example, when the rx/tx programs
> > > > > > > > > > > > are invoked subsequently, there is a chance that the rx one is not ready to
> > > > > > > > > > > > accept socket connections. This racing bug could fail the test with at
> > > > > > > > > > > > least one of the following:
> > > > > > > > > > > >
> > > > > > > > > > > > ./udpgso_bench_tx: connect: Connection refused
> > > > > > > > > > > > ./udpgso_bench_tx: sendmsg: Connection refused
> > > > > > > > > > > > ./udpgso_bench_tx: write: Connection refused
> > > > > > > > > > > >
> > > > > > > > > > > > This change addresses this by adding routines that retry the socket
> > > > > > > > > > > > operations with an exponential back off algorithm from 100ms to 2s.
> > > > > > > > > > > >
> > > > > > > > > > > > Fixes: 3a687bef148d ("selftests: udp gso benchmark")
> > > > > > > > > > > > Signed-off-by: Andrei Gherzan <andrei.gherzan@canonical.com>
> > > > > > > > > > >
> > > > > > > > > > > Synchronizing the two processes is indeed tricky.
> > > > > > > > > > >
> > > > > > > > > > > Perhaps more robust is opening an initial TCP connection, with
> > > > > > > > > > > SO_RCVTIMEO to bound the waiting time. That covers all tests in one
> > > > > > > > > > > go.
> > > > > > > > > >
> > > > > > > > > > Another option would be waiting for the listener(tcp)/receiver(udp)
> > > > > > > > > > socket to show up in 'ss' output before firing-up the client - quite
> > > > > > > > > > alike what mptcp self-tests are doing.
> > > > > > > > >
> > > > > > > > > I like this idea. I have tested it and it works as expected with the
> > > > > > > > > exeception of:
> > > > > > > > >
> > > > > > > > > ./udpgso_bench_tx: sendmsg: No buffer space available
> > > > > > > > >
> > > > > > > > > Any ideas on how to handle this? I could retry and that works.
> > > > > > > >
> > > > > > > > This happens (also) without the zerocopy flag, right? That
> > > > > > > >
> > > > > > > > It might mean reaching the sndbuf limit, which can be adjusted with
> > > > > > > > SO_SNDBUF (or SO_SNDBUFFORCE if CAP_NET_ADMIN). Though I would not
> > > > > > > > expect this test to bump up against that limit.
> > > > > > > >
> > > > > > > > A few zerocopy specific reasons are captured in
> > > > > > > > https://www.kernel.org/doc/html/latest/networking/msg_zerocopy.html#transmission.
> > > > > > >
> > > > > > > I have dug a bit more into this, and it does look like your hint was in
> > > > > > > the right direction. The fails I'm seeing are only with the zerocopy
> > > > > > > flag.
> > > > > > >
> > > > > > > From the reasons (doc) above I can only assume optmem limit as I've
> > > > > > > reproduced it with unlimited locked pages and the fails are transient.
> > > > > > > That leaves optmem limit. Bumping the value I have by default (20480) to
> > > > > > > (2048000) made the sendmsg succeed as expected. On the other hand, the
> > > > > > > tests started to fail with something like:
> > > > > > >
> > > > > > > ./udpgso_bench_tx: Unexpected number of Zerocopy completions:    774783
> > > > > > > expected    773707 received
> > > > > >
> > > > > > More zerocopy completions than number of sends. I have not seen this before.
> > > > > >
> > > > > > The completions are ranges of IDs, one per send call for datagram sockets.
> > > > > >
> > > > > > Even with segmentation offload, the counter increases per call, not per segment.
> > > > > >
> > > > > > Do you experience this without any other changes to udpgso_bench_tx.c.
> > > > > > Or are there perhaps additional sendmsg calls somewhere (during
> > > > > > initial sync) that are not accounted to num_sends?
> > > > >
> > > > > Indeed, that looks off. No, I have run into this without any changes in
> > > > > the tests (besides the retry routine in the shell script that waits for
> > > > > rx to come up). Also, as a data point.
> > > >
> > > > Actually wait. I don't think that is the case here. "expected" is the
> > > > number of sends. In this case we sent 1076 more messages than
> > > > completions. Am I missing something obvious?
> > >
> > > Oh indeed.
> > >
> > > Receiving fewer completions than transmission is more likely.
> >
> > Exactly, yes.
> >
> > > This should be the result of datagrams still being somewhere in the
> > > system. In a qdisc, or waiting for the network interface to return a
> > > completion notification, say.
> > >
> > > Does this remain if adding a longer wait before the final flush_errqueue?
> >
> > Yes and no. But not realiably unless I go overboard.
> >
> > > Or, really, the right fix is to keep polling there until the two are
> > > equal, up to some timeout. Currently flush_errqueue calls poll only
> > > once.
> >
> > That makes sense. I have implemented a retry and this ran for a good
> > while now.
> >
> > -               flush_errqueue(fd, true);
> > +               while (true) {
> > +                       flush_errqueue(fd, true);
> > +                       if ((stat_zcopies == num_sends) || (delay >= MAX_DELAY))
> > +                               break;
> > +                       usleep(delay);
> > +                       delay *= 2;
> > +               }
> >
> > What do you think?
> 
> Thanks for running experiments.
> 
> We can avoid the unconditional sleep, as the poll() inside
> flush_errqueue already takes a timeout.
> 
> One option is to use start_time = clock_gettime(..) or gettimeofday
> before poll, and restart poll until either the exit condition or
> timeout is reached, with timeout = orig_time - elapsed_time.

Yes, that was more of a quick draft. I was thinking of moving it into the
flush function (while making it aware of num_sends via a parameter):

if (do_poll) {
  struct pollfd fds = {0};
  int ret;
  unsigned long tnow, tstop;

  fds.fd = fd;
  tnow = gettimeofday_ms();
  tstop = tnow + POLL_LOOP_TIMEOUT_MS;
  while ((stat_zcopies != num_sends) && (tnow < tstop)) {
    ret = poll(&fds, 1, 500);
    if (ret == 0) {
      if (cfg_verbose)
        fprintf(stderr, "poll timeout\n");
    } else if (ret < 0) {
      error(1, errno, "poll");
    }
    tnow = gettimeofday_ms();
  }
}

Does this make more sense?
Andrei Gherzan Jan. 30, 2023, 7:51 p.m. UTC | #13
On 23/01/30 06:24PM, Andrei Gherzan wrote:
> On 23/01/30 12:35PM, Willem de Bruijn wrote:
> > On Mon, Jan 30, 2023 at 12:31 PM Andrei Gherzan
> > <andrei.gherzan@canonical.com> wrote:
> > >
> > > On 23/01/30 11:29AM, Willem de Bruijn wrote:
> > > > On Mon, Jan 30, 2023 at 11:23 AM Andrei Gherzan
> > > > <andrei.gherzan@canonical.com> wrote:
> > > > >
> > > > > On 23/01/30 04:15PM, Andrei Gherzan wrote:
> > > > > > On 23/01/30 11:03AM, Willem de Bruijn wrote:
> > > > > > > On Mon, Jan 30, 2023 at 9:28 AM Andrei Gherzan
> > > > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > > > >
> > > > > > > > On 23/01/30 08:35AM, Willem de Bruijn wrote:
> > > > > > > > > On Mon, Jan 30, 2023 at 7:51 AM Andrei Gherzan
> > > > > > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On 23/01/30 09:26AM, Paolo Abeni wrote:
> > > > > > > > > > > On Fri, 2023-01-27 at 17:03 -0500, Willem de Bruijn wrote:
> > > > > > > > > > > > On Fri, Jan 27, 2023 at 1:16 PM Andrei Gherzan
> > > > > > > > > > > > <andrei.gherzan@canonical.com> wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > The tx and rx test programs are used in a couple of test scripts including
> > > > > > > > > > > > > "udpgro_bench.sh". Taking this as an example, when the rx/tx programs
> > > > > > > > > > > > > are invoked subsequently, there is a chance that the rx one is not ready to
> > > > > > > > > > > > > accept socket connections. This racing bug could fail the test with at
> > > > > > > > > > > > > least one of the following:
> > > > > > > > > > > > >
> > > > > > > > > > > > > ./udpgso_bench_tx: connect: Connection refused
> > > > > > > > > > > > > ./udpgso_bench_tx: sendmsg: Connection refused
> > > > > > > > > > > > > ./udpgso_bench_tx: write: Connection refused
> > > > > > > > > > > > >
> > > > > > > > > > > > > This change addresses this by adding routines that retry the socket
> > > > > > > > > > > > > operations with an exponential back off algorithm from 100ms to 2s.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Fixes: 3a687bef148d ("selftests: udp gso benchmark")
> > > > > > > > > > > > > Signed-off-by: Andrei Gherzan <andrei.gherzan@canonical.com>
> > > > > > > > > > > >
> > > > > > > > > > > > Synchronizing the two processes is indeed tricky.
> > > > > > > > > > > >
> > > > > > > > > > > > Perhaps more robust is opening an initial TCP connection, with
> > > > > > > > > > > > SO_RCVTIMEO to bound the waiting time. That covers all tests in one
> > > > > > > > > > > > go.
> > > > > > > > > > >
> > > > > > > > > > > Another option would be waiting for the listener(tcp)/receiver(udp)
> > > > > > > > > > > socket to show up in 'ss' output before firing-up the client - quite
> > > > > > > > > > > alike what mptcp self-tests are doing.
> > > > > > > > > >
> > > > > > > > > > I like this idea. I have tested it and it works as expected with the
> > > > > > > > > > exeception of:
> > > > > > > > > >
> > > > > > > > > > ./udpgso_bench_tx: sendmsg: No buffer space available
> > > > > > > > > >
> > > > > > > > > > Any ideas on how to handle this? I could retry and that works.
> > > > > > > > >
> > > > > > > > > This happens (also) without the zerocopy flag, right? That
> > > > > > > > >
> > > > > > > > > It might mean reaching the sndbuf limit, which can be adjusted with
> > > > > > > > > SO_SNDBUF (or SO_SNDBUFFORCE if CAP_NET_ADMIN). Though I would not
> > > > > > > > > expect this test to bump up against that limit.
> > > > > > > > >
> > > > > > > > > A few zerocopy specific reasons are captured in
> > > > > > > > > https://www.kernel.org/doc/html/latest/networking/msg_zerocopy.html#transmission.
> > > > > > > >
> > > > > > > > I have dug a bit more into this, and it does look like your hint was in
> > > > > > > > the right direction. The fails I'm seeing are only with the zerocopy
> > > > > > > > flag.
> > > > > > > >
> > > > > > > > From the reasons (doc) above I can only assume optmem limit as I've
> > > > > > > > reproduced it with unlimited locked pages and the fails are transient.
> > > > > > > > That leaves optmem limit. Bumping the value I have by default (20480) to
> > > > > > > > (2048000) made the sendmsg succeed as expected. On the other hand, the
> > > > > > > > tests started to fail with something like:
> > > > > > > >
> > > > > > > > ./udpgso_bench_tx: Unexpected number of Zerocopy completions:    774783
> > > > > > > > expected    773707 received
> > > > > > >
> > > > > > > More zerocopy completions than number of sends. I have not seen this before.
> > > > > > >
> > > > > > > The completions are ranges of IDs, one per send call for datagram sockets.
> > > > > > >
> > > > > > > Even with segmentation offload, the counter increases per call, not per segment.
> > > > > > >
> > > > > > > Do you experience this without any other changes to udpgso_bench_tx.c.
> > > > > > > Or are there perhaps additional sendmsg calls somewhere (during
> > > > > > > initial sync) that are not accounted to num_sends?
> > > > > >
> > > > > > Indeed, that looks off. No, I have run into this without any changes in
> > > > > > the tests (besides the retry routine in the shell script that waits for
> > > > > > rx to come up). Also, as a data point.
> > > > >
> > > > > Actually wait. I don't think that is the case here. "expected" is the
> > > > > number of sends. In this case we sent 1076 more messages than
> > > > > completions. Am I missing something obvious?
> > > >
> > > > Oh indeed.
> > > >
> > > > Receiving fewer completions than transmission is more likely.
> > >
> > > Exactly, yes.
> > >
> > > > This should be the result of datagrams still being somewhere in the
> > > > system. In a qdisc, or waiting for the network interface to return a
> > > > completion notification, say.
> > > >
> > > > Does this remain if adding a longer wait before the final flush_errqueue?
> > >
> > > Yes and no. But not reliably unless I go overboard.
> > >
> > > > Or, really, the right fix is to keep polling there until the two are
> > > > equal, up to some timeout. Currently flush_errqueue calls poll only
> > > > once.
> > >
> > > That makes sense. I have implemented a retry and this ran for a good
> > > while now.
> > >
> > > -               flush_errqueue(fd, true);
> > > +               while (true) {
> > > +                       flush_errqueue(fd, true);
> > > +                       if ((stat_zcopies == num_sends) || (delay >= MAX_DELAY))
> > > +                               break;
> > > +                       usleep(delay);
> > > +                       delay *= 2;
> > > +               }
> > >
> > > What do you think?
> > 
> > Thanks for running experiments.
> > 
> > We can avoid the unconditional sleep, as the poll() inside
> > flush_errqueue already takes a timeout.
> > 
> > One option is to use start_time = clock_gettime(..) or gettimeofday
> > before poll, and restart poll until either the exit condition or
> > timeout is reached, with timeout = orig_time - elapsed_time.
> 
> Yes, this was more of a quick draft. I was thinking to move it into the
> flush function (while making it aware of num_sends via a parameter):
> 
> if (do_poll) {
>   struct pollfd fds = {0};
>   int ret;
>   unsigned long tnow, tstop;
> 
>   fds.fd = fd;
>   tnow = gettimeofday_ms();
>   tstop = tnow + POLL_LOOP_TIMEOUT_MS;
>   while ((stat_zcopies != num_sends) && (tnow < tstop)) {
>     ret = poll(&fds, 1, 500);
>     if (ret == 0) {
>       if (cfg_verbose)
>         fprintf(stderr, "poll timeout\n");
>       } else if (ret < 0) {
>         error(1, errno, "poll");
>     }
>     tnow = gettimeofday_ms();
>   }
> }
> 
> Does this make more sense?

Obviously, this should be a do/while. Anyway, this works as expected
after leaving it for around two hours.
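
For illustration only, a standalone sketch of the do/while variant discussed
above could look like the following. This is not the code that was merged:
the flush_errqueue_poll() name is made up for the sketch, gettimeofday_ms(),
stat_zcopies and cfg_verbose are local stand-ins for the helpers and counters
that udpgso_bench_tx.c already has, and the POLL_LOOP_TIMEOUT_MS value is
only an assumed example.

#include <errno.h>
#include <error.h>
#include <poll.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/time.h>

#define POLL_LOOP_TIMEOUT_MS	2000		/* assumed example value */

static unsigned long stat_zcopies;	/* stand-in: zerocopy completions seen */
static bool cfg_verbose;		/* stand-in: the test's verbose flag */

static unsigned long gettimeofday_ms(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return (tv.tv_sec * 1000UL) + (tv.tv_usec / 1000UL);
}

static void flush_errqueue_poll(int fd, unsigned long num_sends)
{
	struct pollfd fds = {0};
	unsigned long tnow, tstop;
	int ret;

	fds.fd = fd;
	tnow = gettimeofday_ms();
	tstop = tnow + POLL_LOOP_TIMEOUT_MS;

	do {
		/* no events set: POLLERR is always reported */
		ret = poll(&fds, 1, 500);
		if (ret == 0) {
			if (cfg_verbose)
				fprintf(stderr, "poll timeout\n");
		} else if (ret < 0) {
			error(1, errno, "poll");
		}
		/* the real helper drains the queue via recvmsg(MSG_ERRQUEUE) here */
		tnow = gettimeofday_ms();
	} while ((stat_zcopies != num_sends) && (tnow < tstop));
}
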
Willem de Bruijn Jan. 30, 2023, 7:57 p.m. UTC | #14
On Mon, Jan 30, 2023 at 2:51 PM Andrei Gherzan
<andrei.gherzan@canonical.com> wrote:
>
> On 23/01/30 06:24PM, Andrei Gherzan wrote:
> > On 23/01/30 12:35PM, Willem de Bruijn wrote:
> > > [...]
> > >
> > > Thanks for running experiments.
> > >
> > > We can avoid the unconditional sleep, as the poll() inside
> > > flush_errqueue already takes a timeout.
> > >
> > > One option is to use start_time = clock_gettime(..) or gettimeofday
> > > before poll, and restart poll until either the exit condition or
> > > timeout is reached, with timeout = orig_time - elapsed_time.
> >
> > Yes, this was more of a quick draft. I was thinking to move it into the
> > flush function (while making it aware of num_sends via a parameter):
> >
> > if (do_poll) {
> >   struct pollfd fds = {0};
> >   int ret;
> >   unsigned long tnow, tstop;
> >
> >   fds.fd = fd;
> >   tnow = gettimeofday_ms();
> >   tstop = tnow + POLL_LOOP_TIMEOUT_MS;
> >   while ((stat_zcopies != num_sends) && (tnow < tstop)) {

The new condition to loop until stat_zcopies == num_sends should only
be tested on the final call. This likely needs to become a separate
boolean. Or a separate flush_errqueue_last() function, and leave the
existing one as is.

We can probably merge the outer for and inner while loops.

> >     ret = poll(&fds, 1, 500);

Instead of 500, this becomes tstop - tnow.

> >     if (ret == 0) {
> >       if (cfg_verbose)
> >         fprintf(stderr, "poll timeout\n");

Poll timeouts are now expected to an extent. Only report once at the
end of the function if the poll was only called once and timed out.
> >       } else if (ret < 0) {
> >         error(1, errno, "poll");
> >     }
> >     tnow = gettimeofday_ms();
> >   }
> > }
> >
> > Does this make more sense?
>
> Obviously, this should be a do/while. Anyway, this works as expected
> after leaving it for around two hours.

Great to hear you found the cause.
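
Putting those suggestions together, one possible shape for the flush routine
is sketched below. This is only an interpretation, not the merged code: it
reuses the stand-in names from the sketch above (gettimeofday_ms(),
stat_zcopies, cfg_verbose, POLL_LOOP_TIMEOUT_MS), and the hypothetical
wait_on_zcopies parameter gates the new exit condition so that only the
final flush waits for all completions.

static void flush_errqueue(int fd, const bool do_poll,
			   const bool wait_on_zcopies, unsigned long num_sends)
{
	struct pollfd fds = {0};
	unsigned long tnow, tstop;
	int polls = 0, ret = 0;

	if (!do_poll)
		return;

	fds.fd = fd;
	tnow = gettimeofday_ms();
	tstop = tnow + POLL_LOOP_TIMEOUT_MS;

	do {
		/* shrink the poll timeout to the time that is left */
		ret = poll(&fds, 1, (int)(tstop - tnow));
		if (ret < 0)
			error(1, errno, "poll");
		polls++;
		/* drain the error queue here, as the existing helper does */
		tnow = gettimeofday_ms();
	} while (wait_on_zcopies &&
		 stat_zcopies != num_sends && tnow < tstop);

	/* report only when the single poll() call itself timed out */
	if (cfg_verbose && polls == 1 && ret == 0)
		fprintf(stderr, "poll timeout\n");
}

Whether this ends up as an extra parameter, a separate flush_errqueue_last()
helper, or a plain do/while is exactly the question the follow-ups below
discuss.
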
Andrei Gherzan Jan. 30, 2023, 8:25 p.m. UTC | #15
On 23/01/30 02:57PM, Willem de Bruijn wrote:
> On Mon, Jan 30, 2023 at 2:51 PM Andrei Gherzan
> <andrei.gherzan@canonical.com> wrote:
> >
> > On 23/01/30 06:24PM, Andrei Gherzan wrote:
> > > [...]
> > >
> > > Yes, this was more of a quick draft. I was thinking to move it into the
> > > flush function (while making it aware of num_sends via a parameter):
> > >
> > > if (do_poll) {
> > >   struct pollfd fds = {0};
> > >   int ret;
> > >   unsigned long tnow, tstop;
> > >
> > >   fds.fd = fd;
> > >   tnow = gettimeofday_ms();
> > >   tstop = tnow + POLL_LOOP_TIMEOUT_MS;
> > >   while ((stat_zcopies != num_sends) && (tnow < tstop)) {
> 
> The new condition to loop until stat_zcopies == num_sends should only
> be tested on the final call. This likely needs to become a separate
> boolean. Or a separate flush_errqueue_last() function, and leave the
> existing one as is.

Wouldn't a do/while be enough here?

> 
> We can probably merge the outer for and inner while loops.
> 
> > >     ret = poll(&fds, 1, 500);
> 
> Instead of 500, this becomes tstop - tnow.

Right. Missed this one.

> 
> > >     if (ret == 0) {
> > >       if (cfg_verbose)
> > >         fprintf(stderr, "poll timeout\n");
> 
> Poll timeouts are now expected to an extent. Only report once at the
> end of the function if the poll was only called once and timed out.

I had to think about this a bit but now I see your point and it makes
sense.

> > >       } else if (ret < 0) {
> > >         error(1, errno, "poll");
> > >     }
> > >     tnow = gettimeofday_ms();
> > >   }
> > > }
> > >
> > > Does this make more sense?
> >
> > Obviously, this should be a do/while. Anyway, this works as expected
> > after leaving it for around two hours.
> 
> Great to hear you found the cause.

Hats off for the hints.
Andrei Gherzan Jan. 31, 2023, 1:10 p.m. UTC | #16
On 23/01/30 08:25PM, Andrei Gherzan wrote:
> [...]

I have pushed a new version with a couple of other changes here and
there.

https://lore.kernel.org/netdev/20230131130412.432549-1-andrei.gherzan@canonical.com/T/#t

Paolo, for the synchronisation implementation I took your advice.

Looking forward to your feedback.
diff mbox series

Patch

diff --git a/tools/testing/selftests/net/udpgso_bench_tx.c b/tools/testing/selftests/net/udpgso_bench_tx.c
index f1fdaa270291..4dea9ee7eb46 100644
--- a/tools/testing/selftests/net/udpgso_bench_tx.c
+++ b/tools/testing/selftests/net/udpgso_bench_tx.c
@@ -53,6 +53,9 @@ 
 
 #define NUM_PKT		100
 
+#define MAX_DELAY	2000000
+#define INIT_DELAY	100000
+
 static bool	cfg_cache_trash;
 static int	cfg_cpu		= -1;
 static int	cfg_connected	= true;
@@ -257,13 +260,18 @@  static void flush_errqueue(int fd, const bool do_poll)
 static int send_tcp(int fd, char *data)
 {
 	int ret, done = 0, count = 0;
+	useconds_t delay = INIT_DELAY;
 
 	while (done < cfg_payload_len) {
-		ret = send(fd, data + done, cfg_payload_len - done,
-			   cfg_zerocopy ? MSG_ZEROCOPY : 0);
-		if (ret == -1)
-			error(1, errno, "write");
-
+		delay = INIT_DELAY;
+		while ((ret = send(fd, data + done, cfg_payload_len - done,
+				cfg_zerocopy ? MSG_ZEROCOPY : 0)) == -1) {
+			usleep(delay);
+			if (delay < MAX_DELAY)
+				delay *= 2;
+			else
+				error(1, errno, "write");
+		}
 		done += ret;
 		count++;
 	}
@@ -274,17 +282,23 @@  static int send_tcp(int fd, char *data)
 static int send_udp(int fd, char *data)
 {
 	int ret, total_len, len, count = 0;
+	useconds_t delay = INIT_DELAY;
 
 	total_len = cfg_payload_len;
 
 	while (total_len) {
 		len = total_len < cfg_mss ? total_len : cfg_mss;
 
-		ret = sendto(fd, data, len, cfg_zerocopy ? MSG_ZEROCOPY : 0,
-			     cfg_connected ? NULL : (void *)&cfg_dst_addr,
-			     cfg_connected ? 0 : cfg_alen);
-		if (ret == -1)
-			error(1, errno, "write");
+		delay = INIT_DELAY;
+		while ((ret = sendto(fd, data, len, cfg_zerocopy ? MSG_ZEROCOPY : 0,
+				cfg_connected ? NULL : (void *)&cfg_dst_addr,
+				cfg_connected ? 0 : cfg_alen)) == -1) {
+			usleep(delay);
+			if (delay < MAX_DELAY)
+				delay *= 2;
+			else
+				error(1, errno, "write");
+		}
 		if (ret != len)
 			error(1, errno, "write: %uB != %uB\n", ret, len);
 
@@ -378,6 +392,7 @@  static int send_udp_segment(int fd, char *data)
 	struct iovec iov = {0};
 	size_t msg_controllen;
 	struct cmsghdr *cmsg;
+	useconds_t delay = INIT_DELAY;
 	int ret;
 
 	iov.iov_base = data;
@@ -401,9 +416,13 @@  static int send_udp_segment(int fd, char *data)
 	msg.msg_name = (void *)&cfg_dst_addr;
 	msg.msg_namelen = cfg_alen;
 
-	ret = sendmsg(fd, &msg, cfg_zerocopy ? MSG_ZEROCOPY : 0);
-	if (ret == -1)
-		error(1, errno, "sendmsg");
+	while ((ret = sendmsg(fd, &msg, cfg_zerocopy ? MSG_ZEROCOPY : 0)) == -1) {
+		usleep(delay);
+		if (delay < MAX_DELAY)
+			delay *= 2;
+		else
+			error(1, errno, "sendmsg");
+	}
 	if (ret != iov.iov_len)
 		error(1, 0, "sendmsg: %u != %llu\n", ret,
 			(unsigned long long)iov.iov_len);
@@ -616,6 +635,7 @@  int main(int argc, char **argv)
 {
 	unsigned long num_msgs, num_sends;
 	unsigned long tnow, treport, tstop;
+	useconds_t delay = INIT_DELAY;
 	int fd, i, val, ret;
 
 	parse_opts(argc, argv);
@@ -648,9 +668,14 @@  int main(int argc, char **argv)
 		}
 	}
 
-	if (cfg_connected &&
-	    connect(fd, (void *)&cfg_dst_addr, cfg_alen))
-		error(1, errno, "connect");
+	if (cfg_connected)
+		while (connect(fd, (void *)&cfg_dst_addr, cfg_alen)) {
+			usleep(delay);
+			if (delay < MAX_DELAY)
+				delay *= 2;
+			else
+				error(1, errno, "connect");
+		}
 
 	if (cfg_segment)
 		set_pmtu_discover(fd, cfg_family == PF_INET);