diff mbox series

SUNRPC: Don't retry using the same source port if connection failed

Message ID 20230927192712.317799-1-trondmy@kernel.org (mailing list archive)
State New, archived
Headers show
Series SUNRPC: Don't retry using the same source port if connection failed | expand

Commit Message

Trond Myklebust Sept. 27, 2023, 7:27 p.m. UTC
From: Trond Myklebust <trond.myklebust@hammerspace.com>

If the TCP connection attempt fails without ever establishing a
connection, then assume the problem may be the server is rejecting us
due to port reuse.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xprtsock.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

Comments

Olga Kornievskaia Sept. 28, 2023, 2:58 p.m. UTC | #1
On Wed, Sep 27, 2023 at 3:35 PM <trondmy@kernel.org> wrote:
>
> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>
> If the TCP connection attempt fails without ever establishing a
> connection, then assume the problem may be the server is rejecting us
> due to port reuse.

Doesn't this break 4.0 replay cache? Seems too general to assume that
any unsuccessful SYN was due to a server reboot and it's ok for the
client to change the port.

>
> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> ---
>  net/sunrpc/xprtsock.c | 10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> index 71848ab90d13..1a96777f0ed5 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -62,6 +62,7 @@
>  #include "sunrpc.h"
>
>  static void xs_close(struct rpc_xprt *xprt);
> +static void xs_reset_srcport(struct sock_xprt *transport);
>  static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock);
>  static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
>                 struct socket *sock);
> @@ -1565,8 +1566,10 @@ static void xs_tcp_state_change(struct sock *sk)
>                 break;
>         case TCP_CLOSE:
>                 if (test_and_clear_bit(XPRT_SOCK_CONNECTING,
> -                                       &transport->sock_state))
> +                                      &transport->sock_state)) {
> +                       xs_reset_srcport(transport);
>                         xprt_clear_connecting(xprt);
> +               }
>                 clear_bit(XPRT_CLOSING, &xprt->state);
>                 /* Trigger the socket release */
>                 xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
> @@ -1722,6 +1725,11 @@ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
>         xs_update_peer_port(xprt);
>  }
>
> +static void xs_reset_srcport(struct sock_xprt *transport)
> +{
> +       transport->srcport = 0;
> +}
> +
>  static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock)
>  {
>         if (transport->srcport == 0 && transport->xprt.reuseport)
> --
> 2.41.0
>
Trond Myklebust Sept. 30, 2023, 2:57 a.m. UTC | #2
On Thu, 2023-09-28 at 10:58 -0400, Olga Kornievskaia wrote:
> On Wed, Sep 27, 2023 at 3:35 PM <trondmy@kernel.org> wrote:
> > 
> > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > 
> > If the TCP connection attempt fails without ever establishing a
> > connection, then assume the problem may be the server is rejecting
> > us
> > due to port reuse.
> 
> Doesn't this break 4.0 replay cache? Seems too general to assume that
> any unsuccessful SYN was due to a server reboot and it's ok for the
> client to change the port.

This is where things get interesting. Yes, if we change the port
number, then it will almost certainly break NFSv3 and NFSv4.0 replay
caching on the server.

However the problem is that once we get stuck in the situation where we
cannot connect, then each new connection attempt is just causing the
server's TCP layer to push back and recall that the connection from
this port was closed.
IOW: the problem is that once we're in this situation, we cannot easily
exit without doing one of the following. Either we have to

   1. Change the port number, so that the TCP layer allows us to
      connect.
   2. Or.. Wait for long enough that the TCP layer has forgotten
      altogether about the previous connection.

The problem is that option (2) is subject to livelock, and so has a
potential infinite time out. I've seen this livelock in action, and I'm
not seeing a solution that has predictable results.

So unless there is a solution for the problems in (2), I don't see how
we can avoid defaulting to option (1) at some point, in which case the
only question is "when do we switch ports?".

> 
> > 
> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > ---
> >  net/sunrpc/xprtsock.c | 10 +++++++++-
> >  1 file changed, 9 insertions(+), 1 deletion(-)
> > 
> > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > index 71848ab90d13..1a96777f0ed5 100644
> > --- a/net/sunrpc/xprtsock.c
> > +++ b/net/sunrpc/xprtsock.c
> > @@ -62,6 +62,7 @@
> >  #include "sunrpc.h"
> > 
> >  static void xs_close(struct rpc_xprt *xprt);
> > +static void xs_reset_srcport(struct sock_xprt *transport);
> >  static void xs_set_srcport(struct sock_xprt *transport, struct
> > socket *sock);
> >  static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
> >                 struct socket *sock);
> > @@ -1565,8 +1566,10 @@ static void xs_tcp_state_change(struct sock
> > *sk)
> >                 break;
> >         case TCP_CLOSE:
> >                 if (test_and_clear_bit(XPRT_SOCK_CONNECTING,
> > -                                       &transport->sock_state))
> > +                                      &transport->sock_state)) {
> > +                       xs_reset_srcport(transport);
> >                         xprt_clear_connecting(xprt);
> > +               }
> >                 clear_bit(XPRT_CLOSING, &xprt->state);
> >                 /* Trigger the socket release */
> >                 xs_run_error_worker(transport,
> > XPRT_SOCK_WAKE_DISCONNECT);
> > @@ -1722,6 +1725,11 @@ static void xs_set_port(struct rpc_xprt
> > *xprt, unsigned short port)
> >         xs_update_peer_port(xprt);
> >  }
> > 
> > +static void xs_reset_srcport(struct sock_xprt *transport)
> > +{
> > +       transport->srcport = 0;
> > +}
> > +
> >  static void xs_set_srcport(struct sock_xprt *transport, struct
> > socket *sock)
> >  {
> >         if (transport->srcport == 0 && transport->xprt.reuseport)
> > --
> > 2.41.0
> >
Olga Kornievskaia Sept. 30, 2023, 10:36 p.m. UTC | #3
On Fri, Sep 29, 2023 at 10:57 PM Trond Myklebust
<trondmy@hammerspace.com> wrote:
>
> On Thu, 2023-09-28 at 10:58 -0400, Olga Kornievskaia wrote:
> > On Wed, Sep 27, 2023 at 3:35 PM <trondmy@kernel.org> wrote:
> > >
> > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > >
> > > If the TCP connection attempt fails without ever establishing a
> > > connection, then assume the problem may be the server is rejecting
> > > us
> > > due to port reuse.
> >
> > Doesn't this break 4.0 replay cache? Seems too general to assume that
> > any unsuccessful SYN was due to a server reboot and it's ok for the
> > client to change the port.
>
> This is where things get interesting. Yes, if we change the port
> number, then it will almost certainly break NFSv3 and NFSv4.0 replay
> caching on the server.
>
> However the problem is that once we get stuck in the situation where we
> cannot connect, then each new connection attempt is just causing the
> server's TCP layer to push back and recall that the connection from
> this port was closed.
> IOW: the problem is that once we're in this situation, we cannot easily
> exit without doing one of the following. Either we have to
>
>    1. Change the port number, so that the TCP layer allows us to
>       connect.
>    2. Or.. Wait for long enough that the TCP layer has forgotten
>       altogether about the previous connection.
>
> The problem is that option (2) is subject to livelock, and so has a
> potential infinite time out. I've seen this livelock in action, and I'm
> not seeing a solution that has predictable results.
>
> So unless there is a solution for the problems in (2), I don't see how
> we can avoid defaulting to option (1) at some point, in which case the
> only question is "when do we switch ports?".

I'm not sure how one can justify that regression that will come out of
#1 will be less of a problem then the problem in #2.

I think I'm still not grasping why the NFS server would (legitimately)
be closing a connection that is re-using the port. Can you present a
sequence of events that would lead to this?

But can't we at least arm ourselves in not unnecessarily breaking the
reply cache by at least imposing some timeout/number of retries before
resetting? If the client was retrying to unsuccessfully re-establish
connection for a (fixed) while, then 4.0 client's lease would expire
and switching the port after the lease expires makes no difference.
There isn't a solution in v3 unfortunately. But a time-based approach
would at least separate these 'peculiar' servers vs normal servers.
And if this is a 4.1 client, we can reset the port without a timeout.

Am I correct that every unsuccessful SYN causes a new source point to
be taken? If so, then a server reboot where multiple SYNs are sent
prior to connection re-establishment (times number of mounts) might
cause source port exhaustion?


>
> >
> > >
> > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > ---
> > >  net/sunrpc/xprtsock.c | 10 +++++++++-
> > >  1 file changed, 9 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > > index 71848ab90d13..1a96777f0ed5 100644
> > > --- a/net/sunrpc/xprtsock.c
> > > +++ b/net/sunrpc/xprtsock.c
> > > @@ -62,6 +62,7 @@
> > >  #include "sunrpc.h"
> > >
> > >  static void xs_close(struct rpc_xprt *xprt);
> > > +static void xs_reset_srcport(struct sock_xprt *transport);
> > >  static void xs_set_srcport(struct sock_xprt *transport, struct
> > > socket *sock);
> > >  static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
> > >                 struct socket *sock);
> > > @@ -1565,8 +1566,10 @@ static void xs_tcp_state_change(struct sock
> > > *sk)
> > >                 break;
> > >         case TCP_CLOSE:
> > >                 if (test_and_clear_bit(XPRT_SOCK_CONNECTING,
> > > -                                       &transport->sock_state))
> > > +                                      &transport->sock_state)) {
> > > +                       xs_reset_srcport(transport);
> > >                         xprt_clear_connecting(xprt);
> > > +               }
> > >                 clear_bit(XPRT_CLOSING, &xprt->state);
> > >                 /* Trigger the socket release */
> > >                 xs_run_error_worker(transport,
> > > XPRT_SOCK_WAKE_DISCONNECT);
> > > @@ -1722,6 +1725,11 @@ static void xs_set_port(struct rpc_xprt
> > > *xprt, unsigned short port)
> > >         xs_update_peer_port(xprt);
> > >  }
> > >
> > > +static void xs_reset_srcport(struct sock_xprt *transport)
> > > +{
> > > +       transport->srcport = 0;
> > > +}
> > > +
> > >  static void xs_set_srcport(struct sock_xprt *transport, struct
> > > socket *sock)
> > >  {
> > >         if (transport->srcport == 0 && transport->xprt.reuseport)
> > > --
> > > 2.41.0
> > >
>
> --
> Trond Myklebust Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
Trond Myklebust Sept. 30, 2023, 11:06 p.m. UTC | #4
On Sat, 2023-09-30 at 18:36 -0400, Olga Kornievskaia wrote:
> On Fri, Sep 29, 2023 at 10:57 PM Trond Myklebust
> <trondmy@hammerspace.com> wrote:
> > 
> > On Thu, 2023-09-28 at 10:58 -0400, Olga Kornievskaia wrote:
> > > On Wed, Sep 27, 2023 at 3:35 PM <trondmy@kernel.org> wrote:
> > > > 
> > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > 
> > > > If the TCP connection attempt fails without ever establishing a
> > > > connection, then assume the problem may be the server is
> > > > rejecting
> > > > us
> > > > due to port reuse.
> > > 
> > > Doesn't this break 4.0 replay cache? Seems too general to assume
> > > that
> > > any unsuccessful SYN was due to a server reboot and it's ok for
> > > the
> > > client to change the port.
> > 
> > This is where things get interesting. Yes, if we change the port
> > number, then it will almost certainly break NFSv3 and NFSv4.0
> > replay
> > caching on the server.
> > 
> > However the problem is that once we get stuck in the situation
> > where we
> > cannot connect, then each new connection attempt is just causing
> > the
> > server's TCP layer to push back and recall that the connection from
> > this port was closed.
> > IOW: the problem is that once we're in this situation, we cannot
> > easily
> > exit without doing one of the following. Either we have to
> > 
> >    1. Change the port number, so that the TCP layer allows us to
> >       connect.
> >    2. Or.. Wait for long enough that the TCP layer has forgotten
> >       altogether about the previous connection.
> > 
> > The problem is that option (2) is subject to livelock, and so has a
> > potential infinite time out. I've seen this livelock in action, and
> > I'm
> > not seeing a solution that has predictable results.
> > 
> > So unless there is a solution for the problems in (2), I don't see
> > how
> > we can avoid defaulting to option (1) at some point, in which case
> > the
> > only question is "when do we switch ports?".
> 
> I'm not sure how one can justify that regression that will come out
> of
> #1 will be less of a problem then the problem in #2.
> 
> I think I'm still not grasping why the NFS server would
> (legitimately)
> be closing a connection that is re-using the port. Can you present a
> sequence of events that would lead to this?
> 

Yes. It is essentially the problem described in this blog:
https://blog.davidvassallo.me/2010/07/13/time_wait-and-port-reuse/

...and as you can see, it is nothing to do with NFS. This is the TCP
protocol working as expected.

> But can't we at least arm ourselves in not unnecessarily breaking the
> reply cache by at least imposing some timeout/number of retries
> before
> resetting? If the client was retrying to unsuccessfully re-establish
> connection for a (fixed) while, then 4.0 client's lease would expire
> and switching the port after the lease expires makes no difference.
> There isn't a solution in v3 unfortunately. But a time-based approach
> would at least separate these 'peculiar' servers vs normal servers.
> And if this is a 4.1 client, we can reset the port without a timeout.
> 

This is not a 'peculiar server' vs 'normal server' problem. The reuse
of ports in this way violates the TCP protocol, and has been a problem
for NFS/TCP since the beginning. However, it was never a problem for
the older connectionless UDP protocol, which is where the practice of
tying the replay cache to the source port began in the first place.

NFSv4.1 does not have this problem because it deliberately does not
reuse TCP ports, and the reason is precisely to avoid the TIME_WAIT
state problems.

NFSv3 tries to avoid it by doing an incremental back off, but we
recently saw that does not suffice to avoid live lock, after a system
got stuck for several hours in this state.

> Am I correct that every unsuccessful SYN causes a new source point to
> be taken? If so, then a server reboot where multiple SYNs are sent
> prior to connection re-establishment (times number of mounts) might
> cause source port exhaustion?
> 

No. Not every unsuccessful SYN: It is every unsuccessful sequence of
SYNs. If the server is not replying to our SYN packets, then the TCP
layer will back off and retransmit. So there is already a backoff-retry
happening at that level.

> 
> > 
> > > 
> > > > 
> > > > Signed-off-by: Trond Myklebust
> > > > <trond.myklebust@hammerspace.com>
> > > > ---
> > > >  net/sunrpc/xprtsock.c | 10 +++++++++-
> > > >  1 file changed, 9 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > > > index 71848ab90d13..1a96777f0ed5 100644
> > > > --- a/net/sunrpc/xprtsock.c
> > > > +++ b/net/sunrpc/xprtsock.c
> > > > @@ -62,6 +62,7 @@
> > > >  #include "sunrpc.h"
> > > > 
> > > >  static void xs_close(struct rpc_xprt *xprt);
> > > > +static void xs_reset_srcport(struct sock_xprt *transport);
> > > >  static void xs_set_srcport(struct sock_xprt *transport, struct
> > > > socket *sock);
> > > >  static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
> > > >                 struct socket *sock);
> > > > @@ -1565,8 +1566,10 @@ static void xs_tcp_state_change(struct
> > > > sock
> > > > *sk)
> > > >                 break;
> > > >         case TCP_CLOSE:
> > > >                 if (test_and_clear_bit(XPRT_SOCK_CONNECTING,
> > > > -                                       &transport-
> > > > >sock_state))
> > > > +                                      &transport->sock_state))
> > > > {
> > > > +                       xs_reset_srcport(transport);
> > > >                         xprt_clear_connecting(xprt);
> > > > +               }
> > > >                 clear_bit(XPRT_CLOSING, &xprt->state);
> > > >                 /* Trigger the socket release */
> > > >                 xs_run_error_worker(transport,
> > > > XPRT_SOCK_WAKE_DISCONNECT);
> > > > @@ -1722,6 +1725,11 @@ static void xs_set_port(struct rpc_xprt
> > > > *xprt, unsigned short port)
> > > >         xs_update_peer_port(xprt);
> > > >  }
> > > > 
> > > > +static void xs_reset_srcport(struct sock_xprt *transport)
> > > > +{
> > > > +       transport->srcport = 0;
> > > > +}
> > > > +
> > > >  static void xs_set_srcport(struct sock_xprt *transport, struct
> > > > socket *sock)
> > > >  {
> > > >         if (transport->srcport == 0 && transport-
> > > > >xprt.reuseport)
> > > > --
> > > > 2.41.0
> > > > 
> > 
> > --
> > Trond Myklebust Linux NFS client maintainer, Hammerspace
> > trond.myklebust@hammerspace.com
Olga Kornievskaia Oct. 3, 2023, 2:44 p.m. UTC | #5
On Sat, Sep 30, 2023 at 7:06 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
>
> On Sat, 2023-09-30 at 18:36 -0400, Olga Kornievskaia wrote:
> > On Fri, Sep 29, 2023 at 10:57 PM Trond Myklebust
> > <trondmy@hammerspace.com> wrote:
> > >
> > > On Thu, 2023-09-28 at 10:58 -0400, Olga Kornievskaia wrote:
> > > > On Wed, Sep 27, 2023 at 3:35 PM <trondmy@kernel.org> wrote:
> > > > >
> > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > >
> > > > > If the TCP connection attempt fails without ever establishing a
> > > > > connection, then assume the problem may be the server is
> > > > > rejecting
> > > > > us
> > > > > due to port reuse.
> > > >
> > > > Doesn't this break 4.0 replay cache? Seems too general to assume
> > > > that
> > > > any unsuccessful SYN was due to a server reboot and it's ok for
> > > > the
> > > > client to change the port.
> > >
> > > This is where things get interesting. Yes, if we change the port
> > > number, then it will almost certainly break NFSv3 and NFSv4.0
> > > replay
> > > caching on the server.
> > >
> > > However the problem is that once we get stuck in the situation
> > > where we
> > > cannot connect, then each new connection attempt is just causing
> > > the
> > > server's TCP layer to push back and recall that the connection from
> > > this port was closed.
> > > IOW: the problem is that once we're in this situation, we cannot
> > > easily
> > > exit without doing one of the following. Either we have to
> > >
> > >    1. Change the port number, so that the TCP layer allows us to
> > >       connect.
> > >    2. Or.. Wait for long enough that the TCP layer has forgotten
> > >       altogether about the previous connection.
> > >
> > > The problem is that option (2) is subject to livelock, and so has a
> > > potential infinite time out. I've seen this livelock in action, and
> > > I'm
> > > not seeing a solution that has predictable results.
> > >
> > > So unless there is a solution for the problems in (2), I don't see
> > > how
> > > we can avoid defaulting to option (1) at some point, in which case
> > > the
> > > only question is "when do we switch ports?".
> >
> > I'm not sure how one can justify that regression that will come out
> > of
> > #1 will be less of a problem then the problem in #2.
> >
> > I think I'm still not grasping why the NFS server would
> > (legitimately)
> > be closing a connection that is re-using the port. Can you present a
> > sequence of events that would lead to this?
> >
>
> Yes. It is essentially the problem described in this blog:
> https://blog.davidvassallo.me/2010/07/13/time_wait-and-port-reuse/
>
> ...and as you can see, it is nothing to do with NFS. This is the TCP
> protocol working as expected.

What I'm seeing are statements that RFC allows for/provides guidance
on how to transition out of TIME_WAIT state. I'm also hearing that the
reasons that the server can't allow for port reuse is due to broken
client implementation or use of (broken?) NAT implementation.

I don't see how any of this justifies allowing a regression in the
linux client code. I'm clearly missing something. How are you possibly
OK with breaking the reply cache?

> > But can't we at least arm ourselves in not unnecessarily breaking the
> > reply cache by at least imposing some timeout/number of retries
> > before
> > resetting? If the client was retrying to unsuccessfully re-establish
> > connection for a (fixed) while, then 4.0 client's lease would expire
> > and switching the port after the lease expires makes no difference.
> > There isn't a solution in v3 unfortunately. But a time-based approach
> > would at least separate these 'peculiar' servers vs normal servers.
> > And if this is a 4.1 client, we can reset the port without a timeout.
> >
>
> This is not a 'peculiar server' vs 'normal server' problem. The reuse
> of ports in this way violates the TCP protocol, and has been a problem

I disagree here. Even the RFC quoted by the blogger says that reuse of
port is allowed.

> for NFS/TCP since the beginning. However, it was never a problem for
> the older connectionless UDP protocol, which is where the practice of
> tying the replay cache to the source port began in the first place.
>
> NFSv4.1 does not have this problem because it deliberately does not
> reuse TCP ports, and the reason is precisely to avoid the TIME_WAIT
> state problems.
>
> NFSv3 tries to avoid it by doing an incremental back off, but we
> recently saw that does not suffice to avoid live lock, after a system
> got stuck for several hours in this state.
>
> > Am I correct that every unsuccessful SYN causes a new source point to
> > be taken? If so, then a server reboot where multiple SYNs are sent
> > prior to connection re-establishment (times number of mounts) might
> > cause source port exhaustion?
> >
>
> No. Not every unsuccessful SYN: It is every unsuccessful sequence of

I disagree. Here's a snippet of the network trace with the proposed
patch. The port is changed on EVERY unsuccessful SYN.

   76 2023-10-03 10:17:04.285731 192.168.1.134 → 192.168.1.106 NFS 238
V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
   77 2023-10-03 10:17:04.328371 192.168.1.106 → 192.168.1.134 TCP 66
2049 → 909 [ACK] Seq=1113 Ack=1501 Win=31872 Len=0 TSval=3542359002
TSecr=3081600630
  256 2023-10-03 10:18:04.341041 192.168.1.134 → 192.168.1.106 TCP 66
[TCP Keep-Alive] 909 → 2049 [ACK] Seq=1500 Ack=1113 Win=32000 Len=0
TSval=3081660681 TSecr=3542359002
  259 2023-10-03 10:18:04.341500 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 909 [RST] Seq=1113 Win=0 Len=0
  260 2023-10-03 10:18:04.341860 192.168.1.134 → 192.168.1.106 TCP 74
[TCP Port numbers reused] 909 → 2049 [SYN] Seq=0 Win=32120 Len=0
MSS=1460 SACK_PERM TSval=3081660681 TSecr=0 WS=128
  261 2023-10-03 10:18:04.342031 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 909 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  266 2023-10-03 10:18:07.380801 192.168.1.134 → 192.168.1.106 TCP 74
954 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081663720 TSecr=0 WS=128
  267 2023-10-03 10:18:07.380971 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 954 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  275 2023-10-03 10:18:10.423352 192.168.1.134 → 192.168.1.106 TCP 74
856 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081666760 TSecr=0 WS=128
  276 2023-10-03 10:18:10.423621 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 856 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  286 2023-10-03 10:18:13.466277 192.168.1.134 → 192.168.1.106 TCP 74
957 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081669801 TSecr=0 WS=128
  287 2023-10-03 10:18:13.466812 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 957 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  289 2023-10-03 10:18:16.509229 192.168.1.134 → 192.168.1.106 TCP 74
695 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081672841 TSecr=0 WS=128
  290 2023-10-03 10:18:16.509845 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 695 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  294 2023-10-03 10:18:19.551062 192.168.1.134 → 192.168.1.106 TCP 74
940 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081675881 TSecr=0 WS=128
  295 2023-10-03 10:18:19.551434 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 940 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  300 2023-10-03 10:18:22.590380 192.168.1.134 → 192.168.1.106 TCP 74
810 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081678921 TSecr=0
WS=128
  301 2023-10-03 10:18:22.590726 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 810 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  308 2023-10-03 10:18:25.628256 192.168.1.134 → 192.168.1.106 TCP 74
877 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081681961 TSecr=0 WS=128
  309 2023-10-03 10:18:25.628724 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 877 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  312 2023-10-03 10:18:28.665682 192.168.1.134 → 192.168.1.106 TCP 74
934 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081685001 TSecr=0 WS=128
  313 2023-10-03 10:18:28.666374 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 934 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  320 2023-10-03 10:18:31.702236 192.168.1.134 → 192.168.1.106 TCP 74
803 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081688040 TSecr=0 WS=128
  321 2023-10-03 10:18:31.702490 192.168.1.106 → 192.168.1.134 TCP 74
2049 → 803 [SYN, ACK] Seq=0 Ack=1 Win=31856 Len=0 MSS=1460 SACK_PERM
TSval=1993141756 TSecr=3081688040 WS=128
  322 2023-10-03 10:18:31.702729 192.168.1.134 → 192.168.1.106 TCP 66
803 → 2049 [ACK] Seq=1 Ack=1 Win=32128 Len=0 TSval=3081688040
TSecr=1993141756
  323 2023-10-03 10:18:31.702737 192.168.1.134 → 192.168.1.106 NFS 238
V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
  324 2023-10-03 10:18:31.702893 192.168.1.106 → 192.168.1.134 TCP 66
2049 → 803 [ACK] Seq=1 Ack=173 Win=31872 Len=0 TSval=1993141756
TSecr=3081688040
  749 2023-10-03 10:19:01.880214 192.168.1.106 → 192.168.1.134 NFS 206
V3 WRITE Reply (Call In 323) Len: 4 FILE_SYNC

This is the same without the patch. Port is successfully reused.
Replay cache OK here not above.

   76 2023-10-03 10:17:04.285731 192.168.1.134 → 192.168.1.106 NFS 238
V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
   77 2023-10-03 10:17:04.328371 192.168.1.106 → 192.168.1.134 TCP 66
2049 → 909 [ACK] Seq=1113 Ack=1501 Win=31872 Len=0 TSval=3542359002
TSecr=3081600630
  256 2023-10-03 10:18:04.341041 192.168.1.134 → 192.168.1.106 TCP 66
[TCP Keep-Alive] 909 → 2049 [ACK] Seq=1500 Ack=1113 Win=32000 Len=0
TSval=3081660681 TSecr=3542359002
  259 2023-10-03 10:18:04.341500 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 909 [RST] Seq=1113 Win=0 Len=0
  260 2023-10-03 10:18:04.341860 192.168.1.134 → 192.168.1.106 TCP 74
[TCP Port numbers reused] 909 → 2049 [SYN] Seq=0 Win=32120 Len=0
MSS=1460 SACK_PERM TSval=3081660681 TSecr=0 WS=128
  261 2023-10-03 10:18:04.342031 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 909 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  266 2023-10-03 10:18:07.380801 192.168.1.134 → 192.168.1.106 TCP 74
954 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081663720 TSecr=0 WS=128
  267 2023-10-03 10:18:07.380971 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 954 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  275 2023-10-03 10:18:10.423352 192.168.1.134 → 192.168.1.106 TCP 74
856 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081666760 TSecr=0 WS=128
  276 2023-10-03 10:18:10.423621 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 856 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  286 2023-10-03 10:18:13.466277 192.168.1.134 → 192.168.1.106 TCP 74
957 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081669801 TSecr=0 WS=128
  287 2023-10-03 10:18:13.466812 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 957 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  289 2023-10-03 10:18:16.509229 192.168.1.134 → 192.168.1.106 TCP 74
695 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081672841 TSecr=0 WS=128
  290 2023-10-03 10:18:16.509845 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 695 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  294 2023-10-03 10:18:19.551062 192.168.1.134 → 192.168.1.106 TCP 74
940 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081675881 TSecr=0 WS=128
  295 2023-10-03 10:18:19.551434 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 940 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  300 2023-10-03 10:18:22.590380 192.168.1.134 → 192.168.1.106 TCP 74
810 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081678921 TSecr=0 WS=128
  301 2023-10-03 10:18:22.590726 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 810 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  308 2023-10-03 10:18:25.628256 192.168.1.134 → 192.168.1.106 TCP 74
877 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081681961 TSecr=0 WS=128
  309 2023-10-03 10:18:25.628724 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 877 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  312 2023-10-03 10:18:28.665682 192.168.1.134 → 192.168.1.106 TCP 74
934 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081685001 TSecr=0 WS=128
  313 2023-10-03 10:18:28.666374 192.168.1.106 → 192.168.1.134 TCP 54
2049 → 934 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
  320 2023-10-03 10:18:31.702236 192.168.1.134 → 192.168.1.106 TCP 74
803 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
TSval=3081688040 TSecr=0 WS=128
  321 2023-10-03 10:18:31.702490 192.168.1.106 → 192.168.1.134 TCP 74
2049 → 803 [SYN, ACK] Seq=0 Ack=1 Win=31856 Len=0 MSS=1460 SACK_PERM
TSval=1993141756 TSecr=3081688040 WS=128
  322 2023-10-03 10:18:31.702729 192.168.1.134 → 192.168.1.106 TCP 66
803 → 2049 [ACK] Seq=1 Ack=1 Win=32128 Len=0 TSval=3081688040
TSecr=1993141756
  323 2023-10-03 10:18:31.702737 192.168.1.134 → 192.168.1.106 NFS 238
V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
  324 2023-10-03 10:18:31.702893 192.168.1.106 → 192.168.1.134 TCP 66
2049 → 803 [ACK] Seq=1 Ack=173 Win=31872 Len=0 TSval=1993141756
TSecr=3081688040
  749 2023-10-03 10:19:01.880214 192.168.1.106 → 192.168.1.134 NFS 206
V3 WRITE Reply (Call In 323) Len: 4 FILE_SYNC
  750 2023-10-03 10:19:01.880616 192.168.1.134 → 192.168.1.106 TCP 66
803 → 2049 [ACK] Seq=173 Ack=141 Win=32000 Len=0 TSval=3081718241
TSecr=1993171927


> SYNs. If the server is not replying to our SYN packets, then the TCP
> layer will back off and retransmit. So there is already a backoff-retry
> happening at that level.
>
> >
> > >
> > > >
> > > > >
> > > > > Signed-off-by: Trond Myklebust
> > > > > <trond.myklebust@hammerspace.com>
> > > > > ---
> > > > >  net/sunrpc/xprtsock.c | 10 +++++++++-
> > > > >  1 file changed, 9 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > > > > index 71848ab90d13..1a96777f0ed5 100644
> > > > > --- a/net/sunrpc/xprtsock.c
> > > > > +++ b/net/sunrpc/xprtsock.c
> > > > > @@ -62,6 +62,7 @@
> > > > >  #include "sunrpc.h"
> > > > >
> > > > >  static void xs_close(struct rpc_xprt *xprt);
> > > > > +static void xs_reset_srcport(struct sock_xprt *transport);
> > > > >  static void xs_set_srcport(struct sock_xprt *transport, struct
> > > > > socket *sock);
> > > > >  static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
> > > > >                 struct socket *sock);
> > > > > @@ -1565,8 +1566,10 @@ static void xs_tcp_state_change(struct
> > > > > sock
> > > > > *sk)
> > > > >                 break;
> > > > >         case TCP_CLOSE:
> > > > >                 if (test_and_clear_bit(XPRT_SOCK_CONNECTING,
> > > > > -                                       &transport-
> > > > > >sock_state))
> > > > > +                                      &transport->sock_state))
> > > > > {
> > > > > +                       xs_reset_srcport(transport);
> > > > >                         xprt_clear_connecting(xprt);
> > > > > +               }
> > > > >                 clear_bit(XPRT_CLOSING, &xprt->state);
> > > > >                 /* Trigger the socket release */
> > > > >                 xs_run_error_worker(transport,
> > > > > XPRT_SOCK_WAKE_DISCONNECT);
> > > > > @@ -1722,6 +1725,11 @@ static void xs_set_port(struct rpc_xprt
> > > > > *xprt, unsigned short port)
> > > > >         xs_update_peer_port(xprt);
> > > > >  }
> > > > >
> > > > > +static void xs_reset_srcport(struct sock_xprt *transport)
> > > > > +{
> > > > > +       transport->srcport = 0;
> > > > > +}
> > > > > +
> > > > >  static void xs_set_srcport(struct sock_xprt *transport, struct
> > > > > socket *sock)
> > > > >  {
> > > > >         if (transport->srcport == 0 && transport-
> > > > > >xprt.reuseport)
> > > > > --
> > > > > 2.41.0
> > > > >
> > >
> > > --
> > > Trond Myklebust Linux NFS client maintainer, Hammerspace
> > > trond.myklebust@hammerspace.com
>
> --
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
>
>
Jeff Layton Oct. 3, 2023, 3:12 p.m. UTC | #6
On Tue, 2023-10-03 at 10:44 -0400, Olga Kornievskaia wrote:
> On Sat, Sep 30, 2023 at 7:06 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
> > 
> > On Sat, 2023-09-30 at 18:36 -0400, Olga Kornievskaia wrote:
> > > On Fri, Sep 29, 2023 at 10:57 PM Trond Myklebust
> > > <trondmy@hammerspace.com> wrote:
> > > > 
> > > > On Thu, 2023-09-28 at 10:58 -0400, Olga Kornievskaia wrote:
> > > > > On Wed, Sep 27, 2023 at 3:35 PM <trondmy@kernel.org> wrote:
> > > > > > 
> > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > > > 
> > > > > > If the TCP connection attempt fails without ever establishing a
> > > > > > connection, then assume the problem may be the server is
> > > > > > rejecting
> > > > > > us
> > > > > > due to port reuse.
> > > > > 
> > > > > Doesn't this break 4.0 replay cache? Seems too general to assume
> > > > > that
> > > > > any unsuccessful SYN was due to a server reboot and it's ok for
> > > > > the
> > > > > client to change the port.
> > > > 
> > > > This is where things get interesting. Yes, if we change the port
> > > > number, then it will almost certainly break NFSv3 and NFSv4.0
> > > > replay
> > > > caching on the server.
> > > > 
> > > > However the problem is that once we get stuck in the situation
> > > > where we
> > > > cannot connect, then each new connection attempt is just causing
> > > > the
> > > > server's TCP layer to push back and recall that the connection from
> > > > this port was closed.
> > > > IOW: the problem is that once we're in this situation, we cannot
> > > > easily
> > > > exit without doing one of the following. Either we have to
> > > > 
> > > >    1. Change the port number, so that the TCP layer allows us to
> > > >       connect.
> > > >    2. Or.. Wait for long enough that the TCP layer has forgotten
> > > >       altogether about the previous connection.
> > > > 
> > > > The problem is that option (2) is subject to livelock, and so has a
> > > > potential infinite time out. I've seen this livelock in action, and
> > > > I'm
> > > > not seeing a solution that has predictable results.
> > > > 
> > > > So unless there is a solution for the problems in (2), I don't see
> > > > how
> > > > we can avoid defaulting to option (1) at some point, in which case
> > > > the
> > > > only question is "when do we switch ports?".
> > > 
> > > I'm not sure how one can justify that regression that will come out
> > > of
> > > #1 will be less of a problem then the problem in #2.
> > > 
> > > I think I'm still not grasping why the NFS server would
> > > (legitimately)
> > > be closing a connection that is re-using the port. Can you present a
> > > sequence of events that would lead to this?
> > > 
> > 
> > Yes. It is essentially the problem described in this blog:
> > https://blog.davidvassallo.me/2010/07/13/time_wait-and-port-reuse/
> > 
> > ...and as you can see, it is nothing to do with NFS. This is the TCP
> > protocol working as expected.
> 
> What I'm seeing are statements that RFC allows for/provides guidance
> on how to transition out of TIME_WAIT state. I'm also hearing that the
> reasons that the server can't allow for port reuse is due to broken
> client implementation or use of (broken?) NAT implementation.
> 
> I don't see how any of this justifies allowing a regression in the
> linux client code. I'm clearly missing something. How are you possibly
> OK with breaking the reply cache?
> 

Is it really breaking things though if you can't connect otherwise? Bear
in mind that if you're dealing with NAT'ed setup, and you wait until the
connection is completely forgotten, then the NAT'ing firewall is likely
to change your source port anyway.

Chuck brought up an interesting question privately: should knfsd's
v3/v4.0 DRC start ignoring the source port? We already check this
otherwise:

- IP addr
- XID
- hash of first 256 bytes of the payload

That seems like enough discriminators that we could stop comparing the
source port without breaking things.

> > > But can't we at least arm ourselves in not unnecessarily breaking the
> > > reply cache by at least imposing some timeout/number of retries
> > > before
> > > resetting? If the client was retrying to unsuccessfully re-establish
> > > connection for a (fixed) while, then 4.0 client's lease would expire
> > > and switching the port after the lease expires makes no difference.
> > > There isn't a solution in v3 unfortunately. But a time-based approach
> > > would at least separate these 'peculiar' servers vs normal servers.
> > > And if this is a 4.1 client, we can reset the port without a timeout.
> > > 
> > 
> > This is not a 'peculiar server' vs 'normal server' problem. The reuse
> > of ports in this way violates the TCP protocol, and has been a problem
> 
> I disagree here. Even the RFC quoted by the blogger says that reuse of
> port is allowed.
> 
> > for NFS/TCP since the beginning. However, it was never a problem for
> > the older connectionless UDP protocol, which is where the practice of
> > tying the replay cache to the source port began in the first place.
> > 
> > NFSv4.1 does not have this problem because it deliberately does not
> > reuse TCP ports, and the reason is precisely to avoid the TIME_WAIT
> > state problems.
> > 
> > NFSv3 tries to avoid it by doing an incremental back off, but we
> > recently saw that does not suffice to avoid live lock, after a system
> > got stuck for several hours in this state.
> > 
> > > Am I correct that every unsuccessful SYN causes a new source point to
> > > be taken? If so, then a server reboot where multiple SYNs are sent
> > > prior to connection re-establishment (times number of mounts) might
> > > cause source port exhaustion?
> > > 
> > 
> > No. Not every unsuccessful SYN: It is every unsuccessful sequence of
> 
> I disagree. Here's a snippet of the network trace with the proposed
> patch. The port is changed on EVERY unsuccessful SYN.
> 
>    76 2023-10-03 10:17:04.285731 192.168.1.134 → 192.168.1.106 NFS 238
> V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
>    77 2023-10-03 10:17:04.328371 192.168.1.106 → 192.168.1.134 TCP 66
> 2049 → 909 [ACK] Seq=1113 Ack=1501 Win=31872 Len=0 TSval=3542359002
> TSecr=3081600630
>   256 2023-10-03 10:18:04.341041 192.168.1.134 → 192.168.1.106 TCP 66
> [TCP Keep-Alive] 909 → 2049 [ACK] Seq=1500 Ack=1113 Win=32000 Len=0
> TSval=3081660681 TSecr=3542359002
>   259 2023-10-03 10:18:04.341500 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 909 [RST] Seq=1113 Win=0 Len=0
>   260 2023-10-03 10:18:04.341860 192.168.1.134 → 192.168.1.106 TCP 74
> [TCP Port numbers reused] 909 → 2049 [SYN] Seq=0 Win=32120 Len=0
> MSS=1460 SACK_PERM TSval=3081660681 TSecr=0 WS=128
>   261 2023-10-03 10:18:04.342031 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 909 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   266 2023-10-03 10:18:07.380801 192.168.1.134 → 192.168.1.106 TCP 74
> 954 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081663720 TSecr=0 WS=128
>   267 2023-10-03 10:18:07.380971 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 954 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   275 2023-10-03 10:18:10.423352 192.168.1.134 → 192.168.1.106 TCP 74
> 856 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081666760 TSecr=0 WS=128
>   276 2023-10-03 10:18:10.423621 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 856 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   286 2023-10-03 10:18:13.466277 192.168.1.134 → 192.168.1.106 TCP 74
> 957 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081669801 TSecr=0 WS=128
>   287 2023-10-03 10:18:13.466812 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 957 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   289 2023-10-03 10:18:16.509229 192.168.1.134 → 192.168.1.106 TCP 74
> 695 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081672841 TSecr=0 WS=128
>   290 2023-10-03 10:18:16.509845 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 695 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   294 2023-10-03 10:18:19.551062 192.168.1.134 → 192.168.1.106 TCP 74
> 940 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081675881 TSecr=0 WS=128
>   295 2023-10-03 10:18:19.551434 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 940 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   300 2023-10-03 10:18:22.590380 192.168.1.134 → 192.168.1.106 TCP 74
> 810 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081678921 TSecr=0
> WS=128
>   301 2023-10-03 10:18:22.590726 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 810 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   308 2023-10-03 10:18:25.628256 192.168.1.134 → 192.168.1.106 TCP 74
> 877 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081681961 TSecr=0 WS=128
>   309 2023-10-03 10:18:25.628724 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 877 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   312 2023-10-03 10:18:28.665682 192.168.1.134 → 192.168.1.106 TCP 74
> 934 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081685001 TSecr=0 WS=128
>   313 2023-10-03 10:18:28.666374 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 934 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   320 2023-10-03 10:18:31.702236 192.168.1.134 → 192.168.1.106 TCP 74
> 803 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081688040 TSecr=0 WS=128
>   321 2023-10-03 10:18:31.702490 192.168.1.106 → 192.168.1.134 TCP 74
> 2049 → 803 [SYN, ACK] Seq=0 Ack=1 Win=31856 Len=0 MSS=1460 SACK_PERM
> TSval=1993141756 TSecr=3081688040 WS=128
>   322 2023-10-03 10:18:31.702729 192.168.1.134 → 192.168.1.106 TCP 66
> 803 → 2049 [ACK] Seq=1 Ack=1 Win=32128 Len=0 TSval=3081688040
> TSecr=1993141756
>   323 2023-10-03 10:18:31.702737 192.168.1.134 → 192.168.1.106 NFS 238
> V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
>   324 2023-10-03 10:18:31.702893 192.168.1.106 → 192.168.1.134 TCP 66
> 2049 → 803 [ACK] Seq=1 Ack=173 Win=31872 Len=0 TSval=1993141756
> TSecr=3081688040
>   749 2023-10-03 10:19:01.880214 192.168.1.106 → 192.168.1.134 NFS 206
> V3 WRITE Reply (Call In 323) Len: 4 FILE_SYNC
> 
> This is the same without the patch. Port is successfully reused.
> Replay cache OK here not above.
> 
>    76 2023-10-03 10:17:04.285731 192.168.1.134 → 192.168.1.106 NFS 238
> V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
>    77 2023-10-03 10:17:04.328371 192.168.1.106 → 192.168.1.134 TCP 66
> 2049 → 909 [ACK] Seq=1113 Ack=1501 Win=31872 Len=0 TSval=3542359002
> TSecr=3081600630
>   256 2023-10-03 10:18:04.341041 192.168.1.134 → 192.168.1.106 TCP 66
> [TCP Keep-Alive] 909 → 2049 [ACK] Seq=1500 Ack=1113 Win=32000 Len=0
> TSval=3081660681 TSecr=3542359002
>   259 2023-10-03 10:18:04.341500 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 909 [RST] Seq=1113 Win=0 Len=0
>   260 2023-10-03 10:18:04.341860 192.168.1.134 → 192.168.1.106 TCP 74
> [TCP Port numbers reused] 909 → 2049 [SYN] Seq=0 Win=32120 Len=0
> MSS=1460 SACK_PERM TSval=3081660681 TSecr=0 WS=128
>   261 2023-10-03 10:18:04.342031 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 909 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   266 2023-10-03 10:18:07.380801 192.168.1.134 → 192.168.1.106 TCP 74
> 954 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081663720 TSecr=0 WS=128
>   267 2023-10-03 10:18:07.380971 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 954 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   275 2023-10-03 10:18:10.423352 192.168.1.134 → 192.168.1.106 TCP 74
> 856 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081666760 TSecr=0 WS=128
>   276 2023-10-03 10:18:10.423621 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 856 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   286 2023-10-03 10:18:13.466277 192.168.1.134 → 192.168.1.106 TCP 74
> 957 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081669801 TSecr=0 WS=128
>   287 2023-10-03 10:18:13.466812 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 957 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   289 2023-10-03 10:18:16.509229 192.168.1.134 → 192.168.1.106 TCP 74
> 695 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081672841 TSecr=0 WS=128
>   290 2023-10-03 10:18:16.509845 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 695 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   294 2023-10-03 10:18:19.551062 192.168.1.134 → 192.168.1.106 TCP 74
> 940 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081675881 TSecr=0 WS=128
>   295 2023-10-03 10:18:19.551434 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 940 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   300 2023-10-03 10:18:22.590380 192.168.1.134 → 192.168.1.106 TCP 74
> 810 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081678921 TSecr=0 WS=128
>   301 2023-10-03 10:18:22.590726 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 810 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   308 2023-10-03 10:18:25.628256 192.168.1.134 → 192.168.1.106 TCP 74
> 877 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081681961 TSecr=0 WS=128
>   309 2023-10-03 10:18:25.628724 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 877 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   312 2023-10-03 10:18:28.665682 192.168.1.134 → 192.168.1.106 TCP 74
> 934 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081685001 TSecr=0 WS=128
>   313 2023-10-03 10:18:28.666374 192.168.1.106 → 192.168.1.134 TCP 54
> 2049 → 934 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>   320 2023-10-03 10:18:31.702236 192.168.1.134 → 192.168.1.106 TCP 74
> 803 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> TSval=3081688040 TSecr=0 WS=128
>   321 2023-10-03 10:18:31.702490 192.168.1.106 → 192.168.1.134 TCP 74
> 2049 → 803 [SYN, ACK] Seq=0 Ack=1 Win=31856 Len=0 MSS=1460 SACK_PERM
> TSval=1993141756 TSecr=3081688040 WS=128
>   322 2023-10-03 10:18:31.702729 192.168.1.134 → 192.168.1.106 TCP 66
> 803 → 2049 [ACK] Seq=1 Ack=1 Win=32128 Len=0 TSval=3081688040
> TSecr=1993141756
>   323 2023-10-03 10:18:31.702737 192.168.1.134 → 192.168.1.106 NFS 238
> V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
>   324 2023-10-03 10:18:31.702893 192.168.1.106 → 192.168.1.134 TCP 66
> 2049 → 803 [ACK] Seq=1 Ack=173 Win=31872 Len=0 TSval=1993141756
> TSecr=3081688040
>   749 2023-10-03 10:19:01.880214 192.168.1.106 → 192.168.1.134 NFS 206
> V3 WRITE Reply (Call In 323) Len: 4 FILE_SYNC
>   750 2023-10-03 10:19:01.880616 192.168.1.134 → 192.168.1.106 TCP 66
> 803 → 2049 [ACK] Seq=173 Ack=141 Win=32000 Len=0 TSval=3081718241
> TSecr=1993171927
> 
> 
> > SYNs. If the server is not replying to our SYN packets, then the TCP
> > layer will back off and retransmit. So there is already a backoff-retry
> > happening at that level.
> > 
> > > 
> > > > 
> > > > > 
> > > > > > 
> > > > > > Signed-off-by: Trond Myklebust
> > > > > > <trond.myklebust@hammerspace.com>
> > > > > > ---
> > > > > >  net/sunrpc/xprtsock.c | 10 +++++++++-
> > > > > >  1 file changed, 9 insertions(+), 1 deletion(-)
> > > > > > 
> > > > > > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > > > > > index 71848ab90d13..1a96777f0ed5 100644
> > > > > > --- a/net/sunrpc/xprtsock.c
> > > > > > +++ b/net/sunrpc/xprtsock.c
> > > > > > @@ -62,6 +62,7 @@
> > > > > >  #include "sunrpc.h"
> > > > > > 
> > > > > >  static void xs_close(struct rpc_xprt *xprt);
> > > > > > +static void xs_reset_srcport(struct sock_xprt *transport);
> > > > > >  static void xs_set_srcport(struct sock_xprt *transport, struct
> > > > > > socket *sock);
> > > > > >  static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
> > > > > >                 struct socket *sock);
> > > > > > @@ -1565,8 +1566,10 @@ static void xs_tcp_state_change(struct
> > > > > > sock
> > > > > > *sk)
> > > > > >                 break;
> > > > > >         case TCP_CLOSE:
> > > > > >                 if (test_and_clear_bit(XPRT_SOCK_CONNECTING,
> > > > > > -                                       &transport-
> > > > > > > sock_state))
> > > > > > +                                      &transport->sock_state))
> > > > > > {
> > > > > > +                       xs_reset_srcport(transport);
> > > > > >                         xprt_clear_connecting(xprt);
> > > > > > +               }
> > > > > >                 clear_bit(XPRT_CLOSING, &xprt->state);
> > > > > >                 /* Trigger the socket release */
> > > > > >                 xs_run_error_worker(transport,
> > > > > > XPRT_SOCK_WAKE_DISCONNECT);
> > > > > > @@ -1722,6 +1725,11 @@ static void xs_set_port(struct rpc_xprt
> > > > > > *xprt, unsigned short port)
> > > > > >         xs_update_peer_port(xprt);
> > > > > >  }
> > > > > > 
> > > > > > +static void xs_reset_srcport(struct sock_xprt *transport)
> > > > > > +{
> > > > > > +       transport->srcport = 0;
> > > > > > +}
> > > > > > +
> > > > > >  static void xs_set_srcport(struct sock_xprt *transport, struct
> > > > > > socket *sock)
> > > > > >  {
> > > > > >         if (transport->srcport == 0 && transport-
> > > > > > > xprt.reuseport)
> > > > > > --
> > > > > > 2.41.0
> > > > > > 
> > > > 
> > > > --
> > > > Trond Myklebust Linux NFS client maintainer, Hammerspace
> > > > trond.myklebust@hammerspace.com
> > 
> > --
> > Trond Myklebust
> > Linux NFS client maintainer, Hammerspace
> > trond.myklebust@hammerspace.com
> > 
> >
Olga Kornievskaia Oct. 3, 2023, 3:28 p.m. UTC | #7
On Tue, Oct 3, 2023 at 11:12 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Tue, 2023-10-03 at 10:44 -0400, Olga Kornievskaia wrote:
> > On Sat, Sep 30, 2023 at 7:06 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
> > >
> > > On Sat, 2023-09-30 at 18:36 -0400, Olga Kornievskaia wrote:
> > > > On Fri, Sep 29, 2023 at 10:57 PM Trond Myklebust
> > > > <trondmy@hammerspace.com> wrote:
> > > > >
> > > > > On Thu, 2023-09-28 at 10:58 -0400, Olga Kornievskaia wrote:
> > > > > > On Wed, Sep 27, 2023 at 3:35 PM <trondmy@kernel.org> wrote:
> > > > > > >
> > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > > > >
> > > > > > > If the TCP connection attempt fails without ever establishing a
> > > > > > > connection, then assume the problem may be the server is
> > > > > > > rejecting
> > > > > > > us
> > > > > > > due to port reuse.
> > > > > >
> > > > > > Doesn't this break 4.0 replay cache? Seems too general to assume
> > > > > > that
> > > > > > any unsuccessful SYN was due to a server reboot and it's ok for
> > > > > > the
> > > > > > client to change the port.
> > > > >
> > > > > This is where things get interesting. Yes, if we change the port
> > > > > number, then it will almost certainly break NFSv3 and NFSv4.0
> > > > > replay
> > > > > caching on the server.
> > > > >
> > > > > However the problem is that once we get stuck in the situation
> > > > > where we
> > > > > cannot connect, then each new connection attempt is just causing
> > > > > the
> > > > > server's TCP layer to push back and recall that the connection from
> > > > > this port was closed.
> > > > > IOW: the problem is that once we're in this situation, we cannot
> > > > > easily
> > > > > exit without doing one of the following. Either we have to
> > > > >
> > > > >    1. Change the port number, so that the TCP layer allows us to
> > > > >       connect.
> > > > >    2. Or.. Wait for long enough that the TCP layer has forgotten
> > > > >       altogether about the previous connection.
> > > > >
> > > > > The problem is that option (2) is subject to livelock, and so has a
> > > > > potential infinite time out. I've seen this livelock in action, and
> > > > > I'm
> > > > > not seeing a solution that has predictable results.
> > > > >
> > > > > So unless there is a solution for the problems in (2), I don't see
> > > > > how
> > > > > we can avoid defaulting to option (1) at some point, in which case
> > > > > the
> > > > > only question is "when do we switch ports?".
> > > >
> > > > I'm not sure how one can justify that regression that will come out
> > > > of
> > > > #1 will be less of a problem then the problem in #2.
> > > >
> > > > I think I'm still not grasping why the NFS server would
> > > > (legitimately)
> > > > be closing a connection that is re-using the port. Can you present a
> > > > sequence of events that would lead to this?
> > > >
> > >
> > > Yes. It is essentially the problem described in this blog:
> > > https://blog.davidvassallo.me/2010/07/13/time_wait-and-port-reuse/
> > >
> > > ...and as you can see, it is nothing to do with NFS. This is the TCP
> > > protocol working as expected.
> >
> > What I'm seeing are statements that RFC allows for/provides guidance
> > on how to transition out of TIME_WAIT state. I'm also hearing that the
> > reasons that the server can't allow for port reuse is due to broken
> > client implementation or use of (broken?) NAT implementation.
> >
> > I don't see how any of this justifies allowing a regression in the
> > linux client code. I'm clearly missing something. How are you possibly
> > OK with breaking the reply cache?
> >
>
> Is it really breaking things though if you can't connect otherwise? Bear
> in mind that if you're dealing with NAT'ed setup, and you wait until the
> connection is completely forgotten, then the NAT'ing firewall is likely
> to change your source port anyway.
>
> Chuck brought up an interesting question privately: should knfsd's
> v3/v4.0 DRC start ignoring the source port? We already check this
> otherwise:
>
> - IP addr
> - XID
> - hash of first 256 bytes of the payload

Calculating a hash of every packet has a great performance impact. But
perhaps if we require v3 traffic to do that then we can get v3 and
v4.1 performance on the same level and folks would finally switch to
v4.1.

I also forgot to write that while we don't care about port (not being
reused) for 4.1. If we switch the port on every connection
establishment we will same way run into port exhaustion. Use of
nconnect is becoming common so something like a server reboot on a
client machine with about only 10 mounts using nconnect=16 and average
of 7 SYNs (as per example here) before the server starts again, the
client would use 1K source ports?

> That seems like enough discriminators that we could stop comparing the
> source port without breaking things.
>
> > > > But can't we at least arm ourselves in not unnecessarily breaking the
> > > > reply cache by at least imposing some timeout/number of retries
> > > > before
> > > > resetting? If the client was retrying to unsuccessfully re-establish
> > > > connection for a (fixed) while, then 4.0 client's lease would expire
> > > > and switching the port after the lease expires makes no difference.
> > > > There isn't a solution in v3 unfortunately. But a time-based approach
> > > > would at least separate these 'peculiar' servers vs normal servers.
> > > > And if this is a 4.1 client, we can reset the port without a timeout.
> > > >
> > >
> > > This is not a 'peculiar server' vs 'normal server' problem. The reuse
> > > of ports in this way violates the TCP protocol, and has been a problem
> >
> > I disagree here. Even the RFC quoted by the blogger says that reuse of
> > port is allowed.
> >
> > > for NFS/TCP since the beginning. However, it was never a problem for
> > > the older connectionless UDP protocol, which is where the practice of
> > > tying the replay cache to the source port began in the first place.
> > >
> > > NFSv4.1 does not have this problem because it deliberately does not
> > > reuse TCP ports, and the reason is precisely to avoid the TIME_WAIT
> > > state problems.
> > >
> > > NFSv3 tries to avoid it by doing an incremental back off, but we
> > > recently saw that does not suffice to avoid live lock, after a system
> > > got stuck for several hours in this state.
> > >
> > > > Am I correct that every unsuccessful SYN causes a new source point to
> > > > be taken? If so, then a server reboot where multiple SYNs are sent
> > > > prior to connection re-establishment (times number of mounts) might
> > > > cause source port exhaustion?
> > > >
> > >
> > > No. Not every unsuccessful SYN: It is every unsuccessful sequence of
> >
> > I disagree. Here's a snippet of the network trace with the proposed
> > patch. The port is changed on EVERY unsuccessful SYN.
> >
> >    76 2023-10-03 10:17:04.285731 192.168.1.134 → 192.168.1.106 NFS 238
> > V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
> >    77 2023-10-03 10:17:04.328371 192.168.1.106 → 192.168.1.134 TCP 66
> > 2049 → 909 [ACK] Seq=1113 Ack=1501 Win=31872 Len=0 TSval=3542359002
> > TSecr=3081600630
> >   256 2023-10-03 10:18:04.341041 192.168.1.134 → 192.168.1.106 TCP 66
> > [TCP Keep-Alive] 909 → 2049 [ACK] Seq=1500 Ack=1113 Win=32000 Len=0
> > TSval=3081660681 TSecr=3542359002
> >   259 2023-10-03 10:18:04.341500 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 909 [RST] Seq=1113 Win=0 Len=0
> >   260 2023-10-03 10:18:04.341860 192.168.1.134 → 192.168.1.106 TCP 74
> > [TCP Port numbers reused] 909 → 2049 [SYN] Seq=0 Win=32120 Len=0
> > MSS=1460 SACK_PERM TSval=3081660681 TSecr=0 WS=128
> >   261 2023-10-03 10:18:04.342031 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 909 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   266 2023-10-03 10:18:07.380801 192.168.1.134 → 192.168.1.106 TCP 74
> > 954 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081663720 TSecr=0 WS=128
> >   267 2023-10-03 10:18:07.380971 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 954 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   275 2023-10-03 10:18:10.423352 192.168.1.134 → 192.168.1.106 TCP 74
> > 856 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081666760 TSecr=0 WS=128
> >   276 2023-10-03 10:18:10.423621 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 856 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   286 2023-10-03 10:18:13.466277 192.168.1.134 → 192.168.1.106 TCP 74
> > 957 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081669801 TSecr=0 WS=128
> >   287 2023-10-03 10:18:13.466812 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 957 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   289 2023-10-03 10:18:16.509229 192.168.1.134 → 192.168.1.106 TCP 74
> > 695 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081672841 TSecr=0 WS=128
> >   290 2023-10-03 10:18:16.509845 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 695 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   294 2023-10-03 10:18:19.551062 192.168.1.134 → 192.168.1.106 TCP 74
> > 940 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081675881 TSecr=0 WS=128
> >   295 2023-10-03 10:18:19.551434 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 940 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   300 2023-10-03 10:18:22.590380 192.168.1.134 → 192.168.1.106 TCP 74
> > 810 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081678921 TSecr=0
> > WS=128
> >   301 2023-10-03 10:18:22.590726 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 810 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   308 2023-10-03 10:18:25.628256 192.168.1.134 → 192.168.1.106 TCP 74
> > 877 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081681961 TSecr=0 WS=128
> >   309 2023-10-03 10:18:25.628724 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 877 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   312 2023-10-03 10:18:28.665682 192.168.1.134 → 192.168.1.106 TCP 74
> > 934 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081685001 TSecr=0 WS=128
> >   313 2023-10-03 10:18:28.666374 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 934 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   320 2023-10-03 10:18:31.702236 192.168.1.134 → 192.168.1.106 TCP 74
> > 803 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081688040 TSecr=0 WS=128
> >   321 2023-10-03 10:18:31.702490 192.168.1.106 → 192.168.1.134 TCP 74
> > 2049 → 803 [SYN, ACK] Seq=0 Ack=1 Win=31856 Len=0 MSS=1460 SACK_PERM
> > TSval=1993141756 TSecr=3081688040 WS=128
> >   322 2023-10-03 10:18:31.702729 192.168.1.134 → 192.168.1.106 TCP 66
> > 803 → 2049 [ACK] Seq=1 Ack=1 Win=32128 Len=0 TSval=3081688040
> > TSecr=1993141756
> >   323 2023-10-03 10:18:31.702737 192.168.1.134 → 192.168.1.106 NFS 238
> > V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
> >   324 2023-10-03 10:18:31.702893 192.168.1.106 → 192.168.1.134 TCP 66
> > 2049 → 803 [ACK] Seq=1 Ack=173 Win=31872 Len=0 TSval=1993141756
> > TSecr=3081688040
> >   749 2023-10-03 10:19:01.880214 192.168.1.106 → 192.168.1.134 NFS 206
> > V3 WRITE Reply (Call In 323) Len: 4 FILE_SYNC
> >
> > This is the same without the patch. Port is successfully reused.
> > Replay cache OK here not above.
> >
> >    76 2023-10-03 10:17:04.285731 192.168.1.134 → 192.168.1.106 NFS 238
> > V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
> >    77 2023-10-03 10:17:04.328371 192.168.1.106 → 192.168.1.134 TCP 66
> > 2049 → 909 [ACK] Seq=1113 Ack=1501 Win=31872 Len=0 TSval=3542359002
> > TSecr=3081600630
> >   256 2023-10-03 10:18:04.341041 192.168.1.134 → 192.168.1.106 TCP 66
> > [TCP Keep-Alive] 909 → 2049 [ACK] Seq=1500 Ack=1113 Win=32000 Len=0
> > TSval=3081660681 TSecr=3542359002
> >   259 2023-10-03 10:18:04.341500 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 909 [RST] Seq=1113 Win=0 Len=0
> >   260 2023-10-03 10:18:04.341860 192.168.1.134 → 192.168.1.106 TCP 74
> > [TCP Port numbers reused] 909 → 2049 [SYN] Seq=0 Win=32120 Len=0
> > MSS=1460 SACK_PERM TSval=3081660681 TSecr=0 WS=128
> >   261 2023-10-03 10:18:04.342031 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 909 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   266 2023-10-03 10:18:07.380801 192.168.1.134 → 192.168.1.106 TCP 74
> > 954 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081663720 TSecr=0 WS=128
> >   267 2023-10-03 10:18:07.380971 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 954 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   275 2023-10-03 10:18:10.423352 192.168.1.134 → 192.168.1.106 TCP 74
> > 856 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081666760 TSecr=0 WS=128
> >   276 2023-10-03 10:18:10.423621 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 856 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   286 2023-10-03 10:18:13.466277 192.168.1.134 → 192.168.1.106 TCP 74
> > 957 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081669801 TSecr=0 WS=128
> >   287 2023-10-03 10:18:13.466812 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 957 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   289 2023-10-03 10:18:16.509229 192.168.1.134 → 192.168.1.106 TCP 74
> > 695 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081672841 TSecr=0 WS=128
> >   290 2023-10-03 10:18:16.509845 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 695 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   294 2023-10-03 10:18:19.551062 192.168.1.134 → 192.168.1.106 TCP 74
> > 940 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081675881 TSecr=0 WS=128
> >   295 2023-10-03 10:18:19.551434 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 940 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   300 2023-10-03 10:18:22.590380 192.168.1.134 → 192.168.1.106 TCP 74
> > 810 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081678921 TSecr=0 WS=128
> >   301 2023-10-03 10:18:22.590726 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 810 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   308 2023-10-03 10:18:25.628256 192.168.1.134 → 192.168.1.106 TCP 74
> > 877 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081681961 TSecr=0 WS=128
> >   309 2023-10-03 10:18:25.628724 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 877 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   312 2023-10-03 10:18:28.665682 192.168.1.134 → 192.168.1.106 TCP 74
> > 934 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081685001 TSecr=0 WS=128
> >   313 2023-10-03 10:18:28.666374 192.168.1.106 → 192.168.1.134 TCP 54
> > 2049 → 934 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >   320 2023-10-03 10:18:31.702236 192.168.1.134 → 192.168.1.106 TCP 74
> > 803 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > TSval=3081688040 TSecr=0 WS=128
> >   321 2023-10-03 10:18:31.702490 192.168.1.106 → 192.168.1.134 TCP 74
> > 2049 → 803 [SYN, ACK] Seq=0 Ack=1 Win=31856 Len=0 MSS=1460 SACK_PERM
> > TSval=1993141756 TSecr=3081688040 WS=128
> >   322 2023-10-03 10:18:31.702729 192.168.1.134 → 192.168.1.106 TCP 66
> > 803 → 2049 [ACK] Seq=1 Ack=1 Win=32128 Len=0 TSval=3081688040
> > TSecr=1993141756
> >   323 2023-10-03 10:18:31.702737 192.168.1.134 → 192.168.1.106 NFS 238
> > V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
> >   324 2023-10-03 10:18:31.702893 192.168.1.106 → 192.168.1.134 TCP 66
> > 2049 → 803 [ACK] Seq=1 Ack=173 Win=31872 Len=0 TSval=1993141756
> > TSecr=3081688040
> >   749 2023-10-03 10:19:01.880214 192.168.1.106 → 192.168.1.134 NFS 206
> > V3 WRITE Reply (Call In 323) Len: 4 FILE_SYNC
> >   750 2023-10-03 10:19:01.880616 192.168.1.134 → 192.168.1.106 TCP 66
> > 803 → 2049 [ACK] Seq=173 Ack=141 Win=32000 Len=0 TSval=3081718241
> > TSecr=1993171927
> >
> >
> > > SYNs. If the server is not replying to our SYN packets, then the TCP
> > > layer will back off and retransmit. So there is already a backoff-retry
> > > happening at that level.
> > >
> > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > Signed-off-by: Trond Myklebust
> > > > > > > <trond.myklebust@hammerspace.com>
> > > > > > > ---
> > > > > > >  net/sunrpc/xprtsock.c | 10 +++++++++-
> > > > > > >  1 file changed, 9 insertions(+), 1 deletion(-)
> > > > > > >
> > > > > > > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > > > > > > index 71848ab90d13..1a96777f0ed5 100644
> > > > > > > --- a/net/sunrpc/xprtsock.c
> > > > > > > +++ b/net/sunrpc/xprtsock.c
> > > > > > > @@ -62,6 +62,7 @@
> > > > > > >  #include "sunrpc.h"
> > > > > > >
> > > > > > >  static void xs_close(struct rpc_xprt *xprt);
> > > > > > > +static void xs_reset_srcport(struct sock_xprt *transport);
> > > > > > >  static void xs_set_srcport(struct sock_xprt *transport, struct
> > > > > > > socket *sock);
> > > > > > >  static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
> > > > > > >                 struct socket *sock);
> > > > > > > @@ -1565,8 +1566,10 @@ static void xs_tcp_state_change(struct
> > > > > > > sock
> > > > > > > *sk)
> > > > > > >                 break;
> > > > > > >         case TCP_CLOSE:
> > > > > > >                 if (test_and_clear_bit(XPRT_SOCK_CONNECTING,
> > > > > > > -                                       &transport-
> > > > > > > > sock_state))
> > > > > > > +                                      &transport->sock_state))
> > > > > > > {
> > > > > > > +                       xs_reset_srcport(transport);
> > > > > > >                         xprt_clear_connecting(xprt);
> > > > > > > +               }
> > > > > > >                 clear_bit(XPRT_CLOSING, &xprt->state);
> > > > > > >                 /* Trigger the socket release */
> > > > > > >                 xs_run_error_worker(transport,
> > > > > > > XPRT_SOCK_WAKE_DISCONNECT);
> > > > > > > @@ -1722,6 +1725,11 @@ static void xs_set_port(struct rpc_xprt
> > > > > > > *xprt, unsigned short port)
> > > > > > >         xs_update_peer_port(xprt);
> > > > > > >  }
> > > > > > >
> > > > > > > +static void xs_reset_srcport(struct sock_xprt *transport)
> > > > > > > +{
> > > > > > > +       transport->srcport = 0;
> > > > > > > +}
> > > > > > > +
> > > > > > >  static void xs_set_srcport(struct sock_xprt *transport, struct
> > > > > > > socket *sock)
> > > > > > >  {
> > > > > > >         if (transport->srcport == 0 && transport-
> > > > > > > > xprt.reuseport)
> > > > > > > --
> > > > > > > 2.41.0
> > > > > > >
> > > > >
> > > > > --
> > > > > Trond Myklebust Linux NFS client maintainer, Hammerspace
> > > > > trond.myklebust@hammerspace.com
> > >
> > > --
> > > Trond Myklebust
> > > Linux NFS client maintainer, Hammerspace
> > > trond.myklebust@hammerspace.com
> > >
> > >
>
> --
> Jeff Layton <jlayton@kernel.org>
Chuck Lever III Oct. 3, 2023, 3:32 p.m. UTC | #8
> On Oct 3, 2023, at 11:28 AM, Olga Kornievskaia <aglo@umich.edu> wrote:
> 
> On Tue, Oct 3, 2023 at 11:12 AM Jeff Layton <jlayton@kernel.org> wrote:
>> 
>> On Tue, 2023-10-03 at 10:44 -0400, Olga Kornievskaia wrote:
>>> On Sat, Sep 30, 2023 at 7:06 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
>>>> 
>>>> On Sat, 2023-09-30 at 18:36 -0400, Olga Kornievskaia wrote:
>>>>> On Fri, Sep 29, 2023 at 10:57 PM Trond Myklebust
>>>>> <trondmy@hammerspace.com> wrote:
>>>>>> 
>>>>>> On Thu, 2023-09-28 at 10:58 -0400, Olga Kornievskaia wrote:
>>>>>>> On Wed, Sep 27, 2023 at 3:35 PM <trondmy@kernel.org> wrote:
>>>>>>>> 
>>>>>>>> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>>>>>>>> 
>>>>>>>> If the TCP connection attempt fails without ever establishing a
>>>>>>>> connection, then assume the problem may be the server is
>>>>>>>> rejecting
>>>>>>>> us
>>>>>>>> due to port reuse.
>>>>>>> 
>>>>>>> Doesn't this break 4.0 replay cache? Seems too general to assume
>>>>>>> that
>>>>>>> any unsuccessful SYN was due to a server reboot and it's ok for
>>>>>>> the
>>>>>>> client to change the port.
>>>>>> 
>>>>>> This is where things get interesting. Yes, if we change the port
>>>>>> number, then it will almost certainly break NFSv3 and NFSv4.0
>>>>>> replay
>>>>>> caching on the server.
>>>>>> 
>>>>>> However the problem is that once we get stuck in the situation
>>>>>> where we
>>>>>> cannot connect, then each new connection attempt is just causing
>>>>>> the
>>>>>> server's TCP layer to push back and recall that the connection from
>>>>>> this port was closed.
>>>>>> IOW: the problem is that once we're in this situation, we cannot
>>>>>> easily
>>>>>> exit without doing one of the following. Either we have to
>>>>>> 
>>>>>>   1. Change the port number, so that the TCP layer allows us to
>>>>>>      connect.
>>>>>>   2. Or.. Wait for long enough that the TCP layer has forgotten
>>>>>>      altogether about the previous connection.
>>>>>> 
>>>>>> The problem is that option (2) is subject to livelock, and so has a
>>>>>> potential infinite time out. I've seen this livelock in action, and
>>>>>> I'm
>>>>>> not seeing a solution that has predictable results.
>>>>>> 
>>>>>> So unless there is a solution for the problems in (2), I don't see
>>>>>> how
>>>>>> we can avoid defaulting to option (1) at some point, in which case
>>>>>> the
>>>>>> only question is "when do we switch ports?".
>>>>> 
>>>>> I'm not sure how one can justify that regression that will come out
>>>>> of
>>>>> #1 will be less of a problem then the problem in #2.
>>>>> 
>>>>> I think I'm still not grasping why the NFS server would
>>>>> (legitimately)
>>>>> be closing a connection that is re-using the port. Can you present a
>>>>> sequence of events that would lead to this?
>>>>> 
>>>> 
>>>> Yes. It is essentially the problem described in this blog:
>>>> https://blog.davidvassallo.me/2010/07/13/time_wait-and-port-reuse/
>>>> 
>>>> ...and as you can see, it is nothing to do with NFS. This is the TCP
>>>> protocol working as expected.
>>> 
>>> What I'm seeing are statements that RFC allows for/provides guidance
>>> on how to transition out of TIME_WAIT state. I'm also hearing that the
>>> reasons that the server can't allow for port reuse is due to broken
>>> client implementation or use of (broken?) NAT implementation.
>>> 
>>> I don't see how any of this justifies allowing a regression in the
>>> linux client code. I'm clearly missing something. How are you possibly
>>> OK with breaking the reply cache?
>>> 
>> 
>> Is it really breaking things though if you can't connect otherwise? Bear
>> in mind that if you're dealing with NAT'ed setup, and you wait until the
>> connection is completely forgotten, then the NAT'ing firewall is likely
>> to change your source port anyway.
>> 
>> Chuck brought up an interesting question privately: should knfsd's
>> v3/v4.0 DRC start ignoring the source port? We already check this
>> otherwise:
>> 
>> - IP addr
>> - XID
>> - hash of first 256 bytes of the payload
> 
> Calculating a hash of every packet has a great performance impact.

NFSD has done this for years. On modern CPUs, it's less of a
performance hit than walking the DRC hash chain.


> But
> perhaps if we require v3 traffic to do that then we can get v3 and
> v4.1 performance on the same level and folks would finally switch to
> v4.1.
> 
> I also forgot to write that while we don't care about port (not being
> reused) for 4.1. If we switch the port on every connection
> establishment we will same way run into port exhaustion. Use of
> nconnect is becoming common so something like a server reboot on a
> client machine with about only 10 mounts using nconnect=16 and average
> of 7 SYNs (as per example here) before the server starts again, the
> client would use 1K source ports?
> 
>> That seems like enough discriminators that we could stop comparing the
>> source port without breaking things.
>> 
>>>>> But can't we at least arm ourselves in not unnecessarily breaking the
>>>>> reply cache by at least imposing some timeout/number of retries
>>>>> before
>>>>> resetting? If the client was retrying to unsuccessfully re-establish
>>>>> connection for a (fixed) while, then 4.0 client's lease would expire
>>>>> and switching the port after the lease expires makes no difference.
>>>>> There isn't a solution in v3 unfortunately. But a time-based approach
>>>>> would at least separate these 'peculiar' servers vs normal servers.
>>>>> And if this is a 4.1 client, we can reset the port without a timeout.
>>>>> 
>>>> 
>>>> This is not a 'peculiar server' vs 'normal server' problem. The reuse
>>>> of ports in this way violates the TCP protocol, and has been a problem
>>> 
>>> I disagree here. Even the RFC quoted by the blogger says that reuse of
>>> port is allowed.
>>> 
>>>> for NFS/TCP since the beginning. However, it was never a problem for
>>>> the older connectionless UDP protocol, which is where the practice of
>>>> tying the replay cache to the source port began in the first place.
>>>> 
>>>> NFSv4.1 does not have this problem because it deliberately does not
>>>> reuse TCP ports, and the reason is precisely to avoid the TIME_WAIT
>>>> state problems.
>>>> 
>>>> NFSv3 tries to avoid it by doing an incremental back off, but we
>>>> recently saw that does not suffice to avoid live lock, after a system
>>>> got stuck for several hours in this state.
>>>> 
>>>>> Am I correct that every unsuccessful SYN causes a new source point to
>>>>> be taken? If so, then a server reboot where multiple SYNs are sent
>>>>> prior to connection re-establishment (times number of mounts) might
>>>>> cause source port exhaustion?
>>>>> 
>>>> 
>>>> No. Not every unsuccessful SYN: It is every unsuccessful sequence of
>>> 
>>> I disagree. Here's a snippet of the network trace with the proposed
>>> patch. The port is changed on EVERY unsuccessful SYN.
>>> 
>>>   76 2023-10-03 10:17:04.285731 192.168.1.134 → 192.168.1.106 NFS 238
>>> V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
>>>   77 2023-10-03 10:17:04.328371 192.168.1.106 → 192.168.1.134 TCP 66
>>> 2049 → 909 [ACK] Seq=1113 Ack=1501 Win=31872 Len=0 TSval=3542359002
>>> TSecr=3081600630
>>>  256 2023-10-03 10:18:04.341041 192.168.1.134 → 192.168.1.106 TCP 66
>>> [TCP Keep-Alive] 909 → 2049 [ACK] Seq=1500 Ack=1113 Win=32000 Len=0
>>> TSval=3081660681 TSecr=3542359002
>>>  259 2023-10-03 10:18:04.341500 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 909 [RST] Seq=1113 Win=0 Len=0
>>>  260 2023-10-03 10:18:04.341860 192.168.1.134 → 192.168.1.106 TCP 74
>>> [TCP Port numbers reused] 909 → 2049 [SYN] Seq=0 Win=32120 Len=0
>>> MSS=1460 SACK_PERM TSval=3081660681 TSecr=0 WS=128
>>>  261 2023-10-03 10:18:04.342031 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 909 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  266 2023-10-03 10:18:07.380801 192.168.1.134 → 192.168.1.106 TCP 74
>>> 954 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081663720 TSecr=0 WS=128
>>>  267 2023-10-03 10:18:07.380971 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 954 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  275 2023-10-03 10:18:10.423352 192.168.1.134 → 192.168.1.106 TCP 74
>>> 856 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081666760 TSecr=0 WS=128
>>>  276 2023-10-03 10:18:10.423621 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 856 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  286 2023-10-03 10:18:13.466277 192.168.1.134 → 192.168.1.106 TCP 74
>>> 957 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081669801 TSecr=0 WS=128
>>>  287 2023-10-03 10:18:13.466812 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 957 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  289 2023-10-03 10:18:16.509229 192.168.1.134 → 192.168.1.106 TCP 74
>>> 695 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081672841 TSecr=0 WS=128
>>>  290 2023-10-03 10:18:16.509845 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 695 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  294 2023-10-03 10:18:19.551062 192.168.1.134 → 192.168.1.106 TCP 74
>>> 940 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081675881 TSecr=0 WS=128
>>>  295 2023-10-03 10:18:19.551434 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 940 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  300 2023-10-03 10:18:22.590380 192.168.1.134 → 192.168.1.106 TCP 74
>>> 810 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081678921 TSecr=0
>>> WS=128
>>>  301 2023-10-03 10:18:22.590726 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 810 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  308 2023-10-03 10:18:25.628256 192.168.1.134 → 192.168.1.106 TCP 74
>>> 877 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081681961 TSecr=0 WS=128
>>>  309 2023-10-03 10:18:25.628724 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 877 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  312 2023-10-03 10:18:28.665682 192.168.1.134 → 192.168.1.106 TCP 74
>>> 934 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081685001 TSecr=0 WS=128
>>>  313 2023-10-03 10:18:28.666374 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 934 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  320 2023-10-03 10:18:31.702236 192.168.1.134 → 192.168.1.106 TCP 74
>>> 803 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081688040 TSecr=0 WS=128
>>>  321 2023-10-03 10:18:31.702490 192.168.1.106 → 192.168.1.134 TCP 74
>>> 2049 → 803 [SYN, ACK] Seq=0 Ack=1 Win=31856 Len=0 MSS=1460 SACK_PERM
>>> TSval=1993141756 TSecr=3081688040 WS=128
>>>  322 2023-10-03 10:18:31.702729 192.168.1.134 → 192.168.1.106 TCP 66
>>> 803 → 2049 [ACK] Seq=1 Ack=1 Win=32128 Len=0 TSval=3081688040
>>> TSecr=1993141756
>>>  323 2023-10-03 10:18:31.702737 192.168.1.134 → 192.168.1.106 NFS 238
>>> V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
>>>  324 2023-10-03 10:18:31.702893 192.168.1.106 → 192.168.1.134 TCP 66
>>> 2049 → 803 [ACK] Seq=1 Ack=173 Win=31872 Len=0 TSval=1993141756
>>> TSecr=3081688040
>>>  749 2023-10-03 10:19:01.880214 192.168.1.106 → 192.168.1.134 NFS 206
>>> V3 WRITE Reply (Call In 323) Len: 4 FILE_SYNC
>>> 
>>> This is the same without the patch. Port is successfully reused.
>>> Replay cache OK here not above.
>>> 
>>>   76 2023-10-03 10:17:04.285731 192.168.1.134 → 192.168.1.106 NFS 238
>>> V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
>>>   77 2023-10-03 10:17:04.328371 192.168.1.106 → 192.168.1.134 TCP 66
>>> 2049 → 909 [ACK] Seq=1113 Ack=1501 Win=31872 Len=0 TSval=3542359002
>>> TSecr=3081600630
>>>  256 2023-10-03 10:18:04.341041 192.168.1.134 → 192.168.1.106 TCP 66
>>> [TCP Keep-Alive] 909 → 2049 [ACK] Seq=1500 Ack=1113 Win=32000 Len=0
>>> TSval=3081660681 TSecr=3542359002
>>>  259 2023-10-03 10:18:04.341500 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 909 [RST] Seq=1113 Win=0 Len=0
>>>  260 2023-10-03 10:18:04.341860 192.168.1.134 → 192.168.1.106 TCP 74
>>> [TCP Port numbers reused] 909 → 2049 [SYN] Seq=0 Win=32120 Len=0
>>> MSS=1460 SACK_PERM TSval=3081660681 TSecr=0 WS=128
>>>  261 2023-10-03 10:18:04.342031 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 909 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  266 2023-10-03 10:18:07.380801 192.168.1.134 → 192.168.1.106 TCP 74
>>> 954 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081663720 TSecr=0 WS=128
>>>  267 2023-10-03 10:18:07.380971 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 954 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  275 2023-10-03 10:18:10.423352 192.168.1.134 → 192.168.1.106 TCP 74
>>> 856 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081666760 TSecr=0 WS=128
>>>  276 2023-10-03 10:18:10.423621 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 856 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  286 2023-10-03 10:18:13.466277 192.168.1.134 → 192.168.1.106 TCP 74
>>> 957 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081669801 TSecr=0 WS=128
>>>  287 2023-10-03 10:18:13.466812 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 957 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  289 2023-10-03 10:18:16.509229 192.168.1.134 → 192.168.1.106 TCP 74
>>> 695 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081672841 TSecr=0 WS=128
>>>  290 2023-10-03 10:18:16.509845 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 695 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  294 2023-10-03 10:18:19.551062 192.168.1.134 → 192.168.1.106 TCP 74
>>> 940 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081675881 TSecr=0 WS=128
>>>  295 2023-10-03 10:18:19.551434 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 940 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  300 2023-10-03 10:18:22.590380 192.168.1.134 → 192.168.1.106 TCP 74
>>> 810 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081678921 TSecr=0 WS=128
>>>  301 2023-10-03 10:18:22.590726 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 810 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  308 2023-10-03 10:18:25.628256 192.168.1.134 → 192.168.1.106 TCP 74
>>> 877 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081681961 TSecr=0 WS=128
>>>  309 2023-10-03 10:18:25.628724 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 877 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  312 2023-10-03 10:18:28.665682 192.168.1.134 → 192.168.1.106 TCP 74
>>> 934 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081685001 TSecr=0 WS=128
>>>  313 2023-10-03 10:18:28.666374 192.168.1.106 → 192.168.1.134 TCP 54
>>> 2049 → 934 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
>>>  320 2023-10-03 10:18:31.702236 192.168.1.134 → 192.168.1.106 TCP 74
>>> 803 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
>>> TSval=3081688040 TSecr=0 WS=128
>>>  321 2023-10-03 10:18:31.702490 192.168.1.106 → 192.168.1.134 TCP 74
>>> 2049 → 803 [SYN, ACK] Seq=0 Ack=1 Win=31856 Len=0 MSS=1460 SACK_PERM
>>> TSval=1993141756 TSecr=3081688040 WS=128
>>>  322 2023-10-03 10:18:31.702729 192.168.1.134 → 192.168.1.106 TCP 66
>>> 803 → 2049 [ACK] Seq=1 Ack=1 Win=32128 Len=0 TSval=3081688040
>>> TSecr=1993141756
>>>  323 2023-10-03 10:18:31.702737 192.168.1.134 → 192.168.1.106 NFS 238
>>> V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
>>>  324 2023-10-03 10:18:31.702893 192.168.1.106 → 192.168.1.134 TCP 66
>>> 2049 → 803 [ACK] Seq=1 Ack=173 Win=31872 Len=0 TSval=1993141756
>>> TSecr=3081688040
>>>  749 2023-10-03 10:19:01.880214 192.168.1.106 → 192.168.1.134 NFS 206
>>> V3 WRITE Reply (Call In 323) Len: 4 FILE_SYNC
>>>  750 2023-10-03 10:19:01.880616 192.168.1.134 → 192.168.1.106 TCP 66
>>> 803 → 2049 [ACK] Seq=173 Ack=141 Win=32000 Len=0 TSval=3081718241
>>> TSecr=1993171927
>>> 
>>> 
>>>> SYNs. If the server is not replying to our SYN packets, then the TCP
>>>> layer will back off and retransmit. So there is already a backoff-retry
>>>> happening at that level.
>>>> 
>>>>> 
>>>>>> 
>>>>>>> 
>>>>>>>> 
>>>>>>>> Signed-off-by: Trond Myklebust
>>>>>>>> <trond.myklebust@hammerspace.com>
>>>>>>>> ---
>>>>>>>> net/sunrpc/xprtsock.c | 10 +++++++++-
>>>>>>>> 1 file changed, 9 insertions(+), 1 deletion(-)
>>>>>>>> 
>>>>>>>> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
>>>>>>>> index 71848ab90d13..1a96777f0ed5 100644
>>>>>>>> --- a/net/sunrpc/xprtsock.c
>>>>>>>> +++ b/net/sunrpc/xprtsock.c
>>>>>>>> @@ -62,6 +62,7 @@
>>>>>>>> #include "sunrpc.h"
>>>>>>>> 
>>>>>>>> static void xs_close(struct rpc_xprt *xprt);
>>>>>>>> +static void xs_reset_srcport(struct sock_xprt *transport);
>>>>>>>> static void xs_set_srcport(struct sock_xprt *transport, struct
>>>>>>>> socket *sock);
>>>>>>>> static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
>>>>>>>>                struct socket *sock);
>>>>>>>> @@ -1565,8 +1566,10 @@ static void xs_tcp_state_change(struct
>>>>>>>> sock
>>>>>>>> *sk)
>>>>>>>>                break;
>>>>>>>>        case TCP_CLOSE:
>>>>>>>>                if (test_and_clear_bit(XPRT_SOCK_CONNECTING,
>>>>>>>> -                                       &transport-
>>>>>>>>> sock_state))
>>>>>>>> +                                      &transport->sock_state))
>>>>>>>> {
>>>>>>>> +                       xs_reset_srcport(transport);
>>>>>>>>                        xprt_clear_connecting(xprt);
>>>>>>>> +               }
>>>>>>>>                clear_bit(XPRT_CLOSING, &xprt->state);
>>>>>>>>                /* Trigger the socket release */
>>>>>>>>                xs_run_error_worker(transport,
>>>>>>>> XPRT_SOCK_WAKE_DISCONNECT);
>>>>>>>> @@ -1722,6 +1725,11 @@ static void xs_set_port(struct rpc_xprt
>>>>>>>> *xprt, unsigned short port)
>>>>>>>>        xs_update_peer_port(xprt);
>>>>>>>> }
>>>>>>>> 
>>>>>>>> +static void xs_reset_srcport(struct sock_xprt *transport)
>>>>>>>> +{
>>>>>>>> +       transport->srcport = 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> static void xs_set_srcport(struct sock_xprt *transport, struct
>>>>>>>> socket *sock)
>>>>>>>> {
>>>>>>>>        if (transport->srcport == 0 && transport-
>>>>>>>>> xprt.reuseport)
>>>>>>>> --
>>>>>>>> 2.41.0
>>>>>>>> 
>>>>>> 
>>>>>> --
>>>>>> Trond Myklebust Linux NFS client maintainer, Hammerspace
>>>>>> trond.myklebust@hammerspace.com
>>>> 
>>>> --
>>>> Trond Myklebust
>>>> Linux NFS client maintainer, Hammerspace
>>>> trond.myklebust@hammerspace.com
>>>> 
>>>> 
>> 
>> --
>> Jeff Layton <jlayton@kernel.org>


--
Chuck Lever
Olga Kornievskaia Oct. 3, 2023, 3:54 p.m. UTC | #9
On Tue, Oct 3, 2023 at 11:32 AM Chuck Lever III <chuck.lever@oracle.com> wrote:
>
>
>
> > On Oct 3, 2023, at 11:28 AM, Olga Kornievskaia <aglo@umich.edu> wrote:
> >
> > On Tue, Oct 3, 2023 at 11:12 AM Jeff Layton <jlayton@kernel.org> wrote:
> >>
> >> On Tue, 2023-10-03 at 10:44 -0400, Olga Kornievskaia wrote:
> >>> On Sat, Sep 30, 2023 at 7:06 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
> >>>>
> >>>> On Sat, 2023-09-30 at 18:36 -0400, Olga Kornievskaia wrote:
> >>>>> On Fri, Sep 29, 2023 at 10:57 PM Trond Myklebust
> >>>>> <trondmy@hammerspace.com> wrote:
> >>>>>>
> >>>>>> On Thu, 2023-09-28 at 10:58 -0400, Olga Kornievskaia wrote:
> >>>>>>> On Wed, Sep 27, 2023 at 3:35 PM <trondmy@kernel.org> wrote:
> >>>>>>>>
> >>>>>>>> From: Trond Myklebust <trond.myklebust@hammerspace.com>
> >>>>>>>>
> >>>>>>>> If the TCP connection attempt fails without ever establishing a
> >>>>>>>> connection, then assume the problem may be the server is
> >>>>>>>> rejecting
> >>>>>>>> us
> >>>>>>>> due to port reuse.
> >>>>>>>
> >>>>>>> Doesn't this break 4.0 replay cache? Seems too general to assume
> >>>>>>> that
> >>>>>>> any unsuccessful SYN was due to a server reboot and it's ok for
> >>>>>>> the
> >>>>>>> client to change the port.
> >>>>>>
> >>>>>> This is where things get interesting. Yes, if we change the port
> >>>>>> number, then it will almost certainly break NFSv3 and NFSv4.0
> >>>>>> replay
> >>>>>> caching on the server.
> >>>>>>
> >>>>>> However the problem is that once we get stuck in the situation
> >>>>>> where we
> >>>>>> cannot connect, then each new connection attempt is just causing
> >>>>>> the
> >>>>>> server's TCP layer to push back and recall that the connection from
> >>>>>> this port was closed.
> >>>>>> IOW: the problem is that once we're in this situation, we cannot
> >>>>>> easily
> >>>>>> exit without doing one of the following. Either we have to
> >>>>>>
> >>>>>>   1. Change the port number, so that the TCP layer allows us to
> >>>>>>      connect.
> >>>>>>   2. Or.. Wait for long enough that the TCP layer has forgotten
> >>>>>>      altogether about the previous connection.
> >>>>>>
> >>>>>> The problem is that option (2) is subject to livelock, and so has a
> >>>>>> potential infinite time out. I've seen this livelock in action, and
> >>>>>> I'm
> >>>>>> not seeing a solution that has predictable results.
> >>>>>>
> >>>>>> So unless there is a solution for the problems in (2), I don't see
> >>>>>> how
> >>>>>> we can avoid defaulting to option (1) at some point, in which case
> >>>>>> the
> >>>>>> only question is "when do we switch ports?".
> >>>>>
> >>>>> I'm not sure how one can justify that regression that will come out
> >>>>> of
> >>>>> #1 will be less of a problem then the problem in #2.
> >>>>>
> >>>>> I think I'm still not grasping why the NFS server would
> >>>>> (legitimately)
> >>>>> be closing a connection that is re-using the port. Can you present a
> >>>>> sequence of events that would lead to this?
> >>>>>
> >>>>
> >>>> Yes. It is essentially the problem described in this blog:
> >>>> https://blog.davidvassallo.me/2010/07/13/time_wait-and-port-reuse/
> >>>>
> >>>> ...and as you can see, it is nothing to do with NFS. This is the TCP
> >>>> protocol working as expected.
> >>>
> >>> What I'm seeing are statements that RFC allows for/provides guidance
> >>> on how to transition out of TIME_WAIT state. I'm also hearing that the
> >>> reasons that the server can't allow for port reuse is due to broken
> >>> client implementation or use of (broken?) NAT implementation.
> >>>
> >>> I don't see how any of this justifies allowing a regression in the
> >>> linux client code. I'm clearly missing something. How are you possibly
> >>> OK with breaking the reply cache?
> >>>
> >>
> >> Is it really breaking things though if you can't connect otherwise? Bear
> >> in mind that if you're dealing with NAT'ed setup, and you wait until the
> >> connection is completely forgotten, then the NAT'ing firewall is likely
> >> to change your source port anyway.
> >>
> >> Chuck brought up an interesting question privately: should knfsd's
> >> v3/v4.0 DRC start ignoring the source port? We already check this
> >> otherwise:
> >>
> >> - IP addr
> >> - XID
> >> - hash of first 256 bytes of the payload
> >
> > Calculating a hash of every packet has a great performance impact.
>
> NFSD has done this for years. On modern CPUs, it's less of a
> performance hit than walking the DRC hash chain.

Use of the word great has been overstating but the impact is non-zero.
I couldn't convince Netapp to use hashing to solve false_retrys in
4.1.

>
> > But
> > perhaps if we require v3 traffic to do that then we can get v3 and
> > v4.1 performance on the same level and folks would finally switch to
> > v4.1.
> >
> > I also forgot to write that while we don't care about port (not being
> > reused) for 4.1. If we switch the port on every connection
> > establishment we will same way run into port exhaustion. Use of
> > nconnect is becoming common so something like a server reboot on a
> > client machine with about only 10 mounts using nconnect=16 and average
> > of 7 SYNs (as per example here) before the server starts again, the
> > client would use 1K source ports?
> >
> >> That seems like enough discriminators that we could stop comparing the
> >> source port without breaking things.
> >>
> >>>>> But can't we at least arm ourselves in not unnecessarily breaking the
> >>>>> reply cache by at least imposing some timeout/number of retries
> >>>>> before
> >>>>> resetting? If the client was retrying to unsuccessfully re-establish
> >>>>> connection for a (fixed) while, then 4.0 client's lease would expire
> >>>>> and switching the port after the lease expires makes no difference.
> >>>>> There isn't a solution in v3 unfortunately. But a time-based approach
> >>>>> would at least separate these 'peculiar' servers vs normal servers.
> >>>>> And if this is a 4.1 client, we can reset the port without a timeout.
> >>>>>
> >>>>
> >>>> This is not a 'peculiar server' vs 'normal server' problem. The reuse
> >>>> of ports in this way violates the TCP protocol, and has been a problem
> >>>
> >>> I disagree here. Even the RFC quoted by the blogger says that reuse of
> >>> port is allowed.
> >>>
> >>>> for NFS/TCP since the beginning. However, it was never a problem for
> >>>> the older connectionless UDP protocol, which is where the practice of
> >>>> tying the replay cache to the source port began in the first place.
> >>>>
> >>>> NFSv4.1 does not have this problem because it deliberately does not
> >>>> reuse TCP ports, and the reason is precisely to avoid the TIME_WAIT
> >>>> state problems.
> >>>>
> >>>> NFSv3 tries to avoid it by doing an incremental back off, but we
> >>>> recently saw that does not suffice to avoid live lock, after a system
> >>>> got stuck for several hours in this state.
> >>>>
> >>>>> Am I correct that every unsuccessful SYN causes a new source point to
> >>>>> be taken? If so, then a server reboot where multiple SYNs are sent
> >>>>> prior to connection re-establishment (times number of mounts) might
> >>>>> cause source port exhaustion?
> >>>>>
> >>>>
> >>>> No. Not every unsuccessful SYN: It is every unsuccessful sequence of
> >>>
> >>> I disagree. Here's a snippet of the network trace with the proposed
> >>> patch. The port is changed on EVERY unsuccessful SYN.
> >>>
> >>>   76 2023-10-03 10:17:04.285731 192.168.1.134 → 192.168.1.106 NFS 238
> >>> V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
> >>>   77 2023-10-03 10:17:04.328371 192.168.1.106 → 192.168.1.134 TCP 66
> >>> 2049 → 909 [ACK] Seq=1113 Ack=1501 Win=31872 Len=0 TSval=3542359002
> >>> TSecr=3081600630
> >>>  256 2023-10-03 10:18:04.341041 192.168.1.134 → 192.168.1.106 TCP 66
> >>> [TCP Keep-Alive] 909 → 2049 [ACK] Seq=1500 Ack=1113 Win=32000 Len=0
> >>> TSval=3081660681 TSecr=3542359002
> >>>  259 2023-10-03 10:18:04.341500 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 909 [RST] Seq=1113 Win=0 Len=0
> >>>  260 2023-10-03 10:18:04.341860 192.168.1.134 → 192.168.1.106 TCP 74
> >>> [TCP Port numbers reused] 909 → 2049 [SYN] Seq=0 Win=32120 Len=0
> >>> MSS=1460 SACK_PERM TSval=3081660681 TSecr=0 WS=128
> >>>  261 2023-10-03 10:18:04.342031 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 909 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  266 2023-10-03 10:18:07.380801 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 954 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081663720 TSecr=0 WS=128
> >>>  267 2023-10-03 10:18:07.380971 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 954 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  275 2023-10-03 10:18:10.423352 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 856 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081666760 TSecr=0 WS=128
> >>>  276 2023-10-03 10:18:10.423621 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 856 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  286 2023-10-03 10:18:13.466277 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 957 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081669801 TSecr=0 WS=128
> >>>  287 2023-10-03 10:18:13.466812 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 957 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  289 2023-10-03 10:18:16.509229 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 695 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081672841 TSecr=0 WS=128
> >>>  290 2023-10-03 10:18:16.509845 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 695 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  294 2023-10-03 10:18:19.551062 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 940 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081675881 TSecr=0 WS=128
> >>>  295 2023-10-03 10:18:19.551434 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 940 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  300 2023-10-03 10:18:22.590380 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 810 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081678921 TSecr=0
> >>> WS=128
> >>>  301 2023-10-03 10:18:22.590726 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 810 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  308 2023-10-03 10:18:25.628256 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 877 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081681961 TSecr=0 WS=128
> >>>  309 2023-10-03 10:18:25.628724 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 877 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  312 2023-10-03 10:18:28.665682 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 934 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081685001 TSecr=0 WS=128
> >>>  313 2023-10-03 10:18:28.666374 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 934 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  320 2023-10-03 10:18:31.702236 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 803 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081688040 TSecr=0 WS=128
> >>>  321 2023-10-03 10:18:31.702490 192.168.1.106 → 192.168.1.134 TCP 74
> >>> 2049 → 803 [SYN, ACK] Seq=0 Ack=1 Win=31856 Len=0 MSS=1460 SACK_PERM
> >>> TSval=1993141756 TSecr=3081688040 WS=128
> >>>  322 2023-10-03 10:18:31.702729 192.168.1.134 → 192.168.1.106 TCP 66
> >>> 803 → 2049 [ACK] Seq=1 Ack=1 Win=32128 Len=0 TSval=3081688040
> >>> TSecr=1993141756
> >>>  323 2023-10-03 10:18:31.702737 192.168.1.134 → 192.168.1.106 NFS 238
> >>> V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
> >>>  324 2023-10-03 10:18:31.702893 192.168.1.106 → 192.168.1.134 TCP 66
> >>> 2049 → 803 [ACK] Seq=1 Ack=173 Win=31872 Len=0 TSval=1993141756
> >>> TSecr=3081688040
> >>>  749 2023-10-03 10:19:01.880214 192.168.1.106 → 192.168.1.134 NFS 206
> >>> V3 WRITE Reply (Call In 323) Len: 4 FILE_SYNC
> >>>
> >>> This is the same without the patch. Port is successfully reused.
> >>> Replay cache OK here not above.
> >>>
> >>>   76 2023-10-03 10:17:04.285731 192.168.1.134 → 192.168.1.106 NFS 238
> >>> V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
> >>>   77 2023-10-03 10:17:04.328371 192.168.1.106 → 192.168.1.134 TCP 66
> >>> 2049 → 909 [ACK] Seq=1113 Ack=1501 Win=31872 Len=0 TSval=3542359002
> >>> TSecr=3081600630
> >>>  256 2023-10-03 10:18:04.341041 192.168.1.134 → 192.168.1.106 TCP 66
> >>> [TCP Keep-Alive] 909 → 2049 [ACK] Seq=1500 Ack=1113 Win=32000 Len=0
> >>> TSval=3081660681 TSecr=3542359002
> >>>  259 2023-10-03 10:18:04.341500 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 909 [RST] Seq=1113 Win=0 Len=0
> >>>  260 2023-10-03 10:18:04.341860 192.168.1.134 → 192.168.1.106 TCP 74
> >>> [TCP Port numbers reused] 909 → 2049 [SYN] Seq=0 Win=32120 Len=0
> >>> MSS=1460 SACK_PERM TSval=3081660681 TSecr=0 WS=128
> >>>  261 2023-10-03 10:18:04.342031 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 909 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  266 2023-10-03 10:18:07.380801 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 954 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081663720 TSecr=0 WS=128
> >>>  267 2023-10-03 10:18:07.380971 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 954 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  275 2023-10-03 10:18:10.423352 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 856 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081666760 TSecr=0 WS=128
> >>>  276 2023-10-03 10:18:10.423621 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 856 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  286 2023-10-03 10:18:13.466277 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 957 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081669801 TSecr=0 WS=128
> >>>  287 2023-10-03 10:18:13.466812 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 957 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  289 2023-10-03 10:18:16.509229 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 695 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081672841 TSecr=0 WS=128
> >>>  290 2023-10-03 10:18:16.509845 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 695 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  294 2023-10-03 10:18:19.551062 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 940 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081675881 TSecr=0 WS=128
> >>>  295 2023-10-03 10:18:19.551434 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 940 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  300 2023-10-03 10:18:22.590380 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 810 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081678921 TSecr=0 WS=128
> >>>  301 2023-10-03 10:18:22.590726 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 810 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  308 2023-10-03 10:18:25.628256 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 877 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081681961 TSecr=0 WS=128
> >>>  309 2023-10-03 10:18:25.628724 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 877 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  312 2023-10-03 10:18:28.665682 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 934 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081685001 TSecr=0 WS=128
> >>>  313 2023-10-03 10:18:28.666374 192.168.1.106 → 192.168.1.134 TCP 54
> >>> 2049 → 934 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> >>>  320 2023-10-03 10:18:31.702236 192.168.1.134 → 192.168.1.106 TCP 74
> >>> 803 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> >>> TSval=3081688040 TSecr=0 WS=128
> >>>  321 2023-10-03 10:18:31.702490 192.168.1.106 → 192.168.1.134 TCP 74
> >>> 2049 → 803 [SYN, ACK] Seq=0 Ack=1 Win=31856 Len=0 MSS=1460 SACK_PERM
> >>> TSval=1993141756 TSecr=3081688040 WS=128
> >>>  322 2023-10-03 10:18:31.702729 192.168.1.134 → 192.168.1.106 TCP 66
> >>> 803 → 2049 [ACK] Seq=1 Ack=1 Win=32128 Len=0 TSval=3081688040
> >>> TSecr=1993141756
> >>>  323 2023-10-03 10:18:31.702737 192.168.1.134 → 192.168.1.106 NFS 238
> >>> V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
> >>>  324 2023-10-03 10:18:31.702893 192.168.1.106 → 192.168.1.134 TCP 66
> >>> 2049 → 803 [ACK] Seq=1 Ack=173 Win=31872 Len=0 TSval=1993141756
> >>> TSecr=3081688040
> >>>  749 2023-10-03 10:19:01.880214 192.168.1.106 → 192.168.1.134 NFS 206
> >>> V3 WRITE Reply (Call In 323) Len: 4 FILE_SYNC
> >>>  750 2023-10-03 10:19:01.880616 192.168.1.134 → 192.168.1.106 TCP 66
> >>> 803 → 2049 [ACK] Seq=173 Ack=141 Win=32000 Len=0 TSval=3081718241
> >>> TSecr=1993171927
> >>>
> >>>
> >>>> SYNs. If the server is not replying to our SYN packets, then the TCP
> >>>> layer will back off and retransmit. So there is already a backoff-retry
> >>>> happening at that level.
> >>>>
> >>>>>
> >>>>>>
> >>>>>>>
> >>>>>>>>
> >>>>>>>> Signed-off-by: Trond Myklebust
> >>>>>>>> <trond.myklebust@hammerspace.com>
> >>>>>>>> ---
> >>>>>>>> net/sunrpc/xprtsock.c | 10 +++++++++-
> >>>>>>>> 1 file changed, 9 insertions(+), 1 deletion(-)
> >>>>>>>>
> >>>>>>>> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> >>>>>>>> index 71848ab90d13..1a96777f0ed5 100644
> >>>>>>>> --- a/net/sunrpc/xprtsock.c
> >>>>>>>> +++ b/net/sunrpc/xprtsock.c
> >>>>>>>> @@ -62,6 +62,7 @@
> >>>>>>>> #include "sunrpc.h"
> >>>>>>>>
> >>>>>>>> static void xs_close(struct rpc_xprt *xprt);
> >>>>>>>> +static void xs_reset_srcport(struct sock_xprt *transport);
> >>>>>>>> static void xs_set_srcport(struct sock_xprt *transport, struct
> >>>>>>>> socket *sock);
> >>>>>>>> static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
> >>>>>>>>                struct socket *sock);
> >>>>>>>> @@ -1565,8 +1566,10 @@ static void xs_tcp_state_change(struct
> >>>>>>>> sock
> >>>>>>>> *sk)
> >>>>>>>>                break;
> >>>>>>>>        case TCP_CLOSE:
> >>>>>>>>                if (test_and_clear_bit(XPRT_SOCK_CONNECTING,
> >>>>>>>> -                                       &transport-
> >>>>>>>>> sock_state))
> >>>>>>>> +                                      &transport->sock_state))
> >>>>>>>> {
> >>>>>>>> +                       xs_reset_srcport(transport);
> >>>>>>>>                        xprt_clear_connecting(xprt);
> >>>>>>>> +               }
> >>>>>>>>                clear_bit(XPRT_CLOSING, &xprt->state);
> >>>>>>>>                /* Trigger the socket release */
> >>>>>>>>                xs_run_error_worker(transport,
> >>>>>>>> XPRT_SOCK_WAKE_DISCONNECT);
> >>>>>>>> @@ -1722,6 +1725,11 @@ static void xs_set_port(struct rpc_xprt
> >>>>>>>> *xprt, unsigned short port)
> >>>>>>>>        xs_update_peer_port(xprt);
> >>>>>>>> }
> >>>>>>>>
> >>>>>>>> +static void xs_reset_srcport(struct sock_xprt *transport)
> >>>>>>>> +{
> >>>>>>>> +       transport->srcport = 0;
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> static void xs_set_srcport(struct sock_xprt *transport, struct
> >>>>>>>> socket *sock)
> >>>>>>>> {
> >>>>>>>>        if (transport->srcport == 0 && transport-
> >>>>>>>>> xprt.reuseport)
> >>>>>>>> --
> >>>>>>>> 2.41.0
> >>>>>>>>
> >>>>>>
> >>>>>> --
> >>>>>> Trond Myklebust Linux NFS client maintainer, Hammerspace
> >>>>>> trond.myklebust@hammerspace.com
> >>>>
> >>>> --
> >>>> Trond Myklebust
> >>>> Linux NFS client maintainer, Hammerspace
> >>>> trond.myklebust@hammerspace.com
> >>>>
> >>>>
> >>
> >> --
> >> Jeff Layton <jlayton@kernel.org>
>
>
> --
> Chuck Lever
>
>
Jeff Layton Oct. 3, 2023, 3:58 p.m. UTC | #10
On Tue, 2023-10-03 at 15:32 +0000, Chuck Lever III wrote:
> 
> > On Oct 3, 2023, at 11:28 AM, Olga Kornievskaia <aglo@umich.edu> wrote:
> > 
> > On Tue, Oct 3, 2023 at 11:12 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > 
> > > On Tue, 2023-10-03 at 10:44 -0400, Olga Kornievskaia wrote:
> > > > On Sat, Sep 30, 2023 at 7:06 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
> > > > > 
> > > > > On Sat, 2023-09-30 at 18:36 -0400, Olga Kornievskaia wrote:
> > > > > > On Fri, Sep 29, 2023 at 10:57 PM Trond Myklebust
> > > > > > <trondmy@hammerspace.com> wrote:
> > > > > > > 
> > > > > > > On Thu, 2023-09-28 at 10:58 -0400, Olga Kornievskaia wrote:
> > > > > > > > On Wed, Sep 27, 2023 at 3:35 PM <trondmy@kernel.org> wrote:
> > > > > > > > > 
> > > > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > > > > > > 
> > > > > > > > > If the TCP connection attempt fails without ever establishing a
> > > > > > > > > connection, then assume the problem may be the server is
> > > > > > > > > rejecting
> > > > > > > > > us
> > > > > > > > > due to port reuse.
> > > > > > > > 
> > > > > > > > Doesn't this break 4.0 replay cache? Seems too general to assume
> > > > > > > > that
> > > > > > > > any unsuccessful SYN was due to a server reboot and it's ok for
> > > > > > > > the
> > > > > > > > client to change the port.
> > > > > > > 
> > > > > > > This is where things get interesting. Yes, if we change the port
> > > > > > > number, then it will almost certainly break NFSv3 and NFSv4.0
> > > > > > > replay
> > > > > > > caching on the server.
> > > > > > > 
> > > > > > > However the problem is that once we get stuck in the situation
> > > > > > > where we
> > > > > > > cannot connect, then each new connection attempt is just causing
> > > > > > > the
> > > > > > > server's TCP layer to push back and recall that the connection from
> > > > > > > this port was closed.
> > > > > > > IOW: the problem is that once we're in this situation, we cannot
> > > > > > > easily
> > > > > > > exit without doing one of the following. Either we have to
> > > > > > > 
> > > > > > >   1. Change the port number, so that the TCP layer allows us to
> > > > > > >      connect.
> > > > > > >   2. Or.. Wait for long enough that the TCP layer has forgotten
> > > > > > >      altogether about the previous connection.
> > > > > > > 
> > > > > > > The problem is that option (2) is subject to livelock, and so has a
> > > > > > > potential infinite time out. I've seen this livelock in action, and
> > > > > > > I'm
> > > > > > > not seeing a solution that has predictable results.
> > > > > > > 
> > > > > > > So unless there is a solution for the problems in (2), I don't see
> > > > > > > how
> > > > > > > we can avoid defaulting to option (1) at some point, in which case
> > > > > > > the
> > > > > > > only question is "when do we switch ports?".
> > > > > > 
> > > > > > I'm not sure how one can justify that regression that will come out
> > > > > > of
> > > > > > #1 will be less of a problem then the problem in #2.
> > > > > > 
> > > > > > I think I'm still not grasping why the NFS server would
> > > > > > (legitimately)
> > > > > > be closing a connection that is re-using the port. Can you present a
> > > > > > sequence of events that would lead to this?
> > > > > > 
> > > > > 
> > > > > Yes. It is essentially the problem described in this blog:
> > > > > https://blog.davidvassallo.me/2010/07/13/time_wait-and-port-reuse/
> > > > > 
> > > > > ...and as you can see, it is nothing to do with NFS. This is the TCP
> > > > > protocol working as expected.
> > > > 
> > > > What I'm seeing are statements that RFC allows for/provides guidance
> > > > on how to transition out of TIME_WAIT state. I'm also hearing that the
> > > > reasons that the server can't allow for port reuse is due to broken
> > > > client implementation or use of (broken?) NAT implementation.
> > > > 
> > > > I don't see how any of this justifies allowing a regression in the
> > > > linux client code. I'm clearly missing something. How are you possibly
> > > > OK with breaking the reply cache?
> > > > 
> > > 
> > > Is it really breaking things though if you can't connect otherwise? Bear
> > > in mind that if you're dealing with NAT'ed setup, and you wait until the
> > > connection is completely forgotten, then the NAT'ing firewall is likely
> > > to change your source port anyway.
> > > 
> > > Chuck brought up an interesting question privately: should knfsd's
> > > v3/v4.0 DRC start ignoring the source port? We already check this
> > > otherwise:
> > > 
> > > - IP addr
> > > - XID
> > > - hash of first 256 bytes of the payload
> > 
> > Calculating a hash of every packet has a great performance impact.
> 
> NFSD has done this for years. On modern CPUs, it's less of a
> performance hit than walking the DRC hash chain.
> 
> 

Yes, and calling it a hash is probably overstating it a bit...

It's doing a CRC over those bytes, which is fine for our purposes and
quite fast on modern hw.

> > But
> > perhaps if we require v3 traffic to do that then we can get v3 and
> > v4.1 performance on the same level and folks would finally switch to
> > v4.1.
> > 
> > I also forgot to write that while we don't care about port (not being
> > reused) for 4.1. If we switch the port on every connection
> > establishment we will same way run into port exhaustion. Use of
> > nconnect is becoming common so something like a server reboot on a
> > client machine with about only 10 mounts using nconnect=16 and average
> > of 7 SYNs (as per example here) before the server starts again, the
> > client would use 1K source ports?
> > 
> > > That seems like enough discriminators that we could stop comparing the
> > > source port without breaking things.
> > > 
> > > > > > But can't we at least arm ourselves in not unnecessarily breaking the
> > > > > > reply cache by at least imposing some timeout/number of retries
> > > > > > before
> > > > > > resetting? If the client was retrying to unsuccessfully re-establish
> > > > > > connection for a (fixed) while, then 4.0 client's lease would expire
> > > > > > and switching the port after the lease expires makes no difference.
> > > > > > There isn't a solution in v3 unfortunately. But a time-based approach
> > > > > > would at least separate these 'peculiar' servers vs normal servers.
> > > > > > And if this is a 4.1 client, we can reset the port without a timeout.
> > > > > > 
> > > > > 
> > > > > This is not a 'peculiar server' vs 'normal server' problem. The reuse
> > > > > of ports in this way violates the TCP protocol, and has been a problem
> > > > 
> > > > I disagree here. Even the RFC quoted by the blogger says that reuse of
> > > > port is allowed.
> > > > 
> > > > > for NFS/TCP since the beginning. However, it was never a problem for
> > > > > the older connectionless UDP protocol, which is where the practice of
> > > > > tying the replay cache to the source port began in the first place.
> > > > > 
> > > > > NFSv4.1 does not have this problem because it deliberately does not
> > > > > reuse TCP ports, and the reason is precisely to avoid the TIME_WAIT
> > > > > state problems.
> > > > > 
> > > > > NFSv3 tries to avoid it by doing an incremental back off, but we
> > > > > recently saw that does not suffice to avoid live lock, after a system
> > > > > got stuck for several hours in this state.
> > > > > 
> > > > > > Am I correct that every unsuccessful SYN causes a new source point to
> > > > > > be taken? If so, then a server reboot where multiple SYNs are sent
> > > > > > prior to connection re-establishment (times number of mounts) might
> > > > > > cause source port exhaustion?
> > > > > > 
> > > > > 
> > > > > No. Not every unsuccessful SYN: It is every unsuccessful sequence of
> > > > 
> > > > I disagree. Here's a snippet of the network trace with the proposed
> > > > patch. The port is changed on EVERY unsuccessful SYN.
> > > > 
> > > >   76 2023-10-03 10:17:04.285731 192.168.1.134 → 192.168.1.106 NFS 238
> > > > V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
> > > >   77 2023-10-03 10:17:04.328371 192.168.1.106 → 192.168.1.134 TCP 66
> > > > 2049 → 909 [ACK] Seq=1113 Ack=1501 Win=31872 Len=0 TSval=3542359002
> > > > TSecr=3081600630
> > > >  256 2023-10-03 10:18:04.341041 192.168.1.134 → 192.168.1.106 TCP 66
> > > > [TCP Keep-Alive] 909 → 2049 [ACK] Seq=1500 Ack=1113 Win=32000 Len=0
> > > > TSval=3081660681 TSecr=3542359002
> > > >  259 2023-10-03 10:18:04.341500 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 909 [RST] Seq=1113 Win=0 Len=0
> > > >  260 2023-10-03 10:18:04.341860 192.168.1.134 → 192.168.1.106 TCP 74
> > > > [TCP Port numbers reused] 909 → 2049 [SYN] Seq=0 Win=32120 Len=0
> > > > MSS=1460 SACK_PERM TSval=3081660681 TSecr=0 WS=128
> > > >  261 2023-10-03 10:18:04.342031 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 909 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  266 2023-10-03 10:18:07.380801 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 954 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081663720 TSecr=0 WS=128
> > > >  267 2023-10-03 10:18:07.380971 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 954 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  275 2023-10-03 10:18:10.423352 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 856 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081666760 TSecr=0 WS=128
> > > >  276 2023-10-03 10:18:10.423621 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 856 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  286 2023-10-03 10:18:13.466277 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 957 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081669801 TSecr=0 WS=128
> > > >  287 2023-10-03 10:18:13.466812 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 957 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  289 2023-10-03 10:18:16.509229 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 695 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081672841 TSecr=0 WS=128
> > > >  290 2023-10-03 10:18:16.509845 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 695 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  294 2023-10-03 10:18:19.551062 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 940 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081675881 TSecr=0 WS=128
> > > >  295 2023-10-03 10:18:19.551434 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 940 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  300 2023-10-03 10:18:22.590380 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 810 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081678921 TSecr=0
> > > > WS=128
> > > >  301 2023-10-03 10:18:22.590726 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 810 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  308 2023-10-03 10:18:25.628256 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 877 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081681961 TSecr=0 WS=128
> > > >  309 2023-10-03 10:18:25.628724 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 877 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  312 2023-10-03 10:18:28.665682 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 934 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081685001 TSecr=0 WS=128
> > > >  313 2023-10-03 10:18:28.666374 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 934 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  320 2023-10-03 10:18:31.702236 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 803 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081688040 TSecr=0 WS=128
> > > >  321 2023-10-03 10:18:31.702490 192.168.1.106 → 192.168.1.134 TCP 74
> > > > 2049 → 803 [SYN, ACK] Seq=0 Ack=1 Win=31856 Len=0 MSS=1460 SACK_PERM
> > > > TSval=1993141756 TSecr=3081688040 WS=128
> > > >  322 2023-10-03 10:18:31.702729 192.168.1.134 → 192.168.1.106 TCP 66
> > > > 803 → 2049 [ACK] Seq=1 Ack=1 Win=32128 Len=0 TSval=3081688040
> > > > TSecr=1993141756
> > > >  323 2023-10-03 10:18:31.702737 192.168.1.134 → 192.168.1.106 NFS 238
> > > > V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
> > > >  324 2023-10-03 10:18:31.702893 192.168.1.106 → 192.168.1.134 TCP 66
> > > > 2049 → 803 [ACK] Seq=1 Ack=173 Win=31872 Len=0 TSval=1993141756
> > > > TSecr=3081688040
> > > >  749 2023-10-03 10:19:01.880214 192.168.1.106 → 192.168.1.134 NFS 206
> > > > V3 WRITE Reply (Call In 323) Len: 4 FILE_SYNC
> > > > 
> > > > This is the same without the patch. Port is successfully reused.
> > > > Replay cache OK here not above.
> > > > 
> > > >   76 2023-10-03 10:17:04.285731 192.168.1.134 → 192.168.1.106 NFS 238
> > > > V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
> > > >   77 2023-10-03 10:17:04.328371 192.168.1.106 → 192.168.1.134 TCP 66
> > > > 2049 → 909 [ACK] Seq=1113 Ack=1501 Win=31872 Len=0 TSval=3542359002
> > > > TSecr=3081600630
> > > >  256 2023-10-03 10:18:04.341041 192.168.1.134 → 192.168.1.106 TCP 66
> > > > [TCP Keep-Alive] 909 → 2049 [ACK] Seq=1500 Ack=1113 Win=32000 Len=0
> > > > TSval=3081660681 TSecr=3542359002
> > > >  259 2023-10-03 10:18:04.341500 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 909 [RST] Seq=1113 Win=0 Len=0
> > > >  260 2023-10-03 10:18:04.341860 192.168.1.134 → 192.168.1.106 TCP 74
> > > > [TCP Port numbers reused] 909 → 2049 [SYN] Seq=0 Win=32120 Len=0
> > > > MSS=1460 SACK_PERM TSval=3081660681 TSecr=0 WS=128
> > > >  261 2023-10-03 10:18:04.342031 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 909 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  266 2023-10-03 10:18:07.380801 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 954 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081663720 TSecr=0 WS=128
> > > >  267 2023-10-03 10:18:07.380971 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 954 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  275 2023-10-03 10:18:10.423352 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 856 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081666760 TSecr=0 WS=128
> > > >  276 2023-10-03 10:18:10.423621 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 856 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  286 2023-10-03 10:18:13.466277 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 957 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081669801 TSecr=0 WS=128
> > > >  287 2023-10-03 10:18:13.466812 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 957 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  289 2023-10-03 10:18:16.509229 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 695 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081672841 TSecr=0 WS=128
> > > >  290 2023-10-03 10:18:16.509845 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 695 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  294 2023-10-03 10:18:19.551062 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 940 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081675881 TSecr=0 WS=128
> > > >  295 2023-10-03 10:18:19.551434 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 940 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  300 2023-10-03 10:18:22.590380 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 810 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081678921 TSecr=0 WS=128
> > > >  301 2023-10-03 10:18:22.590726 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 810 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  308 2023-10-03 10:18:25.628256 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 877 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081681961 TSecr=0 WS=128
> > > >  309 2023-10-03 10:18:25.628724 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 877 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  312 2023-10-03 10:18:28.665682 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 934 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081685001 TSecr=0 WS=128
> > > >  313 2023-10-03 10:18:28.666374 192.168.1.106 → 192.168.1.134 TCP 54
> > > > 2049 → 934 [RST, ACK] Seq=1 Ack=1 Win=0 Len=0
> > > >  320 2023-10-03 10:18:31.702236 192.168.1.134 → 192.168.1.106 TCP 74
> > > > 803 → 2049 [SYN] Seq=0 Win=32120 Len=0 MSS=1460 SACK_PERM
> > > > TSval=3081688040 TSecr=0 WS=128
> > > >  321 2023-10-03 10:18:31.702490 192.168.1.106 → 192.168.1.134 TCP 74
> > > > 2049 → 803 [SYN, ACK] Seq=0 Ack=1 Win=31856 Len=0 MSS=1460 SACK_PERM
> > > > TSval=1993141756 TSecr=3081688040 WS=128
> > > >  322 2023-10-03 10:18:31.702729 192.168.1.134 → 192.168.1.106 TCP 66
> > > > 803 → 2049 [ACK] Seq=1 Ack=1 Win=32128 Len=0 TSval=3081688040
> > > > TSecr=1993141756
> > > >  323 2023-10-03 10:18:31.702737 192.168.1.134 → 192.168.1.106 NFS 238
> > > > V3 WRITE Call, FH: 0x10bedd7c Offset: 0 Len: 4 FILE_SYNC
> > > >  324 2023-10-03 10:18:31.702893 192.168.1.106 → 192.168.1.134 TCP 66
> > > > 2049 → 803 [ACK] Seq=1 Ack=173 Win=31872 Len=0 TSval=1993141756
> > > > TSecr=3081688040
> > > >  749 2023-10-03 10:19:01.880214 192.168.1.106 → 192.168.1.134 NFS 206
> > > > V3 WRITE Reply (Call In 323) Len: 4 FILE_SYNC
> > > >  750 2023-10-03 10:19:01.880616 192.168.1.134 → 192.168.1.106 TCP 66
> > > > 803 → 2049 [ACK] Seq=173 Ack=141 Win=32000 Len=0 TSval=3081718241
> > > > TSecr=1993171927
> > > > 
> > > > 
> > > > > SYNs. If the server is not replying to our SYN packets, then the TCP
> > > > > layer will back off and retransmit. So there is already a backoff-retry
> > > > > happening at that level.
> > > > > 
> > > > > > 
> > > > > > > 
> > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Trond Myklebust
> > > > > > > > > <trond.myklebust@hammerspace.com>
> > > > > > > > > ---
> > > > > > > > > net/sunrpc/xprtsock.c | 10 +++++++++-
> > > > > > > > > 1 file changed, 9 insertions(+), 1 deletion(-)
> > > > > > > > > 
> > > > > > > > > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > > > > > > > > index 71848ab90d13..1a96777f0ed5 100644
> > > > > > > > > --- a/net/sunrpc/xprtsock.c
> > > > > > > > > +++ b/net/sunrpc/xprtsock.c
> > > > > > > > > @@ -62,6 +62,7 @@
> > > > > > > > > #include "sunrpc.h"
> > > > > > > > > 
> > > > > > > > > static void xs_close(struct rpc_xprt *xprt);
> > > > > > > > > +static void xs_reset_srcport(struct sock_xprt *transport);
> > > > > > > > > static void xs_set_srcport(struct sock_xprt *transport, struct
> > > > > > > > > socket *sock);
> > > > > > > > > static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
> > > > > > > > >                struct socket *sock);
> > > > > > > > > @@ -1565,8 +1566,10 @@ static void xs_tcp_state_change(struct
> > > > > > > > > sock
> > > > > > > > > *sk)
> > > > > > > > >                break;
> > > > > > > > >        case TCP_CLOSE:
> > > > > > > > >                if (test_and_clear_bit(XPRT_SOCK_CONNECTING,
> > > > > > > > > -                                       &transport-
> > > > > > > > > > sock_state))
> > > > > > > > > +                                      &transport->sock_state))
> > > > > > > > > {
> > > > > > > > > +                       xs_reset_srcport(transport);
> > > > > > > > >                        xprt_clear_connecting(xprt);
> > > > > > > > > +               }
> > > > > > > > >                clear_bit(XPRT_CLOSING, &xprt->state);
> > > > > > > > >                /* Trigger the socket release */
> > > > > > > > >                xs_run_error_worker(transport,
> > > > > > > > > XPRT_SOCK_WAKE_DISCONNECT);
> > > > > > > > > @@ -1722,6 +1725,11 @@ static void xs_set_port(struct rpc_xprt
> > > > > > > > > *xprt, unsigned short port)
> > > > > > > > >        xs_update_peer_port(xprt);
> > > > > > > > > }
> > > > > > > > > 
> > > > > > > > > +static void xs_reset_srcport(struct sock_xprt *transport)
> > > > > > > > > +{
> > > > > > > > > +       transport->srcport = 0;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > static void xs_set_srcport(struct sock_xprt *transport, struct
> > > > > > > > > socket *sock)
> > > > > > > > > {
> > > > > > > > >        if (transport->srcport == 0 && transport-
> > > > > > > > > > xprt.reuseport)
> > > > > > > > > --
> > > > > > > > > 2.41.0
> > > > > > > > > 
> > > > > > > 
> > > > > > > --
> > > > > > > Trond Myklebust Linux NFS client maintainer, Hammerspace
> > > > > > > trond.myklebust@hammerspace.com
> > > > > 
> > > > > --
> > > > > Trond Myklebust
> > > > > Linux NFS client maintainer, Hammerspace
> > > > > trond.myklebust@hammerspace.com
> > > > > 
> > > > > 
> > > 
> > > --
> > > Jeff Layton <jlayton@kernel.org>
> 
> 
> --
> Chuck Lever
> 
>
diff mbox series

Patch

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 71848ab90d13..1a96777f0ed5 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -62,6 +62,7 @@ 
 #include "sunrpc.h"
 
 static void xs_close(struct rpc_xprt *xprt);
+static void xs_reset_srcport(struct sock_xprt *transport);
 static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock);
 static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
 		struct socket *sock);
@@ -1565,8 +1566,10 @@  static void xs_tcp_state_change(struct sock *sk)
 		break;
 	case TCP_CLOSE:
 		if (test_and_clear_bit(XPRT_SOCK_CONNECTING,
-					&transport->sock_state))
+				       &transport->sock_state)) {
+			xs_reset_srcport(transport);
 			xprt_clear_connecting(xprt);
+		}
 		clear_bit(XPRT_CLOSING, &xprt->state);
 		/* Trigger the socket release */
 		xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
@@ -1722,6 +1725,11 @@  static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
 	xs_update_peer_port(xprt);
 }
 
+static void xs_reset_srcport(struct sock_xprt *transport)
+{
+	transport->srcport = 0;
+}
+
 static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock)
 {
 	if (transport->srcport == 0 && transport->xprt.reuseport)