diff mbox

[v4,4/4] sockets: Handle race condition between binds to the same port

Message ID 51d7f54d100e9dedecf6dc65691ca65adfc8394f.1498213152.git-series.knut.omang@oracle.com (mailing list archive)
State New, archived
Headers show

Commit Message

Knut Omang June 23, 2017, 10:31 a.m. UTC
If an offset of ports is specified to the inet_listen_saddr() function,
and two or more processes try to bind from these ports at the same time,
occasionally more than one process may be able to bind to the same
port. The condition is detected by listen() but too late to avoid a failure.

This function is called by socket_listen() and used
by all socket listening code in QEMU, so all cases where any form of dynamic
port selection is used should be subject to this issue.

Add code to close and re-establish the socket when this
condition is observed, hiding the race condition from the user.

This has been developed and tested by means of the
test-listen unit test in the previous commit.
Enable the test for make check now that it passes.

Signed-off-by: Knut Omang <knut.omang@oracle.com>
Reviewed-by: Bhavesh Davda <bhavesh.davda@oracle.com>
Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
Reviewed-by: Girish Moodalbail <girish.moodalbail@oracle.com>
---
 tests/Makefile.include |  2 +-
 util/qemu-sockets.c    | 68 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 53 insertions(+), 17 deletions(-)

Comments

Daniel P. Berrangé June 26, 2017, 10:22 a.m. UTC | #1
On Fri, Jun 23, 2017 at 12:31:08PM +0200, Knut Omang wrote:
> If an offset of ports is specified to the inet_listen_saddr function(),
> and two or more processes tries to bind from these ports at the same time,
> occasionally more than one process may be able to bind to the same
> port. The condition is detected by listen() but too late to avoid a failure.
> 
> This function is called by socket_listen() and used
> by all socket listening code in QEMU, so all cases where any form of dynamic
> port selection is used should be subject to this issue.
> 
> Add code to close and re-establish the socket when this
> condition is observed, hiding the race condition from the user.
> 
> This has been developed and tested by means of the
> test-listen unit test in the previous commit.
> Enable the test for make check now that it passes.
> 
> Signed-off-by: Knut Omang <knut.omang@oracle.com>
> Reviewed-by: Bhavesh Davda <bhavesh.davda@oracle.com>
> Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
> Reviewed-by: Girish Moodalbail <girish.moodalbail@oracle.com>
> ---
>  tests/Makefile.include |  2 +-
>  util/qemu-sockets.c    | 68 ++++++++++++++++++++++++++++++++-----------
>  2 files changed, 53 insertions(+), 17 deletions(-)
> 
> diff --git a/tests/Makefile.include b/tests/Makefile.include
> index 22bb97e..c38f94e 100644
> --- a/tests/Makefile.include
> +++ b/tests/Makefile.include
> @@ -127,7 +127,7 @@ check-unit-y += tests/test-bufferiszero$(EXESUF)
>  gcov-files-check-bufferiszero-y = util/bufferiszero.c
>  check-unit-y += tests/test-uuid$(EXESUF)
>  check-unit-y += tests/ptimer-test$(EXESUF)
> -#check-unit-y += tests/test-listen$(EXESUF)
> +check-unit-y += tests/test-listen$(EXESUF)
>  gcov-files-ptimer-test-y = hw/core/ptimer.c
>  check-unit-y += tests/test-qapi-util$(EXESUF)
>  gcov-files-test-qapi-util-y = qapi/qapi-util.c
> diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
> index 48b9319..7b118b4 100644
> --- a/util/qemu-sockets.c
> +++ b/util/qemu-sockets.c
> @@ -201,6 +201,42 @@ static int try_bind(int socket, InetSocketAddress *saddr, struct addrinfo *e)
>  #endif
>  }
>  
> +static int try_bind_listen(int *socket, InetSocketAddress *saddr,
> +                           struct addrinfo *e, int port, Error **errp)
> +{
> +    int s = *socket;
> +    int ret;
> +
> +    inet_setport(e, port);
> +    ret = try_bind(s, saddr, e);
> +    if (ret) {
> +        if (errno != EADDRINUSE) {
> +            error_setg_errno(errp, errno, "Failed to bind socket");
> +        }
> +        return errno;
> +    }
> +    if (listen(s, 1) == 0) {
> +            return 0;
> +    }
> +    if (errno == EADDRINUSE) {
> +        /* We got to bind the socket to a port but someone else managed
> +         * to bind to the same port and beat us to listen on it!
> +         * Recreate the socket and return EADDRINUSE to preserve the
> +         * expected state by the caller:
> +         */
> +        closesocket(s);
> +        s = create_fast_reuse_socket(e, errp);
> +        if (s < 0) {
> +            return errno;
> +        }
> +        *socket = s;

I don't really like this at all - if we need to close + recreate the
socket, IMHO that should remain the job of the caller, since it owns
the socket FD ultimately.

> +        errno = EADDRINUSE;
> +        return errno;
> +    }
> +    error_setg_errno(errp, errno, "Failed to listen on socket");
> +    return errno;
> +}
> +
>  static int inet_listen_saddr(InetSocketAddress *saddr,
>                               int port_offset,
>                               bool update_addr,
> @@ -210,7 +246,9 @@ static int inet_listen_saddr(InetSocketAddress *saddr,
>      char port[33];
>      char uaddr[INET6_ADDRSTRLEN+1];
>      char uport[33];
> -    int slisten, rc, port_min, port_max, p;
> +    int rc, port_min, port_max, p;
> +    int slisten = 0;
> +    int saved_errno = 0;
>      Error *err = NULL;
>  
>      memset(&ai,0, sizeof(ai));
> @@ -276,28 +314,26 @@ static int inet_listen_saddr(InetSocketAddress *saddr,

Just above this line is the original 'create_fast_reuse_socket' call.

I'd suggest that we push that call down into the body of the loop
below:

>          port_min = inet_getport(e);
>          port_max = saddr->has_to ? saddr->to + port_offset : port_min;
>          for (p = port_min; p <= port_max; p++) {
> -            inet_setport(e, p);
> -            if (try_bind(slisten, saddr, e) >= 0) {
> -                goto listen;
> -            }
> -            if (p == port_max) {
> -                if (!e->ai_next) {
> -                    error_setg_errno(errp, errno, "Failed to bind socket");
> -                }
> +            int eno = try_bind_listen(&slisten, saddr, e, p, &err);

Which would mean try_bind_listen no longer needs the magic to close +
recreate the socket.

The only cost of doing this is that you end up closing + recreating the
socket after bind hits EADDRINUSE, as well as after listen() hits it.

I think that's an acceptable tradeoff for simpler code, since this is not
a performance-critical operation.

> +            if (!eno) {
> +                goto listen_ok;
> +            } else if (eno != EADDRINUSE) {
> +                goto listen_failed;
>              }
>          }
> +    }
> +    error_setg_errno(errp, errno, "Failed to find available port");

Regards,
Daniel
Daniel P. Berrangé June 26, 2017, 10:34 a.m. UTC | #2
On Fri, Jun 23, 2017 at 12:31:08PM +0200, Knut Omang wrote:
> If an offset of ports is specified to the inet_listen_saddr function(),
> and two or more processes tries to bind from these ports at the same time,
> occasionally more than one process may be able to bind to the same
> port. The condition is detected by listen() but too late to avoid a failure.
> 
> This function is called by socket_listen() and used
> by all socket listening code in QEMU, so all cases where any form of dynamic
> port selection is used should be subject to this issue.
> 
> Add code to close and re-establish the socket when this
> condition is observed, hiding the race condition from the user.
> 
> This has been developed and tested by means of the
> test-listen unit test in the previous commit.
> Enable the test for make check now that it passes.
> 
> Signed-off-by: Knut Omang <knut.omang@oracle.com>
> Reviewed-by: Bhavesh Davda <bhavesh.davda@oracle.com>
> Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
> Reviewed-by: Girish Moodalbail <girish.moodalbail@oracle.com>
> ---
>  tests/Makefile.include |  2 +-
>  util/qemu-sockets.c    | 68 ++++++++++++++++++++++++++++++++-----------
>  2 files changed, 53 insertions(+), 17 deletions(-)
> 
> diff --git a/tests/Makefile.include b/tests/Makefile.include
> index 22bb97e..c38f94e 100644
> --- a/tests/Makefile.include
> +++ b/tests/Makefile.include
> @@ -127,7 +127,7 @@ check-unit-y += tests/test-bufferiszero$(EXESUF)
>  gcov-files-check-bufferiszero-y = util/bufferiszero.c
>  check-unit-y += tests/test-uuid$(EXESUF)
>  check-unit-y += tests/ptimer-test$(EXESUF)
> -#check-unit-y += tests/test-listen$(EXESUF)
> +check-unit-y += tests/test-listen$(EXESUF)
>  gcov-files-ptimer-test-y = hw/core/ptimer.c
>  check-unit-y += tests/test-qapi-util$(EXESUF)
>  gcov-files-test-qapi-util-y = qapi/qapi-util.c
> diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
> index 48b9319..7b118b4 100644
> --- a/util/qemu-sockets.c
> +++ b/util/qemu-sockets.c
> @@ -201,6 +201,42 @@ static int try_bind(int socket, InetSocketAddress *saddr, struct addrinfo *e)
>  #endif
>  }
>  
> +static int try_bind_listen(int *socket, InetSocketAddress *saddr,
> +                           struct addrinfo *e, int port, Error **errp)
> +{
> +    int s = *socket;
> +    int ret;
> +
> +    inet_setport(e, port);
> +    ret = try_bind(s, saddr, e);
> +    if (ret) {
> +        if (errno != EADDRINUSE) {
> +            error_setg_errno(errp, errno, "Failed to bind socket");
> +        }
> +        return errno;
> +    }
> +    if (listen(s, 1) == 0) {
> +            return 0;
> +    }
> +    if (errno == EADDRINUSE) {
> +        /* We got to bind the socket to a port but someone else managed
> +         * to bind to the same port and beat us to listen on it!
> +         * Recreate the socket and return EADDRINUSE to preserve the
> +         * expected state by the caller:
> +         */
> +        closesocket(s);
> +        s = create_fast_reuse_socket(e, errp);

This usage scenario for create_fast_reuse_socket() makes its error
reporting behaviour even more wrong. Recall that create_fast_reuse_socket
is reporting an error if e->ai_next is NULL, which is a way of determining
this is the last call to create_fast_reuse_socket in the loop. That
assumption is violated though now that we're calling the method from
inside the inner loop. Even when e->ai_next is NULL, we may be calling
create_fast_reuse_socket many, many times due to the port 'to' range.

> +        if (s < 0) {
> +            return errno;
> +        }
> +        *socket = s;
> +        errno = EADDRINUSE;
> +        return errno;
> +    }
> +    error_setg_errno(errp, errno, "Failed to listen on socket");
> +    return errno;
> +}

This method is both preserving the global errno, and returning the
global errno. The caller expects global errno to be preserved, so
I think we can just return '-1' from this method.

> +
>  static int inet_listen_saddr(InetSocketAddress *saddr,
>                               int port_offset,
>                               bool update_addr,
> @@ -210,7 +246,9 @@ static int inet_listen_saddr(InetSocketAddress *saddr,
>      char port[33];
>      char uaddr[INET6_ADDRSTRLEN+1];
>      char uport[33];
> -    int slisten, rc, port_min, port_max, p;
> +    int rc, port_min, port_max, p;
> +    int slisten = 0;
> +    int saved_errno = 0;
>      Error *err = NULL;
>  
>      memset(&ai,0, sizeof(ai));
> @@ -276,28 +314,26 @@ static int inet_listen_saddr(InetSocketAddress *saddr,
>          port_min = inet_getport(e);
>          port_max = saddr->has_to ? saddr->to + port_offset : port_min;
>          for (p = port_min; p <= port_max; p++) {
> -            inet_setport(e, p);
> -            if (try_bind(slisten, saddr, e) >= 0) {
> -                goto listen;
> -            }
> -            if (p == port_max) {
> -                if (!e->ai_next) {
> -                    error_setg_errno(errp, errno, "Failed to bind socket");
> -                }
> +            int eno = try_bind_listen(&slisten, saddr, e, p, &err);
> +            if (!eno) {
> +                goto listen_ok;
> +            } else if (eno != EADDRINUSE) {
> +                goto listen_failed;
>              }
>          }
> +    }
> +    error_setg_errno(errp, errno, "Failed to find available port");
> +
> +listen_failed:
> +    saved_errno = errno;
> +    if (slisten >= 0) {
>          closesocket(slisten);
>      }
>      freeaddrinfo(res);
> +    errno = saved_errno;
>      return -1;
>  
> -listen:
> -    if (listen(slisten,1) != 0) {
> -        error_setg_errno(errp, errno, "Failed to listen on socket");
> -        closesocket(slisten);
> -        freeaddrinfo(res);
> -        return -1;
> -    }
> +listen_ok:
>      if (update_addr) {
>          g_free(saddr->host);
>          saddr->host = g_strdup(uaddr);
> -- 
> git-series 0.9.1

Regards,
Daniel
Knut Omang June 26, 2017, 12:32 p.m. UTC | #3
On Mon, 2017-06-26 at 11:22 +0100, Daniel P. Berrange wrote:
> On Fri, Jun 23, 2017 at 12:31:08PM +0200, Knut Omang wrote:
> > If an offset of ports is specified to the inet_listen_saddr function(),
> > and two or more processes tries to bind from these ports at the same time,
> > occasionally more than one process may be able to bind to the same
> > port. The condition is detected by listen() but too late to avoid a failure.
> > 
> > This function is called by socket_listen() and used
> > by all socket listening code in QEMU, so all cases where any form of dynamic
> > port selection is used should be subject to this issue.
> > 
> > Add code to close and re-establish the socket when this
> > condition is observed, hiding the race condition from the user.
> > 
> > This has been developed and tested by means of the
> > test-listen unit test in the previous commit.
> > Enable the test for make check now that it passes.
> > 
> > Signed-off-by: Knut Omang <knut.omang@oracle.com>
> > Reviewed-by: Bhavesh Davda <bhavesh.davda@oracle.com>
> > Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
> > Reviewed-by: Girish Moodalbail <girish.moodalbail@oracle.com>
> > ---
> >  tests/Makefile.include |  2 +-
> >  util/qemu-sockets.c    | 68 ++++++++++++++++++++++++++++++++-----------
> >  2 files changed, 53 insertions(+), 17 deletions(-)
> > 
> > diff --git a/tests/Makefile.include b/tests/Makefile.include
> > index 22bb97e..c38f94e 100644
> > --- a/tests/Makefile.include
> > +++ b/tests/Makefile.include
> > @@ -127,7 +127,7 @@ check-unit-y += tests/test-bufferiszero$(EXESUF)
> >  gcov-files-check-bufferiszero-y = util/bufferiszero.c
> >  check-unit-y += tests/test-uuid$(EXESUF)
> >  check-unit-y += tests/ptimer-test$(EXESUF)
> > -#check-unit-y += tests/test-listen$(EXESUF)
> > +check-unit-y += tests/test-listen$(EXESUF)
> >  gcov-files-ptimer-test-y = hw/core/ptimer.c
> >  check-unit-y += tests/test-qapi-util$(EXESUF)
> >  gcov-files-test-qapi-util-y = qapi/qapi-util.c
> > diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
> > index 48b9319..7b118b4 100644
> > --- a/util/qemu-sockets.c
> > +++ b/util/qemu-sockets.c
> > @@ -201,6 +201,42 @@ static int try_bind(int socket, InetSocketAddress *saddr, struct
> addrinfo *e)
> >  #endif
> >  }
> >  
> > +static int try_bind_listen(int *socket, InetSocketAddress *saddr,
> > +                           struct addrinfo *e, int port, Error **errp)
> > +{
> > +    int s = *socket;
> > +    int ret;
> > +
> > +    inet_setport(e, port);
> > +    ret = try_bind(s, saddr, e);
> > +    if (ret) {
> > +        if (errno != EADDRINUSE) {
> > +            error_setg_errno(errp, errno, "Failed to bind socket");
> > +        }
> > +        return errno;
> > +    }
> > +    if (listen(s, 1) == 0) {
> > +            return 0;
> > +    }
> > +    if (errno == EADDRINUSE) {
> > +        /* We got to bind the socket to a port but someone else managed
> > +         * to bind to the same port and beat us to listen on it!
> > +         * Recreate the socket and return EADDRINUSE to preserve the
> > +         * expected state by the caller:
> > +         */
> > +        closesocket(s);
> > +        s = create_fast_reuse_socket(e, errp);
> > +        if (s < 0) {
> > +            return errno;
> > +        }
> > +        *socket = s;
> 
> I don't really like this at all - if we need to close + recreate the
> socket, IMHO that should remain the job of the caller, since it owns
> the socket FD ultimately.

Normally I would agree, but this is a very unlikely situation. I considered moving the
complexity out to the caller, even to recreate for every call, but found those solutions
to be inferior as they do not in any way confine the problem, and cause the handling of
the common cases to be much less readable. It's going to be some trade-offs here.

As long as the caller is aware (through the by-reference call) that the socket in use may
change, this is in my view a clean (as clean as possible) abstraction that simplifies the
logic at the next level. My intention is to make the common, good case as readable as
possible and hide some of the complexity of these
unlikely error scenarios inside the new functions - divide and conquer.

> 
> > +        errno = EADDRINUSE;
> > +        return errno;
> > +    }
> > +    error_setg_errno(errp, errno, "Failed to listen on socket");
> > +    return errno;
> > +}
> > +
> >  static int inet_listen_saddr(InetSocketAddress *saddr,
> >                               int port_offset,
> >                               bool update_addr,
> > @@ -210,7 +246,9 @@ static int inet_listen_saddr(InetSocketAddress *saddr,
> >      char port[33];
> >      char uaddr[INET6_ADDRSTRLEN+1];
> >      char uport[33];
> > -    int slisten, rc, port_min, port_max, p;
> > +    int rc, port_min, port_max, p;
> > +    int slisten = 0;
> > +    int saved_errno = 0;
> >      Error *err = NULL;
> >  
> >      memset(&ai,0, sizeof(ai));
> > @@ -276,28 +314,26 @@ static int inet_listen_saddr(InetSocketAddress *saddr,
> 
> Just above this line is the original 'create_fast_reuse_socket' call.
> 
> I'd suggest that we push that call down into the body of the loop
> below:
> 
> >          port_min = inet_getport(e);
> >          port_max = saddr->has_to ? saddr->to + port_offset : port_min;
> >          for (p = port_min; p <= port_max; p++) {
> > -            inet_setport(e, p);
> > -            if (try_bind(slisten, saddr, e) >= 0) {
> > -                goto listen;
> > -            }
> > -            if (p == port_max) {
> > -                if (!e->ai_next) {
> > -                    error_setg_errno(errp, errno, "Failed to bind socket");
> > -                }
> > +            int eno = try_bind_listen(&slisten, saddr, e, p, &err);
> 
> Which would mean try_bind_listen no longer needs the magic to close +
> recreate the socket.
> 
> The only cost of doing this is that you end up closing + recreating the
> socket after bind hits EADDRINUSE, as well as after listen() hits it.

The problem with this approach, in my opinion, is that one has to understand the
fix for the problem I am trying to solve here in order to read the main code,
even though this is a very special case. Everyone reading the code would ask themselves
the question 'why do they recreate the socket here?' and then be forced to read the
details of try_bind_listen anyway, or we would need additional comments.

The idea behind the abstractions I have used here is to hide the details inside functions,
but leave them with as clean an interface as possible (although not ideal), one that
makes the overall logic more readable.

> I think that's acceptable tradeoff for simpler code, since this is not
> a performance critical operation.

Also, should we perhaps worry about any side effects of creating and closing a lot of
sockets unnecessarily?

Thanks,
Knut

> 
> > +            if (!eno) {
> > +                goto listen_ok;
> > +            } else if (eno != EADDRINUSE) {
> > +                goto listen_failed;
> >              }
> >          }
> > +    }
> > +    error_setg_errno(errp, errno, "Failed to find available port");
> 
> Regards,
> Daniel
Daniel P. Berrangé June 26, 2017, 12:49 p.m. UTC | #4
On Mon, Jun 26, 2017 at 02:32:48PM +0200, Knut Omang wrote:
> On Mon, 2017-06-26 at 11:22 +0100, Daniel P. Berrange wrote:
> > On Fri, Jun 23, 2017 at 12:31:08PM +0200, Knut Omang wrote:
> > > If an offset of ports is specified to the inet_listen_saddr function(),
> > > and two or more processes tries to bind from these ports at the same time,
> > > occasionally more than one process may be able to bind to the same
> > > port. The condition is detected by listen() but too late to avoid a failure.
> > > 
> > > This function is called by socket_listen() and used
> > > by all socket listening code in QEMU, so all cases where any form of dynamic
> > > port selection is used should be subject to this issue.
> > > 
> > > Add code to close and re-establish the socket when this
> > > condition is observed, hiding the race condition from the user.
> > > 
> > > This has been developed and tested by means of the
> > > test-listen unit test in the previous commit.
> > > Enable the test for make check now that it passes.
> > > 
> > > Signed-off-by: Knut Omang <knut.omang@oracle.com>
> > > Reviewed-by: Bhavesh Davda <bhavesh.davda@oracle.com>
> > > Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
> > > Reviewed-by: Girish Moodalbail <girish.moodalbail@oracle.com>
> > > ---
> > >  tests/Makefile.include |  2 +-
> > >  util/qemu-sockets.c    | 68 ++++++++++++++++++++++++++++++++-----------
> > >  2 files changed, 53 insertions(+), 17 deletions(-)
> > > 
> > > diff --git a/tests/Makefile.include b/tests/Makefile.include
> > > index 22bb97e..c38f94e 100644
> > > --- a/tests/Makefile.include
> > > +++ b/tests/Makefile.include
> > > @@ -127,7 +127,7 @@ check-unit-y += tests/test-bufferiszero$(EXESUF)
> > >  gcov-files-check-bufferiszero-y = util/bufferiszero.c
> > >  check-unit-y += tests/test-uuid$(EXESUF)
> > >  check-unit-y += tests/ptimer-test$(EXESUF)
> > > -#check-unit-y += tests/test-listen$(EXESUF)
> > > +check-unit-y += tests/test-listen$(EXESUF)
> > >  gcov-files-ptimer-test-y = hw/core/ptimer.c
> > >  check-unit-y += tests/test-qapi-util$(EXESUF)
> > >  gcov-files-test-qapi-util-y = qapi/qapi-util.c
> > > diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
> > > index 48b9319..7b118b4 100644
> > > --- a/util/qemu-sockets.c
> > > +++ b/util/qemu-sockets.c
> > > @@ -201,6 +201,42 @@ static int try_bind(int socket, InetSocketAddress *saddr, struct
> > addrinfo *e)
> > >  #endif
> > >  }
> > >  
> > > +static int try_bind_listen(int *socket, InetSocketAddress *saddr,
> > > +                           struct addrinfo *e, int port, Error **errp)
> > > +{
> > > +    int s = *socket;
> > > +    int ret;
> > > +
> > > +    inet_setport(e, port);
> > > +    ret = try_bind(s, saddr, e);
> > > +    if (ret) {
> > > +        if (errno != EADDRINUSE) {
> > > +            error_setg_errno(errp, errno, "Failed to bind socket");
> > > +        }
> > > +        return errno;
> > > +    }
> > > +    if (listen(s, 1) == 0) {
> > > +            return 0;
> > > +    }
> > > +    if (errno == EADDRINUSE) {
> > > +        /* We got to bind the socket to a port but someone else managed
> > > +         * to bind to the same port and beat us to listen on it!
> > > +         * Recreate the socket and return EADDRINUSE to preserve the
> > > +         * expected state by the caller:
> > > +         */
> > > +        closesocket(s);
> > > +        s = create_fast_reuse_socket(e, errp);
> > > +        if (s < 0) {
> > > +            return errno;
> > > +        }
> > > +        *socket = s;
> > 
> > I don't really like this at all - if we need to close + recreate the
> > socket, IMHO that should remain the job of the caller, since it owns
> > the socket FD ultimately.
> 
> Normally I would agree, but this is a very unlikely situation. I considered moving the
> complexity out to the caller, even to recreate for every call, but found those solutions
> to be inferior as they do not in any way confine the problem, and cause the handling of
> the common cases to be much less readable. It's going to be some trade-offs here.
> 
> As long as the caller is aware of (by the reference call) that the socket in use may
> change, this is in my view a clean (as clean as possible) abstraction that simplifies the
> logic at the next level. My intention is to make the common, good case as readable as
> possible and hide some of the complexity of these 
> unlikely error scenarios inside the new functions - divide and conquer..
> 
> > 
> > > +        errno = EADDRINUSE;
> > > +        return errno;
> > > +    }
> > > +    error_setg_errno(errp, errno, "Failed to listen on socket");
> > > +    return errno;
> > > +}
> > > +
> > >  static int inet_listen_saddr(InetSocketAddress *saddr,
> > >                               int port_offset,
> > >                               bool update_addr,
> > > @@ -210,7 +246,9 @@ static int inet_listen_saddr(InetSocketAddress *saddr,
> > >      char port[33];
> > >      char uaddr[INET6_ADDRSTRLEN+1];
> > >      char uport[33];
> > > -    int slisten, rc, port_min, port_max, p;
> > > +    int rc, port_min, port_max, p;
> > > +    int slisten = 0;
> > > +    int saved_errno = 0;
> > >      Error *err = NULL;
> > >  
> > >      memset(&ai,0, sizeof(ai));
> > > @@ -276,28 +314,26 @@ static int inet_listen_saddr(InetSocketAddress *saddr,
> > 
> > Just above this line is the original 'create_fast_reuse_socket' call.
> > 
> > I'd suggest that we push that call down into the body of the loop
> > below:
> > 
> > >          port_min = inet_getport(e);
> > >          port_max = saddr->has_to ? saddr->to + port_offset : port_min;
> > >          for (p = port_min; p <= port_max; p++) {
> > > -            inet_setport(e, p);
> > > -            if (try_bind(slisten, saddr, e) >= 0) {
> > > -                goto listen;
> > > -            }
> > > -            if (p == port_max) {
> > > -                if (!e->ai_next) {
> > > -                    error_setg_errno(errp, errno, "Failed to bind socket");
> > > -                }
> > > +            int eno = try_bind_listen(&slisten, saddr, e, p, &err);
> > 
> > Which would mean try_bind_listen no longer needs the magic to close +
> > recreate the socket.
> > 
> > The only cost of doing this is that you end up closing + recreating the
> > socket after bind hits EADDRINUSE, as well as after listen() hits it.
> 
> The problem with this approach in my opinion is that one has to understand the
> fix for the problem I am trying to solve here in order to read the main code, 
> even though this is a very special case. Everyone reading the code would ask themselves
> the question 'why do they recreate the socket here?' and then be forced to ready the
> details of try_bind_listen anyway, or we would need additional comments.

That's easily solved by adding a comment

  /* We recreate the socket FD on each iteration because
     if bind succeeds & listen fails, we can't bind
     again on the same socket FD */

> The idea behind the abstractions I have used here is to hide the details inside functions,
> but leave them with an as clean as possible (although not ideal) interface that 
> makes the overall logic more readable.

I think the result is actually harder to understand, because of the
peculiar way the function closes & reopens the socket FD belonging
to the caller, and the error handling is really very unclear and
buggy as a result too.

> > I think that's acceptable tradeoff for simpler code, since this is not
> > a performance critical operation.
> 
> Also should we perhaps worry about any side effects of creating and closing a lot of
> sockets unnecessary?

What side effects? I don't think there are any - since this is server
side, not client side, we're not leaving any state around in timed waits
or similar.

Regards,
Daniel
Knut Omang July 2, 2017, 8:15 a.m. UTC | #5
On Mon, 2017-06-26 at 11:34 +0100, Daniel P. Berrange wrote:
> On Fri, Jun 23, 2017 at 12:31:08PM +0200, Knut Omang wrote:
> > 
> > If an offset of ports is specified to the inet_listen_saddr function(),
> > and two or more processes tries to bind from these ports at the same time,
> > occasionally more than one process may be able to bind to the same
> > port. The condition is detected by listen() but too late to avoid a failure.
> > 
> > This function is called by socket_listen() and used
> > by all socket listening code in QEMU, so all cases where any form of dynamic
> > port selection is used should be subject to this issue.
> > 
> > Add code to close and re-establish the socket when this
> > condition is observed, hiding the race condition from the user.
> > 
> > This has been developed and tested by means of the
> > test-listen unit test in the previous commit.
> > Enable the test for make check now that it passes.
> > 
> > Signed-off-by: Knut Omang <knut.omang@oracle.com>
> > Reviewed-by: Bhavesh Davda <bhavesh.davda@oracle.com>
> > Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
> > Reviewed-by: Girish Moodalbail <girish.moodalbail@oracle.com>
> > ---
> >  tests/Makefile.include |  2 +-
> >  util/qemu-sockets.c    | 68 ++++++++++++++++++++++++++++++++-----------
> >  2 files changed, 53 insertions(+), 17 deletions(-)
> > 
> > diff --git a/tests/Makefile.include b/tests/Makefile.include
> > index 22bb97e..c38f94e 100644
> > --- a/tests/Makefile.include
> > +++ b/tests/Makefile.include
> > @@ -127,7 +127,7 @@ check-unit-y += tests/test-bufferiszero$(EXESUF)
> >  gcov-files-check-bufferiszero-y = util/bufferiszero.c
> >  check-unit-y += tests/test-uuid$(EXESUF)
> >  check-unit-y += tests/ptimer-test$(EXESUF)
> > -#check-unit-y += tests/test-listen$(EXESUF)
> > +check-unit-y += tests/test-listen$(EXESUF)
> >  gcov-files-ptimer-test-y = hw/core/ptimer.c
> >  check-unit-y += tests/test-qapi-util$(EXESUF)
> >  gcov-files-test-qapi-util-y = qapi/qapi-util.c
> > diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
> > index 48b9319..7b118b4 100644
> > --- a/util/qemu-sockets.c
> > +++ b/util/qemu-sockets.c
> > @@ -201,6 +201,42 @@ static int try_bind(int socket, InetSocketAddress
> > *saddr, struct addrinfo *e)
> >  #endif
> >  }
> >  
> > +static int try_bind_listen(int *socket, InetSocketAddress *saddr,
> > +                           struct addrinfo *e, int port, Error **errp)
> > +{
> > +    int s = *socket;
> > +    int ret;
> > +
> > +    inet_setport(e, port);
> > +    ret = try_bind(s, saddr, e);
> > +    if (ret) {
> > +        if (errno != EADDRINUSE) {
> > +            error_setg_errno(errp, errno, "Failed to bind socket");
> > +        }
> > +        return errno;
> > +    }
> > +    if (listen(s, 1) == 0) {
> > +            return 0;
> > +    }
> > +    if (errno == EADDRINUSE) {
> > +        /* We got to bind the socket to a port but someone else managed
> > +         * to bind to the same port and beat us to listen on it!
> > +         * Recreate the socket and return EADDRINUSE to preserve the
> > +         * expected state by the caller:
> > +         */
> > +        closesocket(s);
> > +        s = create_fast_reuse_socket(e, errp);
> 
> This usage scenario for create_fast_reuse_socket() makes its error
> reporting behaviour even more wrong. Recall that create_fast_reuse_socket
> is reporting an error if e->ai_next is NULL, which is a way of determining
> this is the last call to create_fast_reuse_socket in the loop. That
> assumption is violated though now that we're calling the method from
> inside the inner loop. Even when e->ai_next is NULL, we may be calling
> create_fast_reuse_socket many many times due to the port  'to' range.

I agree that the error reporting should be moved out of create_fast_reuse_socket().
Note, however, that this code will only be called when the race condition occurs,
which I think is very unlikely to happen more than once for each call to
inet_listen_saddr() (except in my test, of course).

> 
> > 
> > +        if (s < 0) {
> > +            return errno;
> > +        }
> > +        *socket = s;
> > +        errno = EADDRINUSE;
> > +        return errno;
> > +    }
> > +    error_setg_errno(errp, errno, "Failed to listen on socket");
> > +    return errno;
> > +}
> 
> This method is both preserving the global errno, and returning the
> global errno. The caller expects global errno to be preserved, so
> I think we can just return '-1' from this method.

will do,

Thanks,
Knut

> 
> > 
> > +
> >  static int inet_listen_saddr(InetSocketAddress *saddr,
> >                               int port_offset,
> >                               bool update_addr,
> > @@ -210,7 +246,9 @@ static int inet_listen_saddr(InetSocketAddress *saddr,
> >      char port[33];
> >      char uaddr[INET6_ADDRSTRLEN+1];
> >      char uport[33];
> > -    int slisten, rc, port_min, port_max, p;
> > +    int rc, port_min, port_max, p;
> > +    int slisten = 0;
> > +    int saved_errno = 0;
> >      Error *err = NULL;
> >  
> >      memset(&ai,0, sizeof(ai));
> > @@ -276,28 +314,26 @@ static int inet_listen_saddr(InetSocketAddress *saddr,
> >          port_min = inet_getport(e);
> >          port_max = saddr->has_to ? saddr->to + port_offset : port_min;
> >          for (p = port_min; p <= port_max; p++) {
> > -            inet_setport(e, p);
> > -            if (try_bind(slisten, saddr, e) >= 0) {
> > -                goto listen;
> > -            }
> > -            if (p == port_max) {
> > -                if (!e->ai_next) {
> > -                    error_setg_errno(errp, errno, "Failed to bind socket");
> > -                }
> > +            int eno = try_bind_listen(&slisten, saddr, e, p, &err);
> > +            if (!eno) {
> > +                goto listen_ok;
> > +            } else if (eno != EADDRINUSE) {
> > +                goto listen_failed;
> >              }
> >          }
> > +    }
> > +    error_setg_errno(errp, errno, "Failed to find available port");
> > +
> > +listen_failed:
> > +    saved_errno = errno;
> > +    if (slisten >= 0) {
> >          closesocket(slisten);
> >      }
> >      freeaddrinfo(res);
> > +    errno = saved_errno;
> >      return -1;
> >  
> > -listen:
> > -    if (listen(slisten,1) != 0) {
> > -        error_setg_errno(errp, errno, "Failed to listen on socket");
> > -        closesocket(slisten);
> > -        freeaddrinfo(res);
> > -        return -1;
> > -    }
> > +listen_ok:
> >      if (update_addr) {
> >          g_free(saddr->host);
> >          saddr->host = g_strdup(uaddr);
> > -- 
> > git-series 0.9.1
> 
> Regards,
> Daniel
Knut Omang July 2, 2017, 8:17 a.m. UTC | #6
On Mon, 2017-06-26 at 13:49 +0100, Daniel P. Berrange wrote:
> On Mon, Jun 26, 2017 at 02:32:48PM +0200, Knut Omang wrote:
> > 
> > On Mon, 2017-06-26 at 11:22 +0100, Daniel P. Berrange wrote:
> > > 
> > > On Fri, Jun 23, 2017 at 12:31:08PM +0200, Knut Omang wrote:
> > > > 
> > > > > If an offset of ports is specified to the inet_listen_saddr() function,
> > > > > and two or more processes try to bind from these ports at the same
> > > > time,
> > > > occasionally more than one process may be able to bind to the same
> > > > port. The condition is detected by listen() but too late to avoid a
> > > > failure.
> > > >  
> > > > This function is called by socket_listen() and used
> > > > by all socket listening code in QEMU, so all cases where any form of
> > > > dynamic
> > > > port selection is used should be subject to this issue.
> > > >  
> > > > Add code to close and re-establish the socket when this
> > > > condition is observed, hiding the race condition from the user.
> > > >  
> > > > This has been developed and tested by means of the
> > > > test-listen unit test in the previous commit.
> > > > Enable the test for make check now that it passes.
> > > >  
> > > > Signed-off-by: Knut Omang <knut.omang@oracle.com>
> > > > Reviewed-by: Bhavesh Davda <bhavesh.davda@oracle.com>
> > > > Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
> > > > Reviewed-by: Girish Moodalbail <girish.moodalbail@oracle.com>
> > > > ---
> > > >   tests/Makefile.include |  2 +-
> > > >   util/qemu-sockets.c    | 68 ++++++++++++++++++++++++++++++++--------
> > > > ---
> > > >   2 files changed, 53 insertions(+), 17 deletions(-)
> > > >  
> > > > diff --git a/tests/Makefile.include b/tests/Makefile.include
> > > > index 22bb97e..c38f94e 100644
> > > > --- a/tests/Makefile.include
> > > > +++ b/tests/Makefile.include
> > > > @@ -127,7 +127,7 @@ check-unit-y += tests/test-bufferiszero$(EXESUF)
> > > >   gcov-files-check-bufferiszero-y = util/bufferiszero.c
> > > >   check-unit-y += tests/test-uuid$(EXESUF)
> > > >   check-unit-y += tests/ptimer-test$(EXESUF)
> > > > -#check-unit-y += tests/test-listen$(EXESUF)
> > > > +check-unit-y += tests/test-listen$(EXESUF)
> > > >   gcov-files-ptimer-test-y = hw/core/ptimer.c
> > > >   check-unit-y += tests/test-qapi-util$(EXESUF)
> > > >   gcov-files-test-qapi-util-y = qapi/qapi-util.c
> > > > diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
> > > > index 48b9319..7b118b4 100644
> > > > --- a/util/qemu-sockets.c
> > > > +++ b/util/qemu-sockets.c
> > > > @@ -201,6 +201,42 @@ static int try_bind(int socket, InetSocketAddress
> > > > *saddr, struct
> > > addrinfo *e)
> > > > 
> > > >   #endif
> > > >   }
> > > >   
> > > > +static int try_bind_listen(int *socket, InetSocketAddress *saddr,
> > > > +                           struct addrinfo *e, int port, Error **errp)
> > > > +{
> > > > +    int s = *socket;
> > > > +    int ret;
> > > > +
> > > > +    inet_setport(e, port);
> > > > +    ret = try_bind(s, saddr, e);
> > > > +    if (ret) {
> > > > +        if (errno != EADDRINUSE) {
> > > > +            error_setg_errno(errp, errno, "Failed to bind socket");
> > > > +        }
> > > > +        return errno;
> > > > +    }
> > > > +    if (listen(s, 1) == 0) {
> > > > +            return 0;
> > > > +    }
> > > > +    if (errno == EADDRINUSE) {
> > > > +        /* We got to bind the socket to a port but someone else managed
> > > > +         * to bind to the same port and beat us to listen on it!
> > > > +         * Recreate the socket and return EADDRINUSE to preserve the
> > > > +         * expected state by the caller:
> > > > +         */
> > > > +        closesocket(s);
> > > > +        s = create_fast_reuse_socket(e, errp);
> > > > +        if (s < 0) {
> > > > +            return errno;
> > > > +        }
> > > > +        *socket = s;
> > > 
> > > I don't really like this at all - if we need to close + recreate the
> > > socket, IMHO that should remain the job of the caller, since it owns
> > > the socket FD ultimately.
> > 
> > Normally I would agree, but this is a very unlikely situation. I considered
> > moving the
> > complexity out to the caller, even to recreate for every call, but found
> > those solutions
> > to be inferior as they do not in any way confine the problem, and cause the
> > handling of
> > the common cases to be much less readable. It's going to be some trade-offs
> > here.
> > 
> > > As long as the caller is aware (because the socket is passed by reference)
> > > that the socket in use may change, this is in my view a clean (as clean as
> > > possible) abstraction that simplifies the logic at the next level. My
> > > intention is to make the common, good case as readable as possible and hide
> > > some of the complexity of these unlikely error scenarios inside the new
> > > functions - divide and conquer.
> > 
> > > 
> > > 
> > > > 
> > > > +        errno = EADDRINUSE;
> > > > +        return errno;
> > > > +    }
> > > > +    error_setg_errno(errp, errno, "Failed to listen on socket");
> > > > +    return errno;
> > > > +}
> > > > +
> > > >   static int inet_listen_saddr(InetSocketAddress *saddr,
> > > >                                int port_offset,
> > > >                                bool update_addr,
> > > > @@ -210,7 +246,9 @@ static int inet_listen_saddr(InetSocketAddress
> > > > *saddr,
> > > >       char port[33];
> > > >       char uaddr[INET6_ADDRSTRLEN+1];
> > > >       char uport[33];
> > > > -    int slisten, rc, port_min, port_max, p;
> > > > +    int rc, port_min, port_max, p;
> > > > +    int slisten = 0;
> > > > +    int saved_errno = 0;
> > > >       Error *err = NULL;
> > > >   
> > > >       memset(&ai,0, sizeof(ai));
> > > > @@ -276,28 +314,26 @@ static int inet_listen_saddr(InetSocketAddress
> > > > *saddr,
> > > 
> > > Just above this line is the original 'create_fast_reuse_socket' call.
> > > 
> > > I'd suggest that we push that call down into the body of the loop
> > > below:
> > > 
> > > > 
> > > >           port_min = inet_getport(e);
> > > >           port_max = saddr->has_to ? saddr->to + port_offset : port_min;
> > > >           for (p = port_min; p <= port_max; p++) {
> > > > -            inet_setport(e, p);
> > > > -            if (try_bind(slisten, saddr, e) >= 0) {
> > > > -                goto listen;
> > > > -            }
> > > > -            if (p == port_max) {
> > > > -                if (!e->ai_next) {
> > > > -                    error_setg_errno(errp, errno, "Failed to bind
> > > > socket");
> > > > -                }
> > > > +            int eno = try_bind_listen(&slisten, saddr, e, p, &err);
> > > 
> > > Which would mean try_bind_listen no longer needs the magic to close +
> > > recreate the socket.
> > > 
> > > The only cost of doing this is that you end up closing + recreating the
> > > socket after bind hits EADDRINUSE, as well as after listen() hits it.
> > 
> > The problem with this approach in my opinion is that one has to understand
> > the
> > fix for the problem I am trying to solve here in order to read the main
> > code, 
> > even though this is a very special case. Everyone reading the code would ask
> > themselves
> > the question 'why do they recreate the socket here?' and then be forced to
> > > read the
> > details of try_bind_listen anyway, or we would need additional comments.
> 
> That's easily solved by adding a comment
> 
>   /* We recreate the socket FD on each iteration because
>      if bind succeeds & listen fails, we can't bind
>      again on the same socket FD */
> 
> > 
> > The idea behind the abstractions I have used here is to hide the details
> > inside functions,
> > but leave them with an as clean as possible (although not ideal) interface
> > that 
> > makes the overall logic more readable.
> 
> I think the result is actually harder to understand, because of the
> peculiar way the function closes & reopens the socket FD belonging
> to the caller, and the error handling is really very unclear and
> buggy as a result too.

I assume we have sorted out the error reporting issues in the other part of the
thread. Although reopening the socket this way is not ideal, I still think
it is the best tradeoff, but I'll have a second look at it.

> > 
> > > I think that's acceptable tradeoff for simpler code, since this is not
> > > a performance critical operation.
> > 
> > Also should we perhaps worry about any side effects of creating and closing
> > a lot of
> > > sockets unnecessarily?
> 
> What side effects ? I don't think there are any - since this is server
> side, not client side, we're not leaving any state around in timed waits
> or similar.

If you say so.
I would not know for all platforms involved without further work
testing/investigating.

If you don't mind I still think it is better to avoid socket recreation for all
cases just to handle this (very rare) condition.

Thanks for the thorough review,

Regards,
Knut

> 
> Regards,
> Daniel
> -- 
> > 
> > : https://berrange.com      -o-    https://www.flickr.com/photos/dberrange
> > :|
> > : https://libvirt.org         -o-            https://fstop138.berrange.com
> > :|
> > : https://entangle-photo.org    -o-    https://www.instagram.com/dberrange
> > :|
diff mbox

Patch

diff --git a/tests/Makefile.include b/tests/Makefile.include
index 22bb97e..c38f94e 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -127,7 +127,7 @@  check-unit-y += tests/test-bufferiszero$(EXESUF)
 gcov-files-check-bufferiszero-y = util/bufferiszero.c
 check-unit-y += tests/test-uuid$(EXESUF)
 check-unit-y += tests/ptimer-test$(EXESUF)
-#check-unit-y += tests/test-listen$(EXESUF)
+check-unit-y += tests/test-listen$(EXESUF)
 gcov-files-ptimer-test-y = hw/core/ptimer.c
 check-unit-y += tests/test-qapi-util$(EXESUF)
 gcov-files-test-qapi-util-y = qapi/qapi-util.c
diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
index 48b9319..7b118b4 100644
--- a/util/qemu-sockets.c
+++ b/util/qemu-sockets.c
@@ -201,6 +201,42 @@  static int try_bind(int socket, InetSocketAddress *saddr, struct addrinfo *e)
 #endif
 }
 
+static int try_bind_listen(int *socket, InetSocketAddress *saddr,
+                           struct addrinfo *e, int port, Error **errp)
+{
+    int s = *socket;
+    int ret;
+
+    inet_setport(e, port);
+    ret = try_bind(s, saddr, e);
+    if (ret) {
+        if (errno != EADDRINUSE) {
+            error_setg_errno(errp, errno, "Failed to bind socket");
+        }
+        return errno;
+    }
+    if (listen(s, 1) == 0) {
+            return 0;
+    }
+    if (errno == EADDRINUSE) {
+        /* We got to bind the socket to a port but someone else managed
+         * to bind to the same port and beat us to listen on it!
+         * Recreate the socket and return EADDRINUSE to preserve the
+         * expected state by the caller:
+         */
+        closesocket(s);
+        s = create_fast_reuse_socket(e, errp);
+        if (s < 0) {
+            return errno;
+        }
+        *socket = s;
+        errno = EADDRINUSE;
+        return errno;
+    }
+    error_setg_errno(errp, errno, "Failed to listen on socket");
+    return errno;
+}
+
 static int inet_listen_saddr(InetSocketAddress *saddr,
                              int port_offset,
                              bool update_addr,
@@ -210,7 +246,9 @@  static int inet_listen_saddr(InetSocketAddress *saddr,
     char port[33];
     char uaddr[INET6_ADDRSTRLEN+1];
     char uport[33];
-    int slisten, rc, port_min, port_max, p;
+    int rc, port_min, port_max, p;
+    int slisten = 0;
+    int saved_errno = 0;
     Error *err = NULL;
 
     memset(&ai,0, sizeof(ai));
@@ -276,28 +314,26 @@  static int inet_listen_saddr(InetSocketAddress *saddr,
         port_min = inet_getport(e);
         port_max = saddr->has_to ? saddr->to + port_offset : port_min;
         for (p = port_min; p <= port_max; p++) {
-            inet_setport(e, p);
-            if (try_bind(slisten, saddr, e) >= 0) {
-                goto listen;
-            }
-            if (p == port_max) {
-                if (!e->ai_next) {
-                    error_setg_errno(errp, errno, "Failed to bind socket");
-                }
+            int eno = try_bind_listen(&slisten, saddr, e, p, &err);
+            if (!eno) {
+                goto listen_ok;
+            } else if (eno != EADDRINUSE) {
+                goto listen_failed;
             }
         }
+    }
+    error_setg_errno(errp, errno, "Failed to find available port");
+
+listen_failed:
+    saved_errno = errno;
+    if (slisten >= 0) {
         closesocket(slisten);
     }
     freeaddrinfo(res);
+    errno = saved_errno;
     return -1;
 
-listen:
-    if (listen(slisten,1) != 0) {
-        error_setg_errno(errp, errno, "Failed to listen on socket");
-        closesocket(slisten);
-        freeaddrinfo(res);
-        return -1;
-    }
+listen_ok:
     if (update_addr) {
         g_free(saddr->host);
         saddr->host = g_strdup(uaddr);