diff mbox

Do not bind to reserved ports registered in /etc/services

Message ID 20180110004920.11100-1-gjover@sipwise.com (mailing list archive)
State New, archived
Headers show

Commit Message

Guillem Jover Jan. 10, 2018, 12:49 a.m. UTC
When using the bindresvport() function a privileged port will be looked
for and bound to a socket. The problem is that any service using a static
privileged port registered in the /etc/services file, can get its port
taken over by libtirpc users, making the other service fail to start.

Starting the other service before libtircp users is not an option as
this does not cover restart situations, for example during package
upgrades or HA switchovers and similar.

In addition honoring the /etc/services registry is already done for
example by rpc.statd, so it seems obvious to make libtirpc follow this
same pattern too.

Signed-off-by: Guillem Jover <gjover@sipwise.com>
---
 src/bindresvport.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

Comments

Steve Dickson Jan. 11, 2018, 3:18 p.m. UTC | #1
On 01/09/2018 07:49 PM, Guillem Jover wrote:
> When using the bindresvport() function a privileged port will be looked
> for and bound to a socket. The problem is that any service using a static
> privileged port registered in the /etc/services file, can get its port
> taken over by libtirpc users, making the other service fail to start.
> 
> Starting the other service before libtircp users is not an option as
> this does not cover restart situations, for example during package
> upgrades or HA switchovers and similar.
> 
> In addition honoring the /etc/services registry is already done for
> example by rpc.statd, so it seems obvious to make libtirpc follow this
> same pattern too.
Overall I think this makes sense, but this eliminates 240 privilege
ports and worried we would run out of port (due to them in TIME_WAIT)
during a v3 mount storms. A port goes into TIME_WAIT after a v3 mount
is done... But on the other hand v3 is no longer the default and
there are 784 available ports.... Hopefully that is enough.

Does anybody else have concerns about limiting the ports space?

steved.

> 
> Signed-off-by: Guillem Jover <gjover@sipwise.com>
> ---
>  src/bindresvport.c | 28 +++++++++++++++++++++++++++-
>  1 file changed, 27 insertions(+), 1 deletion(-)
> 
> diff --git a/src/bindresvport.c b/src/bindresvport.c
> index 2d8f2bc..98e5f40 100644
> --- a/src/bindresvport.c
> +++ b/src/bindresvport.c
> @@ -40,6 +40,7 @@
>  #include <netinet/in.h>
>  
>  #include <errno.h>
> +#include <netdb.h>
>  #include <string.h>
>  #include <unistd.h>
>  
> @@ -73,12 +74,15 @@ bindresvport_sa(sd, sa)
>          int sd;
>          struct sockaddr *sa;
>  {
> -        int res, af;
> +        int res, af, so_proto;
> +        socklen_t so_proto_len;
>          struct sockaddr_storage myaddr;
>  	struct sockaddr_in *sin;
>  #ifdef INET6
>  	struct sockaddr_in6 *sin6;
>  #endif
> +	struct servent *se;
> +	const char *se_proto;
>  	u_int16_t *portp;
>  	static u_int16_t port;
>  	static short startport = STARTPORT;
> @@ -125,6 +129,25 @@ bindresvport_sa(sd, sa)
>          }
>          sa->sa_family = af;
>  
> +        so_proto_len = sizeof(so_proto);
> +        if (getsockopt(sd, SOL_SOCKET, SO_PROTOCOL, &so_proto, &so_proto_len) == -1) {
> +                mutex_unlock(&port_lock);
> +                return -1;      /* errno is correctly set */
> +        }
> +        switch (so_proto) {
> +        case IPPROTO_UDP:
> +        case IPPROTO_UDPLITE:
> +                se_proto = "udp";
> +                break;
> +        case IPPROTO_TCP:
> +                se_proto = "tcp";
> +                break;
> +        default:
> +                errno = ENOPROTOOPT;
> +                mutex_unlock(&port_lock);
> +                return -1;
> +        }
> +
>          if (port == 0) {
>                  port = (getpid() % NPORTS) + STARTPORT;
>          }
> @@ -135,6 +158,9 @@ bindresvport_sa(sd, sa)
>                  *portp = htons(port++);
>                   if (port > endport) 
>                          port = startport;
> +                se = getservbyport(*portp, se_proto);
> +                if (se != NULL)
> +                        continue;
>                  res = bind(sd, sa, salen);
>  		if (res >= 0 || errno != EADDRINUSE)
>  	                break;
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever Jan. 11, 2018, 3:50 p.m. UTC | #2
> On Jan 9, 2018, at 7:49 PM, Guillem Jover <gjover@sipwise.com> wrote:
> 
> When using the bindresvport() function a privileged port will be looked
> for and bound to a socket. The problem is that any service using a static
> privileged port registered in the /etc/services file, can get its port
> taken over by libtirpc users, making the other service fail to start.
> 
> Starting the other service before libtircp users is not an option as
> this does not cover restart situations, for example during package
> upgrades or HA switchovers and similar.
> 
> In addition honoring the /etc/services registry is already done for
> example by rpc.statd, so it seems obvious to make libtirpc follow this
> same pattern too.

Does the glibc version of bindresvport(3) skip ports? I ask because
this hasn't been a noted problem before. Which service exactly is
causing the difficulty?

Usually applications that grab a privileged port are short-lived.
statd is careful to avoid ports in /etc/services because it allocates
one socket with a privileged port for contacting the kernel, and keeps
it in place.


> Signed-off-by: Guillem Jover <gjover@sipwise.com>
> ---
> src/bindresvport.c | 28 +++++++++++++++++++++++++++-
> 1 file changed, 27 insertions(+), 1 deletion(-)
> 
> diff --git a/src/bindresvport.c b/src/bindresvport.c
> index 2d8f2bc..98e5f40 100644
> --- a/src/bindresvport.c
> +++ b/src/bindresvport.c
> @@ -40,6 +40,7 @@
> #include <netinet/in.h>
> 
> #include <errno.h>
> +#include <netdb.h>
> #include <string.h>
> #include <unistd.h>
> 
> @@ -73,12 +74,15 @@ bindresvport_sa(sd, sa)
>         int sd;
>         struct sockaddr *sa;
> {
> -        int res, af;
> +        int res, af, so_proto;
> +        socklen_t so_proto_len;
>         struct sockaddr_storage myaddr;
> 	struct sockaddr_in *sin;
> #ifdef INET6
> 	struct sockaddr_in6 *sin6;
> #endif
> +	struct servent *se;
> +	const char *se_proto;
> 	u_int16_t *portp;
> 	static u_int16_t port;
> 	static short startport = STARTPORT;
> @@ -125,6 +129,25 @@ bindresvport_sa(sd, sa)
>         }
>         sa->sa_family = af;
> 
> +        so_proto_len = sizeof(so_proto);
> +        if (getsockopt(sd, SOL_SOCKET, SO_PROTOCOL, &so_proto, &so_proto_len) == -1) {
> +                mutex_unlock(&port_lock);
> +                return -1;      /* errno is correctly set */
> +        }
> +        switch (so_proto) {
> +        case IPPROTO_UDP:
> +        case IPPROTO_UDPLITE:

What is UDPLITE? Would it require a separate netid
or other infrastructure? IMO it's not correct to add
this transport protocol in this patch. If you resend,
please remove this line.


> +                se_proto = "udp";
> +                break;
> +        case IPPROTO_TCP:
> +                se_proto = "tcp";
> +                break;
> +        default:
> +                errno = ENOPROTOOPT;
> +                mutex_unlock(&port_lock);
> +                return -1;
> +        }
> +
>         if (port == 0) {
>                 port = (getpid() % NPORTS) + STARTPORT;
>         }
> @@ -135,6 +158,9 @@ bindresvport_sa(sd, sa)
>                 *portp = htons(port++);
>                  if (port > endport) 
>                         port = startport;
> +                se = getservbyport(*portp, se_proto);
> +                if (se != NULL)
> +                        continue;
>                 res = bind(sd, sa, salen);
> 		if (res >= 0 || errno != EADDRINUSE)
> 	                break;
> -- 
> 2.15.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Guillem Jover Jan. 12, 2018, 6:05 p.m. UTC | #3
Hi!

On Thu, 2018-01-11 at 10:50:14 -0500, Chuck Lever wrote:
> On Jan 9, 2018, at 7:49 PM, Guillem Jover <gjover@sipwise.com> wrote:
> > When using the bindresvport() function a privileged port will be looked
> > for and bound to a socket. The problem is that any service using a static
> > privileged port registered in the /etc/services file, can get its port
> > taken over by libtirpc users, making the other service fail to start.
> > 
> > Starting the other service before libtircp users is not an option as
> > this does not cover restart situations, for example during package
> > upgrades or HA switchovers and similar.
> > 
> > In addition honoring the /etc/services registry is already done for
> > example by rpc.statd, so it seems obvious to make libtirpc follow this
> > same pattern too.
> 
> Does the glibc version of bindresvport(3) skip ports? I ask because
> this hasn't been a noted problem before. Which service exactly is
> causing the difficulty?

So, this is all a bit of a mess, and apparently it has been known in
some quarters for quite some time. :(

This was reported to glibc a long time ago [B] but at the time upstream
bogusly rejected the idea of using /etc/services (or similar). Instead
someone had to create a workaround to overcome this [P], a daemon which
binds to a port, until the other service requests the port to be released
so that it can itself bind to it, which is still obviously prone to races
during restarts and complicates things.

[B] <https://sourceware.org/bugzilla/show_bug.cgi?id=1014>
    <https://bugzilla.redhat.com/show_bug.cgi?id=103401>
    <https://bugzilla.novell.com/show_bug.cgi?id=262341>
    <https://bugs.debian.org/638322>
[P] <http://cyberelk.net/tim/software/portreserve/>

Then some distributions instead patched their glibc to honor a new
/etc/bindresvport.blacklist file in the bindresvport() function; OpenSUSE
at least, and apparently Debian too [F] (although this is not documented
anywhere). In addition nfs-utils upstream (where an rpc.statd comes from)
already honors /etc/services. To complicate things, rpcbind and other
SunRPC clients using libtirpc do not honor neither /etc/services nor
/etc/bindresvport.blacklist, because libtirpc has its own bindrecvport()
implementation.

[F] <https://sources.debian.org/src/glibc/2.26-3/debian/patches/any/local-bindresvport_blacklist.diff/>

On the above Debian bug report, it was proposed to make libtirpc switch
to use the libc bindresvport() implementation so that at least on those
distributions where it is locally patched it would honor the
/etc/bindresvport.blacklist file. The problem with this, is of course
that it does not help any upstream code on any other non-patched system.

So I decided against using the undocumented /etc/bindresvport.blacklist,
because that requires filling it up with ports that are already present
in /etc/services, which seems pointless. It does not support a fragments
style directory such as /etc/bindresvport.blacklist.d/ anyway which
makes "registering" the ports from each package too difficult. And
because nfs-utils' rpc.statd will honor /etc/services on any system
regardless of the libc implementation, which is not the case for
/etc/bindresvport.blacklist.

But at this point, if you disagree and prefer some other solution, I'm
happy to oblige, because I'd rather have anything at all. :)

> Usually applications that grab a privileged port are short-lived.
> statd is careful to avoid ports in /etc/services because it allocates
> one socket with a privileged port for contacting the kernel, and keeps
> it in place.

The other long-lived libtirpc instance that is causing problems for
us is rpcbind. And fixing this at the root seemed better than patching
just rpcbind.

> > diff --git a/src/bindresvport.c b/src/bindresvport.c
> > index 2d8f2bc..98e5f40 100644
> > --- a/src/bindresvport.c
> > +++ b/src/bindresvport.c
> > @@ -125,6 +129,25 @@ bindresvport_sa(sd, sa)
> >         }
> >         sa->sa_family = af;
> > 
> > +        so_proto_len = sizeof(so_proto);
> > +        if (getsockopt(sd, SOL_SOCKET, SO_PROTOCOL, &so_proto, &so_proto_len) == -1) {
> > +                mutex_unlock(&port_lock);
> > +                return -1;      /* errno is correctly set */
> > +        }
> > +        switch (so_proto) {
> > +        case IPPROTO_UDP:
> > +        case IPPROTO_UDPLITE:
> 
> What is UDPLITE?

<http://man7.org/linux/man-pages/man7/udplite.7.html>. I was trying to
make this as generic as possible, but checking again now, I guess this
might deserve a new protocol name in /etc/services anyway.

> Would it require a separate netid
> or other infrastructure? IMO it's not correct to add
> this transport protocol in this patch. If you resend,
> please remove this line.

Yeah, will drop it. The one not being handled because I was not sure
how, that does appear for example on /etc/services on a Debian system
is "ddp", but given that the library is for SunRPC there's not much
point in trying to handle that case here anyway.

Thanks,
Guillem
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Guillem Jover Jan. 12, 2018, 6:41 p.m. UTC | #4
On Thu, 2018-01-11 at 10:18:46 -0500, Steve Dickson wrote:
> Overall I think this makes sense, but this eliminates 240 privilege
> ports and worried we would run out of port (due to them in TIME_WAIT)
> during a v3 mount storms. A port goes into TIME_WAIT after a v3 mount
> is done... But on the other hand v3 is no longer the default and
> there are 784 available ports.... Hopefully that is enough.

Hmm, those numbers do not match my own. bindresvport() uses the port
range between 512 and 1023 inclusive. On my Debian stable (stretch)
and unstable systems these are the number of registered ports in
/etc/services:

  ,---
  # UDP
  $ awk '/^[^#]/ { print $2 }' /etc/services | \
    sed -n -e 's,/udp,,p' | \
    while read port; do if [ $port -ge 512 -a $port -lt 1024 ]; \
    then echo $port; fi; done | sort -u | wc -l
  31
  # TCP
  $ awk '/^[^#]/ { print $2 }' /etc/services | \
    sed -n -e 's,/tcp,,p' | \
    while read port; do if [ $port -ge 512 -a $port -lt 1024 ]; \
    then echo $port; fi; done | sort -u | wc -l
  48
  `---

So that's pretty low. Not as low as the 9 listed in
/etc/bindresvport.blacklist, but as I've mentioned on the other reply,
given that the list is incomplete and that does not support dynamically
registering the ports being actively used on the current system with a
".d/" style fragments directory, it would eventually require to ship
all the ports already listed in /etc/services anyway, which gains us
nothing.

In addition I notice now that even the comment in that file (at least on
Debian) is bogus, as it does not account for ports between 512 and 599:

  ,--- /etc/bindresvport.blacklist
  #
  # This file contains a list of port numbers between 600 and 1024,
  # which should not be used by bindresvport. bindresvport is mostly
  # called by RPC services. This mostly solves the problem, that a
  # RPC service uses a well known port of another service.
  #
  `---

:)

Thanks,
Guillem
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever Jan. 12, 2018, 7:12 p.m. UTC | #5
> On Jan 12, 2018, at 1:05 PM, Guillem Jover <gjover@sipwise.com> wrote:
> 
> Hi!
> 
> On Thu, 2018-01-11 at 10:50:14 -0500, Chuck Lever wrote:
>> On Jan 9, 2018, at 7:49 PM, Guillem Jover <gjover@sipwise.com> wrote:
>>> When using the bindresvport() function a privileged port will be looked
>>> for and bound to a socket. The problem is that any service using a static
>>> privileged port registered in the /etc/services file, can get its port
>>> taken over by libtirpc users, making the other service fail to start.
>>> 
>>> Starting the other service before libtircp users is not an option as
>>> this does not cover restart situations, for example during package
>>> upgrades or HA switchovers and similar.
>>> 
>>> In addition honoring the /etc/services registry is already done for
>>> example by rpc.statd, so it seems obvious to make libtirpc follow this
>>> same pattern too.
>> 
>> Does the glibc version of bindresvport(3) skip ports? I ask because
>> this hasn't been a noted problem before. Which service exactly is
>> causing the difficulty?
> 
> So, this is all a bit of a mess, and apparently it has been known in
> some quarters for quite some time. :(
> 
> This was reported to glibc a long time ago [B] but at the time upstream
> bogusly rejected the idea of using /etc/services (or similar). Instead
> someone had to create a workaround to overcome this [P], a daemon which
> binds to a port, until the other service requests the port to be released
> so that it can itself bind to it, which is still obviously prone to races
> during restarts and complicates things.
> 
> [B] <https://sourceware.org/bugzilla/show_bug.cgi?id=1014>
>    <https://bugzilla.redhat.com/show_bug.cgi?id=103401>
>    <https://bugzilla.novell.com/show_bug.cgi?id=262341>
>    <https://bugs.debian.org/638322>
> [P] <http://cyberelk.net/tim/software/portreserve/>
> 
> Then some distributions instead patched their glibc to honor a new
> /etc/bindresvport.blacklist file in the bindresvport() function; OpenSUSE
> at least, and apparently Debian too [F] (although this is not documented
> anywhere). In addition nfs-utils upstream (where an rpc.statd comes from)
> already honors /etc/services. To complicate things, rpcbind and other
> SunRPC clients using libtirpc do not honor neither /etc/services nor
> /etc/bindresvport.blacklist, because libtirpc has its own bindrecvport()
> implementation.

Hi Guillem-

rpc.statd was a local fix to acquire a single port for one long
lived socket. It's not a solution intended to be a generic API
for all RPC applications on the system. Using /etc/services is
completely adequate for this purpose.

If bindresvport(3) is changed to avoid well-known ports,
rpc.statd could easily be converted to use it, for example.


> [F] <https://sources.debian.org/src/glibc/2.26-3/debian/patches/any/local-bindresvport_blacklist.diff/>
> 
> On the above Debian bug report, it was proposed to make libtirpc switch
> to use the libc bindresvport() implementation so that at least on those
> distributions where it is locally patched it would honor the
> /etc/bindresvport.blacklist file. The problem with this, is of course
> that it does not help any upstream code on any other non-patched system.

The community issue here is that there have evolved, over time,
multiple RPC libraries with divergent capabilities. The only way
to truly address this confusion is to eliminate all but one of
them, which is far outside the scope of your bug fix. For now we
have to live with it.


> So I decided against using the undocumented /etc/bindresvport.blacklist,
> because that requires filling it up with ports that are already present
> in /etc/services, which seems pointless.

It's not pointless if the two sets of ports are typically not
entirely conjunct.


> It does not support a fragments
> style directory such as /etc/bindresvport.blacklist.d/ anyway which
> makes "registering" the ports from each package too difficult.

That would be simple enough to add. Again, not really a reason
not to go with a black list. And /etc/services has the same
issue.


> And
> because nfs-utils' rpc.statd will honor /etc/services on any system
> regardless of the libc implementation, which is not the case for
> /etc/bindresvport.blacklist.

As above, rpc.statd can be converted to use a bindresvport(3)
that avoids well-known ports quite easily.


> But at this point, if you disagree and prefer some other solution, I'm
> happy to oblige, because I'd rather have anything at all. :)

I agree there's an issue that needs a solution.


>> Usually applications that grab a privileged port are short-lived.
>> statd is careful to avoid ports in /etc/services because it allocates
>> one socket with a privileged port for contacting the kernel, and keeps
>> it in place.
> 
> The other long-lived libtirpc instance that is causing problems for
> us is rpcbind. And fixing this at the root seemed better than patching
> just rpcbind.

Perhaps. Callers that need only a short-lived socket can work
without any restrictions. The issue is those one or two users
that create a long-lived socket with a privileged port and then
drop privileges. Perhaps a better solution for them is to retain
CAP_NET_BIND_SERVICE and use short-lived sockets instead.


>>> diff --git a/src/bindresvport.c b/src/bindresvport.c
>>> index 2d8f2bc..98e5f40 100644
>>> --- a/src/bindresvport.c
>>> +++ b/src/bindresvport.c
>>> @@ -125,6 +129,25 @@ bindresvport_sa(sd, sa)
>>>        }
>>>        sa->sa_family = af;
>>> 
>>> +        so_proto_len = sizeof(so_proto);
>>> +        if (getsockopt(sd, SOL_SOCKET, SO_PROTOCOL, &so_proto, &so_proto_len) == -1) {
>>> +                mutex_unlock(&port_lock);
>>> +                return -1;      /* errno is correctly set */
>>> +        }
>>> +        switch (so_proto) {
>>> +        case IPPROTO_UDP:
>>> +        case IPPROTO_UDPLITE:
>> 
>> What is UDPLITE?
> 
> <http://man7.org/linux/man-pages/man7/udplite.7.html>. I was trying to
> make this as generic as possible, but checking again now, I guess this
> might deserve a new protocol name in /etc/services anyway.
> 
>> Would it require a separate netid
>> or other infrastructure? IMO it's not correct to add
>> this transport protocol in this patch. If you resend,
>> please remove this line.
> 
> Yeah, will drop it. The one not being handled because I was not sure
> how, that does appear for example on /etc/services on a Debian system
> is "ddp", but given that the library is for SunRPC there's not much
> point in trying to handle that case here anyway.
> 
> Thanks,
> Guillem

--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thorsten Kukuk Jan. 12, 2018, 7:12 p.m. UTC | #6
On Fri, Jan 12, Guillem Jover wrote:

> On Thu, 2018-01-11 at 10:18:46 -0500, Steve Dickson wrote:
> > Overall I think this makes sense, but this eliminates 240 privilege
> > ports and worried we would run out of port (due to them in TIME_WAIT)
> > during a v3 mount storms. A port goes into TIME_WAIT after a v3 mount
> > is done... But on the other hand v3 is no longer the default and
> > there are 784 available ports.... Hopefully that is enough.
> 
> Hmm, those numbers do not match my own. bindresvport() uses the port
> range between 512 and 1023 inclusive. On my Debian stable (stretch)
> and unstable systems these are the number of registered ports in
> /etc/services:
> 
>   ,---
>   # UDP
>   $ awk '/^[^#]/ { print $2 }' /etc/services | \
>     sed -n -e 's,/udp,,p' | \
>     while read port; do if [ $port -ge 512 -a $port -lt 1024 ]; \
>     then echo $port; fi; done | sort -u | wc -l
>   31
>   # TCP
>   $ awk '/^[^#]/ { print $2 }' /etc/services | \
>     sed -n -e 's,/tcp,,p' | \
>     while read port; do if [ $port -ge 512 -a $port -lt 1024 ]; \
>     then echo $port; fi; done | sort -u | wc -l
>   48
>   `---

This numbers are only low, since Debian is using a hand selected
/etc/services file with most entries missing. But your change 
would not be limited to libtirpc on Debian.
I have 276 for TCP and 276 for UDP, that's much, much more. So
already about 50% of the available range.

  Thorsten
Tom Talpey Jan. 12, 2018, 7:19 p.m. UTC | #7
On 1/12/2018 1:41 PM, Guillem Jover wrote:
> On Thu, 2018-01-11 at 10:18:46 -0500, Steve Dickson wrote:
>> Overall I think this makes sense, but this eliminates 240 privilege
>> ports and worried we would run out of port (due to them in TIME_WAIT)
>> during a v3 mount storms. A port goes into TIME_WAIT after a v3 mount
>> is done... But on the other hand v3 is no longer the default and
>> there are 784 available ports.... Hopefully that is enough.
> 
> Hmm, those numbers do not match my own. bindresvport() uses the port
> range between 512 and 1023 inclusive. On my Debian stable (stretch)

Properly speaking, no service should be binding to any port but the
one it is assigned to. This includes 0-1023 as well as 1024-49151.

https://tools.ietf.org/html/rfc6335

See last quoted sentence from:

https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml

>> Service names are assigned on a first-come, first-served process, as
>> documented in [RFC6335].
>> 
>> Port numbers are assigned in various ways, based on three ranges: System
>> Ports (0-1023), User Ports (1024-49151), and the Dynamic and/or Private
>> Ports (49152-65535); the difference uses of these ranges is described in
>> [RFC6335]. According to Section 8.1.2 of [RFC6335], System Ports are 
>> assigned by the "IETF Review" or "IESG Approval" procedures described in 
>> [RFC8126]. User Ports are assigned by IANA using the "IETF Review" process, 
>> the "IESG Approval" process, or the "Expert Review" process, as per 
>> [RFC6335]. Dynamic Ports are not assigned.
>> 
>> The registration procedures for service names and port numbers are
>> described in [RFC6335].
>> 
>> Assigned ports both System and User ports SHOULD NOT be used without
>> or prior to IANA registration.

Tom.




> and unstable systems these are the number of registered ports in
> /etc/services:
> 
>    ,---
>    # UDP
>    $ awk '/^[^#]/ { print $2 }' /etc/services | \
>      sed -n -e 's,/udp,,p' | \
>      while read port; do if [ $port -ge 512 -a $port -lt 1024 ]; \
>      then echo $port; fi; done | sort -u | wc -l
>    31
>    # TCP
>    $ awk '/^[^#]/ { print $2 }' /etc/services | \
>      sed -n -e 's,/tcp,,p' | \
>      while read port; do if [ $port -ge 512 -a $port -lt 1024 ]; \
>      then echo $port; fi; done | sort -u | wc -l
>    48
>    `---
> 
> So that's pretty low. Not as low as the 9 listed in
> /etc/bindresvport.blacklist, but as I've mentioned on the other reply,
> given that the list is incomplete and that does not support dynamically
> registering the ports being actively used on the current system with a
> ".d/" style fragments directory, it would eventually require to ship
> all the ports already listed in /etc/services anyway, which gains us
> nothing.
> 
> In addition I notice now that even the comment in that file (at least on
> Debian) is bogus, as it does not account for ports between 512 and 599:
> 
>    ,--- /etc/bindresvport.blacklist
>    #
>    # This file contains a list of port numbers between 600 and 1024,
>    # which should not be used by bindresvport. bindresvport is mostly
>    # called by RPC services. This mostly solves the problem, that a
>    # RPC service uses a well known port of another service.
>    #
>    `---
> 
> :)
> 
> Thanks,
> Guillem
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thorsten Kukuk Jan. 12, 2018, 9:12 p.m. UTC | #8
On Fri, Jan 12, Chuck Lever wrote:

> > On Jan 12, 2018, at 1:05 PM, Guillem Jover <gjover@sipwise.com> wrote:

> > [F] <https://sources.debian.org/src/glibc/2.26-3/debian/patches/any/local-bindresvport_blacklist.diff/>
> > 
> > On the above Debian bug report, it was proposed to make libtirpc switch
> > to use the libc bindresvport() implementation so that at least on those
> > distributions where it is locally patched it would honor the
> > /etc/bindresvport.blacklist file. The problem with this, is of course
> > that it does not help any upstream code on any other non-patched system.
> 
> The community issue here is that there have evolved, over time,
> multiple RPC libraries with divergent capabilities. The only way
> to truly address this confusion is to eliminate all but one of
> them, which is far outside the scope of your bug fix. For now we
> have to live with it.

openSUSE is removing sunrpc from glibc, Fedora seems to be removing
sunrpc from glibc, so it's only a matter of time when libtirpc is the
only RPC implementation used on Linux.

  Thorsten
Chuck Lever Jan. 12, 2018, 9:14 p.m. UTC | #9
> On Jan 12, 2018, at 4:12 PM, Thorsten Kukuk <kukuk@suse.de> wrote:
> 
> On Fri, Jan 12, Chuck Lever wrote:
> 
>>> On Jan 12, 2018, at 1:05 PM, Guillem Jover <gjover@sipwise.com> wrote:
> 
>>> [F] <https://sources.debian.org/src/glibc/2.26-3/debian/patches/any/local-bindresvport_blacklist.diff/>
>>> 
>>> On the above Debian bug report, it was proposed to make libtirpc switch
>>> to use the libc bindresvport() implementation so that at least on those
>>> distributions where it is locally patched it would honor the
>>> /etc/bindresvport.blacklist file. The problem with this, is of course
>>> that it does not help any upstream code on any other non-patched system.
>> 
>> The community issue here is that there have evolved, over time,
>> multiple RPC libraries with divergent capabilities. The only way
>> to truly address this confusion is to eliminate all but one of
>> them, which is far outside the scope of your bug fix. For now we
>> have to live with it.
> 
> openSUSE is removing sunrpc from glibc, Fedora seems to be removing
> sunrpc from glibc, so it's only a matter of time when libtirpc is the
> only RPC implementation used on Linux.

There are other RPC implementations, for example:

 - The nonstandard TI-RPC library used internally by Ganesha

 - The RPC librar(ies) buried in Kerberos implementations

So there's more to do than simply removing the one in glibc,
unfortunately.


--
Chuck Lever



--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matt Benjamin Jan. 12, 2018, 9:30 p.m. UTC | #10
Hi Chuck,

On Fri, Jan 12, 2018 at 4:14 PM, Chuck Lever <chuck.lever@oracle.com> wrote:

>>
>> openSUSE is removing sunrpc from glibc, Fedora seems to be removing
>> sunrpc from glibc, so it's only a matter of time when libtirpc is the
>> only RPC implementation used on Linux.
>
> There are other RPC implementations, for example:
>
>  - The nonstandard TI-RPC library used internally by Ganesha
>

The current idea behind libntirpc is that it has no intent, and
provides no trivial ability, to serve as the ONC RPC implementation
for any Linux application services.  It is, in other words, a
3rd-party component from the Linux system point of view.

Matt

>  - The RPC librar(ies) buried in Kerberos implementations
>
> So there's more to do than simply removing the one in glibc,
> unfortunately.

>
>
> --
> Chuck Lever
Steve Dickson Jan. 12, 2018, 10:08 p.m. UTC | #11
On 01/12/2018 04:12 PM, Thorsten Kukuk wrote:
> On Fri, Jan 12, Chuck Lever wrote:
> 
>>> On Jan 12, 2018, at 1:05 PM, Guillem Jover <gjover@sipwise.com> wrote:
> 
>>> [F] <https://sources.debian.org/src/glibc/2.26-3/debian/patches/any/local-bindresvport_blacklist.diff/>
>>>
>>> On the above Debian bug report, it was proposed to make libtirpc switch
>>> to use the libc bindresvport() implementation so that at least on those
>>> distributions where it is locally patched it would honor the
>>> /etc/bindresvport.blacklist file. The problem with this, is of course
>>> that it does not help any upstream code on any other non-patched system.
>>
>> The community issue here is that there have evolved, over time,
>> multiple RPC libraries with divergent capabilities. The only way
>> to truly address this confusion is to eliminate all but one of
>> them, which is far outside the scope of your bug fix. For now we
>> have to live with it.
> 
> openSUSE is removing sunrpc from glibc, Fedora seems to be removing
> sunrpc from glibc, so it's only a matter of time when libtirpc is the
> only RPC implementation used on Linux.
We are... 
   https://bugzilla.redhat.com/show_bug.cgi?id=1531540

We also adding a new package that will contain rpcgen
    https://bugzilla.redhat.com/show_bug.cgi?id=1532364

based off Thorsten's git tree
    https://github.com/thkukuk/rpcsvc-proto

steved.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever Feb. 8, 2018, 6:07 p.m. UTC | #12
On Fri, Jan 12, 2018 at 2:19 PM, Tom Talpey <tom@talpey.com> wrote:
> On 1/12/2018 1:41 PM, Guillem Jover wrote:
>>
>> On Thu, 2018-01-11 at 10:18:46 -0500, Steve Dickson wrote:
>>>
>>> Overall I think this makes sense, but this eliminates 240 privilege
>>> ports and worried we would run out of port (due to them in TIME_WAIT)
>>> during a v3 mount storms. A port goes into TIME_WAIT after a v3 mount
>>> is done... But on the other hand v3 is no longer the default and
>>> there are 784 available ports.... Hopefully that is enough.
>>
>>
>> Hmm, those numbers do not match my own. bindresvport() uses the port
>> range between 512 and 1023 inclusive. On my Debian stable (stretch)
>
>
> Properly speaking, no service should be binding to any port but the
> one it is assigned to. This includes 0-1023 as well as 1024-49151.
>
> https://tools.ietf.org/html/rfc6335
>
> See last quoted sentence from:
>
> https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml
>
>>> Service names are assigned on a first-come, first-served process, as
>>> documented in [RFC6335].
>>>
>>> Port numbers are assigned in various ways, based on three ranges: System
>>> Ports (0-1023), User Ports (1024-49151), and the Dynamic and/or Private
>>> Ports (49152-65535); the difference uses of these ranges is described in
>>> [RFC6335]. According to Section 8.1.2 of [RFC6335], System Ports are
>>> assigned by the "IETF Review" or "IESG Approval" procedures described in
>>> [RFC8126]. User Ports are assigned by IANA using the "IETF Review" process,
>>> the "IESG Approval" process, or the "Expert Review" process, as per
>>> [RFC6335]. Dynamic Ports are not assigned.
>>>
>>> The registration procedures for service names and port numbers are
>>> described in [RFC6335].
>>>
>>> Assigned ports both System and User ports SHOULD NOT be used without
>>> or prior to IANA registration.
>
>
> Tom.

At Steve's request, I've filed a bug against libtirpc:

https://bugzilla.linux-nfs.org/show_bug.cgi?id=320

to document the issue and the rationales for alternate solutions.

I'm fairly confident that bindresvport(3) is never necessary for
svc_tli_create(3). It is arguably also not appropriate for
clnt_tli_create(3). Therefore IMO it should be removed from those code
paths. That by itself would provide some relief and would eliminate
the need to alter the current bindresvport(3) implementation (say, by
introducing a blacklisting mechanism).

As Tom mentioned above, RFC 6335 describes a port range that will
never interfere with service ports allocated by IANA. This range is
known as the Dynamic or Private port range (49152 - 65535). On my
systems, bind(3) already frequently allocates from the Dynamic port
range purely by chance.

To completely prevent the accidental allocation of a well-known
service port, a special internal wrapper for the bind(3) system call
(similar to bindresvport(3)) can be constructed for libtirpc that
allocates only from the Dynamic port range whenever a caller does not
specify a port, making libtirpc adhere to the Best Common Practices
outlined in RFC 6335, and thereby closing this window for user space
RPC services completely.

A more thorough solution IMO would be to fix up the bind(3) system
call to allocate only from the Dynamic port range whenever a caller
does not specify a port. This would take care of not only user space
RPC services but of all dynamically allocated ports on the system,
including ports dynamically allocated in the kernel.
Chuck Lever Feb. 8, 2018, 6:36 p.m. UTC | #13
> On Feb 8, 2018, at 1:07 PM, Chuck Lever <chucklever@gmail.com> wrote:
> 
> On Fri, Jan 12, 2018 at 2:19 PM, Tom Talpey <tom@talpey.com> wrote:
>> On 1/12/2018 1:41 PM, Guillem Jover wrote:
>>> 
>>> On Thu, 2018-01-11 at 10:18:46 -0500, Steve Dickson wrote:
>>>> 
>>>> Overall I think this makes sense, but this eliminates 240 privilege
>>>> ports and worried we would run out of port (due to them in TIME_WAIT)
>>>> during a v3 mount storms. A port goes into TIME_WAIT after a v3 mount
>>>> is done... But on the other hand v3 is no longer the default and
>>>> there are 784 available ports.... Hopefully that is enough.
>>> 
>>> 
>>> Hmm, those numbers do not match my own. bindresvport() uses the port
>>> range between 512 and 1023 inclusive. On my Debian stable (stretch)
>> 
>> 
>> Properly speaking, no service should be binding to any port but the
>> one it is assigned to. This includes 0-1023 as well as 1024-49151.
>> 
>> https://tools.ietf.org/html/rfc6335
>> 
>> See last quoted sentence from:
>> 
>> https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml
>> 
>>>> Service names are assigned on a first-come, first-served process, as
>>>> documented in [RFC6335].
>>>> 
>>>> Port numbers are assigned in various ways, based on three ranges: System
>>>> Ports (0-1023), User Ports (1024-49151), and the Dynamic and/or Private
>>>> Ports (49152-65535); the difference uses of these ranges is described in
>>>> [RFC6335]. According to Section 8.1.2 of [RFC6335], System Ports are
>>>> assigned by the "IETF Review" or "IESG Approval" procedures described in
>>>> [RFC8126]. User Ports are assigned by IANA using the "IETF Review" process,
>>>> the "IESG Approval" process, or the "Expert Review" process, as per
>>>> [RFC6335]. Dynamic Ports are not assigned.
>>>> 
>>>> The registration procedures for service names and port numbers are
>>>> described in [RFC6335].
>>>> 
>>>> Assigned ports both System and User ports SHOULD NOT be used without
>>>> or prior to IANA registration.
>> 
>> 
>> Tom.
> 
> At Steve's request, I've filed a bug against libtirpc:
> 
> https://bugzilla.linux-nfs.org/show_bug.cgi?id=320
> 
> to document the issue and the rationales for alternate solutions.
> 
> I'm fairly confident that bindresvport(3) is never necessary for
> svc_tli_create(3). It is arguably also not appropriate for
> clnt_tli_create(3). Therefore IMO it should be removed from those code
> paths. That by itself would provide some relief and would eliminate
> the need to alter the current bindresvport(3) implementation (say, by
> introducing a blacklisting mechanism).
> 
> As Tom mentioned above, RFC 6335 describes a port range that will
> never interfere with service ports allocated by IANA. This range is
> known as the Dynamic or Private port range (49152 - 65535). On my
> systems, bind(3) already frequently allocates from the Dynamic port
> range purely by chance.
> 
> To completely prevent the accidental allocation of a well-known
> service port, a special internal wrapper for the bind(3) system call
> (similar to bindresvport(3)) can be constructed for libtirpc that
> allocates only from the Dynamic port range whenever a caller does not
> specify a port, making libtirpc adhere to the Best Common Practices
> outlined in RFC 6335, and thereby closing this window for user space
> RPC services completely.
> 
> A more thorough solution IMO would be to fix up the bind(3) system
> call to allocate only from the Dynamic port range whenever a caller
> does not specify a port. This would take care of not only user space
> RPC services but of all dynamically allocated ports on the system,
> including ports dynamically allocated in the kernel.

Replace "bind(3)" with "bind(2)" throughout. Oops.

--
Chuck Lever
chucklever@gmail.com



--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever March 6, 2018, 6:09 p.m. UTC | #14
On Thu, Feb 8, 2018 at 1:36 PM, Chuck Lever <chucklever@gmail.com> wrote:
>
>
>> On Feb 8, 2018, at 1:07 PM, Chuck Lever <chucklever@gmail.com> wrote:
>>
>> At Steve's request, I've filed a bug against libtirpc:
>>
>> https://bugzilla.linux-nfs.org/show_bug.cgi?id=320
>>
>> to document the issue and the rationales for alternate solutions.
>>
>> I'm fairly confident that bindresvport(3) is never necessary for
>> svc_tli_create(3). It is arguably also not appropriate for
>> clnt_tli_create(3). Therefore IMO it should be removed from those code
>> paths. That by itself would provide some relief and would eliminate
>> the need to alter the current bindresvport(3) implementation (say, by
>> introducing a blacklisting mechanism).
>>
>> As Tom mentioned above, RFC 6335 describes a port range that will
>> never interfere with service ports allocated by IANA. This range is
>> known as the Dynamic or Private port range (49152 - 65535). On my
>> systems, bind(3) already frequently allocates from the Dynamic port
>> range purely by chance.
>>
>> To completely prevent the accidental allocation of a well-known
>> service port, a special internal wrapper for the bind(3) system call
>> (similar to bindresvport(3)) can be constructed for libtirpc that
>> allocates only from the Dynamic port range whenever a caller does not
>> specify a port, making libtirpc adhere to the Best Common Practices
>> outlined in RFC 6335, and thereby closing this window for user space
>> RPC services completely.
>>
>> A more thorough solution IMO would be to fix up the bind(3) system
>> call to allocate only from the Dynamic port range whenever a caller
>> does not specify a port. This would take care of not only user space
>> RPC services but of all dynamically allocated ports on the system,
>> including ports dynamically allocated in the kernel.
>
> Replace "bind(3)" with "bind(2)" throughout. Oops.
>
> --
> Chuck Lever
> chucklever@gmail.com

As a first step, Steve has committed patches that address bugzilla 320.
This makes libtirpc avoid allocating reserved ports for RPC service
listeners and long-running clients that request a dynamically assigned
port. svc_tli_create(3) and clnt_tli_create(3) were wrong to use
bindresvport(3) for this purpose.

IMO this addresses the two cases Guillem Jover reported explicitly:
rpc.statd's listener, and the rpcbind CALLIT client. mountd on RHEL
systems appears to use 20048 (though it can use any dynamically-
assigned port) and lockd's service port is assigned by the kernel, so
it is typically outside the reserved port range.

Even so, I can see that making bindresvport(3) more particular
about skipping IANA-assigned ports in the reserved port range might
provide additional benefit/relief.

I checked with a pal on the Solaris NFS team to see if their user
space bindresvport(3) implementation was careful to avoid IANA-
assigned ports. He said it does not currently, but they have
considered it and probably will consider it again because there are
occasional requests for such functionality.

The question now is whether to consult /etc/services, a separate
blacklist, or both, when trolling for an unused reserved port. It seems
easiest to start with /etc/services; then perhaps introduce a blacklist
file/directory later if that is desirable. Other opinions?

Note that neither of these solutions addresses the largest consumer
of dynamically-assigned reserved ports: the kernel NFS client. The
only way we have to address that today is the "noresvport" mount
option. (We could make that the default for Kerberos mounts).
J. Bruce Fields March 8, 2018, 8:24 p.m. UTC | #15
On Tue, Mar 06, 2018 at 01:09:01PM -0500, Chuck Lever wrote:
> Note that neither of these solutions addresses the largest consumer
> of dynamically-assigned reserved ports: the kernel NFS client. The
> only way we have to address that today is the "noresvport" mount
> option. (We could make that the default for Kerberos mounts).

Makes sense to me.

Looks like knfsd's not helpful here, though: the export option
("secure"/"insecure") defaults to "secure", which always requires a low
port.  It should be easy to modify "secure" to mean "require low ports
only for auth_sys/auth_null", and that's probably the right thing to do.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/src/bindresvport.c b/src/bindresvport.c
index 2d8f2bc..98e5f40 100644
--- a/src/bindresvport.c
+++ b/src/bindresvport.c
@@ -40,6 +40,7 @@ 
 #include <netinet/in.h>
 
 #include <errno.h>
+#include <netdb.h>
 #include <string.h>
 #include <unistd.h>
 
@@ -73,12 +74,15 @@  bindresvport_sa(sd, sa)
         int sd;
         struct sockaddr *sa;
 {
-        int res, af;
+        int res, af, so_proto;
+        socklen_t so_proto_len;
         struct sockaddr_storage myaddr;
 	struct sockaddr_in *sin;
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
+	struct servent *se;
+	const char *se_proto;
 	u_int16_t *portp;
 	static u_int16_t port;
 	static short startport = STARTPORT;
@@ -125,6 +129,25 @@  bindresvport_sa(sd, sa)
         }
         sa->sa_family = af;
 
+        so_proto_len = sizeof(so_proto);
+        if (getsockopt(sd, SOL_SOCKET, SO_PROTOCOL, &so_proto, &so_proto_len) == -1) {
+                mutex_unlock(&port_lock);
+                return -1;      /* errno is correctly set */
+        }
+        switch (so_proto) {
+        case IPPROTO_UDP:
+        case IPPROTO_UDPLITE:
+                se_proto = "udp";
+                break;
+        case IPPROTO_TCP:
+                se_proto = "tcp";
+                break;
+        default:
+                errno = ENOPROTOOPT;
+                mutex_unlock(&port_lock);
+                return -1;
+        }
+
         if (port == 0) {
                 port = (getpid() % NPORTS) + STARTPORT;
         }
@@ -135,6 +158,9 @@  bindresvport_sa(sd, sa)
                 *portp = htons(port++);
                  if (port > endport) 
                         port = startport;
+                se = getservbyport(*portp, se_proto);
+                if (se != NULL)
+                        continue;
                 res = bind(sd, sa, salen);
 		if (res >= 0 || errno != EADDRINUSE)
 	                break;