diff mbox series

[net-next] ipv4: Use READ/WRITE_ONCE() for IP local_port_range

Message ID a4f5b61c9cd44eada41d8f09d3848806@AcuMS.aculab.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net-next] ipv4: Use READ/WRITE_ONCE() for IP local_port_range | expand

Commit Message

David Laight Nov. 29, 2023, 7:26 p.m. UTC
Commit 227b60f5102cd added a seqlock to ensure that the low and high
port numbers were always updated together.
This is overkill because the two 16bit port numbers can be held in
a u32 and read/written in a single instruction.

More recently 91d0b78c5177f added support for finer per-socket limits.
The user-supplied value is 'high << 16 | low' but they are held
separately and the socket options protected by the socket lock.

Use a u32 containing 'high << 16 | low' for both the 'net' and 'sk'
fields and use READ_ONCE()/WRITE_ONCE() to ensure both values are
always updated together.

Change (the now trival) inet_get_local_port_range() to a static inline
to optimise the calling code.
(In particular avoiding returning integers by reference.)

Signed-off-by: David Laight <david.laight@aculab.com>
---
 include/net/inet_sock.h         |  5 +----
 include/net/ip.h                |  7 ++++++-
 include/net/netns/ipv4.h        |  3 +--
 net/ipv4/af_inet.c              |  4 +---
 net/ipv4/inet_connection_sock.c | 29 ++++++++++------------------
 net/ipv4/ip_sockglue.c          | 34 ++++++++++++++++-----------------
 net/ipv4/sysctl_net_ipv4.c      | 12 ++++--------
 7 files changed, 40 insertions(+), 54 deletions(-)

Comments

Jakub Sitnicki Nov. 29, 2023, 8:17 p.m. UTC | #1
Hey David,

On Wed, Nov 29, 2023 at 07:26 PM GMT, David Laight wrote:
> Commit 227b60f5102cd added a seqlock to ensure that the low and high
> port numbers were always updated together.
> This is overkill because the two 16bit port numbers can be held in
> a u32 and read/written in a single instruction.
>
> More recently 91d0b78c5177f added support for finer per-socket limits.
> The user-supplied value is 'high << 16 | low' but they are held
> separately and the socket options protected by the socket lock.
>
> Use a u32 containing 'high << 16 | low' for both the 'net' and 'sk'
> fields and use READ_ONCE()/WRITE_ONCE() to ensure both values are
> always updated together.
>
> Change (the now trival) inet_get_local_port_range() to a static inline
> to optimise the calling code.
> (In particular avoiding returning integers by reference.)
>
> Signed-off-by: David Laight <david.laight@aculab.com>
> ---

Regarding the per-socket changes - we don't expect contention on sock
lock between inet_stream_connect / __inet_bind, where we grab it and
eventually call inet_sk_get_local_port_range, and sockopt handlers, do
we?

The motivation is not super clear for me for that of the changes.

>  include/net/inet_sock.h         |  5 +----
>  include/net/ip.h                |  7 ++++++-
>  include/net/netns/ipv4.h        |  3 +--
>  net/ipv4/af_inet.c              |  4 +---
>  net/ipv4/inet_connection_sock.c | 29 ++++++++++------------------
>  net/ipv4/ip_sockglue.c          | 34 ++++++++++++++++-----------------
>  net/ipv4/sysctl_net_ipv4.c      | 12 ++++--------
>  7 files changed, 40 insertions(+), 54 deletions(-)
>
> diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> index 74db6d97cae1..ebf71410aa2b 100644
> --- a/include/net/inet_sock.h
> +++ b/include/net/inet_sock.h
> @@ -234,10 +234,7 @@ struct inet_sock {
>  	int			uc_index;
>  	int			mc_index;
>  	__be32			mc_addr;
> -	struct {
> -		__u16 lo;
> -		__u16 hi;
> -	}			local_port_range;
> +	u32			local_port_range;

Nit: This field would benefit from a similar comment as you have added to
local_ports.range ("/* high << 16 | low */"), now that it is no longer
obvious how to interpret the contents.

>  
>  	struct ip_mc_socklist __rcu	*mc_list;
>  	struct inet_cork_full	cork;

[...]

> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index 394a498c2823..1a45d41f8b39 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -117,34 +117,25 @@ bool inet_rcv_saddr_any(const struct sock *sk)

[...]

>  void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
>  {
>  	const struct inet_sock *inet = inet_sk(sk);
>  	const struct net *net = sock_net(sk);
>  	int lo, hi, sk_lo, sk_hi;
> +	u32 sk_range;
>  
>  	inet_get_local_port_range(net, &lo, &hi);
>  
> -	sk_lo = inet->local_port_range.lo;
> -	sk_hi = inet->local_port_range.hi;
> +	sk_range = READ_ONCE(inet->local_port_range);
> +	if (unlikely(sk_range)) {
> +		sk_lo = sk_range & 0xffff;
> +		sk_hi = sk_range >> 16;
>  
> -	if (unlikely(lo <= sk_lo && sk_lo <= hi))
> -		lo = sk_lo;
> -	if (unlikely(lo <= sk_hi && sk_hi <= hi))
> -		hi = sk_hi;
> +		if (unlikely(lo <= sk_lo && sk_lo <= hi))
> +			lo = sk_lo;
> +		if (unlikely(lo <= sk_hi && sk_hi <= hi))
> +			hi = sk_hi;
> +	}

Actually when we know that sk_range is set, the above two branches
become likely. It will be usually so that the set per-socket port range
narrows down the per-netns port range.

These checks exist only in case the per-netns port range has been
reconfigured after per-socket port range has been set. The per-netns one
always takes precedence.

>  
>  	*low = lo;
>  	*high = hi;

[...]
Eric Dumazet Nov. 29, 2023, 9:06 p.m. UTC | #2
On Wed, Nov 29, 2023 at 8:26 PM David Laight <David.Laight@aculab.com> wrote:
>
> Commit 227b60f5102cd added a seqlock to ensure that the low and high
> port numbers were always updated together.
> This is overkill because the two 16bit port numbers can be held in
> a u32 and read/written in a single instruction.
>
> More recently 91d0b78c5177f added support for finer per-socket limits.
> The user-supplied value is 'high << 16 | low' but they are held
> separately and the socket options protected by the socket lock.
>
> Use a u32 containing 'high << 16 | low' for both the 'net' and 'sk'
> fields and use READ_ONCE()/WRITE_ONCE() to ensure both values are
> always updated together.
>
> Change (the now trival) inet_get_local_port_range() to a static inline
> to optimise the calling code.
> (In particular avoiding returning integers by reference.)
>
> Signed-off-by: David Laight <david.laight@aculab.com>

Nice, I had this patch on my TODO list :)
David Laight Nov. 30, 2023, 9:04 a.m. UTC | #3
From: Jakub Sitnicki
> Sent: 29 November 2023 20:17
> 
> Hey David,
> 
> On Wed, Nov 29, 2023 at 07:26 PM GMT, David Laight wrote:
> > Commit 227b60f5102cd added a seqlock to ensure that the low and high
> > port numbers were always updated together.
> > This is overkill because the two 16bit port numbers can be held in
> > a u32 and read/written in a single instruction.
> >
> > More recently 91d0b78c5177f added support for finer per-socket limits.
> > The user-supplied value is 'high << 16 | low' but they are held
> > separately and the socket options protected by the socket lock.
> >
> > Use a u32 containing 'high << 16 | low' for both the 'net' and 'sk'
> > fields and use READ_ONCE()/WRITE_ONCE() to ensure both values are
> > always updated together.
> >
> > Change (the now trival) inet_get_local_port_range() to a static inline
> > to optimise the calling code.
> > (In particular avoiding returning integers by reference.)
> >
> > Signed-off-by: David Laight <david.laight@aculab.com>
> > ---
> 
> Regarding the per-socket changes - we don't expect contention on sock
> lock between inet_stream_connect / __inet_bind, where we grab it and
> eventually call inet_sk_get_local_port_range, and sockopt handlers, do
> we?
> 
> The motivation is not super clear for me for that of the changes.

The locking in the getsockopt() code is actually quite horrid.
Look at the conditionals for the rntl lock.
It is conditionally acquired based on a function that sets a flag,
but released based on the exit path from the switch statement.

But there are only two options that need the rtnl lock and the socket
lock, and two trivial ones (including this one) that need the socket
lock.
So the code can be simplified by moving the locking into the case branches.
With only 2 such cases the overhead will be minimal (compared the to
setsockopt() case where a lot of options need locking.

This is all part of a big patchset I'm trying to write that converts
all the setsockopt code to take an 'unsigned int optlen' parameter
and return the length to pass back to the caller.
So the copy_to_user() of the updated length is done by the syscall
stub rather than inside every separate function (and sometimes in
multiple places in a function).

After all, if the copy fails EFAULT the application is broken.
It really doesn't matter if any side effects have happened.
If you get a fault reading from a pipe the data is lost.

> 
> >  include/net/inet_sock.h         |  5 +----
> >  include/net/ip.h                |  7 ++++++-
> >  include/net/netns/ipv4.h        |  3 +--
> >  net/ipv4/af_inet.c              |  4 +---
> >  net/ipv4/inet_connection_sock.c | 29 ++++++++++------------------
> >  net/ipv4/ip_sockglue.c          | 34 ++++++++++++++++-----------------
> >  net/ipv4/sysctl_net_ipv4.c      | 12 ++++--------
> >  7 files changed, 40 insertions(+), 54 deletions(-)
> >
> > diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> > index 74db6d97cae1..ebf71410aa2b 100644
> > --- a/include/net/inet_sock.h
> > +++ b/include/net/inet_sock.h
> > @@ -234,10 +234,7 @@ struct inet_sock {
> >  	int			uc_index;
> >  	int			mc_index;
> >  	__be32			mc_addr;
> > -	struct {
> > -		__u16 lo;
> > -		__u16 hi;
> > -	}			local_port_range;
> > +	u32			local_port_range;
> 
> Nit: This field would benefit from a similar comment as you have added to
> local_ports.range ("/* high << 16 | low */"), now that it is no longer
> obvious how to interpret the contents.
> 
> >
> >  	struct ip_mc_socklist __rcu	*mc_list;
> >  	struct inet_cork_full	cork;
> 
> [...]
> 
> > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> > index 394a498c2823..1a45d41f8b39 100644
> > --- a/net/ipv4/inet_connection_sock.c
> > +++ b/net/ipv4/inet_connection_sock.c
> > @@ -117,34 +117,25 @@ bool inet_rcv_saddr_any(const struct sock *sk)
> 
> [...]
> 
> >  void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
> >  {
> >  	const struct inet_sock *inet = inet_sk(sk);
> >  	const struct net *net = sock_net(sk);
> >  	int lo, hi, sk_lo, sk_hi;
> > +	u32 sk_range;
> >
> >  	inet_get_local_port_range(net, &lo, &hi);
> >
> > -	sk_lo = inet->local_port_range.lo;
> > -	sk_hi = inet->local_port_range.hi;
> > +	sk_range = READ_ONCE(inet->local_port_range);
> > +	if (unlikely(sk_range)) {
> > +		sk_lo = sk_range & 0xffff;
> > +		sk_hi = sk_range >> 16;
> >
> > -	if (unlikely(lo <= sk_lo && sk_lo <= hi))
> > -		lo = sk_lo;
> > -	if (unlikely(lo <= sk_hi && sk_hi <= hi))
> > -		hi = sk_hi;
> > +		if (unlikely(lo <= sk_lo && sk_lo <= hi))
> > +			lo = sk_lo;
> > +		if (unlikely(lo <= sk_hi && sk_hi <= hi))
> > +			hi = sk_hi;
> > +	}
> 
> Actually when we know that sk_range is set, the above two branches
> become likely. It will be usually so that the set per-socket port range
> narrows down the per-netns port range.

True, I'd left them alone, but would also flip the first conditional.

I can edit the patch :-)

	David

> 
> These checks exist only in case the per-netns port range has been
> reconfigured after per-socket port range has been set. The per-netns one
> always takes precedence.
> 
> >
> >  	*low = lo;
> >  	*high = hi;
> 
> [...]

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Eric Dumazet Nov. 30, 2023, 9:28 a.m. UTC | #4
On Wed, Nov 29, 2023 at 8:26 PM David Laight <David.Laight@aculab.com> wrote:
>
> Commit 227b60f5102cd added a seqlock to ensure that the low and high
> port numbers were always updated together.
> This is overkill because the two 16bit port numbers can be held in
> a u32 and read/written in a single instruction.
>
> More recently 91d0b78c5177f added support for finer per-socket limits.
> The user-supplied value is 'high << 16 | low' but they are held
> separately and the socket options protected by the socket lock.
>
> Use a u32 containing 'high << 16 | low' for both the 'net' and 'sk'
> fields and use READ_ONCE()/WRITE_ONCE() to ensure both values are
> always updated together.
>
> Change (the now trival) inet_get_local_port_range() to a static inline
> to optimise the calling code.
> (In particular avoiding returning integers by reference.)
>
> Signed-off-by: David Laight <david.laight@aculab.com>
> ---
>  include/net/inet_sock.h         |  5 +----
>  include/net/ip.h                |  7 ++++++-
>  include/net/netns/ipv4.h        |  3 +--
>  net/ipv4/af_inet.c              |  4 +---
>  net/ipv4/inet_connection_sock.c | 29 ++++++++++------------------
>  net/ipv4/ip_sockglue.c          | 34 ++++++++++++++++-----------------
>  net/ipv4/sysctl_net_ipv4.c      | 12 ++++--------
>  7 files changed, 40 insertions(+), 54 deletions(-)
>
> diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> index 74db6d97cae1..ebf71410aa2b 100644
> --- a/include/net/inet_sock.h
> +++ b/include/net/inet_sock.h
> @@ -234,10 +234,7 @@ struct inet_sock {
>         int                     uc_index;
>         int                     mc_index;
>         __be32                  mc_addr;
> -       struct {
> -               __u16 lo;
> -               __u16 hi;
> -       }                       local_port_range;
> +       u32                     local_port_range;
>
>         struct ip_mc_socklist __rcu     *mc_list;
>         struct inet_cork_full   cork;
> diff --git a/include/net/ip.h b/include/net/ip.h
> index 1fc4c8d69e33..154f9dd75fe6 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -349,7 +349,12 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o
>         } \
>  }
>
> -void inet_get_local_port_range(const struct net *net, int *low, int *high);
> +static inline void inet_get_local_port_range(const struct net *net, int *low, int *high)
> +{
> +       u32 range = READ_ONCE(net->ipv4.ip_local_ports.range);

Please insert an empty line here.

> +       *low = range & 0xffff;
> +       *high = range >> 16;
> +}
>  void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
>
>  #ifdef CONFIG_SYSCTL
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index 73f43f699199..30ba5359da84 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -19,8 +19,7 @@ struct hlist_head;
>  struct fib_table;
>  struct sock;
>  struct local_ports {
> -       seqlock_t       lock;
> -       int             range[2];
> +       u32             range;  /* high << 16 | low */
>         bool            warned;
>  };
>
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index fb81de10d332..b8964b40c3c0 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -1847,9 +1847,7 @@ static __net_init int inet_init_net(struct net *net)
>         /*
>          * Set defaults for local port range
>          */
> -       seqlock_init(&net->ipv4.ip_local_ports.lock);
> -       net->ipv4.ip_local_ports.range[0] =  32768;
> -       net->ipv4.ip_local_ports.range[1] =  60999;
> +       net->ipv4.ip_local_ports.range =  60999 << 16 | 32768;
>
>         seqlock_init(&net->ipv4.ping_group_range.lock);
>         /*
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index 394a498c2823..1a45d41f8b39 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -117,34 +117,25 @@ bool inet_rcv_saddr_any(const struct sock *sk)
>         return !sk->sk_rcv_saddr;
>  }
>
> -void inet_get_local_port_range(const struct net *net, int *low, int *high)
> -{
> -       unsigned int seq;
> -
> -       do {
> -               seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
> -
> -               *low = net->ipv4.ip_local_ports.range[0];
> -               *high = net->ipv4.ip_local_ports.range[1];
> -       } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
> -}
> -EXPORT_SYMBOL(inet_get_local_port_range);
> -
>  void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
>  {
>         const struct inet_sock *inet = inet_sk(sk);
>         const struct net *net = sock_net(sk);
>         int lo, hi, sk_lo, sk_hi;
> +       u32 sk_range;
>
>         inet_get_local_port_range(net, &lo, &hi);
>
> -       sk_lo = inet->local_port_range.lo;
> -       sk_hi = inet->local_port_range.hi;
> +       sk_range = READ_ONCE(inet->local_port_range);
> +       if (unlikely(sk_range)) {
> +               sk_lo = sk_range & 0xffff;
> +               sk_hi = sk_range >> 16;
>
> -       if (unlikely(lo <= sk_lo && sk_lo <= hi))
> -               lo = sk_lo;
> -       if (unlikely(lo <= sk_hi && sk_hi <= hi))
> -               hi = sk_hi;
> +               if (unlikely(lo <= sk_lo && sk_lo <= hi))
> +                       lo = sk_lo;
> +               if (unlikely(lo <= sk_hi && sk_hi <= hi))
> +                       hi = sk_hi;
> +       }
>
>         *low = lo;
>         *high = hi;
> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
> index 2efc53526a38..bf940fe249a5 100644
> --- a/net/ipv4/ip_sockglue.c
> +++ b/net/ipv4/ip_sockglue.c
> @@ -1055,6 +1055,20 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
>         case IP_TOS:    /* This sets both TOS and Precedence */
>                 ip_sock_set_tos(sk, val);
>                 return 0;
> +
> +       case IP_LOCAL_PORT_RANGE:
> +       {
> +               const __u16 lo = val;
> +               const __u16 hi = val >> 16;

I know that we use __u16 and __u32 already, but I think we should
reserve them for exported fields in uapi.

New code  should use u16 and u32, no need for __ prefixes.

> +
> +               if (optlen != sizeof(__u32))
> +                       return -EINVAL;
> +               if (lo != 0 && hi != 0 && lo > hi)
> +                       return -EINVAL;
> +
> +               WRITE_ONCE(inet->local_port_range, val);
> +               return 0;
> +       }
>
Jakub Sitnicki Nov. 30, 2023, 9:29 a.m. UTC | #5
On Thu, Nov 30, 2023 at 09:04 AM GMT, David Laight wrote:
> From: Jakub Sitnicki
>> Sent: 29 November 2023 20:17
>> 
>> Hey David,
>> 
>> On Wed, Nov 29, 2023 at 07:26 PM GMT, David Laight wrote:
>> > Commit 227b60f5102cd added a seqlock to ensure that the low and high
>> > port numbers were always updated together.
>> > This is overkill because the two 16bit port numbers can be held in
>> > a u32 and read/written in a single instruction.
>> >
>> > More recently 91d0b78c5177f added support for finer per-socket limits.
>> > The user-supplied value is 'high << 16 | low' but they are held
>> > separately and the socket options protected by the socket lock.
>> >
>> > Use a u32 containing 'high << 16 | low' for both the 'net' and 'sk'
>> > fields and use READ_ONCE()/WRITE_ONCE() to ensure both values are
>> > always updated together.
>> >
>> > Change (the now trival) inet_get_local_port_range() to a static inline
>> > to optimise the calling code.
>> > (In particular avoiding returning integers by reference.)
>> >
>> > Signed-off-by: David Laight <david.laight@aculab.com>
>> > ---
>> 
>> Regarding the per-socket changes - we don't expect contention on sock
>> lock between inet_stream_connect / __inet_bind, where we grab it and
>> eventually call inet_sk_get_local_port_range, and sockopt handlers, do
>> we?
>> 
>> The motivation is not super clear for me for that of the changes.
>
> The locking in the getsockopt() code is actually quite horrid.
> Look at the conditionals for the rntl lock.
> It is conditionally acquired based on a function that sets a flag,
> but released based on the exit path from the switch statement.
>
> But there are only two options that need the rtnl lock and the socket
> lock, and two trivial ones (including this one) that need the socket
> lock.
> So the code can be simplified by moving the locking into the case branches.
> With only 2 such cases the overhead will be minimal (compared the to
> setsockopt() case where a lot of options need locking.
>
> This is all part of a big patchset I'm trying to write that converts
> all the setsockopt code to take an 'unsigned int optlen' parameter
> and return the length to pass back to the caller.
> So the copy_to_user() of the updated length is done by the syscall
> stub rather than inside every separate function (and sometimes in
> multiple places in a function).
>
> After all, if the copy fails EFAULT the application is broken.
> It really doesn't matter if any side effects have happened.
> If you get a fault reading from a pipe the data is lost.

OK, so you're trying to refactor the setsockopt handler. Now it makes
more sense what is driving this work. Thanks for the context.

[...]
Mat Martineau Dec. 1, 2023, 11:45 p.m. UTC | #6
On Wed, 29 Nov 2023, David Laight wrote:

> Commit 227b60f5102cd added a seqlock to ensure that the low and high
> port numbers were always updated together.
> This is overkill because the two 16bit port numbers can be held in
> a u32 and read/written in a single instruction.
>
> More recently 91d0b78c5177f added support for finer per-socket limits.
> The user-supplied value is 'high << 16 | low' but they are held
> separately and the socket options protected by the socket lock.
>
> Use a u32 containing 'high << 16 | low' for both the 'net' and 'sk'
> fields and use READ_ONCE()/WRITE_ONCE() to ensure both values are
> always updated together.
>
> Change (the now trival) inet_get_local_port_range() to a static inline
> to optimise the calling code.
> (In particular avoiding returning integers by reference.)
>
> Signed-off-by: David Laight <david.laight@aculab.com>
> ---
> include/net/inet_sock.h         |  5 +----
> include/net/ip.h                |  7 ++++++-
> include/net/netns/ipv4.h        |  3 +--
> net/ipv4/af_inet.c              |  4 +---
> net/ipv4/inet_connection_sock.c | 29 ++++++++++------------------
> net/ipv4/ip_sockglue.c          | 34 ++++++++++++++++-----------------
> net/ipv4/sysctl_net_ipv4.c      | 12 ++++--------
> 7 files changed, 40 insertions(+), 54 deletions(-)
>
> diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> index 74db6d97cae1..ebf71410aa2b 100644
> --- a/include/net/inet_sock.h
> +++ b/include/net/inet_sock.h
> @@ -234,10 +234,7 @@ struct inet_sock {
> 	int			uc_index;
> 	int			mc_index;
> 	__be32			mc_addr;
> -	struct {
> -		__u16 lo;
> -		__u16 hi;
> -	}			local_port_range;
> +	u32			local_port_range;
>
> 	struct ip_mc_socklist __rcu	*mc_list;
> 	struct inet_cork_full	cork;
> diff --git a/include/net/ip.h b/include/net/ip.h
> index 1fc4c8d69e33..154f9dd75fe6 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -349,7 +349,12 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o
> 	} \
> }
>
> -void inet_get_local_port_range(const struct net *net, int *low, int *high);
> +static inline void inet_get_local_port_range(const struct net *net, int *low, int *high)
> +{
> +	u32 range = READ_ONCE(net->ipv4.ip_local_ports.range);
> +	*low = range & 0xffff;
> +	*high = range >> 16;
> +}
> void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
>
> #ifdef CONFIG_SYSCTL
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index 73f43f699199..30ba5359da84 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -19,8 +19,7 @@ struct hlist_head;
> struct fib_table;
> struct sock;
> struct local_ports {
> -	seqlock_t	lock;
> -	int		range[2];
> +	u32		range;	/* high << 16 | low */
> 	bool		warned;
> };
>
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index fb81de10d332..b8964b40c3c0 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -1847,9 +1847,7 @@ static __net_init int inet_init_net(struct net *net)
> 	/*
> 	 * Set defaults for local port range
> 	 */
> -	seqlock_init(&net->ipv4.ip_local_ports.lock);
> -	net->ipv4.ip_local_ports.range[0] =  32768;
> -	net->ipv4.ip_local_ports.range[1] =  60999;
> +	net->ipv4.ip_local_ports.range =  60999 << 16 | 32768;

Hi David -

Better to use unsigned integer constants here, since 60999 << 16 doesn't 
fit in a signed int on 32-bit platforms.

>
> 	seqlock_init(&net->ipv4.ping_group_range.lock);
> 	/*
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index 394a498c2823..1a45d41f8b39 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -117,34 +117,25 @@ bool inet_rcv_saddr_any(const struct sock *sk)
> 	return !sk->sk_rcv_saddr;
> }
>
> -void inet_get_local_port_range(const struct net *net, int *low, int *high)
> -{
> -	unsigned int seq;
> -
> -	do {
> -		seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
> -
> -		*low = net->ipv4.ip_local_ports.range[0];
> -		*high = net->ipv4.ip_local_ports.range[1];
> -	} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
> -}
> -EXPORT_SYMBOL(inet_get_local_port_range);
> -
> void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
> {
> 	const struct inet_sock *inet = inet_sk(sk);
> 	const struct net *net = sock_net(sk);
> 	int lo, hi, sk_lo, sk_hi;
> +	u32 sk_range;
>
> 	inet_get_local_port_range(net, &lo, &hi);
>
> -	sk_lo = inet->local_port_range.lo;
> -	sk_hi = inet->local_port_range.hi;
> +	sk_range = READ_ONCE(inet->local_port_range);
> +	if (unlikely(sk_range)) {
> +		sk_lo = sk_range & 0xffff;
> +		sk_hi = sk_range >> 16;
>
> -	if (unlikely(lo <= sk_lo && sk_lo <= hi))
> -		lo = sk_lo;
> -	if (unlikely(lo <= sk_hi && sk_hi <= hi))
> -		hi = sk_hi;
> +		if (unlikely(lo <= sk_lo && sk_lo <= hi))
> +			lo = sk_lo;
> +		if (unlikely(lo <= sk_hi && sk_hi <= hi))
> +			hi = sk_hi;
> +	}
>
> 	*low = lo;
> 	*high = hi;
> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
> index 2efc53526a38..bf940fe249a5 100644
> --- a/net/ipv4/ip_sockglue.c
> +++ b/net/ipv4/ip_sockglue.c
> @@ -1055,6 +1055,20 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
> 	case IP_TOS:	/* This sets both TOS and Precedence */
> 		ip_sock_set_tos(sk, val);
> 		return 0;
> +
> +	case IP_LOCAL_PORT_RANGE:
> +	{
> +		const __u16 lo = val;
> +		const __u16 hi = val >> 16;

Suggest casting 'val' to an unsigned int before shifting right, even 
though assigning to a __u16 will mask off any surprising bits introduced 
by an arithmetic right shift of a 32-bit signed int.

> +
> +		if (optlen != sizeof(__u32))
> +			return -EINVAL;
> +		if (lo != 0 && hi != 0 && lo > hi)
> +			return -EINVAL;
> +
> +		WRITE_ONCE(inet->local_port_range, val);
> +		return 0;
> +	}
> 	}
>
> 	err = 0;
> @@ -1332,20 +1346,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
> 		err = xfrm_user_policy(sk, optname, optval, optlen);
> 		break;
>
> -	case IP_LOCAL_PORT_RANGE:
> -	{
> -		const __u16 lo = val;
> -		const __u16 hi = val >> 16;
> -
> -		if (optlen != sizeof(__u32))
> -			goto e_inval;
> -		if (lo != 0 && hi != 0 && lo > hi)
> -			goto e_inval;
> -
> -		inet->local_port_range.lo = lo;
> -		inet->local_port_range.hi = hi;
> -		break;
> -	}
> 	default:
> 		err = -ENOPROTOOPT;
> 		break;
> @@ -1692,6 +1692,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
> 			return -EFAULT;
> 		return 0;
> 	}
> +	case IP_LOCAL_PORT_RANGE:
> +		val = READ_ONCE(inet->local_port_range);
> +		goto copyval;
> 	}
>
> 	if (needs_rtnl)
> @@ -1721,9 +1724,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
> 		else
> 			err = ip_get_mcast_msfilter(sk, optval, optlen, len);
> 		goto out;
> -	case IP_LOCAL_PORT_RANGE:
> -		val = inet->local_port_range.hi << 16 | inet->local_port_range.lo;
> -		break;
> 	case IP_PROTOCOL:
> 		val = inet_sk(sk)->inet_num;
> 		break;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index f63a545a7374..b008b6b5e85d 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -54,22 +54,18 @@ static void set_local_port_range(struct net *net, int range[2])
> {
> 	bool same_parity = !((range[0] ^ range[1]) & 1);
>
> -	write_seqlock_bh(&net->ipv4.ip_local_ports.lock);
> 	if (same_parity && !net->ipv4.ip_local_ports.warned) {
> 		net->ipv4.ip_local_ports.warned = true;
> 		pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n");
> 	}
> -	net->ipv4.ip_local_ports.range[0] = range[0];
> -	net->ipv4.ip_local_ports.range[1] = range[1];
> -	write_sequnlock_bh(&net->ipv4.ip_local_ports.lock);
> +	WRITE_ONCE(net->ipv4.ip_local_ports.range, range[1] << 16 | range[0]);

Similar, make sure the value is cast to unsigned before shifting here.


- Mat
David Laight Dec. 3, 2023, 11:44 a.m. UTC | #7
...
> > +	net->ipv4.ip_local_ports.range =  60999 << 16 | 32768;
> 
> Hi David -
> 
> Better to use unsigned integer constants here, since 60999 << 16 doesn't
> fit in a signed int on 32-bit platforms.

Or 64bit Linux for that matter.
I'll drop in a couple of u.

...
> > +	case IP_LOCAL_PORT_RANGE:
> > +	{
> > +		const __u16 lo = val;
> > +		const __u16 hi = val >> 16;
> 
> Suggest casting 'val' to an unsigned int before shifting right, even
> though assigning to a __u16 will mask off any surprising bits introduced
> by an arithmetic right shift of a 32-bit signed int.
> 
> > +
> > +		if (optlen != sizeof(__u32))
> > +			return -EINVAL;
> > +		if (lo != 0 && hi != 0 && lo > hi)
> > +			return -EINVAL;

I'd rather leave that block alone since it is just moved from
further down the file.
Although I may remove the 'const __'.

...
> > @@ -54,22 +54,18 @@ static void set_local_port_range(struct net *net, int range[2])
> > {
> > 	bool same_parity = !((range[0] ^ range[1]) & 1);
> >
> > -	write_seqlock_bh(&net->ipv4.ip_local_ports.lock);
> > 	if (same_parity && !net->ipv4.ip_local_ports.warned) {
> > 		net->ipv4.ip_local_ports.warned = true;
> > 		pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end
> values.\n");
> > 	}
> > -	net->ipv4.ip_local_ports.range[0] = range[0];
> > -	net->ipv4.ip_local_ports.range[1] = range[1];
> > -	write_sequnlock_bh(&net->ipv4.ip_local_ports.lock);
> > +	WRITE_ONCE(net->ipv4.ip_local_ports.range, range[1] << 16 | range[0]);
> 
> Similar, make sure the value is cast to unsigned before shifting here.

I think I'll pass the port numbers as two 'unsigned int' values
rather than 'int range[2]'.
Passing them at u16 doesn't work.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
diff mbox series

Patch

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 74db6d97cae1..ebf71410aa2b 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -234,10 +234,7 @@  struct inet_sock {
 	int			uc_index;
 	int			mc_index;
 	__be32			mc_addr;
-	struct {
-		__u16 lo;
-		__u16 hi;
-	}			local_port_range;
+	u32			local_port_range;
 
 	struct ip_mc_socklist __rcu	*mc_list;
 	struct inet_cork_full	cork;
diff --git a/include/net/ip.h b/include/net/ip.h
index 1fc4c8d69e33..154f9dd75fe6 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -349,7 +349,12 @@  static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o
 	} \
 }
 
-void inet_get_local_port_range(const struct net *net, int *low, int *high);
+static inline void inet_get_local_port_range(const struct net *net, int *low, int *high)
+{
+	u32 range = READ_ONCE(net->ipv4.ip_local_ports.range);
+	*low = range & 0xffff;
+	*high = range >> 16;
+}
 void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
 
 #ifdef CONFIG_SYSCTL
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 73f43f699199..30ba5359da84 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -19,8 +19,7 @@  struct hlist_head;
 struct fib_table;
 struct sock;
 struct local_ports {
-	seqlock_t	lock;
-	int		range[2];
+	u32		range;	/* high << 16 | low */
 	bool		warned;
 };
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index fb81de10d332..b8964b40c3c0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1847,9 +1847,7 @@  static __net_init int inet_init_net(struct net *net)
 	/*
 	 * Set defaults for local port range
 	 */
-	seqlock_init(&net->ipv4.ip_local_ports.lock);
-	net->ipv4.ip_local_ports.range[0] =  32768;
-	net->ipv4.ip_local_ports.range[1] =  60999;
+	net->ipv4.ip_local_ports.range =  60999 << 16 | 32768;
 
 	seqlock_init(&net->ipv4.ping_group_range.lock);
 	/*
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 394a498c2823..1a45d41f8b39 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -117,34 +117,25 @@  bool inet_rcv_saddr_any(const struct sock *sk)
 	return !sk->sk_rcv_saddr;
 }
 
-void inet_get_local_port_range(const struct net *net, int *low, int *high)
-{
-	unsigned int seq;
-
-	do {
-		seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
-
-		*low = net->ipv4.ip_local_ports.range[0];
-		*high = net->ipv4.ip_local_ports.range[1];
-	} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
-}
-EXPORT_SYMBOL(inet_get_local_port_range);
-
 void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
 {
 	const struct inet_sock *inet = inet_sk(sk);
 	const struct net *net = sock_net(sk);
 	int lo, hi, sk_lo, sk_hi;
+	u32 sk_range;
 
 	inet_get_local_port_range(net, &lo, &hi);
 
-	sk_lo = inet->local_port_range.lo;
-	sk_hi = inet->local_port_range.hi;
+	sk_range = READ_ONCE(inet->local_port_range);
+	if (unlikely(sk_range)) {
+		sk_lo = sk_range & 0xffff;
+		sk_hi = sk_range >> 16;
 
-	if (unlikely(lo <= sk_lo && sk_lo <= hi))
-		lo = sk_lo;
-	if (unlikely(lo <= sk_hi && sk_hi <= hi))
-		hi = sk_hi;
+		if (unlikely(lo <= sk_lo && sk_lo <= hi))
+			lo = sk_lo;
+		if (unlikely(lo <= sk_hi && sk_hi <= hi))
+			hi = sk_hi;
+	}
 
 	*low = lo;
 	*high = hi;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 2efc53526a38..bf940fe249a5 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1055,6 +1055,20 @@  int do_ip_setsockopt(struct sock *sk, int level, int optname,
 	case IP_TOS:	/* This sets both TOS and Precedence */
 		ip_sock_set_tos(sk, val);
 		return 0;
+
+	case IP_LOCAL_PORT_RANGE:
+	{
+		const __u16 lo = val;
+		const __u16 hi = val >> 16;
+
+		if (optlen != sizeof(__u32))
+			return -EINVAL;
+		if (lo != 0 && hi != 0 && lo > hi)
+			return -EINVAL;
+
+		WRITE_ONCE(inet->local_port_range, val);
+		return 0;
+	}
 	}
 
 	err = 0;
@@ -1332,20 +1346,6 @@  int do_ip_setsockopt(struct sock *sk, int level, int optname,
 		err = xfrm_user_policy(sk, optname, optval, optlen);
 		break;
 
-	case IP_LOCAL_PORT_RANGE:
-	{
-		const __u16 lo = val;
-		const __u16 hi = val >> 16;
-
-		if (optlen != sizeof(__u32))
-			goto e_inval;
-		if (lo != 0 && hi != 0 && lo > hi)
-			goto e_inval;
-
-		inet->local_port_range.lo = lo;
-		inet->local_port_range.hi = hi;
-		break;
-	}
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -1692,6 +1692,9 @@  int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			return -EFAULT;
 		return 0;
 	}
+	case IP_LOCAL_PORT_RANGE:
+		val = READ_ONCE(inet->local_port_range);
+		goto copyval;
 	}
 
 	if (needs_rtnl)
@@ -1721,9 +1724,6 @@  int do_ip_getsockopt(struct sock *sk, int level, int optname,
 		else
 			err = ip_get_mcast_msfilter(sk, optval, optlen, len);
 		goto out;
-	case IP_LOCAL_PORT_RANGE:
-		val = inet->local_port_range.hi << 16 | inet->local_port_range.lo;
-		break;
 	case IP_PROTOCOL:
 		val = inet_sk(sk)->inet_num;
 		break;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index f63a545a7374..b008b6b5e85d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -54,22 +54,18 @@  static void set_local_port_range(struct net *net, int range[2])
 {
 	bool same_parity = !((range[0] ^ range[1]) & 1);
 
-	write_seqlock_bh(&net->ipv4.ip_local_ports.lock);
 	if (same_parity && !net->ipv4.ip_local_ports.warned) {
 		net->ipv4.ip_local_ports.warned = true;
 		pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n");
 	}
-	net->ipv4.ip_local_ports.range[0] = range[0];
-	net->ipv4.ip_local_ports.range[1] = range[1];
-	write_sequnlock_bh(&net->ipv4.ip_local_ports.lock);
+	WRITE_ONCE(net->ipv4.ip_local_ports.range, range[1] << 16 | range[0]);
 }
 
 /* Validate changes from /proc interface. */
 static int ipv4_local_port_range(struct ctl_table *table, int write,
 				 void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct net *net =
-		container_of(table->data, struct net, ipv4.ip_local_ports.range);
+	struct net *net = table->data;
 	int ret;
 	int range[2];
 	struct ctl_table tmp = {
@@ -733,8 +729,8 @@  static struct ctl_table ipv4_net_table[] = {
 	},
 	{
 		.procname	= "ip_local_port_range",
-		.maxlen		= sizeof(init_net.ipv4.ip_local_ports.range),
-		.data		= &init_net.ipv4.ip_local_ports.range,
+		.maxlen		= 0,
+		.data		= &init_net,
 		.mode		= 0644,
 		.proc_handler	= ipv4_local_port_range,
 	},