diff mbox series

[bpf-next,v2,3/6] net: remove duplicate reuseport_lookup functions

Message ID 20230613-so-reuseport-v2-3-b7c69a342613@isovalent.com (mailing list archive)
State New
Headers show
Series Add SO_REUSEPORT support for TC bpf_sk_assign | expand

Commit Message

Lorenz Bauer June 13, 2023, 10:14 a.m. UTC
There are currently four copies of reuseport_lookup: one each for
(TCP, UDP)x(IPv4, IPv6). This forces us to duplicate all callers of
those functions as well. This is already the case for sk_lookup
helpers (inet,inet6,udp4,udp6)_lookup_run_bpf.

The only difference between the reuseport_lookup helpers is calling
a different hash function. Cut down the number of reuseport_lookup
functions to one per IP version by using the INDIRECT_CALL
infrastructure.

Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
---
 include/net/inet6_hashtables.h | 11 ++++++++++-
 include/net/inet_hashtables.h  | 15 +++++++++-----
 net/ipv4/inet_hashtables.c     | 22 ++++++++++++++-------
 net/ipv4/udp.c                 | 37 +++++++++++-----------------------
 net/ipv6/inet6_hashtables.c    | 16 +++++++++++----
 net/ipv6/udp.c                 | 45 +++++++++++++++---------------------------
 6 files changed, 75 insertions(+), 71 deletions(-)

Comments

Simon Horman June 13, 2023, 3:32 p.m. UTC | #1
On Tue, Jun 13, 2023 at 11:14:58AM +0100, Lorenz Bauer wrote:

...

> @@ -332,6 +332,10 @@ static inline int compute_score(struct sock *sk, struct net *net,
>  	return score;
>  }
>  
> +INDIRECT_CALLABLE_DECLARE(u32 udp_ehashfn(const struct net *,
> +					  const __be32, const __u16,
> +					  const __be32, const __be16));
> +

Hi Lorenz,

Would this be better placed in a header file?
GCC complains that in udp.c this function is neither static nor
has a prototype.

Liksewise for udp6_ehashfn()

...
kernel test robot June 13, 2023, 5:26 p.m. UTC | #2
Hi Lorenz,

kernel test robot noticed the following build warnings:

[auto build test WARNING on 25085b4e9251c77758964a8e8651338972353642]

url:    https://github.com/intel-lab-lkp/linux/commits/Lorenz-Bauer/net-export-inet_lookup_reuseport-and-inet6_lookup_reuseport/20230613-181619
base:   25085b4e9251c77758964a8e8651338972353642
patch link:    https://lore.kernel.org/r/20230613-so-reuseport-v2-3-b7c69a342613%40isovalent.com
patch subject: [PATCH bpf-next v2 3/6] net: remove duplicate reuseport_lookup functions
config: i386-defconfig (https://download.01.org/0day-ci/archive/20230614/202306140138.DnwjedJ1-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build):
        git checkout 25085b4e9251c77758964a8e8651338972353642
        b4 shazam https://lore.kernel.org/r/20230613-so-reuseport-v2-3-b7c69a342613@isovalent.com
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=i386 olddefconfig
        make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash net/ipv4/ net/ipv6/

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202306140138.DnwjedJ1-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> net/ipv4/udp.c:409:5: warning: no previous prototype for 'udp_ehashfn' [-Wmissing-prototypes]
     409 | u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
         |     ^~~~~~~~~~~
--
>> net/ipv6/udp.c:74:5: warning: no previous prototype for 'udp6_ehashfn' [-Wmissing-prototypes]
      74 | u32 udp6_ehashfn(const struct net *net,
         |     ^~~~~~~~~~~~


vim +/udp_ehashfn +409 net/ipv4/udp.c

   407	
   408	INDIRECT_CALLABLE_SCOPE
 > 409	u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
   410			const __be32 faddr, const __be16 fport)
   411	{
   412		static u32 udp_ehash_secret __read_mostly;
   413	
   414		net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));
   415	
   416		return __inet_ehashfn(laddr, lport, faddr, fport,
   417				      udp_ehash_secret + net_hash_mix(net));
   418	}
   419
Kuniyuki Iwashima June 13, 2023, 6:56 p.m. UTC | #3
From: Lorenz Bauer <lmb@isovalent.com>
Date: Tue, 13 Jun 2023 11:14:58 +0100
> There are currently four copies of reuseport_lookup: one each for
> (TCP, UDP)x(IPv4, IPv6). This forces us to duplicate all callers of
> those functions as well. This is already the case for sk_lookup
> helpers (inet,inet6,udp4,udp6)_lookup_run_bpf.
> 
> The only difference between the reuseport_lookup helpers is calling
> a different hash function. Cut down the number of reuseport_lookup
> functions to one per IP version by using the INDIRECT_CALL
> infrastructure.
> 
> Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
> ---
>  include/net/inet6_hashtables.h | 11 ++++++++++-
>  include/net/inet_hashtables.h  | 15 +++++++++-----
>  net/ipv4/inet_hashtables.c     | 22 ++++++++++++++-------
>  net/ipv4/udp.c                 | 37 +++++++++++-----------------------
>  net/ipv6/inet6_hashtables.c    | 16 +++++++++++----
>  net/ipv6/udp.c                 | 45 +++++++++++++++---------------------------
>  6 files changed, 75 insertions(+), 71 deletions(-)
> 
> diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
> index 032ddab48f8f..49d586454287 100644
> --- a/include/net/inet6_hashtables.h
> +++ b/include/net/inet6_hashtables.h
> @@ -48,12 +48,21 @@ struct sock *__inet6_lookup_established(struct net *net,
>  					const u16 hnum, const int dif,
>  					const int sdif);
>  
> +typedef u32 (*inet6_ehashfn_t)(const struct net *net,
> +			       const struct in6_addr *laddr, const u16 lport,
> +			       const struct in6_addr *faddr, const __be16 fport);
> +
> +u32 inet6_ehashfn(const struct net *net,
> +		  const struct in6_addr *laddr, const u16 lport,
> +		  const struct in6_addr *faddr, const __be16 fport);
> +
>  struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk,
>  				    struct sk_buff *skb, int doff,
>  				    const struct in6_addr *saddr,
>  				    __be16 sport,
>  				    const struct in6_addr *daddr,
> -				    unsigned short hnum);
> +				    unsigned short hnum,
> +				    inet6_ehashfn_t ehashfn);
>  
>  struct sock *inet6_lookup_listener(struct net *net,
>  				   struct inet_hashinfo *hashinfo,
> diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
> index 8734f3488f5d..51ab6a1a3601 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -379,10 +379,19 @@ struct sock *__inet_lookup_established(struct net *net,
>  				       const __be32 daddr, const u16 hnum,
>  				       const int dif, const int sdif);
>  
> +typedef u32 (*inet_ehashfn_t)(const struct net *net,
> +			      const __be32 laddr, const __u16 lport,
> +			      const __be32 faddr, const __be16 fport);
> +
> +u32 inet_ehashfn(const struct net *net,
> +		 const __be32 laddr, const __u16 lport,
> +		 const __be32 faddr, const __be16 fport);
> +
>  struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk,
>  				   struct sk_buff *skb, int doff,
>  				   __be32 saddr, __be16 sport,
> -				   __be32 daddr, unsigned short hnum);
> +				   __be32 daddr, unsigned short hnum,
> +				   inet_ehashfn_t ehashfn);
>  
>  static inline struct sock *
>  	inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo,
> @@ -453,10 +462,6 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
>  			     refcounted);
>  }
>  
> -u32 inet6_ehashfn(const struct net *net,
> -		  const struct in6_addr *laddr, const u16 lport,
> -		  const struct in6_addr *faddr, const __be16 fport);
> -
>  static inline void sk_daddr_set(struct sock *sk, __be32 addr)
>  {
>  	sk->sk_daddr = addr; /* alias of inet_daddr */
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index 91f9210d4e83..1ec895fd9905 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -28,9 +28,9 @@
>  #include <net/tcp.h>
>  #include <net/sock_reuseport.h>
>  
> -static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
> -			const __u16 lport, const __be32 faddr,
> -			const __be16 fport)
> +u32 inet_ehashfn(const struct net *net, const __be32 laddr,
> +		 const __u16 lport, const __be32 faddr,
> +		 const __be16 fport)
>  {
>  	static u32 inet_ehash_secret __read_mostly;
>  
> @@ -332,6 +332,10 @@ static inline int compute_score(struct sock *sk, struct net *net,
>  	return score;
>  }
>  
> +INDIRECT_CALLABLE_DECLARE(u32 udp_ehashfn(const struct net *,
> +					  const __be32, const __u16,
> +					  const __be32, const __be16));
> +
>  /**
>   * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
>   * @net: network namespace.
> @@ -342,6 +346,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
>   * @sport: source port.
>   * @daddr: destination address.
>   * @hnum: destination port in host byte order.
> + * @ehashfn: hash function used to generate the fallback hash.
>   *
>   * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
>   *         the selected sock or an error.
> @@ -349,13 +354,15 @@ static inline int compute_score(struct sock *sk, struct net *net,
>  struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk,
>  				   struct sk_buff *skb, int doff,
>  				   __be32 saddr, __be16 sport,
> -				   __be32 daddr, unsigned short hnum)
> +				   __be32 daddr, unsigned short hnum,
> +				   inet_ehashfn_t ehashfn)
>  {
>  	struct sock *reuse_sk = NULL;
>  	u32 phash;
>  
>  	if (sk->sk_reuseport) {
> -		phash = inet_ehashfn(net, daddr, hnum, saddr, sport);
> +		phash = INDIRECT_CALL_2(ehashfn, inet_ehashfn, udp_ehashfn,
> +					net, daddr, hnum, saddr, sport);
>  		reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
>  	}
>  	return reuse_sk;
> @@ -385,7 +392,7 @@ static struct sock *inet_lhash2_lookup(struct net *net,
>  		score = compute_score(sk, net, hnum, daddr, dif, sdif);
>  		if (score > hiscore) {
>  			result = inet_lookup_reuseport(net, sk, skb, doff,
> -						       saddr, sport, daddr, hnum);
> +						       saddr, sport, daddr, hnum, inet_ehashfn);
>  			if (result)
>  				return result;
>  
> @@ -414,7 +421,8 @@ static inline struct sock *inet_lookup_run_bpf(struct net *net,
>  	if (no_reuseport || IS_ERR_OR_NULL(sk))
>  		return sk;
>  
> -	reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum);
> +	reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum,
> +					 inet_ehashfn);
>  	if (reuse_sk)
>  		sk = reuse_sk;
>  	return sk;
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index fd3dae081f3a..10468fe144d0 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -405,9 +405,9 @@ static int compute_score(struct sock *sk, struct net *net,
>  	return score;
>  }
>  
> -static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
> -		       const __u16 lport, const __be32 faddr,
> -		       const __be16 fport)
> +INDIRECT_CALLABLE_SCOPE
> +u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
> +		const __be32 faddr, const __be16 fport)
>  {
>  	static u32 udp_ehash_secret __read_mostly;
>  
> @@ -417,22 +417,6 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
>  			      udp_ehash_secret + net_hash_mix(net));
>  }
>  
> -static struct sock *lookup_reuseport(struct net *net, struct sock *sk,
> -				     struct sk_buff *skb,
> -				     __be32 saddr, __be16 sport,
> -				     __be32 daddr, unsigned short hnum)
> -{
> -	struct sock *reuse_sk = NULL;
> -	u32 hash;
> -
> -	if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) {
> -		hash = udp_ehashfn(net, daddr, hnum, saddr, sport);
> -		reuse_sk = reuseport_select_sock(sk, hash, skb,
> -						 sizeof(struct udphdr));
> -	}
> -	return reuse_sk;
> -}
> -
>  /* called with rcu_read_lock() */
>  static struct sock *udp4_lib_lookup2(struct net *net,
>  				     __be32 saddr, __be16 sport,
> @@ -450,11 +434,13 @@ static struct sock *udp4_lib_lookup2(struct net *net,
>  		score = compute_score(sk, net, saddr, sport,
>  				      daddr, hnum, dif, sdif);
>  		if (score > badness) {
> -			result = lookup_reuseport(net, sk, skb,
> -						  saddr, sport, daddr, hnum);
> -			/* Fall back to scoring if group has connections */
> -			if (result && !reuseport_has_conns(sk))
> -				return result;
> +			if (sk->sk_state != TCP_ESTABLISHED) {
> +				result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
> +							       saddr, sport, daddr, hnum, udp_ehashfn);
> +				/* Fall back to scoring if group has connections */
> +				if (result && !reuseport_has_conns(sk))
> +					return result;

				result = result ? : sk;
> +			}

			else {
				result = sk;
			}

The assignment to result below is buggy.  Let's say SO_REUSEPROT group
have TCP_CLOSE and TCP_ESTABLISHED sockets.

  1. Find TCP_CLOSE sk and do SO_REUSEPORT lookup
  2. result is not NULL, but the group has TCP_ESTABLISHED sk
  3. result = result
  4. Find TCP_ESTABLISHED sk, which has a higher score
  5. result = result (TCP_CLOSE) <-- should be sk.

Same for v6 function.

>  
>  			result = result ? : sk;
>  			badness = score;
> @@ -480,7 +466,8 @@ static struct sock *udp4_lookup_run_bpf(struct net *net,
>  	if (no_reuseport || IS_ERR_OR_NULL(sk))
>  		return sk;
>  
> -	reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum);
> +	reuse_sk = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
> +					 saddr, sport, daddr, hnum, udp_ehashfn);
>  	if (reuse_sk)
>  		sk = reuse_sk;
>  	return sk;
> diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
> index 208998694ae3..a350ee40141c 100644
> --- a/net/ipv6/inet6_hashtables.c
> +++ b/net/ipv6/inet6_hashtables.c
> @@ -111,6 +111,10 @@ static inline int compute_score(struct sock *sk, struct net *net,
>  	return score;
>  }
>  
> +INDIRECT_CALLABLE_DECLARE(u32 udp6_ehashfn(const struct net *,
> +					   const struct in6_addr *, const u16,
> +					   const struct in6_addr *, const __be16));
> +
>  /**
>   * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary.
>   * @net: network namespace.
> @@ -121,6 +125,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
>   * @sport: source port.
>   * @daddr: destination address.
>   * @hnum: destination port in host byte order.
> + * @ehashfn: hash function used to generate the fallback hash.
>   *
>   * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
>   *         the selected sock or an error.
> @@ -130,13 +135,15 @@ struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk,
>  				    const struct in6_addr *saddr,
>  				    __be16 sport,
>  				    const struct in6_addr *daddr,
> -				    unsigned short hnum)
> +				    unsigned short hnum,
> +				    inet6_ehashfn_t ehashfn)
>  {
>  	struct sock *reuse_sk = NULL;
>  	u32 phash;
>  
>  	if (sk->sk_reuseport) {
> -		phash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
> +		phash = INDIRECT_CALL_2(ehashfn, inet6_ehashfn, udp6_ehashfn,
> +					net, daddr, hnum, saddr, sport);
>  		reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
>  	}
>  	return reuse_sk;
> @@ -159,7 +166,7 @@ static struct sock *inet6_lhash2_lookup(struct net *net,
>  		score = compute_score(sk, net, hnum, daddr, dif, sdif);
>  		if (score > hiscore) {
>  			result = inet6_lookup_reuseport(net, sk, skb, doff,
> -							saddr, sport, daddr, hnum);
> +							saddr, sport, daddr, hnum, inet6_ehashfn);
>  			if (result)
>  				return result;
>  
> @@ -190,7 +197,8 @@ static inline struct sock *inet6_lookup_run_bpf(struct net *net,
>  	if (no_reuseport || IS_ERR_OR_NULL(sk))
>  		return sk;
>  
> -	reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum);
> +	reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff,
> +					  saddr, sport, daddr, hnum, inet6_ehashfn);
>  	if (reuse_sk)
>  		sk = reuse_sk;
>  	return sk;
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index e5a337e6b970..2af3a595f38a 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -70,11 +70,12 @@ int udpv6_init_sock(struct sock *sk)
>  	return 0;
>  }
>  
> -static u32 udp6_ehashfn(const struct net *net,
> -			const struct in6_addr *laddr,
> -			const u16 lport,
> -			const struct in6_addr *faddr,
> -			const __be16 fport)
> +INDIRECT_CALLABLE_SCOPE
> +u32 udp6_ehashfn(const struct net *net,
> +		 const struct in6_addr *laddr,
> +		 const u16 lport,
> +		 const struct in6_addr *faddr,
> +		 const __be16 fport)
>  {
>  	static u32 udp6_ehash_secret __read_mostly;
>  	static u32 udp_ipv6_hash_secret __read_mostly;
> @@ -159,24 +160,6 @@ static int compute_score(struct sock *sk, struct net *net,
>  	return score;
>  }
>  
> -static struct sock *lookup_reuseport(struct net *net, struct sock *sk,
> -				     struct sk_buff *skb,
> -				     const struct in6_addr *saddr,
> -				     __be16 sport,
> -				     const struct in6_addr *daddr,
> -				     unsigned int hnum)
> -{
> -	struct sock *reuse_sk = NULL;
> -	u32 hash;
> -
> -	if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) {
> -		hash = udp6_ehashfn(net, daddr, hnum, saddr, sport);
> -		reuse_sk = reuseport_select_sock(sk, hash, skb,
> -						 sizeof(struct udphdr));
> -	}
> -	return reuse_sk;
> -}
> -
>  /* called with rcu_read_lock() */
>  static struct sock *udp6_lib_lookup2(struct net *net,
>  		const struct in6_addr *saddr, __be16 sport,
> @@ -193,11 +176,14 @@ static struct sock *udp6_lib_lookup2(struct net *net,
>  		score = compute_score(sk, net, saddr, sport,
>  				      daddr, hnum, dif, sdif);
>  		if (score > badness) {
> -			result = lookup_reuseport(net, sk, skb,
> -						  saddr, sport, daddr, hnum);
> -			/* Fall back to scoring if group has connections */
> -			if (result && !reuseport_has_conns(sk))
> -				return result;
> +			if (sk->sk_state != TCP_ESTABLISHED) {
> +				result = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
> +								saddr, sport, daddr, hnum,
> +								udp6_ehashfn);
> +				/* Fall back to scoring if group has connections */
> +				if (result && !reuseport_has_conns(sk))
> +					return result;
> +			}
>  
>  			result = result ? : sk;
>  			badness = score;
> @@ -225,7 +211,8 @@ static inline struct sock *udp6_lookup_run_bpf(struct net *net,
>  	if (no_reuseport || IS_ERR_OR_NULL(sk))
>  		return sk;
>  
> -	reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum);
> +	reuse_sk = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
> +					  saddr, sport, daddr, hnum, udp6_ehashfn);
>  	if (reuse_sk)
>  		sk = reuse_sk;
>  	return sk;
> 
> -- 
> 2.40.1
Lorenz Bauer June 14, 2023, 3:25 p.m. UTC | #4
On Tue, Jun 13, 2023 at 7:57 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
>
>                         else {
>                                 result = sk;
>                         }
>
> The assignment to result below is buggy.  Let's say SO_REUSEPROT group
> have TCP_CLOSE and TCP_ESTABLISHED sockets.

I'm not very familiar with SO_REUSEPORT, I assumed (incorrectly
probably) that such a group would only ever have TCP_CLOSE in UDP case
and TCP_LISTENING in TCP case. Can you explain how I could end up in
this situation?
Lorenz Bauer June 14, 2023, 3:42 p.m. UTC | #5
On Tue, Jun 13, 2023 at 4:33 PM Simon Horman <simon.horman@corigine.com> wrote:
> >
> > +INDIRECT_CALLABLE_DECLARE(u32 udp_ehashfn(const struct net *,
> > +                                       const __be32, const __u16,
> > +                                       const __be32, const __be16));
> > +
>
> Hi Lorenz,
>
> Would this be better placed in a header file?
> GCC complains that in udp.c this function is neither static nor
> has a prototype.

Hi Simon,

The problem is that I don't want to pull in udp.h in
inet_hashtables.c, but that is the natural place to define that
function. I was hoping the macro magic would solve the problem, but oh
well. How do you make gcc complain, and what is the full error
message?

Thanks
Lorenz
Kuniyuki Iwashima June 14, 2023, 4:52 p.m. UTC | #6
From: Lorenz Bauer <lmb@isovalent.com>
Date: Wed, 14 Jun 2023 16:25:05 +0100
> On Tue, Jun 13, 2023 at 7:57 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> >
> >                         else {
> >                                 result = sk;
> >                         }
> >
> > The assignment to result below is buggy.  Let's say SO_REUSEPROT group
> > have TCP_CLOSE and TCP_ESTABLISHED sockets.
> 
> I'm not very familiar with SO_REUSEPORT, I assumed (incorrectly
> probably) that such a group would only ever have TCP_CLOSE in UDP case
> and TCP_LISTENING in TCP case. Can you explain how I could end up in
> this situation?

When we call conenct() for UDP socket in SO_REUSEPORT group, the state
is changed from TCP_CLOSE to TCP_ESTABLISHED in __ip4_datagram_connect(),
and the socket remains in the group.

That's why we check TCP_ESTABLISHED in reuseport_select_sock_by_hash()
that is always false for TCP but true for UDP in the case above.
Simon Horman June 15, 2023, 7:21 a.m. UTC | #7
On Wed, Jun 14, 2023 at 04:42:45PM +0100, Lorenz Bauer wrote:
> On Tue, Jun 13, 2023 at 4:33 PM Simon Horman <simon.horman@corigine.com> wrote:
> > >
> > > +INDIRECT_CALLABLE_DECLARE(u32 udp_ehashfn(const struct net *,
> > > +                                       const __be32, const __u16,
> > > +                                       const __be32, const __be16));
> > > +
> >
> > Hi Lorenz,
> >
> > Would this be better placed in a header file?
> > GCC complains that in udp.c this function is neither static nor
> > has a prototype.
> 
> Hi Simon,
> 
> The problem is that I don't want to pull in udp.h in
> inet_hashtables.c, but that is the natural place to define that
> function. I was hoping the macro magic would solve the problem, but oh
> well. How do you make gcc complain, and what is the full error
> message?

Hi Lorenz,

sorry for the bother.

With gcc 12.3.0 [1] on x86_64 I see:

$ make allmodconfig
$ make W=1 net/ipv4/udp.o
net/ipv4/udp.c:410:5: error: no previous prototype for 'udp_ehashfn' [-Werror=missing-prototypes]
  410 | u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
      |     ^~~~~~~~~~~

[1] https://mirrors.edge.kernel.org/pub/tools/crosstool/
Lorenz Bauer June 20, 2023, 2:26 p.m. UTC | #8
On Tue, Jun 13, 2023 at 7:57 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
>
> The assignment to result below is buggy.  Let's say SO_REUSEPROT group
> have TCP_CLOSE and TCP_ESTABLISHED sockets.
>
>   1. Find TCP_CLOSE sk and do SO_REUSEPORT lookup
>   2. result is not NULL, but the group has TCP_ESTABLISHED sk
>   3. result = result
>   4. Find TCP_ESTABLISHED sk, which has a higher score
>   5. result = result (TCP_CLOSE) <-- should be sk.
>
> Same for v6 function.

Thanks for your explanation, I think I get it now. I misunderstood
that you were worried about returning TCP_ESTABLISHED instead of
TCP_CLOSE, but it's exactly the other way around.

I have a follow up question regarding the existing code:

    result = lookup_reuseport(net, sk, skb,
                    saddr, sport, daddr, hnum);
    /* Fall back to scoring if group has connections */
    if (result && !reuseport_has_conns(sk))
        return result;

    result = result ? : sk;
    badness = score;

Assuming that result != NULL but reuseport_has_conns() == true, we use
the reuseport socket as the result, but assign the score of sk to
badness. Shouldn't we use the score of the reuseport socket?

Thanks
Lorenz
Kuniyuki Iwashima June 20, 2023, 6:31 p.m. UTC | #9
From: Lorenz Bauer <lmb@isovalent.com>
Date: Tue, 20 Jun 2023 15:26:05 +0100
> On Tue, Jun 13, 2023 at 7:57 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> >
> > The assignment to result below is buggy.  Let's say SO_REUSEPROT group
> > have TCP_CLOSE and TCP_ESTABLISHED sockets.
> >
> >   1. Find TCP_CLOSE sk and do SO_REUSEPORT lookup
> >   2. result is not NULL, but the group has TCP_ESTABLISHED sk
> >   3. result = result
> >   4. Find TCP_ESTABLISHED sk, which has a higher score
> >   5. result = result (TCP_CLOSE) <-- should be sk.
> >
> > Same for v6 function.
> 
> Thanks for your explanation, I think I get it now. I misunderstood
> that you were worried about returning TCP_ESTABLISHED instead of
> TCP_CLOSE, but it's exactly the other way around.
> 
> I have a follow up question regarding the existing code:
> 
>     result = lookup_reuseport(net, sk, skb,
>                     saddr, sport, daddr, hnum);
>     /* Fall back to scoring if group has connections */
>     if (result && !reuseport_has_conns(sk))
>         return result;
> 
>     result = result ? : sk;
>     badness = score;
> 
> Assuming that result != NULL but reuseport_has_conns() == true, we use
> the reuseport socket as the result, but assign the score of sk to
> badness. Shouldn't we use the score of the reuseport socket?

Good point.  This is based on an assumption that all SO_REUSEPORT
sockets have the same score, which is wrong for two corner cases
if reuseport_has_conns() == true :

  1) SO_INCOMING_CPU is set
     -> selected sk might have +1 score

  2) BPF prog returns ESTABLISHED and/or SO_INCOMING_CPU sk
     -> selected sk will have more than 8

Using the old score could trigger more lookups depending on the
order that sockets are created.

  sk -> sk (SO_INCOMING_CPU) -> sk (ESTABLISHED)
  |     |
  `-> select the next SO_INCOMING_CPU sk
        |
	`-> select itself (We should save this lookup)

So, yes, we should update badness like

  if (unlikely(result)) {
      badness = compute_score(result, ...);
  } else {
      result = sk;
      badness = score;
  }
Lorenz Bauer June 21, 2023, 8:01 a.m. UTC | #10
On Tue, Jun 20, 2023 at 7:31 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
>
> Good point.  This is based on an assumption that all SO_REUSEPORT
> sockets have the same score, which is wrong for two corner cases
> if reuseport_has_conns() == true :
>
>   1) SO_INCOMING_CPU is set
>      -> selected sk might have +1 score
>
>   2) BPF prog returns ESTABLISHED and/or SO_INCOMING_CPU sk
>      -> selected sk will have more than 8
>
> Using the old score could trigger more lookups depending on the
> order that sockets are created.

So the result will still be correct, but it's less performant? Happy
to fix a perf regression, but if the result is incorrect this might
need a separate fix?

Best
Lorenz
Lorenz Bauer June 21, 2023, 1:49 p.m. UTC | #11
On Tue, Jun 20, 2023 at 7:31 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
>
> Good point.  This is based on an assumption that all SO_REUSEPORT
> sockets have the same score, which is wrong for two corner cases [...]

I did some more digging. I think this was introduced by commit
efc6b6f6c311 ("udp: Improve load balancing for SO_REUSEPORT.") which
unfortunately ran into a merge conflict. That resulted in Dave Miller
moving the bug around in commit a57066b1a019 ("Merge
git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net"). Can you
take a look and let me know if you think that is correct?

Best
Lorenz
Kuniyuki Iwashima June 21, 2023, 3 p.m. UTC | #12
From: Lorenz Bauer <lmb@isovalent.com>
Date: Wed, 21 Jun 2023 09:01:15 +0100
> On Tue, Jun 20, 2023 at 7:31 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> >
> > Good point.  This is based on an assumption that all SO_REUSEPORT
> > sockets have the same score, which is wrong for two corner cases
> > if reuseport_has_conns() == true :
> >
> >   1) SO_INCOMING_CPU is set
> >      -> selected sk might have +1 score
> >
> >   2) BPF prog returns ESTABLISHED and/or SO_INCOMING_CPU sk
> >      -> selected sk will have more than 8
> >
> > Using the old score could trigger more lookups depending on the
> > order that sockets are created.
> 
> So the result will still be correct, but it's less performant? Happy
> to fix a perf regression, but if the result is incorrect this might
> need a separate fix?

Right, the result is always correct.

If BPF prog selects a different socket per lookup, there is no
consistency, but it _is_ corret.


> I did some more digging. I think this was introduced by commit
> efc6b6f6c311 ("udp: Improve load balancing for SO_REUSEPORT.") which
> unfortunately ran into a merge conflict. That resulted in Dave Miller
> moving the bug around in commit a57066b1a019 ("Merge
> git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net"). Can you
> take a look and let me know if you think that is correct?

Yes, I should have updated the score too in efc6b6f6c311 to save
unneeded lookups.  The conflict itself was resolved properly.
diff mbox series

Patch

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 032ddab48f8f..49d586454287 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -48,12 +48,21 @@  struct sock *__inet6_lookup_established(struct net *net,
 					const u16 hnum, const int dif,
 					const int sdif);
 
+typedef u32 (*inet6_ehashfn_t)(const struct net *net,
+			       const struct in6_addr *laddr, const u16 lport,
+			       const struct in6_addr *faddr, const __be16 fport);
+
+u32 inet6_ehashfn(const struct net *net,
+		  const struct in6_addr *laddr, const u16 lport,
+		  const struct in6_addr *faddr, const __be16 fport);
+
 struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk,
 				    struct sk_buff *skb, int doff,
 				    const struct in6_addr *saddr,
 				    __be16 sport,
 				    const struct in6_addr *daddr,
-				    unsigned short hnum);
+				    unsigned short hnum,
+				    inet6_ehashfn_t ehashfn);
 
 struct sock *inet6_lookup_listener(struct net *net,
 				   struct inet_hashinfo *hashinfo,
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 8734f3488f5d..51ab6a1a3601 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -379,10 +379,19 @@  struct sock *__inet_lookup_established(struct net *net,
 				       const __be32 daddr, const u16 hnum,
 				       const int dif, const int sdif);
 
+typedef u32 (*inet_ehashfn_t)(const struct net *net,
+			      const __be32 laddr, const __u16 lport,
+			      const __be32 faddr, const __be16 fport);
+
+u32 inet_ehashfn(const struct net *net,
+		 const __be32 laddr, const __u16 lport,
+		 const __be32 faddr, const __be16 fport);
+
 struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk,
 				   struct sk_buff *skb, int doff,
 				   __be32 saddr, __be16 sport,
-				   __be32 daddr, unsigned short hnum);
+				   __be32 daddr, unsigned short hnum,
+				   inet_ehashfn_t ehashfn);
 
 static inline struct sock *
 	inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo,
@@ -453,10 +462,6 @@  static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
 			     refcounted);
 }
 
-u32 inet6_ehashfn(const struct net *net,
-		  const struct in6_addr *laddr, const u16 lport,
-		  const struct in6_addr *faddr, const __be16 fport);
-
 static inline void sk_daddr_set(struct sock *sk, __be32 addr)
 {
 	sk->sk_daddr = addr; /* alias of inet_daddr */
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 91f9210d4e83..1ec895fd9905 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -28,9 +28,9 @@ 
 #include <net/tcp.h>
 #include <net/sock_reuseport.h>
 
-static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
-			const __u16 lport, const __be32 faddr,
-			const __be16 fport)
+u32 inet_ehashfn(const struct net *net, const __be32 laddr,
+		 const __u16 lport, const __be32 faddr,
+		 const __be16 fport)
 {
 	static u32 inet_ehash_secret __read_mostly;
 
@@ -332,6 +332,10 @@  static inline int compute_score(struct sock *sk, struct net *net,
 	return score;
 }
 
+INDIRECT_CALLABLE_DECLARE(u32 udp_ehashfn(const struct net *,
+					  const __be32, const __u16,
+					  const __be32, const __be16));
+
 /**
  * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
  * @net: network namespace.
@@ -342,6 +346,7 @@  static inline int compute_score(struct sock *sk, struct net *net,
  * @sport: source port.
  * @daddr: destination address.
  * @hnum: destination port in host byte order.
+ * @ehashfn: hash function used to generate the fallback hash.
  *
  * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
  *         the selected sock or an error.
@@ -349,13 +354,15 @@  static inline int compute_score(struct sock *sk, struct net *net,
 struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk,
 				   struct sk_buff *skb, int doff,
 				   __be32 saddr, __be16 sport,
-				   __be32 daddr, unsigned short hnum)
+				   __be32 daddr, unsigned short hnum,
+				   inet_ehashfn_t ehashfn)
 {
 	struct sock *reuse_sk = NULL;
 	u32 phash;
 
 	if (sk->sk_reuseport) {
-		phash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+		phash = INDIRECT_CALL_2(ehashfn, inet_ehashfn, udp_ehashfn,
+					net, daddr, hnum, saddr, sport);
 		reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
 	}
 	return reuse_sk;
@@ -385,7 +392,7 @@  static struct sock *inet_lhash2_lookup(struct net *net,
 		score = compute_score(sk, net, hnum, daddr, dif, sdif);
 		if (score > hiscore) {
 			result = inet_lookup_reuseport(net, sk, skb, doff,
-						       saddr, sport, daddr, hnum);
+						       saddr, sport, daddr, hnum, inet_ehashfn);
 			if (result)
 				return result;
 
@@ -414,7 +421,8 @@  static inline struct sock *inet_lookup_run_bpf(struct net *net,
 	if (no_reuseport || IS_ERR_OR_NULL(sk))
 		return sk;
 
-	reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum);
+	reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum,
+					 inet_ehashfn);
 	if (reuse_sk)
 		sk = reuse_sk;
 	return sk;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fd3dae081f3a..10468fe144d0 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -405,9 +405,9 @@  static int compute_score(struct sock *sk, struct net *net,
 	return score;
 }
 
-static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
-		       const __u16 lport, const __be32 faddr,
-		       const __be16 fport)
+INDIRECT_CALLABLE_SCOPE
+u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
+		const __be32 faddr, const __be16 fport)
 {
 	static u32 udp_ehash_secret __read_mostly;
 
@@ -417,22 +417,6 @@  static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
 			      udp_ehash_secret + net_hash_mix(net));
 }
 
-static struct sock *lookup_reuseport(struct net *net, struct sock *sk,
-				     struct sk_buff *skb,
-				     __be32 saddr, __be16 sport,
-				     __be32 daddr, unsigned short hnum)
-{
-	struct sock *reuse_sk = NULL;
-	u32 hash;
-
-	if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) {
-		hash = udp_ehashfn(net, daddr, hnum, saddr, sport);
-		reuse_sk = reuseport_select_sock(sk, hash, skb,
-						 sizeof(struct udphdr));
-	}
-	return reuse_sk;
-}
-
 /* called with rcu_read_lock() */
 static struct sock *udp4_lib_lookup2(struct net *net,
 				     __be32 saddr, __be16 sport,
@@ -450,11 +434,13 @@  static struct sock *udp4_lib_lookup2(struct net *net,
 		score = compute_score(sk, net, saddr, sport,
 				      daddr, hnum, dif, sdif);
 		if (score > badness) {
-			result = lookup_reuseport(net, sk, skb,
-						  saddr, sport, daddr, hnum);
-			/* Fall back to scoring if group has connections */
-			if (result && !reuseport_has_conns(sk))
-				return result;
+			if (sk->sk_state != TCP_ESTABLISHED) {
+				result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
+							       saddr, sport, daddr, hnum, udp_ehashfn);
+				/* Fall back to scoring if group has connections */
+				if (result && !reuseport_has_conns(sk))
+					return result;
+			}
 
 			result = result ? : sk;
 			badness = score;
@@ -480,7 +466,8 @@  static struct sock *udp4_lookup_run_bpf(struct net *net,
 	if (no_reuseport || IS_ERR_OR_NULL(sk))
 		return sk;
 
-	reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum);
+	reuse_sk = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
+					 saddr, sport, daddr, hnum, udp_ehashfn);
 	if (reuse_sk)
 		sk = reuse_sk;
 	return sk;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 208998694ae3..a350ee40141c 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -111,6 +111,10 @@  static inline int compute_score(struct sock *sk, struct net *net,
 	return score;
 }
 
+INDIRECT_CALLABLE_DECLARE(u32 udp6_ehashfn(const struct net *,
+					   const struct in6_addr *, const u16,
+					   const struct in6_addr *, const __be16));
+
 /**
  * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary.
  * @net: network namespace.
@@ -121,6 +125,7 @@  static inline int compute_score(struct sock *sk, struct net *net,
  * @sport: source port.
  * @daddr: destination address.
  * @hnum: destination port in host byte order.
+ * @ehashfn: hash function used to generate the fallback hash.
  *
  * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
  *         the selected sock or an error.
@@ -130,13 +135,15 @@  struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk,
 				    const struct in6_addr *saddr,
 				    __be16 sport,
 				    const struct in6_addr *daddr,
-				    unsigned short hnum)
+				    unsigned short hnum,
+				    inet6_ehashfn_t ehashfn)
 {
 	struct sock *reuse_sk = NULL;
 	u32 phash;
 
 	if (sk->sk_reuseport) {
-		phash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
+		phash = INDIRECT_CALL_2(ehashfn, inet6_ehashfn, udp6_ehashfn,
+					net, daddr, hnum, saddr, sport);
 		reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
 	}
 	return reuse_sk;
@@ -159,7 +166,7 @@  static struct sock *inet6_lhash2_lookup(struct net *net,
 		score = compute_score(sk, net, hnum, daddr, dif, sdif);
 		if (score > hiscore) {
 			result = inet6_lookup_reuseport(net, sk, skb, doff,
-							saddr, sport, daddr, hnum);
+							saddr, sport, daddr, hnum, inet6_ehashfn);
 			if (result)
 				return result;
 
@@ -190,7 +197,8 @@  static inline struct sock *inet6_lookup_run_bpf(struct net *net,
 	if (no_reuseport || IS_ERR_OR_NULL(sk))
 		return sk;
 
-	reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum);
+	reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff,
+					  saddr, sport, daddr, hnum, inet6_ehashfn);
 	if (reuse_sk)
 		sk = reuse_sk;
 	return sk;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e5a337e6b970..2af3a595f38a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -70,11 +70,12 @@  int udpv6_init_sock(struct sock *sk)
 	return 0;
 }
 
-static u32 udp6_ehashfn(const struct net *net,
-			const struct in6_addr *laddr,
-			const u16 lport,
-			const struct in6_addr *faddr,
-			const __be16 fport)
+INDIRECT_CALLABLE_SCOPE
+u32 udp6_ehashfn(const struct net *net,
+		 const struct in6_addr *laddr,
+		 const u16 lport,
+		 const struct in6_addr *faddr,
+		 const __be16 fport)
 {
 	static u32 udp6_ehash_secret __read_mostly;
 	static u32 udp_ipv6_hash_secret __read_mostly;
@@ -159,24 +160,6 @@  static int compute_score(struct sock *sk, struct net *net,
 	return score;
 }
 
-static struct sock *lookup_reuseport(struct net *net, struct sock *sk,
-				     struct sk_buff *skb,
-				     const struct in6_addr *saddr,
-				     __be16 sport,
-				     const struct in6_addr *daddr,
-				     unsigned int hnum)
-{
-	struct sock *reuse_sk = NULL;
-	u32 hash;
-
-	if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) {
-		hash = udp6_ehashfn(net, daddr, hnum, saddr, sport);
-		reuse_sk = reuseport_select_sock(sk, hash, skb,
-						 sizeof(struct udphdr));
-	}
-	return reuse_sk;
-}
-
 /* called with rcu_read_lock() */
 static struct sock *udp6_lib_lookup2(struct net *net,
 		const struct in6_addr *saddr, __be16 sport,
@@ -193,11 +176,14 @@  static struct sock *udp6_lib_lookup2(struct net *net,
 		score = compute_score(sk, net, saddr, sport,
 				      daddr, hnum, dif, sdif);
 		if (score > badness) {
-			result = lookup_reuseport(net, sk, skb,
-						  saddr, sport, daddr, hnum);
-			/* Fall back to scoring if group has connections */
-			if (result && !reuseport_has_conns(sk))
-				return result;
+			if (sk->sk_state != TCP_ESTABLISHED) {
+				result = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
+								saddr, sport, daddr, hnum,
+								udp6_ehashfn);
+				/* Fall back to scoring if group has connections */
+				if (result && !reuseport_has_conns(sk))
+					return result;
+			}
 
 			result = result ? : sk;
 			badness = score;
@@ -225,7 +211,8 @@  static inline struct sock *udp6_lookup_run_bpf(struct net *net,
 	if (no_reuseport || IS_ERR_OR_NULL(sk))
 		return sk;
 
-	reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum);
+	reuse_sk = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
+					  saddr, sport, daddr, hnum, udp6_ehashfn);
 	if (reuse_sk)
 		sk = reuse_sk;
 	return sk;