Message ID | 20230613-so-reuseport-v2-3-b7c69a342613@isovalent.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Add SO_REUSEPORT support for TC bpf_sk_assign | expand |
On Tue, Jun 13, 2023 at 11:14:58AM +0100, Lorenz Bauer wrote: ... > @@ -332,6 +332,10 @@ static inline int compute_score(struct sock *sk, struct net *net, > return score; > } > > +INDIRECT_CALLABLE_DECLARE(u32 udp_ehashfn(const struct net *, > + const __be32, const __u16, > + const __be32, const __be16)); > + Hi Lorenz, Would this be better placed in a header file? GCC complains that in udp.c this function is neither static nor has a prototype. Liksewise for udp6_ehashfn() ...
Hi Lorenz, kernel test robot noticed the following build warnings: [auto build test WARNING on 25085b4e9251c77758964a8e8651338972353642] url: https://github.com/intel-lab-lkp/linux/commits/Lorenz-Bauer/net-export-inet_lookup_reuseport-and-inet6_lookup_reuseport/20230613-181619 base: 25085b4e9251c77758964a8e8651338972353642 patch link: https://lore.kernel.org/r/20230613-so-reuseport-v2-3-b7c69a342613%40isovalent.com patch subject: [PATCH bpf-next v2 3/6] net: remove duplicate reuseport_lookup functions config: i386-defconfig (https://download.01.org/0day-ci/archive/20230614/202306140138.DnwjedJ1-lkp@intel.com/config) compiler: gcc-12 (Debian 12.2.0-14) 12.2.0 reproduce (this is a W=1 build): git checkout 25085b4e9251c77758964a8e8651338972353642 b4 shazam https://lore.kernel.org/r/20230613-so-reuseport-v2-3-b7c69a342613@isovalent.com # save the config file mkdir build_dir && cp config build_dir/.config make W=1 O=build_dir ARCH=i386 olddefconfig make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash net/ipv4/ net/ipv6/ If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202306140138.DnwjedJ1-lkp@intel.com/ All warnings (new ones prefixed by >>): >> net/ipv4/udp.c:409:5: warning: no previous prototype for 'udp_ehashfn' [-Wmissing-prototypes] 409 | u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, | ^~~~~~~~~~~ -- >> net/ipv6/udp.c:74:5: warning: no previous prototype for 'udp6_ehashfn' [-Wmissing-prototypes] 74 | u32 udp6_ehashfn(const struct net *net, | ^~~~~~~~~~~~ vim +/udp_ehashfn +409 net/ipv4/udp.c 407 408 INDIRECT_CALLABLE_SCOPE > 409 u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, 410 const __be32 faddr, const __be16 fport) 411 { 412 static u32 udp_ehash_secret __read_mostly; 413 414 net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret)); 415 416 return __inet_ehashfn(laddr, lport, faddr, fport, 417 udp_ehash_secret + net_hash_mix(net)); 418 } 419
From: Lorenz Bauer <lmb@isovalent.com> Date: Tue, 13 Jun 2023 11:14:58 +0100 > There are currently four copies of reuseport_lookup: one each for > (TCP, UDP)x(IPv4, IPv6). This forces us to duplicate all callers of > those functions as well. This is already the case for sk_lookup > helpers (inet,inet6,udp4,udp6)_lookup_run_bpf. > > The only difference between the reuseport_lookup helpers is calling > a different hash function. Cut down the number of reuseport_lookup > functions to one per IP version by using the INDIRECT_CALL > infrastructure. > > Signed-off-by: Lorenz Bauer <lmb@isovalent.com> > --- > include/net/inet6_hashtables.h | 11 ++++++++++- > include/net/inet_hashtables.h | 15 +++++++++----- > net/ipv4/inet_hashtables.c | 22 ++++++++++++++------- > net/ipv4/udp.c | 37 +++++++++++----------------------- > net/ipv6/inet6_hashtables.c | 16 +++++++++++---- > net/ipv6/udp.c | 45 +++++++++++++++--------------------------- > 6 files changed, 75 insertions(+), 71 deletions(-) > > diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h > index 032ddab48f8f..49d586454287 100644 > --- a/include/net/inet6_hashtables.h > +++ b/include/net/inet6_hashtables.h > @@ -48,12 +48,21 @@ struct sock *__inet6_lookup_established(struct net *net, > const u16 hnum, const int dif, > const int sdif); > > +typedef u32 (*inet6_ehashfn_t)(const struct net *net, > + const struct in6_addr *laddr, const u16 lport, > + const struct in6_addr *faddr, const __be16 fport); > + > +u32 inet6_ehashfn(const struct net *net, > + const struct in6_addr *laddr, const u16 lport, > + const struct in6_addr *faddr, const __be16 fport); > + > struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, > struct sk_buff *skb, int doff, > const struct in6_addr *saddr, > __be16 sport, > const struct in6_addr *daddr, > - unsigned short hnum); > + unsigned short hnum, > + inet6_ehashfn_t ehashfn); > > struct sock *inet6_lookup_listener(struct net *net, > struct inet_hashinfo *hashinfo, > diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h > index 8734f3488f5d..51ab6a1a3601 100644 > --- a/include/net/inet_hashtables.h > +++ b/include/net/inet_hashtables.h > @@ -379,10 +379,19 @@ struct sock *__inet_lookup_established(struct net *net, > const __be32 daddr, const u16 hnum, > const int dif, const int sdif); > > +typedef u32 (*inet_ehashfn_t)(const struct net *net, > + const __be32 laddr, const __u16 lport, > + const __be32 faddr, const __be16 fport); > + > +u32 inet_ehashfn(const struct net *net, > + const __be32 laddr, const __u16 lport, > + const __be32 faddr, const __be16 fport); > + > struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, > struct sk_buff *skb, int doff, > __be32 saddr, __be16 sport, > - __be32 daddr, unsigned short hnum); > + __be32 daddr, unsigned short hnum, > + inet_ehashfn_t ehashfn); > > static inline struct sock * > inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, > @@ -453,10 +462,6 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, > refcounted); > } > > -u32 inet6_ehashfn(const struct net *net, > - const struct in6_addr *laddr, const u16 lport, > - const struct in6_addr *faddr, const __be16 fport); > - > static inline void sk_daddr_set(struct sock *sk, __be32 addr) > { > sk->sk_daddr = addr; /* alias of inet_daddr */ > diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c > index 91f9210d4e83..1ec895fd9905 100644 > --- a/net/ipv4/inet_hashtables.c > +++ b/net/ipv4/inet_hashtables.c > @@ -28,9 +28,9 @@ > #include <net/tcp.h> > #include <net/sock_reuseport.h> > > -static u32 inet_ehashfn(const struct net *net, const __be32 laddr, > - const __u16 lport, const __be32 faddr, > - const __be16 fport) > +u32 inet_ehashfn(const struct net *net, const __be32 laddr, > + const __u16 lport, const __be32 faddr, > + const __be16 fport) > { > static u32 inet_ehash_secret __read_mostly; > > @@ -332,6 +332,10 @@ static inline int compute_score(struct sock *sk, struct net *net, > return score; > } > > +INDIRECT_CALLABLE_DECLARE(u32 udp_ehashfn(const struct net *, > + const __be32, const __u16, > + const __be32, const __be16)); > + > /** > * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. > * @net: network namespace. > @@ -342,6 +346,7 @@ static inline int compute_score(struct sock *sk, struct net *net, > * @sport: source port. > * @daddr: destination address. > * @hnum: destination port in host byte order. > + * @ehashfn: hash function used to generate the fallback hash. > * > * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to > * the selected sock or an error. > @@ -349,13 +354,15 @@ static inline int compute_score(struct sock *sk, struct net *net, > struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, > struct sk_buff *skb, int doff, > __be32 saddr, __be16 sport, > - __be32 daddr, unsigned short hnum) > + __be32 daddr, unsigned short hnum, > + inet_ehashfn_t ehashfn) > { > struct sock *reuse_sk = NULL; > u32 phash; > > if (sk->sk_reuseport) { > - phash = inet_ehashfn(net, daddr, hnum, saddr, sport); > + phash = INDIRECT_CALL_2(ehashfn, inet_ehashfn, udp_ehashfn, > + net, daddr, hnum, saddr, sport); > reuse_sk = reuseport_select_sock(sk, phash, skb, doff); > } > return reuse_sk; > @@ -385,7 +392,7 @@ static struct sock *inet_lhash2_lookup(struct net *net, > score = compute_score(sk, net, hnum, daddr, dif, sdif); > if (score > hiscore) { > result = inet_lookup_reuseport(net, sk, skb, doff, > - saddr, sport, daddr, hnum); > + saddr, sport, daddr, hnum, inet_ehashfn); > if (result) > return result; > > @@ -414,7 +421,8 @@ static inline struct sock *inet_lookup_run_bpf(struct net *net, > if (no_reuseport || IS_ERR_OR_NULL(sk)) > return sk; > > - reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); > + reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, > + inet_ehashfn); > if (reuse_sk) > sk = reuse_sk; > return sk; > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c > index fd3dae081f3a..10468fe144d0 100644 > --- a/net/ipv4/udp.c > +++ b/net/ipv4/udp.c > @@ -405,9 +405,9 @@ static int compute_score(struct sock *sk, struct net *net, > return score; > } > > -static u32 udp_ehashfn(const struct net *net, const __be32 laddr, > - const __u16 lport, const __be32 faddr, > - const __be16 fport) > +INDIRECT_CALLABLE_SCOPE > +u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, > + const __be32 faddr, const __be16 fport) > { > static u32 udp_ehash_secret __read_mostly; > > @@ -417,22 +417,6 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, > udp_ehash_secret + net_hash_mix(net)); > } > > -static struct sock *lookup_reuseport(struct net *net, struct sock *sk, > - struct sk_buff *skb, > - __be32 saddr, __be16 sport, > - __be32 daddr, unsigned short hnum) > -{ > - struct sock *reuse_sk = NULL; > - u32 hash; > - > - if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { > - hash = udp_ehashfn(net, daddr, hnum, saddr, sport); > - reuse_sk = reuseport_select_sock(sk, hash, skb, > - sizeof(struct udphdr)); > - } > - return reuse_sk; > -} > - > /* called with rcu_read_lock() */ > static struct sock *udp4_lib_lookup2(struct net *net, > __be32 saddr, __be16 sport, > @@ -450,11 +434,13 @@ static struct sock *udp4_lib_lookup2(struct net *net, > score = compute_score(sk, net, saddr, sport, > daddr, hnum, dif, sdif); > if (score > badness) { > - result = lookup_reuseport(net, sk, skb, > - saddr, sport, daddr, hnum); > - /* Fall back to scoring if group has connections */ > - if (result && !reuseport_has_conns(sk)) > - return result; > + if (sk->sk_state != TCP_ESTABLISHED) { > + result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), > + saddr, sport, daddr, hnum, udp_ehashfn); > + /* Fall back to scoring if group has connections */ > + if (result && !reuseport_has_conns(sk)) > + return result; result = result ? : sk; > + } else { result = sk; } The assignment to result below is buggy. Let's say SO_REUSEPROT group have TCP_CLOSE and TCP_ESTABLISHED sockets. 1. Find TCP_CLOSE sk and do SO_REUSEPORT lookup 2. result is not NULL, but the group has TCP_ESTABLISHED sk 3. result = result 4. Find TCP_ESTABLISHED sk, which has a higher score 5. result = result (TCP_CLOSE) <-- should be sk. Same for v6 function. > > result = result ? : sk; > badness = score; > @@ -480,7 +466,8 @@ static struct sock *udp4_lookup_run_bpf(struct net *net, > if (no_reuseport || IS_ERR_OR_NULL(sk)) > return sk; > > - reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); > + reuse_sk = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), > + saddr, sport, daddr, hnum, udp_ehashfn); > if (reuse_sk) > sk = reuse_sk; > return sk; > diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c > index 208998694ae3..a350ee40141c 100644 > --- a/net/ipv6/inet6_hashtables.c > +++ b/net/ipv6/inet6_hashtables.c > @@ -111,6 +111,10 @@ static inline int compute_score(struct sock *sk, struct net *net, > return score; > } > > +INDIRECT_CALLABLE_DECLARE(u32 udp6_ehashfn(const struct net *, > + const struct in6_addr *, const u16, > + const struct in6_addr *, const __be16)); > + > /** > * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary. > * @net: network namespace. > @@ -121,6 +125,7 @@ static inline int compute_score(struct sock *sk, struct net *net, > * @sport: source port. > * @daddr: destination address. > * @hnum: destination port in host byte order. > + * @ehashfn: hash function used to generate the fallback hash. > * > * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to > * the selected sock or an error. > @@ -130,13 +135,15 @@ struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, > const struct in6_addr *saddr, > __be16 sport, > const struct in6_addr *daddr, > - unsigned short hnum) > + unsigned short hnum, > + inet6_ehashfn_t ehashfn) > { > struct sock *reuse_sk = NULL; > u32 phash; > > if (sk->sk_reuseport) { > - phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); > + phash = INDIRECT_CALL_2(ehashfn, inet6_ehashfn, udp6_ehashfn, > + net, daddr, hnum, saddr, sport); > reuse_sk = reuseport_select_sock(sk, phash, skb, doff); > } > return reuse_sk; > @@ -159,7 +166,7 @@ static struct sock *inet6_lhash2_lookup(struct net *net, > score = compute_score(sk, net, hnum, daddr, dif, sdif); > if (score > hiscore) { > result = inet6_lookup_reuseport(net, sk, skb, doff, > - saddr, sport, daddr, hnum); > + saddr, sport, daddr, hnum, inet6_ehashfn); > if (result) > return result; > > @@ -190,7 +197,8 @@ static inline struct sock *inet6_lookup_run_bpf(struct net *net, > if (no_reuseport || IS_ERR_OR_NULL(sk)) > return sk; > > - reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); > + reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, > + saddr, sport, daddr, hnum, inet6_ehashfn); > if (reuse_sk) > sk = reuse_sk; > return sk; > diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c > index e5a337e6b970..2af3a595f38a 100644 > --- a/net/ipv6/udp.c > +++ b/net/ipv6/udp.c > @@ -70,11 +70,12 @@ int udpv6_init_sock(struct sock *sk) > return 0; > } > > -static u32 udp6_ehashfn(const struct net *net, > - const struct in6_addr *laddr, > - const u16 lport, > - const struct in6_addr *faddr, > - const __be16 fport) > +INDIRECT_CALLABLE_SCOPE > +u32 udp6_ehashfn(const struct net *net, > + const struct in6_addr *laddr, > + const u16 lport, > + const struct in6_addr *faddr, > + const __be16 fport) > { > static u32 udp6_ehash_secret __read_mostly; > static u32 udp_ipv6_hash_secret __read_mostly; > @@ -159,24 +160,6 @@ static int compute_score(struct sock *sk, struct net *net, > return score; > } > > -static struct sock *lookup_reuseport(struct net *net, struct sock *sk, > - struct sk_buff *skb, > - const struct in6_addr *saddr, > - __be16 sport, > - const struct in6_addr *daddr, > - unsigned int hnum) > -{ > - struct sock *reuse_sk = NULL; > - u32 hash; > - > - if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { > - hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); > - reuse_sk = reuseport_select_sock(sk, hash, skb, > - sizeof(struct udphdr)); > - } > - return reuse_sk; > -} > - > /* called with rcu_read_lock() */ > static struct sock *udp6_lib_lookup2(struct net *net, > const struct in6_addr *saddr, __be16 sport, > @@ -193,11 +176,14 @@ static struct sock *udp6_lib_lookup2(struct net *net, > score = compute_score(sk, net, saddr, sport, > daddr, hnum, dif, sdif); > if (score > badness) { > - result = lookup_reuseport(net, sk, skb, > - saddr, sport, daddr, hnum); > - /* Fall back to scoring if group has connections */ > - if (result && !reuseport_has_conns(sk)) > - return result; > + if (sk->sk_state != TCP_ESTABLISHED) { > + result = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), > + saddr, sport, daddr, hnum, > + udp6_ehashfn); > + /* Fall back to scoring if group has connections */ > + if (result && !reuseport_has_conns(sk)) > + return result; > + } > > result = result ? : sk; > badness = score; > @@ -225,7 +211,8 @@ static inline struct sock *udp6_lookup_run_bpf(struct net *net, > if (no_reuseport || IS_ERR_OR_NULL(sk)) > return sk; > > - reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); > + reuse_sk = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), > + saddr, sport, daddr, hnum, udp6_ehashfn); > if (reuse_sk) > sk = reuse_sk; > return sk; > > -- > 2.40.1
On Tue, Jun 13, 2023 at 7:57 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote: > > else { > result = sk; > } > > The assignment to result below is buggy. Let's say SO_REUSEPROT group > have TCP_CLOSE and TCP_ESTABLISHED sockets. I'm not very familiar with SO_REUSEPORT, I assumed (incorrectly probably) that such a group would only ever have TCP_CLOSE in UDP case and TCP_LISTENING in TCP case. Can you explain how I could end up in this situation?
On Tue, Jun 13, 2023 at 4:33 PM Simon Horman <simon.horman@corigine.com> wrote: > > > > +INDIRECT_CALLABLE_DECLARE(u32 udp_ehashfn(const struct net *, > > + const __be32, const __u16, > > + const __be32, const __be16)); > > + > > Hi Lorenz, > > Would this be better placed in a header file? > GCC complains that in udp.c this function is neither static nor > has a prototype. Hi Simon, The problem is that I don't want to pull in udp.h in inet_hashtables.c, but that is the natural place to define that function. I was hoping the macro magic would solve the problem, but oh well. How do you make gcc complain, and what is the full error message? Thanks Lorenz
From: Lorenz Bauer <lmb@isovalent.com> Date: Wed, 14 Jun 2023 16:25:05 +0100 > On Tue, Jun 13, 2023 at 7:57 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote: > > > > else { > > result = sk; > > } > > > > The assignment to result below is buggy. Let's say SO_REUSEPROT group > > have TCP_CLOSE and TCP_ESTABLISHED sockets. > > I'm not very familiar with SO_REUSEPORT, I assumed (incorrectly > probably) that such a group would only ever have TCP_CLOSE in UDP case > and TCP_LISTENING in TCP case. Can you explain how I could end up in > this situation? When we call conenct() for UDP socket in SO_REUSEPORT group, the state is changed from TCP_CLOSE to TCP_ESTABLISHED in __ip4_datagram_connect(), and the socket remains in the group. That's why we check TCP_ESTABLISHED in reuseport_select_sock_by_hash() that is always false for TCP but true for UDP in the case above.
On Wed, Jun 14, 2023 at 04:42:45PM +0100, Lorenz Bauer wrote: > On Tue, Jun 13, 2023 at 4:33 PM Simon Horman <simon.horman@corigine.com> wrote: > > > > > > +INDIRECT_CALLABLE_DECLARE(u32 udp_ehashfn(const struct net *, > > > + const __be32, const __u16, > > > + const __be32, const __be16)); > > > + > > > > Hi Lorenz, > > > > Would this be better placed in a header file? > > GCC complains that in udp.c this function is neither static nor > > has a prototype. > > Hi Simon, > > The problem is that I don't want to pull in udp.h in > inet_hashtables.c, but that is the natural place to define that > function. I was hoping the macro magic would solve the problem, but oh > well. How do you make gcc complain, and what is the full error > message? Hi Lorenz, sorry for the bother. With gcc 12.3.0 [1] on x86_64 I see: $ make allmodconfig $ make W=1 net/ipv4/udp.o net/ipv4/udp.c:410:5: error: no previous prototype for 'udp_ehashfn' [-Werror=missing-prototypes] 410 | u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, | ^~~~~~~~~~~ [1] https://mirrors.edge.kernel.org/pub/tools/crosstool/
On Tue, Jun 13, 2023 at 7:57 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote: > > The assignment to result below is buggy. Let's say SO_REUSEPROT group > have TCP_CLOSE and TCP_ESTABLISHED sockets. > > 1. Find TCP_CLOSE sk and do SO_REUSEPORT lookup > 2. result is not NULL, but the group has TCP_ESTABLISHED sk > 3. result = result > 4. Find TCP_ESTABLISHED sk, which has a higher score > 5. result = result (TCP_CLOSE) <-- should be sk. > > Same for v6 function. Thanks for your explanation, I think I get it now. I misunderstood that you were worried about returning TCP_ESTABLISHED instead of TCP_CLOSE, but it's exactly the other way around. I have a follow up question regarding the existing code: result = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); /* Fall back to scoring if group has connections */ if (result && !reuseport_has_conns(sk)) return result; result = result ? : sk; badness = score; Assuming that result != NULL but reuseport_has_conns() == true, we use the reuseport socket as the result, but assign the score of sk to badness. Shouldn't we use the score of the reuseport socket? Thanks Lorenz
From: Lorenz Bauer <lmb@isovalent.com> Date: Tue, 20 Jun 2023 15:26:05 +0100 > On Tue, Jun 13, 2023 at 7:57 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote: > > > > The assignment to result below is buggy. Let's say SO_REUSEPROT group > > have TCP_CLOSE and TCP_ESTABLISHED sockets. > > > > 1. Find TCP_CLOSE sk and do SO_REUSEPORT lookup > > 2. result is not NULL, but the group has TCP_ESTABLISHED sk > > 3. result = result > > 4. Find TCP_ESTABLISHED sk, which has a higher score > > 5. result = result (TCP_CLOSE) <-- should be sk. > > > > Same for v6 function. > > Thanks for your explanation, I think I get it now. I misunderstood > that you were worried about returning TCP_ESTABLISHED instead of > TCP_CLOSE, but it's exactly the other way around. > > I have a follow up question regarding the existing code: > > result = lookup_reuseport(net, sk, skb, > saddr, sport, daddr, hnum); > /* Fall back to scoring if group has connections */ > if (result && !reuseport_has_conns(sk)) > return result; > > result = result ? : sk; > badness = score; > > Assuming that result != NULL but reuseport_has_conns() == true, we use > the reuseport socket as the result, but assign the score of sk to > badness. Shouldn't we use the score of the reuseport socket? Good point. This is based on an assumption that all SO_REUSEPORT sockets have the same score, which is wrong for two corner cases if reuseport_has_conns() == true : 1) SO_INCOMING_CPU is set -> selected sk might have +1 score 2) BPF prog returns ESTABLISHED and/or SO_INCOMING_CPU sk -> selected sk will have more than 8 Using the old score could trigger more lookups depending on the order that sockets are created. sk -> sk (SO_INCOMING_CPU) -> sk (ESTABLISHED) | | `-> select the next SO_INCOMING_CPU sk | `-> select itself (We should save this lookup) So, yes, we should update badness like if (unlikely(result)) { badness = compute_score(result, ...); } else { result = sk; badness = score; }
On Tue, Jun 20, 2023 at 7:31 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote: > > Good point. This is based on an assumption that all SO_REUSEPORT > sockets have the same score, which is wrong for two corner cases > if reuseport_has_conns() == true : > > 1) SO_INCOMING_CPU is set > -> selected sk might have +1 score > > 2) BPF prog returns ESTABLISHED and/or SO_INCOMING_CPU sk > -> selected sk will have more than 8 > > Using the old score could trigger more lookups depending on the > order that sockets are created. So the result will still be correct, but it's less performant? Happy to fix a perf regression, but if the result is incorrect this might need a separate fix? Best Lorenz
On Tue, Jun 20, 2023 at 7:31 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote: > > Good point. This is based on an assumption that all SO_REUSEPORT > sockets have the same score, which is wrong for two corner cases [...] I did some more digging. I think this was introduced by commit efc6b6f6c311 ("udp: Improve load balancing for SO_REUSEPORT.") which unfortunately ran into a merge conflict. That resulted in Dave Miller moving the bug around in commit a57066b1a019 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net"). Can you take a look and let me know if you think that is correct? Best Lorenz
From: Lorenz Bauer <lmb@isovalent.com> Date: Wed, 21 Jun 2023 09:01:15 +0100 > On Tue, Jun 20, 2023 at 7:31 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote: > > > > Good point. This is based on an assumption that all SO_REUSEPORT > > sockets have the same score, which is wrong for two corner cases > > if reuseport_has_conns() == true : > > > > 1) SO_INCOMING_CPU is set > > -> selected sk might have +1 score > > > > 2) BPF prog returns ESTABLISHED and/or SO_INCOMING_CPU sk > > -> selected sk will have more than 8 > > > > Using the old score could trigger more lookups depending on the > > order that sockets are created. > > So the result will still be correct, but it's less performant? Happy > to fix a perf regression, but if the result is incorrect this might > need a separate fix? Right, the result is always correct. If BPF prog selects a different socket per lookup, there is no consistency, but it _is_ corret. > I did some more digging. I think this was introduced by commit > efc6b6f6c311 ("udp: Improve load balancing for SO_REUSEPORT.") which > unfortunately ran into a merge conflict. That resulted in Dave Miller > moving the bug around in commit a57066b1a019 ("Merge > git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net"). Can you > take a look and let me know if you think that is correct? Yes, I should have updated the score too in efc6b6f6c311 to save unneeded lookups. The conflict itself was resolved properly.
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 032ddab48f8f..49d586454287 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -48,12 +48,21 @@ struct sock *__inet6_lookup_established(struct net *net, const u16 hnum, const int dif, const int sdif); +typedef u32 (*inet6_ehashfn_t)(const struct net *net, + const struct in6_addr *laddr, const u16 lport, + const struct in6_addr *faddr, const __be16 fport); + +u32 inet6_ehashfn(const struct net *net, + const struct in6_addr *laddr, const u16 lport, + const struct in6_addr *faddr, const __be16 fport); + struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, - unsigned short hnum); + unsigned short hnum, + inet6_ehashfn_t ehashfn); struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 8734f3488f5d..51ab6a1a3601 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -379,10 +379,19 @@ struct sock *__inet_lookup_established(struct net *net, const __be32 daddr, const u16 hnum, const int dif, const int sdif); +typedef u32 (*inet_ehashfn_t)(const struct net *net, + const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport); + +u32 inet_ehashfn(const struct net *net, + const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport); + struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum); + __be32 daddr, unsigned short hnum, + inet_ehashfn_t ehashfn); static inline struct sock * inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, @@ -453,10 +462,6 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, refcounted); } -u32 inet6_ehashfn(const struct net *net, - const struct in6_addr *laddr, const u16 lport, - const struct in6_addr *faddr, const __be16 fport); - static inline void sk_daddr_set(struct sock *sk, __be32 addr) { sk->sk_daddr = addr; /* alias of inet_daddr */ diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 91f9210d4e83..1ec895fd9905 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -28,9 +28,9 @@ #include <net/tcp.h> #include <net/sock_reuseport.h> -static u32 inet_ehashfn(const struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) +u32 inet_ehashfn(const struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) { static u32 inet_ehash_secret __read_mostly; @@ -332,6 +332,10 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } +INDIRECT_CALLABLE_DECLARE(u32 udp_ehashfn(const struct net *, + const __be32, const __u16, + const __be32, const __be16)); + /** * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. * @net: network namespace. @@ -342,6 +346,7 @@ static inline int compute_score(struct sock *sk, struct net *net, * @sport: source port. * @daddr: destination address. * @hnum: destination port in host byte order. + * @ehashfn: hash function used to generate the fallback hash. * * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. @@ -349,13 +354,15 @@ static inline int compute_score(struct sock *sk, struct net *net, struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum) + __be32 daddr, unsigned short hnum, + inet_ehashfn_t ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { - phash = inet_ehashfn(net, daddr, hnum, saddr, sport); + phash = INDIRECT_CALL_2(ehashfn, inet_ehashfn, udp_ehashfn, + net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; @@ -385,7 +392,7 @@ static struct sock *inet_lhash2_lookup(struct net *net, score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = inet_lookup_reuseport(net, sk, skb, doff, - saddr, sport, daddr, hnum); + saddr, sport, daddr, hnum, inet_ehashfn); if (result) return result; @@ -414,7 +421,8 @@ static inline struct sock *inet_lookup_run_bpf(struct net *net, if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, + inet_ehashfn); if (reuse_sk) sk = reuse_sk; return sk; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fd3dae081f3a..10468fe144d0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -405,9 +405,9 @@ static int compute_score(struct sock *sk, struct net *net, return score; } -static u32 udp_ehashfn(const struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) +INDIRECT_CALLABLE_SCOPE +u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport) { static u32 udp_ehash_secret __read_mostly; @@ -417,22 +417,6 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, udp_ehash_secret + net_hash_mix(net)); } -static struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, - __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum) -{ - struct sock *reuse_sk = NULL; - u32 hash; - - if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { - hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - reuse_sk = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - } - return reuse_sk; -} - /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, @@ -450,11 +434,13 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - result = lookup_reuseport(net, sk, skb, - saddr, sport, daddr, hnum); - /* Fall back to scoring if group has connections */ - if (result && !reuseport_has_conns(sk)) - return result; + if (sk->sk_state != TCP_ESTABLISHED) { + result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, udp_ehashfn); + /* Fall back to scoring if group has connections */ + if (result && !reuseport_has_conns(sk)) + return result; + } result = result ? : sk; badness = score; @@ -480,7 +466,8 @@ static struct sock *udp4_lookup_run_bpf(struct net *net, if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); + reuse_sk = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, udp_ehashfn); if (reuse_sk) sk = reuse_sk; return sk; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 208998694ae3..a350ee40141c 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -111,6 +111,10 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } +INDIRECT_CALLABLE_DECLARE(u32 udp6_ehashfn(const struct net *, + const struct in6_addr *, const u16, + const struct in6_addr *, const __be16)); + /** * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary. * @net: network namespace. @@ -121,6 +125,7 @@ static inline int compute_score(struct sock *sk, struct net *net, * @sport: source port. * @daddr: destination address. * @hnum: destination port in host byte order. + * @ehashfn: hash function used to generate the fallback hash. * * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. @@ -130,13 +135,15 @@ struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, - unsigned short hnum) + unsigned short hnum, + inet6_ehashfn_t ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { - phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + phash = INDIRECT_CALL_2(ehashfn, inet6_ehashfn, udp6_ehashfn, + net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; @@ -159,7 +166,7 @@ static struct sock *inet6_lhash2_lookup(struct net *net, score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = inet6_lookup_reuseport(net, sk, skb, doff, - saddr, sport, daddr, hnum); + saddr, sport, daddr, hnum, inet6_ehashfn); if (result) return result; @@ -190,7 +197,8 @@ static inline struct sock *inet6_lookup_run_bpf(struct net *net, if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum, inet6_ehashfn); if (reuse_sk) sk = reuse_sk; return sk; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e5a337e6b970..2af3a595f38a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -70,11 +70,12 @@ int udpv6_init_sock(struct sock *sk) return 0; } -static u32 udp6_ehashfn(const struct net *net, - const struct in6_addr *laddr, - const u16 lport, - const struct in6_addr *faddr, - const __be16 fport) +INDIRECT_CALLABLE_SCOPE +u32 udp6_ehashfn(const struct net *net, + const struct in6_addr *laddr, + const u16 lport, + const struct in6_addr *faddr, + const __be16 fport) { static u32 udp6_ehash_secret __read_mostly; static u32 udp_ipv6_hash_secret __read_mostly; @@ -159,24 +160,6 @@ static int compute_score(struct sock *sk, struct net *net, return score; } -static struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, - const struct in6_addr *saddr, - __be16 sport, - const struct in6_addr *daddr, - unsigned int hnum) -{ - struct sock *reuse_sk = NULL; - u32 hash; - - if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { - hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - reuse_sk = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - } - return reuse_sk; -} - /* called with rcu_read_lock() */ static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -193,11 +176,14 @@ static struct sock *udp6_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - result = lookup_reuseport(net, sk, skb, - saddr, sport, daddr, hnum); - /* Fall back to scoring if group has connections */ - if (result && !reuseport_has_conns(sk)) - return result; + if (sk->sk_state != TCP_ESTABLISHED) { + result = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, + udp6_ehashfn); + /* Fall back to scoring if group has connections */ + if (result && !reuseport_has_conns(sk)) + return result; + } result = result ? : sk; badness = score; @@ -225,7 +211,8 @@ static inline struct sock *udp6_lookup_run_bpf(struct net *net, if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); + reuse_sk = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, udp6_ehashfn); if (reuse_sk) sk = reuse_sk; return sk;
There are currently four copies of reuseport_lookup: one each for (TCP, UDP)x(IPv4, IPv6). This forces us to duplicate all callers of those functions as well. This is already the case for sk_lookup helpers (inet,inet6,udp4,udp6)_lookup_run_bpf. The only difference between the reuseport_lookup helpers is calling a different hash function. Cut down the number of reuseport_lookup functions to one per IP version by using the INDIRECT_CALL infrastructure. Signed-off-by: Lorenz Bauer <lmb@isovalent.com> --- include/net/inet6_hashtables.h | 11 ++++++++++- include/net/inet_hashtables.h | 15 +++++++++----- net/ipv4/inet_hashtables.c | 22 ++++++++++++++------- net/ipv4/udp.c | 37 +++++++++++----------------------- net/ipv6/inet6_hashtables.c | 16 +++++++++++---- net/ipv6/udp.c | 45 +++++++++++++++--------------------------- 6 files changed, 75 insertions(+), 71 deletions(-)