Message ID | 20240711071017.64104-1-348067333@qq.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net-next] inet: reduce the execution time of getsockname() | expand |
From: heze0908 <heze0908@gmail.com> Date: Thu, 11 Jul 2024 15:10:17 +0800 > From: Ze He <zanehe@tencent.com> > > Recently, we received feedback regarding an increase > in the time consumption of getsockname() in production. > Therefore, we conducted tests based on the > "getsockname" test item in libmicro. The test results > indicate that compared to the kernel 5.4, the latest > kernel indeed has an increased time consumption > in getsockname(). > The test results are as follows: > > case_name kernel 5.4 latest kernel diff > ---------- ----------- ------------- -------- > getsockname 0.12278 0.18246 +48.61% > > It was discovered that the introduction of lock_sock() in > commit 9dfc685e0262 ("inet: remove races in inet{6}_getname()") > to solve the data race problem between __inet_hash_connect() > and inet_getname() has led to the increased time consumption. > This patch attempts to propose a lockless solution to replace > the spinlock solution. > > We have to solve the race issue without heavy spin lock: > one reader is reading some members in struct inet_sock > while the other writer is trying to modify them. Those > members are "inet_sport" "inet_saddr" "inet_dport" > "inet_rcv_saddr". Therefore, in the path of getname, we > use READ_ONCE to read these data, and correspondingly, > in the path of tcp connect, we use WRITE_ONCE to write > these data. > > Using this patch, we conducted the getsockname test again, > and the results are as follows: > > case_name latest kernel latest kernel(patched) > ---------- ----------- --------------------- > getsockname 0.18246 0.14423 > > Signed-off-by: Jason Xing <kernelxing@tencent.com> > Signed-off-by: Ze He <zanehe@tencent.com> > --- > include/net/ip.h | 3 ++- > net/ipv4/af_inet.c | 27 +++++++++++++-------------- > net/ipv4/inet_hashtables.c | 8 ++++---- > net/ipv4/tcp_ipv4.c | 4 ++-- > net/ipv6/af_inet6.c | 22 ++++++++++------------ > net/ipv6/inet6_hashtables.c | 2 +- > net/ipv6/tcp_ipv6.c | 6 +++--- > 7 files changed, 35 insertions(+), 37 deletions(-) > > diff --git a/include/net/ip.h b/include/net/ip.h > index c5606cadb1a5..cec1919cfdd0 100644 > --- a/include/net/ip.h > +++ b/include/net/ip.h > @@ -663,7 +663,8 @@ static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast, > > static __inline__ void inet_reset_saddr(struct sock *sk) > { > - inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0; > + WRITE_ONCE(inet_sk(sk)->inet_rcv_saddr, 0); > + WRITE_ONCE(inet_sk(sk)->inet_saddr, 0); > #if IS_ENABLED(CONFIG_IPV6) > if (sk->sk_family == PF_INET6) { > struct ipv6_pinfo *np = inet6_sk(sk); > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c > index b24d74616637..e8c035f23078 100644 > --- a/net/ipv4/af_inet.c > +++ b/net/ipv4/af_inet.c > @@ -803,28 +803,27 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, > int sin_addr_len = sizeof(*sin); > > sin->sin_family = AF_INET; > - lock_sock(sk); > if (peer) { > - if (!inet->inet_dport || > + __be16 dport = READ_ONCE(inet->inet_dport); > + > + if (!dport || > (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && sk->sk_state will need annotation. > - peer == 1)) { > - release_sock(sk); > + peer == 1)) > return -ENOTCONN; > - } > - sin->sin_port = inet->inet_dport; > + sin->sin_port = dport; > sin->sin_addr.s_addr = inet->inet_daddr; READ_ONCE() is needed here, and WRITE_ONCE() in sk_daddr_set(). > - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, > - CGROUP_INET4_GETPEERNAME); > + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len, > + CGROUP_INET4_GETPEERNAME, NULL); > } else { > - __be32 addr = inet->inet_rcv_saddr; > + __be32 addr = READ_ONCE(inet->inet_rcv_saddr); > + > if (!addr) > - addr = inet->inet_saddr; > - sin->sin_port = inet->inet_sport; > + addr = READ_ONCE(inet->inet_saddr); > + sin->sin_port = READ_ONCE(inet->inet_sport); > sin->sin_addr.s_addr = addr; > - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, > - CGROUP_INET4_GETSOCKNAME); > + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len, > + CGROUP_INET4_GETSOCKNAME, NULL); > } > - release_sock(sk); > memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); > return sin_addr_len; > } > diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c > index 48d0d494185b..9398dbf625b4 100644 > --- a/net/ipv4/inet_hashtables.c > +++ b/net/ipv4/inet_hashtables.c > @@ -577,7 +577,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, > * in hash table socket with a funny identity. > */ > inet->inet_num = lport; > - inet->inet_sport = htons(lport); > + WRITE_ONCE(inet->inet_sport, htons(lport)); > sk->sk_hash = hash; > WARN_ON(!sk_unhashed(sk)); > __sk_nulls_add_node_rcu(sk, &head->chain); > @@ -877,7 +877,7 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in > static void inet_update_saddr(struct sock *sk, void *saddr, int family) > { > if (family == AF_INET) { > - inet_sk(sk)->inet_saddr = *(__be32 *)saddr; > + WRITE_ONCE(inet_sk(sk)->inet_saddr, *(__be32 *)saddr); > sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); > } > #if IS_ENABLED(CONFIG_IPV6) > @@ -1115,7 +1115,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, > inet_bind_hash(sk, tb, tb2, port); > > if (sk_unhashed(sk)) { > - inet_sk(sk)->inet_sport = htons(port); > + WRITE_ONCE(inet_sk(sk)->inet_sport, htons(port)); > inet_ehash_nolisten(sk, (struct sock *)tw, NULL); > } > if (tw) > @@ -1140,7 +1140,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, > spin_unlock(lock); > > sk->sk_hash = 0; > - inet_sk(sk)->inet_sport = 0; > + WRITE_ONCE(inet_sk(sk)->inet_sport, 0); > inet_sk(sk)->inet_num = 0; > > if (tw) > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c > index fd17f25ff288..041a29d8a0fb 100644 > --- a/net/ipv4/tcp_ipv4.c > +++ b/net/ipv4/tcp_ipv4.c > @@ -279,7 +279,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) > WRITE_ONCE(tp->write_seq, 0); > } > > - inet->inet_dport = usin->sin_port; > + WRITE_ONCE(inet->inet_dport, usin->sin_port); > sk_daddr_set(sk, daddr); > > inet_csk(sk)->icsk_ext_hdr_len = 0; > @@ -348,7 +348,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) > inet_bhash2_reset_saddr(sk); > ip_rt_put(rt); > sk->sk_route_caps = 0; > - inet->inet_dport = 0; > + WRITE_ONCE(inet->inet_dport, 0); > return err; > } > EXPORT_SYMBOL(tcp_v4_connect); > diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c > index e03fb9a1dbeb..241bc6d2d0a2 100644 > --- a/net/ipv6/af_inet6.c > +++ b/net/ipv6/af_inet6.c > @@ -532,32 +532,30 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, > sin->sin6_family = AF_INET6; > sin->sin6_flowinfo = 0; > sin->sin6_scope_id = 0; > - lock_sock(sk); > if (peer) { > - if (!inet->inet_dport || > + __be16 dport = READ_ONCE(inet->inet_dport); > + > + if (!dport || > (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && > - peer == 1)) { > - release_sock(sk); > + peer == 1)) > return -ENOTCONN; > - } > - sin->sin6_port = inet->inet_dport; > + sin->sin6_port = dport; > sin->sin6_addr = sk->sk_v6_daddr; This access is also racy, > if (inet6_test_bit(SNDFLOW, sk)) > sin->sin6_flowinfo = np->flow_label; > - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, > - CGROUP_INET6_GETPEERNAME); > + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len, > + CGROUP_INET6_GETPEERNAME, NULL); > } else { > if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) > sin->sin6_addr = np->saddr; > else > sin->sin6_addr = sk->sk_v6_rcv_saddr; and same here. > - sin->sin6_port = inet->inet_sport; > - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, > - CGROUP_INET6_GETSOCKNAME); > + sin->sin6_port = READ_ONCE(inet->inet_sport); > + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len, > + CGROUP_INET6_GETSOCKNAME, NULL); > } > sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, > sk->sk_bound_dev_if); > - release_sock(sk); > return sin_addr_len; > } > EXPORT_SYMBOL(inet6_getname); > diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c > index 6db71bb1cd30..d5b191db9dfe 100644 > --- a/net/ipv6/inet6_hashtables.c > +++ b/net/ipv6/inet6_hashtables.c > @@ -302,7 +302,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, > * in hash table socket with a funny identity. > */ > inet->inet_num = lport; > - inet->inet_sport = htons(lport); > + WRITE_ONCE(inet->inet_sport, htons(lport)); > sk->sk_hash = hash; > WARN_ON(!sk_unhashed(sk)); > __sk_nulls_add_node_rcu(sk, &head->chain); > diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c > index 200fea92f12f..f78ab704378a 100644 > --- a/net/ipv6/tcp_ipv6.c > +++ b/net/ipv6/tcp_ipv6.c > @@ -293,7 +293,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, > > /* set the source address */ > np->saddr = *saddr; > - inet->inet_rcv_saddr = LOOPBACK4_IPV6; > + WRITE_ONCE(inet->inet_rcv_saddr, LOOPBACK4_IPV6); > > sk->sk_gso_type = SKB_GSO_TCPV6; > ip6_dst_store(sk, dst, NULL, NULL); > @@ -305,7 +305,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, > > tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); > > - inet->inet_dport = usin->sin6_port; > + WRITE_ONCE(inet->inet_dport, usin->sin6_port); > > tcp_set_state(sk, TCP_SYN_SENT); > err = inet6_hash_connect(tcp_death_row, sk); > @@ -340,7 +340,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, > tcp_set_state(sk, TCP_CLOSE); > inet_bhash2_reset_saddr(sk); > failure: > - inet->inet_dport = 0; > + WRITE_ONCE(inet->inet_dport, 0); > sk->sk_route_caps = 0; > return err; > } > -- > 2.43.5
On Thu, Jul 11, 2024 at 12:10 AM heze0908 <heze0908@gmail.com> wrote: > > From: Ze He <zanehe@tencent.com> > > Recently, we received feedback regarding an increase > in the time consumption of getsockname() in production. > Therefore, we conducted tests based on the > "getsockname" test item in libmicro. The test results > indicate that compared to the kernel 5.4, the latest > kernel indeed has an increased time consumption > in getsockname(). > The test results are as follows: > > case_name kernel 5.4 latest kernel diff > ---------- ----------- ------------- -------- > getsockname 0.12278 0.18246 +48.61% > > It was discovered that the introduction of lock_sock() in > commit 9dfc685e0262 ("inet: remove races in inet{6}_getname()") > to solve the data race problem between __inet_hash_connect() > and inet_getname() has led to the increased time consumption. > This patch attempts to propose a lockless solution to replace > the spinlock solution. > > We have to solve the race issue without heavy spin lock: > one reader is reading some members in struct inet_sock > while the other writer is trying to modify them. Those > members are "inet_sport" "inet_saddr" "inet_dport" > "inet_rcv_saddr". Therefore, in the path of getname, we > use READ_ONCE to read these data, and correspondingly, > in the path of tcp connect, we use WRITE_ONCE to write > these data. > > Using this patch, we conducted the getsockname test again, > and the results are as follows: > > case_name latest kernel latest kernel(patched) > ---------- ----------- --------------------- > getsockname 0.18246 0.14423 > > Signed-off-by: Jason Xing <kernelxing@tencent.com> > Signed-off-by: Ze He <zanehe@tencent.com> There is no way you can implement a correct getsockname() without extra synchronization. When multiple fields are read, READ_ONCE() will not ensure consistency, especially for IPv6 addresses which are too big to fit in a single word.
Hello Eric, On Fri, Jul 12, 2024 at 1:26 AM Eric Dumazet <edumazet@google.com> wrote: > > On Thu, Jul 11, 2024 at 12:10 AM heze0908 <heze0908@gmail.com> wrote: > > > > From: Ze He <zanehe@tencent.com> > > > > Recently, we received feedback regarding an increase > > in the time consumption of getsockname() in production. > > Therefore, we conducted tests based on the > > "getsockname" test item in libmicro. The test results > > indicate that compared to the kernel 5.4, the latest > > kernel indeed has an increased time consumption > > in getsockname(). > > The test results are as follows: > > > > case_name kernel 5.4 latest kernel diff > > ---------- ----------- ------------- -------- > > getsockname 0.12278 0.18246 +48.61% > > > > It was discovered that the introduction of lock_sock() in > > commit 9dfc685e0262 ("inet: remove races in inet{6}_getname()") > > to solve the data race problem between __inet_hash_connect() > > and inet_getname() has led to the increased time consumption. > > This patch attempts to propose a lockless solution to replace > > the spinlock solution. > > > > We have to solve the race issue without heavy spin lock: > > one reader is reading some members in struct inet_sock > > while the other writer is trying to modify them. Those > > members are "inet_sport" "inet_saddr" "inet_dport" > > "inet_rcv_saddr". Therefore, in the path of getname, we > > use READ_ONCE to read these data, and correspondingly, > > in the path of tcp connect, we use WRITE_ONCE to write > > these data. > > > > Using this patch, we conducted the getsockname test again, > > and the results are as follows: > > > > case_name latest kernel latest kernel(patched) > > ---------- ----------- --------------------- > > getsockname 0.18246 0.14423 > > > > Signed-off-by: Jason Xing <kernelxing@tencent.com> > > Signed-off-by: Ze He <zanehe@tencent.com> > > There is no way you can implement a correct getsockname() without > extra synchronization. > > When multiple fields are read, READ_ONCE() will not ensure > consistency, especially for IPv6 addresses > which are too big to fit in a single word. > Thanks for your reply. I was thinking two ways at the beginning, one is using lockless way as this patch does which apparently is a little bit complicated, the other one is reverting commit 9dfc685e0262 ("inet: remove races in inet{6}_getname()") because in the real world I don't think the software programmer could call this two syscalls (connect and getsockname) concurrently. What is the use/meaning of calling those two concurrently? Even if there is data-race in this case, programmers cannot trust the results of getsockname one way or another. The fact is the degradation of performance, which the users complain about after upgrading the kernel from 5.4 to the latest. What do you suggest, Eric? Thanks, Jason
On Thu, Jul 11, 2024 at 4:58 PM Jason Xing <kerneljasonxing@gmail.com> wrote: > > Hello Eric, > > On Fri, Jul 12, 2024 at 1:26 AM Eric Dumazet <edumazet@google.com> wrote: > > > > On Thu, Jul 11, 2024 at 12:10 AM heze0908 <heze0908@gmail.com> wrote: > > > > > > From: Ze He <zanehe@tencent.com> > > > > > > Recently, we received feedback regarding an increase > > > in the time consumption of getsockname() in production. > > > Therefore, we conducted tests based on the > > > "getsockname" test item in libmicro. The test results > > > indicate that compared to the kernel 5.4, the latest > > > kernel indeed has an increased time consumption > > > in getsockname(). > > > The test results are as follows: > > > > > > case_name kernel 5.4 latest kernel diff > > > ---------- ----------- ------------- -------- > > > getsockname 0.12278 0.18246 +48.61% > > > > > > It was discovered that the introduction of lock_sock() in > > > commit 9dfc685e0262 ("inet: remove races in inet{6}_getname()") > > > to solve the data race problem between __inet_hash_connect() > > > and inet_getname() has led to the increased time consumption. > > > This patch attempts to propose a lockless solution to replace > > > the spinlock solution. > > > > > > We have to solve the race issue without heavy spin lock: > > > one reader is reading some members in struct inet_sock > > > while the other writer is trying to modify them. Those > > > members are "inet_sport" "inet_saddr" "inet_dport" > > > "inet_rcv_saddr". Therefore, in the path of getname, we > > > use READ_ONCE to read these data, and correspondingly, > > > in the path of tcp connect, we use WRITE_ONCE to write > > > these data. > > > > > > Using this patch, we conducted the getsockname test again, > > > and the results are as follows: > > > > > > case_name latest kernel latest kernel(patched) > > > ---------- ----------- --------------------- > > > getsockname 0.18246 0.14423 > > > > > > Signed-off-by: Jason Xing <kernelxing@tencent.com> > > > Signed-off-by: Ze He <zanehe@tencent.com> > > > > There is no way you can implement a correct getsockname() without > > extra synchronization. > > > > When multiple fields are read, READ_ONCE() will not ensure > > consistency, especially for IPv6 addresses > > which are too big to fit in a single word. > > > > Thanks for your reply. > > I was thinking two ways at the beginning, one is using lockless way as > this patch does which apparently is a little bit complicated, the > other one is reverting commit 9dfc685e0262 ("inet: remove races in > inet{6}_getname()") because in the real world I don't think the > software programmer could call this two syscalls (connect and > getsockname) concurrently. What is the use/meaning of calling those > two concurrently? Even if there is data-race in this case, programmers > cannot trust the results of getsockname one way or another. The fact > is the degradation of performance, which the users complain about > after upgrading the kernel from 5.4 to the latest. What do you > suggest, Eric? In the 'real world' we need results that we can trust. Fact that it was missing in the past is not an excuse, this was a bug in the first place. Feel free to implement an alternate protection, if you really need to.
On Fri, Jul 12, 2024 at 8:32 AM Eric Dumazet <edumazet@google.com> wrote: > > On Thu, Jul 11, 2024 at 4:58 PM Jason Xing <kerneljasonxing@gmail.com> wrote: > > > > Hello Eric, > > > > On Fri, Jul 12, 2024 at 1:26 AM Eric Dumazet <edumazet@google.com> wrote: > > > > > > On Thu, Jul 11, 2024 at 12:10 AM heze0908 <heze0908@gmail.com> wrote: > > > > > > > > From: Ze He <zanehe@tencent.com> > > > > > > > > Recently, we received feedback regarding an increase > > > > in the time consumption of getsockname() in production. > > > > Therefore, we conducted tests based on the > > > > "getsockname" test item in libmicro. The test results > > > > indicate that compared to the kernel 5.4, the latest > > > > kernel indeed has an increased time consumption > > > > in getsockname(). > > > > The test results are as follows: > > > > > > > > case_name kernel 5.4 latest kernel diff > > > > ---------- ----------- ------------- -------- > > > > getsockname 0.12278 0.18246 +48.61% > > > > > > > > It was discovered that the introduction of lock_sock() in > > > > commit 9dfc685e0262 ("inet: remove races in inet{6}_getname()") > > > > to solve the data race problem between __inet_hash_connect() > > > > and inet_getname() has led to the increased time consumption. > > > > This patch attempts to propose a lockless solution to replace > > > > the spinlock solution. > > > > > > > > We have to solve the race issue without heavy spin lock: > > > > one reader is reading some members in struct inet_sock > > > > while the other writer is trying to modify them. Those > > > > members are "inet_sport" "inet_saddr" "inet_dport" > > > > "inet_rcv_saddr". Therefore, in the path of getname, we > > > > use READ_ONCE to read these data, and correspondingly, > > > > in the path of tcp connect, we use WRITE_ONCE to write > > > > these data. > > > > > > > > Using this patch, we conducted the getsockname test again, > > > > and the results are as follows: > > > > > > > > case_name latest kernel latest kernel(patched) > > > > ---------- ----------- --------------------- > > > > getsockname 0.18246 0.14423 > > > > > > > > Signed-off-by: Jason Xing <kernelxing@tencent.com> > > > > Signed-off-by: Ze He <zanehe@tencent.com> > > > > > > There is no way you can implement a correct getsockname() without > > > extra synchronization. > > > > > > When multiple fields are read, READ_ONCE() will not ensure > > > consistency, especially for IPv6 addresses > > > which are too big to fit in a single word. > > > > > > > Thanks for your reply. > > > > I was thinking two ways at the beginning, one is using lockless way as > > this patch does which apparently is a little bit complicated, the > > other one is reverting commit 9dfc685e0262 ("inet: remove races in > > inet{6}_getname()") because in the real world I don't think the > > software programmer could call this two syscalls (connect and > > getsockname) concurrently. What is the use/meaning of calling those > > two concurrently? Even if there is data-race in this case, programmers > > cannot trust the results of getsockname one way or another. The fact > > is the degradation of performance, which the users complain about > > after upgrading the kernel from 5.4 to the latest. What do you > > suggest, Eric? > > In the 'real world' we need results that we can trust. > > Fact that it was missing in the past is not an excuse, this was a bug > in the first place. > > Feel free to implement an alternate protection, if you really need to. Umm. Let me think more about how to solve the data consistency issue... Really hard, I think. Thanks, Jason
diff --git a/include/net/ip.h b/include/net/ip.h index c5606cadb1a5..cec1919cfdd0 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -663,7 +663,8 @@ static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast, static __inline__ void inet_reset_saddr(struct sock *sk) { - inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0; + WRITE_ONCE(inet_sk(sk)->inet_rcv_saddr, 0); + WRITE_ONCE(inet_sk(sk)->inet_saddr, 0); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b24d74616637..e8c035f23078 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -803,28 +803,27 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, int sin_addr_len = sizeof(*sin); sin->sin_family = AF_INET; - lock_sock(sk); if (peer) { - if (!inet->inet_dport || + __be16 dport = READ_ONCE(inet->inet_dport); + + if (!dport || (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && - peer == 1)) { - release_sock(sk); + peer == 1)) return -ENOTCONN; - } - sin->sin_port = inet->inet_dport; + sin->sin_port = dport; sin->sin_addr.s_addr = inet->inet_daddr; - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, - CGROUP_INET4_GETPEERNAME); + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len, + CGROUP_INET4_GETPEERNAME, NULL); } else { - __be32 addr = inet->inet_rcv_saddr; + __be32 addr = READ_ONCE(inet->inet_rcv_saddr); + if (!addr) - addr = inet->inet_saddr; - sin->sin_port = inet->inet_sport; + addr = READ_ONCE(inet->inet_saddr); + sin->sin_port = READ_ONCE(inet->inet_sport); sin->sin_addr.s_addr = addr; - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, - CGROUP_INET4_GETSOCKNAME); + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len, + CGROUP_INET4_GETSOCKNAME, NULL); } - release_sock(sk); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sin_addr_len; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 48d0d494185b..9398dbf625b4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -577,7 +577,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, * in hash table socket with a funny identity. */ inet->inet_num = lport; - inet->inet_sport = htons(lport); + WRITE_ONCE(inet->inet_sport, htons(lport)); sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); @@ -877,7 +877,7 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in static void inet_update_saddr(struct sock *sk, void *saddr, int family) { if (family == AF_INET) { - inet_sk(sk)->inet_saddr = *(__be32 *)saddr; + WRITE_ONCE(inet_sk(sk)->inet_saddr, *(__be32 *)saddr); sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); } #if IS_ENABLED(CONFIG_IPV6) @@ -1115,7 +1115,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, inet_bind_hash(sk, tb, tb2, port); if (sk_unhashed(sk)) { - inet_sk(sk)->inet_sport = htons(port); + WRITE_ONCE(inet_sk(sk)->inet_sport, htons(port)); inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) @@ -1140,7 +1140,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, spin_unlock(lock); sk->sk_hash = 0; - inet_sk(sk)->inet_sport = 0; + WRITE_ONCE(inet_sk(sk)->inet_sport, 0); inet_sk(sk)->inet_num = 0; if (tw) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fd17f25ff288..041a29d8a0fb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -279,7 +279,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) WRITE_ONCE(tp->write_seq, 0); } - inet->inet_dport = usin->sin_port; + WRITE_ONCE(inet->inet_dport, usin->sin_port); sk_daddr_set(sk, daddr); inet_csk(sk)->icsk_ext_hdr_len = 0; @@ -348,7 +348,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet_bhash2_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; - inet->inet_dport = 0; + WRITE_ONCE(inet->inet_dport, 0); return err; } EXPORT_SYMBOL(tcp_v4_connect); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e03fb9a1dbeb..241bc6d2d0a2 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -532,32 +532,30 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_scope_id = 0; - lock_sock(sk); if (peer) { - if (!inet->inet_dport || + __be16 dport = READ_ONCE(inet->inet_dport); + + if (!dport || (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && - peer == 1)) { - release_sock(sk); + peer == 1)) return -ENOTCONN; - } - sin->sin6_port = inet->inet_dport; + sin->sin6_port = dport; sin->sin6_addr = sk->sk_v6_daddr; if (inet6_test_bit(SNDFLOW, sk)) sin->sin6_flowinfo = np->flow_label; - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, - CGROUP_INET6_GETPEERNAME); + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len, + CGROUP_INET6_GETPEERNAME, NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; - sin->sin6_port = inet->inet_sport; - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, - CGROUP_INET6_GETSOCKNAME); + sin->sin6_port = READ_ONCE(inet->inet_sport); + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, &sin_addr_len, + CGROUP_INET6_GETSOCKNAME, NULL); } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); - release_sock(sk); return sin_addr_len; } EXPORT_SYMBOL(inet6_getname); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 6db71bb1cd30..d5b191db9dfe 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -302,7 +302,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, * in hash table socket with a funny identity. */ inet->inet_num = lport; - inet->inet_sport = htons(lport); + WRITE_ONCE(inet->inet_sport, htons(lport)); sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 200fea92f12f..f78ab704378a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -293,7 +293,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, /* set the source address */ np->saddr = *saddr; - inet->inet_rcv_saddr = LOOPBACK4_IPV6; + WRITE_ONCE(inet->inet_rcv_saddr, LOOPBACK4_IPV6); sk->sk_gso_type = SKB_GSO_TCPV6; ip6_dst_store(sk, dst, NULL, NULL); @@ -305,7 +305,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); - inet->inet_dport = usin->sin6_port; + WRITE_ONCE(inet->inet_dport, usin->sin6_port); tcp_set_state(sk, TCP_SYN_SENT); err = inet6_hash_connect(tcp_death_row, sk); @@ -340,7 +340,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tcp_set_state(sk, TCP_CLOSE); inet_bhash2_reset_saddr(sk); failure: - inet->inet_dport = 0; + WRITE_ONCE(inet->inet_dport, 0); sk->sk_route_caps = 0; return err; }