Message ID | 72ae40ef-2d68-2e89-46d3-fc8f820db42a@ya.ru (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net-next] unix: Guarantee sk_state relevance in case of it was assigned by a task on other cpu | expand |
From: Kirill Tkhai <tkhai@ya.ru> Date: Mon, 23 Jan 2023 01:21:20 +0300 > Some functions use unlocked check for sock::sk_state. This does not guarantee > a new value assigned to sk_state on some CPU is already visible on this CPU. > > Example: > > [CPU0:Task0] [CPU1:Task1] > unix_listen() > unix_state_lock(sk); > sk->sk_state = TCP_LISTEN; > unix_state_unlock(sk); > unix_accept() > if (sk->sk_state != TCP_LISTEN) /* not visible */ > goto out; /* return error */ > > Task1 may miss new sk->sk_state value, and then unix_accept() returns error. > Since in this situation unix_accept() is called chronologically later, such > behavior is not obvious and it is wrong. Have you seen this on a real workload ? It sounds like a userspace bug that accept() is called on a different CPU before listen() returns. At least, accept() is fetching sk at the same time, then I think there should be no guarantee that sk_state is TCP_LISTEN. Same for other TCP_ESTABLISHED tests, it seems a program is calling sendmsg()/recvmsg() when sk is still TCP_CLOSE and betting concurrent connect() will finish earlier. > > This patch aims to fix the problem. A new function unix_sock_state() is > introduced, and it makes sure a user never misses a new state assigned just > before the function is called. We will use it in the places, where unlocked > sk_state dereferencing was used before. > > Note, that there remain some more places with sk_state unfixed. Also, the same > problem is with unix_peer(). This will be a subject for future patches. 
> > Signed-off-by: Kirill Tkhai <tkhai@ya.ru> > --- > net/unix/af_unix.c | 43 +++++++++++++++++++++++++++++++------------ > 1 file changed, 31 insertions(+), 12 deletions(-) > > diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c > index 009616fa0256..f53e09a0753b 100644 > --- a/net/unix/af_unix.c > +++ b/net/unix/af_unix.c > @@ -247,6 +247,28 @@ struct sock *unix_peer_get(struct sock *s) > } > EXPORT_SYMBOL_GPL(unix_peer_get); > > +/* This function returns current sk->sk_state guaranteeing > + * its relevance in case of assignment was made on other CPU. > + */ > +static unsigned char unix_sock_state(struct sock *sk) > +{ > + unsigned char s_state = READ_ONCE(sk->sk_state); > + > + /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their > + * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. > + * We may avoid taking the lock in case of those states are > + * already visible. > + */ > + if ((s_state == TCP_ESTABLISHED || s_state == TCP_LISTEN) > + && sk->sk_type != SOCK_DGRAM) > + return s_state; > + > + unix_state_lock(sk); > + s_state = sk->sk_state; > + unix_state_unlock(sk); > + return s_state; > +} > + > static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, > int addr_len) > { > @@ -812,13 +834,9 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) > int nr_fds = 0; > > if (sk) { > - s_state = READ_ONCE(sk->sk_state); > + s_state = unix_sock_state(sk); > u = unix_sk(sk); > > - /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their > - * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. > - * SOCK_DGRAM is ordinary. So, no lock is needed. 
> - */ > if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) > nr_fds = atomic_read(&u->scm_stat.nr_fds); > else if (s_state == TCP_LISTEN) > @@ -1686,7 +1704,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, > goto out; > > err = -EINVAL; > - if (sk->sk_state != TCP_LISTEN) > + if (unix_sock_state(sk) != TCP_LISTEN) > goto out; > > /* If socket state is TCP_LISTEN it cannot change (for now...), > @@ -2178,7 +2196,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, > } > > if (msg->msg_namelen) { > - err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; > + unsigned char s_state = unix_sock_state(sk); > + err = s_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; No need to define s_state here, or a blank line is needed after the definition. https://patchwork.kernel.org/project/netdevbpf/patch/72ae40ef-2d68-2e89-46d3-fc8f820db42a@ya.ru/ > goto out_err; > } else { > err = -ENOTCONN; > @@ -2279,7 +2298,7 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, > return -EOPNOTSUPP; > > other = unix_peer(sk); > - if (!other || sk->sk_state != TCP_ESTABLISHED) > + if (!other || unix_sock_state(sk) != TCP_ESTABLISHED) > return -ENOTCONN; > > if (false) { > @@ -2391,7 +2410,7 @@ static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, > if (err) > return err; > > - if (sk->sk_state != TCP_ESTABLISHED) > + if (unix_sock_state(sk) != TCP_ESTABLISHED) > return -ENOTCONN; > > if (msg->msg_namelen) > @@ -2405,7 +2424,7 @@ static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, > { > struct sock *sk = sock->sk; > > - if (sk->sk_state != TCP_ESTABLISHED) > + if (unix_sock_state(sk) != TCP_ESTABLISHED) > return -ENOTCONN; > > return unix_dgram_recvmsg(sock, msg, size, flags); > @@ -2689,7 +2708,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, > > static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t 
recv_actor) > { > - if (unlikely(sk->sk_state != TCP_ESTABLISHED)) > + if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED)) > return -ENOTCONN; > > return unix_read_skb(sk, recv_actor); > @@ -2713,7 +2732,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, > size_t size = state->size; > unsigned int last_len; > > - if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { > + if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED)) { > err = -EINVAL; > goto out; > }
Hi Kuniyuki, On 24.01.2023 20:57, Kuniyuki Iwashima wrote: > From: Kirill Tkhai <tkhai@ya.ru> > Date: Mon, 23 Jan 2023 01:21:20 +0300 >> Some functions use unlocked check for sock::sk_state. This does not guarantee >> a new value assigned to sk_state on some CPU is already visible on this CPU. >> >> Example: >> >> [CPU0:Task0] [CPU1:Task1] >> unix_listen() >> unix_state_lock(sk); >> sk->sk_state = TCP_LISTEN; >> unix_state_unlock(sk); >> unix_accept() >> if (sk->sk_state != TCP_LISTEN) /* not visible */ >> goto out; /* return error */ >> >> Task1 may miss new sk->sk_state value, and then unix_accept() returns error. >> Since in this situation unix_accept() is called chronologically later, such >> behavior is not obvious and it is wrong. > > Have you seen this on a real workload ? No, I haven't seen. This is my observation on current code, which I made during work on recent scm_fds patch. > It sounds like a userspace bug that accept() is called on a different > CPU before listen() returns. At least, accept() is fetching sk at the > same time, then I think there should be no guarantee that sk_state is > TCP_LISTEN. > > Same for other TCP_ESTABLISHED tests, it seems a program is calling > sendmsg()/recvmsg() when sk is still TCP_CLOSE and betting concurrent > connect() will finish earlier. Not exactly. This is not about the case of "accept() is called before listen() returns". This is about "accept is called after listen returns". unix_listen() unix_state_lock(sk); sk->sk_state = TCP_LISTEN; unix_state_unlock(sk); <<returned from syscall>> unix_accept() if (sk->sk_state != TCP_LISTEN) /* not visible */ goto out; /* return error */ A syscall enter and exit do not imply any memory barriers. New sk_state in unix_accept() may be not visible. Speaking about other cases. 
Even changes made by three sequential syscalls connect(), accept() and sendmsg() may be not visible on other CPU: CPU0 CPU1 CPU2 connect(sk) sk2->sk_state = TCP_ESTABLISHED sendmsg(sk) sk2 = accept() read("fdinfo of sk2") sk2->sk_state is not visible There are a lot of possibilities of smp reordering in recvmsg(). CPU0 CPU1 CPU2 connect(sk) sk2->sk_state = TCP_ESTABLISHED sendmsg(sk) sk2 = accept() sk2 = pidfd_getfd("fd of sk2) ioctl(sk2, SIOCATMARK, &answ) -> answ=1 if (answ) recvmsg(sk2) --> TCP_ESTABLISHED is not visible ^^^ in this example there is also smp reordering between answ and sk_state There are a lot of cases. We should not wait for real appearance, because this may appear on !x86 cpus, and this may result in long and difficult debug for users. We should provide stable interfaces for them. The big advantage is this should not affect performance, since generic case still goes thru unlocked fastpath case. Kirill >> >> This patch aims to fix the problem. A new function unix_sock_state() is >> introduced, and it makes sure a user never misses a new state assigned just >> before the function is called. We will use it in the places, where unlocked >> sk_state dereferencing was used before. >> >> Note, that there remain some more places with sk_state unfixed. Also, the same >> problem is with unix_peer(). This will be a subject for future patches. >> >> Signed-off-by: Kirill Tkhai <tkhai@ya.ru> >> --- >> net/unix/af_unix.c | 43 +++++++++++++++++++++++++++++++------------ >> 1 file changed, 31 insertions(+), 12 deletions(-) >> >> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c >> index 009616fa0256..f53e09a0753b 100644 >> --- a/net/unix/af_unix.c >> +++ b/net/unix/af_unix.c >> @@ -247,6 +247,28 @@ struct sock *unix_peer_get(struct sock *s) >> } >> EXPORT_SYMBOL_GPL(unix_peer_get); >> >> +/* This function returns current sk->sk_state guaranteeing >> + * its relevance in case of assignment was made on other CPU. 
>> + */ >> +static unsigned char unix_sock_state(struct sock *sk) >> +{ >> + unsigned char s_state = READ_ONCE(sk->sk_state); >> + >> + /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their >> + * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. >> + * We may avoid taking the lock in case of those states are >> + * already visible. >> + */ >> + if ((s_state == TCP_ESTABLISHED || s_state == TCP_LISTEN) >> + && sk->sk_type != SOCK_DGRAM) >> + return s_state; >> + >> + unix_state_lock(sk); >> + s_state = sk->sk_state; >> + unix_state_unlock(sk); >> + return s_state; >> +} >> + >> static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, >> int addr_len) >> { >> @@ -812,13 +834,9 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) >> int nr_fds = 0; >> >> if (sk) { >> - s_state = READ_ONCE(sk->sk_state); >> + s_state = unix_sock_state(sk); >> u = unix_sk(sk); >> >> - /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their >> - * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. >> - * SOCK_DGRAM is ordinary. So, no lock is needed. >> - */ >> if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) >> nr_fds = atomic_read(&u->scm_stat.nr_fds); >> else if (s_state == TCP_LISTEN) >> @@ -1686,7 +1704,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, >> goto out; >> >> err = -EINVAL; >> - if (sk->sk_state != TCP_LISTEN) >> + if (unix_sock_state(sk) != TCP_LISTEN) >> goto out; >> >> /* If socket state is TCP_LISTEN it cannot change (for now...), >> @@ -2178,7 +2196,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, >> } >> >> if (msg->msg_namelen) { >> - err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; >> + unsigned char s_state = unix_sock_state(sk); >> + err = s_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; > > No need to define s_state here, or a blank line is needed after > the definition. 
> https://patchwork.kernel.org/project/netdevbpf/patch/72ae40ef-2d68-2e89-46d3-fc8f820db42a@ya.ru/ > >> goto out_err; >> } else { >> err = -ENOTCONN; >> @@ -2279,7 +2298,7 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, >> return -EOPNOTSUPP; >> >> other = unix_peer(sk); >> - if (!other || sk->sk_state != TCP_ESTABLISHED) >> + if (!other || unix_sock_state(sk) != TCP_ESTABLISHED) >> return -ENOTCONN; >> >> if (false) { >> @@ -2391,7 +2410,7 @@ static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, >> if (err) >> return err; >> >> - if (sk->sk_state != TCP_ESTABLISHED) >> + if (unix_sock_state(sk) != TCP_ESTABLISHED) >> return -ENOTCONN; >> >> if (msg->msg_namelen) >> @@ -2405,7 +2424,7 @@ static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, >> { >> struct sock *sk = sock->sk; >> >> - if (sk->sk_state != TCP_ESTABLISHED) >> + if (unix_sock_state(sk) != TCP_ESTABLISHED) >> return -ENOTCONN; >> >> return unix_dgram_recvmsg(sock, msg, size, flags); >> @@ -2689,7 +2708,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, >> >> static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) >> { >> - if (unlikely(sk->sk_state != TCP_ESTABLISHED)) >> + if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED)) >> return -ENOTCONN; >> >> return unix_read_skb(sk, recv_actor); >> @@ -2713,7 +2732,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, >> size_t size = state->size; >> unsigned int last_len; >> >> - if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { >> + if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED)) { >> err = -EINVAL; >> goto out; >> }
On Tue, Jan 24, 2023 at 09:57:12AM -0800, Kuniyuki Iwashima wrote: > From: Kirill Tkhai <tkhai@ya.ru> > Date: Mon, 23 Jan 2023 01:21:20 +0300 > > Some functions use unlocked check for sock::sk_state. This does not guarantee > > a new value assigned to sk_state on some CPU is already visible on this CPU. > > > > Example: > > > > [CPU0:Task0] [CPU1:Task1] > > unix_listen() > > unix_state_lock(sk); > > sk->sk_state = TCP_LISTEN; > > unix_state_unlock(sk); > > unix_accept() > > if (sk->sk_state != TCP_LISTEN) /* not visible */ > > goto out; /* return error */ > > > > Task1 may miss new sk->sk_state value, and then unix_accept() returns error. > > Since in this situation unix_accept() is called chronologically later, such > > behavior is not obvious and it is wrong. > > Have you seen this on a real workload ? > > It sounds like a userspace bug that accept() is called on a different > CPU before listen() returns. At least, accept() is fetching sk at the I must confess I don't get why accept() can't be called on a different cpu while listen() is in progress. As far as I see there is a small race window which is of course not critical at all, since in the worst case we simply report an error back to userspace; still, it is a nit worth fixing. > same time, then I think there should be no guarantee that sk_state is > TCP_LISTEN. > > Same for other TCP_ESTABLISHED tests, it seems a program is calling > sendmsg()/recvmsg() when sk is still TCP_CLOSE and betting concurrent > connect() will finish earlier.
On Mon, 23 Jan 2023 01:21:20 +0300 Kirill Tkhai wrote: > Since in this situation unix_accept() is called chronologically later, such > behavior is not obvious and it is wrong. Noob question, perhaps - how do we establish the ordering ? CPU1 knows that listen() has succeeded without a barrier ?
On 25.01.2023 04:35, Jakub Kicinski wrote: > On Mon, 23 Jan 2023 01:21:20 +0300 Kirill Tkhai wrote: >> Since in this situation unix_accept() is called chronologically later, such >> behavior is not obvious and it is wrong. > > Noob question, perhaps - how do we establish the ordering ? > CPU1 knows that listen() has succeed without a barrier ? 1) There are many combinations with a third task involved: [CPU0:Task0] [CPU1:Task1] [CPU2:Task2] listen(sk) kernel: sk_diag_fill(sk) rep->udiag_state = TCP_LISTEN return_from_syscall userspace: mutex_lock() shared_mem_var = rep->udiag_state mutex_unlock() userspace: mutex_lock() if (shared_mem_var == TCP_LISTEN) accept(sk); /* -> fail, since sk_state is not visible */ mutex_unlock() In this situation Task2 definitely knows Task0's listen() has succeeded, but there is no way to guarantee its accept() won't fail. Although there are appropriate barriers in mutex_lock() and mutex_unlock(), there is no way to add a barrier on CPU1 to make Task0's store visible on CPU2. 2) My understanding is that a chronologically later accept() mustn't miss sk_state. Otherwise, the kernel says that ordering between internal syscall data is userspace's duty, which is wrong. Userspace knows nothing about internal kernel data. It's not prohibited to call accept() for a socket obtained via pidfd_getfd() by a side application. Why doesn't the code guarantee that accept() sees the actual sk_state? [CPU0:Task0] [CPU1:Task1] listen(sk) sk = pidfd_getfd() accept(sk) /* -> fail */ 3) Such possible situations in a log file also look strange: [CPU0:Task0] [CPU1:Task1] listen() get_time(&time1) write_log("listening at ...", time1) get_time(&time2) sk = accept() if (sk < 0) write_log("accept failed at ...", time2) If there is no kernel ordering, we may see in their logs something like the below: Task1.log "listening at time1" Task2.log "accept failed at time2" and time2 > time1. Kirill
On Thu, 26 Jan 2023 00:09:08 +0300 Kirill Tkhai wrote: > 1)There are a many combinations with third task involved: > > [CPU0:Task0] [CPU1:Task1] [CPU2:Task2] > listen(sk) > kernel: > sk_diag_fill(sk) > rep->udiag_state = TCP_LISTEN > return_from_syscall > userspace: > mutex_lock() > shared_mem_var = rep->udiag_state > mutex_unlock() > > userspace: > mutex_lock() > if (shared_mem_var == TCP_LISTEN) > accept(sk); /* -> fail, since sk_state is not visible */ > mutex_unlock() > > In this situation Task2 definitely knows Task0's listen() has succeed, but there is no a possibility > to guarantee its accept() won't fail. Despite there are appropriate barriers in mutex_lock() and mutex_unlock(), > there is no a possibility to add a barrier on CPU1 to make Task0's store visible on CPU2. Me trying to prove that memory ordering is transitive would be 100% speculation. Let's ask Paul instead - is the above valid? Or the fact that CPU1 observes state from CPU0 and is strongly ordered with CPU2 implies that CPU2 will also observe CPU0's state? > 2)My understanding is chronologically later accept() mustn't miss sk_state. > Otherwise, kernel says that ordering between internal syscalls data > is userspace duty, which is wrong. Userspace knows nothing about internal > kernel data. > 3)Such possible situations in log file also look strange: Dunno those points are a bit subjective. Squeezing perf out of distributed systems requires sacrifices
On Wed, Jan 25, 2023 at 10:10:53PM -0800, Jakub Kicinski wrote: > On Thu, 26 Jan 2023 00:09:08 +0300 Kirill Tkhai wrote: > > 1)There are a many combinations with third task involved: > > > > [CPU0:Task0] [CPU1:Task1] [CPU2:Task2] > > listen(sk) > > kernel: > > sk_diag_fill(sk) > > rep->udiag_state = TCP_LISTEN > > return_from_syscall > > userspace: > > mutex_lock() > > shared_mem_var = rep->udiag_state > > mutex_unlock() > > > > userspace: > > mutex_lock() > > if (shared_mem_var == TCP_LISTEN) > > accept(sk); /* -> fail, since sk_state is not visible */ > > mutex_unlock() > > > > In this situation Task2 definitely knows Task0's listen() has succeed, but there is no a possibility > > to guarantee its accept() won't fail. Despite there are appropriate barriers in mutex_lock() and mutex_unlock(), > > there is no a possibility to add a barrier on CPU1 to make Task0's store visible on CPU2. > > Me trying to prove that memory ordering is transitive would be 100% > speculation. Let's ask Paul instead - is the above valid? Or the fact > that CPU1 observes state from CPU0 and is strongly ordered with CPU2 > implies that CPU2 will also observe CPU0's state? Hmmm... What is listen() doing? There seem to be a lot of them in the kernel. But proceeding on first principles... Sometimes. Memory ordering is transitive only when the ordering is sufficiently strong. In this case, I do not see any ordering between CPU 0 and anything else. If the listen() function were to acquire the same mutex as CPU1 and CPU2 did, and if it acquired it first, then CPU2 would be guaranteed to see anything CPU0 did while holding that mutex. Alternatively, if CPU0 wrote to some memory, and CPU1 read that value before releasing the mutex (including possibly before acquiring that mutex), then CPU2 would be guaranteed to see that value (or the value written by some later write to that same memory) after acquiring that mutex. So here are some things you can count on transitively: 1. 
After acquiring a given lock (or mutex or whatever), you will see any values written or read prior to any earlier conflicting release of that same lock. 2. After an access with acquire semantics (for example, smp_load_acquire()) you will see any values written or read prior to any earlier access with release semantics (for example, smp_store_release()). Or in all cases, you might see later values, in case someone else also did a write to the location in question. Does that help, or am I missing a turn in there somewhere? Thanx, Paul
On Thu, 26 Jan 2023 12:25:11 -0800 Paul E. McKenney wrote: > > Me trying to prove that memory ordering is transitive would be 100% > > speculation. Let's ask Paul instead - is the above valid? Or the fact > > that CPU1 observes state from CPU0 and is strongly ordered with CPU2 > > implies that CPU2 will also observe CPU0's state? > > Hmmm... What is listen() doing? There seem to be a lot of them > in the kernel. > > But proceeding on first principles... > > Sometimes. Memory ordering is transitive only when the ordering is > sufficiently strong. > > In this case, I do not see any ordering between CPU 0 and anything else. > If the listen() function were to acquire the same mutex as CPU1 and CPU2 > did, and if it acquired it first, then CPU2 would be guaranteed to see > anything CPU0 did while holding that mutex. The fuller picture would be: [CPU0] [CPU1] [CPU2] WRITE_ONCE(sk->sk_state, TCP_LISTEN); val = READ_ONCE(sk->sk_state) mutex_lock() shared_mem_var = val mutex_unlock() mutex_lock() if (shared_mem_var == TCP_LISTEN) BUG_ON(READ_ONCE(sk->sk_state) != TCP_LISTEN) mutex_unlock() > Alternatively, if CPU0 wrote to some memory, and CPU1 read that value > before releasing the mutex (including possibly before acquiring that > mutex), then CPU2 would be guaranteed to see that value (or the value > written by some later write to that same memory) after acquiring that > mutex. Which I believe is exactly what happens in the example. > So here are some things you can count on transitively: > > 1. After acquiring a given lock (or mutex or whatever), you will > see any values written or read prior to any earlier conflicting > release of that same lock. > > 2. After an access with acquire semantics (for example, > smp_load_acquire()) you will see any values written or read > prior to any earlier access with release semantics (for example, > smp_store_release()). 
> > Or in all cases, you might see later values, in case someone else also > did a write to the location in question. > > Does that help, or am I missing a turn in there somewhere? Very much so, thank you!
On Thu, Jan 26, 2023 at 01:33:22PM -0800, Jakub Kicinski wrote: > On Thu, 26 Jan 2023 12:25:11 -0800 Paul E. McKenney wrote: > > > Me trying to prove that memory ordering is transitive would be 100% > > > speculation. Let's ask Paul instead - is the above valid? Or the fact > > > that CPU1 observes state from CPU0 and is strongly ordered with CPU2 > > > implies that CPU2 will also observe CPU0's state? > > > > Hmmm... What is listen() doing? There seem to be a lot of them > > in the kernel. > > > > But proceeding on first principles... > > > > Sometimes. Memory ordering is transitive only when the ordering is > > sufficiently strong. > > > > In this case, I do not see any ordering between CPU 0 and anything else. > > If the listen() function were to acquire the same mutex as CPU1 and CPU2 > > did, and if it acquired it first, then CPU2 would be guaranteed to see > > anything CPU0 did while holding that mutex. > > The fuller picture would be: > > [CPU0] [CPU1] [CPU2] > WRITE_ONCE(sk->sk_state, > TCP_LISTEN); > val = READ_ONCE(sk->sk_state) > mutex_lock() > shared_mem_var = val > mutex_unlock() > mutex_lock() > if (shared_mem_var == TCP_LISTEN) > BUG_ON(READ_ONCE(sk->sk_state) > != TCP_LISTEN) > mutex_unlock() And just to make sure that I represented your example correctly, please see the litmus test below. LKMM says that the bad outcome cannot happen, that is, if CPU1 sees CPU0's write to sk->sk_state ("ss" in the litmus test) and if CPU2 sees CPU1's write to shared_mem_var ("smv" in the litmus test), then it cannot be the case that CPU2 sees the old value of sk->sk_state that was overwritten by CPU0. And as you note below, it is no surprise that LKMM has this opinion. ;-) But I need to make sure that I didn't misrepresent your diagram above. 
> > Alternatively, if CPU0 wrote to some memory, and CPU1 read that value > > before releasing the mutex (including possibly before acquiring that > > mutex), then CPU2 would be guaranteed to see that value (or the value > > written by some later write to that same memory) after acquiring that > > mutex. > > Which I believe is exactly what happens in the example. > > > So here are some things you can count on transitively: > > > > 1. After acquiring a given lock (or mutex or whatever), you will > > see any values written or read prior to any earlier conflicting > > release of that same lock. > > > > 2. After an access with acquire semantics (for example, > > smp_load_acquire()) you will see any values written or read > > prior to any earlier access with release semantics (for example, > > smp_store_release()). > > > > Or in all cases, you might see later values, in case someone else also > > did a write to the location in question. > > > > Does that help, or am I missing a turn in there somewhere? > > Very much so, thank you! ------------------------------------------------------------------------ C C-Jakub-listen (* * Result: Never * * Message-ID: <20230126133322.3bfab5e0@kernel.org> *) { } P0(int *ss, int *smv, spinlock_t *gbl) { WRITE_ONCE(*ss, 1); } P1(int *ss, int *smv, spinlock_t *gbl) { int r1; r1 = READ_ONCE(*ss); spin_lock(gbl); WRITE_ONCE(*smv, 1); spin_unlock(gbl); } P2(int *ss, int *smv, spinlock_t *gbl) { int r1; int r2; spin_lock(gbl); r1 = READ_ONCE(*smv); r2 = READ_ONCE(*ss); spin_unlock(gbl); } exists (1:r1=1 /\ 2:r1=1 /\ 2:r2=0)
On 26.01.2023 09:10, Jakub Kicinski wrote: > On Thu, 26 Jan 2023 00:09:08 +0300 Kirill Tkhai wrote: >> 1)There are a many combinations with third task involved: >> >> [CPU0:Task0] [CPU1:Task1] [CPU2:Task2] >> listen(sk) >> kernel: >> sk_diag_fill(sk) >> rep->udiag_state = TCP_LISTEN >> return_from_syscall >> userspace: >> mutex_lock() >> shared_mem_var = rep->udiag_state >> mutex_unlock() >> >> userspace: >> mutex_lock() >> if (shared_mem_var == TCP_LISTEN) >> accept(sk); /* -> fail, since sk_state is not visible */ >> mutex_unlock() >> >> In this situation Task2 definitely knows Task0's listen() has succeed, but there is no a possibility >> to guarantee its accept() won't fail. Despite there are appropriate barriers in mutex_lock() and mutex_unlock(), >> there is no a possibility to add a barrier on CPU1 to make Task0's store visible on CPU2. > > Me trying to prove that memory ordering is transitive would be 100% > speculation. Let's ask Paul instead - is the above valid? Or the fact > that CPU1 observes state from CPU0 and is strongly ordered with CPU2 > implies that CPU2 will also observe CPU0's state? Thanks Paul for the answer about shared mutex. Despite that situation is safe, the issue with pidfd_getfd() still exists. >> 2)My understanding is chronologically later accept() mustn't miss sk_state. >> Otherwise, kernel says that ordering between internal syscalls data >> is userspace duty, which is wrong. Userspace knows nothing about internal >> kernel data. > >> 3)Such possible situations in log file also look strange: > > Dunno those points are a bit subjective. Squeezing perf out of > distributed systems requires sacrifices
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 009616fa0256..f53e09a0753b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -247,6 +247,28 @@ struct sock *unix_peer_get(struct sock *s) } EXPORT_SYMBOL_GPL(unix_peer_get); +/* This function returns current sk->sk_state guaranteeing + * its relevance in case of assignment was made on other CPU. + */ +static unsigned char unix_sock_state(struct sock *sk) +{ + unsigned char s_state = READ_ONCE(sk->sk_state); + + /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their + * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. + * We may avoid taking the lock in case of those states are + * already visible. + */ + if ((s_state == TCP_ESTABLISHED || s_state == TCP_LISTEN) + && sk->sk_type != SOCK_DGRAM) + return s_state; + + unix_state_lock(sk); + s_state = sk->sk_state; + unix_state_unlock(sk); + return s_state; +} + static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, int addr_len) { @@ -812,13 +834,9 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) int nr_fds = 0; if (sk) { - s_state = READ_ONCE(sk->sk_state); + s_state = unix_sock_state(sk); u = unix_sk(sk); - /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their - * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. - * SOCK_DGRAM is ordinary. So, no lock is needed. - */ if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) nr_fds = atomic_read(&u->scm_stat.nr_fds); else if (s_state == TCP_LISTEN) @@ -1686,7 +1704,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, goto out; err = -EINVAL; - if (sk->sk_state != TCP_LISTEN) + if (unix_sock_state(sk) != TCP_LISTEN) goto out; /* If socket state is TCP_LISTEN it cannot change (for now...), @@ -2178,7 +2196,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, } if (msg->msg_namelen) { - err = sk->sk_state == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; + unsigned char s_state = unix_sock_state(sk); + err = s_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out_err; } else { err = -ENOTCONN; @@ -2279,7 +2298,7 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, return -EOPNOTSUPP; other = unix_peer(sk); - if (!other || sk->sk_state != TCP_ESTABLISHED) + if (!other || unix_sock_state(sk) != TCP_ESTABLISHED) return -ENOTCONN; if (false) { @@ -2391,7 +2410,7 @@ static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, if (err) return err; - if (sk->sk_state != TCP_ESTABLISHED) + if (unix_sock_state(sk) != TCP_ESTABLISHED) return -ENOTCONN; if (msg->msg_namelen) @@ -2405,7 +2424,7 @@ static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; - if (sk->sk_state != TCP_ESTABLISHED) + if (unix_sock_state(sk) != TCP_ESTABLISHED) return -ENOTCONN; return unix_dgram_recvmsg(sock, msg, size, flags); @@ -2689,7 +2708,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { - if (unlikely(sk->sk_state != TCP_ESTABLISHED)) + if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED)) return -ENOTCONN; return unix_read_skb(sk, recv_actor); @@ -2713,7 +2732,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, size_t size = state->size; unsigned int last_len; - if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { + if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED)) { err = -EINVAL; goto out; }
Some functions use unlocked check for sock::sk_state. This does not guarantee a new value assigned to sk_state on some CPU is already visible on this CPU. Example: [CPU0:Task0] [CPU1:Task1] unix_listen() unix_state_lock(sk); sk->sk_state = TCP_LISTEN; unix_state_unlock(sk); unix_accept() if (sk->sk_state != TCP_LISTEN) /* not visible */ goto out; /* return error */ Task1 may miss new sk->sk_state value, and then unix_accept() returns error. Since in this situation unix_accept() is called chronologically later, such behavior is not obvious and it is wrong. This patch aims to fix the problem. A new function unix_sock_state() is introduced, and it makes sure a user never misses a new state assigned just before the function is called. We will use it in the places, where unlocked sk_state dereferencing was used before. Note, that there remain some more places with sk_state unfixed. Also, the same problem is with unix_peer(). This will be a subject for future patches. Signed-off-by: Kirill Tkhai <tkhai@ya.ru> --- net/unix/af_unix.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-)