
[net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU

Message ID 72ae40ef-2d68-2e89-46d3-fc8f820db42a@ya.ru (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Series [net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 9 this patch: 9
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 9 this patch: 9
netdev/checkpatch warning CHECK: Logical continuations should be on the previous line WARNING: Missing a blank line after declarations
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Kirill Tkhai Jan. 22, 2023, 10:21 p.m. UTC
Some functions check sock::sk_state without taking the lock. This does not
guarantee that a new value assigned to sk_state on one CPU is already visible
on another CPU.

Example:

[CPU0:Task0]                    [CPU1:Task1]
unix_listen()
  unix_state_lock(sk);
  sk->sk_state = TCP_LISTEN;
  unix_state_unlock(sk);
                                unix_accept()
                                  if (sk->sk_state != TCP_LISTEN) /* not visible */
                                     goto out;                    /* return error */

Task1 may miss the new sk->sk_state value, and unix_accept() then returns an
error. Since unix_accept() is called chronologically later in this situation,
such behavior is not obvious and it is wrong.

This patch aims to fix the problem. A new function, unix_sock_state(), is
introduced; it makes sure a caller never misses a new state assigned just
before the function is called. We will use it in the places where sk_state
was previously read without the lock.

Note that some more places reading sk_state remain unfixed, and the same
problem exists for unix_peer(). These will be the subject of future patches.

Signed-off-by: Kirill Tkhai <tkhai@ya.ru>
---
 net/unix/af_unix.c |   43 +++++++++++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 12 deletions(-)

Comments

Kuniyuki Iwashima Jan. 24, 2023, 5:57 p.m. UTC | #1
From:   Kirill Tkhai <tkhai@ya.ru>
Date:   Mon, 23 Jan 2023 01:21:20 +0300
> Some functions use unlocked check for sock::sk_state. This does not guarantee
> a new value assigned to sk_state on some CPU is already visible on this CPU.
> 
> Example:
> 
> [CPU0:Task0]                    [CPU1:Task1]
> unix_listen()
>   unix_state_lock(sk);
>   sk->sk_state = TCP_LISTEN;
>   unix_state_unlock(sk);
>                                 unix_accept()
>                                   if (sk->sk_state != TCP_LISTEN) /* not visible */
>                                      goto out;                    /* return error */
> 
> Task1 may miss new sk->sk_state value, and then unix_accept() returns error.
> Since in this situation unix_accept() is called chronologically later, such
> behavior is not obvious and it is wrong.

Have you seen this on a real workload ?

It sounds like a userspace bug that accept() is called on a different
CPU before listen() returns.  At least, if accept() is fetching sk at the
same time, then I think there is no guarantee that sk_state is
TCP_LISTEN.

The same goes for the other TCP_ESTABLISHED tests; it seems a program is
calling sendmsg()/recvmsg() while sk is still TCP_CLOSE and betting that a
concurrent connect() will finish earlier.


> 
> This patch aims to fix the problem. A new function unix_sock_state() is
> introduced, and it makes sure a user never misses a new state assigned just
> before the function is called. We will use it in the places, where unlocked
> sk_state dereferencing was used before.
> 
> Note, that there remain some more places with sk_state unfixed. Also, the same
> problem is with unix_peer(). This will be a subject for future patches.
> 
> Signed-off-by: Kirill Tkhai <tkhai@ya.ru>
> ---
>  net/unix/af_unix.c |   43 +++++++++++++++++++++++++++++++------------
>  1 file changed, 31 insertions(+), 12 deletions(-)
> 
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 009616fa0256..f53e09a0753b 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -247,6 +247,28 @@ struct sock *unix_peer_get(struct sock *s)
>  }
>  EXPORT_SYMBOL_GPL(unix_peer_get);
>  
> +/* This function returns current sk->sk_state guaranteeing
> + * its relevance in case of assignment was made on other CPU.
> + */
> +static unsigned char unix_sock_state(struct sock *sk)
> +{
> +	unsigned char s_state = READ_ONCE(sk->sk_state);
> +
> +	/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
> +	 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
> +	 * We may avoid taking the lock in case of those states are
> +	 * already visible.
> +	 */
> +	if ((s_state == TCP_ESTABLISHED || s_state == TCP_LISTEN)
> +	    && sk->sk_type != SOCK_DGRAM)
> +		return s_state;
> +
> +	unix_state_lock(sk);
> +	s_state = sk->sk_state;
> +	unix_state_unlock(sk);
> +	return s_state;
> +}
> +
>  static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
>  					     int addr_len)
>  {
> @@ -812,13 +834,9 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
>  	int nr_fds = 0;
>  
>  	if (sk) {
> -		s_state = READ_ONCE(sk->sk_state);
> +		s_state = unix_sock_state(sk);
>  		u = unix_sk(sk);
>  
> -		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
> -		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
> -		 * SOCK_DGRAM is ordinary. So, no lock is needed.
> -		 */
>  		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
>  			nr_fds = atomic_read(&u->scm_stat.nr_fds);
>  		else if (s_state == TCP_LISTEN)
> @@ -1686,7 +1704,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
>  		goto out;
>  
>  	err = -EINVAL;
> -	if (sk->sk_state != TCP_LISTEN)
> +	if (unix_sock_state(sk) != TCP_LISTEN)
>  		goto out;
>  
>  	/* If socket state is TCP_LISTEN it cannot change (for now...),
> @@ -2178,7 +2196,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
>  	}
>  
>  	if (msg->msg_namelen) {
> -		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
> +		unsigned char s_state = unix_sock_state(sk);
> +		err = s_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;

No need to define s_state here, or a blank line is needed after
the definition.
https://patchwork.kernel.org/project/netdevbpf/patch/72ae40ef-2d68-2e89-46d3-fc8f820db42a@ya.ru/
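
i.e. something along these lines (just a sketch of the first option, dropping
the extra local variable):

		err = unix_sock_state(sk) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;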

>  		goto out_err;
>  	} else {
>  		err = -ENOTCONN;
> @@ -2279,7 +2298,7 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
>  		return -EOPNOTSUPP;
>  
>  	other = unix_peer(sk);
> -	if (!other || sk->sk_state != TCP_ESTABLISHED)
> +	if (!other || unix_sock_state(sk) != TCP_ESTABLISHED)
>  		return -ENOTCONN;
>  
>  	if (false) {
> @@ -2391,7 +2410,7 @@ static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
>  	if (err)
>  		return err;
>  
> -	if (sk->sk_state != TCP_ESTABLISHED)
> +	if (unix_sock_state(sk) != TCP_ESTABLISHED)
>  		return -ENOTCONN;
>  
>  	if (msg->msg_namelen)
> @@ -2405,7 +2424,7 @@ static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
>  {
>  	struct sock *sk = sock->sk;
>  
> -	if (sk->sk_state != TCP_ESTABLISHED)
> +	if (unix_sock_state(sk) != TCP_ESTABLISHED)
>  		return -ENOTCONN;
>  
>  	return unix_dgram_recvmsg(sock, msg, size, flags);
> @@ -2689,7 +2708,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
>  
>  static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
>  {
> -	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
> +	if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED))
>  		return -ENOTCONN;
>  
>  	return unix_read_skb(sk, recv_actor);
> @@ -2713,7 +2732,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
>  	size_t size = state->size;
>  	unsigned int last_len;
>  
> -	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
> +	if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED)) {
>  		err = -EINVAL;
>  		goto out;
>  	}
Kirill Tkhai Jan. 24, 2023, 9:05 p.m. UTC | #2
Hi Kuniyuki,

On 24.01.2023 20:57, Kuniyuki Iwashima wrote:
> From:   Kirill Tkhai <tkhai@ya.ru>
> Date:   Mon, 23 Jan 2023 01:21:20 +0300
>> Some functions use unlocked check for sock::sk_state. This does not guarantee
>> a new value assigned to sk_state on some CPU is already visible on this CPU.
>>
>> Example:
>>
>> [CPU0:Task0]                    [CPU1:Task1]
>> unix_listen()
>>   unix_state_lock(sk);
>>   sk->sk_state = TCP_LISTEN;
>>   unix_state_unlock(sk);
>>                                 unix_accept()
>>                                   if (sk->sk_state != TCP_LISTEN) /* not visible */
>>                                      goto out;                    /* return error */
>>
>> Task1 may miss new sk->sk_state value, and then unix_accept() returns error.
>> Since in this situation unix_accept() is called chronologically later, such
>> behavior is not obvious and it is wrong.
> 
> Have you seen this on a real workload ?

No, I haven't. This is an observation about the current code, which I made while
working on the recent scm_fds patch.

> It sounds like a userspace bug that accept() is called on a different
> CPU before listen() returns.  At least, accept() is fetching sk at the
> same time, then I think there should be no guarantee that sk_state is
> TCP_LISTEN.
>
> Same for other TCP_ESTABLISHED tests, it seems a program is calling
> sendmsg()/recvmsg() when sk is still TCP_CLOSE and betting concurrent
> connect() will finish earlier.

Not exactly. This is not about the case where "accept() is called before listen() returns".
This is about "accept() is called after listen() returns".

 unix_listen()
   unix_state_lock(sk);
   sk->sk_state = TCP_LISTEN;
   unix_state_unlock(sk);

 <<returned from syscall>>
                                 unix_accept()
                                   if (sk->sk_state != TCP_LISTEN) /* not visible */
                                      goto out;                    /* return error */

Syscall entry and exit do not imply any memory barriers. The new sk_state may not
be visible in unix_accept().

Speaking about the other cases: even changes made by three sequential syscalls connect(),
accept() and sendmsg() may not be visible on another CPU:

CPU0                                   CPU1                 CPU2
connect(sk)
  sk2->sk_state = TCP_ESTABLISHED                     
sendmsg(sk)
                                       sk2 = accept()
                                                            read("fdinfo of sk2")
                                                              sk2->sk_state is not visible

There are many possibilities for SMP reordering in recvmsg().


CPU0                                  CPU1                 CPU2
connect(sk)
  sk2->sk_state = TCP_ESTABLISHED                     
sendmsg(sk)
                                      sk2 = accept()
                                   
                                                           sk2 = pidfd_getfd("fd of sk2)
                                                           ioctl(sk2, SIOCATMARK, &answ) -> answ=1
                                                           if (answ)
                                                             recvmsg(sk2) --> TCP_ESTABLISHED is not visible

^^^ in this example there is also SMP reordering between answ and sk_state.

There are a lot of such cases. We should not wait for a real-world occurrence, because this may show up
on !x86 CPUs and result in a long and difficult debugging session for users. We should provide stable
interfaces for them.

The big advantage is that this should not affect performance, since the common case still goes through
the unlocked fast path.

Kirill
 
>>
>> This patch aims to fix the problem. A new function unix_sock_state() is
>> introduced, and it makes sure a user never misses a new state assigned just
>> before the function is called. We will use it in the places, where unlocked
>> sk_state dereferencing was used before.
>>
>> Note, that there remain some more places with sk_state unfixed. Also, the same
>> problem is with unix_peer(). This will be a subject for future patches.
>>
>> Signed-off-by: Kirill Tkhai <tkhai@ya.ru>
>> ---
>>  net/unix/af_unix.c |   43 +++++++++++++++++++++++++++++++------------
>>  1 file changed, 31 insertions(+), 12 deletions(-)
>>
>> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
>> index 009616fa0256..f53e09a0753b 100644
>> --- a/net/unix/af_unix.c
>> +++ b/net/unix/af_unix.c
>> @@ -247,6 +247,28 @@ struct sock *unix_peer_get(struct sock *s)
>>  }
>>  EXPORT_SYMBOL_GPL(unix_peer_get);
>>  
>> +/* This function returns current sk->sk_state guaranteeing
>> + * its relevance in case of assignment was made on other CPU.
>> + */
>> +static unsigned char unix_sock_state(struct sock *sk)
>> +{
>> +	unsigned char s_state = READ_ONCE(sk->sk_state);
>> +
>> +	/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
>> +	 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
>> +	 * We may avoid taking the lock in case of those states are
>> +	 * already visible.
>> +	 */
>> +	if ((s_state == TCP_ESTABLISHED || s_state == TCP_LISTEN)
>> +	    && sk->sk_type != SOCK_DGRAM)
>> +		return s_state;
>> +
>> +	unix_state_lock(sk);
>> +	s_state = sk->sk_state;
>> +	unix_state_unlock(sk);
>> +	return s_state;
>> +}
>> +
>>  static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
>>  					     int addr_len)
>>  {
>> @@ -812,13 +834,9 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
>>  	int nr_fds = 0;
>>  
>>  	if (sk) {
>> -		s_state = READ_ONCE(sk->sk_state);
>> +		s_state = unix_sock_state(sk);
>>  		u = unix_sk(sk);
>>  
>> -		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
>> -		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
>> -		 * SOCK_DGRAM is ordinary. So, no lock is needed.
>> -		 */
>>  		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
>>  			nr_fds = atomic_read(&u->scm_stat.nr_fds);
>>  		else if (s_state == TCP_LISTEN)
>> @@ -1686,7 +1704,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
>>  		goto out;
>>  
>>  	err = -EINVAL;
>> -	if (sk->sk_state != TCP_LISTEN)
>> +	if (unix_sock_state(sk) != TCP_LISTEN)
>>  		goto out;
>>  
>>  	/* If socket state is TCP_LISTEN it cannot change (for now...),
>> @@ -2178,7 +2196,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
>>  	}
>>  
>>  	if (msg->msg_namelen) {
>> -		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
>> +		unsigned char s_state = unix_sock_state(sk);
>> +		err = s_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
> 
> No need to define s_state here, or a blank line is needed after
> the definition.
> https://patchwork.kernel.org/project/netdevbpf/patch/72ae40ef-2d68-2e89-46d3-fc8f820db42a@ya.ru/
> 
>>  		goto out_err;
>>  	} else {
>>  		err = -ENOTCONN;
>> @@ -2279,7 +2298,7 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
>>  		return -EOPNOTSUPP;
>>  
>>  	other = unix_peer(sk);
>> -	if (!other || sk->sk_state != TCP_ESTABLISHED)
>> +	if (!other || unix_sock_state(sk) != TCP_ESTABLISHED)
>>  		return -ENOTCONN;
>>  
>>  	if (false) {
>> @@ -2391,7 +2410,7 @@ static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
>>  	if (err)
>>  		return err;
>>  
>> -	if (sk->sk_state != TCP_ESTABLISHED)
>> +	if (unix_sock_state(sk) != TCP_ESTABLISHED)
>>  		return -ENOTCONN;
>>  
>>  	if (msg->msg_namelen)
>> @@ -2405,7 +2424,7 @@ static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
>>  {
>>  	struct sock *sk = sock->sk;
>>  
>> -	if (sk->sk_state != TCP_ESTABLISHED)
>> +	if (unix_sock_state(sk) != TCP_ESTABLISHED)
>>  		return -ENOTCONN;
>>  
>>  	return unix_dgram_recvmsg(sock, msg, size, flags);
>> @@ -2689,7 +2708,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
>>  
>>  static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
>>  {
>> -	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
>> +	if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED))
>>  		return -ENOTCONN;
>>  
>>  	return unix_read_skb(sk, recv_actor);
>> @@ -2713,7 +2732,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
>>  	size_t size = state->size;
>>  	unsigned int last_len;
>>  
>> -	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
>> +	if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED)) {
>>  		err = -EINVAL;
>>  		goto out;
>>  	}
Cyrill Gorcunov Jan. 24, 2023, 10:32 p.m. UTC | #3
On Tue, Jan 24, 2023 at 09:57:12AM -0800, Kuniyuki Iwashima wrote:
> From:   Kirill Tkhai <tkhai@ya.ru>
> Date:   Mon, 23 Jan 2023 01:21:20 +0300
> > Some functions use unlocked check for sock::sk_state. This does not guarantee
> > a new value assigned to sk_state on some CPU is already visible on this CPU.
> > 
> > Example:
> > 
> > [CPU0:Task0]                    [CPU1:Task1]
> > unix_listen()
> >   unix_state_lock(sk);
> >   sk->sk_state = TCP_LISTEN;
> >   unix_state_unlock(sk);
> >                                 unix_accept()
> >                                   if (sk->sk_state != TCP_LISTEN) /* not visible */
> >                                      goto out;                    /* return error */
> > 
> > Task1 may miss new sk->sk_state value, and then unix_accept() returns error.
> > Since in this situation unix_accept() is called chronologically later, such
> > behavior is not obvious and it is wrong.
> 
> Have you seen this on a real workload ?
> 
> It sounds like a userspace bug that accept() is called on a different
> CPU before listen() returns.  At least, accept() is fetching sk at the

I must confess I don't get why accept() can't be called on a different CPU
while listen() is in progress. As far as I can see there is a small race window
which of course is not critical at all, since in the worst case we simply report an
error back to userspace; still, it's a nit worth fixing.

> same time, then I think there should be no guarantee that sk_state is
> TCP_LISTEN.
> 
> Same for other TCP_ESTABLISHED tests, it seems a program is calling
> sendmsg()/recvmsg() when sk is still TCP_CLOSE and betting concurrent
> connect() will finish earlier.
Jakub Kicinski Jan. 25, 2023, 1:35 a.m. UTC | #4
On Mon, 23 Jan 2023 01:21:20 +0300 Kirill Tkhai wrote:
> Since in this situation unix_accept() is called chronologically later, such
> behavior is not obvious and it is wrong.

Noob question, perhaps - how do we establish the ordering?
CPU1 knows that listen() has succeeded without a barrier?
Kirill Tkhai Jan. 25, 2023, 9:09 p.m. UTC | #5
On 25.01.2023 04:35, Jakub Kicinski wrote:
> On Mon, 23 Jan 2023 01:21:20 +0300 Kirill Tkhai wrote:
>> Since in this situation unix_accept() is called chronologically later, such
>> behavior is not obvious and it is wrong.
> 
> Noob question, perhaps - how do we establish the ordering ?
> CPU1 knows that listen() has succeed without a barrier ?

1) There are many combinations with a third task involved:

[CPU0:Task0]  [CPU1:Task1]                           [CPU2:Task2]
listen(sk)
              kernel:
                sk_diag_fill(sk)
                  rep->udiag_state = TCP_LISTEN
                return_from_syscall
              userspace:
                mutex_lock()
                shared_mem_var = rep->udiag_state 
                mutex_unlock()

                                                     userspace: 
                                                       mutex_lock()
                                                       if (shared_mem_var == TCP_LISTEN)
                                                         accept(sk); /* -> fail, since sk_state is not visible */
                                                       mutex_unlock()

In this situation Task2 definitely knows Task0's listen() has succeeded, but there is no way
to guarantee its accept() won't fail. Despite the appropriate barriers in mutex_lock() and mutex_unlock(),
there is no way to add a barrier on CPU1 that makes Task0's store visible on CPU2.

2) My understanding is that a chronologically later accept() must not miss the new sk_state.
Otherwise the kernel is saying that ordering between syscalls' internal data
is userspace's duty, which is wrong. Userspace knows nothing about internal
kernel data.

It is not prohibited for a side application to call accept() on a socket obtained
via pidfd_getfd(). Why doesn't the code guarantee that accept() sees the
actual sk_state?

[CPU0:Task0]          [CPU1:Task1]
listen(sk)
                      sk = pidfd_getfd()
                      accept(sk) /* -> fail */
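
A userspace sketch of this scenario could look like the following (the pid and fd
values are hypothetical, and it assumes a libc exposing SYS_pidfd_open/SYS_pidfd_getfd
and that the caller has the required ptrace permissions):

#include <stdio.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	pid_t task0_pid = 1234;	/* hypothetical: Task0's pid */
	int task0_fd = 3;	/* hypothetical: Task0's listening unix socket fd */
	int pidfd, sk, conn;

	pidfd = syscall(SYS_pidfd_open, task0_pid, 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* Grab a duplicate of Task0's socket fd into this process. */
	sk = syscall(SYS_pidfd_getfd, pidfd, task0_fd, 0);
	if (sk < 0) {
		perror("pidfd_getfd");
		return 1;
	}

	/* Task0's listen() has already returned, but if its TCP_LISTEN store
	 * is not yet visible on this CPU, accept() may still fail with EINVAL.
	 */
	conn = accept(sk, NULL, NULL);
	if (conn < 0)
		perror("accept");

	return 0;
}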

3) Such possible situations in log files also look strange:

[CPU0:Task0]                                           [CPU1:Task1]
listen()
get_time(&time1)
write_log("listening at ...", time1)

                                                       get_time(&time2)
                                                       sk = accept()
                                                       if (sk < 0)
                                                          write_log("accept failed at ...", time2)

If there is no kernel ordering, we may see something like the below in their
logs:

Task0.log
"listening at time1"

Task1.log
"accept failed at time2"

and time2 > time1.

Kirill
Jakub Kicinski Jan. 26, 2023, 6:10 a.m. UTC | #6
On Thu, 26 Jan 2023 00:09:08 +0300 Kirill Tkhai wrote:
> 1)There are a many combinations with third task involved:
> 
> [CPU0:Task0]  [CPU1:Task1]                           [CPU2:Task2]
> listen(sk)
>               kernel:
>                 sk_diag_fill(sk)
>                   rep->udiag_state = TCP_LISTEN
>                 return_from_syscall
>               userspace:
>                 mutex_lock()
>                 shared_mem_var = rep->udiag_state 
>                 mutex_unlock()
> 
>                                                      userspace: 
>                                                        mutex_lock()
>                                                        if (shared_mem_var == TCP_LISTEN)
>                                                          accept(sk); /* -> fail, since sk_state is not visible */
>                                                        mutex_unlock()
> 
> In this situation Task2 definitely knows Task0's listen() has succeed, but there is no a possibility
> to guarantee its accept() won't fail. Despite there are appropriate barriers in mutex_lock() and mutex_unlock(),
> there is no a possibility to add a barrier on CPU1 to make Task0's store visible on CPU2.

Me trying to prove that memory ordering is transitive would be 100%
speculation. Let's ask Paul instead - is the above valid? Or does the fact
that CPU1 observes state from CPU0 and is strongly ordered with CPU2
imply that CPU2 will also observe CPU0's state?

> 2)My understanding is chronologically later accept() mustn't miss sk_state.
> Otherwise, kernel says that ordering between internal syscalls data
> is userspace duty, which is wrong. Userspace knows nothing about internal
> kernel data.

> 3)Such possible situations in log file also look strange:

Dunno, those points are a bit subjective. Squeezing perf out of
distributed systems requires sacrifices.
Paul E. McKenney Jan. 26, 2023, 8:25 p.m. UTC | #7
On Wed, Jan 25, 2023 at 10:10:53PM -0800, Jakub Kicinski wrote:
> On Thu, 26 Jan 2023 00:09:08 +0300 Kirill Tkhai wrote:
> > 1)There are a many combinations with third task involved:
> > 
> > [CPU0:Task0]  [CPU1:Task1]                           [CPU2:Task2]
> > listen(sk)
> >               kernel:
> >                 sk_diag_fill(sk)
> >                   rep->udiag_state = TCP_LISTEN
> >                 return_from_syscall
> >               userspace:
> >                 mutex_lock()
> >                 shared_mem_var = rep->udiag_state 
> >                 mutex_unlock()
> > 
> >                                                      userspace: 
> >                                                        mutex_lock()
> >                                                        if (shared_mem_var == TCP_LISTEN)
> >                                                          accept(sk); /* -> fail, since sk_state is not visible */
> >                                                        mutex_unlock()
> > 
> > In this situation Task2 definitely knows Task0's listen() has succeed, but there is no a possibility
> > to guarantee its accept() won't fail. Despite there are appropriate barriers in mutex_lock() and mutex_unlock(),
> > there is no a possibility to add a barrier on CPU1 to make Task0's store visible on CPU2.
> 
> Me trying to prove that memory ordering is transitive would be 100%
> speculation. Let's ask Paul instead - is the above valid? Or the fact
> that CPU1 observes state from CPU0 and is strongly ordered with CPU2
> implies that CPU2 will also observe CPU0's state?

Hmmm...  What is listen() doing?  There seem to be a lot of them
in the kernel.

But proceeding on first principles...

Sometimes.  Memory ordering is transitive only when the ordering is
sufficiently strong.

In this case, I do not see any ordering between CPU 0 and anything else.
If the listen() function were to acquire the same mutex as CPU1 and CPU2
did, and if it acquired it first, then CPU2 would be guaranteed to see
anything CPU0 did while holding that mutex.

Alternatively, if CPU0 wrote to some memory, and CPU1 read that value
before releasing the mutex (including possibly before acquiring that
mutex), then CPU2 would be guaranteed to see that value (or the value
written by some later write to that same memory) after acquiring that
mutex.

So here are some things you can count on transitively:

1.	After acquiring a given lock (or mutex or whatever), you will
	see any values written or read prior to any earlier conflicting
	release of that same lock.

2.	After an access with acquire semantics (for example,
	smp_load_acquire()) you will see any values written or read
	prior to any earlier access with release semantics (for example,
	smp_store_release()).

Or in all cases, you might see later values, in case someone else also
did a write to the location in question.
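
As an illustration of item 2, a minimal litmus sketch of such a release-acquire
chain would be the following (LKMM herd7 syntax; the test name and variables are
made up), and LKMM likewise says its bad outcome can never happen:

C C-release-acquire-chain

(*
 * Result: Never
 *)

{
}

P0(int *x, int *y)
{
	WRITE_ONCE(*x, 1);
	smp_store_release(y, 1);
}

P1(int *y, int *z)
{
	int r1;

	r1 = smp_load_acquire(y);
	smp_store_release(z, 1);
}

P2(int *z, int *x)
{
	int r1;
	int r2;

	r1 = smp_load_acquire(z);
	r2 = READ_ONCE(*x);
}

exists
(1:r1=1 /\ 2:r1=1 /\ 2:r2=0)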

Does that help, or am I missing a turn in there somewhere?

							Thanx, Paul
Jakub Kicinski Jan. 26, 2023, 9:33 p.m. UTC | #8
On Thu, 26 Jan 2023 12:25:11 -0800 Paul E. McKenney wrote:
> > Me trying to prove that memory ordering is transitive would be 100%
> > speculation. Let's ask Paul instead - is the above valid? Or the fact
> > that CPU1 observes state from CPU0 and is strongly ordered with CPU2
> > implies that CPU2 will also observe CPU0's state?  
> 
> Hmmm...  What is listen() doing?  There seem to be a lot of them
> in the kernel.
> 
> But proceeding on first principles...
> 
> Sometimes.  Memory ordering is transitive only when the ordering is
> sufficiently strong.
> 
> In this case, I do not see any ordering between CPU 0 and anything else.
> If the listen() function were to acquire the same mutex as CPU1 and CPU2
> did, and if it acquired it first, then CPU2 would be guaranteed to see
> anything CPU0 did while holding that mutex.

The fuller picture would be:

[CPU0]                     [CPU1]                [CPU2]
WRITE_ONCE(sk->sk_state,
           TCP_LISTEN);
                           val = READ_ONCE(sk->sk_state) 
                           mutex_lock()
                           shared_mem_var = val
                           mutex_unlock()
                                                  mutex_lock()
                                                  if (shared_mem_var == TCP_LISTEN)
                                                     BUG_ON(READ_ONCE(sk->sk_state)
                                                            != TCP_LISTEN)
                                                  mutex_unlock()

> Alternatively, if CPU0 wrote to some memory, and CPU1 read that value
> before releasing the mutex (including possibly before acquiring that
> mutex), then CPU2 would be guaranteed to see that value (or the value
> written by some later write to that same memory) after acquiring that
> mutex.

Which I believe is exactly what happens in the example.

> So here are some things you can count on transitively:
> 
> 1.	After acquiring a given lock (or mutex or whatever), you will
> 	see any values written or read prior to any earlier conflicting
> 	release of that same lock.
> 
> 2.	After an access with acquire semantics (for example,
> 	smp_load_acquire()) you will see any values written or read
> 	prior to any earlier access with release semantics (for example,
> 	smp_store_release()).
> 
> Or in all cases, you might see later values, in case someone else also
> did a write to the location in question.
> 
> Does that help, or am I missing a turn in there somewhere?

Very much so, thank you!
Paul E. McKenney Jan. 26, 2023, 9:47 p.m. UTC | #9
On Thu, Jan 26, 2023 at 01:33:22PM -0800, Jakub Kicinski wrote:
> On Thu, 26 Jan 2023 12:25:11 -0800 Paul E. McKenney wrote:
> > > Me trying to prove that memory ordering is transitive would be 100%
> > > speculation. Let's ask Paul instead - is the above valid? Or the fact
> > > that CPU1 observes state from CPU0 and is strongly ordered with CPU2
> > > implies that CPU2 will also observe CPU0's state?  
> > 
> > Hmmm...  What is listen() doing?  There seem to be a lot of them
> > in the kernel.
> > 
> > But proceeding on first principles...
> > 
> > Sometimes.  Memory ordering is transitive only when the ordering is
> > sufficiently strong.
> > 
> > In this case, I do not see any ordering between CPU 0 and anything else.
> > If the listen() function were to acquire the same mutex as CPU1 and CPU2
> > did, and if it acquired it first, then CPU2 would be guaranteed to see
> > anything CPU0 did while holding that mutex.
> 
> The fuller picture would be:
> 
> [CPU0]                     [CPU1]                [CPU2]
> WRITE_ONCE(sk->sk_state,
>            TCP_LISTEN);
>                            val = READ_ONCE(sk->sk_state) 
>                            mutex_lock()
>                            shared_mem_var = val
>                            mutex_unlock()
>                                                   mutex_lock()
>                                                   if (shared_mem_var == TCP_LISTEN)
>                                                      BUG_ON(READ_ONCE(sk->sk_state)
>                                                             != TCP_LISTEN)
>                                                   mutex_unlock()

And just to make sure that I represented your example correctly, please
see the litmus test below.

LKMM says that the bad outcome cannot happen, that is, if CPU1 sees
CPU0's write to sk->sk_state ("ss" in the litmus test) and if CPU2
sees CPU1's write to shared_mem_var ("smv" in the litmus test), then
it cannot be the case that CPU2 sees the old value of sk->sk_state that
was overwritten by CPU0.

And as you note below, it is no surprise that LKMM has this opinion.  ;-)

But I need to make sure that I didn't misrepresent your diagram above.

> > Alternatively, if CPU0 wrote to some memory, and CPU1 read that value
> > before releasing the mutex (including possibly before acquiring that
> > mutex), then CPU2 would be guaranteed to see that value (or the value
> > written by some later write to that same memory) after acquiring that
> > mutex.
> 
> Which I believe is exactly what happens in the example.
> 
> > So here are some things you can count on transitively:
> > 
> > 1.	After acquiring a given lock (or mutex or whatever), you will
> > 	see any values written or read prior to any earlier conflicting
> > 	release of that same lock.
> > 
> > 2.	After an access with acquire semantics (for example,
> > 	smp_load_acquire()) you will see any values written or read
> > 	prior to any earlier access with release semantics (for example,
> > 	smp_store_release()).
> > 
> > Or in all cases, you might see later values, in case someone else also
> > did a write to the location in question.
> > 
> > Does that help, or am I missing a turn in there somewhere?
> 
> Very much so, thank you!

------------------------------------------------------------------------

C C-Jakub-listen

(*
 * Result: Never
 *
 * Message-ID: <20230126133322.3bfab5e0@kernel.org>
 *)

{
}

P0(int *ss, int *smv, spinlock_t *gbl)
{
	WRITE_ONCE(*ss, 1);
}

P1(int *ss, int *smv, spinlock_t *gbl)
{
        int r1;

	r1 = READ_ONCE(*ss);
	spin_lock(gbl);
	WRITE_ONCE(*smv, 1);
	spin_unlock(gbl);
}

P2(int *ss, int *smv, spinlock_t *gbl)
{
	int r1;
        int r2;

	spin_lock(gbl);
	r1 = READ_ONCE(*smv);
	r2 = READ_ONCE(*ss);
	spin_unlock(gbl);
}

exists
(1:r1=1 /\ 2:r1=1 /\ 2:r2=0)
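
For reference, assuming the test above is saved as C-Jakub-listen.litmus under the
kernel's tools/memory-model/ directory (the file name and path are just an assumption
based on where LKMM litmus tests normally live), it can be checked with something like:

	herd7 -conf linux-kernel.cfg C-Jakub-listen.litmus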
Kirill Tkhai Feb. 2, 2023, 7:42 p.m. UTC | #10
On 26.01.2023 09:10, Jakub Kicinski wrote:
> On Thu, 26 Jan 2023 00:09:08 +0300 Kirill Tkhai wrote:
>> 1)There are a many combinations with third task involved:
>>
>> [CPU0:Task0]  [CPU1:Task1]                           [CPU2:Task2]
>> listen(sk)
>>               kernel:
>>                 sk_diag_fill(sk)
>>                   rep->udiag_state = TCP_LISTEN
>>                 return_from_syscall
>>               userspace:
>>                 mutex_lock()
>>                 shared_mem_var = rep->udiag_state 
>>                 mutex_unlock()
>>
>>                                                      userspace: 
>>                                                        mutex_lock()
>>                                                        if (shared_mem_var == TCP_LISTEN)
>>                                                          accept(sk); /* -> fail, since sk_state is not visible */
>>                                                        mutex_unlock()
>>
>> In this situation Task2 definitely knows Task0's listen() has succeed, but there is no a possibility
>> to guarantee its accept() won't fail. Despite there are appropriate barriers in mutex_lock() and mutex_unlock(),
>> there is no a possibility to add a barrier on CPU1 to make Task0's store visible on CPU2.
> 
> Me trying to prove that memory ordering is transitive would be 100%
> speculation. Let's ask Paul instead - is the above valid? Or the fact
> that CPU1 observes state from CPU0 and is strongly ordered with CPU2
> implies that CPU2 will also observe CPU0's state?

Thanks, Paul, for the answer about the shared mutex. Although that situation is safe, the issue with pidfd_getfd() still exists.
 
>> 2)My understanding is chronologically later accept() mustn't miss sk_state.
>> Otherwise, kernel says that ordering between internal syscalls data
>> is userspace duty, which is wrong. Userspace knows nothing about internal
>> kernel data.
> 
>> 3)Such possible situations in log file also look strange:
> 
> Dunno those points are a bit subjective. Squeezing perf out of
> distributed systems requires sacrifices 

Patch

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 009616fa0256..f53e09a0753b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -247,6 +247,28 @@  struct sock *unix_peer_get(struct sock *s)
 }
 EXPORT_SYMBOL_GPL(unix_peer_get);
 
+/* This function returns current sk->sk_state guaranteeing
+ * its relevance in case of assignment was made on other CPU.
+ */
+static unsigned char unix_sock_state(struct sock *sk)
+{
+	unsigned char s_state = READ_ONCE(sk->sk_state);
+
+	/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
+	 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
+	 * We may avoid taking the lock in case of those states are
+	 * already visible.
+	 */
+	if ((s_state == TCP_ESTABLISHED || s_state == TCP_LISTEN)
+	    && sk->sk_type != SOCK_DGRAM)
+		return s_state;
+
+	unix_state_lock(sk);
+	s_state = sk->sk_state;
+	unix_state_unlock(sk);
+	return s_state;
+}
+
 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
 					     int addr_len)
 {
@@ -812,13 +834,9 @@  static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 	int nr_fds = 0;
 
 	if (sk) {
-		s_state = READ_ONCE(sk->sk_state);
+		s_state = unix_sock_state(sk);
 		u = unix_sk(sk);
 
-		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
-		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
-		 * SOCK_DGRAM is ordinary. So, no lock is needed.
-		 */
 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
 		else if (s_state == TCP_LISTEN)
@@ -1686,7 +1704,7 @@  static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
 		goto out;
 
 	err = -EINVAL;
-	if (sk->sk_state != TCP_LISTEN)
+	if (unix_sock_state(sk) != TCP_LISTEN)
 		goto out;
 
 	/* If socket state is TCP_LISTEN it cannot change (for now...),
@@ -2178,7 +2196,8 @@  static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 	}
 
 	if (msg->msg_namelen) {
-		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
+		unsigned char s_state = unix_sock_state(sk);
+		err = s_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
 		goto out_err;
 	} else {
 		err = -ENOTCONN;
@@ -2279,7 +2298,7 @@  static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
 		return -EOPNOTSUPP;
 
 	other = unix_peer(sk);
-	if (!other || sk->sk_state != TCP_ESTABLISHED)
+	if (!other || unix_sock_state(sk) != TCP_ESTABLISHED)
 		return -ENOTCONN;
 
 	if (false) {
@@ -2391,7 +2410,7 @@  static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
 	if (err)
 		return err;
 
-	if (sk->sk_state != TCP_ESTABLISHED)
+	if (unix_sock_state(sk) != TCP_ESTABLISHED)
 		return -ENOTCONN;
 
 	if (msg->msg_namelen)
@@ -2405,7 +2424,7 @@  static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
 {
 	struct sock *sk = sock->sk;
 
-	if (sk->sk_state != TCP_ESTABLISHED)
+	if (unix_sock_state(sk) != TCP_ESTABLISHED)
 		return -ENOTCONN;
 
 	return unix_dgram_recvmsg(sock, msg, size, flags);
@@ -2689,7 +2708,7 @@  static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
 
 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
-	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
+	if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED))
 		return -ENOTCONN;
 
 	return unix_read_skb(sk, recv_actor);
@@ -2713,7 +2732,7 @@  static int unix_stream_read_generic(struct unix_stream_read_state *state,
 	size_t size = state->size;
 	unsigned int last_len;
 
-	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
+	if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED)) {
 		err = -EINVAL;
 		goto out;
 	}