
[bpf-next,v5,04/11] skmsg: avoid lock_sock() in sk_psock_backlog()

Message ID 20210317022219.24934-5-xiyou.wangcong@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Series sockmap: introduce BPF_SK_SKB_VERDICT and support UDP

Checks

Context Check Description
netdev/cover_letter success
netdev/fixes_present success
netdev/patch_count success
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success
netdev/cc_maintainers warning 8 maintainers not CCed: yhs@fb.com kpsingh@kernel.org andrii@kernel.org kafai@fb.com ast@kernel.org songliubraving@fb.com davem@davemloft.net kuba@kernel.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 114 this patch: 114
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 136 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 114 this patch: 114
netdev/header_inline success

Commit Message

Cong Wang March 17, 2021, 2:22 a.m. UTC
From: Cong Wang <cong.wang@bytedance.com>

We do not have to lock the sock to avoid losing sk_socket;
instead, we can purge all the ingress queues when we close
the socket. Sending or receiving packets after orphaning
the socket makes no sense.

We do purge these queues when the psock refcnt reaches zero,
but here we want to purge them explicitly in sock_map_close().
There are also some nasty race conditions between testing the
SK_PSOCK_TX_ENABLED bit and queuing/canceling the psock work;
we can expand psock->ingress_lock a bit to protect them too.

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/linux/skmsg.h |  1 +
 net/core/skmsg.c      | 50 +++++++++++++++++++++++++++++++------------
 net/core/sock_map.c   |  1 +
 3 files changed, 38 insertions(+), 14 deletions(-)
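
As a condensed illustration of the locking pattern this patch introduces,
here is the ingress enqueue path, extracted from the sk_psock_verdict_apply()
and sk_psock_skb_redirect() hunks below (surrounding function bodies elided):

	spin_lock_bh(&psock->ingress_lock);
	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
		/* Still accepting work: park the skb and kick the backlog worker. */
		skb_queue_tail(&psock->ingress_skb, skb);
		schedule_work(&psock->work);
	}
	spin_unlock_bh(&psock->ingress_lock);

Since sk_psock_stop() clears SK_PSOCK_TX_ENABLED and purges ingress_skb under
the same ingress_lock, an skb can no longer be enqueued after the purge and
then be left sitting on a closed socket.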

Comments

John Fastabend March 20, 2021, 2:45 a.m. UTC | #1
Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>
> 
> We do not have to lock the sock to avoid losing sk_socket;
> instead, we can purge all the ingress queues when we close
> the socket. Sending or receiving packets after orphaning
> the socket makes no sense.
> 
> We do purge these queues when the psock refcnt reaches zero,
> but here we want to purge them explicitly in sock_map_close().
> There are also some nasty race conditions between testing the
> SK_PSOCK_TX_ENABLED bit and queuing/canceling the psock work;
> we can expand psock->ingress_lock a bit to protect them too.
> 
> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Cc: Lorenz Bauer <lmb@cloudflare.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---
>  include/linux/skmsg.h |  1 +
>  net/core/skmsg.c      | 50 +++++++++++++++++++++++++++++++------------
>  net/core/sock_map.c   |  1 +
>  3 files changed, 38 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index f2d45a73b2b2..0f5e663f6c7f 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -347,6 +347,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err)
>  }

Overall looks good, comment/question below.

>  
>  #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
>  int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index 305dddc51857..d0a227b0f672 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -497,7 +497,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
>  	if (!ingress) {
>  		if (!sock_writeable(psock->sk))
>  			return -EAGAIN;
> -		return skb_send_sock_locked(psock->sk, skb, off, len);
> +		return skb_send_sock(psock->sk, skb, off, len);
>  	}
>  	return sk_psock_skb_ingress(psock, skb);
>  }
> @@ -511,8 +511,6 @@ static void sk_psock_backlog(struct work_struct *work)
>  	u32 len, off;
>  	int ret;
>  
> -	/* Lock sock to avoid losing sk_socket during loop. */
> -	lock_sock(psock->sk);
>  	if (state->skb) {
>  		skb = state->skb;
>  		len = state->len;
> @@ -529,7 +527,7 @@ static void sk_psock_backlog(struct work_struct *work)
>  		skb_bpf_redirect_clear(skb);
>  		do {
>  			ret = -EIO;
> -			if (likely(psock->sk->sk_socket))
> +			if (!sock_flag(psock->sk, SOCK_DEAD))
>  				ret = sk_psock_handle_skb(psock, skb, off,
>  							  len, ingress);
>  			if (ret <= 0) {
> @@ -537,13 +535,13 @@ static void sk_psock_backlog(struct work_struct *work)
>  					state->skb = skb;
>  					state->len = len;
>  					state->off = off;
> -					goto end;
> +					return;

Unrelated to your series I'll add it to my queue of fixes, but I think we
leak state->skb on teardown.
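
A minimal sketch of one possible shape for such a fix (hypothetical, not part
of this series, and assuming the backlog state is kept in psock->work_state as
a struct sk_psock_work_state) could free the parked skb on teardown, e.g. from
sk_psock_destroy_deferred() after cancel_work_sync():

	/* Hypothetical helper: drop any skb that sk_psock_backlog()
	 * parked in the work state for a retry, so it is not leaked
	 * when the psock goes away.
	 */
	static void sk_psock_free_backlog_skb(struct sk_psock *psock)
	{
		struct sk_psock_work_state *state = &psock->work_state;

		if (state->skb) {
			kfree_skb(state->skb);
			state->skb = NULL;
		}
	}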

>  				}
>  				/* Hard errors break pipe and stop xmit. */
>  				sk_psock_report_error(psock, ret ? -ret : EPIPE);
>  				sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
>  				kfree_skb(skb);
> -				goto end;
> +				return;
>  			}
>  			off += ret;
>  			len -= ret;
> @@ -552,8 +550,6 @@ static void sk_psock_backlog(struct work_struct *work)
>  		if (!ingress)
>  			kfree_skb(skb);
>  	}
> -end:
> -	release_sock(psock->sk);
>  }
>  
>  struct sk_psock *sk_psock_init(struct sock *sk, int node)
> @@ -631,7 +627,7 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
>  	}
>  }
>  
> -static void sk_psock_zap_ingress(struct sk_psock *psock)
> +static void __sk_psock_zap_ingress(struct sk_psock *psock)
>  {
>  	struct sk_buff *skb;
>  
> @@ -639,8 +635,13 @@ static void sk_psock_zap_ingress(struct sk_psock *psock)
>  		skb_bpf_redirect_clear(skb);
>  		kfree_skb(skb);
>  	}
> -	spin_lock_bh(&psock->ingress_lock);
>  	__sk_psock_purge_ingress_msg(psock);
> +}
> +
> +static void sk_psock_zap_ingress(struct sk_psock *psock)
> +{
> +	spin_lock_bh(&psock->ingress_lock);
> +	__sk_psock_zap_ingress(psock);
>  	spin_unlock_bh(&psock->ingress_lock);

I'm wondering about callers of sk_psock_zap_ingress() and why the lock is
needed here. We have two callers

sk_psock_destroy_deferred() is deferred after an RCU grace period and after
cancel_work_sync(), so there should be no users enqueueing into the skb queue.
If there are, we have other problems, I think.

sk_psock_drop() is the other. It is called when the refcnt is zero and does
a sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED). Should it just wrap
the clear_state and sk_psock_zap_ingress together, similar to the other cases,
so it doesn't have to deal with the case where an enqueue happens after
sk_psock_zap_ingress?

Something like this would be clearer?
                                                                                           
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
	sk_psock_stop(psock);
	write_lock_bh(&sk->sk_callback_lock);
	sk_psock_restore_proto(sk, psock);
	rcu_assign_sk_user_data(sk, NULL);
	if (psock->progs.stream_parser)
		sk_psock_stop_strp(sk, psock);
	else if (psock->progs.stream_verdict)
		sk_psock_stop_verdict(sk, psock);
	write_unlock_bh(&sk->sk_callback_lock);
	call_rcu(&psock->rcu, sk_psock_destroy);
}
EXPORT_SYMBOL_GPL(sk_psock_drop);

Then sk_psock_zap_ingress(), as coded above, is not really needed anywhere and
we just use the lockless variant, __sk_psock_zap_ingress(). WDYT, do I miss
something?


>  }
>  
> @@ -654,6 +655,17 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
>  	}
>  }
>  
> +void sk_psock_stop(struct sk_psock *psock)
> +{
> +	spin_lock_bh(&psock->ingress_lock);
> +	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
> +	sk_psock_cork_free(psock);
> +	__sk_psock_zap_ingress(psock);
> +	spin_unlock_bh(&psock->ingress_lock);
> +
> +	cancel_work_sync(&psock->work);
> +}
> +
>  static void sk_psock_done_strp(struct sk_psock *psock);
Cong Wang March 22, 2021, 3:23 a.m. UTC | #2
On Fri, Mar 19, 2021 at 7:45 PM John Fastabend <john.fastabend@gmail.com> wrote:
>
> Cong Wang wrote:
> > From: Cong Wang <cong.wang@bytedance.com>
> >
> > We do not have to lock the sock to avoid losing sk_socket;
> > instead, we can purge all the ingress queues when we close
> > the socket. Sending or receiving packets after orphaning
> > the socket makes no sense.
> >
> > We do purge these queues when the psock refcnt reaches zero,
> > but here we want to purge them explicitly in sock_map_close().
> > There are also some nasty race conditions between testing the
> > SK_PSOCK_TX_ENABLED bit and queuing/canceling the psock work;
> > we can expand psock->ingress_lock a bit to protect them too.
> >
> > Cc: John Fastabend <john.fastabend@gmail.com>
> > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > Cc: Jakub Sitnicki <jakub@cloudflare.com>
> > Cc: Lorenz Bauer <lmb@cloudflare.com>
> > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > ---
> >  include/linux/skmsg.h |  1 +
> >  net/core/skmsg.c      | 50 +++++++++++++++++++++++++++++++------------
> >  net/core/sock_map.c   |  1 +
> >  3 files changed, 38 insertions(+), 14 deletions(-)
> >
> > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> > index f2d45a73b2b2..0f5e663f6c7f 100644
> > --- a/include/linux/skmsg.h
> > +++ b/include/linux/skmsg.h
> > @@ -347,6 +347,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err)
> >  }
>
> Overall looks good, comment/question below.
>
> >
> >  #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
> >  int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
> > diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> > index 305dddc51857..d0a227b0f672 100644
> > --- a/net/core/skmsg.c
> > +++ b/net/core/skmsg.c
> > @@ -497,7 +497,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
> >       if (!ingress) {
> >               if (!sock_writeable(psock->sk))
> >                       return -EAGAIN;
> > -             return skb_send_sock_locked(psock->sk, skb, off, len);
> > +             return skb_send_sock(psock->sk, skb, off, len);
> >       }
> >       return sk_psock_skb_ingress(psock, skb);
> >  }
> > @@ -511,8 +511,6 @@ static void sk_psock_backlog(struct work_struct *work)
> >       u32 len, off;
> >       int ret;
> >
> > -     /* Lock sock to avoid losing sk_socket during loop. */
> > -     lock_sock(psock->sk);
> >       if (state->skb) {
> >               skb = state->skb;
> >               len = state->len;
> > @@ -529,7 +527,7 @@ static void sk_psock_backlog(struct work_struct *work)
> >               skb_bpf_redirect_clear(skb);
> >               do {
> >                       ret = -EIO;
> > -                     if (likely(psock->sk->sk_socket))
> > +                     if (!sock_flag(psock->sk, SOCK_DEAD))
> >                               ret = sk_psock_handle_skb(psock, skb, off,
> >                                                         len, ingress);
> >                       if (ret <= 0) {
> > @@ -537,13 +535,13 @@ static void sk_psock_backlog(struct work_struct *work)
> >                                       state->skb = skb;
> >                                       state->len = len;
> >                                       state->off = off;
> > -                                     goto end;
> > +                                     return;
>
> Unrelated to your series I'll add it to my queue of fixes, but I think we
> leak state->skb on teardown.

Ok. Please target all bug fixes to -net.

>
> >                               }
> >                               /* Hard errors break pipe and stop xmit. */
> >                               sk_psock_report_error(psock, ret ? -ret : EPIPE);
> >                               sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
> >                               kfree_skb(skb);
> > -                             goto end;
> > +                             return;
> >                       }
> >                       off += ret;
> >                       len -= ret;
> > @@ -552,8 +550,6 @@ static void sk_psock_backlog(struct work_struct *work)
> >               if (!ingress)
> >                       kfree_skb(skb);
> >       }
> > -end:
> > -     release_sock(psock->sk);
> >  }
> >
> >  struct sk_psock *sk_psock_init(struct sock *sk, int node)
> > @@ -631,7 +627,7 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
> >       }
> >  }
> >
> > -static void sk_psock_zap_ingress(struct sk_psock *psock)
> > +static void __sk_psock_zap_ingress(struct sk_psock *psock)
> >  {
> >       struct sk_buff *skb;
> >
> > @@ -639,8 +635,13 @@ static void sk_psock_zap_ingress(struct sk_psock *psock)
> >               skb_bpf_redirect_clear(skb);
> >               kfree_skb(skb);
> >       }
> > -     spin_lock_bh(&psock->ingress_lock);
> >       __sk_psock_purge_ingress_msg(psock);
> > +}
> > +
> > +static void sk_psock_zap_ingress(struct sk_psock *psock)
> > +{
> > +     spin_lock_bh(&psock->ingress_lock);
> > +     __sk_psock_zap_ingress(psock);
> >       spin_unlock_bh(&psock->ingress_lock);
>
> I'm wondering about callers of sk_psock_zap_ingress() and why the lock is
> needed here. We have two callers
>
> sk_psock_destroy_deferred() is deferred after an RCU grace period and after
> cancel_work_sync(), so there should be no users enqueueing into the skb queue.
> If there are, we have other problems, I think.

Right, I think sk_psock_zap_ingress() can be completely removed here
as it is already called in sk_psock_drop() (as below).

>
> sk_psock_drop() is the other. It is called when the refcnt is zero and does
> a sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED). Should it just wrap
> the clear_state and sk_psock_zap_ingress together, similar to the other cases,
> so it doesn't have to deal with the case where an enqueue happens after
> sk_psock_zap_ingress?
>
> Something like this would be clearer?

Yes.

>
> void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
> {
>         sk_psock_stop(psock);
>         write_lock_bh(&sk->sk_callback_lock);
>         sk_psock_restore_proto(sk, psock);
>         rcu_assign_sk_user_data(sk, NULL);
>         if (psock->progs.stream_parser)
>                 sk_psock_stop_strp(sk, psock);
>         else if (psock->progs.stream_verdict)
>                 sk_psock_stop_verdict(sk, psock);
>         write_unlock_bh(&sk->sk_callback_lock);
>         call_rcu(&psock->rcu, sk_psock_destroy);
> }
> EXPORT_SYMBOL_GPL(sk_psock_drop);
>
> Then sk_psock_zap_ingress(), as coded above, is not really needed anywhere and
> we just use the lockless variant, __sk_psock_zap_ingress(). WDYT, do I miss
> something?

This makes sense to me too.

Thanks!

Patch

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index f2d45a73b2b2..0f5e663f6c7f 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -347,6 +347,7 @@  static inline void sk_psock_report_error(struct sk_psock *psock, int err)
 }
 
 struct sk_psock *sk_psock_init(struct sock *sk, int node);
+void sk_psock_stop(struct sk_psock *psock);
 
 #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
 int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 305dddc51857..d0a227b0f672 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -497,7 +497,7 @@  static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 	if (!ingress) {
 		if (!sock_writeable(psock->sk))
 			return -EAGAIN;
-		return skb_send_sock_locked(psock->sk, skb, off, len);
+		return skb_send_sock(psock->sk, skb, off, len);
 	}
 	return sk_psock_skb_ingress(psock, skb);
 }
@@ -511,8 +511,6 @@  static void sk_psock_backlog(struct work_struct *work)
 	u32 len, off;
 	int ret;
 
-	/* Lock sock to avoid losing sk_socket during loop. */
-	lock_sock(psock->sk);
 	if (state->skb) {
 		skb = state->skb;
 		len = state->len;
@@ -529,7 +527,7 @@  static void sk_psock_backlog(struct work_struct *work)
 		skb_bpf_redirect_clear(skb);
 		do {
 			ret = -EIO;
-			if (likely(psock->sk->sk_socket))
+			if (!sock_flag(psock->sk, SOCK_DEAD))
 				ret = sk_psock_handle_skb(psock, skb, off,
 							  len, ingress);
 			if (ret <= 0) {
@@ -537,13 +535,13 @@  static void sk_psock_backlog(struct work_struct *work)
 					state->skb = skb;
 					state->len = len;
 					state->off = off;
-					goto end;
+					return;
 				}
 				/* Hard errors break pipe and stop xmit. */
 				sk_psock_report_error(psock, ret ? -ret : EPIPE);
 				sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
 				kfree_skb(skb);
-				goto end;
+				return;
 			}
 			off += ret;
 			len -= ret;
@@ -552,8 +550,6 @@  static void sk_psock_backlog(struct work_struct *work)
 		if (!ingress)
 			kfree_skb(skb);
 	}
-end:
-	release_sock(psock->sk);
 }
 
 struct sk_psock *sk_psock_init(struct sock *sk, int node)
@@ -631,7 +627,7 @@  static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 	}
 }
 
-static void sk_psock_zap_ingress(struct sk_psock *psock)
+static void __sk_psock_zap_ingress(struct sk_psock *psock)
 {
 	struct sk_buff *skb;
 
@@ -639,8 +635,13 @@  static void sk_psock_zap_ingress(struct sk_psock *psock)
 		skb_bpf_redirect_clear(skb);
 		kfree_skb(skb);
 	}
-	spin_lock_bh(&psock->ingress_lock);
 	__sk_psock_purge_ingress_msg(psock);
+}
+
+static void sk_psock_zap_ingress(struct sk_psock *psock)
+{
+	spin_lock_bh(&psock->ingress_lock);
+	__sk_psock_zap_ingress(psock);
 	spin_unlock_bh(&psock->ingress_lock);
 }
 
@@ -654,6 +655,17 @@  static void sk_psock_link_destroy(struct sk_psock *psock)
 	}
 }
 
+void sk_psock_stop(struct sk_psock *psock)
+{
+	spin_lock_bh(&psock->ingress_lock);
+	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+	sk_psock_cork_free(psock);
+	__sk_psock_zap_ingress(psock);
+	spin_unlock_bh(&psock->ingress_lock);
+
+	cancel_work_sync(&psock->work);
+}
+
 static void sk_psock_done_strp(struct sk_psock *psock);
 
 static void sk_psock_destroy_deferred(struct work_struct *gc)
@@ -770,14 +782,20 @@  static void sk_psock_skb_redirect(struct sk_buff *skb)
 	 * error that caused the pipe to break. We can't send a packet on
 	 * a socket that is in this state so we drop the skb.
 	 */
-	if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
-	    !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+	if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
+		kfree_skb(skb);
+		return;
+	}
+	spin_lock_bh(&psock_other->ingress_lock);
+	if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+		spin_unlock_bh(&psock_other->ingress_lock);
 		kfree_skb(skb);
 		return;
 	}
 
 	skb_queue_tail(&psock_other->ingress_skb, skb);
 	schedule_work(&psock_other->work);
+	spin_unlock_bh(&psock_other->ingress_lock);
 }
 
 static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
@@ -845,8 +863,12 @@  static void sk_psock_verdict_apply(struct sk_psock *psock,
 			err = sk_psock_skb_ingress_self(psock, skb);
 		}
 		if (err < 0) {
-			skb_queue_tail(&psock->ingress_skb, skb);
-			schedule_work(&psock->work);
+			spin_lock_bh(&psock->ingress_lock);
+			if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+				skb_queue_tail(&psock->ingress_skb, skb);
+				schedule_work(&psock->work);
+			}
+			spin_unlock_bh(&psock->ingress_lock);
 		}
 		break;
 	case __SK_REDIRECT:
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index dd53a7771d7e..7c3589fc13bb 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1540,6 +1540,7 @@  void sock_map_close(struct sock *sk, long timeout)
 	saved_close = psock->saved_close;
 	sock_map_remove_links(sk, psock);
 	rcu_read_unlock();
+	sk_psock_stop(psock);
 	release_sock(sk);
 	saved_close(sk, timeout);
 }