diff mbox series

net: rds: use maybe_get_net() when acquiring refcount on TCP sockets

Message ID 63dab11e-2aeb-5608-6dcb-6ebc3e98056e@I-love.SAKURA.ne.jp (mailing list archive)
State Superseded
Headers show
Series net: rds: use maybe_get_net() when acquiring refcount on TCP sockets | expand

Commit Message

Tetsuo Handa May 5, 2022, 12:45 a.m. UTC
Eric Dumazet is reporting addition on 0 problem at rds_tcp_tune(), for
delayed works queued in rds_wq might be invoked after a net namespace's
refcount already reached 0.

Since rds_tcp_exit_net() from cleanup_net() calls flush_workqueue(rds_wq),
it is guaranteed that we can instead use maybe_get_net() from delayed work
functions until rds_tcp_exit_net() returns.

Note that I'm not convinced that all works which might access a net
namespace are already queued in rds_wq by the moment rds_tcp_exit_net()
calls flush_workqueue(rds_wq). If some race is there, rds_tcp_exit_net()
will fail to wait for work functions, and kmem_cache_free() could be
called from net_free() before maybe_get_net() is called from
rds_tcp_tune().

Reported-by: Eric Dumazet <edumazet@google.com>
Fixes: 3a58f13a881ed351 ("net: rds: acquire refcount on TCP sockets")
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 net/rds/tcp.c         | 11 ++++++++---
 net/rds/tcp.h         |  2 +-
 net/rds/tcp_connect.c |  5 ++++-
 net/rds/tcp_listen.c  |  5 ++++-
 4 files changed, 17 insertions(+), 6 deletions(-)

Comments

Eric Dumazet May 5, 2022, 12:53 a.m. UTC | #1
On Wed, May 4, 2022 at 5:45 PM Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> Eric Dumazet is reporting addition on 0 problem at rds_tcp_tune(), for
> delayed works queued in rds_wq might be invoked after a net namespace's
> refcount already reached 0.
>
> Since rds_tcp_exit_net() from cleanup_net() calls flush_workqueue(rds_wq),
> it is guaranteed that we can instead use maybe_get_net() from delayed work
> functions until rds_tcp_exit_net() returns.
>
> Note that I'm not convinced that all works which might access a net
> namespace are already queued in rds_wq by the moment rds_tcp_exit_net()
> calls flush_workqueue(rds_wq). If some race is there, rds_tcp_exit_net()
> will fail to wait for work functions, and kmem_cache_free() could be
> called from net_free() before maybe_get_net() is called from
> rds_tcp_tune().
>
> Reported-by: Eric Dumazet <edumazet@google.com>
> Fixes: 3a58f13a881ed351 ("net: rds: acquire refcount on TCP sockets")
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> ---
>  net/rds/tcp.c         | 11 ++++++++---
>  net/rds/tcp.h         |  2 +-
>  net/rds/tcp_connect.c |  5 ++++-
>  net/rds/tcp_listen.c  |  5 ++++-
>  4 files changed, 17 insertions(+), 6 deletions(-)
>
> diff --git a/net/rds/tcp.c b/net/rds/tcp.c
> index 2f638f8b7b1e..8e26bcf02044 100644
> --- a/net/rds/tcp.c
> +++ b/net/rds/tcp.c
> @@ -487,11 +487,11 @@ struct rds_tcp_net {
>  /* All module specific customizations to the RDS-TCP socket should be done in
>   * rds_tcp_tune() and applied after socket creation.
>   */
> -void rds_tcp_tune(struct socket *sock)
> +bool rds_tcp_tune(struct socket *sock)
>  {
>         struct sock *sk = sock->sk;
>         struct net *net = sock_net(sk);
> -       struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
> +       struct rds_tcp_net *rtn;
>
>         tcp_sock_set_nodelay(sock->sk);
>         lock_sock(sk);
> @@ -499,10 +499,14 @@ void rds_tcp_tune(struct socket *sock)
>          * a process which created this net namespace terminated.
>          */
>         if (!sk->sk_net_refcnt) {
> +               if (!maybe_get_net(net)) {


> +                       release_sock(sk);
> +                       return false;
> +               }
>                 sk->sk_net_refcnt = 1;
> -               get_net_track(net, &sk->ns_tracker, GFP_KERNEL);

This could use:
                  netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL);

>                 sock_inuse_add(net, 1);
>         }
> +       rtn = net_generic(net, rds_tcp_netid);
>         if (rtn->sndbuf_size > 0) {
>                 sk->sk_sndbuf = rtn->sndbuf_size;
>                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
> @@ -512,6 +516,7 @@ void rds_tcp_tune(struct socket *sock)
>                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
>         }
>         release_sock(sk);
> +       return true;
>  }
>

Otherwise, patch looks good to me, thanks.
Jakub Kicinski May 5, 2022, 1:04 a.m. UTC | #2
On Thu, 5 May 2022 09:45:49 +0900 Tetsuo Handa wrote:
> Subject: [PATCH] net: rds: use maybe_get_net() when acquiring refcount on TCP  sockets

Please tag the next version as [PATCH net v2], and make sure it applies
cleanly on top of net/master, 'cause reportedly this one didn't?
https://patchwork.kernel.org/project/netdevbpf/patch/63dab11e-2aeb-5608-6dcb-6ebc3e98056e@I-love.SAKURA.ne.jp/
diff mbox series

Patch

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 2f638f8b7b1e..8e26bcf02044 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -487,11 +487,11 @@  struct rds_tcp_net {
 /* All module specific customizations to the RDS-TCP socket should be done in
  * rds_tcp_tune() and applied after socket creation.
  */
-void rds_tcp_tune(struct socket *sock)
+bool rds_tcp_tune(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
 	struct net *net = sock_net(sk);
-	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+	struct rds_tcp_net *rtn;
 
 	tcp_sock_set_nodelay(sock->sk);
 	lock_sock(sk);
@@ -499,10 +499,14 @@  void rds_tcp_tune(struct socket *sock)
 	 * a process which created this net namespace terminated.
 	 */
 	if (!sk->sk_net_refcnt) {
+		if (!maybe_get_net(net)) {
+			release_sock(sk);
+			return false;
+		}
 		sk->sk_net_refcnt = 1;
-		get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
 		sock_inuse_add(net, 1);
 	}
+	rtn = net_generic(net, rds_tcp_netid);
 	if (rtn->sndbuf_size > 0) {
 		sk->sk_sndbuf = rtn->sndbuf_size;
 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
@@ -512,6 +516,7 @@  void rds_tcp_tune(struct socket *sock)
 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 	}
 	release_sock(sk);
+	return true;
 }
 
 static void rds_tcp_accept_worker(struct work_struct *work)
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index dc8d745d6857..f8b5930d7b34 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -49,7 +49,7 @@  struct rds_tcp_statistics {
 };
 
 /* tcp.c */
-void rds_tcp_tune(struct socket *sock);
+bool rds_tcp_tune(struct socket *sock);
 void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp);
 void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp);
 void rds_tcp_restore_callbacks(struct socket *sock,
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 5461d77fff4f..f0c477c5d1db 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -124,7 +124,10 @@  int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 	if (ret < 0)
 		goto out;
 
-	rds_tcp_tune(sock);
+	if (!rds_tcp_tune(sock)) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	if (isv6) {
 		sin6.sin6_family = AF_INET6;
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 09cadd556d1e..7edf2e69d3fe 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -133,7 +133,10 @@  int rds_tcp_accept_one(struct socket *sock)
 	__module_get(new_sock->ops->owner);
 
 	rds_tcp_keepalive(new_sock);
-	rds_tcp_tune(new_sock);
+	if (!rds_tcp_tune(new_sock)) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	inet = inet_sk(new_sock->sk);