[1/4] nfs: use-after-free in svc_process_common()
diff mbox series

Message ID 134cf19c-e698-abed-02de-1659f9a5d4fb@virtuozzo.com
State New
Headers show
Series
  • use-after-free in svc_process_common()
Related show

Commit Message

Vasily Averin Dec. 17, 2018, 4:23 p.m. UTC
if a node has NFSv41+ mounts inside several net namespaces
it can lead to a use-after-free in svc_process_common()

svc_process_common() 
        /* Setup reply header */
        rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); <<< HERE

svc_process_common() can use already freed rqstp->rq_xprt,
it was assigned in bc_svc_process() where it was taken from serv->sv_bc_xprt.

serv is a global structure, but sv_bc_xprt is assigned per net namespace,
so if NFSv41+ shares are mounted in several containers together
bc_svc_process() can use the wrong backchannel or even access freed memory.

To find the correct svc_xprt of the client-related backchannel,
bc_svc_process() now calls the new .bc_get_xprt callback,
which executes svc_find_xprt() with the proper xprt name.

Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
---
 include/linux/sunrpc/xprt.h       |  1 +
 net/sunrpc/svc.c                  | 22 ++++++++++++++++------
 net/sunrpc/xprtrdma/backchannel.c |  5 +++++
 net/sunrpc/xprtrdma/transport.c   |  1 +
 net/sunrpc/xprtrdma/xprt_rdma.h   |  1 +
 net/sunrpc/xprtsock.c             |  7 +++++++
 6 files changed, 31 insertions(+), 6 deletions(-)

Comments

Jeff Layton Dec. 17, 2018, 5:49 p.m. UTC | #1
On Mon, 2018-12-17 at 19:23 +0300, Vasily Averin wrote:
> if node have NFSv41+ mounts inside several net namespaces
> it can lead to use-after-free in svc_process_common()
> 
> svc_process_common() 
>         /* Setup reply header */
>         rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); <<< HERE
> 
> svc_process_common() can use already freed rqstp->rq_xprt,
> it was assigned in bc_svc_process() where it was taken from serv->sv_bc_xprt.
> 
> serv is global structure but sv_bc_xprt is assigned per-netnamespace,
> so if nfsv41+ shares are mounted in several containers together
> bc_svc_process() can use wrong backchannel or even access freed memory.
> 
> To find correct svc_xprt of client-related backchannel
> bc_svc_process() now calls new .bc_get_xprt callback
> that executes svc_find_xprt() with proper xprt name.
> 
> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
> ---
>  include/linux/sunrpc/xprt.h       |  1 +
>  net/sunrpc/svc.c                  | 22 ++++++++++++++++------
>  net/sunrpc/xprtrdma/backchannel.c |  5 +++++
>  net/sunrpc/xprtrdma/transport.c   |  1 +
>  net/sunrpc/xprtrdma/xprt_rdma.h   |  1 +
>  net/sunrpc/xprtsock.c             |  7 +++++++
>  6 files changed, 31 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
> index a4ab4f8d9140..031d2843a002 100644
> --- a/include/linux/sunrpc/xprt.h
> +++ b/include/linux/sunrpc/xprt.h
> @@ -158,6 +158,7 @@ struct rpc_xprt_ops {
>  	int		(*bc_setup)(struct rpc_xprt *xprt,
>  				    unsigned int min_reqs);
>  	int		(*bc_up)(struct svc_serv *serv, struct net *net);
> +	struct svc_xprt*(*bc_get_xprt)(struct svc_serv *serv, struct net *net);
>  	size_t		(*bc_maxpayload)(struct rpc_xprt *xprt);
>  	void		(*bc_free_rqst)(struct rpc_rqst *rqst);
>  	void		(*bc_destroy)(struct rpc_xprt *xprt,
> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
> index d13e05f1a990..a7264fd1b3db 100644
> --- a/net/sunrpc/svc.c
> +++ b/net/sunrpc/svc.c
> @@ -1450,16 +1450,22 @@ int
>  bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
>  	       struct svc_rqst *rqstp)
>  {
> +	struct net	*net = req->rq_xprt->xprt_net;
>  	struct kvec	*argv = &rqstp->rq_arg.head[0];
>  	struct kvec	*resv = &rqstp->rq_res.head[0];
>  	struct rpc_task *task;
> +	struct svc_xprt *s_xprt;
>  	int proc_error;
>  	int error;
>  
>  	dprintk("svc: %s(%p)\n", __func__, req);
>  
> +	s_xprt = req->rq_xprt->ops->bc_get_xprt(serv, net);
> +	if (!s_xprt)
> +		goto proc_error;
> +
>  	/* Build the svc_rqst used by the common processing routine */
> -	rqstp->rq_xprt = serv->sv_bc_xprt;
> +	rqstp->rq_xprt = s_xprt;
>  	rqstp->rq_xid = req->rq_xid;
>  	rqstp->rq_prot = req->rq_xprt->prot;
>  	rqstp->rq_server = serv;
> @@ -1494,13 +1500,11 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
>  
>  	/* Parse and execute the bc call */
>  	proc_error = svc_process_common(rqstp, argv, resv);
> +	svc_xprt_put(rqstp->rq_xprt);
>  
>  	atomic_inc(&req->rq_xprt->bc_free_slots);
> -	if (!proc_error) {
> -		/* Processing error: drop the request */
> -		xprt_free_bc_request(req);
> -		return 0;
> -	}
> +	if (!proc_error)
> +		goto proc_error;
>  
>  	/* Finally, send the reply synchronously */
>  	memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
> @@ -1517,6 +1521,12 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
>  out:
>  	dprintk("svc: %s(), error=%d\n", __func__, error);
>  	return error;
> +
> +proc_error:
> +	/* Processing error: drop the request */
> +	xprt_free_bc_request(req);
> +	error = -EINVAL;
> +	goto out;
>  }
>  EXPORT_SYMBOL_GPL(bc_svc_process);
>  #endif /* CONFIG_SUNRPC_BACKCHANNEL */
> diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
> index e5b367a3e517..3e06aeacda43 100644
> --- a/net/sunrpc/xprtrdma/backchannel.c
> +++ b/net/sunrpc/xprtrdma/backchannel.c
> @@ -133,6 +133,11 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
>  	return 0;
>  }
>  
> +struct svc_xprt *xprt_rdma_bc_get_xprt(struct svc_serv *serv, struct net *net)
> +{
> +	return svc_find_xprt(serv, "rdma-bc", net, AF_UNSPEC, 0);
> +}
> +
>  /**
>   * xprt_rdma_bc_maxpayload - Return maximum backchannel message size
>   * @xprt: transport
> diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
> index ae2a83828953..41d67de93531 100644
> --- a/net/sunrpc/xprtrdma/transport.c
> +++ b/net/sunrpc/xprtrdma/transport.c
> @@ -828,6 +828,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = {
>  #if defined(CONFIG_SUNRPC_BACKCHANNEL)
>  	.bc_setup		= xprt_rdma_bc_setup,
>  	.bc_up			= xprt_rdma_bc_up,
> +	.bc_get_xprt		= xprt_rdma_bc_get_xprt,
>  	.bc_maxpayload		= xprt_rdma_bc_maxpayload,
>  	.bc_free_rqst		= xprt_rdma_bc_free_rqst,
>  	.bc_destroy		= xprt_rdma_bc_destroy,
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index a13ccb643ce0..2726d71052a8 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -662,6 +662,7 @@ void xprt_rdma_cleanup(void);
>  #if defined(CONFIG_SUNRPC_BACKCHANNEL)
>  int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
>  int xprt_rdma_bc_up(struct svc_serv *, struct net *);
> +struct svc_xprt *xprt_rdma_bc_get_xprt(struct svc_serv *serv, struct net *net);
>  size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
>  int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
>  void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> index 8a5e823e0b33..16f9c7720465 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -1411,6 +1411,12 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
>  	return 0;
>  }
>  
> +static struct svc_xprt *xs_tcp_bc_get_xprt(struct svc_serv *serv,
> +					   struct net *net)
> +{
> +	return svc_find_xprt(serv, "tcp-bc", net, AF_UNSPEC, 0);
> +}
> +
>  static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
>  {
>  	return PAGE_SIZE;
> @@ -2668,6 +2674,7 @@ static const struct rpc_xprt_ops xs_tcp_ops = {
>  #ifdef CONFIG_SUNRPC_BACKCHANNEL
>  	.bc_setup		= xprt_setup_bc,
>  	.bc_up			= xs_tcp_bc_up,
> +	.bc_get_xprt		= xs_tcp_bc_get_xprt,
>  	.bc_maxpayload		= xs_tcp_bc_maxpayload,
>  	.bc_free_rqst		= xprt_free_bc_rqst,
>  	.bc_destroy		= xprt_destroy_bc,

Reviewed-by: Jeff Layton <jlayton@kernel.org>
'J. Bruce Fields' Dec. 17, 2018, 9:50 p.m. UTC | #2
On Mon, Dec 17, 2018 at 07:23:54PM +0300, Vasily Averin wrote:
> if node have NFSv41+ mounts inside several net namespaces
> it can lead to use-after-free in svc_process_common()
> 
> svc_process_common() 
>         /* Setup reply header */
>         rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); <<< HERE
> 
> svc_process_common() can use already freed rqstp->rq_xprt,
> it was assigned in bc_svc_process() where it was taken from serv->sv_bc_xprt.
> 
> serv is global structure but sv_bc_xprt is assigned per-netnamespace,
> so if nfsv41+ shares are mounted in several containers together
> bc_svc_process() can use wrong backchannel or even access freed memory.
> 
> To find correct svc_xprt of client-related backchannel
> bc_svc_process() now calls new .bc_get_xprt callback
> that executes svc_find_xprt() with proper xprt name.

This stuff is confusing and I need to stare at it some more before I
understand, but it's weird that we'd need to search for the right xprt.

We know which connection the backchannel request came over, and there
should only be one backchannel using that connection, why can't we find
it by just chasing pointers the right way?

OK, I do need to look at it more.

--b.

> 
> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
> ---
>  include/linux/sunrpc/xprt.h       |  1 +
>  net/sunrpc/svc.c                  | 22 ++++++++++++++++------
>  net/sunrpc/xprtrdma/backchannel.c |  5 +++++
>  net/sunrpc/xprtrdma/transport.c   |  1 +
>  net/sunrpc/xprtrdma/xprt_rdma.h   |  1 +
>  net/sunrpc/xprtsock.c             |  7 +++++++
>  6 files changed, 31 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
> index a4ab4f8d9140..031d2843a002 100644
> --- a/include/linux/sunrpc/xprt.h
> +++ b/include/linux/sunrpc/xprt.h
> @@ -158,6 +158,7 @@ struct rpc_xprt_ops {
>  	int		(*bc_setup)(struct rpc_xprt *xprt,
>  				    unsigned int min_reqs);
>  	int		(*bc_up)(struct svc_serv *serv, struct net *net);
> +	struct svc_xprt*(*bc_get_xprt)(struct svc_serv *serv, struct net *net);
>  	size_t		(*bc_maxpayload)(struct rpc_xprt *xprt);
>  	void		(*bc_free_rqst)(struct rpc_rqst *rqst);
>  	void		(*bc_destroy)(struct rpc_xprt *xprt,
> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
> index d13e05f1a990..a7264fd1b3db 100644
> --- a/net/sunrpc/svc.c
> +++ b/net/sunrpc/svc.c
> @@ -1450,16 +1450,22 @@ int
>  bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
>  	       struct svc_rqst *rqstp)
>  {
> +	struct net	*net = req->rq_xprt->xprt_net;
>  	struct kvec	*argv = &rqstp->rq_arg.head[0];
>  	struct kvec	*resv = &rqstp->rq_res.head[0];
>  	struct rpc_task *task;
> +	struct svc_xprt *s_xprt;
>  	int proc_error;
>  	int error;
>  
>  	dprintk("svc: %s(%p)\n", __func__, req);
>  
> +	s_xprt = req->rq_xprt->ops->bc_get_xprt(serv, net);
> +	if (!s_xprt)
> +		goto proc_error;
> +
>  	/* Build the svc_rqst used by the common processing routine */
> -	rqstp->rq_xprt = serv->sv_bc_xprt;
> +	rqstp->rq_xprt = s_xprt;
>  	rqstp->rq_xid = req->rq_xid;
>  	rqstp->rq_prot = req->rq_xprt->prot;
>  	rqstp->rq_server = serv;
> @@ -1494,13 +1500,11 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
>  
>  	/* Parse and execute the bc call */
>  	proc_error = svc_process_common(rqstp, argv, resv);
> +	svc_xprt_put(rqstp->rq_xprt);
>  
>  	atomic_inc(&req->rq_xprt->bc_free_slots);
> -	if (!proc_error) {
> -		/* Processing error: drop the request */
> -		xprt_free_bc_request(req);
> -		return 0;
> -	}
> +	if (!proc_error)
> +		goto proc_error;
>  
>  	/* Finally, send the reply synchronously */
>  	memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
> @@ -1517,6 +1521,12 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
>  out:
>  	dprintk("svc: %s(), error=%d\n", __func__, error);
>  	return error;
> +
> +proc_error:
> +	/* Processing error: drop the request */
> +	xprt_free_bc_request(req);
> +	error = -EINVAL;
> +	goto out;
>  }
>  EXPORT_SYMBOL_GPL(bc_svc_process);
>  #endif /* CONFIG_SUNRPC_BACKCHANNEL */
> diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
> index e5b367a3e517..3e06aeacda43 100644
> --- a/net/sunrpc/xprtrdma/backchannel.c
> +++ b/net/sunrpc/xprtrdma/backchannel.c
> @@ -133,6 +133,11 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
>  	return 0;
>  }
>  
> +struct svc_xprt *xprt_rdma_bc_get_xprt(struct svc_serv *serv, struct net *net)
> +{
> +	return svc_find_xprt(serv, "rdma-bc", net, AF_UNSPEC, 0);
> +}
> +
>  /**
>   * xprt_rdma_bc_maxpayload - Return maximum backchannel message size
>   * @xprt: transport
> diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
> index ae2a83828953..41d67de93531 100644
> --- a/net/sunrpc/xprtrdma/transport.c
> +++ b/net/sunrpc/xprtrdma/transport.c
> @@ -828,6 +828,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = {
>  #if defined(CONFIG_SUNRPC_BACKCHANNEL)
>  	.bc_setup		= xprt_rdma_bc_setup,
>  	.bc_up			= xprt_rdma_bc_up,
> +	.bc_get_xprt		= xprt_rdma_bc_get_xprt,
>  	.bc_maxpayload		= xprt_rdma_bc_maxpayload,
>  	.bc_free_rqst		= xprt_rdma_bc_free_rqst,
>  	.bc_destroy		= xprt_rdma_bc_destroy,
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index a13ccb643ce0..2726d71052a8 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -662,6 +662,7 @@ void xprt_rdma_cleanup(void);
>  #if defined(CONFIG_SUNRPC_BACKCHANNEL)
>  int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
>  int xprt_rdma_bc_up(struct svc_serv *, struct net *);
> +struct svc_xprt *xprt_rdma_bc_get_xprt(struct svc_serv *serv, struct net *net);
>  size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
>  int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
>  void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> index 8a5e823e0b33..16f9c7720465 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -1411,6 +1411,12 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
>  	return 0;
>  }
>  
> +static struct svc_xprt *xs_tcp_bc_get_xprt(struct svc_serv *serv,
> +					   struct net *net)
> +{
> +	return svc_find_xprt(serv, "tcp-bc", net, AF_UNSPEC, 0);
> +}
> +
>  static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
>  {
>  	return PAGE_SIZE;
> @@ -2668,6 +2674,7 @@ static const struct rpc_xprt_ops xs_tcp_ops = {
>  #ifdef CONFIG_SUNRPC_BACKCHANNEL
>  	.bc_setup		= xprt_setup_bc,
>  	.bc_up			= xs_tcp_bc_up,
> +	.bc_get_xprt		= xs_tcp_bc_get_xprt,
>  	.bc_maxpayload		= xs_tcp_bc_maxpayload,
>  	.bc_free_rqst		= xprt_free_bc_rqst,
>  	.bc_destroy		= xprt_destroy_bc,
> -- 
> 2.17.1
Vasily Averin Dec. 18, 2018, 6:45 a.m. UTC | #3
On 12/18/18 12:50 AM, J. Bruce Fields wrote:
> On Mon, Dec 17, 2018 at 07:23:54PM +0300, Vasily Averin wrote:
>> if node have NFSv41+ mounts inside several net namespaces
>> it can lead to use-after-free in svc_process_common()
>>
>> svc_process_common() 
>>         /* Setup reply header */
>>         rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); <<< HERE
>>
>> svc_process_common() can use already freed rqstp->rq_xprt,
>> it was assigned in bc_svc_process() where it was taken from serv->sv_bc_xprt.
>>
>> serv is global structure but sv_bc_xprt is assigned per-netnamespace,
>> so if nfsv41+ shares are mounted in several containers together
>> bc_svc_process() can use wrong backchannel or even access freed memory.
>>
>> To find correct svc_xprt of client-related backchannel
>> bc_svc_process() now calls new .bc_get_xprt callback
>> that executes svc_find_xprt() with proper xprt name.
> 
> This stuff is confusing and I need to stare at it some more before I
> understand, but it's weird that we'd need to search for the right xprt.

All NFS clients in all net namespaces that use the same minorversion
share a common nfs_callback_data taken from the global nfs_callback_info array.

Moreover, these clients can use either the rdma or the tcp transport;
however, only one of them can be used in a given net namespace.

Each net namespace must have its own backchannel;
it cannot depend on other net namespaces,
because, at the very least, they can use different transports.

So one svc_serv should be able to handle several (per-netns) backchannels.

Frankly speaking, if you prefer, I can easily convert the global nfs_callback_info to be per net namespace.
I've checked, it works too. However, the current solution looks better to me.

> We know which connection the backchannel request came over, and there
> should only be one backchannel using that connection, why can't we find
> it by just chasing pointers the right way?

it is allocated using the following call trace:
nfs_callback_up
 nfs_callback_up_net
  xprt->ops->bc_up(serv, net) -> xs_tcp_bc_up
   svc_create_xprt(serv, "tcp-bc")
    __svc_xpo_create
     svc_bc_tcp_create
      svc_bc_create_socket

Here the backchannel's svc_sock/svc/xprt is created.
It is per-netns and therefore it cannot be saved as a pointer on the global svc_serv.

It could be saved on some xprt related to the forechannel;
I expected this had been done already -- but it was not.
I've tried to find a way to do it -- but without success;
the corresponding structures seem not to be accessible in svc_bc_tcp_create.

Finally I've found that backchannel's xprt is added into serv->sv_permsocks
and svc_find_xprt can find it by name.

It would be great if you could suggest a simpler way.

> 
> OK, I do need to look at it more.

It is quite important for containers, so I think this patch (or an alternative solution)
should be pushed to stable@.

 
> --b.
> 
>>
>> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
>> ---
>>  include/linux/sunrpc/xprt.h       |  1 +
>>  net/sunrpc/svc.c                  | 22 ++++++++++++++++------
>>  net/sunrpc/xprtrdma/backchannel.c |  5 +++++
>>  net/sunrpc/xprtrdma/transport.c   |  1 +
>>  net/sunrpc/xprtrdma/xprt_rdma.h   |  1 +
>>  net/sunrpc/xprtsock.c             |  7 +++++++
>>  6 files changed, 31 insertions(+), 6 deletions(-)
>>
>> diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
>> index a4ab4f8d9140..031d2843a002 100644
>> --- a/include/linux/sunrpc/xprt.h
>> +++ b/include/linux/sunrpc/xprt.h
>> @@ -158,6 +158,7 @@ struct rpc_xprt_ops {
>>  	int		(*bc_setup)(struct rpc_xprt *xprt,
>>  				    unsigned int min_reqs);
>>  	int		(*bc_up)(struct svc_serv *serv, struct net *net);
>> +	struct svc_xprt*(*bc_get_xprt)(struct svc_serv *serv, struct net *net);
>>  	size_t		(*bc_maxpayload)(struct rpc_xprt *xprt);
>>  	void		(*bc_free_rqst)(struct rpc_rqst *rqst);
>>  	void		(*bc_destroy)(struct rpc_xprt *xprt,
>> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
>> index d13e05f1a990..a7264fd1b3db 100644
>> --- a/net/sunrpc/svc.c
>> +++ b/net/sunrpc/svc.c
>> @@ -1450,16 +1450,22 @@ int
>>  bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
>>  	       struct svc_rqst *rqstp)
>>  {
>> +	struct net	*net = req->rq_xprt->xprt_net;
>>  	struct kvec	*argv = &rqstp->rq_arg.head[0];
>>  	struct kvec	*resv = &rqstp->rq_res.head[0];
>>  	struct rpc_task *task;
>> +	struct svc_xprt *s_xprt;
>>  	int proc_error;
>>  	int error;
>>  
>>  	dprintk("svc: %s(%p)\n", __func__, req);
>>  
>> +	s_xprt = req->rq_xprt->ops->bc_get_xprt(serv, net);
>> +	if (!s_xprt)
>> +		goto proc_error;
>> +
>>  	/* Build the svc_rqst used by the common processing routine */
>> -	rqstp->rq_xprt = serv->sv_bc_xprt;
>> +	rqstp->rq_xprt = s_xprt;
>>  	rqstp->rq_xid = req->rq_xid;
>>  	rqstp->rq_prot = req->rq_xprt->prot;
>>  	rqstp->rq_server = serv;
>> @@ -1494,13 +1500,11 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
>>  
>>  	/* Parse and execute the bc call */
>>  	proc_error = svc_process_common(rqstp, argv, resv);
>> +	svc_xprt_put(rqstp->rq_xprt);
>>  
>>  	atomic_inc(&req->rq_xprt->bc_free_slots);
>> -	if (!proc_error) {
>> -		/* Processing error: drop the request */
>> -		xprt_free_bc_request(req);
>> -		return 0;
>> -	}
>> +	if (!proc_error)
>> +		goto proc_error;
>>  
>>  	/* Finally, send the reply synchronously */
>>  	memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
>> @@ -1517,6 +1521,12 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
>>  out:
>>  	dprintk("svc: %s(), error=%d\n", __func__, error);
>>  	return error;
>> +
>> +proc_error:
>> +	/* Processing error: drop the request */
>> +	xprt_free_bc_request(req);
>> +	error = -EINVAL;
>> +	goto out;
>>  }
>>  EXPORT_SYMBOL_GPL(bc_svc_process);
>>  #endif /* CONFIG_SUNRPC_BACKCHANNEL */
>> diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
>> index e5b367a3e517..3e06aeacda43 100644
>> --- a/net/sunrpc/xprtrdma/backchannel.c
>> +++ b/net/sunrpc/xprtrdma/backchannel.c
>> @@ -133,6 +133,11 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
>>  	return 0;
>>  }
>>  
>> +struct svc_xprt *xprt_rdma_bc_get_xprt(struct svc_serv *serv, struct net *net)
>> +{
>> +	return svc_find_xprt(serv, "rdma-bc", net, AF_UNSPEC, 0);
>> +}
>> +
>>  /**
>>   * xprt_rdma_bc_maxpayload - Return maximum backchannel message size
>>   * @xprt: transport
>> diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
>> index ae2a83828953..41d67de93531 100644
>> --- a/net/sunrpc/xprtrdma/transport.c
>> +++ b/net/sunrpc/xprtrdma/transport.c
>> @@ -828,6 +828,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = {
>>  #if defined(CONFIG_SUNRPC_BACKCHANNEL)
>>  	.bc_setup		= xprt_rdma_bc_setup,
>>  	.bc_up			= xprt_rdma_bc_up,
>> +	.bc_get_xprt		= xprt_rdma_bc_get_xprt,
>>  	.bc_maxpayload		= xprt_rdma_bc_maxpayload,
>>  	.bc_free_rqst		= xprt_rdma_bc_free_rqst,
>>  	.bc_destroy		= xprt_rdma_bc_destroy,
>> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
>> index a13ccb643ce0..2726d71052a8 100644
>> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
>> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
>> @@ -662,6 +662,7 @@ void xprt_rdma_cleanup(void);
>>  #if defined(CONFIG_SUNRPC_BACKCHANNEL)
>>  int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
>>  int xprt_rdma_bc_up(struct svc_serv *, struct net *);
>> +struct svc_xprt *xprt_rdma_bc_get_xprt(struct svc_serv *serv, struct net *net);
>>  size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
>>  int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
>>  void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
>> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
>> index 8a5e823e0b33..16f9c7720465 100644
>> --- a/net/sunrpc/xprtsock.c
>> +++ b/net/sunrpc/xprtsock.c
>> @@ -1411,6 +1411,12 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
>>  	return 0;
>>  }
>>  
>> +static struct svc_xprt *xs_tcp_bc_get_xprt(struct svc_serv *serv,
>> +					   struct net *net)
>> +{
>> +	return svc_find_xprt(serv, "tcp-bc", net, AF_UNSPEC, 0);
>> +}
>> +
>>  static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
>>  {
>>  	return PAGE_SIZE;
>> @@ -2668,6 +2674,7 @@ static const struct rpc_xprt_ops xs_tcp_ops = {
>>  #ifdef CONFIG_SUNRPC_BACKCHANNEL
>>  	.bc_setup		= xprt_setup_bc,
>>  	.bc_up			= xs_tcp_bc_up,
>> +	.bc_get_xprt		= xs_tcp_bc_get_xprt,
>>  	.bc_maxpayload		= xs_tcp_bc_maxpayload,
>>  	.bc_free_rqst		= xprt_free_bc_rqst,
>>  	.bc_destroy		= xprt_destroy_bc,
>> -- 
>> 2.17.1
>
Trond Myklebust Dec. 18, 2018, 12:49 p.m. UTC | #4
On Tue, 2018-12-18 at 09:45 +0300, Vasily Averin wrote:
> On 12/18/18 12:50 AM, J. Bruce Fields wrote:
> > On Mon, Dec 17, 2018 at 07:23:54PM +0300, Vasily Averin wrote:
> > > if node have NFSv41+ mounts inside several net namespaces
> > > it can lead to use-after-free in svc_process_common()
> > > 
> > > svc_process_common() 
> > >         /* Setup reply header */
> > >         rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); <<<
> > > HERE
> > > 
> > > svc_process_common() can use already freed rqstp->rq_xprt,
> > > it was assigned in bc_svc_process() where it was taken from serv-
> > > >sv_bc_xprt.
> > > 
> > > serv is global structure but sv_bc_xprt is assigned per-
> > > netnamespace,
> > > so if nfsv41+ shares are mounted in several containers together
> > > bc_svc_process() can use wrong backchannel or even access freed
> > > memory.
> > > 
> > > To find correct svc_xprt of client-related backchannel
> > > bc_svc_process() now calls new .bc_get_xprt callback
> > > that executes svc_find_xprt() with proper xprt name.
> > 
> > This stuff is confusing and I need to stare at it some more before
> > I
> > understand, but it's weird that we'd need to search for the right
> > xprt.
> 
> All NFS clients in all net namespaces used the same minorversion 
> shares common nfs_callback_data taken from global nfs_callback_info
> array.
> 
> Moreover these clients can use either rdma or nfs transport,
> however only one of them can be used in one net namespace.
> 
> Each net namespace must have own backchannel, 
> it cannot depend on other net namespaces, 
> because at least they can use different transports.
> 
> So one svc_serv should be able to handle several (per-netns)
> backchannels.
> 
> Frankly speaking If you prefer I can easily convert global
> nfs_callback_info to per net-namespace.
> I've checked, it works too. However current solution looks better for
> me.
> 
> > We know which connection the backchannel request came over, and
> > there
> > should only be one backchannel using that connection, why can't we
> > find
> > it by just chasing pointers the right way?
> 
> it is allocated by using follwing calltrace:
> nfs_callback_up
>  nfs_callback_up_net
>   xprt->ops->bc_up(serv, net) -> xs_tcp_bc_up
>    svc_create_xprt(serv, "tcp-bc")
>     __svc_xpo_create
>      svc_bc_tcp_create
>       svc_bc_create_socket
> 
> Here backchannel's svc_sock/svc/xprt is created.
> It is per-netns and therefore it cannot be saved as pointer on global
> svc_serv.
> 
> It could be saved on some xprt related to forechannel,
> I've expected it was done already -- but it was not done.
> I've tried to find any way to do it -- but without success,
> according structures seems are not accessible in svc_bc_tcp_create.
> 
> Finally I've found that backchannel's xprt is added into serv-
> >sv_permsocks
> and svc_find_xprt can find it by name.
> 
> It would be great if you can advise some more simple way.  
> 
> > OK, I do need to look at it more.
> 
> It is quite important for containers so I think this patch (or any
> alternative solution)
> should be pushed in stable@.
> 

The whole "let's set up rqstp->rq_xprt for the back channel" is nothing
but a giant hack in order to work around the fact that
svc_process_common() uses it to find the xpt_ops, and perform a couple
of (meaningless for the back channel) tests of xpt_flags.

What say we just pass in the xpt_ops as a parameter to
svc_process_common(), and make those xpt_flags tests check for whether
or not rqstp->rq_xprt is actually non-NULL?

It probably also requires us to store a pointer to struct net in the
struct svc_rqst so that nfs4_callback_compound() and
svcauth_gss_accept() can find it, but that should be OK since the
transport already has that referenced.

Cheers,
  Trond
Vasily Averin Dec. 18, 2018, 2:35 p.m. UTC | #5
On 12/18/18 3:49 PM, Trond Myklebust wrote:
> On Tue, 2018-12-18 at 09:45 +0300, Vasily Averin wrote:
>> On 12/18/18 12:50 AM, J. Bruce Fields wrote:
>>> On Mon, Dec 17, 2018 at 07:23:54PM +0300, Vasily Averin wrote:
>>>> if node have NFSv41+ mounts inside several net namespaces
>>>> it can lead to use-after-free in svc_process_common()
>>>>
>>>> svc_process_common() 
>>>>         /* Setup reply header */
>>>>         rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); <<<
>>>> HERE
>>>>
>>>> svc_process_common() can use already freed rqstp->rq_xprt,
>>>> it was assigned in bc_svc_process() where it was taken from serv-
>>>>> sv_bc_xprt.
>>>>
>>>> serv is global structure but sv_bc_xprt is assigned per-
>>>> netnamespace,
>>>> so if nfsv41+ shares are mounted in several containers together
>>>> bc_svc_process() can use wrong backchannel or even access freed
>>>> memory.
>>>>
>>>> To find correct svc_xprt of client-related backchannel
>>>> bc_svc_process() now calls new .bc_get_xprt callback
>>>> that executes svc_find_xprt() with proper xprt name.
>>>
>>> This stuff is confusing and I need to stare at it some more before
>>> I
>>> understand, but it's weird that we'd need to search for the right
>>> xprt.
>>
>> All NFS clients in all net namespaces used the same minorversion 
>> shares common nfs_callback_data taken from global nfs_callback_info
>> array.
>>
>> Moreover these clients can use either rdma or nfs transport,
>> however only one of them can be used in one net namespace.
>>
>> Each net namespace must have own backchannel, 
>> it cannot depend on other net namespaces, 
>> because at least they can use different transports.
>>
>> So one svc_serv should be able to handle several (per-netns)
>> backchannels.
>>
>> Frankly speaking If you prefer I can easily convert global
>> nfs_callback_info to per net-namespace.
>> I've checked, it works too. However current solution looks better for
>> me.
>>
>>> We know which connection the backchannel request came over, and
>>> there
>>> should only be one backchannel using that connection, why can't we
>>> find
>>> it by just chasing pointers the right way?
>>
>> it is allocated by using follwing calltrace:
>> nfs_callback_up
>>  nfs_callback_up_net
>>   xprt->ops->bc_up(serv, net) -> xs_tcp_bc_up
>>    svc_create_xprt(serv, "tcp-bc")
>>     __svc_xpo_create
>>      svc_bc_tcp_create
>>       svc_bc_create_socket
>>
>> Here backchannel's svc_sock/svc/xprt is created.
>> It is per-netns and therefore it cannot be saved as pointer on global
>> svc_serv.
>>
>> It could be saved on some xprt related to forechannel,
>> I've expected it was done already -- but it was not done.
>> I've tried to find any way to do it -- but without success,
>> according structures seems are not accessible in svc_bc_tcp_create.
>>
>> Finally I've found that backchannel's xprt is added into serv-
>>> sv_permsocks
>> and svc_find_xprt can find it by name.
>>
>> It would be great if you can advise some more simple way.  
>>
>>> OK, I do need to look at it more.
>>
>> It is quite important for containers so I think this patch (or any
>> alternative solution)
>> should be pushed in stable@.
>>
> 
> The whole "let's set up rqstp->rq_xprt for the back channel" is nothing
> but a giant hack in order to work around the fact that
> svc_process_common() uses it to find the xpt_ops, and perform a couple
> of (meaningless for the back channel) tests of xpt_flags.
> 
> What say we just pass in the xpt_ops as a parameter to
> svc_process_common(), and make those xpt_flags tests check for whether
> or not rqstp->rq_xprt is actually non-NULL?

To access the proper xpt_flags inside svc_process_common(),
we need to pass a svc_xprt instead of the xpt_ops.

Do you mean something like following?

--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1148,7 +1148,7 @@ static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ..
  * Common routine for processing the RPC request.
  */
 static int
-svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
+svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv, struct svc_xprt *s_xprt)
 {
        struct svc_program      *progp;
        const struct svc_version *versp = NULL; /* compiler food */
@@ -1172,7 +1172,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
        clear_bit(RQ_DROPME, &rqstp->rq_flags);
 
        /* Setup reply header */
-       rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
+       s_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
 
        svc_putu32(resv, rqstp->rq_xid);
 
@@ -1245,7 +1245,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
         * fit.
         */
        if (versp->vs_need_cong_ctrl &&
-           !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
+           !test_bit(XPT_CONG_CTRL, &s_xprt->xpt_flags))


@@ -1336,8 +1336,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
        return 0;
 
  close:
-       if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
-               svc_close_xprt(rqstp->rq_xprt);
+       if (test_bit(XPT_TEMP, &s_xprt->xpt_flags))
+               svc_close_xprt(s_xprt);
        dprintk("svc: svc_process close\n");
        return 0;


> It probably also requires us to store a pointer to struct net in the
> struct svc_rqst so that nfs4_callback_compound() and
> svcauth_gss_accept() can find it, but that should be OK since the
> transport already has that referenced.
> 
> Cheers,
>   Trond
>
Trond Myklebust Dec. 18, 2018, 2:55 p.m. UTC | #6
On Tue, 2018-12-18 at 17:35 +0300, Vasily Averin wrote:
> On 12/18/18 3:49 PM, Trond Myklebust wrote:
> > On Tue, 2018-12-18 at 09:45 +0300, Vasily Averin wrote:
> > > On 12/18/18 12:50 AM, J. Bruce Fields wrote:
> > > > On Mon, Dec 17, 2018 at 07:23:54PM +0300, Vasily Averin wrote:
> > > > > if node have NFSv41+ mounts inside several net namespaces
> > > > > it can lead to use-after-free in svc_process_common()
> > > > > 
> > > > > svc_process_common() 
> > > > >         /* Setup reply header */
> > > > >         rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
> > > > > <<<
> > > > > HERE
> > > > > 
> > > > > svc_process_common() can use already freed rqstp->rq_xprt,
> > > > > it was assigned in bc_svc_process() where it was taken from
> > > > > serv-
> > > > > > sv_bc_xprt.
> > > > > 
> > > > > serv is global structure but sv_bc_xprt is assigned per-
> > > > > netnamespace,
> > > > > so if nfsv41+ shares are mounted in several containers
> > > > > together
> > > > > bc_svc_process() can use wrong backchannel or even access
> > > > > freed
> > > > > memory.
> > > > > 
> > > > > To find correct svc_xprt of client-related backchannel
> > > > > bc_svc_process() now calls new .bc_get_xprt callback
> > > > > that executes svc_find_xprt() with proper xprt name.
> > > > 
> > > > This stuff is confusing and I need to stare at it some more
> > > > before
> > > > I
> > > > understand, but it's weird that we'd need to search for the
> > > > right
> > > > xprt.
> > > 
> > > All NFS clients in all net namespaces using the same minorversion
> > > share a common nfs_callback_data taken from the global
> > > nfs_callback_info
> > > array.
> > > 
> > > Moreover these clients can use either rdma or nfs transport,
> > > however only one of them can be used in one net namespace.
> > > 
> > > Each net namespace must have own backchannel, 
> > > it cannot depend on other net namespaces, 
> > > because at least they can use different transports.
> > > 
> > > So one svc_serv should be able to handle several (per-netns)
> > > backchannels.
> > > 
> > > Frankly speaking If you prefer I can easily convert global
> > > nfs_callback_info to per net-namespace.
> > > I've checked, it works too. However current solution looks better
> > > for
> > > me.
> > > 
> > > > We know which connection the backchannel request came over, and
> > > > there
> > > > should only be one backchannel using that connection, why can't
> > > > we
> > > > find
> > > > it by just chasing pointers the right way?
> > > 
> > > it is allocated by using follwing calltrace:
> > > nfs_callback_up
> > >  nfs_callback_up_net
> > >   xprt->ops->bc_up(serv, net) -> xs_tcp_bc_up
> > >    svc_create_xprt(serv, "tcp-bc")
> > >     __svc_xpo_create
> > >      svc_bc_tcp_create
> > >       svc_bc_create_socket
> > > 
> > > Here backchannel's svc_sock/svc/xprt is created.
> > > It is per-netns and therefore it cannot be saved as pointer on
> > > global
> > > svc_serv.
> > > 
> > > It could be saved on some xprt related to forechannel,
> > > I've expected it was done already -- but it was not done.
> > > I've tried to find any way to do it -- but without success,
> > > according structures seems are not accessible in
> > > svc_bc_tcp_create.
> > > 
> > > Finally I've found that backchannel's xprt is added into serv-
> > > > sv_permsocks
> > > and svc_find_xprt can find it by name.
> > > 
> > > It would be great if you could suggest a simpler way.
> > > 
> > > > OK, I do need to look at it more.
> > > 
> > > It is quite important for containers so I think this patch (or
> > > any
> > > alternative solution)
> > > should be pushed in stable@.
> > > 
> > 
> > The whole "let's set up rqstp->rq_xprt for the back channel" is
> > nothing
> > but a giant hack in order to work around the fact that
> > svc_process_common() uses it to find the xpt_ops, and perform a
> > couple
> > of (meaningless for the back channel) tests of xpt_flags.
> > 
> > What say we just pass in the xpt_ops as a parameter to
> > svc_process_common(), and make those xpt_flags tests check for
> > whether
> > or not rqstp->rq_xprt is actually non-NULL?
> 
> To access proper xpt_flags inside svc_process_common() 
> we need to pass svc_xprt instead of xpt_ops.

No. We don't care about xpt_flags for the back channel because there is
no "server transport". The actual transport is stored in the 'struct
rpc_rqst', and is the struct rpc_xprt corresponding to the client
socket or RDMA channel.

IOW: All we really need in svc_process_common() is to be able to run
rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(), and that can be passed
in as a pointer to the struct svc_xprt_ops itself.

The flags are irrelevant, because they refer to a transport object that
isn't real.

> 
> Do you mean something like following?
> 
> --- a/net/sunrpc/svc.c
> +++ b/net/sunrpc/svc.c
> @@ -1148,7 +1148,7 @@ static __printf(2,3) void svc_printk(struct
> svc_rqst *rqstp, const char *fmt, ..
>   * Common routine for processing the RPC request.
>   */
>  static int
> -svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct
> kvec *resv)
> +svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct
> kvec *resv, struct svc_xprt *s_xprt)
>  {
>         struct svc_program      *progp;
>         const struct svc_version *versp = NULL; /* compiler food */
> @@ -1172,7 +1172,7 @@ svc_process_common(struct svc_rqst *rqstp,
> struct kvec *argv, struct kvec *resv)
>         clear_bit(RQ_DROPME, &rqstp->rq_flags);
>  
>         /* Setup reply header */
> -       rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
> +       s_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
>  
>         svc_putu32(resv, rqstp->rq_xid);
>  
> @@ -1245,7 +1245,7 @@ svc_process_common(struct svc_rqst *rqstp,
> struct kvec *argv, struct kvec *resv)
>          * fit.
>          */
>         if (versp->vs_need_cong_ctrl &&
> -           !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
> +           !test_bit(XPT_CONG_CTRL, &s_xprt->xpt_flags))


if (versp->vs_need_cong_ctrl && rqstp->rq_xprt && !test_bit(...)))

> 
> 
> @@ -1336,8 +1336,8 @@ svc_process_common(struct svc_rqst *rqstp,
> struct kvec *argv, struct kvec *resv)
>         return 0;
>  
>   close:
> -       if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
> -               svc_close_xprt(rqstp->rq_xprt);
> +       if (test_bit(XPT_TEMP, &s_xprt->xpt_flags))
> +               svc_close_xprt(s_xprt);
>         dprintk("svc: svc_process close\n");
>         return 0;
> 
> 
> > It probably also requires us to store a pointer to struct net in
> > the
> > struct svc_rqst so that nfs4_callback_compound() and
> > svcauth_gss_accept() can find it, but that should be OK since the
> > transport already has that referenced.
> > 
> > Cheers,
> >   Trond
> >
Vasily Averin Dec. 18, 2018, 8:02 p.m. UTC | #7
On 12/18/18 5:55 PM, Trond Myklebust wrote:
>>> It probably also requires us to store a pointer to struct net in
>>> the
>>> struct svc_rqst so that nfs4_callback_compound() and
>>> svcauth_gss_accept() can find it, but that should be OK since the
>>> transport already has that referenced.

Ok, I can fix these functions and their sub-calls.
However  rqst->rq_xprt is used in other functions that seems can be called inside svc_process_common() 
- in trace_svc_process(rqstp, progp->pg_name);
- in svc_reserve_auth(rqstp, ...) -> svc_reserve()
- svc_authorise() -> svcauth_gss_release()

It seems I should fix these places too, shouldn't I?
could you please advise how to fix svc_reserve() ?
Trond Myklebust Dec. 18, 2018, 8:43 p.m. UTC | #8
On Tue, 2018-12-18 at 23:02 +0300, Vasily Averin wrote:
> On 12/18/18 5:55 PM, Trond Myklebust wrote:
> > > > It probably also requires us to store a pointer to struct net
> > > > in
> > > > the
> > > > struct svc_rqst so that nfs4_callback_compound() and
> > > > svcauth_gss_accept() can find it, but that should be OK since
> > > > the
> > > > transport already has that referenced.
> 
> Ok, I can fix these functions and their sub-calls.
> However  rqst->rq_xprt is used in other functions that seems can be
> called inside svc_process_common() 
> - in trace_svc_process(rqstp, progp->pg_name);
> - in svc_reserve_auth(rqstp, ...) -> svc_reserve()
> - svc_authorise() -> svcauth_gss_release()
> 
> It seems I should fix these places too, shouldn't I?
> could you please advise how to fix svc_reserve() ?

We don't want svc_reserve() to run at all for the back channel, so I
guess that a test for rqstp->rq_xprt != NULL is appropriate there too.

svcauth_gss_release() is just using rqstp->rq_xprt to find the net
namespace, so if you add a pointer rqstp->rq_net to fix
nfs4_callback_compound, then that will fix the gss case as well.

For trace_svc_process(), maybe pull rqst->rq_xprt->xpt_remotebuf out of
the tracepoint definition in include/trace/events/sunrpc.h and make it
a tracepoint argument that is allowed to be NULL?
Vladis Dronov Dec. 18, 2018, 9:31 p.m. UTC | #9
Hello,

The CVE-2018-16884 id was assigned to this flaw and proposed to MITRE.
We would like to suggest to use this id in public communications
regarding this flaw.

Best regards,
Vladis Dronov | Red Hat, Inc. | Product Security Engineer
Vasily Averin Dec. 19, 2018, 11:25 a.m. UTC | #10
On 12/18/18 11:43 PM, Trond Myklebust wrote:
> On Tue, 2018-12-18 at 23:02 +0300, Vasily Averin wrote:
>> On 12/18/18 5:55 PM, Trond Myklebust wrote:
>>>>> It probably also requires us to store a pointer to struct net
>>>>> in
>>>>> the
>>>>> struct svc_rqst so that nfs4_callback_compound() and
>>>>> svcauth_gss_accept() can find it, but that should be OK since
>>>>> the
>>>>> transport already has that referenced.
>>
>> Ok, I can fix these functions and their sub-calls.
>> However  rqst->rq_xprt is used in other functions that seems can be
>> called inside svc_process_common() 
>> - in trace_svc_process(rqstp, progp->pg_name);
>> - in svc_reserve_auth(rqstp, ...) -> svc_reserve()
>> - svc_authorise() -> svcauth_gss_release()
>>
>> It seems I should fix these places too, shouldn't I?
>> could you please advise how to fix svc_reserve() ?
> 
> We don't want svc_reserve() to run at all for the back channel, so I
> guess that a test for rqstp->rq_xprt != NULL is appropriate there too.
> 
> svcauth_gss_release() is just using rqstp->rq_xprt to find the net
> namespace, so if you add a pointer rqstp->rq_net to fix
> nfs4_callback_compound, then that will fix the gss case as well.
> 
> For trace_svc_process(), maybe pull rqst->rq_xprt->xpt_remotebuf out of
> the tracepoint definition in include/trace/events/sunrpc.h and make it
> a tracepoint argument that is allowed to be NULL?

This one seems to work, could you please check it before formal submission?
  NFSv4 callback-1644  [002] ....  4731.064372: svc_process: addr=(null) xid=0x0b0924e3 service=NFSv4 callback vers=1 proc=1

Frankly speaking I'm afraid that I missed something,
rqstp->rq_xprt is widely used and nobody expects that it can be NULL.

And even if I missed nothing -- it's quite tricky anyway.
Future changes can add new calls or execute old non-empty-xprt-aware
functions and trigger a crash in some exotic configuration.

Thank you,
	Vasily Averin
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 73e130a840ce..f87d2ba88869 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -295,9 +295,12 @@ struct svc_rqst {
 	struct svc_cacherep *	rq_cacherep;	/* cache info */
 	struct task_struct	*rq_task;	/* service thread */
 	spinlock_t		rq_lock;	/* per-request lock */
+	struct net *		rq_bc_net;	/* pointer to backchannel's
+						 * net namespace
+						 */
 };
 
-#define SVC_NET(svc_rqst)	(svc_rqst->rq_xprt->xpt_net)
+#define SVC_NET(rqst) (rqst->rq_xprt ? rqst->rq_xprt->xpt_net : rqst->rq_bc_net)
 
 /*
  * Rigorous type checking on sockaddr type conversions
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 28e384186c35..df4305be73d6 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -569,7 +569,7 @@ TRACE_EVENT(svc_process,
 		__field(u32, vers)
 		__field(u32, proc)
 		__string(service, name)
-		__string(addr, rqst->rq_xprt->xpt_remotebuf)
+		__string(addr, rqst->rq_xprt ? rqst->rq_xprt->xpt_remotebuf : "(null)")
 	),
 
 	TP_fast_assign(
@@ -577,7 +577,7 @@ TRACE_EVENT(svc_process,
 		__entry->vers = rqst->rq_vers;
 		__entry->proc = rqst->rq_proc;
 		__assign_str(service, name);
-		__assign_str(addr, rqst->rq_xprt->xpt_remotebuf);
+		__assign_str(addr, rqst->rq_xprt ? rqst->rq_xprt->xpt_remotebuf : "(null)");
 	),
 
 	TP_printk("addr=%s xid=0x%08x service=%s vers=%u proc=%u",
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 1ece4bc3eb8d..152790ed309c 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1142,7 +1142,7 @@ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp,
 	struct kvec *resv = &rqstp->rq_res.head[0];
 	struct rsi *rsip, rsikey;
 	int ret;
-	struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id);
+	struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
 	memset(&rsikey, 0, sizeof(rsikey));
 	ret = gss_read_verf(gc, argv, authp,
@@ -1253,7 +1253,7 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
 	uint64_t handle;
 	int status;
 	int ret;
-	struct net *net = rqstp->rq_xprt->xpt_net;
+	struct net *net = SVC_NET(rqstp);
 	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
 
 	memset(&ud, 0, sizeof(ud));
@@ -1444,7 +1444,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
 	__be32		*rpcstart;
 	__be32		*reject_stat = resv->iov_base + resv->iov_len;
 	int		ret;
-	struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id);
+	struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
 	dprintk("RPC:       svcauth_gss: argv->iov_len = %zd\n",
 			argv->iov_len);
@@ -1734,7 +1734,7 @@ svcauth_gss_release(struct svc_rqst *rqstp)
 	struct rpc_gss_wire_cred *gc = &gsd->clcred;
 	struct xdr_buf *resbuf = &rqstp->rq_res;
 	int stat = -EINVAL;
-	struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id);
+	struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
 	if (gc->gc_proc != RPC_GSS_PROC_DATA)
 		goto out;
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index a7264fd1b3db..6ebb0324748f 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1148,7 +1148,8 @@ static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ..
  * Common routine for processing the RPC request.
  */
 static int
-svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
+svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, 
+		   struct kvec *resv, const struct svc_xprt_ops *xops)
 {
 	struct svc_program	*progp;
 	const struct svc_version *versp = NULL;	/* compiler food */
@@ -1172,7 +1173,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
 	clear_bit(RQ_DROPME, &rqstp->rq_flags);
 
 	/* Setup reply header */
-	rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
+	xops->xpo_prep_reply_hdr(rqstp);
 
 	svc_putu32(resv, rqstp->rq_xid);
 
@@ -1244,7 +1245,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
 	 * for lower versions. RPC_PROG_MISMATCH seems to be the closest
 	 * fit.
 	 */
-	if (versp->vs_need_cong_ctrl &&
+	if (versp->vs_need_cong_ctrl && rqstp->rq_xprt &&
 	    !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
 		goto err_bad_vers;
 
@@ -1336,7 +1337,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
 	return 0;
 
  close:
-	if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
+	if (rqstp->rq_xprt && test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
 		svc_close_xprt(rqstp->rq_xprt);
 	dprintk("svc: svc_process close\n");
 	return 0;
@@ -1432,7 +1433,8 @@ svc_process(struct svc_rqst *rqstp)
 	}
 
 	/* Returns 1 for send, 0 for drop */
-	if (likely(svc_process_common(rqstp, argv, resv)))
+	if (likely(svc_process_common(rqstp, argv, resv,
+					rqstp->rq_xprt->xpt_ops)))
 		return svc_send(rqstp);
 
 out_drop:
@@ -1465,10 +1467,10 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 		goto proc_error;
 
 	/* Build the svc_rqst used by the common processing routine */
-	rqstp->rq_xprt = s_xprt;
 	rqstp->rq_xid = req->rq_xid;
 	rqstp->rq_prot = req->rq_xprt->prot;
 	rqstp->rq_server = serv;
+	rqstp->rq_bc_net = net;
 
 	rqstp->rq_addrlen = sizeof(req->rq_xprt->addr);
 	memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen);
@@ -1499,8 +1501,8 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 	svc_getnl(argv);	/* CALLDIR */
 
 	/* Parse and execute the bc call */
-	proc_error = svc_process_common(rqstp, argv, resv);
-	svc_xprt_put(rqstp->rq_xprt);
+	proc_error = svc_process_common(rqstp, argv, resv, s_xprt->xpt_ops);
+	svc_xprt_put(s_xprt);
 
 	atomic_inc(&req->rq_xprt->bc_free_slots);
 	if (!proc_error)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 51d36230b6e3..51da7c244bee 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -468,10 +468,11 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
  */
 void svc_reserve(struct svc_rqst *rqstp, int space)
 {
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+
 	space += rqstp->rq_res.head[0].iov_len;
 
-	if (space < rqstp->rq_reserved) {
-		struct svc_xprt *xprt = rqstp->rq_xprt;
+	if (xprt && (space < rqstp->rq_reserved)) {
 		atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
 		rqstp->rq_reserved = space;
Vasily Averin Dec. 20, 2018, 1:39 a.m. UTC | #11
Dear Trond,
Red Hat security believes the problem is quite important security issue:
https://access.redhat.com/security/cve/cve-2018-16884

Fix should be backported to affected distributions.

Could you please approve my first patch and push it to stable@ ?
From my PoV it correctly fixes the problem, it breaks nothing and is easy to backport;
lightly modified, it can even be live-patched.

Other patches including switch to using empty rqst->rq_xprt can wait.

Thank you,
	Vasily Averin

On 12/19/18 2:25 PM, Vasily Averin wrote:
> On 12/18/18 11:43 PM, Trond Myklebust wrote:
>> On Tue, 2018-12-18 at 23:02 +0300, Vasily Averin wrote:
>>> On 12/18/18 5:55 PM, Trond Myklebust wrote:
>>>>>> It probably also requires us to store a pointer to struct net
>>>>>> in
>>>>>> the
>>>>>> struct svc_rqst so that nfs4_callback_compound() and
>>>>>> svcauth_gss_accept() can find it, but that should be OK since
>>>>>> the
>>>>>> transport already has that referenced.
>>>
>>> Ok, I can fix these functions and their sub-calls.
>>> However  rqst->rq_xprt is used in other functions that seems can be
>>> called inside svc_process_common() 
>>> - in trace_svc_process(rqstp, progp->pg_name);
>>> - in svc_reserve_auth(rqstp, ...) -> svc_reserve()
>>> - svc_authorise() -> svcauth_gss_release()
>>>
>>> It seems I should fix these places too, shouldn't I?
>>> could you please advise how to fix svc_reserve() ?
>>
>> We don't want svc_reserve() to run at all for the back channel, so I
>> guess that a test for rqstp->rq_xprt != NULL is appropriate there too.
>>
>> svcauth_gss_release() is just using rqstp->rq_xprt to find the net
>> namespace, so if you add a pointer rqstp->rq_net to fix
>> nfs4_callback_compound, then that will fix the gss case as well.
>>
>> For trace_svc_process(), maybe pull rqst->rq_xprt->xpt_remotebuf out of
>> the tracepoint definition in include/trace/events/sunrpc.h and make it
>> a tracepoint argument that is allowed to be NULL?
> 
> This one seems to work, could you please check it before formal submission?
>   NFSv4 callback-1644  [002] ....  4731.064372: svc_process: addr=(null) xid=0x0b0924e3 service=NFSv4 callback vers=1 proc=1
> 
> Frankly speaking I'm afraid that I missed something,
> rqstp->rq_xprt is widely used and nobody expects that it can be NULL.
> 
> And even if I missed nothing -- it's quite tricky anyway.
> Future changes can add new calls or execute old non-empty-xprt-aware
> functions and trigger a crash in some exotic configuration.
> 
> Thank you,
> 	Vasily Averin
>
Trond Myklebust Dec. 20, 2018, 1:58 a.m. UTC | #12
On Thu, 2018-12-20 at 04:39 +0300, Vasily Averin wrote:
> Dear Trond,
> Red Hat security believes the problem is quite important security
> issue:
> https://access.redhat.com/security/cve/cve-2018-16884
> 
> Fix should be backported to affected distributions.
> 
> Could you please approve my first patch and push it to stable@ ?
> From my PoV it is correctly fixes the problem, it breaks nothing and
> easy for backports,
> lightly modified it can be even live-patched.
> 
> Other patches including switch to using empty rqst->rq_xprt can wait.
> 

That patch is not acceptable for upstream.



> Thank you,
> 	Vasily Averin
> 
> On 12/19/18 2:25 PM, Vasily Averin wrote:
> > On 12/18/18 11:43 PM, Trond Myklebust wrote:
> > > On Tue, 2018-12-18 at 23:02 +0300, Vasily Averin wrote:
> > > > On 12/18/18 5:55 PM, Trond Myklebust wrote:
> > > > > > > It probably also requires us to store a pointer to struct
> > > > > > > net
> > > > > > > in
> > > > > > > the
> > > > > > > struct svc_rqst so that nfs4_callback_compound() and
> > > > > > > svcauth_gss_accept() can find it, but that should be OK
> > > > > > > since
> > > > > > > the
> > > > > > > transport already has that referenced.
> > > > 
> > > > Ok, I can fix these functions and their sub-calls.
> > > > However  rqst->rq_xprt is used in other functions that seems
> > > > can be
> > > > called inside svc_process_common() 
> > > > - in trace_svc_process(rqstp, progp->pg_name);
> > > > - in svc_reserve_auth(rqstp, ...) -> svc_reserve()
> > > > - svc_authorise() -> svcauth_gss_release()
> > > > 
> > > > It seems I should fix these places too, shouldn't I?
> > > > could you please advise how to fix svc_reserve() ?
> > > 
> > > We don't want svc_reserve() to run at all for the back channel,
> > > so I
> > > guess that a test for rqstp->rq_xprt != NULL is appropriate there
> > > too.
> > > 
> > > svcauth_gss_release() is just using rqstp->rq_xprt to find the
> > > net
> > > namespace, so if you add a pointer rqstp->rq_net to fix
> > > nfs4_callback_compound, then that will fix the gss case as well.
> > > 
> > > For trace_svc_process(), maybe pull rqst->rq_xprt->xpt_remotebuf
> > > out of
> > > the tracepoint definition in include/trace/events/sunrpc.h and
> > > make it
> > > a tracepoint argument that is allowed to be NULL?
> > 
> > This one seems to work, could you please check it before formal
> > submission?
> >   NFSv4 callback-1644  [002] ....  4731.064372: svc_process:
> > addr=(null) xid=0x0b0924e3 service=NFSv4 callback vers=1 proc=1
> > 
> > Frankly speaking I'm afraid that I missed something,
> > rqstp->rq_xprt is widely used and nobody expects that it can be
> > NULL.
> > 
> > And even if I missed nothing -- it's quite tricky anyway.
> > Future changes can add new calls or execute old non-empty-xprt-
> > aware
> > functions and trigger a crash in some exotic configuration.
> > 
> > Thank you,
> > 	Vasily Averin
> >
Vasily Averin Dec. 20, 2018, 9:30 a.m. UTC | #13
On 12/20/18 4:58 AM, Trond Myklebust wrote:
> On Thu, 2018-12-20 at 04:39 +0300, Vasily Averin wrote:
>> Dear Trond,
>> Red Hat security believes the problem is quite important security
>> issue:
>> https://access.redhat.com/security/cve/cve-2018-16884
>>
>> Fix should be backported to affected distributions.
>>
>> Could you please approve my first patch and push it to stable@ ?
>> From my PoV it is correctly fixes the problem, it breaks nothing and
>> easy for backports,
>> lightly modified it can be even live-patched.
>>
>> Other patches including switch to using empty rqst->rq_xprt can wait.
>>
> 
> That patch is not acceptable for upstream.

In this case how about my initial plan B -- make svc_serv per net-namespace?
It executes additional per-netns nfsv4 callback threads 
but does not require any changes in existing sunrpc code?
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 509dc5adeb8f..df6939da9d73 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -30,12 +30,6 @@
 
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
 
-struct nfs_callback_data {
-	unsigned int users;
-	struct svc_serv *serv;
-};
-
-static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
 static DEFINE_MUTEX(nfs_callback_mutex);
 static struct svc_program nfs4_callback_program;
 
@@ -252,22 +246,23 @@ static const struct svc_serv_ops *nfs4_cb_sv_ops[] = {
 };
 #endif
 
-static struct svc_serv *nfs_callback_create_svc(int minorversion)
+static struct svc_serv *nfs_callback_create_svc(int minorversion,
+						struct net *net)
 {
-	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
 	const struct svc_serv_ops *sv_ops;
-	struct svc_serv *serv;
+	struct svc_serv *serv = nn->serv[minorversion];
 
 	/*
 	 * Check whether we're already up and running.
 	 */
-	if (cb_info->serv) {
+	if (serv) {
 		/*
 		 * Note: increase service usage, because later in case of error
 		 * svc_destroy() will be called.
 		 */
-		svc_get(cb_info->serv);
-		return cb_info->serv;
+		svc_get(serv);
+		return serv;
 	}
 
 	switch (minorversion) {
@@ -281,20 +276,12 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
 	if (sv_ops == NULL)
 		return ERR_PTR(-ENOTSUPP);
 
-	/*
-	 * Sanity check: if there's no task,
-	 * we should be the first user ...
-	 */
-	if (cb_info->users)
-		printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
-			cb_info->users);
-
 	serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
 	if (!serv) {
 		printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
 		return ERR_PTR(-ENOMEM);
 	}
-	cb_info->serv = serv;
+	nn->serv[minorversion] = serv;
 	/* As there is only one thread we need to over-ride the
 	 * default maximum of 80 connections
 	 */
@@ -308,14 +295,14 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
  */
 int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
 {
-	struct svc_serv *serv;
-	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
-	int ret;
 	struct net *net = xprt->xprt_net;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct svc_serv *serv = nn->serv[minorversion];
+	int ret;
 
 	mutex_lock(&nfs_callback_mutex);
 
-	serv = nfs_callback_create_svc(minorversion);
+	serv = nfs_callback_create_svc(minorversion, net);
 	if (IS_ERR(serv)) {
 		ret = PTR_ERR(serv);
 		goto err_create;
@@ -329,7 +316,6 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
 	if (ret < 0)
 		goto err_start;
 
-	cb_info->users++;
 	/*
 	 * svc_create creates the svc_serv with sv_nrthreads == 1, and then
 	 * svc_prepare_thread increments that. So we need to call svc_destroy
@@ -337,8 +323,8 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
 	 * thread exits.
 	 */
 err_net:
-	if (!cb_info->users)
-		cb_info->serv = NULL;
+	if (!nn->cb_users[minorversion])
+		nn->serv[minorversion] = NULL;
 	svc_destroy(serv);
 err_create:
 	mutex_unlock(&nfs_callback_mutex);
@@ -355,19 +341,18 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
  */
 void nfs_callback_down(int minorversion, struct net *net)
 {
-	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
 	struct svc_serv *serv;
 
 	mutex_lock(&nfs_callback_mutex);
-	serv = cb_info->serv;
+	serv = nn->serv[minorversion];
 	nfs_callback_down_net(minorversion, serv, net);
-	cb_info->users--;
-	if (cb_info->users == 0) {
+	if (nn->cb_users[minorversion] == 0) {
 		svc_get(serv);
 		serv->sv_ops->svo_setup(serv, NULL, 0);
 		svc_destroy(serv);
 		dprintk("nfs_callback_down: service destroyed\n");
-		cb_info->serv = NULL;
+		nn->serv[minorversion] = NULL;
 	}
 	mutex_unlock(&nfs_callback_mutex);
 }
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index fc9978c58265..a49978d2fb0d 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -29,6 +29,7 @@ struct nfs_net {
 	unsigned short nfs_callback_tcpport6;
 	int cb_users[NFS4_MAX_MINOR_VERSION + 1];
 #endif
+	struct svc_serv *serv[NFS4_MAX_MINOR_VERSION + 1];
 	spinlock_t nfs_client_lock;
 	ktime_t boot_time;
 #ifdef CONFIG_PROC_FS
Trond Myklebust Dec. 20, 2018, 11:58 a.m. UTC | #14
On Thu, 2018-12-20 at 12:30 +0300, Vasily Averin wrote:
> On 12/20/18 4:58 AM, Trond Myklebust wrote:
> > On Thu, 2018-12-20 at 04:39 +0300, Vasily Averin wrote:
> > > Dear Trond,
> > > Red Hat security believes the problem is quite important security
> > > issue:
> > > https://access.redhat.com/security/cve/cve-2018-16884
> > > 
> > > Fix should be backported to affected distributions.
> > > 
> > > Could you please approve my first patch and push it to stable@ ?
> > > From my PoV it is correctly fixes the problem, it breaks nothing
> > > and
> > > easy for backports,
> > > lightly modified it can be even live-patched.
> > > 
> > > Other patches including switch to using empty rqst->rq_xprt can
> > > wait.
> > > 
> > 
> > That patch is not acceptable for upstream.
> 
> In this case how about my initial plan B -- make svc_serv per net-
> namespace?
> It executes additional per-netns nfsv4 callback threads 
> but does not require any changes in existing sunrpc code?

Can we please fix this issue properly without adding more hacks? The
hacks are what has caused the problem in the first place.

The server transport code is completely irrelevant to the client
backchannel and so anything in the backchannel code path that relies on
tests or checks of the "server transport state" is going to be broken.
'J. Bruce Fields' Dec. 21, 2018, 1 a.m. UTC | #15
On Tue, Dec 18, 2018 at 02:55:15PM +0000, Trond Myklebust wrote:
> No. We don't care about xpt_flags for the back channel because there is
> no "server transport". The actual transport is stored in the 'struct
> rpc_rqst', and is the struct rpc_xprt corresponding to the client
> socket or RDMA channel.
> 
> IOW: All we really need in svc_process_common() is to be able to run
> rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(), and that can be passed
> either as a pointer to the struct svc_xprt_ops itself.

For what it's worth, I'd rather get rid of that op--it's an awfully
roundabout way just to do "svc_putnl(resv, 0);" in the tcp case.

--b.
Vasily Averin Dec. 21, 2018, 11:30 a.m. UTC | #16
On 12/21/18 4:00 AM, bfields@fieldses.org wrote:
> On Tue, Dec 18, 2018 at 02:55:15PM +0000, Trond Myklebust wrote:
>> No. We don't care about xpt_flags for the back channel because there is
>> no "server transport". The actual transport is stored in the 'struct
>> rpc_rqst', and is the struct rpc_xprt corresponding to the client
>> socket or RDMA channel.
>>
>> IOW: All we really need in svc_process_common() is to be able to run
>> rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(), and that can be passed
>> either as a pointer to the struct svc_xprt_ops itself.
> 
> For what it's worth, I'd rather get rid of that op--it's an awfully
> roundabout way just to do "svc_putnl(resv, 0);" in the tcp case.

I'll try to save a pointer to xpt_ops in the per-netns sunrpc_net,
and use it in svc_process_common() if rqstp->rq_xprt == NULL.
Vasily Averin Dec. 21, 2018, 5:39 p.m. UTC | #17
On 12/21/18 2:30 PM, Vasily Averin wrote:
> On 12/21/18 4:00 AM, bfields@fieldses.org wrote:
>> On Tue, Dec 18, 2018 at 02:55:15PM +0000, Trond Myklebust wrote:
>>> No. We don't care about xpt_flags for the back channel because there is
>>> no "server transport". The actual transport is stored in the 'struct
>>> rpc_rqst', and is the struct rpc_xprt corresponding to the client
>>> socket or RDMA channel.
>>>
>>> IOW: All we really need in svc_process_common() is to be able to run
>>> rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(), and that can be passed
>>> either as a pointer to the struct svc_xprt_ops itself.
>>
>> For what it's worth, I'd rather get rid of that op--it's an awfully
>> roundabout way just to do "svc_putnl(resv, 0);" in the tcp case.
> 
> I'll try to save pointer to xpt_ops on per-netns sunrpc_net, 
> and use it in svc_process_common() if rqstp->rq_xprt == NULL.

Bruce, Trond,
I've send v3 patch version, and waiting for your feedback.

Thank you,
	Vasily Averin
Vasily Averin Dec. 22, 2018, 5:46 p.m. UTC | #18
On 12/21/18 4:00 AM, bfields@fieldses.org wrote:
> On Tue, Dec 18, 2018 at 02:55:15PM +0000, Trond Myklebust wrote:
>> No. We don't care about xpt_flags for the back channel because there is
>> no "server transport". The actual transport is stored in the 'struct
>> rpc_rqst', and is the struct rpc_xprt corresponding to the client
>> socket or RDMA channel.
>>
>> IOW: All we really need in svc_process_common() is to be able to run
>> rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(), and that can be passed
>> either as a pointer to the struct svc_xprt_ops itself.
> 
> For what it's worth, I'd rather get rid of that op--it's an awfully
> roundabout way just to do "svc_putnl(resv, 0);" in the tcp case.

Do you mean that svc_create_xprt(serv, "tcp-bc", ...) was used ONLY to call
svc_tcp_prep_reply_hdr() in svc_process_common()?
And the corresponding call for rdma-bc does nothing useful at all?

I've just tried to remove svc_create_xprt() from xs_tcp_bc_up() and just
provide a pointer to svc_tcp_prep_reply_hdr() in svc_process_common()
via the per-netns sunrpc_net -- and it seems that was enough; my testcase worked correctly.

Have I missed something, perhaps?
Should we really remove the svc_create_xprt("tcp/rdma-bc"...) related stuff?
'J. Bruce Fields' Dec. 23, 2018, 8:52 p.m. UTC | #19
On Sat, Dec 22, 2018 at 08:46:55PM +0300, Vasily Averin wrote:
> On 12/21/18 4:00 AM, bfields@fieldses.org wrote:
> > On Tue, Dec 18, 2018 at 02:55:15PM +0000, Trond Myklebust wrote:
> >> No. We don't care about xpt_flags for the back channel because there is
> >> no "server transport". The actual transport is stored in the 'struct
> >> rpc_rqst', and is the struct rpc_xprt corresponding to the client
> >> socket or RDMA channel.
> >>
> >> IOW: All we really need in svc_process_common() is to be able to run
> >> rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(), and that can be passed
> >> either as a pointer to the struct svc_xprt_ops itself.
> > 
> > For what it's worth, I'd rather get rid of that op--it's an awfully
> > roundabout way just to do "svc_putnl(resv, 0);" in the tcp case.
> 
> Do you mean that svc_create_xprt(serv, "tcp-bc", ...) was used ONLY to call 
> svc_tcp_prep_reply_hdr() in svc_process_common() ?
> And according call for rdma-bc does nothing useful at all? 

Right, in the rdma case it's:

	void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
	{
	}

> I've just tried to remove svc_create_xprt() from xs_tcp_bc_up() and
> just provide pointer to svc_tcp_prep_reply_hdr() in
> svc_process_common() via per-netns sunrpc_net -- and seems it was
> enough, my testcase worked correctly.
> 
> Am I missed something probably?  Should we really remove
> svc_create_xprt( "tcp/rdma-bc"...) related stuff? ?

Haven't looked carefully, but off the top of my head I can't see why
that wouldn't work.

I also tried some patches that replace that op by a flag bit (doesn't
address the original problem here, just seemed like a simplification):

	git://linux-nfs.org/~bfields/linux-topics.git

but I don't know if that's compatible with what you've done.

--b.
Vasily Averin Dec. 23, 2018, 9:03 p.m. UTC | #20
On 12/23/18 11:52 PM, bfields@fieldses.org wrote:
> On Sat, Dec 22, 2018 at 08:46:55PM +0300, Vasily Averin wrote:
>> On 12/21/18 4:00 AM, bfields@fieldses.org wrote:
>>> On Tue, Dec 18, 2018 at 02:55:15PM +0000, Trond Myklebust wrote:
>>>> No. We don't care about xpt_flags for the back channel because there is
>>>> no "server transport". The actual transport is stored in the 'struct
>>>> rpc_rqst', and is the struct rpc_xprt corresponding to the client
>>>> socket or RDMA channel.
>>>>
>>>> IOW: All we really need in svc_process_common() is to be able to run
>>>> rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(), and that can be passed
>>>> either as a pointer to the struct svc_xprt_ops itself.
>>>
>>> For what it's worth, I'd rather get rid of that op--it's an awfully
>>> roundabout way just to do "svc_putnl(resv, 0);" in the tcp case.
>>
>> Do you mean that svc_create_xprt(serv, "tcp-bc", ...) was used ONLY to call 
>> svc_tcp_prep_reply_hdr() in svc_process_common() ?
>> And according call for rdma-bc does nothing useful at all? 
> 
> Right, in the rdma case it's:
> 
> 	void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
> 	{
> 	}
> 
>> I've just tried to remove svc_create_xprt() from xs_tcp_bc_up() and
>> just provide pointer to svc_tcp_prep_reply_hdr() in
>> svc_process_common() via per-netns sunrpc_net -- and seems it was
>> enough, my testcase worked correctly.
>>
>> Am I missed something probably?  Should we really remove
>> svc_create_xprt( "tcp/rdma-bc"...) related stuff? ?
> 
> Haven't looked carefully, but off the top of my head I can't see why
> that wouldn't work.

I've prepared a new patch version that removes svc_create_xprt("tcp/rdma-bc"...);
as far as I can see, it works correctly.
I'm going to submit it tomorrow morning.

> I also tried some patches that replace that op by a flag bit (doesn't
> address the original problem here, just seemed like a simplification):
> 
> 	git://linux-nfs.org/~bfields/linux-topics.git
> 
> but I don't if that's compatible with what you've done.
> 
> --b.
>
Trond Myklebust Dec. 23, 2018, 11:56 p.m. UTC | #21
On Sat, 2018-12-22 at 20:46 +0300, Vasily Averin wrote:
> On 12/21/18 4:00 AM, bfields@fieldses.org wrote:
> > On Tue, Dec 18, 2018 at 02:55:15PM +0000, Trond Myklebust wrote:
> > > No. We don't care about xpt_flags for the back channel because
> > > there is
> > > no "server transport". The actual transport is stored in the
> > > 'struct
> > > rpc_rqst', and is the struct rpc_xprt corresponding to the client
> > > socket or RDMA channel.
> > > 
> > > IOW: All we really need in svc_process_common() is to be able to
> > > run
> > > rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(), and that can be
> > > passed
> > > either as a pointer to the struct svc_xprt_ops itself.
> > 
> > For what it's worth, I'd rather get rid of that op--it's an awfully
> > roundabout way just to do "svc_putnl(resv, 0);" in the tcp case.
> 
> Do you mean that svc_create_xprt(serv, "tcp-bc", ...) was used ONLY
> to call 
> svc_tcp_prep_reply_hdr() in svc_process_common() ?
> And according call for rdma-bc does nothing useful at all? 
> 
> I've just tried to remove svc_create_xprt() from xs_tcp_bc_up() and
> just 
> provide pointer to svc_tcp_prep_reply_hdr() in  svc_process_common() 
> via per-netns sunrpc_net -- and seems it was enough, my testcase
> worked correctly.

I don't see how that function is related to net namespaces. As far as I
can tell, it only signals whether or not the type of transport uses the
TCP record marking scheme.

IOW: it depends on whether the client is using a stream based protocol
like TCP, or a datagram-like protocol like UDP, or RDMA. Whether that
use is occurring in a private net namespace or in the init process
namespace would be irrelevant.

> Am I missed something probably?
> Should we really remove svc_create_xprt( "tcp/rdma-bc"...) related
> stuff? ?

Agreed. The 'bc_up' callback in struct rpc_xprt_ops serves no
discernible purpose, and can be removed.
Vasily Averin Dec. 24, 2018, 5:51 a.m. UTC | #22
On 12/24/18 2:56 AM, Trond Myklebust wrote:
> On Sat, 2018-12-22 at 20:46 +0300, Vasily Averin wrote:
>> On 12/21/18 4:00 AM, bfields@fieldses.org wrote:
>>> On Tue, Dec 18, 2018 at 02:55:15PM +0000, Trond Myklebust wrote:
>>>> No. We don't care about xpt_flags for the back channel because
>>>> there is
>>>> no "server transport". The actual transport is stored in the
>>>> 'struct
>>>> rpc_rqst', and is the struct rpc_xprt corresponding to the client
>>>> socket or RDMA channel.
>>>>
>>>> IOW: All we really need in svc_process_common() is to be able to
>>>> run
>>>> rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(), and that can be
>>>> passed
>>>> either as a pointer to the struct svc_xprt_ops itself.
>>>
>>> For what it's worth, I'd rather get rid of that op--it's an awfully
>>> roundabout way just to do "svc_putnl(resv, 0);" in the tcp case.
>>
>> Do you mean that svc_create_xprt(serv, "tcp-bc", ...) was used ONLY
>> to call 
>> svc_tcp_prep_reply_hdr() in svc_process_common() ?
>> And according call for rdma-bc does nothing useful at all? 
>>
>> I've just tried to remove svc_create_xprt() from xs_tcp_bc_up() and
>> just 
>> provide pointer to svc_tcp_prep_reply_hdr() in  svc_process_common() 
>> via per-netns sunrpc_net -- and seems it was enough, my testcase
>> worked correctly.
> 
> I don't see how that function is related to net namespaces. As far as I
> can tell, it only signals whether or not the type of transport uses the
> TCP record marking scheme.

We need to know which kind of transport is used in the specified net namespace;
for example, init_ns can use the RDMA transport and netns "second" can use
the TCP transport at the same time.
If you do not like the idea of using a function pointer as a mark -- ok,
I can save only some boolean flag on sunrpc_net, check it in svc_process_common(),
and if it is set -- call svc_tcp_prep_reply_hdr() directly.

Is it acceptable for you?

> IOW: it depends on whether the client is using a stream based protocol
> like TCP, or a datagram-like protocol like UDP, or RDMA. Whether that
> use is occurring in a private net namespace or in the init process
> namespace would be irrelevant.
> 
>> Am I missed something probably?
>> Should we really remove svc_create_xprt( "tcp/rdma-bc"...) related
>> stuff? ?
> 
> Agreed. The 'bc_up' callback in struct rpc_xprt_ops serves no
> discernible purpose, and can be removed.
>
Vasily Averin Dec. 24, 2018, 6:05 a.m. UTC | #23
On 12/24/18 8:51 AM, Vasily Averin wrote:
> On 12/24/18 2:56 AM, Trond Myklebust wrote:
>> On Sat, 2018-12-22 at 20:46 +0300, Vasily Averin wrote:
>>> On 12/21/18 4:00 AM, bfields@fieldses.org wrote:
>>>> On Tue, Dec 18, 2018 at 02:55:15PM +0000, Trond Myklebust wrote:
>>>>> No. We don't care about xpt_flags for the back channel because
>>>>> there is
>>>>> no "server transport". The actual transport is stored in the
>>>>> 'struct
>>>>> rpc_rqst', and is the struct rpc_xprt corresponding to the client
>>>>> socket or RDMA channel.
>>>>>
>>>>> IOW: All we really need in svc_process_common() is to be able to
>>>>> run
>>>>> rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(), and that can be
>>>>> passed
>>>>> either as a pointer to the struct svc_xprt_ops itself.
>>>>
>>>> For what it's worth, I'd rather get rid of that op--it's an awfully
>>>> roundabout way just to do "svc_putnl(resv, 0);" in the tcp case.
>>>
>>> Do you mean that svc_create_xprt(serv, "tcp-bc", ...) was used ONLY
>>> to call 
>>> svc_tcp_prep_reply_hdr() in svc_process_common() ?
>>> And according call for rdma-bc does nothing useful at all? 
>>>
>>> I've just tried to remove svc_create_xprt() from xs_tcp_bc_up() and
>>> just 
>>> provide pointer to svc_tcp_prep_reply_hdr() in  svc_process_common() 
>>> via per-netns sunrpc_net -- and seems it was enough, my testcase
>>> worked correctly.
>>
>> I don't see how that function is related to net namespaces. As far as I
>> can tell, it only signals whether or not the type of transport uses the
>> TCP record marking scheme.
> 
> We need to know which kind of transport is used in specified net namespace,
> for example init_ns can use RDMA transport and netns "second" can use 
> TCP transport at the same time.
> If you do not like an idea to use function pointer as a mark -- ok
> I can save only some boolean flag on sunrpc_net, check it in svc_process_common() 
> and if it is set -- call svc_tcp_prep_reply_hdr() directly.

Moreover, I do not need to change sunrpc_net at all:
I can check in bc_svc_common() which transport the incoming svc_req uses
and provide such a flag as a new parameter to svc_process_common().

>> IOW: it depends on whether the client is using a stream based protocol
>> like TCP, or a datagram-like protocol like UDP, or RDMA. Whether that
>> use is occurring in a private net namespace or in the init process
>> namespace would be irrelevant.
>>
>>> Am I missed something probably?
>>> Should we really remove svc_create_xprt( "tcp/rdma-bc"...) related
>>> stuff? ?
>>
>> Agreed. The 'bc_up' callback in struct rpc_xprt_ops serves no
>> discernible purpose, and can be removed.
>>
Trond Myklebust Dec. 24, 2018, 8:21 a.m. UTC | #24
On Mon, 2018-12-24 at 09:05 +0300, Vasily Averin wrote:
> On 12/24/18 8:51 AM, Vasily Averin wrote:
> > On 12/24/18 2:56 AM, Trond Myklebust wrote:
> > > On Sat, 2018-12-22 at 20:46 +0300, Vasily Averin wrote:
> > > > On 12/21/18 4:00 AM, bfields@fieldses.org wrote:
> > > > > On Tue, Dec 18, 2018 at 02:55:15PM +0000, Trond Myklebust
> > > > > wrote:
> > > > > > No. We don't care about xpt_flags for the back channel
> > > > > > because
> > > > > > there is
> > > > > > no "server transport". The actual transport is stored in
> > > > > > the
> > > > > > 'struct
> > > > > > rpc_rqst', and is the struct rpc_xprt corresponding to the
> > > > > > client
> > > > > > socket or RDMA channel.
> > > > > > 
> > > > > > IOW: All we really need in svc_process_common() is to be
> > > > > > able to
> > > > > > run
> > > > > > rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(), and that can
> > > > > > be
> > > > > > passed
> > > > > > either as a pointer to the struct svc_xprt_ops itself.
> > > > > 
> > > > > For what it's worth, I'd rather get rid of that op--it's an
> > > > > awfully
> > > > > roundabout way just to do "svc_putnl(resv, 0);" in the tcp
> > > > > case.
> > > > 
> > > > Do you mean that svc_create_xprt(serv, "tcp-bc", ...) was used
> > > > ONLY
> > > > to call 
> > > > svc_tcp_prep_reply_hdr() in svc_process_common() ?
> > > > And according call for rdma-bc does nothing useful at all? 
> > > > 
> > > > I've just tried to remove svc_create_xprt() from xs_tcp_bc_up()
> > > > and
> > > > just 
> > > > provide pointer to svc_tcp_prep_reply_hdr()
> > > > in  svc_process_common() 
> > > > via per-netns sunrpc_net -- and seems it was enough, my
> > > > testcase
> > > > worked correctly.
> > > 
> > > I don't see how that function is related to net namespaces. As
> > > far as I
> > > can tell, it only signals whether or not the type of transport
> > > uses the
> > > TCP record marking scheme.
> > 
> > We need to know which kind of transport is used in specified net
> > namespace,
> > for example init_ns can use RDMA transport and netns "second" can
> > use 
> > TCP transport at the same time.
> > If you do not like an idea to use function pointer as a mark -- ok
> > I can save only some boolean flag on sunrpc_net, check it in
> > svc_process_common() 
> > and if it is set -- call svc_tcp_prep_reply_hdr() directly.

I'm not against the idea of using a function pointer, but I'm saying
that the transport is not unique per-netns. Instead, the transport is
usually per NFS mount, but you can always retrieve a pointer to it
directly in bc_svc_process() from req->rq_xprt. 


> moreover, I can do not change sunrpc_net at all,
> I can check in bc_svc_common() which transport uses incoming svc_req
> and provide such flag as new parameter to svc_process_common().

The function or flag used by bc_svc_common() could be added to req-
>rq_xprt->ops as another 'bc_' field and then passed to
svc_process_common() as the parameter.
Vasily Averin Dec. 24, 2018, 8:59 a.m. UTC | #25
On 12/24/18 11:21 AM, Trond Myklebust wrote:
> On Mon, 2018-12-24 at 09:05 +0300, Vasily Averin wrote:
>> On 12/24/18 8:51 AM, Vasily Averin wrote:
>>> On 12/24/18 2:56 AM, Trond Myklebust wrote:
>>>> On Sat, 2018-12-22 at 20:46 +0300, Vasily Averin wrote:
>>>>> On 12/21/18 4:00 AM, bfields@fieldses.org wrote:
>>>>>> On Tue, Dec 18, 2018 at 02:55:15PM +0000, Trond Myklebust
>>>>>> wrote:
>>>>>>> No. We don't care about xpt_flags for the back channel
>>>>>>> because
>>>>>>> there is
>>>>>>> no "server transport". The actual transport is stored in
>>>>>>> the
>>>>>>> 'struct
>>>>>>> rpc_rqst', and is the struct rpc_xprt corresponding to the
>>>>>>> client
>>>>>>> socket or RDMA channel.
>>>>>>>
>>>>>>> IOW: All we really need in svc_process_common() is to be
>>>>>>> able to
>>>>>>> run
>>>>>>> rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(), and that can
>>>>>>> be
>>>>>>> passed
>>>>>>> either as a pointer to the struct svc_xprt_ops itself.
>>>>>>
>>>>>> For what it's worth, I'd rather get rid of that op--it's an
>>>>>> awfully
>>>>>> roundabout way just to do "svc_putnl(resv, 0);" in the tcp
>>>>>> case.
>>>>>
>>>>> Do you mean that svc_create_xprt(serv, "tcp-bc", ...) was used
>>>>> ONLY
>>>>> to call 
>>>>> svc_tcp_prep_reply_hdr() in svc_process_common() ?
>>>>> And according call for rdma-bc does nothing useful at all? 
>>>>>
>>>>> I've just tried to remove svc_create_xprt() from xs_tcp_bc_up()
>>>>> and
>>>>> just 
>>>>> provide pointer to svc_tcp_prep_reply_hdr()
>>>>> in  svc_process_common() 
>>>>> via per-netns sunrpc_net -- and seems it was enough, my
>>>>> testcase
>>>>> worked correctly.
>>>>
>>>> I don't see how that function is related to net namespaces. As
>>>> far as I
>>>> can tell, it only signals whether or not the type of transport
>>>> uses the
>>>> TCP record marking scheme.
>>>
>>> We need to know which kind of transport is used in specified net
>>> namespace,
>>> for example init_ns can use RDMA transport and netns "second" can
>>> use 
>>> TCP transport at the same time.
>>> If you do not like an idea to use function pointer as a mark -- ok
>>> I can save only some boolean flag on sunrpc_net, check it in
>>> svc_process_common() 
>>> and if it is set -- call svc_tcp_prep_reply_hdr() directly.
> 
> I'm not against the idea of using a function pointer, but I'm saying
> that the transport is not unique per-netns. Instead, the transport is
> usually per NFS mount, but you can always retrieve a pointer to it
> directly in bc_svc_process() from req->rq_xprt. 

You're right, I was wrong because I was focused on the creation of a fake transport svc_xprt.
Yes, we cannot use a per-netns pointer here.

>> moreover, I can do not change sunrpc_net at all,
>> I can check in bc_svc_common() which transport uses incoming svc_req
>> and provide such flag as new parameter to svc_process_common().
> 
> The function or flag used by bc_svc_common() could be added to req-
>> rq_xprt->ops as another 'bc_' field and then passed to
> svc_process_common() as the parameter.

Can I just check rqstp->rq_prot? It is inherited from the incoming svc_req,
and it seems it is enough to check its proto, isn't it?

svc_process_common()
...
        /* Setup reply header */
        if (rqstp->rq_prot == IPPROTO_TCP)
                svc_tcp_prep_reply_hdr(rqstp);
Trond Myklebust Dec. 24, 2018, 9:53 a.m. UTC | #26
On Mon, 2018-12-24 at 11:59 +0300, Vasily Averin wrote:
> On 12/24/18 11:21 AM, Trond Myklebust wrote:
> > On Mon, 2018-12-24 at 09:05 +0300, Vasily Averin wrote:
> > > On 12/24/18 8:51 AM, Vasily Averin wrote:
> > > > On 12/24/18 2:56 AM, Trond Myklebust wrote:
> > > > > On Sat, 2018-12-22 at 20:46 +0300, Vasily Averin wrote:
> > > > > > On 12/21/18 4:00 AM, bfields@fieldses.org wrote:
> > > > > > > On Tue, Dec 18, 2018 at 02:55:15PM +0000, Trond Myklebust
> > > > > > > wrote:
> > > > > > > > No. We don't care about xpt_flags for the back channel
> > > > > > > > because
> > > > > > > > there is
> > > > > > > > no "server transport". The actual transport is stored
> > > > > > > > in
> > > > > > > > the
> > > > > > > > 'struct
> > > > > > > > rpc_rqst', and is the struct rpc_xprt corresponding to
> > > > > > > > the
> > > > > > > > client
> > > > > > > > socket or RDMA channel.
> > > > > > > > 
> > > > > > > > IOW: All we really need in svc_process_common() is to
> > > > > > > > be
> > > > > > > > able to
> > > > > > > > run
> > > > > > > > rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(), and that
> > > > > > > > can
> > > > > > > > be
> > > > > > > > passed
> > > > > > > > either as a pointer to the struct svc_xprt_ops itself.
> > > > > > > 
> > > > > > > For what it's worth, I'd rather get rid of that op--it's
> > > > > > > an
> > > > > > > awfully
> > > > > > > roundabout way just to do "svc_putnl(resv, 0);" in the
> > > > > > > tcp
> > > > > > > case.
> > > > > > 
> > > > > > Do you mean that svc_create_xprt(serv, "tcp-bc", ...) was
> > > > > > used
> > > > > > ONLY
> > > > > > to call 
> > > > > > svc_tcp_prep_reply_hdr() in svc_process_common() ?
> > > > > > And according call for rdma-bc does nothing useful at all? 
> > > > > > 
> > > > > > I've just tried to remove svc_create_xprt() from
> > > > > > xs_tcp_bc_up()
> > > > > > and
> > > > > > just 
> > > > > > provide pointer to svc_tcp_prep_reply_hdr()
> > > > > > in  svc_process_common() 
> > > > > > via per-netns sunrpc_net -- and seems it was enough, my
> > > > > > testcase
> > > > > > worked correctly.
> > > > > 
> > > > > I don't see how that function is related to net namespaces.
> > > > > As
> > > > > far as I
> > > > > can tell, it only signals whether or not the type of
> > > > > transport
> > > > > uses the
> > > > > TCP record marking scheme.
> > > > 
> > > > We need to know which kind of transport is used in specified
> > > > net
> > > > namespace,
> > > > for example init_ns can use RDMA transport and netns "second"
> > > > can
> > > > use 
> > > > TCP transport at the same time.
> > > > If you do not like an idea to use function pointer as a mark --
> > > > ok
> > > > I can save only some boolean flag on sunrpc_net, check it in
> > > > svc_process_common() 
> > > > and if it is set -- call svc_tcp_prep_reply_hdr() directly.
> > 
> > I'm not against the idea of using a function pointer, but I'm
> > saying
> > that the transport is not unique per-netns. Instead, the transport
> > is
> > usually per NFS mount, but you can always retrieve a pointer to it
> > directly in bc_svc_process() from req->rq_xprt. 
> 
> You're right, I was wrong because I was focused on creation of fake
> transport svc_xprt.
> Yes, we cannot use per-netns pointer here.
> 
> > > moreover, I can do not change sunrpc_net at all,
> > > I can check in bc_svc_common() which transport uses incoming
> > > svc_req
> > > and provide such flag as new parameter to svc_process_common().
> > 
> > The function or flag used by bc_svc_common() could be added to req-
> > > rq_xprt->ops as another 'bc_' field and then passed to
> > svc_process_common() as the parameter.
> 
> Can I just check rqstp->rq_prot ? It is inherited from incoming
> svc_req,
> and it seems it enough to check its propo, it isn't? 
> 
> svc_process_common()
> ...
>         /* Setup reply header */
>         if (rqstp->rq_prot == IPPROTO_TCP)
>                 svc_tcp_prep_reply_hdr(rqstp);

Yes. These days, with retpoline slowing down all indirect function
calls, the above is probably the better solution.
Vasily Averin Dec. 24, 2018, 11:48 a.m. UTC | #27
On 12/24/18 12:53 PM, Trond Myklebust wrote:
> On Mon, 2018-12-24 at 11:59 +0300, Vasily Averin wrote:
>> Can I just check rqstp->rq_prot ? It is inherited from incoming
>> svc_req,
>> and it seems it enough to check its propo, it isn't? 
>>
>> svc_process_common()
>> ...
>>         /* Setup reply header */
>>         if (rqstp->rq_prot == IPPROTO_TCP)
>>                 svc_tcp_prep_reply_hdr(rqstp);
> 
> Yes. In these days with retpoline slowing down all indirect function
> calls, then the above is probably the better solution.

I've submitted v4 patch version with these changes.

Patch
diff mbox series

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index a4ab4f8d9140..031d2843a002 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -158,6 +158,7 @@  struct rpc_xprt_ops {
 	int		(*bc_setup)(struct rpc_xprt *xprt,
 				    unsigned int min_reqs);
 	int		(*bc_up)(struct svc_serv *serv, struct net *net);
+	struct svc_xprt*(*bc_get_xprt)(struct svc_serv *serv, struct net *net);
 	size_t		(*bc_maxpayload)(struct rpc_xprt *xprt);
 	void		(*bc_free_rqst)(struct rpc_rqst *rqst);
 	void		(*bc_destroy)(struct rpc_xprt *xprt,
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index d13e05f1a990..a7264fd1b3db 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1450,16 +1450,22 @@  int
 bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 	       struct svc_rqst *rqstp)
 {
+	struct net	*net = req->rq_xprt->xprt_net;
 	struct kvec	*argv = &rqstp->rq_arg.head[0];
 	struct kvec	*resv = &rqstp->rq_res.head[0];
 	struct rpc_task *task;
+	struct svc_xprt *s_xprt;
 	int proc_error;
 	int error;
 
 	dprintk("svc: %s(%p)\n", __func__, req);
 
+	s_xprt = req->rq_xprt->ops->bc_get_xprt(serv, net);
+	if (!s_xprt)
+		goto proc_error;
+
 	/* Build the svc_rqst used by the common processing routine */
-	rqstp->rq_xprt = serv->sv_bc_xprt;
+	rqstp->rq_xprt = s_xprt;
 	rqstp->rq_xid = req->rq_xid;
 	rqstp->rq_prot = req->rq_xprt->prot;
 	rqstp->rq_server = serv;
@@ -1494,13 +1500,11 @@  bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 
 	/* Parse and execute the bc call */
 	proc_error = svc_process_common(rqstp, argv, resv);
+	svc_xprt_put(rqstp->rq_xprt);
 
 	atomic_inc(&req->rq_xprt->bc_free_slots);
-	if (!proc_error) {
-		/* Processing error: drop the request */
-		xprt_free_bc_request(req);
-		return 0;
-	}
+	if (!proc_error)
+		goto proc_error;
 
 	/* Finally, send the reply synchronously */
 	memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
@@ -1517,6 +1521,12 @@  bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 out:
 	dprintk("svc: %s(), error=%d\n", __func__, error);
 	return error;
+
+proc_error:
+	/* Processing error: drop the request */
+	xprt_free_bc_request(req);
+	error = -EINVAL;
+	goto out;
 }
 EXPORT_SYMBOL_GPL(bc_svc_process);
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index e5b367a3e517..3e06aeacda43 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -133,6 +133,11 @@  int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
 	return 0;
 }
 
+struct svc_xprt *xprt_rdma_bc_get_xprt(struct svc_serv *serv, struct net *net)
+{
+	return svc_find_xprt(serv, "rdma-bc", net, AF_UNSPEC, 0);
+}
+
 /**
  * xprt_rdma_bc_maxpayload - Return maximum backchannel message size
  * @xprt: transport
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index ae2a83828953..41d67de93531 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -828,6 +828,7 @@  static const struct rpc_xprt_ops xprt_rdma_procs = {
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	.bc_setup		= xprt_rdma_bc_setup,
 	.bc_up			= xprt_rdma_bc_up,
+	.bc_get_xprt		= xprt_rdma_bc_get_xprt,
 	.bc_maxpayload		= xprt_rdma_bc_maxpayload,
 	.bc_free_rqst		= xprt_rdma_bc_free_rqst,
 	.bc_destroy		= xprt_rdma_bc_destroy,
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index a13ccb643ce0..2726d71052a8 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -662,6 +662,7 @@  void xprt_rdma_cleanup(void);
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
 int xprt_rdma_bc_up(struct svc_serv *, struct net *);
+struct svc_xprt *xprt_rdma_bc_get_xprt(struct svc_serv *serv, struct net *net);
 size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
 int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
 void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 8a5e823e0b33..16f9c7720465 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1411,6 +1411,12 @@  static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
 	return 0;
 }
 
+static struct svc_xprt *xs_tcp_bc_get_xprt(struct svc_serv *serv,
+					   struct net *net)
+{
+	return svc_find_xprt(serv, "tcp-bc", net, AF_UNSPEC, 0);
+}
+
 static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
 {
 	return PAGE_SIZE;
@@ -2668,6 +2674,7 @@  static const struct rpc_xprt_ops xs_tcp_ops = {
 #ifdef CONFIG_SUNRPC_BACKCHANNEL
 	.bc_setup		= xprt_setup_bc,
 	.bc_up			= xs_tcp_bc_up,
+	.bc_get_xprt		= xs_tcp_bc_get_xprt,
 	.bc_maxpayload		= xs_tcp_bc_maxpayload,
 	.bc_free_rqst		= xprt_free_bc_rqst,
 	.bc_destroy		= xprt_destroy_bc,