diff mbox

nfsd: Ensure that NFSv4 always drops the connection when dropping a request

Message ID 1414161055.21776.2.camel@leira.trondhjem.org (mailing list archive)
State New, archived
Headers show

Commit Message

Trond Myklebust Oct. 24, 2014, 2:30 p.m. UTC
On Fri, 2014-10-24 at 09:34 -0400, Jeff Layton wrote:
> On Fri, 24 Oct 2014 07:26:44 -0400
> Jeff Layton <jlayton@primarydata.com> wrote:
>  
> > I guess the theory here is that the client sent the request, and got a
> > TCP ACK and then the server never responded because it dropped the
> > request? That sounds plausible and we almost certainly want something
> > like the above for v4.0.
> > 
> > If the above patch fixes Christoph's problem with v4.1 though, then I
> > think we'll need more investigation into why. I don't see a way to get
> > a dropped request with v4.1. Those mainly occur due to interaction with
> > the DRC (disabled for v4.1) or with svc_defer (disabled for all v4
> > requests).
> > 
> > Is it possible that one of the nfsd threads is just hung while
> > processing an RPC and that's why you're not getting a response?
> > 
> 
> Ok, I'm able to reproduce this too and got a network trace. I think
> Trond is correct as to the cause:
> 
> I see a SETATTR call go out and then see a TCP ACK for it a little
> later, but didn't see a reply sent for it.
> 
> I sprinkled in some printks around the "dropit" codepath above and in
> some other areas however and none of them fired. So while Trond's patch
> looks like it might be correct, I don't think it'll fix the problem I'm
> seeing.
> 
> Sniffing a little later, it seems like the server replies are being
> delayed somehow:

You can get deferrals in the authentication code which trigger drops.
Here is a variant which tries to deal with that:

8<-----------------------------------------------------
From a9be1292152af2cb651415c44fd3d91b3d017240 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 24 Oct 2014 12:54:47 +0300
Subject: [PATCH v2] nfsd: Ensure that NFSv4 always drops the connection when
 dropping a request

Both RFC3530 and RFC5661 impose a requirement on the server that it MUST NOT
drop a request unless the connection is broken. This is in order to allow
clients to assume that they can always expect an answer unless the
connection is broken.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: stable@vger.kernel.org
---
v2: Also deal with NFSv4 requests dropped due to authentication issues

 fs/nfsd/nfs4proc.c         |  6 ++++++
 include/linux/sunrpc/svc.h |  1 +
 net/sunrpc/svc.c           | 13 +++++++++----
 3 files changed, 16 insertions(+), 4 deletions(-)

Comments

Jeff Layton Oct. 24, 2014, 2:57 p.m. UTC | #1
On Fri, 24 Oct 2014 17:30:55 +0300
Trond Myklebust <trond.myklebust@primarydata.com> wrote:

> On Fri, 2014-10-24 at 09:34 -0400, Jeff Layton wrote:
> > On Fri, 24 Oct 2014 07:26:44 -0400
> > Jeff Layton <jlayton@primarydata.com> wrote:
> >  
> > > I guess the theory here is that the client sent the request, and got a
> > > TCP ACK and then the server never responded because it dropped the
> > > request? That sounds plausible and we almost certainly want something
> > > like the above for v4.0.
> > > 
> > > If the above patch fixes Christoph's problem with v4.1 though, then I
> > > think we'll need more investigation into why. I don't see a way to get
> > > a dropped request with v4.1. Those mainly occur due to interaction with
> > > the DRC (disabled for v4.1) or with svc_defer (disabled for all v4
> > > requests).
> > > 
> > > Is it possible that one of the nfsd threads is just hung while
> > > processing an RPC and that's why you're not getting a response?
> > > 
> > 
> > Ok, I'm able to reproduce this too and got a network trace. I think
> > Trond is correct as to the cause:
> > 
> > I see a SETATTR call go out and then see a TCP ACK for it a little
> > later, but didn't see a reply sent for it.
> > 
> > I sprinkled in some printks around the "dropit" codepath above and in
> > some other areas however and none of them fired. So while Trond's patch
> > looks like it might be correct, I don't think it'll fix the problem I'm
> > seeing.
> > 
> > Sniffing a little later, it seems like the server replies are being
> > delayed somehow:
> 
> You can get deferrals in the authentication code which trigger drops.
> Here is a variant which tries to deal with that:
> 
> 8<-----------------------------------------------------
> From a9be1292152af2cb651415c44fd3d91b3d017240 Mon Sep 17 00:00:00 2001
> From: Trond Myklebust <trond.myklebust@primarydata.com>
> Date: Fri, 24 Oct 2014 12:54:47 +0300
> Subject: [PATCH v2] nfsd: Ensure that NFSv4 always drops the connection when
>  dropping a request
> 
> Both RFC3530 and RFC5661 impose a requirement on the server that it MUST NOT
> drop a request unless the connection is broken. This is in order to allow
> clients to assume that they can always expect an answer unless the
> connection is broken.
> 
> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
> Cc: stable@vger.kernel.org
> ---
> v2: Also deal with NFSv4 requests dropped due to authentication issues
> 
>  fs/nfsd/nfs4proc.c         |  6 ++++++
>  include/linux/sunrpc/svc.h |  1 +
>  net/sunrpc/svc.c           | 13 +++++++++----
>  3 files changed, 16 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index cdeb3cfd6f32..500ac76662a8 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -1960,6 +1960,11 @@ static const char *nfsd4_op_name(unsigned opnum)
>  	return "unknown_operation";
>  }
>  
> +static void nfsd4_dropme(struct svc_rqst *rqstp)
> +{
> +	svc_close_xprt(rqstp->rq_xprt);
> +}
> +
>  #define nfsd4_voidres			nfsd4_voidargs
>  struct nfsd4_voidargs { int dummy; };
>  
> @@ -1989,6 +1994,7 @@ struct svc_version	nfsd_version4 = {
>  		.vs_nproc	= 2,
>  		.vs_proc	= nfsd_procedures4,
>  		.vs_dispatch	= nfsd_dispatch,
> +		.vs_dropme	= nfsd4_dropme,
>  		.vs_xdrsize	= NFS4_SVC_XDRSIZE,
>  		.vs_rpcb_optnl	= 1,
>  };
> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
> index 21678464883a..824656da1f6d 100644
> --- a/include/linux/sunrpc/svc.h
> +++ b/include/linux/sunrpc/svc.h
> @@ -400,6 +400,7 @@ struct svc_version {
>  	 * vs_dispatch == NULL means use default dispatcher.
>  	 */
>  	int			(*vs_dispatch)(struct svc_rqst *, __be32 *);
> +	void			(*vs_dropme)(struct svc_rqst *);
>  };
>  
>  /*
> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
> index ca8a7958f4e6..a178496ce2c1 100644
> --- a/net/sunrpc/svc.c
> +++ b/net/sunrpc/svc.c
> @@ -1111,9 +1111,13 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
>  	rqstp->rq_vers = vers = svc_getnl(argv);	/* version number */
>  	rqstp->rq_proc = proc = svc_getnl(argv);	/* procedure number */
>  
> -	for (progp = serv->sv_program; progp; progp = progp->pg_next)
> -		if (prog == progp->pg_prog)
> +	for (progp = serv->sv_program; progp; progp = progp->pg_next) {
> +		if (prog == progp->pg_prog) {
> +			if (vers < progp->pg_nvers)
> +				versp = progp->pg_vers[vers];
>  			break;
> +		}
> +	}
>  
>  	/*
>  	 * Decode auth data, and add verifier to reply buffer.
> @@ -1148,8 +1152,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
>  	if (progp == NULL)
>  		goto err_bad_prog;
>  
> -	if (vers >= progp->pg_nvers ||
> -	  !(versp = progp->pg_vers[vers]))
> +	if (versp == NULL)
>  		goto err_bad_vers;
>  
>  	procp = versp->vs_proc + proc;
> @@ -1228,6 +1231,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
>   dropit:
>  	svc_authorise(rqstp);	/* doesn't hurt to call this twice */
>  	dprintk("svc: svc_process dropit\n");

I don't think this will fix it either. I turned the above dprintk into
a normal printk and it never fired during the test. As best I can tell,
svc_process_common is not returning 0 when this occurs.

> +	if (versp && versp->vs_dropme)
> +		versp->vs_dropme(rqstp);
>  	return 0;
>  
>  err_short_len:
Trond Myklebust Oct. 24, 2014, 3:59 p.m. UTC | #2
On Fri, Oct 24, 2014 at 5:57 PM, Jeff Layton
<jeff.layton@primarydata.com> wrote:
>> @@ -1228,6 +1231,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
>>   dropit:
>>       svc_authorise(rqstp);   /* doesn't hurt to call this twice */
>>       dprintk("svc: svc_process dropit\n");
>
> I don't think this will fix it either. I turned the above dprintk into
> a normal printk and it never fired during the test. As best I can tell,
> svc_process_common is not returning 0 when this occurs.

OK. Is perhaps the "revisit canceled" triggering in svc_revisit()? I'm
having trouble understanding the call chain for that stuff, but it too
looks as if it can trigger some strange behaviour.
Jeff Layton Oct. 24, 2014, 5:08 p.m. UTC | #3
On Fri, 24 Oct 2014 18:59:47 +0300
Trond Myklebust <trond.myklebust@primarydata.com> wrote:

> On Fri, Oct 24, 2014 at 5:57 PM, Jeff Layton
> <jeff.layton@primarydata.com> wrote:
> >> @@ -1228,6 +1231,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
> >>   dropit:
> >>       svc_authorise(rqstp);   /* doesn't hurt to call this twice */
> >>       dprintk("svc: svc_process dropit\n");
> >
> > I don't think this will fix it either. I turned the above dprintk into
> > a normal printk and it never fired during the test. As best I can tell,
> > svc_process_common is not returning 0 when this occurs.
> 
> OK. Is perhaps the "revisit canceled" triggering in svc_revisit()? I'm
> having trouble understanding the call chain for that stuff, but it too
> looks as if it can trigger some strange behaviour.
> 

I don't think that's it either. I turned the dprintks in svc_revisit
into a printks just to be sure, and they didn't fire either.

Basically, I don't think we ever do anything in svc_defer for v4.1
requests, due to this at the top of it:

        if (rqstp->rq_arg.page_len || !rqstp->rq_usedeferral)
                return NULL; /* if more than a page, give up FIXME */

...basically rq_usedeferral should be set in most cases for v4.1
requests. It gets set when processing the compound and then unset
afterward.

That said, I suppose you could end up deferring the request if it
occurs before the pc_func gets called, but I haven't seen any evidence
of that happening so far with this test.

I do concur with Christoph that I've only been able to reproduce this
while running on the loopback interface. If I have server and client in
different VMs, then this test runs just fine. Could this be related to
the changes that Neil sent in recently to make loopback mounts work
better?

One idea might be reasonable to backport 2aca5b869ace67 to something
v3.17-ish and see whether it's still reproducible?
Jeff Layton Oct. 27, 2014, 7 p.m. UTC | #4
On Fri, 24 Oct 2014 13:08:28 -0400
Jeff Layton <jlayton@primarydata.com> wrote:

> On Fri, 24 Oct 2014 18:59:47 +0300
> Trond Myklebust <trond.myklebust@primarydata.com> wrote:
> 
> > On Fri, Oct 24, 2014 at 5:57 PM, Jeff Layton
> > <jeff.layton@primarydata.com> wrote:
> > >> @@ -1228,6 +1231,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
> > >>   dropit:
> > >>       svc_authorise(rqstp);   /* doesn't hurt to call this twice */
> > >>       dprintk("svc: svc_process dropit\n");
> > >
> > > I don't think this will fix it either. I turned the above dprintk into
> > > a normal printk and it never fired during the test. As best I can tell,
> > > svc_process_common is not returning 0 when this occurs.
> > 
> > OK. Is perhaps the "revisit canceled" triggering in svc_revisit()? I'm
> > having trouble understanding the call chain for that stuff, but it too
> > looks as if it can trigger some strange behaviour.
> > 
> 
> I don't think that's it either. I turned the dprintks in svc_revisit
> into a printks just to be sure, and they didn't fire either.
> 
> Basically, I don't think we ever do anything in svc_defer for v4.1
> requests, due to this at the top of it:
> 
>         if (rqstp->rq_arg.page_len || !rqstp->rq_usedeferral)
>                 return NULL; /* if more than a page, give up FIXME */
> 
> ...basically rq_usedeferral should be set in most cases for v4.1
> requests. It gets set when processing the compound and then unset
> afterward.
> 
> That said, I suppose you could end up deferring the request if it
> occurs before the pc_func gets called, but I haven't seen any evidence
> of that happening so far with this test.
> 
> I do concur with Christoph that I've only been able to reproduce this
> while running on the loopback interface. If I have server and client in
> different VMs, then this test runs just fine. Could this be related to
> the changes that Neil sent in recently to make loopback mounts work
> better?
> 
> One idea might be reasonable to backport 2aca5b869ace67 to something
> v3.17-ish and see whether it's still reproducible?
> 

Ok, I've made a bit more progress on this, mostly by adding a fair
number of tracepoints into the client and server request dispatching
code. I'll be posting patches to add those for eventually, but they
need a little cleanup first.

In any case, here's a typical request as it's supposed to work:

       mount.nfs-906   [002] ...1 22711.996969: xprt_transmit: xprt=0xffff8800ce961000 xid=0xa8a34513 status=0
            nfsd-678   [000] ...1 22711.997082: svc_recv: rq_xid=0xa8a34513 status=164
            nfsd-678   [000] ..s8 22711.997185: xprt_lookup_rqst: xprt=0xffff8800ce961000 xid=0xa8a34513 status=0
            nfsd-678   [000] ..s8 22711.997186: xprt_complete_rqst: xprt=0xffff8800ce961000 xid=0xa8a34513 status=140
            nfsd-678   [000] ...1 22711.997236: svc_send: rq_xid=0xa8a34513 dropme=0 status=144
            nfsd-678   [000] ...1 22711.997236: svc_process: rq_xid=0xa8a34513 dropme=0 status=144

...and here's what we see when things start hanging:

     kworker/2:2-107   [002] ...1 22741.696070: xprt_transmit: xprt=0xffff8800ce961000 xid=0xc3a84513 status=0
            nfsd-678   [002] .N.1 22741.696917: svc_recv: rq_xid=0xc3a84513 status=208
            nfsd-678   [002] ...1 22741.699890: svc_send: rq_xid=0xc3a84513 dropme=0 status=262252
            nfsd-678   [002] ...1 22741.699891: svc_process: rq_xid=0xc3a84513 dropme=0 status=262252

What's happening is that xprt_transmit is called to send the request
to the server. The server then gets it, and immediately processes it
(in less than a second) and sends the reply.

The problem is that at some point, we stop getting sk_data_ready
events on the socket, even when the server has clearly sent something
to the client. I'm unclear on why that is, but I suspect that something
is broken in the lower level socket code, possibly just in the loopback
code?

At this point, I think I'm pretty close to ready to call in the netdev
cavalry. I'll write up a synopsis of what we know so far, and send it
there (and cc it here) in the hopes that someone will have some insight.
diff mbox

Patch

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index cdeb3cfd6f32..500ac76662a8 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1960,6 +1960,11 @@  static const char *nfsd4_op_name(unsigned opnum)
 	return "unknown_operation";
 }
 
+static void nfsd4_dropme(struct svc_rqst *rqstp)
+{
+	svc_close_xprt(rqstp->rq_xprt);
+}
+
 #define nfsd4_voidres			nfsd4_voidargs
 struct nfsd4_voidargs { int dummy; };
 
@@ -1989,6 +1994,7 @@  struct svc_version	nfsd_version4 = {
 		.vs_nproc	= 2,
 		.vs_proc	= nfsd_procedures4,
 		.vs_dispatch	= nfsd_dispatch,
+		.vs_dropme	= nfsd4_dropme,
 		.vs_xdrsize	= NFS4_SVC_XDRSIZE,
 		.vs_rpcb_optnl	= 1,
 };
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 21678464883a..824656da1f6d 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -400,6 +400,7 @@  struct svc_version {
 	 * vs_dispatch == NULL means use default dispatcher.
 	 */
 	int			(*vs_dispatch)(struct svc_rqst *, __be32 *);
+	void			(*vs_dropme)(struct svc_rqst *);
 };
 
 /*
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index ca8a7958f4e6..a178496ce2c1 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1111,9 +1111,13 @@  svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
 	rqstp->rq_vers = vers = svc_getnl(argv);	/* version number */
 	rqstp->rq_proc = proc = svc_getnl(argv);	/* procedure number */
 
-	for (progp = serv->sv_program; progp; progp = progp->pg_next)
-		if (prog == progp->pg_prog)
+	for (progp = serv->sv_program; progp; progp = progp->pg_next) {
+		if (prog == progp->pg_prog) {
+			if (vers < progp->pg_nvers)
+				versp = progp->pg_vers[vers];
 			break;
+		}
+	}
 
 	/*
 	 * Decode auth data, and add verifier to reply buffer.
@@ -1148,8 +1152,7 @@  svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
 	if (progp == NULL)
 		goto err_bad_prog;
 
-	if (vers >= progp->pg_nvers ||
-	  !(versp = progp->pg_vers[vers]))
+	if (versp == NULL)
 		goto err_bad_vers;
 
 	procp = versp->vs_proc + proc;
@@ -1228,6 +1231,8 @@  svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
  dropit:
 	svc_authorise(rqstp);	/* doesn't hurt to call this twice */
 	dprintk("svc: svc_process dropit\n");
+	if (versp && versp->vs_dropme)
+		versp->vs_dropme(rqstp);
 	return 0;
 
 err_short_len: