Message ID | 1414161055.21776.2.camel@leira.trondhjem.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, 24 Oct 2014 17:30:55 +0300 Trond Myklebust <trond.myklebust@primarydata.com> wrote: > On Fri, 2014-10-24 at 09:34 -0400, Jeff Layton wrote: > > On Fri, 24 Oct 2014 07:26:44 -0400 > > Jeff Layton <jlayton@primarydata.com> wrote: > > > > > I guess the theory here is that the client sent the request, and got a > > > TCP ACK and then the server never responded because it dropped the > > > request? That sounds plausible and we almost certainly want something > > > like the above for v4.0. > > > > > > If the above patch fixes Christoph's problem with v4.1 though, then I > > > think we'll need more investigation into why. I don't see a way to get > > > a dropped request with v4.1. Those mainly occur due to interaction with > > > the DRC (disabled for v4.1) or with svc_defer (disabled for all v4 > > > requests). > > > > > > Is it possible that one of the nfsd threads is just hung while > > > processing an RPC and that's why you're not getting a response? > > > > > > > Ok, I'm able to reproduce this too and got a network trace. I think > > Trond is correct as to the cause: > > > > I see a SETATTR call go out and then see a TCP ACK for it a little > > later, but didn't see a reply sent for it. > > > > I sprinkled in some printks around the "dropit" codepath above and in > > some other areas however and none of them fired. So while Trond's patch > > looks like it might be correct, I don't think it'll fix the problem I'm > > seeing. > > > > Sniffing a little later, it seems like the server replies are being > > delayed somehow: > > You can get deferrals in the authentication code which trigger drops. > Here is a variant which tries to deal with that: > > 8<----------------------------------------------------- > From a9be1292152af2cb651415c44fd3d91b3d017240 Mon Sep 17 00:00:00 2001 > From: Trond Myklebust <trond.myklebust@primarydata.com> > Date: Fri, 24 Oct 2014 12:54:47 +0300 > Subject: [PATCH v2] nfsd: Ensure that NFSv4 always drops the connection when > dropping a request > > Both RFC3530 and RFC5661 impose a requirement on the server that it MUST NOT > drop a request unless the connection is broken. This is in order to allow > clients to assume that they can always expect an answer unless the > connection is broken. > > Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> > Cc: stable@vger.kernel.org > --- > v2: Also deal with NFSv4 requests dropped due to authentication issues > > fs/nfsd/nfs4proc.c | 6 ++++++ > include/linux/sunrpc/svc.h | 1 + > net/sunrpc/svc.c | 13 +++++++++---- > 3 files changed, 16 insertions(+), 4 deletions(-) > > diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c > index cdeb3cfd6f32..500ac76662a8 100644 > --- a/fs/nfsd/nfs4proc.c > +++ b/fs/nfsd/nfs4proc.c > @@ -1960,6 +1960,11 @@ static const char *nfsd4_op_name(unsigned opnum) > return "unknown_operation"; > } > > +static void nfsd4_dropme(struct svc_rqst *rqstp) > +{ > + svc_close_xprt(rqstp->rq_xprt); > +} > + > #define nfsd4_voidres nfsd4_voidargs > struct nfsd4_voidargs { int dummy; }; > > @@ -1989,6 +1994,7 @@ struct svc_version nfsd_version4 = { > .vs_nproc = 2, > .vs_proc = nfsd_procedures4, > .vs_dispatch = nfsd_dispatch, > + .vs_dropme = nfsd4_dropme, > .vs_xdrsize = NFS4_SVC_XDRSIZE, > .vs_rpcb_optnl = 1, > }; > diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h > index 21678464883a..824656da1f6d 100644 > --- a/include/linux/sunrpc/svc.h > +++ b/include/linux/sunrpc/svc.h > @@ -400,6 +400,7 @@ struct svc_version { > * vs_dispatch == NULL means use default dispatcher. > */ > int (*vs_dispatch)(struct svc_rqst *, __be32 *); > + void (*vs_dropme)(struct svc_rqst *); > }; > > /* > diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c > index ca8a7958f4e6..a178496ce2c1 100644 > --- a/net/sunrpc/svc.c > +++ b/net/sunrpc/svc.c > @@ -1111,9 +1111,13 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) > rqstp->rq_vers = vers = svc_getnl(argv); /* version number */ > rqstp->rq_proc = proc = svc_getnl(argv); /* procedure number */ > > - for (progp = serv->sv_program; progp; progp = progp->pg_next) > - if (prog == progp->pg_prog) > + for (progp = serv->sv_program; progp; progp = progp->pg_next) { > + if (prog == progp->pg_prog) { > + if (vers < progp->pg_nvers) > + versp = progp->pg_vers[vers]; > break; > + } > + } > > /* > * Decode auth data, and add verifier to reply buffer. > @@ -1148,8 +1152,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) > if (progp == NULL) > goto err_bad_prog; > > - if (vers >= progp->pg_nvers || > - !(versp = progp->pg_vers[vers])) > + if (versp == NULL) > goto err_bad_vers; > > procp = versp->vs_proc + proc; > @@ -1228,6 +1231,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) > dropit: > svc_authorise(rqstp); /* doesn't hurt to call this twice */ > dprintk("svc: svc_process dropit\n"); I don't think this will fix it either. I turned the above dprintk into a normal printk and it never fired during the test. As best I can tell, svc_process_common is not returning 0 when this occurs. > + if (versp && versp->vs_dropme) > + versp->vs_dropme(rqstp); > return 0; > > err_short_len:
On Fri, Oct 24, 2014 at 5:57 PM, Jeff Layton <jeff.layton@primarydata.com> wrote: >> @@ -1228,6 +1231,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) >> dropit: >> svc_authorise(rqstp); /* doesn't hurt to call this twice */ >> dprintk("svc: svc_process dropit\n"); > > I don't think this will fix it either. I turned the above dprintk into > a normal printk and it never fired during the test. As best I can tell, > svc_process_common is not returning 0 when this occurs. OK. Is perhaps the "revisit canceled" triggering in svc_revisit()? I'm having trouble understanding the call chain for that stuff, but it too looks as if it can trigger some strange behaviour.
On Fri, 24 Oct 2014 18:59:47 +0300 Trond Myklebust <trond.myklebust@primarydata.com> wrote: > On Fri, Oct 24, 2014 at 5:57 PM, Jeff Layton > <jeff.layton@primarydata.com> wrote: > >> @@ -1228,6 +1231,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) > >> dropit: > >> svc_authorise(rqstp); /* doesn't hurt to call this twice */ > >> dprintk("svc: svc_process dropit\n"); > > > > I don't think this will fix it either. I turned the above dprintk into > > a normal printk and it never fired during the test. As best I can tell, > > svc_process_common is not returning 0 when this occurs. > > OK. Is perhaps the "revisit canceled" triggering in svc_revisit()? I'm > having trouble understanding the call chain for that stuff, but it too > looks as if it can trigger some strange behaviour. > I don't think that's it either. I turned the dprintks in svc_revisit into a printks just to be sure, and they didn't fire either. Basically, I don't think we ever do anything in svc_defer for v4.1 requests, due to this at the top of it: if (rqstp->rq_arg.page_len || !rqstp->rq_usedeferral) return NULL; /* if more than a page, give up FIXME */ ...basically rq_usedeferral should be set in most cases for v4.1 requests. It gets set when processing the compound and then unset afterward. That said, I suppose you could end up deferring the request if it occurs before the pc_func gets called, but I haven't seen any evidence of that happening so far with this test. I do concur with Christoph that I've only been able to reproduce this while running on the loopback interface. If I have server and client in different VMs, then this test runs just fine. Could this be related to the changes that Neil sent in recently to make loopback mounts work better? One idea might be reasonable to backport 2aca5b869ace67 to something v3.17-ish and see whether it's still reproducible?
On Fri, 24 Oct 2014 13:08:28 -0400 Jeff Layton <jlayton@primarydata.com> wrote: > On Fri, 24 Oct 2014 18:59:47 +0300 > Trond Myklebust <trond.myklebust@primarydata.com> wrote: > > > On Fri, Oct 24, 2014 at 5:57 PM, Jeff Layton > > <jeff.layton@primarydata.com> wrote: > > >> @@ -1228,6 +1231,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) > > >> dropit: > > >> svc_authorise(rqstp); /* doesn't hurt to call this twice */ > > >> dprintk("svc: svc_process dropit\n"); > > > > > > I don't think this will fix it either. I turned the above dprintk into > > > a normal printk and it never fired during the test. As best I can tell, > > > svc_process_common is not returning 0 when this occurs. > > > > OK. Is perhaps the "revisit canceled" triggering in svc_revisit()? I'm > > having trouble understanding the call chain for that stuff, but it too > > looks as if it can trigger some strange behaviour. > > > > I don't think that's it either. I turned the dprintks in svc_revisit > into a printks just to be sure, and they didn't fire either. > > Basically, I don't think we ever do anything in svc_defer for v4.1 > requests, due to this at the top of it: > > if (rqstp->rq_arg.page_len || !rqstp->rq_usedeferral) > return NULL; /* if more than a page, give up FIXME */ > > ...basically rq_usedeferral should be set in most cases for v4.1 > requests. It gets set when processing the compound and then unset > afterward. > > That said, I suppose you could end up deferring the request if it > occurs before the pc_func gets called, but I haven't seen any evidence > of that happening so far with this test. > > I do concur with Christoph that I've only been able to reproduce this > while running on the loopback interface. If I have server and client in > different VMs, then this test runs just fine. Could this be related to > the changes that Neil sent in recently to make loopback mounts work > better? > > One idea might be reasonable to backport 2aca5b869ace67 to something > v3.17-ish and see whether it's still reproducible? > Ok, I've made a bit more progress on this, mostly by adding a fair number of tracepoints into the client and server request dispatching code. I'll be posting patches to add those for eventually, but they need a little cleanup first. In any case, here's a typical request as it's supposed to work: mount.nfs-906 [002] ...1 22711.996969: xprt_transmit: xprt=0xffff8800ce961000 xid=0xa8a34513 status=0 nfsd-678 [000] ...1 22711.997082: svc_recv: rq_xid=0xa8a34513 status=164 nfsd-678 [000] ..s8 22711.997185: xprt_lookup_rqst: xprt=0xffff8800ce961000 xid=0xa8a34513 status=0 nfsd-678 [000] ..s8 22711.997186: xprt_complete_rqst: xprt=0xffff8800ce961000 xid=0xa8a34513 status=140 nfsd-678 [000] ...1 22711.997236: svc_send: rq_xid=0xa8a34513 dropme=0 status=144 nfsd-678 [000] ...1 22711.997236: svc_process: rq_xid=0xa8a34513 dropme=0 status=144 ...and here's what we see when things start hanging: kworker/2:2-107 [002] ...1 22741.696070: xprt_transmit: xprt=0xffff8800ce961000 xid=0xc3a84513 status=0 nfsd-678 [002] .N.1 22741.696917: svc_recv: rq_xid=0xc3a84513 status=208 nfsd-678 [002] ...1 22741.699890: svc_send: rq_xid=0xc3a84513 dropme=0 status=262252 nfsd-678 [002] ...1 22741.699891: svc_process: rq_xid=0xc3a84513 dropme=0 status=262252 What's happening is that xprt_transmit is called to send the request to the server. The server then gets it, and immediately processes it (in less than a second) and sends the reply. The problem is that at some point, we stop getting sk_data_ready events on the socket, even when the server has clearly sent something to the client. I'm unclear on why that is, but I suspect that something is broken in the lower level socket code, possibly just in the loopback code? At this point, I think I'm pretty close to ready to call in the netdev cavalry. I'll write up a synopsis of what we know so far, and send it there (and cc it here) in the hopes that someone will have some insight.
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index cdeb3cfd6f32..500ac76662a8 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1960,6 +1960,11 @@ static const char *nfsd4_op_name(unsigned opnum) return "unknown_operation"; } +static void nfsd4_dropme(struct svc_rqst *rqstp) +{ + svc_close_xprt(rqstp->rq_xprt); +} + #define nfsd4_voidres nfsd4_voidargs struct nfsd4_voidargs { int dummy; }; @@ -1989,6 +1994,7 @@ struct svc_version nfsd_version4 = { .vs_nproc = 2, .vs_proc = nfsd_procedures4, .vs_dispatch = nfsd_dispatch, + .vs_dropme = nfsd4_dropme, .vs_xdrsize = NFS4_SVC_XDRSIZE, .vs_rpcb_optnl = 1, }; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 21678464883a..824656da1f6d 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -400,6 +400,7 @@ struct svc_version { * vs_dispatch == NULL means use default dispatcher. */ int (*vs_dispatch)(struct svc_rqst *, __be32 *); + void (*vs_dropme)(struct svc_rqst *); }; /* diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index ca8a7958f4e6..a178496ce2c1 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1111,9 +1111,13 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) rqstp->rq_vers = vers = svc_getnl(argv); /* version number */ rqstp->rq_proc = proc = svc_getnl(argv); /* procedure number */ - for (progp = serv->sv_program; progp; progp = progp->pg_next) - if (prog == progp->pg_prog) + for (progp = serv->sv_program; progp; progp = progp->pg_next) { + if (prog == progp->pg_prog) { + if (vers < progp->pg_nvers) + versp = progp->pg_vers[vers]; break; + } + } /* * Decode auth data, and add verifier to reply buffer. @@ -1148,8 +1152,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) if (progp == NULL) goto err_bad_prog; - if (vers >= progp->pg_nvers || - !(versp = progp->pg_vers[vers])) + if (versp == NULL) goto err_bad_vers; procp = versp->vs_proc + proc; @@ -1228,6 +1231,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) dropit: svc_authorise(rqstp); /* doesn't hurt to call this twice */ dprintk("svc: svc_process dropit\n"); + if (versp && versp->vs_dropme) + versp->vs_dropme(rqstp); return 0; err_short_len: