Message ID | 20240823181423.20458-16-snitzer@kernel.org (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | nfs/nfsd: add support for localio | expand |
On Sat, 24 Aug 2024, Mike Snitzer wrote: > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > If the DS is local to this client use localio to write the data. > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > Signed-off-by: Mike Snitzer <snitzer@kernel.org> > --- > fs/nfs/flexfilelayout/flexfilelayout.c | 136 +++++++++++++++++++++- > fs/nfs/flexfilelayout/flexfilelayout.h | 2 + > fs/nfs/flexfilelayout/flexfilelayoutdev.c | 6 + > 3 files changed, 140 insertions(+), 4 deletions(-) > > diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c > index 01ee52551a63..d91b640f6c05 100644 > --- a/fs/nfs/flexfilelayout/flexfilelayout.c > +++ b/fs/nfs/flexfilelayout/flexfilelayout.c > @@ -11,6 +11,7 @@ > #include <linux/nfs_mount.h> > #include <linux/nfs_page.h> > #include <linux/module.h> > +#include <linux/file.h> > #include <linux/sched/mm.h> > > #include <linux/sunrpc/metrics.h> > @@ -162,6 +163,72 @@ decode_name(struct xdr_stream *xdr, u32 *id) > return 0; > } > > +/* > + * A dummy definition to make RCU (and non-LOCALIO compilation) happy. > + * struct nfsd_file should never be dereferenced in this file. > + */ > +struct nfsd_file { > + int undefined__; > +}; I removed this and tried building both with and without LOCALIO enabled and the compiler didn't complain. Could you tell me what to do to see the unhappiness you mention? > diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h > index f84b3fb0dddd..562e7e27a8b5 100644 > --- a/fs/nfs/flexfilelayout/flexfilelayout.h > +++ b/fs/nfs/flexfilelayout/flexfilelayout.h > @@ -82,7 +82,9 @@ struct nfs4_ff_layout_mirror { > struct nfs_fh *fh_versions; > nfs4_stateid stateid; > const struct cred __rcu *ro_cred; > + struct nfsd_file __rcu *ro_file; > const struct cred __rcu *rw_cred; > + struct nfsd_file __rcu *rw_file; What is the lifetime of a layout_mirror? Does it live for longer than a single IO request? If so we have a problem as this will pin the nfsd_file until the layout is returned. Thanks, NeilBrown
On Mon, Aug 26, 2024 at 11:39:31AM +1000, NeilBrown wrote: > On Sat, 24 Aug 2024, Mike Snitzer wrote: > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > If the DS is local to this client use localio to write the data. > > > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > > Signed-off-by: Mike Snitzer <snitzer@kernel.org> > > --- > > fs/nfs/flexfilelayout/flexfilelayout.c | 136 +++++++++++++++++++++- > > fs/nfs/flexfilelayout/flexfilelayout.h | 2 + > > fs/nfs/flexfilelayout/flexfilelayoutdev.c | 6 + > > 3 files changed, 140 insertions(+), 4 deletions(-) > > > > diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c > > index 01ee52551a63..d91b640f6c05 100644 > > --- a/fs/nfs/flexfilelayout/flexfilelayout.c > > +++ b/fs/nfs/flexfilelayout/flexfilelayout.c > > @@ -11,6 +11,7 @@ > > #include <linux/nfs_mount.h> > > #include <linux/nfs_page.h> > > #include <linux/module.h> > > +#include <linux/file.h> > > #include <linux/sched/mm.h> > > > > #include <linux/sunrpc/metrics.h> > > @@ -162,6 +163,72 @@ decode_name(struct xdr_stream *xdr, u32 *id) > > return 0; > > } > > > > +/* > > + * A dummy definition to make RCU (and non-LOCALIO compilation) happy. > > + * struct nfsd_file should never be dereferenced in this file. > > + */ > > +struct nfsd_file { > > + int undefined__; > > +}; > > I removed this and tried building both with and without LOCALIO enabled > and the compiler didn't complain. > Could you tell me what to do to see the unhappiness you mention? Sorry, I can remove the dummy definition for upstream. That was leftover from the backport I did to 5.15.y stable@ kernel. Older kernels' RCU code dereferences what should just be an opaque pointer and (ab)use typeof. So without the dummy definition compiling against 5.15.y fails with: CC [M] fs/nfs/flexfilelayout/flexfilelayout.o In file included from ./include/linux/rbtree.h:24, from ./include/linux/mm_types.h:10, from ./include/linux/mmzone.h:21, from ./include/linux/gfp.h:6, from ./include/linux/mm.h:10, from ./include/linux/nfs_fs.h:23, from fs/nfs/flexfilelayout/flexfilelayout.c:10: fs/nfs/flexfilelayout/flexfilelayout.c: In function `ff_local_open_fh´: ./include/linux/rcupdate.h:441:9: error: dereferencing pointer to incomplete type `struct nfsd_file´ typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ ^ ./include/linux/rcupdate.h:580:2: note: in expansion of macro `__rcu_dereference_check´ __rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu) ^~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/rcupdate.h:648:28: note: in expansion of macro `rcu_dereference_check´ #define rcu_dereference(p) rcu_dereference_check(p, 0) ^~~~~~~~~~~~~~~~~~~~~ fs/nfs/flexfilelayout/flexfilelayout.c:193:7: note: in expansion of macro `rcu_dereference´ nf = rcu_dereference(*pnf); ^~~~~~~~~~~~~~~ > > diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h > > index f84b3fb0dddd..562e7e27a8b5 100644 > > --- a/fs/nfs/flexfilelayout/flexfilelayout.h > > +++ b/fs/nfs/flexfilelayout/flexfilelayout.h > > @@ -82,7 +82,9 @@ struct nfs4_ff_layout_mirror { > > struct nfs_fh *fh_versions; > > nfs4_stateid stateid; > > const struct cred __rcu *ro_cred; > > + struct nfsd_file __rcu *ro_file; > > const struct cred __rcu *rw_cred; > > + struct nfsd_file __rcu *rw_file; > > What is the lifetime of a layout_mirror? Does it live for longer than a > single IO request? If so we have a problem as this will pin the > nfsd_file until the layout is returned. Ah, yeah lifetime is longer than an IO... so we have the issue of pnfs (flexfileslayout) holding nfsd_files open in the client; which will prevent backing filesystem from being unmounted. I haven't done that same unmount test (which you reported I fixed for normal NFS) against pNFS with flexfiles. Will sort it out.
On Tue, 27 Aug 2024, Mike Snitzer wrote: > On Mon, Aug 26, 2024 at 11:39:31AM +1000, NeilBrown wrote: > > On Sat, 24 Aug 2024, Mike Snitzer wrote: > > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > > > If the DS is local to this client use localio to write the data. > > > > > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > > > Signed-off-by: Mike Snitzer <snitzer@kernel.org> > > > --- > > > fs/nfs/flexfilelayout/flexfilelayout.c | 136 +++++++++++++++++++++- > > > fs/nfs/flexfilelayout/flexfilelayout.h | 2 + > > > fs/nfs/flexfilelayout/flexfilelayoutdev.c | 6 + > > > 3 files changed, 140 insertions(+), 4 deletions(-) > > > > > > diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c > > > index 01ee52551a63..d91b640f6c05 100644 > > > --- a/fs/nfs/flexfilelayout/flexfilelayout.c > > > +++ b/fs/nfs/flexfilelayout/flexfilelayout.c > > > @@ -11,6 +11,7 @@ > > > #include <linux/nfs_mount.h> > > > #include <linux/nfs_page.h> > > > #include <linux/module.h> > > > +#include <linux/file.h> > > > #include <linux/sched/mm.h> > > > > > > #include <linux/sunrpc/metrics.h> > > > @@ -162,6 +163,72 @@ decode_name(struct xdr_stream *xdr, u32 *id) > > > return 0; > > > } > > > > > > +/* > > > + * A dummy definition to make RCU (and non-LOCALIO compilation) happy. > > > + * struct nfsd_file should never be dereferenced in this file. > > > + */ > > > +struct nfsd_file { > > > + int undefined__; > > > +}; > > > > I removed this and tried building both with and without LOCALIO enabled > > and the compiler didn't complain. > > Could you tell me what to do to see the unhappiness you mention? > > Sorry, I can remove the dummy definition for upstream. That was > leftover from the backport I did to 5.15.y stable@ kernel. Older > kernels' RCU code dereferences what should just be an opaque pointer > and (ab)use typeof. So without the dummy definition compiling against > 5.15.y fails with: > > CC [M] fs/nfs/flexfilelayout/flexfilelayout.o > In file included from ./include/linux/rbtree.h:24, > from ./include/linux/mm_types.h:10, > from ./include/linux/mmzone.h:21, > from ./include/linux/gfp.h:6, > from ./include/linux/mm.h:10, > from ./include/linux/nfs_fs.h:23, > from fs/nfs/flexfilelayout/flexfilelayout.c:10: > fs/nfs/flexfilelayout/flexfilelayout.c: In function `ff_local_open_fh´: > ./include/linux/rcupdate.h:441:9: error: dereferencing pointer to incomplete type `struct nfsd_file´ > typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ > ^ > ./include/linux/rcupdate.h:580:2: note: in expansion of macro `__rcu_dereference_check´ > __rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu) > ^~~~~~~~~~~~~~~~~~~~~~~ > ./include/linux/rcupdate.h:648:28: note: in expansion of macro `rcu_dereference_check´ > #define rcu_dereference(p) rcu_dereference_check(p, 0) > ^~~~~~~~~~~~~~~~~~~~~ > fs/nfs/flexfilelayout/flexfilelayout.c:193:7: note: in expansion of macro `rcu_dereference´ > nf = rcu_dereference(*pnf); > ^~~~~~~~~~~~~~~ Ahhh - good to know. Thanks! NeilBrown
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 01ee52551a63..d91b640f6c05 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -11,6 +11,7 @@ #include <linux/nfs_mount.h> #include <linux/nfs_page.h> #include <linux/module.h> +#include <linux/file.h> #include <linux/sched/mm.h> #include <linux/sunrpc/metrics.h> @@ -162,6 +163,72 @@ decode_name(struct xdr_stream *xdr, u32 *id) return 0; } +/* + * A dummy definition to make RCU (and non-LOCALIO compilation) happy. + * struct nfsd_file should never be dereferenced in this file. + */ +struct nfsd_file { + int undefined__; +}; + +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +static struct nfsd_file * +ff_local_open_fh(struct pnfs_layout_segment *lseg, + u32 ds_idx, + struct nfs_client *clp, + const struct cred *cred, + struct nfs_fh *fh, + fmode_t mode) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); + struct nfsd_file *nf, *new, __rcu **pnf; + + if (!nfs_server_is_local(clp)) + return NULL; + if (mode & FMODE_WRITE) { + /* + * Always request read and write access since this corresponds + * to a rw layout. + */ + mode |= FMODE_READ; + pnf = &mirror->rw_file; + } else + pnf = &mirror->ro_file; + + new = NULL; + rcu_read_lock(); + nf = rcu_dereference(*pnf); + if (!nf) { + rcu_read_unlock(); + new = nfs_local_open_fh(clp, cred, fh, mode); + if (IS_ERR(new)) + return NULL; + rcu_read_lock(); + /* try to swap in the pointer */ + nf = cmpxchg(pnf, NULL, new); + if (!nf) { + nf = new; + new = NULL; + } + } + nf = nfs_to.nfsd_file_get(nf); + rcu_read_unlock(); + if (new) + nfs_to.nfsd_file_put(new); + return nf; +} + +#else +static struct nfsd_file * +ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, + struct nfs_client *clp, const struct cred *cred, + struct nfs_fh *fh, fmode_t mode) +{ + return NULL; +} +#endif /* IS_ENABLED(CONFIG_NFS_LOCALIO) */ + static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, const struct nfs4_ff_layout_mirror *m2) { @@ -237,8 +304,17 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) { - const struct cred *cred; + struct nfsd_file * __maybe_unused nf; + const struct cred *cred; +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + nf = rcu_access_pointer(mirror->ro_file); + if (nf) + nfs_to.nfsd_file_put(nf); + nf = rcu_access_pointer(mirror->rw_file); + if (nf) + nfs_to.nfsd_file_put(nf); +#endif ff_layout_remove_mirror(mirror); kfree(mirror->fh_versions); cred = rcu_access_pointer(mirror->ro_cred); @@ -514,6 +590,30 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); if (mirror != fls->mirror_array[i]) { /* swap cred ptrs so free_mirror will clean up old */ +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + if (lgr->range.iomode == IOMODE_READ) { + const struct cred __rcu *old = + xchg(&mirror->ro_cred, cred); + rcu_assign_pointer(fls->mirror_array[i]->ro_cred, old); + /* drop file if creds changed */ + if (old != cred) { + struct nfsd_file *nf = + rcu_dereference_protected(xchg(&mirror->ro_file, NULL), 1); + if (nf) + nfs_to.nfsd_file_put(nf); + } + } else { + const struct cred __rcu *old = + xchg(&mirror->rw_cred, cred); + rcu_assign_pointer(fls->mirror_array[i]->rw_cred, old); + if (old != cred) { + struct nfsd_file *nf = + rcu_dereference_protected(xchg(&mirror->rw_file, NULL), 1); + if (nf) + nfs_to.nfsd_file_put(nf); + } + } +#else if (lgr->range.iomode == IOMODE_READ) { cred = xchg(&mirror->ro_cred, cred); rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); @@ -521,6 +621,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, cred = xchg(&mirror->rw_cred, cred); rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); } +#endif /* IS_ENABLED(CONFIG_NFS_LOCALIO) */ ff_layout_free_mirror(fls->mirror_array[i]); fls->mirror_array[i] = mirror; } @@ -1756,6 +1857,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfsd_file *nf; struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; loff_t offset = hdr->args.offset; @@ -1802,11 +1904,19 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->args.offset = offset; hdr->mds_offset = offset; + /* Start IO accounting for local read */ + nf = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, + FMODE_READ); + if (nf) { + hdr->task.tk_start = ktime_get(); + ff_layout_read_record_layoutstats_start(&hdr->task, hdr); + } + /* Perform an asynchronous read to ds */ nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_read_call_ops_v3 : &ff_layout_read_call_ops_v4, - 0, RPC_TASK_SOFTCONN, NULL); + 0, RPC_TASK_SOFTCONN, nf); put_cred(ds_cred); return PNFS_ATTEMPTED; @@ -1826,6 +1936,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfsd_file *nf; struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; loff_t offset = hdr->args.offset; @@ -1870,11 +1981,19 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) */ hdr->args.offset = offset; + /* Start IO accounting for local write */ + nf = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, + FMODE_READ|FMODE_WRITE); + if (nf) { + hdr->task.tk_start = ktime_get(); + ff_layout_write_record_layoutstats_start(&hdr->task, hdr); + } + /* Perform an asynchronous write */ nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_write_call_ops_v3 : &ff_layout_write_call_ops_v4, - sync, RPC_TASK_SOFTCONN, NULL); + sync, RPC_TASK_SOFTCONN, nf); put_cred(ds_cred); return PNFS_ATTEMPTED; @@ -1908,6 +2027,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) struct pnfs_layout_segment *lseg = data->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfsd_file *nf; struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; u32 idx; @@ -1946,10 +2066,18 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) if (fh) data->args.fh = fh; + /* Start IO accounting for local commit */ + nf = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, + FMODE_READ|FMODE_WRITE); + if (nf) { + data->task.tk_start = ktime_get(); + ff_layout_commit_record_layoutstats_start(&data->task, data); + } + ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_commit_call_ops_v3 : &ff_layout_commit_call_ops_v4, - how, RPC_TASK_SOFTCONN, NULL); + how, RPC_TASK_SOFTCONN, nf); put_cred(ds_cred); return ret; out_err: diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index f84b3fb0dddd..562e7e27a8b5 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -82,7 +82,9 @@ struct nfs4_ff_layout_mirror { struct nfs_fh *fh_versions; nfs4_stateid stateid; const struct cred __rcu *ro_cred; + struct nfsd_file __rcu *ro_file; const struct cred __rcu *rw_cred; + struct nfsd_file __rcu *rw_file; refcount_t ref; spinlock_t lock; unsigned long flags; diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e028f5a0ef5f..e58bedfb1dcc 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -395,6 +395,12 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, /* connect success, check rsize/wsize limit */ if (!status) { + /* + * ds_clp is put in destroy_ds(). + * keep ds_clp even if DS is local, so that if local IO cannot + * proceed somehow, we can fall back to NFS whenever we want. + */ + nfs_local_probe(ds->ds_clp); max_payload = nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), NULL);