Message ID | 20240831223755.8569-17-snitzer@kernel.org (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | nfs/nfsd: add support for LOCALIO | expand |
On Sun, 01 Sep 2024, Mike Snitzer wrote: > From: Weston Andros Adamson <dros@primarydata.com> > > Add server support for bypassing NFS for localhost reads, writes, and > commits. This is only useful when both the client and server are > running on the same host. > > If nfsd_open_local_fh() fails then the NFS client will both retry and > fallback to normal network-based read, write and commit operations if > localio is no longer supported. > > Care is taken to ensure the same NFS security mechanisms are used > (authentication, etc) regardless of whether localio or regular NFS > access is used. The auth_domain established as part of the traditional > NFS client access to the NFS server is also used for localio. Store > auth_domain for localio in nfsd_uuid_t and transfer it to the client > if it is local to the server. > > Relative to containers, localio gives the client access to the network > namespace the server has. This is required to allow the client to > access the server's per-namespace nfsd_net struct. > > This commit also introduces the use of NFSD's percpu_ref to interlock > nfsd_destroy_serv and nfsd_open_local_fh, to ensure nn->nfsd_serv is > not destroyed while in use by nfsd_open_local_fh and other LOCALIO > client code. > > CONFIG_NFS_LOCALIO enables NFS server support for LOCALIO. > > Signed-off-by: Weston Andros Adamson <dros@primarydata.com> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > Co-developed-by: Mike Snitzer <snitzer@kernel.org> > Signed-off-by: Mike Snitzer <snitzer@kernel.org> > Co-developed-by: NeilBrown <neilb@suse.de> > Signed-off-by: NeilBrown <neilb@suse.de> > > Not-Acked-by: Chuck Lever <chuck.lever@oracle.com> > Not-Reviewed-by: Jeff Layton <jlayton@kernel.org> > --- > fs/nfsd/Makefile | 1 + > fs/nfsd/filecache.c | 2 +- > fs/nfsd/localio.c | 112 +++++++++++++++++++++++++++++++++++++ > fs/nfsd/netns.h | 4 ++ > fs/nfsd/nfsctl.c | 25 ++++++++- > fs/nfsd/trace.h | 3 +- > fs/nfsd/vfs.h | 2 + > include/linux/nfslocalio.h | 8 +++ > 8 files changed, 154 insertions(+), 3 deletions(-) > create mode 100644 fs/nfsd/localio.c > > diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile > index b8736a82e57c..18cbd3fa7691 100644 > --- a/fs/nfsd/Makefile > +++ b/fs/nfsd/Makefile > @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o > nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o > nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o > nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o > +nfsd-$(CONFIG_NFS_LOCALIO) += localio.o > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c > index 89ff380ec31e..348c1b97092e 100644 > --- a/fs/nfsd/filecache.c > +++ b/fs/nfsd/filecache.c > @@ -52,7 +52,7 @@ > #define NFSD_FILE_CACHE_UP (0) > > /* We only care about NFSD_MAY_READ/WRITE for this cache */ > -#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) > +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO) > > static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); > static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c > new file mode 100644 > index 000000000000..75df709c6903 > --- /dev/null > +++ b/fs/nfsd/localio.c > @@ -0,0 +1,112 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * NFS server support for local clients to bypass network stack > + * > + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com> > + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com> > + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com> > + * Copyright (C) 2024 NeilBrown <neilb@suse.de> > + */ > + > +#include <linux/exportfs.h> > +#include <linux/sunrpc/svcauth.h> > +#include <linux/sunrpc/clnt.h> > +#include <linux/nfs.h> > +#include <linux/nfs_common.h> > +#include <linux/nfslocalio.h> > +#include <linux/string.h> > + > +#include "nfsd.h" > +#include "vfs.h" > +#include "netns.h" > +#include "filecache.h" > + > +static const struct nfsd_localio_operations nfsd_localio_ops = { > + .nfsd_open_local_fh = nfsd_open_local_fh, > + .nfsd_file_put_local = nfsd_file_put_local, > + .nfsd_file_file = nfsd_file_file, > +}; > + > +void nfsd_localio_ops_init(void) > +{ > + memcpy(&nfs_to, &nfsd_localio_ops, sizeof(nfsd_localio_ops)); > +} Why isn't this nfs_to = &nfsd_loclaio_ops; ?? Why do we copy all the pointers in the struct instead of just the pointer to the struct? Is this to avoid an extra dereference? If so we need an in-code comment explaining this optimisation - and why we need it while most used of _operations structures don't. > > +#if IS_ENABLED(CONFIG_NFS_LOCALIO) > +/** > + * nfsd_net_pre_exit - Disconnect localio clients from net namespace > + * @net: a network namespace that is about to be destroyed > + * > + * This invalidated ->net pointers held by localio clients > + * while they can still safely access nn->counter. > + */ > +static __net_exit void nfsd_net_pre_exit(struct net *net) > +{ > + struct nfsd_net *nn = net_generic(net, nfsd_net_id); > + > + nfs_uuid_invalidate_clients(&nn->local_clients); > +} > +#endif > + > /** > * nfsd_net_exit - Release the nfsd_net portion of a net namespace > * @net: a network namespace that is about to be destroyed > @@ -2285,6 +2304,9 @@ static __net_exit void nfsd_net_exit(struct net *net) > > static struct pernet_operations nfsd_net_ops = { > .init = nfsd_net_init, > +#if IS_ENABLED(CONFIG_NFS_LOCALIO) > + .pre_exit = nfsd_net_pre_exit, > +#endif I would rather that these #ifs were not here, but that in the NFS_LOCALIO disabled case, nfs_uuid_invalidate_clients() were an empty static inline function. I think that code of .pre_exit sometime being an empty function is not significant. Section 21 of codiing-style. Or maybe section 20, depending on release (there is a new section on "Using bool"). https://www.kernel.org/doc/html/v4.14/process/coding-style.html#conditional-compilation NeilBrown
On Sat, Aug 31, 2024 at 06:37:36PM -0400, Mike Snitzer wrote: > From: Weston Andros Adamson <dros@primarydata.com> > > Add server support for bypassing NFS for localhost reads, writes, and > commits. This is only useful when both the client and server are > running on the same host. > > If nfsd_open_local_fh() fails then the NFS client will both retry and > fallback to normal network-based read, write and commit operations if > localio is no longer supported. > > Care is taken to ensure the same NFS security mechanisms are used > (authentication, etc) regardless of whether localio or regular NFS > access is used. The auth_domain established as part of the traditional > NFS client access to the NFS server is also used for localio. Store > auth_domain for localio in nfsd_uuid_t and transfer it to the client > if it is local to the server. > > Relative to containers, localio gives the client access to the network > namespace the server has. This is required to allow the client to > access the server's per-namespace nfsd_net struct. > > This commit also introduces the use of NFSD's percpu_ref to interlock > nfsd_destroy_serv and nfsd_open_local_fh, to ensure nn->nfsd_serv is > not destroyed while in use by nfsd_open_local_fh and other LOCALIO > client code. > > CONFIG_NFS_LOCALIO enables NFS server support for LOCALIO. > > Signed-off-by: Weston Andros Adamson <dros@primarydata.com> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > Co-developed-by: Mike Snitzer <snitzer@kernel.org> > Signed-off-by: Mike Snitzer <snitzer@kernel.org> > Co-developed-by: NeilBrown <neilb@suse.de> > Signed-off-by: NeilBrown <neilb@suse.de> > > Not-Acked-by: Chuck Lever <chuck.lever@oracle.com> > Not-Reviewed-by: Jeff Layton <jlayton@kernel.org> > --- > fs/nfsd/Makefile | 1 + > fs/nfsd/filecache.c | 2 +- > fs/nfsd/localio.c | 112 +++++++++++++++++++++++++++++++++++++ > fs/nfsd/netns.h | 4 ++ > fs/nfsd/nfsctl.c | 25 ++++++++- > fs/nfsd/trace.h | 3 +- > fs/nfsd/vfs.h | 2 + > include/linux/nfslocalio.h | 8 +++ > 8 files changed, 154 insertions(+), 3 deletions(-) > create mode 100644 fs/nfsd/localio.c > > diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile > index b8736a82e57c..18cbd3fa7691 100644 > --- a/fs/nfsd/Makefile > +++ b/fs/nfsd/Makefile > @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o > nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o > nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o > nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o > +nfsd-$(CONFIG_NFS_LOCALIO) += localio.o > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c > index 89ff380ec31e..348c1b97092e 100644 > --- a/fs/nfsd/filecache.c > +++ b/fs/nfsd/filecache.c > @@ -52,7 +52,7 @@ > #define NFSD_FILE_CACHE_UP (0) > > /* We only care about NFSD_MAY_READ/WRITE for this cache */ > -#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) > +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO) > > static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); > static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c > new file mode 100644 > index 000000000000..75df709c6903 > --- /dev/null > +++ b/fs/nfsd/localio.c > @@ -0,0 +1,112 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * NFS server support for local clients to bypass network stack > + * > + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com> > + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com> > + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com> > + * Copyright (C) 2024 NeilBrown <neilb@suse.de> > + */ > + > +#include <linux/exportfs.h> > +#include <linux/sunrpc/svcauth.h> > +#include <linux/sunrpc/clnt.h> > +#include <linux/nfs.h> > +#include <linux/nfs_common.h> > +#include <linux/nfslocalio.h> > +#include <linux/string.h> > + > +#include "nfsd.h" > +#include "vfs.h" > +#include "netns.h" > +#include "filecache.h" > + > +static const struct nfsd_localio_operations nfsd_localio_ops = { > + .nfsd_open_local_fh = nfsd_open_local_fh, > + .nfsd_file_put_local = nfsd_file_put_local, > + .nfsd_file_file = nfsd_file_file, > +}; > + > +void nfsd_localio_ops_init(void) > +{ > + memcpy(&nfs_to, &nfsd_localio_ops, sizeof(nfsd_localio_ops)); > +} Same comment as Neil: this should surface a pointer to the localio_ops struct. Copying the whole set of function pointers is generally unnecessary. > + > +/** > + * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file > + * > + * @uuid: nfs_uuid_t which provides the 'struct net' to get the proper nfsd_net > + * and the 'struct auth_domain' required for LOCALIO access > + * @rpc_clnt: rpc_clnt that the client established, used for sockaddr and cred > + * @cred: cred that the client established > + * @nfs_fh: filehandle to lookup > + * @fmode: fmode_t to use for open > + * > + * This function maps a local fh to a path on a local filesystem. > + * This is useful when the nfs client has the local server mounted - it can > + * avoid all the NFS overhead with reads, writes and commits. > + * > + * On successful return, returned nfsd_file will have its nf_net member > + * set. Caller (NFS client) is responsible for calling nfsd_serv_put and > + * nfsd_file_put (via nfs_to.nfsd_file_put_local). > + */ > +struct nfsd_file * > +nfsd_open_local_fh(nfs_uuid_t *uuid, > + struct rpc_clnt *rpc_clnt, const struct cred *cred, > + const struct nfs_fh *nfs_fh, const fmode_t fmode) > + __must_hold(rcu) > +{ > + int mayflags = NFSD_MAY_LOCALIO; > + struct nfsd_net *nn = NULL; > + struct net *net; > + struct svc_cred rq_cred; > + struct svc_fh fh; > + struct nfsd_file *localio; > + __be32 beres; > + > + if (nfs_fh->size > NFS4_FHSIZE) > + return ERR_PTR(-EINVAL); > + > + /* > + * Not running in nfsd context, so must safely get reference on nfsd_serv. > + * But the server may already be shutting down, if so disallow new localio. > + * uuid->net is NOT a counted reference, but caller's rcu_read_lock() ensures > + * that if uuid->net is not NULL, then calling nfsd_serv_try_get() is safe > + * and if it succeeds we will have an implied reference to the net. > + */ > + net = rcu_dereference(uuid->net); > + if (net) > + nn = net_generic(net, nfsd_net_id); > + if (unlikely(!nn || !nfsd_serv_try_get(nn))) > + return ERR_PTR(-ENXIO); > + > + /* Drop the rcu lock for nfsd_file_acquire_local() */ > + rcu_read_unlock(); I'm struggling with the locking logistics. Caller takes the RCU read lock, this function drops the lock, then takes it again. So: - A caller might rely on the lock being held continuously, but - The API contract documented above doesn't indicate that this function drops that lock - The __must_hold(rcu) annotation doesn't indicate that this function drops that lock, IIUC Dropping and retaking the lock in here is an anti-pattern that should be avoided. I suggest we are better off in the long run if the caller does not need to take the RCU read lock, but instead, nfsd_open_local_fh takes it right here just for the rcu_dereference. OTOH, Why drop the lock before calling nfsd_file_acquire_local()? The RCU read lock can safely be taken more than once in succession. Let's rethink the locking strategy. > + > + /* nfs_fh -> svc_fh */ > + fh_init(&fh, NFS4_FHSIZE); > + fh.fh_handle.fh_size = nfs_fh->size; > + memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size); > + > + if (fmode & FMODE_READ) > + mayflags |= NFSD_MAY_READ; > + if (fmode & FMODE_WRITE) > + mayflags |= NFSD_MAY_WRITE; > + > + svcauth_map_clnt_to_svc_cred_local(rpc_clnt, cred, &rq_cred); > + > + beres = nfsd_file_acquire_local(uuid->net, &rq_cred, uuid->dom, > + &fh, mayflags, &localio); > + if (beres) { > + localio = ERR_PTR(nfs_stat_to_errno(be32_to_cpu(beres))); > + nfsd_serv_put(nn); > + } > + > + fh_put(&fh); > + if (rq_cred.cr_group_info) > + put_group_info(rq_cred.cr_group_info); > + > + rcu_read_lock(); > + return localio; > +} > +EXPORT_SYMBOL_GPL(nfsd_open_local_fh); > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h > index e2d953f21dde..0fd31188a951 100644 > --- a/fs/nfsd/netns.h > +++ b/fs/nfsd/netns.h > @@ -216,6 +216,10 @@ struct nfsd_net { > /* last time an admin-revoke happened for NFSv4.0 */ > time64_t nfs40_last_revoke; > > +#if IS_ENABLED(CONFIG_NFS_LOCALIO) > + /* Local clients to be invalidated when net is shut down */ > + struct list_head local_clients; > +#endif > }; > > /* Simple check to find out if a given net was properly initialized */ > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c > index 64c1b4d649bc..3adbc05ebaac 100644 > --- a/fs/nfsd/nfsctl.c > +++ b/fs/nfsd/nfsctl.c > @@ -18,6 +18,7 @@ > #include <linux/sunrpc/svc.h> > #include <linux/module.h> > #include <linux/fsnotify.h> > +#include <linux/nfslocalio.h> > > #include "idmap.h" > #include "nfsd.h" > @@ -2257,7 +2258,9 @@ static __net_init int nfsd_net_init(struct net *net) > get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); > seqlock_init(&nn->writeverf_lock); > nfsd_proc_stat_init(net); > - > +#if IS_ENABLED(CONFIG_NFS_LOCALIO) > + INIT_LIST_HEAD(&nn->local_clients); > +#endif > return 0; > > out_repcache_error: > @@ -2268,6 +2271,22 @@ static __net_init int nfsd_net_init(struct net *net) > return retval; > } > > +#if IS_ENABLED(CONFIG_NFS_LOCALIO) > +/** > + * nfsd_net_pre_exit - Disconnect localio clients from net namespace > + * @net: a network namespace that is about to be destroyed > + * > + * This invalidated ->net pointers held by localio clients > + * while they can still safely access nn->counter. > + */ > +static __net_exit void nfsd_net_pre_exit(struct net *net) > +{ > + struct nfsd_net *nn = net_generic(net, nfsd_net_id); > + > + nfs_uuid_invalidate_clients(&nn->local_clients); > +} > +#endif > + > /** > * nfsd_net_exit - Release the nfsd_net portion of a net namespace > * @net: a network namespace that is about to be destroyed > @@ -2285,6 +2304,9 @@ static __net_exit void nfsd_net_exit(struct net *net) > > static struct pernet_operations nfsd_net_ops = { > .init = nfsd_net_init, > +#if IS_ENABLED(CONFIG_NFS_LOCALIO) > + .pre_exit = nfsd_net_pre_exit, > +#endif > .exit = nfsd_net_exit, > .id = &nfsd_net_id, > .size = sizeof(struct nfsd_net), > @@ -2322,6 +2344,7 @@ static int __init init_nfsd(void) > retval = genl_register_family(&nfsd_nl_family); > if (retval) > goto out_free_all; > + nfsd_localio_ops_init(); > > return 0; > out_free_all: > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h > index d22027e23761..82bcefcd1f21 100644 > --- a/fs/nfsd/trace.h > +++ b/fs/nfsd/trace.h > @@ -86,7 +86,8 @@ DEFINE_NFSD_XDR_ERR_EVENT(cant_encode); > { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }, \ > { NFSD_MAY_BYPASS_GSS, "BYPASS_GSS" }, \ > { NFSD_MAY_READ_IF_EXEC, "READ_IF_EXEC" }, \ > - { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" }) > + { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" }, \ > + { NFSD_MAY_LOCALIO, "LOCALIO" }) > > TRACE_EVENT(nfsd_compound, > TP_PROTO( > diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h > index 01947561d375..3ff146522556 100644 > --- a/fs/nfsd/vfs.h > +++ b/fs/nfsd/vfs.h > @@ -33,6 +33,8 @@ > > #define NFSD_MAY_64BIT_COOKIE 0x1000 /* 64 bit readdir cookies for >= NFSv3 */ > > +#define NFSD_MAY_LOCALIO 0x2000 /* for tracing, reflects when localio used */ > + > #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) > #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) > > diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h > index 62419c4bc8f1..61f2c781dd50 100644 > --- a/include/linux/nfslocalio.h > +++ b/include/linux/nfslocalio.h > @@ -6,6 +6,8 @@ > #ifndef __LINUX_NFSLOCALIO_H > #define __LINUX_NFSLOCALIO_H > > +#if IS_ENABLED(CONFIG_NFS_LOCALIO) > + > #include <linux/module.h> > #include <linux/list.h> > #include <linux/uuid.h> > @@ -63,4 +65,10 @@ struct nfsd_localio_operations { > extern void nfsd_localio_ops_init(void); > extern struct nfsd_localio_operations nfs_to; > > +#else /* CONFIG_NFS_LOCALIO */ > +static inline void nfsd_localio_ops_init(void) > +{ > +} > +#endif /* CONFIG_NFS_LOCALIO */ > + > #endif /* __LINUX_NFSLOCALIO_H */ > -- > 2.44.0 >
On Tue, 2024-09-03 at 10:34 -0400, Chuck Lever wrote: > On Sat, Aug 31, 2024 at 06:37:36PM -0400, Mike Snitzer wrote: > > From: Weston Andros Adamson <dros@primarydata.com> > > > > Add server support for bypassing NFS for localhost reads, writes, and > > commits. This is only useful when both the client and server are > > running on the same host. > > > > If nfsd_open_local_fh() fails then the NFS client will both retry and > > fallback to normal network-based read, write and commit operations if > > localio is no longer supported. > > > > Care is taken to ensure the same NFS security mechanisms are used > > (authentication, etc) regardless of whether localio or regular NFS > > access is used. The auth_domain established as part of the traditional > > NFS client access to the NFS server is also used for localio. Store > > auth_domain for localio in nfsd_uuid_t and transfer it to the client > > if it is local to the server. > > > > Relative to containers, localio gives the client access to the network > > namespace the server has. This is required to allow the client to > > access the server's per-namespace nfsd_net struct. > > > > This commit also introduces the use of NFSD's percpu_ref to interlock > > nfsd_destroy_serv and nfsd_open_local_fh, to ensure nn->nfsd_serv is > > not destroyed while in use by nfsd_open_local_fh and other LOCALIO > > client code. > > > > CONFIG_NFS_LOCALIO enables NFS server support for LOCALIO. > > > > Signed-off-by: Weston Andros Adamson <dros@primarydata.com> > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > > Co-developed-by: Mike Snitzer <snitzer@kernel.org> > > Signed-off-by: Mike Snitzer <snitzer@kernel.org> > > Co-developed-by: NeilBrown <neilb@suse.de> > > Signed-off-by: NeilBrown <neilb@suse.de> > > > > Not-Acked-by: Chuck Lever <chuck.lever@oracle.com> > > Not-Reviewed-by: Jeff Layton <jlayton@kernel.org> > > --- > > fs/nfsd/Makefile | 1 + > > fs/nfsd/filecache.c | 2 +- > > fs/nfsd/localio.c | 112 +++++++++++++++++++++++++++++++++++++ > > fs/nfsd/netns.h | 4 ++ > > fs/nfsd/nfsctl.c | 25 ++++++++- > > fs/nfsd/trace.h | 3 +- > > fs/nfsd/vfs.h | 2 + > > include/linux/nfslocalio.h | 8 +++ > > 8 files changed, 154 insertions(+), 3 deletions(-) > > create mode 100644 fs/nfsd/localio.c > > > > diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile > > index b8736a82e57c..18cbd3fa7691 100644 > > --- a/fs/nfsd/Makefile > > +++ b/fs/nfsd/Makefile > > @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o > > nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o > > nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o > > nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o > > +nfsd-$(CONFIG_NFS_LOCALIO) += localio.o > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c > > index 89ff380ec31e..348c1b97092e 100644 > > --- a/fs/nfsd/filecache.c > > +++ b/fs/nfsd/filecache.c > > @@ -52,7 +52,7 @@ > > #define NFSD_FILE_CACHE_UP (0) > > > > /* We only care about NFSD_MAY_READ/WRITE for this cache */ > > -#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) > > +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO) > > > > static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); > > static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); > > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c > > new file mode 100644 > > index 000000000000..75df709c6903 > > --- /dev/null > > +++ b/fs/nfsd/localio.c > > @@ -0,0 +1,112 @@ > > +// SPDX-License-Identifier: GPL-2.0-only > > +/* > > + * NFS server support for local clients to bypass network stack > > + * > > + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com> > > + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com> > > + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com> > > + * Copyright (C) 2024 NeilBrown <neilb@suse.de> > > + */ > > + > > +#include <linux/exportfs.h> > > +#include <linux/sunrpc/svcauth.h> > > +#include <linux/sunrpc/clnt.h> > > +#include <linux/nfs.h> > > +#include <linux/nfs_common.h> > > +#include <linux/nfslocalio.h> > > +#include <linux/string.h> > > + > > +#include "nfsd.h" > > +#include "vfs.h" > > +#include "netns.h" > > +#include "filecache.h" > > + > > +static const struct nfsd_localio_operations nfsd_localio_ops = { > > + .nfsd_open_local_fh = nfsd_open_local_fh, > > + .nfsd_file_put_local = nfsd_file_put_local, > > + .nfsd_file_file = nfsd_file_file, > > +}; > > + > > +void nfsd_localio_ops_init(void) > > +{ > > + memcpy(&nfs_to, &nfsd_localio_ops, sizeof(nfsd_localio_ops)); > > +} > > Same comment as Neil: this should surface a pointer to the > localio_ops struct. Copying the whole set of function pointers is > generally unnecessary. > > > > + > > +/** > > + * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file > > + * > > + * @uuid: nfs_uuid_t which provides the 'struct net' to get the proper nfsd_net > > + * and the 'struct auth_domain' required for LOCALIO access > > + * @rpc_clnt: rpc_clnt that the client established, used for sockaddr and cred > > + * @cred: cred that the client established > > + * @nfs_fh: filehandle to lookup > > + * @fmode: fmode_t to use for open > > + * > > + * This function maps a local fh to a path on a local filesystem. > > + * This is useful when the nfs client has the local server mounted - it can > > + * avoid all the NFS overhead with reads, writes and commits. > > + * > > + * On successful return, returned nfsd_file will have its nf_net member > > + * set. Caller (NFS client) is responsible for calling nfsd_serv_put and > > + * nfsd_file_put (via nfs_to.nfsd_file_put_local). > > + */ > > +struct nfsd_file * > > +nfsd_open_local_fh(nfs_uuid_t *uuid, > > + struct rpc_clnt *rpc_clnt, const struct cred *cred, > > + const struct nfs_fh *nfs_fh, const fmode_t fmode) > > + __must_hold(rcu) > > +{ > > + int mayflags = NFSD_MAY_LOCALIO; > > + struct nfsd_net *nn = NULL; > > + struct net *net; > > + struct svc_cred rq_cred; > > + struct svc_fh fh; > > + struct nfsd_file *localio; > > + __be32 beres; > > + > > + if (nfs_fh->size > NFS4_FHSIZE) > > + return ERR_PTR(-EINVAL); > > + > > + /* > > + * Not running in nfsd context, so must safely get reference on nfsd_serv. > > + * But the server may already be shutting down, if so disallow new localio. > > + * uuid->net is NOT a counted reference, but caller's rcu_read_lock() ensures > > + * that if uuid->net is not NULL, then calling nfsd_serv_try_get() is safe > > + * and if it succeeds we will have an implied reference to the net. > > + */ > > + net = rcu_dereference(uuid->net); > > + if (net) > > + nn = net_generic(net, nfsd_net_id); > > + if (unlikely(!nn || !nfsd_serv_try_get(nn))) > > + return ERR_PTR(-ENXIO); > > + > > + /* Drop the rcu lock for nfsd_file_acquire_local() */ > > + rcu_read_unlock(); > > I'm struggling with the locking logistics. Caller takes the RCU read > lock, this function drops the lock, then takes it again. So: > > - A caller might rely on the lock being held continuously, but > - The API contract documented above doesn't indicate that this > function drops that lock > - The __must_hold(rcu) annotation doesn't indicate that this > function drops that lock, IIUC > > Dropping and retaking the lock in here is an anti-pattern that > should be avoided. I suggest we are better off in the long run if > the caller does not need to take the RCU read lock, but instead, > nfsd_open_local_fh takes it right here just for the rcu_dereference. > > OTOH, Why drop the lock before calling nfsd_file_acquire_local()? > The RCU read lock can safely be taken more than once in succession. > > Let's rethink the locking strategy. > Agreed. The only caller does this: rcu_read_lock(); if (!rcu_access_pointer(uuid->net)) { rcu_read_unlock(); return ERR_PTR(-ENXIO); } localio = nfs_to.nfsd_open_local_fh(uuid, rpc_clnt, cred, nfs_fh, fmode); rcu_read_unlock(); Maybe just move the check for uuid->net down into nfsd_open_local_fh, and it can acquire the rcu_read_lock for itself? > > > + > > + /* nfs_fh -> svc_fh */ > > + fh_init(&fh, NFS4_FHSIZE); > > + fh.fh_handle.fh_size = nfs_fh->size; > > + memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size); > > + > > + if (fmode & FMODE_READ) > > + mayflags |= NFSD_MAY_READ; > > + if (fmode & FMODE_WRITE) > > + mayflags |= NFSD_MAY_WRITE; > > + > > + svcauth_map_clnt_to_svc_cred_local(rpc_clnt, cred, &rq_cred); > > + > > + beres = nfsd_file_acquire_local(uuid->net, &rq_cred, uuid->dom, > > + &fh, mayflags, &localio); > > + if (beres) { > > + localio = ERR_PTR(nfs_stat_to_errno(be32_to_cpu(beres))); > > + nfsd_serv_put(nn); > > + } > > + > > + fh_put(&fh); > > + if (rq_cred.cr_group_info) > > + put_group_info(rq_cred.cr_group_info); > > + > > + rcu_read_lock(); > > + return localio; > > +} > > +EXPORT_SYMBOL_GPL(nfsd_open_local_fh); > > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h > > index e2d953f21dde..0fd31188a951 100644 > > --- a/fs/nfsd/netns.h > > +++ b/fs/nfsd/netns.h > > @@ -216,6 +216,10 @@ struct nfsd_net { > > /* last time an admin-revoke happened for NFSv4.0 */ > > time64_t nfs40_last_revoke; > > > > +#if IS_ENABLED(CONFIG_NFS_LOCALIO) > > + /* Local clients to be invalidated when net is shut down */ > > + struct list_head local_clients; > > +#endif > > }; > > > > /* Simple check to find out if a given net was properly initialized */ > > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c > > index 64c1b4d649bc..3adbc05ebaac 100644 > > --- a/fs/nfsd/nfsctl.c > > +++ b/fs/nfsd/nfsctl.c > > @@ -18,6 +18,7 @@ > > #include <linux/sunrpc/svc.h> > > #include <linux/module.h> > > #include <linux/fsnotify.h> > > +#include <linux/nfslocalio.h> > > > > #include "idmap.h" > > #include "nfsd.h" > > @@ -2257,7 +2258,9 @@ static __net_init int nfsd_net_init(struct net *net) > > get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); > > seqlock_init(&nn->writeverf_lock); > > nfsd_proc_stat_init(net); > > - > > +#if IS_ENABLED(CONFIG_NFS_LOCALIO) > > + INIT_LIST_HEAD(&nn->local_clients); > > +#endif > > return 0; > > > > out_repcache_error: > > @@ -2268,6 +2271,22 @@ static __net_init int nfsd_net_init(struct net *net) > > return retval; > > } > > > > +#if IS_ENABLED(CONFIG_NFS_LOCALIO) > > +/** > > + * nfsd_net_pre_exit - Disconnect localio clients from net namespace > > + * @net: a network namespace that is about to be destroyed > > + * > > + * This invalidated ->net pointers held by localio clients > > + * while they can still safely access nn->counter. > > + */ > > +static __net_exit void nfsd_net_pre_exit(struct net *net) > > +{ > > + struct nfsd_net *nn = net_generic(net, nfsd_net_id); > > + > > + nfs_uuid_invalidate_clients(&nn->local_clients); > > +} > > +#endif > > + > > /** > > * nfsd_net_exit - Release the nfsd_net portion of a net namespace > > * @net: a network namespace that is about to be destroyed > > @@ -2285,6 +2304,9 @@ static __net_exit void nfsd_net_exit(struct net *net) > > > > static struct pernet_operations nfsd_net_ops = { > > .init = nfsd_net_init, > > +#if IS_ENABLED(CONFIG_NFS_LOCALIO) > > + .pre_exit = nfsd_net_pre_exit, > > +#endif > > .exit = nfsd_net_exit, > > .id = &nfsd_net_id, > > .size = sizeof(struct nfsd_net), > > @@ -2322,6 +2344,7 @@ static int __init init_nfsd(void) > > retval = genl_register_family(&nfsd_nl_family); > > if (retval) > > goto out_free_all; > > + nfsd_localio_ops_init(); > > > > return 0; > > out_free_all: > > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h > > index d22027e23761..82bcefcd1f21 100644 > > --- a/fs/nfsd/trace.h > > +++ b/fs/nfsd/trace.h > > @@ -86,7 +86,8 @@ DEFINE_NFSD_XDR_ERR_EVENT(cant_encode); > > { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }, \ > > { NFSD_MAY_BYPASS_GSS, "BYPASS_GSS" }, \ > > { NFSD_MAY_READ_IF_EXEC, "READ_IF_EXEC" }, \ > > - { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" }) > > + { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" }, \ > > + { NFSD_MAY_LOCALIO, "LOCALIO" }) > > > > TRACE_EVENT(nfsd_compound, > > TP_PROTO( > > diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h > > index 01947561d375..3ff146522556 100644 > > --- a/fs/nfsd/vfs.h > > +++ b/fs/nfsd/vfs.h > > @@ -33,6 +33,8 @@ > > > > #define NFSD_MAY_64BIT_COOKIE 0x1000 /* 64 bit readdir cookies for >= NFSv3 */ > > > > +#define NFSD_MAY_LOCALIO 0x2000 /* for tracing, reflects when localio used */ > > + > > #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) > > #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) > > > > diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h > > index 62419c4bc8f1..61f2c781dd50 100644 > > --- a/include/linux/nfslocalio.h > > +++ b/include/linux/nfslocalio.h > > @@ -6,6 +6,8 @@ > > #ifndef __LINUX_NFSLOCALIO_H > > #define __LINUX_NFSLOCALIO_H > > > > +#if IS_ENABLED(CONFIG_NFS_LOCALIO) > > + > > #include <linux/module.h> > > #include <linux/list.h> > > #include <linux/uuid.h> > > @@ -63,4 +65,10 @@ struct nfsd_localio_operations { > > extern void nfsd_localio_ops_init(void); > > extern struct nfsd_localio_operations nfs_to; > > > > +#else /* CONFIG_NFS_LOCALIO */ > > +static inline void nfsd_localio_ops_init(void) > > +{ > > +} > > +#endif /* CONFIG_NFS_LOCALIO */ > > + > > #endif /* __LINUX_NFSLOCALIO_H */ > > -- > > 2.44.0 > > >
On Tue, Sep 03, 2024 at 10:40:28AM -0400, Jeff Layton wrote: > On Tue, 2024-09-03 at 10:34 -0400, Chuck Lever wrote: > > On Sat, Aug 31, 2024 at 06:37:36PM -0400, Mike Snitzer wrote: > > > From: Weston Andros Adamson <dros@primarydata.com> > > > > > > Add server support for bypassing NFS for localhost reads, writes, and > > > commits. This is only useful when both the client and server are > > > running on the same host. > > > > > > If nfsd_open_local_fh() fails then the NFS client will both retry and > > > fallback to normal network-based read, write and commit operations if > > > localio is no longer supported. > > > > > > Care is taken to ensure the same NFS security mechanisms are used > > > (authentication, etc) regardless of whether localio or regular NFS > > > access is used. The auth_domain established as part of the traditional > > > NFS client access to the NFS server is also used for localio. Store > > > auth_domain for localio in nfsd_uuid_t and transfer it to the client > > > if it is local to the server. > > > > > > Relative to containers, localio gives the client access to the network > > > namespace the server has. This is required to allow the client to > > > access the server's per-namespace nfsd_net struct. > > > > > > This commit also introduces the use of NFSD's percpu_ref to interlock > > > nfsd_destroy_serv and nfsd_open_local_fh, to ensure nn->nfsd_serv is > > > not destroyed while in use by nfsd_open_local_fh and other LOCALIO > > > client code. > > > > > > CONFIG_NFS_LOCALIO enables NFS server support for LOCALIO. > > > > > > Signed-off-by: Weston Andros Adamson <dros@primarydata.com> > > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > > > Co-developed-by: Mike Snitzer <snitzer@kernel.org> > > > Signed-off-by: Mike Snitzer <snitzer@kernel.org> > > > Co-developed-by: NeilBrown <neilb@suse.de> > > > Signed-off-by: NeilBrown <neilb@suse.de> > > > > > > Not-Acked-by: Chuck Lever <chuck.lever@oracle.com> > > > Not-Reviewed-by: Jeff Layton <jlayton@kernel.org> > > > --- > > > fs/nfsd/Makefile | 1 + > > > fs/nfsd/filecache.c | 2 +- > > > fs/nfsd/localio.c | 112 +++++++++++++++++++++++++++++++++++++ > > > fs/nfsd/netns.h | 4 ++ > > > fs/nfsd/nfsctl.c | 25 ++++++++- > > > fs/nfsd/trace.h | 3 +- > > > fs/nfsd/vfs.h | 2 + > > > include/linux/nfslocalio.h | 8 +++ > > > 8 files changed, 154 insertions(+), 3 deletions(-) > > > create mode 100644 fs/nfsd/localio.c > > > > > > diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile > > > index b8736a82e57c..18cbd3fa7691 100644 > > > --- a/fs/nfsd/Makefile > > > +++ b/fs/nfsd/Makefile > > > @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o > > > nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o > > > nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o > > > nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o > > > +nfsd-$(CONFIG_NFS_LOCALIO) += localio.o > > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c > > > index 89ff380ec31e..348c1b97092e 100644 > > > --- a/fs/nfsd/filecache.c > > > +++ b/fs/nfsd/filecache.c > > > @@ -52,7 +52,7 @@ > > > #define NFSD_FILE_CACHE_UP (0) > > > > > > /* We only care about NFSD_MAY_READ/WRITE for this cache */ > > > -#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) > > > +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO) > > > > > > static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); > > > static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); > > > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c > > > new file mode 100644 > > > index 000000000000..75df709c6903 > > > --- /dev/null > > > +++ b/fs/nfsd/localio.c > > > @@ -0,0 +1,112 @@ > > > +// SPDX-License-Identifier: GPL-2.0-only > > > +/* > > > + * NFS server support for local clients to bypass network stack > > > + * > > > + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com> > > > + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com> > > > + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com> > > > + * Copyright (C) 2024 NeilBrown <neilb@suse.de> > > > + */ > > > + > > > +#include <linux/exportfs.h> > > > +#include <linux/sunrpc/svcauth.h> > > > +#include <linux/sunrpc/clnt.h> > > > +#include <linux/nfs.h> > > > +#include <linux/nfs_common.h> > > > +#include <linux/nfslocalio.h> > > > +#include <linux/string.h> > > > + > > > +#include "nfsd.h" > > > +#include "vfs.h" > > > +#include "netns.h" > > > +#include "filecache.h" > > > + > > > +static const struct nfsd_localio_operations nfsd_localio_ops = { > > > + .nfsd_open_local_fh = nfsd_open_local_fh, > > > + .nfsd_file_put_local = nfsd_file_put_local, > > > + .nfsd_file_file = nfsd_file_file, > > > +}; > > > + > > > +void nfsd_localio_ops_init(void) > > > +{ > > > + memcpy(&nfs_to, &nfsd_localio_ops, sizeof(nfsd_localio_ops)); > > > +} > > > > Same comment as Neil: this should surface a pointer to the > > localio_ops struct. Copying the whole set of function pointers is > > generally unnecessary. > > > > > > > + > > > +/** > > > + * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file > > > + * > > > + * @uuid: nfs_uuid_t which provides the 'struct net' to get the proper nfsd_net > > > + * and the 'struct auth_domain' required for LOCALIO access > > > + * @rpc_clnt: rpc_clnt that the client established, used for sockaddr and cred > > > + * @cred: cred that the client established > > > + * @nfs_fh: filehandle to lookup > > > + * @fmode: fmode_t to use for open > > > + * > > > + * This function maps a local fh to a path on a local filesystem. > > > + * This is useful when the nfs client has the local server mounted - it can > > > + * avoid all the NFS overhead with reads, writes and commits. > > > + * > > > + * On successful return, returned nfsd_file will have its nf_net member > > > + * set. Caller (NFS client) is responsible for calling nfsd_serv_put and > > > + * nfsd_file_put (via nfs_to.nfsd_file_put_local). > > > + */ > > > +struct nfsd_file * > > > +nfsd_open_local_fh(nfs_uuid_t *uuid, > > > + struct rpc_clnt *rpc_clnt, const struct cred *cred, > > > + const struct nfs_fh *nfs_fh, const fmode_t fmode) > > > + __must_hold(rcu) > > > +{ > > > + int mayflags = NFSD_MAY_LOCALIO; > > > + struct nfsd_net *nn = NULL; > > > + struct net *net; > > > + struct svc_cred rq_cred; > > > + struct svc_fh fh; > > > + struct nfsd_file *localio; > > > + __be32 beres; > > > + > > > + if (nfs_fh->size > NFS4_FHSIZE) > > > + return ERR_PTR(-EINVAL); > > > + > > > + /* > > > + * Not running in nfsd context, so must safely get reference on nfsd_serv. > > > + * But the server may already be shutting down, if so disallow new localio. > > > + * uuid->net is NOT a counted reference, but caller's rcu_read_lock() ensures > > > + * that if uuid->net is not NULL, then calling nfsd_serv_try_get() is safe > > > + * and if it succeeds we will have an implied reference to the net. > > > + */ > > > + net = rcu_dereference(uuid->net); > > > + if (net) > > > + nn = net_generic(net, nfsd_net_id); > > > + if (unlikely(!nn || !nfsd_serv_try_get(nn))) > > > + return ERR_PTR(-ENXIO); > > > + > > > + /* Drop the rcu lock for nfsd_file_acquire_local() */ > > > + rcu_read_unlock(); > > > > I'm struggling with the locking logistics. Caller takes the RCU read > > lock, this function drops the lock, then takes it again. So: > > > > - A caller might rely on the lock being held continuously, but > > - The API contract documented above doesn't indicate that this > > function drops that lock > > - The __must_hold(rcu) annotation doesn't indicate that this > > function drops that lock, IIUC > > > > Dropping and retaking the lock in here is an anti-pattern that > > should be avoided. I suggest we are better off in the long run if > > the caller does not need to take the RCU read lock, but instead, > > nfsd_open_local_fh takes it right here just for the rcu_dereference. I thought so too when I first saw how Neil approached fixing this to be safe. It was only after putting further time to it (and having the benefit of being so close to all this) that I realized the nuance at play (please see my reply to Jeff below for the nuance I'm speaking of). > > > > OTOH, Why drop the lock before calling nfsd_file_acquire_local()? > > The RCU read lock can safely be taken more than once in succession. > > > > Let's rethink the locking strategy. > > Yes, _that_ is a very valid point. I did wonder the same: it seems perfectly fine to simply retain the RCU throughout the entirety of nfsd_open_local_fh(). > Agreed. The only caller does this: > > rcu_read_lock(); > if (!rcu_access_pointer(uuid->net)) { > rcu_read_unlock(); > return ERR_PTR(-ENXIO); > } > localio = nfs_to.nfsd_open_local_fh(uuid, rpc_clnt, cred, > nfs_fh, fmode); > rcu_read_unlock(); > > Maybe just move the check for uuid->net down into nfsd_open_local_fh, > and it can acquire the rcu_read_lock for itself? No, sorry we cannot. The call to nfs_to.nfsd_open_local_fh (which is a symbol provided by nfsd) is only safe if the RCU protected pre-check shows the uuid->net valid. Mike
On Tue, 2024-09-03 at 11:00 -0400, Mike Snitzer wrote: > On Tue, Sep 03, 2024 at 10:40:28AM -0400, Jeff Layton wrote: > > On Tue, 2024-09-03 at 10:34 -0400, Chuck Lever wrote: > > > On Sat, Aug 31, 2024 at 06:37:36PM -0400, Mike Snitzer wrote: > > > > From: Weston Andros Adamson <dros@primarydata.com> > > > > > > > > Add server support for bypassing NFS for localhost reads, writes, and > > > > commits. This is only useful when both the client and server are > > > > running on the same host. > > > > > > > > If nfsd_open_local_fh() fails then the NFS client will both retry and > > > > fallback to normal network-based read, write and commit operations if > > > > localio is no longer supported. > > > > > > > > Care is taken to ensure the same NFS security mechanisms are used > > > > (authentication, etc) regardless of whether localio or regular NFS > > > > access is used. The auth_domain established as part of the traditional > > > > NFS client access to the NFS server is also used for localio. Store > > > > auth_domain for localio in nfsd_uuid_t and transfer it to the client > > > > if it is local to the server. > > > > > > > > Relative to containers, localio gives the client access to the network > > > > namespace the server has. This is required to allow the client to > > > > access the server's per-namespace nfsd_net struct. > > > > > > > > This commit also introduces the use of NFSD's percpu_ref to interlock > > > > nfsd_destroy_serv and nfsd_open_local_fh, to ensure nn->nfsd_serv is > > > > not destroyed while in use by nfsd_open_local_fh and other LOCALIO > > > > client code. > > > > > > > > CONFIG_NFS_LOCALIO enables NFS server support for LOCALIO. > > > > > > > > Signed-off-by: Weston Andros Adamson <dros@primarydata.com> > > > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > Co-developed-by: Mike Snitzer <snitzer@kernel.org> > > > > Signed-off-by: Mike Snitzer <snitzer@kernel.org> > > > > Co-developed-by: NeilBrown <neilb@suse.de> > > > > Signed-off-by: NeilBrown <neilb@suse.de> > > > > > > > > Not-Acked-by: Chuck Lever <chuck.lever@oracle.com> > > > > Not-Reviewed-by: Jeff Layton <jlayton@kernel.org> > > > > --- > > > > fs/nfsd/Makefile | 1 + > > > > fs/nfsd/filecache.c | 2 +- > > > > fs/nfsd/localio.c | 112 +++++++++++++++++++++++++++++++++++++ > > > > fs/nfsd/netns.h | 4 ++ > > > > fs/nfsd/nfsctl.c | 25 ++++++++- > > > > fs/nfsd/trace.h | 3 +- > > > > fs/nfsd/vfs.h | 2 + > > > > include/linux/nfslocalio.h | 8 +++ > > > > 8 files changed, 154 insertions(+), 3 deletions(-) > > > > create mode 100644 fs/nfsd/localio.c > > > > > > > > diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile > > > > index b8736a82e57c..18cbd3fa7691 100644 > > > > --- a/fs/nfsd/Makefile > > > > +++ b/fs/nfsd/Makefile > > > > @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o > > > > nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o > > > > nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o > > > > nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o > > > > +nfsd-$(CONFIG_NFS_LOCALIO) += localio.o > > > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c > > > > index 89ff380ec31e..348c1b97092e 100644 > > > > --- a/fs/nfsd/filecache.c > > > > +++ b/fs/nfsd/filecache.c > > > > @@ -52,7 +52,7 @@ > > > > #define NFSD_FILE_CACHE_UP (0) > > > > > > > > /* We only care about NFSD_MAY_READ/WRITE for this cache */ > > > > -#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) > > > > +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO) > > > > > > > > static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); > > > > static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); > > > > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c > > > > new file mode 100644 > > > > index 000000000000..75df709c6903 > > > > --- /dev/null > > > > +++ b/fs/nfsd/localio.c > > > > @@ -0,0 +1,112 @@ > > > > +// SPDX-License-Identifier: GPL-2.0-only > > > > +/* > > > > + * NFS server support for local clients to bypass network stack > > > > + * > > > > + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com> > > > > + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com> > > > > + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com> > > > > + * Copyright (C) 2024 NeilBrown <neilb@suse.de> > > > > + */ > > > > + > > > > +#include <linux/exportfs.h> > > > > +#include <linux/sunrpc/svcauth.h> > > > > +#include <linux/sunrpc/clnt.h> > > > > +#include <linux/nfs.h> > > > > +#include <linux/nfs_common.h> > > > > +#include <linux/nfslocalio.h> > > > > +#include <linux/string.h> > > > > + > > > > +#include "nfsd.h" > > > > +#include "vfs.h" > > > > +#include "netns.h" > > > > +#include "filecache.h" > > > > + > > > > +static const struct nfsd_localio_operations nfsd_localio_ops = { > > > > + .nfsd_open_local_fh = nfsd_open_local_fh, > > > > + .nfsd_file_put_local = nfsd_file_put_local, > > > > + .nfsd_file_file = nfsd_file_file, > > > > +}; > > > > + > > > > +void nfsd_localio_ops_init(void) > > > > +{ > > > > + memcpy(&nfs_to, &nfsd_localio_ops, sizeof(nfsd_localio_ops)); > > > > +} > > > > > > Same comment as Neil: this should surface a pointer to the > > > localio_ops struct. Copying the whole set of function pointers is > > > generally unnecessary. > > > > > > > > > > + > > > > +/** > > > > + * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file > > > > + * > > > > + * @uuid: nfs_uuid_t which provides the 'struct net' to get the proper nfsd_net > > > > + * and the 'struct auth_domain' required for LOCALIO access > > > > + * @rpc_clnt: rpc_clnt that the client established, used for sockaddr and cred > > > > + * @cred: cred that the client established > > > > + * @nfs_fh: filehandle to lookup > > > > + * @fmode: fmode_t to use for open > > > > + * > > > > + * This function maps a local fh to a path on a local filesystem. > > > > + * This is useful when the nfs client has the local server mounted - it can > > > > + * avoid all the NFS overhead with reads, writes and commits. > > > > + * > > > > + * On successful return, returned nfsd_file will have its nf_net member > > > > + * set. Caller (NFS client) is responsible for calling nfsd_serv_put and > > > > + * nfsd_file_put (via nfs_to.nfsd_file_put_local). > > > > + */ > > > > +struct nfsd_file * > > > > +nfsd_open_local_fh(nfs_uuid_t *uuid, > > > > + struct rpc_clnt *rpc_clnt, const struct cred *cred, > > > > + const struct nfs_fh *nfs_fh, const fmode_t fmode) > > > > + __must_hold(rcu) > > > > +{ > > > > + int mayflags = NFSD_MAY_LOCALIO; > > > > + struct nfsd_net *nn = NULL; > > > > + struct net *net; > > > > + struct svc_cred rq_cred; > > > > + struct svc_fh fh; > > > > + struct nfsd_file *localio; > > > > + __be32 beres; > > > > + > > > > + if (nfs_fh->size > NFS4_FHSIZE) > > > > + return ERR_PTR(-EINVAL); > > > > + > > > > + /* > > > > + * Not running in nfsd context, so must safely get reference on nfsd_serv. > > > > + * But the server may already be shutting down, if so disallow new localio. > > > > + * uuid->net is NOT a counted reference, but caller's rcu_read_lock() ensures > > > > + * that if uuid->net is not NULL, then calling nfsd_serv_try_get() is safe > > > > + * and if it succeeds we will have an implied reference to the net. > > > > + */ > > > > + net = rcu_dereference(uuid->net); > > > > + if (net) > > > > + nn = net_generic(net, nfsd_net_id); > > > > + if (unlikely(!nn || !nfsd_serv_try_get(nn))) > > > > + return ERR_PTR(-ENXIO); > > > > + > > > > + /* Drop the rcu lock for nfsd_file_acquire_local() */ > > > > + rcu_read_unlock(); > > > > > > I'm struggling with the locking logistics. Caller takes the RCU read > > > lock, this function drops the lock, then takes it again. So: > > > > > > - A caller might rely on the lock being held continuously, but > > > - The API contract documented above doesn't indicate that this > > > function drops that lock > > > - The __must_hold(rcu) annotation doesn't indicate that this > > > function drops that lock, IIUC > > > > > > Dropping and retaking the lock in here is an anti-pattern that > > > should be avoided. I suggest we are better off in the long run if > > > the caller does not need to take the RCU read lock, but instead, > > > nfsd_open_local_fh takes it right here just for the rcu_dereference. > > I thought so too when I first saw how Neil approached fixing this to > be safe. It was only after putting further time to it (and having the > benefit of being so close to all this) that I realized the nuance at > play (please see my reply to Jeff below for the nuance I'm speaking > of). > > > > > > > OTOH, Why drop the lock before calling nfsd_file_acquire_local()? > > > The RCU read lock can safely be taken more than once in succession. > > > > > > Let's rethink the locking strategy. > > > > > Yes, _that_ is a very valid point. I did wonder the same: it seems > perfectly fine to simply retain the RCU throughout the entirety of > nfsd_open_local_fh(). > Nope. nfsd_file_do_acquire can allocate, so you can't hold the rcu_read_lock over the whole thing. > > Agreed. The only caller does this: > > > > rcu_read_lock(); > > if (!rcu_access_pointer(uuid->net)) { > > rcu_read_unlock(); > > return ERR_PTR(-ENXIO); > > } > > localio = nfs_to.nfsd_open_local_fh(uuid, rpc_clnt, cred, > > nfs_fh, fmode); > > rcu_read_unlock(); > > > > Maybe just move the check for uuid->net down into nfsd_open_local_fh, > > and it can acquire the rcu_read_lock for itself? > > No, sorry we cannot. The call to nfs_to.nfsd_open_local_fh (which is > a symbol provided by nfsd) is only safe if the RCU protected pre-check > shows the uuid->net valid. Ouch, ok.
On Tue, Sep 03, 2024 at 11:19:45AM -0400, Jeff Layton wrote: > On Tue, 2024-09-03 at 11:00 -0400, Mike Snitzer wrote: > > On Tue, Sep 03, 2024 at 10:40:28AM -0400, Jeff Layton wrote: > > > On Tue, 2024-09-03 at 10:34 -0400, Chuck Lever wrote: > > > > On Sat, Aug 31, 2024 at 06:37:36PM -0400, Mike Snitzer wrote: > > > > > From: Weston Andros Adamson <dros@primarydata.com> > > > > > > > > > > Add server support for bypassing NFS for localhost reads, writes, and > > > > > commits. This is only useful when both the client and server are > > > > > running on the same host. > > > > > > > > > > If nfsd_open_local_fh() fails then the NFS client will both retry and > > > > > fallback to normal network-based read, write and commit operations if > > > > > localio is no longer supported. > > > > > > > > > > Care is taken to ensure the same NFS security mechanisms are used > > > > > (authentication, etc) regardless of whether localio or regular NFS > > > > > access is used. The auth_domain established as part of the traditional > > > > > NFS client access to the NFS server is also used for localio. Store > > > > > auth_domain for localio in nfsd_uuid_t and transfer it to the client > > > > > if it is local to the server. > > > > > > > > > > Relative to containers, localio gives the client access to the network > > > > > namespace the server has. This is required to allow the client to > > > > > access the server's per-namespace nfsd_net struct. > > > > > > > > > > This commit also introduces the use of NFSD's percpu_ref to interlock > > > > > nfsd_destroy_serv and nfsd_open_local_fh, to ensure nn->nfsd_serv is > > > > > not destroyed while in use by nfsd_open_local_fh and other LOCALIO > > > > > client code. > > > > > > > > > > CONFIG_NFS_LOCALIO enables NFS server support for LOCALIO. > > > > > > > > > > Signed-off-by: Weston Andros Adamson <dros@primarydata.com> > > > > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > > Co-developed-by: Mike Snitzer <snitzer@kernel.org> > > > > > Signed-off-by: Mike Snitzer <snitzer@kernel.org> > > > > > Co-developed-by: NeilBrown <neilb@suse.de> > > > > > Signed-off-by: NeilBrown <neilb@suse.de> > > > > > > > > > > Not-Acked-by: Chuck Lever <chuck.lever@oracle.com> > > > > > Not-Reviewed-by: Jeff Layton <jlayton@kernel.org> > > > > > --- > > > > > fs/nfsd/Makefile | 1 + > > > > > fs/nfsd/filecache.c | 2 +- > > > > > fs/nfsd/localio.c | 112 +++++++++++++++++++++++++++++++++++++ > > > > > fs/nfsd/netns.h | 4 ++ > > > > > fs/nfsd/nfsctl.c | 25 ++++++++- > > > > > fs/nfsd/trace.h | 3 +- > > > > > fs/nfsd/vfs.h | 2 + > > > > > include/linux/nfslocalio.h | 8 +++ > > > > > 8 files changed, 154 insertions(+), 3 deletions(-) > > > > > create mode 100644 fs/nfsd/localio.c > > > > > > > > > > diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile > > > > > index b8736a82e57c..18cbd3fa7691 100644 > > > > > --- a/fs/nfsd/Makefile > > > > > +++ b/fs/nfsd/Makefile > > > > > @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o > > > > > nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o > > > > > nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o > > > > > nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o > > > > > +nfsd-$(CONFIG_NFS_LOCALIO) += localio.o > > > > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c > > > > > index 89ff380ec31e..348c1b97092e 100644 > > > > > --- a/fs/nfsd/filecache.c > > > > > +++ b/fs/nfsd/filecache.c > > > > > @@ -52,7 +52,7 @@ > > > > > #define NFSD_FILE_CACHE_UP (0) > > > > > > > > > > /* We only care about NFSD_MAY_READ/WRITE for this cache */ > > > > > -#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) > > > > > +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO) > > > > > > > > > > static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); > > > > > static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); > > > > > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c > > > > > new file mode 100644 > > > > > index 000000000000..75df709c6903 > > > > > --- /dev/null > > > > > +++ b/fs/nfsd/localio.c > > > > > @@ -0,0 +1,112 @@ > > > > > +// SPDX-License-Identifier: GPL-2.0-only > > > > > +/* > > > > > + * NFS server support for local clients to bypass network stack > > > > > + * > > > > > + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com> > > > > > + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com> > > > > > + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com> > > > > > + * Copyright (C) 2024 NeilBrown <neilb@suse.de> > > > > > + */ > > > > > + > > > > > +#include <linux/exportfs.h> > > > > > +#include <linux/sunrpc/svcauth.h> > > > > > +#include <linux/sunrpc/clnt.h> > > > > > +#include <linux/nfs.h> > > > > > +#include <linux/nfs_common.h> > > > > > +#include <linux/nfslocalio.h> > > > > > +#include <linux/string.h> > > > > > + > > > > > +#include "nfsd.h" > > > > > +#include "vfs.h" > > > > > +#include "netns.h" > > > > > +#include "filecache.h" > > > > > + > > > > > +static const struct nfsd_localio_operations nfsd_localio_ops = { > > > > > + .nfsd_open_local_fh = nfsd_open_local_fh, > > > > > + .nfsd_file_put_local = nfsd_file_put_local, > > > > > + .nfsd_file_file = nfsd_file_file, > > > > > +}; > > > > > + > > > > > +void nfsd_localio_ops_init(void) > > > > > +{ > > > > > + memcpy(&nfs_to, &nfsd_localio_ops, sizeof(nfsd_localio_ops)); > > > > > +} > > > > > > > > Same comment as Neil: this should surface a pointer to the > > > > localio_ops struct. Copying the whole set of function pointers is > > > > generally unnecessary. > > > > > > > > > > > > > + > > > > > +/** > > > > > + * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file > > > > > + * > > > > > + * @uuid: nfs_uuid_t which provides the 'struct net' to get the proper nfsd_net > > > > > + * and the 'struct auth_domain' required for LOCALIO access > > > > > + * @rpc_clnt: rpc_clnt that the client established, used for sockaddr and cred > > > > > + * @cred: cred that the client established > > > > > + * @nfs_fh: filehandle to lookup > > > > > + * @fmode: fmode_t to use for open > > > > > + * > > > > > + * This function maps a local fh to a path on a local filesystem. > > > > > + * This is useful when the nfs client has the local server mounted - it can > > > > > + * avoid all the NFS overhead with reads, writes and commits. > > > > > + * > > > > > + * On successful return, returned nfsd_file will have its nf_net member > > > > > + * set. Caller (NFS client) is responsible for calling nfsd_serv_put and > > > > > + * nfsd_file_put (via nfs_to.nfsd_file_put_local). > > > > > + */ > > > > > +struct nfsd_file * > > > > > +nfsd_open_local_fh(nfs_uuid_t *uuid, > > > > > + struct rpc_clnt *rpc_clnt, const struct cred *cred, > > > > > + const struct nfs_fh *nfs_fh, const fmode_t fmode) > > > > > + __must_hold(rcu) > > > > > +{ > > > > > + int mayflags = NFSD_MAY_LOCALIO; > > > > > + struct nfsd_net *nn = NULL; > > > > > + struct net *net; > > > > > + struct svc_cred rq_cred; > > > > > + struct svc_fh fh; > > > > > + struct nfsd_file *localio; > > > > > + __be32 beres; > > > > > + > > > > > + if (nfs_fh->size > NFS4_FHSIZE) > > > > > + return ERR_PTR(-EINVAL); > > > > > + > > > > > + /* > > > > > + * Not running in nfsd context, so must safely get reference on nfsd_serv. > > > > > + * But the server may already be shutting down, if so disallow new localio. > > > > > + * uuid->net is NOT a counted reference, but caller's rcu_read_lock() ensures > > > > > + * that if uuid->net is not NULL, then calling nfsd_serv_try_get() is safe > > > > > + * and if it succeeds we will have an implied reference to the net. > > > > > + */ > > > > > + net = rcu_dereference(uuid->net); > > > > > + if (net) > > > > > + nn = net_generic(net, nfsd_net_id); > > > > > + if (unlikely(!nn || !nfsd_serv_try_get(nn))) > > > > > + return ERR_PTR(-ENXIO); > > > > > + > > > > > + /* Drop the rcu lock for nfsd_file_acquire_local() */ > > > > > + rcu_read_unlock(); > > > > > > > > I'm struggling with the locking logistics. Caller takes the RCU read > > > > lock, this function drops the lock, then takes it again. So: > > > > > > > > - A caller might rely on the lock being held continuously, but > > > > - The API contract documented above doesn't indicate that this > > > > function drops that lock > > > > - The __must_hold(rcu) annotation doesn't indicate that this > > > > function drops that lock, IIUC > > > > > > > > Dropping and retaking the lock in here is an anti-pattern that > > > > should be avoided. I suggest we are better off in the long run if > > > > the caller does not need to take the RCU read lock, but instead, > > > > nfsd_open_local_fh takes it right here just for the rcu_dereference. > > > > I thought so too when I first saw how Neil approached fixing this to > > be safe. It was only after putting further time to it (and having the > > benefit of being so close to all this) that I realized the nuance at > > play (please see my reply to Jeff below for the nuance I'm speaking > > of). > > > > > > > > > > OTOH, Why drop the lock before calling nfsd_file_acquire_local()? > > > > The RCU read lock can safely be taken more than once in succession. > > > > > > > > Let's rethink the locking strategy. > > > > > > > > Yes, _that_ is a very valid point. I did wonder the same: it seems > > perfectly fine to simply retain the RCU throughout the entirety of > > nfsd_open_local_fh(). > > > > Nope. nfsd_file_do_acquire can allocate, so you can't hold the > rcu_read_lock over the whole thing. Ah, yeap.. sorry, I knew that ;) > > > Agreed. The only caller does this: > > > > > > rcu_read_lock(); > > > if (!rcu_access_pointer(uuid->net)) { > > > rcu_read_unlock(); > > > return ERR_PTR(-ENXIO); > > > } > > > localio = nfs_to.nfsd_open_local_fh(uuid, rpc_clnt, cred, > > > nfs_fh, fmode); > > > rcu_read_unlock(); > > > > > > Maybe just move the check for uuid->net down into nfsd_open_local_fh, > > > and it can acquire the rcu_read_lock for itself? > > > > No, sorry we cannot. The call to nfs_to.nfsd_open_local_fh (which is > > a symbol provided by nfsd) is only safe if the RCU protected pre-check > > shows the uuid->net valid. > > Ouch, ok. I had to double check but I did add a comment that speaks directly to this "nuance" above the code you quoted: /* * uuid->net must not be NULL, otherwise NFS may not have ref * on NFSD and therefore cannot safely make 'nfs_to' calls. */ So yeah, this code needs to stay like this. The __must_hold(rcu) just ensures the RCU is held on entry and exit.. the bouncing of RCU (dropping and retaking) isn't of immediate concern is it? While I agree it isn't ideal, it is what it is given: 1) NFS caller of NFSD symbol is only safe if it has RCU amd verified uuid->net valid 2) nfsd_file_do_acquire() can allocate. Thanks, Mike
> On Sep 3, 2024, at 11:29 AM, Mike Snitzer <snitzer@kernel.org> wrote: > > On Tue, Sep 03, 2024 at 11:19:45AM -0400, Jeff Layton wrote: >> On Tue, 2024-09-03 at 11:00 -0400, Mike Snitzer wrote: >>> On Tue, Sep 03, 2024 at 10:40:28AM -0400, Jeff Layton wrote: >>>> On Tue, 2024-09-03 at 10:34 -0400, Chuck Lever wrote: >>>>> On Sat, Aug 31, 2024 at 06:37:36PM -0400, Mike Snitzer wrote: >>>>>> From: Weston Andros Adamson <dros@primarydata.com> >>>>>> >>>>>> Add server support for bypassing NFS for localhost reads, writes, and >>>>>> commits. This is only useful when both the client and server are >>>>>> running on the same host. >>>>>> >>>>>> If nfsd_open_local_fh() fails then the NFS client will both retry and >>>>>> fallback to normal network-based read, write and commit operations if >>>>>> localio is no longer supported. >>>>>> >>>>>> Care is taken to ensure the same NFS security mechanisms are used >>>>>> (authentication, etc) regardless of whether localio or regular NFS >>>>>> access is used. The auth_domain established as part of the traditional >>>>>> NFS client access to the NFS server is also used for localio. Store >>>>>> auth_domain for localio in nfsd_uuid_t and transfer it to the client >>>>>> if it is local to the server. >>>>>> >>>>>> Relative to containers, localio gives the client access to the network >>>>>> namespace the server has. This is required to allow the client to >>>>>> access the server's per-namespace nfsd_net struct. >>>>>> >>>>>> This commit also introduces the use of NFSD's percpu_ref to interlock >>>>>> nfsd_destroy_serv and nfsd_open_local_fh, to ensure nn->nfsd_serv is >>>>>> not destroyed while in use by nfsd_open_local_fh and other LOCALIO >>>>>> client code. >>>>>> >>>>>> CONFIG_NFS_LOCALIO enables NFS server support for LOCALIO. >>>>>> >>>>>> Signed-off-by: Weston Andros Adamson <dros@primarydata.com> >>>>>> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> >>>>>> Co-developed-by: Mike Snitzer <snitzer@kernel.org> >>>>>> Signed-off-by: Mike Snitzer <snitzer@kernel.org> >>>>>> Co-developed-by: NeilBrown <neilb@suse.de> >>>>>> Signed-off-by: NeilBrown <neilb@suse.de> >>>>>> >>>>>> Not-Acked-by: Chuck Lever <chuck.lever@oracle.com> >>>>>> Not-Reviewed-by: Jeff Layton <jlayton@kernel.org> >>>>>> --- >>>>>> fs/nfsd/Makefile | 1 + >>>>>> fs/nfsd/filecache.c | 2 +- >>>>>> fs/nfsd/localio.c | 112 +++++++++++++++++++++++++++++++++++++ >>>>>> fs/nfsd/netns.h | 4 ++ >>>>>> fs/nfsd/nfsctl.c | 25 ++++++++- >>>>>> fs/nfsd/trace.h | 3 +- >>>>>> fs/nfsd/vfs.h | 2 + >>>>>> include/linux/nfslocalio.h | 8 +++ >>>>>> 8 files changed, 154 insertions(+), 3 deletions(-) >>>>>> create mode 100644 fs/nfsd/localio.c >>>>>> >>>>>> diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile >>>>>> index b8736a82e57c..18cbd3fa7691 100644 >>>>>> --- a/fs/nfsd/Makefile >>>>>> +++ b/fs/nfsd/Makefile >>>>>> @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o >>>>>> nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o >>>>>> nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o >>>>>> nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o >>>>>> +nfsd-$(CONFIG_NFS_LOCALIO) += localio.o >>>>>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c >>>>>> index 89ff380ec31e..348c1b97092e 100644 >>>>>> --- a/fs/nfsd/filecache.c >>>>>> +++ b/fs/nfsd/filecache.c >>>>>> @@ -52,7 +52,7 @@ >>>>>> #define NFSD_FILE_CACHE_UP (0) >>>>>> >>>>>> /* We only care about NFSD_MAY_READ/WRITE for this cache */ >>>>>> -#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) >>>>>> +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO) >>>>>> >>>>>> static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); >>>>>> static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); >>>>>> diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c >>>>>> new file mode 100644 >>>>>> index 000000000000..75df709c6903 >>>>>> --- /dev/null >>>>>> +++ b/fs/nfsd/localio.c >>>>>> @@ -0,0 +1,112 @@ >>>>>> +// SPDX-License-Identifier: GPL-2.0-only >>>>>> +/* >>>>>> + * NFS server support for local clients to bypass network stack >>>>>> + * >>>>>> + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com> >>>>>> + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com> >>>>>> + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com> >>>>>> + * Copyright (C) 2024 NeilBrown <neilb@suse.de> >>>>>> + */ >>>>>> + >>>>>> +#include <linux/exportfs.h> >>>>>> +#include <linux/sunrpc/svcauth.h> >>>>>> +#include <linux/sunrpc/clnt.h> >>>>>> +#include <linux/nfs.h> >>>>>> +#include <linux/nfs_common.h> >>>>>> +#include <linux/nfslocalio.h> >>>>>> +#include <linux/string.h> >>>>>> + >>>>>> +#include "nfsd.h" >>>>>> +#include "vfs.h" >>>>>> +#include "netns.h" >>>>>> +#include "filecache.h" >>>>>> + >>>>>> +static const struct nfsd_localio_operations nfsd_localio_ops = { >>>>>> + .nfsd_open_local_fh = nfsd_open_local_fh, >>>>>> + .nfsd_file_put_local = nfsd_file_put_local, >>>>>> + .nfsd_file_file = nfsd_file_file, >>>>>> +}; >>>>>> + >>>>>> +void nfsd_localio_ops_init(void) >>>>>> +{ >>>>>> + memcpy(&nfs_to, &nfsd_localio_ops, sizeof(nfsd_localio_ops)); >>>>>> +} >>>>> >>>>> Same comment as Neil: this should surface a pointer to the >>>>> localio_ops struct. Copying the whole set of function pointers is >>>>> generally unnecessary. >>>>> >>>>> >>>>>> + >>>>>> +/** >>>>>> + * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file >>>>>> + * >>>>>> + * @uuid: nfs_uuid_t which provides the 'struct net' to get the proper nfsd_net >>>>>> + * and the 'struct auth_domain' required for LOCALIO access >>>>>> + * @rpc_clnt: rpc_clnt that the client established, used for sockaddr and cred >>>>>> + * @cred: cred that the client established >>>>>> + * @nfs_fh: filehandle to lookup >>>>>> + * @fmode: fmode_t to use for open >>>>>> + * >>>>>> + * This function maps a local fh to a path on a local filesystem. >>>>>> + * This is useful when the nfs client has the local server mounted - it can >>>>>> + * avoid all the NFS overhead with reads, writes and commits. >>>>>> + * >>>>>> + * On successful return, returned nfsd_file will have its nf_net member >>>>>> + * set. Caller (NFS client) is responsible for calling nfsd_serv_put and >>>>>> + * nfsd_file_put (via nfs_to.nfsd_file_put_local). >>>>>> + */ >>>>>> +struct nfsd_file * >>>>>> +nfsd_open_local_fh(nfs_uuid_t *uuid, >>>>>> + struct rpc_clnt *rpc_clnt, const struct cred *cred, >>>>>> + const struct nfs_fh *nfs_fh, const fmode_t fmode) >>>>>> + __must_hold(rcu) >>>>>> +{ >>>>>> + int mayflags = NFSD_MAY_LOCALIO; >>>>>> + struct nfsd_net *nn = NULL; >>>>>> + struct net *net; >>>>>> + struct svc_cred rq_cred; >>>>>> + struct svc_fh fh; >>>>>> + struct nfsd_file *localio; >>>>>> + __be32 beres; >>>>>> + >>>>>> + if (nfs_fh->size > NFS4_FHSIZE) >>>>>> + return ERR_PTR(-EINVAL); >>>>>> + >>>>>> + /* >>>>>> + * Not running in nfsd context, so must safely get reference on nfsd_serv. >>>>>> + * But the server may already be shutting down, if so disallow new localio. >>>>>> + * uuid->net is NOT a counted reference, but caller's rcu_read_lock() ensures >>>>>> + * that if uuid->net is not NULL, then calling nfsd_serv_try_get() is safe >>>>>> + * and if it succeeds we will have an implied reference to the net. >>>>>> + */ >>>>>> + net = rcu_dereference(uuid->net); >>>>>> + if (net) >>>>>> + nn = net_generic(net, nfsd_net_id); >>>>>> + if (unlikely(!nn || !nfsd_serv_try_get(nn))) >>>>>> + return ERR_PTR(-ENXIO); >>>>>> + >>>>>> + /* Drop the rcu lock for nfsd_file_acquire_local() */ >>>>>> + rcu_read_unlock(); >>>>> >>>>> I'm struggling with the locking logistics. Caller takes the RCU read >>>>> lock, this function drops the lock, then takes it again. So: >>>>> >>>>> - A caller might rely on the lock being held continuously, but >>>>> - The API contract documented above doesn't indicate that this >>>>> function drops that lock >>>>> - The __must_hold(rcu) annotation doesn't indicate that this >>>>> function drops that lock, IIUC >>>>> >>>>> Dropping and retaking the lock in here is an anti-pattern that >>>>> should be avoided. I suggest we are better off in the long run if >>>>> the caller does not need to take the RCU read lock, but instead, >>>>> nfsd_open_local_fh takes it right here just for the rcu_dereference. >>> >>> I thought so too when I first saw how Neil approached fixing this to >>> be safe. It was only after putting further time to it (and having the >>> benefit of being so close to all this) that I realized the nuance at >>> play (please see my reply to Jeff below for the nuance I'm speaking >>> of). >>> >>>>> >>>>> OTOH, Why drop the lock before calling nfsd_file_acquire_local()? >>>>> The RCU read lock can safely be taken more than once in succession. >>>>> >>>>> Let's rethink the locking strategy. >>>>> >>> >>> Yes, _that_ is a very valid point. I did wonder the same: it seems >>> perfectly fine to simply retain the RCU throughout the entirety of >>> nfsd_open_local_fh(). >>> >> >> Nope. nfsd_file_do_acquire can allocate, so you can't hold the >> rcu_read_lock over the whole thing. > > Ah, yeap.. sorry, I knew that ;) > >>>> Agreed. The only caller does this: >>>> >>>> rcu_read_lock(); >>>> if (!rcu_access_pointer(uuid->net)) { >>>> rcu_read_unlock(); >>>> return ERR_PTR(-ENXIO); >>>> } >>>> localio = nfs_to.nfsd_open_local_fh(uuid, rpc_clnt, cred, >>>> nfs_fh, fmode); >>>> rcu_read_unlock(); >>>> >>>> Maybe just move the check for uuid->net down into nfsd_open_local_fh, >>>> and it can acquire the rcu_read_lock for itself? >>> >>> No, sorry we cannot. The call to nfs_to.nfsd_open_local_fh (which is >>> a symbol provided by nfsd) is only safe if the RCU protected pre-check >>> shows the uuid->net valid. >> >> Ouch, ok. > > I had to double check but I did add a comment that speaks directly to > this "nuance" above the code you quoted: > > /* > * uuid->net must not be NULL, otherwise NFS may not have ref > * on NFSD and therefore cannot safely make 'nfs_to' calls. > */ > > So yeah, this code needs to stay like this. The __must_hold(rcu) just > ensures the RCU is held on entry and exit.. the bouncing of RCU > (dropping and retaking) isn't of immediate concern is it? While I > agree it isn't ideal, it is what it is given: > 1) NFS caller of NFSD symbol is only safe if it has RCU amd verified > uuid->net valid > 2) nfsd_file_do_acquire() can allocate. OK, understood, but the annotation is still wrong. The lock is dropped here so I think you need __releases and __acquires in that case. However... Let's wait for Neil's comments, but I think this needs to be properly addressed before merging. The comments are not going to be enough IMO. -- Chuck Lever
On Tue, Sep 03, 2024 at 03:59:31PM +0000, Chuck Lever III wrote: > > > > On Sep 3, 2024, at 11:29 AM, Mike Snitzer <snitzer@kernel.org> wrote: > > > > On Tue, Sep 03, 2024 at 11:19:45AM -0400, Jeff Layton wrote: > >> On Tue, 2024-09-03 at 11:00 -0400, Mike Snitzer wrote: > >>> On Tue, Sep 03, 2024 at 10:40:28AM -0400, Jeff Layton wrote: > >>>> On Tue, 2024-09-03 at 10:34 -0400, Chuck Lever wrote: > >>>>> On Sat, Aug 31, 2024 at 06:37:36PM -0400, Mike Snitzer wrote: > >>>>>> From: Weston Andros Adamson <dros@primarydata.com> > >>>>>> > >>>>>> Add server support for bypassing NFS for localhost reads, writes, and > >>>>>> commits. This is only useful when both the client and server are > >>>>>> running on the same host. > >>>>>> > >>>>>> If nfsd_open_local_fh() fails then the NFS client will both retry and > >>>>>> fallback to normal network-based read, write and commit operations if > >>>>>> localio is no longer supported. > >>>>>> > >>>>>> Care is taken to ensure the same NFS security mechanisms are used > >>>>>> (authentication, etc) regardless of whether localio or regular NFS > >>>>>> access is used. The auth_domain established as part of the traditional > >>>>>> NFS client access to the NFS server is also used for localio. Store > >>>>>> auth_domain for localio in nfsd_uuid_t and transfer it to the client > >>>>>> if it is local to the server. > >>>>>> > >>>>>> Relative to containers, localio gives the client access to the network > >>>>>> namespace the server has. This is required to allow the client to > >>>>>> access the server's per-namespace nfsd_net struct. > >>>>>> > >>>>>> This commit also introduces the use of NFSD's percpu_ref to interlock > >>>>>> nfsd_destroy_serv and nfsd_open_local_fh, to ensure nn->nfsd_serv is > >>>>>> not destroyed while in use by nfsd_open_local_fh and other LOCALIO > >>>>>> client code. > >>>>>> > >>>>>> CONFIG_NFS_LOCALIO enables NFS server support for LOCALIO. > >>>>>> > >>>>>> Signed-off-by: Weston Andros Adamson <dros@primarydata.com> > >>>>>> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > >>>>>> Co-developed-by: Mike Snitzer <snitzer@kernel.org> > >>>>>> Signed-off-by: Mike Snitzer <snitzer@kernel.org> > >>>>>> Co-developed-by: NeilBrown <neilb@suse.de> > >>>>>> Signed-off-by: NeilBrown <neilb@suse.de> > >>>>>> > >>>>>> Not-Acked-by: Chuck Lever <chuck.lever@oracle.com> > >>>>>> Not-Reviewed-by: Jeff Layton <jlayton@kernel.org> > >>>>>> --- > >>>>>> fs/nfsd/Makefile | 1 + > >>>>>> fs/nfsd/filecache.c | 2 +- > >>>>>> fs/nfsd/localio.c | 112 +++++++++++++++++++++++++++++++++++++ > >>>>>> fs/nfsd/netns.h | 4 ++ > >>>>>> fs/nfsd/nfsctl.c | 25 ++++++++- > >>>>>> fs/nfsd/trace.h | 3 +- > >>>>>> fs/nfsd/vfs.h | 2 + > >>>>>> include/linux/nfslocalio.h | 8 +++ > >>>>>> 8 files changed, 154 insertions(+), 3 deletions(-) > >>>>>> create mode 100644 fs/nfsd/localio.c > >>>>>> > >>>>>> diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile > >>>>>> index b8736a82e57c..18cbd3fa7691 100644 > >>>>>> --- a/fs/nfsd/Makefile > >>>>>> +++ b/fs/nfsd/Makefile > >>>>>> @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o > >>>>>> nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o > >>>>>> nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o > >>>>>> nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o > >>>>>> +nfsd-$(CONFIG_NFS_LOCALIO) += localio.o > >>>>>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c > >>>>>> index 89ff380ec31e..348c1b97092e 100644 > >>>>>> --- a/fs/nfsd/filecache.c > >>>>>> +++ b/fs/nfsd/filecache.c > >>>>>> @@ -52,7 +52,7 @@ > >>>>>> #define NFSD_FILE_CACHE_UP (0) > >>>>>> > >>>>>> /* We only care about NFSD_MAY_READ/WRITE for this cache */ > >>>>>> -#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) > >>>>>> +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO) > >>>>>> > >>>>>> static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); > >>>>>> static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); > >>>>>> diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c > >>>>>> new file mode 100644 > >>>>>> index 000000000000..75df709c6903 > >>>>>> --- /dev/null > >>>>>> +++ b/fs/nfsd/localio.c > >>>>>> @@ -0,0 +1,112 @@ > >>>>>> +// SPDX-License-Identifier: GPL-2.0-only > >>>>>> +/* > >>>>>> + * NFS server support for local clients to bypass network stack > >>>>>> + * > >>>>>> + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com> > >>>>>> + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com> > >>>>>> + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com> > >>>>>> + * Copyright (C) 2024 NeilBrown <neilb@suse.de> > >>>>>> + */ > >>>>>> + > >>>>>> +#include <linux/exportfs.h> > >>>>>> +#include <linux/sunrpc/svcauth.h> > >>>>>> +#include <linux/sunrpc/clnt.h> > >>>>>> +#include <linux/nfs.h> > >>>>>> +#include <linux/nfs_common.h> > >>>>>> +#include <linux/nfslocalio.h> > >>>>>> +#include <linux/string.h> > >>>>>> + > >>>>>> +#include "nfsd.h" > >>>>>> +#include "vfs.h" > >>>>>> +#include "netns.h" > >>>>>> +#include "filecache.h" > >>>>>> + > >>>>>> +static const struct nfsd_localio_operations nfsd_localio_ops = { > >>>>>> + .nfsd_open_local_fh = nfsd_open_local_fh, > >>>>>> + .nfsd_file_put_local = nfsd_file_put_local, > >>>>>> + .nfsd_file_file = nfsd_file_file, > >>>>>> +}; > >>>>>> + > >>>>>> +void nfsd_localio_ops_init(void) > >>>>>> +{ > >>>>>> + memcpy(&nfs_to, &nfsd_localio_ops, sizeof(nfsd_localio_ops)); > >>>>>> +} > >>>>> > >>>>> Same comment as Neil: this should surface a pointer to the > >>>>> localio_ops struct. Copying the whole set of function pointers is > >>>>> generally unnecessary. > >>>>> > >>>>> > >>>>>> + > >>>>>> +/** > >>>>>> + * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file > >>>>>> + * > >>>>>> + * @uuid: nfs_uuid_t which provides the 'struct net' to get the proper nfsd_net > >>>>>> + * and the 'struct auth_domain' required for LOCALIO access > >>>>>> + * @rpc_clnt: rpc_clnt that the client established, used for sockaddr and cred > >>>>>> + * @cred: cred that the client established > >>>>>> + * @nfs_fh: filehandle to lookup > >>>>>> + * @fmode: fmode_t to use for open > >>>>>> + * > >>>>>> + * This function maps a local fh to a path on a local filesystem. > >>>>>> + * This is useful when the nfs client has the local server mounted - it can > >>>>>> + * avoid all the NFS overhead with reads, writes and commits. > >>>>>> + * > >>>>>> + * On successful return, returned nfsd_file will have its nf_net member > >>>>>> + * set. Caller (NFS client) is responsible for calling nfsd_serv_put and > >>>>>> + * nfsd_file_put (via nfs_to.nfsd_file_put_local). > >>>>>> + */ > >>>>>> +struct nfsd_file * > >>>>>> +nfsd_open_local_fh(nfs_uuid_t *uuid, > >>>>>> + struct rpc_clnt *rpc_clnt, const struct cred *cred, > >>>>>> + const struct nfs_fh *nfs_fh, const fmode_t fmode) > >>>>>> + __must_hold(rcu) > >>>>>> +{ > >>>>>> + int mayflags = NFSD_MAY_LOCALIO; > >>>>>> + struct nfsd_net *nn = NULL; > >>>>>> + struct net *net; > >>>>>> + struct svc_cred rq_cred; > >>>>>> + struct svc_fh fh; > >>>>>> + struct nfsd_file *localio; > >>>>>> + __be32 beres; > >>>>>> + > >>>>>> + if (nfs_fh->size > NFS4_FHSIZE) > >>>>>> + return ERR_PTR(-EINVAL); > >>>>>> + > >>>>>> + /* > >>>>>> + * Not running in nfsd context, so must safely get reference on nfsd_serv. > >>>>>> + * But the server may already be shutting down, if so disallow new localio. > >>>>>> + * uuid->net is NOT a counted reference, but caller's rcu_read_lock() ensures > >>>>>> + * that if uuid->net is not NULL, then calling nfsd_serv_try_get() is safe > >>>>>> + * and if it succeeds we will have an implied reference to the net. > >>>>>> + */ > >>>>>> + net = rcu_dereference(uuid->net); > >>>>>> + if (net) > >>>>>> + nn = net_generic(net, nfsd_net_id); > >>>>>> + if (unlikely(!nn || !nfsd_serv_try_get(nn))) > >>>>>> + return ERR_PTR(-ENXIO); > >>>>>> + > >>>>>> + /* Drop the rcu lock for nfsd_file_acquire_local() */ > >>>>>> + rcu_read_unlock(); > >>>>> > >>>>> I'm struggling with the locking logistics. Caller takes the RCU read > >>>>> lock, this function drops the lock, then takes it again. So: > >>>>> > >>>>> - A caller might rely on the lock being held continuously, but > >>>>> - The API contract documented above doesn't indicate that this > >>>>> function drops that lock > >>>>> - The __must_hold(rcu) annotation doesn't indicate that this > >>>>> function drops that lock, IIUC > >>>>> > >>>>> Dropping and retaking the lock in here is an anti-pattern that > >>>>> should be avoided. I suggest we are better off in the long run if > >>>>> the caller does not need to take the RCU read lock, but instead, > >>>>> nfsd_open_local_fh takes it right here just for the rcu_dereference. > >>> > >>> I thought so too when I first saw how Neil approached fixing this to > >>> be safe. It was only after putting further time to it (and having the > >>> benefit of being so close to all this) that I realized the nuance at > >>> play (please see my reply to Jeff below for the nuance I'm speaking > >>> of). > >>> > >>>>> > >>>>> OTOH, Why drop the lock before calling nfsd_file_acquire_local()? > >>>>> The RCU read lock can safely be taken more than once in succession. > >>>>> > >>>>> Let's rethink the locking strategy. > >>>>> > >>> > >>> Yes, _that_ is a very valid point. I did wonder the same: it seems > >>> perfectly fine to simply retain the RCU throughout the entirety of > >>> nfsd_open_local_fh(). > >>> > >> > >> Nope. nfsd_file_do_acquire can allocate, so you can't hold the > >> rcu_read_lock over the whole thing. > > > > Ah, yeap.. sorry, I knew that ;) > > > >>>> Agreed. The only caller does this: > >>>> > >>>> rcu_read_lock(); > >>>> if (!rcu_access_pointer(uuid->net)) { > >>>> rcu_read_unlock(); > >>>> return ERR_PTR(-ENXIO); > >>>> } > >>>> localio = nfs_to.nfsd_open_local_fh(uuid, rpc_clnt, cred, > >>>> nfs_fh, fmode); > >>>> rcu_read_unlock(); > >>>> > >>>> Maybe just move the check for uuid->net down into nfsd_open_local_fh, > >>>> and it can acquire the rcu_read_lock for itself? > >>> > >>> No, sorry we cannot. The call to nfs_to.nfsd_open_local_fh (which is > >>> a symbol provided by nfsd) is only safe if the RCU protected pre-check > >>> shows the uuid->net valid. > >> > >> Ouch, ok. > > > > I had to double check but I did add a comment that speaks directly to > > this "nuance" above the code you quoted: > > > > /* > > * uuid->net must not be NULL, otherwise NFS may not have ref > > * on NFSD and therefore cannot safely make 'nfs_to' calls. > > */ > > > > So yeah, this code needs to stay like this. The __must_hold(rcu) just > > ensures the RCU is held on entry and exit.. the bouncing of RCU > > (dropping and retaking) isn't of immediate concern is it? While I > > agree it isn't ideal, it is what it is given: > > 1) NFS caller of NFSD symbol is only safe if it has RCU amd verified > > uuid->net valid > > 2) nfsd_file_do_acquire() can allocate. > > OK, understood, but the annotation is still wrong. The lock > is dropped here so I think you need __releases and __acquires > in that case. However... Sure, that seems like more precise context with which to train lockdep. > Let's wait for Neil's comments, but I think this needs to be > properly addressed before merging. The comments are not going > to be enough IMO. I obviously have no issues with Neil confirming/expanding what I shared about the need for checking uuid->net with RCU held to ensure it safe to call this nfs_to method. Without it we cannot make the call, which happens to then take other references (nfsd_serv and nfsd_file) that we can then lean on for the duration of the NFS client issuing IO and then dropping the references/interlock when completing the IO. The NFS client maintainers need to give a good review anyway, so plenty of time for Neil to weigh in. Thanks, Mike
> On Sep 3, 2024, at 12:09 PM, Mike Snitzer <snitzer@kernel.org> wrote: > > On Tue, Sep 03, 2024 at 03:59:31PM +0000, Chuck Lever III wrote: >> >> >>> On Sep 3, 2024, at 11:29 AM, Mike Snitzer <snitzer@kernel.org> wrote: >>> >>> I had to double check but I did add a comment that speaks directly to >>> this "nuance" above the code you quoted: >>> >>> /* >>> * uuid->net must not be NULL, otherwise NFS may not have ref >>> * on NFSD and therefore cannot safely make 'nfs_to' calls. >>> */ >>> >>> So yeah, this code needs to stay like this. The __must_hold(rcu) just >>> ensures the RCU is held on entry and exit.. the bouncing of RCU >>> (dropping and retaking) isn't of immediate concern is it? While I >>> agree it isn't ideal, it is what it is given: >>> 1) NFS caller of NFSD symbol is only safe if it has RCU amd verified >>> uuid->net valid >>> 2) nfsd_file_do_acquire() can allocate. >> >> OK, understood, but the annotation is still wrong. The lock >> is dropped here so I think you need __releases and __acquires >> in that case. However... > > Sure, that seems like more precise context with which to train > lockdep. > >> Let's wait for Neil's comments, but I think this needs to be >> properly addressed before merging. The comments are not going >> to be enough IMO. > > I obviously have no issues with Neil confirming/expanding what I > shared about the need for checking uuid->net with RCU held to ensure > it safe to call this nfs_to method. Without it we cannot make the > call, which happens to then take other references (nfsd_serv and > nfsd_file) that we can then lean on for the duration of the NFS client > issuing IO and then dropping the references/interlock when completing > the IO. > > The NFS client maintainers need to give a good review anyway, so > plenty of time for Neil to weigh in. I forgot to mention before: I don't see any other issues at this point. Acked-by: Chuck Lever <chuck.lever@oracle.com <mailto:chuck.lever@oracle.com>> -- Chuck Lever
On Wed, 04 Sep 2024, Chuck Lever III wrote: > > > > On Sep 3, 2024, at 11:29 AM, Mike Snitzer <snitzer@kernel.org> wrote: > > > > On Tue, Sep 03, 2024 at 11:19:45AM -0400, Jeff Layton wrote: > >> On Tue, 2024-09-03 at 11:00 -0400, Mike Snitzer wrote: > >>> On Tue, Sep 03, 2024 at 10:40:28AM -0400, Jeff Layton wrote: > >>>> On Tue, 2024-09-03 at 10:34 -0400, Chuck Lever wrote: > >>>>> On Sat, Aug 31, 2024 at 06:37:36PM -0400, Mike Snitzer wrote: > >>>>>> From: Weston Andros Adamson <dros@primarydata.com> > >>>>>> > >>>>>> Add server support for bypassing NFS for localhost reads, writes, and > >>>>>> commits. This is only useful when both the client and server are > >>>>>> running on the same host. > >>>>>> > >>>>>> If nfsd_open_local_fh() fails then the NFS client will both retry and > >>>>>> fallback to normal network-based read, write and commit operations if > >>>>>> localio is no longer supported. > >>>>>> > >>>>>> Care is taken to ensure the same NFS security mechanisms are used > >>>>>> (authentication, etc) regardless of whether localio or regular NFS > >>>>>> access is used. The auth_domain established as part of the traditional > >>>>>> NFS client access to the NFS server is also used for localio. Store > >>>>>> auth_domain for localio in nfsd_uuid_t and transfer it to the client > >>>>>> if it is local to the server. > >>>>>> > >>>>>> Relative to containers, localio gives the client access to the network > >>>>>> namespace the server has. This is required to allow the client to > >>>>>> access the server's per-namespace nfsd_net struct. > >>>>>> > >>>>>> This commit also introduces the use of NFSD's percpu_ref to interlock > >>>>>> nfsd_destroy_serv and nfsd_open_local_fh, to ensure nn->nfsd_serv is > >>>>>> not destroyed while in use by nfsd_open_local_fh and other LOCALIO > >>>>>> client code. > >>>>>> > >>>>>> CONFIG_NFS_LOCALIO enables NFS server support for LOCALIO. > >>>>>> > >>>>>> Signed-off-by: Weston Andros Adamson <dros@primarydata.com> > >>>>>> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > >>>>>> Co-developed-by: Mike Snitzer <snitzer@kernel.org> > >>>>>> Signed-off-by: Mike Snitzer <snitzer@kernel.org> > >>>>>> Co-developed-by: NeilBrown <neilb@suse.de> > >>>>>> Signed-off-by: NeilBrown <neilb@suse.de> > >>>>>> > >>>>>> Not-Acked-by: Chuck Lever <chuck.lever@oracle.com> > >>>>>> Not-Reviewed-by: Jeff Layton <jlayton@kernel.org> > >>>>>> --- > >>>>>> fs/nfsd/Makefile | 1 + > >>>>>> fs/nfsd/filecache.c | 2 +- > >>>>>> fs/nfsd/localio.c | 112 +++++++++++++++++++++++++++++++++++++ > >>>>>> fs/nfsd/netns.h | 4 ++ > >>>>>> fs/nfsd/nfsctl.c | 25 ++++++++- > >>>>>> fs/nfsd/trace.h | 3 +- > >>>>>> fs/nfsd/vfs.h | 2 + > >>>>>> include/linux/nfslocalio.h | 8 +++ > >>>>>> 8 files changed, 154 insertions(+), 3 deletions(-) > >>>>>> create mode 100644 fs/nfsd/localio.c > >>>>>> > >>>>>> diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile > >>>>>> index b8736a82e57c..18cbd3fa7691 100644 > >>>>>> --- a/fs/nfsd/Makefile > >>>>>> +++ b/fs/nfsd/Makefile > >>>>>> @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o > >>>>>> nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o > >>>>>> nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o > >>>>>> nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o > >>>>>> +nfsd-$(CONFIG_NFS_LOCALIO) += localio.o > >>>>>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c > >>>>>> index 89ff380ec31e..348c1b97092e 100644 > >>>>>> --- a/fs/nfsd/filecache.c > >>>>>> +++ b/fs/nfsd/filecache.c > >>>>>> @@ -52,7 +52,7 @@ > >>>>>> #define NFSD_FILE_CACHE_UP (0) > >>>>>> > >>>>>> /* We only care about NFSD_MAY_READ/WRITE for this cache */ > >>>>>> -#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) > >>>>>> +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO) > >>>>>> > >>>>>> static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); > >>>>>> static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); > >>>>>> diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c > >>>>>> new file mode 100644 > >>>>>> index 000000000000..75df709c6903 > >>>>>> --- /dev/null > >>>>>> +++ b/fs/nfsd/localio.c > >>>>>> @@ -0,0 +1,112 @@ > >>>>>> +// SPDX-License-Identifier: GPL-2.0-only > >>>>>> +/* > >>>>>> + * NFS server support for local clients to bypass network stack > >>>>>> + * > >>>>>> + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com> > >>>>>> + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com> > >>>>>> + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com> > >>>>>> + * Copyright (C) 2024 NeilBrown <neilb@suse.de> > >>>>>> + */ > >>>>>> + > >>>>>> +#include <linux/exportfs.h> > >>>>>> +#include <linux/sunrpc/svcauth.h> > >>>>>> +#include <linux/sunrpc/clnt.h> > >>>>>> +#include <linux/nfs.h> > >>>>>> +#include <linux/nfs_common.h> > >>>>>> +#include <linux/nfslocalio.h> > >>>>>> +#include <linux/string.h> > >>>>>> + > >>>>>> +#include "nfsd.h" > >>>>>> +#include "vfs.h" > >>>>>> +#include "netns.h" > >>>>>> +#include "filecache.h" > >>>>>> + > >>>>>> +static const struct nfsd_localio_operations nfsd_localio_ops = { > >>>>>> + .nfsd_open_local_fh = nfsd_open_local_fh, > >>>>>> + .nfsd_file_put_local = nfsd_file_put_local, > >>>>>> + .nfsd_file_file = nfsd_file_file, > >>>>>> +}; > >>>>>> + > >>>>>> +void nfsd_localio_ops_init(void) > >>>>>> +{ > >>>>>> + memcpy(&nfs_to, &nfsd_localio_ops, sizeof(nfsd_localio_ops)); > >>>>>> +} > >>>>> > >>>>> Same comment as Neil: this should surface a pointer to the > >>>>> localio_ops struct. Copying the whole set of function pointers is > >>>>> generally unnecessary. > >>>>> > >>>>> > >>>>>> + > >>>>>> +/** > >>>>>> + * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file > >>>>>> + * > >>>>>> + * @uuid: nfs_uuid_t which provides the 'struct net' to get the proper nfsd_net > >>>>>> + * and the 'struct auth_domain' required for LOCALIO access > >>>>>> + * @rpc_clnt: rpc_clnt that the client established, used for sockaddr and cred > >>>>>> + * @cred: cred that the client established > >>>>>> + * @nfs_fh: filehandle to lookup > >>>>>> + * @fmode: fmode_t to use for open > >>>>>> + * > >>>>>> + * This function maps a local fh to a path on a local filesystem. > >>>>>> + * This is useful when the nfs client has the local server mounted - it can > >>>>>> + * avoid all the NFS overhead with reads, writes and commits. > >>>>>> + * > >>>>>> + * On successful return, returned nfsd_file will have its nf_net member > >>>>>> + * set. Caller (NFS client) is responsible for calling nfsd_serv_put and > >>>>>> + * nfsd_file_put (via nfs_to.nfsd_file_put_local). > >>>>>> + */ > >>>>>> +struct nfsd_file * > >>>>>> +nfsd_open_local_fh(nfs_uuid_t *uuid, > >>>>>> + struct rpc_clnt *rpc_clnt, const struct cred *cred, > >>>>>> + const struct nfs_fh *nfs_fh, const fmode_t fmode) > >>>>>> + __must_hold(rcu) > >>>>>> +{ > >>>>>> + int mayflags = NFSD_MAY_LOCALIO; > >>>>>> + struct nfsd_net *nn = NULL; > >>>>>> + struct net *net; > >>>>>> + struct svc_cred rq_cred; > >>>>>> + struct svc_fh fh; > >>>>>> + struct nfsd_file *localio; > >>>>>> + __be32 beres; > >>>>>> + > >>>>>> + if (nfs_fh->size > NFS4_FHSIZE) > >>>>>> + return ERR_PTR(-EINVAL); > >>>>>> + > >>>>>> + /* > >>>>>> + * Not running in nfsd context, so must safely get reference on nfsd_serv. > >>>>>> + * But the server may already be shutting down, if so disallow new localio. > >>>>>> + * uuid->net is NOT a counted reference, but caller's rcu_read_lock() ensures > >>>>>> + * that if uuid->net is not NULL, then calling nfsd_serv_try_get() is safe > >>>>>> + * and if it succeeds we will have an implied reference to the net. > >>>>>> + */ > >>>>>> + net = rcu_dereference(uuid->net); > >>>>>> + if (net) > >>>>>> + nn = net_generic(net, nfsd_net_id); > >>>>>> + if (unlikely(!nn || !nfsd_serv_try_get(nn))) > >>>>>> + return ERR_PTR(-ENXIO); > >>>>>> + > >>>>>> + /* Drop the rcu lock for nfsd_file_acquire_local() */ > >>>>>> + rcu_read_unlock(); > >>>>> > >>>>> I'm struggling with the locking logistics. Caller takes the RCU read > >>>>> lock, this function drops the lock, then takes it again. So: > >>>>> > >>>>> - A caller might rely on the lock being held continuously, but > >>>>> - The API contract documented above doesn't indicate that this > >>>>> function drops that lock > >>>>> - The __must_hold(rcu) annotation doesn't indicate that this > >>>>> function drops that lock, IIUC > >>>>> > >>>>> Dropping and retaking the lock in here is an anti-pattern that > >>>>> should be avoided. I suggest we are better off in the long run if > >>>>> the caller does not need to take the RCU read lock, but instead, > >>>>> nfsd_open_local_fh takes it right here just for the rcu_dereference. > >>> > >>> I thought so too when I first saw how Neil approached fixing this to > >>> be safe. It was only after putting further time to it (and having the > >>> benefit of being so close to all this) that I realized the nuance at > >>> play (please see my reply to Jeff below for the nuance I'm speaking > >>> of). > >>> > >>>>> > >>>>> OTOH, Why drop the lock before calling nfsd_file_acquire_local()? > >>>>> The RCU read lock can safely be taken more than once in succession. > >>>>> > >>>>> Let's rethink the locking strategy. > >>>>> > >>> > >>> Yes, _that_ is a very valid point. I did wonder the same: it seems > >>> perfectly fine to simply retain the RCU throughout the entirety of > >>> nfsd_open_local_fh(). > >>> > >> > >> Nope. nfsd_file_do_acquire can allocate, so you can't hold the > >> rcu_read_lock over the whole thing. > > > > Ah, yeap.. sorry, I knew that ;) > > > >>>> Agreed. The only caller does this: > >>>> > >>>> rcu_read_lock(); > >>>> if (!rcu_access_pointer(uuid->net)) { > >>>> rcu_read_unlock(); > >>>> return ERR_PTR(-ENXIO); > >>>> } > >>>> localio = nfs_to.nfsd_open_local_fh(uuid, rpc_clnt, cred, > >>>> nfs_fh, fmode); > >>>> rcu_read_unlock(); > >>>> > >>>> Maybe just move the check for uuid->net down into nfsd_open_local_fh, > >>>> and it can acquire the rcu_read_lock for itself? > >>> > >>> No, sorry we cannot. The call to nfs_to.nfsd_open_local_fh (which is > >>> a symbol provided by nfsd) is only safe if the RCU protected pre-check > >>> shows the uuid->net valid. > >> > >> Ouch, ok. > > > > I had to double check but I did add a comment that speaks directly to > > this "nuance" above the code you quoted: > > > > /* > > * uuid->net must not be NULL, otherwise NFS may not have ref > > * on NFSD and therefore cannot safely make 'nfs_to' calls. > > */ > > > > So yeah, this code needs to stay like this. The __must_hold(rcu) just > > ensures the RCU is held on entry and exit.. the bouncing of RCU > > (dropping and retaking) isn't of immediate concern is it? While I > > agree it isn't ideal, it is what it is given: > > 1) NFS caller of NFSD symbol is only safe if it has RCU amd verified > > uuid->net valid > > 2) nfsd_file_do_acquire() can allocate. > > OK, understood, but the annotation is still wrong. The lock > is dropped here so I think you need __releases and __acquires > in that case. However... > > Let's wait for Neil's comments, but I think this needs to be > properly addressed before merging. The comments are not going > to be enough IMO. I don't have much to add. Mike's description of the locking requirement (nfs.to.foo need to be safe) and Jeff's confirmation that we cannot hold rcu across getting the nfsd_file are exactly what I would have said. I agree that dropping and reclaiming a lock is an anti-pattern and in best avoided in general. I cannot see a better alternative in this case. According to Documentation/dev-tools/sparse.txt: __must_hold - The specified lock is held on function entry and exit. __acquires - The specified lock is held on function exit, but not entry. __releases - The specified lock is held on function entry, but not exit. only __must_hold applies. But maybe sparse.txt is wrong. static struct bpf_local_storage_elem * bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, struct bpf_local_storage_elem *prev_selem) __acquires(RCU) __releases(RCU) gives a counter example. Maybe we should copy that as you say. Maybe we should fix sparse.txt too. NeilBrown
On Wed, 04 Sep 2024, NeilBrown wrote: > > I agree that dropping and reclaiming a lock is an anti-pattern and in > best avoided in general. I cannot see a better alternative in this > case. It occurred to me what I should spell out the alternate that I DO see so you have the option of disagreeing with my assessment that it isn't "better". We need RCU to call into nfsd, we need a per-cpu ref on the net (which we can only get inside nfsd) and NOT RCU to call nfsd_file_acquire_local(). The current code combines these (because they are only used together) and so the need to drop rcu. I thought briefly that it could simply drop rcu and leave it dropped (__releases(rcu)) but not only do I generally like that LESS than dropping and reclaiming, I think it would be buggy. While in the nfsd module code we need to be holding either rcu or a ref on the server else the code could disappear out from under the CPU. So if we exit without a ref on the server - which we do if nfsd_file_acquire_local() fails - then we need to reclaim RCU *before* dropping the ref. So the current code is slightly buggy. We could instead split the combined call into multiple nfs_to interfaces. So nfs_open_local_fh() in nfs_common/nfslocalio.c would be something like: rcu_read_lock(); net = READ_ONCE(uuid->net); if (!net || !nfs_to.get_net(net)) { rcu_read_unlock(); return ERR_PTR(-ENXIO); } rcu_read_unlock(); localio = nfs_to.nfsd_open_local_fh(....); if (IS_ERR(localio)) nfs_to.put_net(net); return localio; So we have 3 interfaces instead of 1, but no hidden unlock/lock. As I said, I don't think this is a net win, but reasonable people might disagree with me. NeilBrown
On Wed, Sep 04, 2024 at 03:01:46PM +1000, NeilBrown wrote: > On Wed, 04 Sep 2024, NeilBrown wrote: > > > > I agree that dropping and reclaiming a lock is an anti-pattern and in > > best avoided in general. I cannot see a better alternative in this > > case. > > It occurred to me what I should spell out the alternate that I DO see so > you have the option of disagreeing with my assessment that it isn't > "better". > > We need RCU to call into nfsd, we need a per-cpu ref on the net (which > we can only get inside nfsd) and NOT RCU to call > nfsd_file_acquire_local(). > > The current code combines these (because they are only used together) > and so the need to drop rcu. > > I thought briefly that it could simply drop rcu and leave it dropped > (__releases(rcu)) but not only do I generally like that LESS than > dropping and reclaiming, I think it would be buggy. While in the nfsd > module code we need to be holding either rcu or a ref on the server else > the code could disappear out from under the CPU. So if we exit without > a ref on the server - which we do if nfsd_file_acquire_local() fails - > then we need to reclaim RCU *before* dropping the ref. So the current > code is slightly buggy. > > We could instead split the combined call into multiple nfs_to > interfaces. > > So nfs_open_local_fh() in nfs_common/nfslocalio.c would be something > like: > > rcu_read_lock(); > net = READ_ONCE(uuid->net); > if (!net || !nfs_to.get_net(net)) { > rcu_read_unlock(); > return ERR_PTR(-ENXIO); > } > rcu_read_unlock(); > localio = nfs_to.nfsd_open_local_fh(....); > if (IS_ERR(localio)) > nfs_to.put_net(net); > return localio; > > So we have 3 interfaces instead of 1, but no hidden unlock/lock. Splitting up the function call occurred to me as well, but I didn't come up with a specific bit of surgery. Thanks for the suggestion. At this point, my concern is that we will lose your cogent explanation of why the release/lock is done. Having it in email is great, but email is more ephemeral than actually putting it in the code. > As I said, I don't think this is a net win, but reasonable people might > disagree with me. The "win" here is that it makes this code self-documenting and somewhat less likely to be broken down the road by changes in and around this area. Since I'm more forgetful these days I lean towards the more obvious kinds of coding solutions. ;-) Mike, how do you feel about the 3-interface suggestion?
On Wed, 2024-09-04 at 15:01 +1000, NeilBrown wrote: > On Wed, 04 Sep 2024, NeilBrown wrote: > > > > I agree that dropping and reclaiming a lock is an anti-pattern and in > > best avoided in general. I cannot see a better alternative in this > > case. > > It occurred to me what I should spell out the alternate that I DO see so > you have the option of disagreeing with my assessment that it isn't > "better". > > We need RCU to call into nfsd, we need a per-cpu ref on the net (which > we can only get inside nfsd) and NOT RCU to call > nfsd_file_acquire_local(). > > The current code combines these (because they are only used together) > and so the need to drop rcu. > > I thought briefly that it could simply drop rcu and leave it dropped > (__releases(rcu)) but not only do I generally like that LESS than > dropping and reclaiming, I think it would be buggy. While in the nfsd > module code we need to be holding either rcu or a ref on the server else > the code could disappear out from under the CPU. So if we exit without > a ref on the server - which we do if nfsd_file_acquire_local() fails - > then we need to reclaim RCU *before* dropping the ref. So the current > code is slightly buggy. > > We could instead split the combined call into multiple nfs_to > interfaces. > > So nfs_open_local_fh() in nfs_common/nfslocalio.c would be something > like: > > rcu_read_lock(); > net = READ_ONCE(uuid->net); > if (!net || !nfs_to.get_net(net)) { > rcu_read_unlock(); > return ERR_PTR(-ENXIO); > } > rcu_read_unlock(); > localio = nfs_to.nfsd_open_local_fh(....); > if (IS_ERR(localio)) > nfs_to.put_net(net); > return localio; > > So we have 3 interfaces instead of 1, but no hidden unlock/lock. > > As I said, I don't think this is a net win, but reasonable people might > disagree with me. > I considered a few alternate designs here as well, and came to the same conclusion. This interface is ugly, but it's not materially worse than the alternatives. I think we just have to document this well, and deal with the ugliness. Luckily most of the gory details are managed inside nfsd and the nfs_common functions so the caller (nfs) shouldn't have to deal with the complex locking. One thing that might be good if we're sticking with this code, is a __might_sleep() at the top of nfs_open_local_fh function in nfs_common. That should help ensure that no one tries to call it with the rcu_read_lock() held (which is the main danger here).
> On Sep 4, 2024, at 9:54 AM, Jeff Layton <jlayton@kernel.org> wrote: > > On Wed, 2024-09-04 at 15:01 +1000, NeilBrown wrote: >> On Wed, 04 Sep 2024, NeilBrown wrote: >>> >>> I agree that dropping and reclaiming a lock is an anti-pattern and in >>> best avoided in general. I cannot see a better alternative in this >>> case. >> >> It occurred to me what I should spell out the alternate that I DO see so >> you have the option of disagreeing with my assessment that it isn't >> "better". >> >> We need RCU to call into nfsd, we need a per-cpu ref on the net (which >> we can only get inside nfsd) and NOT RCU to call >> nfsd_file_acquire_local(). >> >> The current code combines these (because they are only used together) >> and so the need to drop rcu. >> >> I thought briefly that it could simply drop rcu and leave it dropped >> (__releases(rcu)) but not only do I generally like that LESS than >> dropping and reclaiming, I think it would be buggy. While in the nfsd >> module code we need to be holding either rcu or a ref on the server else >> the code could disappear out from under the CPU. So if we exit without >> a ref on the server - which we do if nfsd_file_acquire_local() fails - >> then we need to reclaim RCU *before* dropping the ref. So the current >> code is slightly buggy. >> >> We could instead split the combined call into multiple nfs_to >> interfaces. >> >> So nfs_open_local_fh() in nfs_common/nfslocalio.c would be something >> like: >> >> rcu_read_lock(); >> net = READ_ONCE(uuid->net); >> if (!net || !nfs_to.get_net(net)) { >> rcu_read_unlock(); >> return ERR_PTR(-ENXIO); >> } >> rcu_read_unlock(); >> localio = nfs_to.nfsd_open_local_fh(....); >> if (IS_ERR(localio)) >> nfs_to.put_net(net); >> return localio; >> >> So we have 3 interfaces instead of 1, but no hidden unlock/lock. >> >> As I said, I don't think this is a net win, but reasonable people might >> disagree with me. >> > > I considered a few alternate designs here as well, and came to the same > conclusion. This interface is ugly, but it's not materially worse than > the alternatives. I think we just have to document this well, and deal > with the ugliness. To be clear, I largely agree with that; but I think documenting it in code rather than writing more comments is a good choice. > Luckily most of the gory details are managed inside nfsd and the > nfs_common functions so the caller (nfs) shouldn't have to deal with > the complex locking. > > One thing that might be good if we're sticking with this code, is a > __might_sleep() at the top of nfs_open_local_fh function in nfs_common. > That should help ensure that no one tries to call it with the > rcu_read_lock() held (which is the main danger here). > -- > Jeff Layton <jlayton@kernel.org> -- Chuck Lever
On Wed, Sep 04, 2024 at 09:47:07AM -0400, Chuck Lever wrote: > On Wed, Sep 04, 2024 at 03:01:46PM +1000, NeilBrown wrote: > > On Wed, 04 Sep 2024, NeilBrown wrote: > > > > > > I agree that dropping and reclaiming a lock is an anti-pattern and in > > > best avoided in general. I cannot see a better alternative in this > > > case. > > > > It occurred to me what I should spell out the alternate that I DO see so > > you have the option of disagreeing with my assessment that it isn't > > "better". > > > > We need RCU to call into nfsd, we need a per-cpu ref on the net (which > > we can only get inside nfsd) and NOT RCU to call > > nfsd_file_acquire_local(). > > > > The current code combines these (because they are only used together) > > and so the need to drop rcu. > > > > I thought briefly that it could simply drop rcu and leave it dropped > > (__releases(rcu)) but not only do I generally like that LESS than > > dropping and reclaiming, I think it would be buggy. While in the nfsd > > module code we need to be holding either rcu or a ref on the server else > > the code could disappear out from under the CPU. So if we exit without > > a ref on the server - which we do if nfsd_file_acquire_local() fails - > > then we need to reclaim RCU *before* dropping the ref. So the current > > code is slightly buggy. > > > > We could instead split the combined call into multiple nfs_to > > interfaces. > > > > So nfs_open_local_fh() in nfs_common/nfslocalio.c would be something > > like: > > > > rcu_read_lock(); > > net = READ_ONCE(uuid->net); > > if (!net || !nfs_to.get_net(net)) { > > rcu_read_unlock(); > > return ERR_PTR(-ENXIO); > > } > > rcu_read_unlock(); > > localio = nfs_to.nfsd_open_local_fh(....); > > if (IS_ERR(localio)) > > nfs_to.put_net(net); > > return localio; > > > > So we have 3 interfaces instead of 1, but no hidden unlock/lock. > > Splitting up the function call occurred to me as well, but I didn't > come up with a specific bit of surgery. Thanks for the suggestion. > > At this point, my concern is that we will lose your cogent > explanation of why the release/lock is done. Having it in email is > great, but email is more ephemeral than actually putting it in the > code. > > > > As I said, I don't think this is a net win, but reasonable people might > > disagree with me. > > The "win" here is that it makes this code self-documenting and > somewhat less likely to be broken down the road by changes in and > around this area. Since I'm more forgetful these days I lean towards > the more obvious kinds of coding solutions. ;-) > > Mike, how do you feel about the 3-interface suggestion? I dislike expanding from 1 indirect function call to 2 in rapid succession (3 for the error path, not a problem, just being precise. But I otherwise like it.. maybe.. heh. FYI, I did run with the suggestion to make nfs_to a pointer that just needs a simple assignment rather than memcpy to initialize. So Neil's above code becames: rcu_read_lock(); net = rcu_dereference(uuid->net); if (!net || !nfs_to->nfsd_serv_try_get(net)) { rcu_read_unlock(); return ERR_PTR(-ENXIO); } rcu_read_unlock(); /* We have an implied reference to net thanks to nfsd_serv_try_get */ localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt, cred, nfs_fh, fmode); if (IS_ERR(localio)) nfs_to->nfsd_serv_put(net); return localio; I do think it cleans the code up... full patch is here: https://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git/commit/?h=nfs-localio-for-next.v15-with-fixups&id=e85306941878a87070176702de687f2779436061 But I'm still on the fence.. someone help push me over! Tangent, but in the related business of "what are next steps?": I updated headers with various provided Reviewed-by:s and Acked-by:s, fixed at least 1 commit header, fixed some sparse issues, various fixes to nfs_to patch (removed EXPORT_SYMBOL_GPL, switched to using pointer, updated nfs_to callers). Etc... But if I fold those changes in I compromise the provided Reviewed-by and Acked-by.. so I'm leaning toward posting a v16 that has these incremental fixes/improvements, see the 3 topmost commits here: https://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git/log/?h=nfs-localio-for-next.v15-with-fixups Or if you can review the incremental patches I can fold them in and preserve the various Reviewed-by and Acked-by... You can also see incremental diff from .v15 to .v15-with-fixups with: git remote update snitzer git diff snitzer/nfs-localio-for-next.v15 snitzer/nfs-localio-for-next.v15-with-fixups Either way, I should post a v16 right? SO question is: should I fold these incremental changes in to the original or keep them split out? I'm good with whatever you guys think. But whatever is decided: this needs to be the handoff point to focused NFS client review and hopeful staging for 6.12 inclusion, I've pivoted to working with Trond to make certain he is good with everything. Thanks, Mike
> On Sep 5, 2024, at 10:21 AM, Mike Snitzer <snitzer@kernel.org> wrote: > > On Wed, Sep 04, 2024 at 09:47:07AM -0400, Chuck Lever wrote: >> On Wed, Sep 04, 2024 at 03:01:46PM +1000, NeilBrown wrote: >>> On Wed, 04 Sep 2024, NeilBrown wrote: >>>> >>>> I agree that dropping and reclaiming a lock is an anti-pattern and in >>>> best avoided in general. I cannot see a better alternative in this >>>> case. >>> >>> It occurred to me what I should spell out the alternate that I DO see so >>> you have the option of disagreeing with my assessment that it isn't >>> "better". >>> >>> We need RCU to call into nfsd, we need a per-cpu ref on the net (which >>> we can only get inside nfsd) and NOT RCU to call >>> nfsd_file_acquire_local(). >>> >>> The current code combines these (because they are only used together) >>> and so the need to drop rcu. >>> >>> I thought briefly that it could simply drop rcu and leave it dropped >>> (__releases(rcu)) but not only do I generally like that LESS than >>> dropping and reclaiming, I think it would be buggy. While in the nfsd >>> module code we need to be holding either rcu or a ref on the server else >>> the code could disappear out from under the CPU. So if we exit without >>> a ref on the server - which we do if nfsd_file_acquire_local() fails - >>> then we need to reclaim RCU *before* dropping the ref. So the current >>> code is slightly buggy. >>> >>> We could instead split the combined call into multiple nfs_to >>> interfaces. >>> >>> So nfs_open_local_fh() in nfs_common/nfslocalio.c would be something >>> like: >>> >>> rcu_read_lock(); >>> net = READ_ONCE(uuid->net); >>> if (!net || !nfs_to.get_net(net)) { >>> rcu_read_unlock(); >>> return ERR_PTR(-ENXIO); >>> } >>> rcu_read_unlock(); >>> localio = nfs_to.nfsd_open_local_fh(....); >>> if (IS_ERR(localio)) >>> nfs_to.put_net(net); >>> return localio; >>> >>> So we have 3 interfaces instead of 1, but no hidden unlock/lock. >> >> Splitting up the function call occurred to me as well, but I didn't >> come up with a specific bit of surgery. Thanks for the suggestion. >> >> At this point, my concern is that we will lose your cogent >> explanation of why the release/lock is done. Having it in email is >> great, but email is more ephemeral than actually putting it in the >> code. >> >> >>> As I said, I don't think this is a net win, but reasonable people might >>> disagree with me. >> >> The "win" here is that it makes this code self-documenting and >> somewhat less likely to be broken down the road by changes in and >> around this area. Since I'm more forgetful these days I lean towards >> the more obvious kinds of coding solutions. ;-) >> >> Mike, how do you feel about the 3-interface suggestion? > > I dislike expanding from 1 indirect function call to 2 in rapid > succession (3 for the error path, not a problem, just being precise. > But I otherwise like it.. maybe.. heh. > > FYI, I did run with the suggestion to make nfs_to a pointer that just > needs a simple assignment rather than memcpy to initialize. So Neil's > above code becames: > > rcu_read_lock(); > net = rcu_dereference(uuid->net); > if (!net || !nfs_to->nfsd_serv_try_get(net)) { > rcu_read_unlock(); > return ERR_PTR(-ENXIO); > } > rcu_read_unlock(); > /* We have an implied reference to net thanks to nfsd_serv_try_get */ > localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt, > cred, nfs_fh, fmode); > if (IS_ERR(localio)) > nfs_to->nfsd_serv_put(net); > return localio; > > I do think it cleans the code up... full patch is here: > https://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git/commit/?h=nfs-localio-for-next.v15-with-fixups&id=e85306941878a87070176702de687f2779436061 > > But I'm still on the fence.. someone help push me over! I wasn't expecting it would be less ugly, but it does look harder to screw up down the road when we've forgotten why the API looks this way. > Tangent, but in the related business of "what are next steps?": > > I updated headers with various provided Reviewed-by:s and Acked-by:s, > fixed at least 1 commit header, fixed some sparse issues, various > fixes to nfs_to patch (removed EXPORT_SYMBOL_GPL, switched to using > pointer, updated nfs_to callers). Etc... > > But if I fold those changes in I compromise the provided Reviewed-by > and Acked-by.. so I'm leaning toward posting a v16 that has > these incremental fixes/improvements, see the 3 topmost commits here: > https://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git/log/?h=nfs-localio-for-next.v15-with-fixups > > Or if you can review the incremental patches I can fold them in and > preserve the various Reviewed-by and Acked-by... For the three topmost patches in that branch: Reviewed-by: Chuck Lever <chuck.lever@oracle.com <mailto:chuck.lever@oracle.com>> HTH > You can also see incremental diff from .v15 to .v15-with-fixups with: > git remote update snitzer > git diff snitzer/nfs-localio-for-next.v15 snitzer/nfs-localio-for-next.v15-with-fixups > > Either way, I should post a v16 right? SO question is: should I fold > these incremental changes in to the original or keep them split out? > > I'm good with whatever you guys think. But whatever is decided: this > needs to be the handoff point to focused NFS client review and hopeful > staging for 6.12 inclusion, I've pivoted to working with Trond to > make certain he is good with everything. -- Chuck Lever
On Fri, 06 Sep 2024, Mike Snitzer wrote: > On Wed, Sep 04, 2024 at 09:47:07AM -0400, Chuck Lever wrote: > > On Wed, Sep 04, 2024 at 03:01:46PM +1000, NeilBrown wrote: > > > On Wed, 04 Sep 2024, NeilBrown wrote: > > > > > > > > I agree that dropping and reclaiming a lock is an anti-pattern and in > > > > best avoided in general. I cannot see a better alternative in this > > > > case. > > > > > > It occurred to me what I should spell out the alternate that I DO see so > > > you have the option of disagreeing with my assessment that it isn't > > > "better". > > > > > > We need RCU to call into nfsd, we need a per-cpu ref on the net (which > > > we can only get inside nfsd) and NOT RCU to call > > > nfsd_file_acquire_local(). > > > > > > The current code combines these (because they are only used together) > > > and so the need to drop rcu. > > > > > > I thought briefly that it could simply drop rcu and leave it dropped > > > (__releases(rcu)) but not only do I generally like that LESS than > > > dropping and reclaiming, I think it would be buggy. While in the nfsd > > > module code we need to be holding either rcu or a ref on the server else > > > the code could disappear out from under the CPU. So if we exit without > > > a ref on the server - which we do if nfsd_file_acquire_local() fails - > > > then we need to reclaim RCU *before* dropping the ref. So the current > > > code is slightly buggy. > > > > > > We could instead split the combined call into multiple nfs_to > > > interfaces. > > > > > > So nfs_open_local_fh() in nfs_common/nfslocalio.c would be something > > > like: > > > > > > rcu_read_lock(); > > > net = READ_ONCE(uuid->net); > > > if (!net || !nfs_to.get_net(net)) { > > > rcu_read_unlock(); > > > return ERR_PTR(-ENXIO); > > > } > > > rcu_read_unlock(); > > > localio = nfs_to.nfsd_open_local_fh(....); > > > if (IS_ERR(localio)) > > > nfs_to.put_net(net); > > > return localio; > > > > > > So we have 3 interfaces instead of 1, but no hidden unlock/lock. > > > > Splitting up the function call occurred to me as well, but I didn't > > come up with a specific bit of surgery. Thanks for the suggestion. > > > > At this point, my concern is that we will lose your cogent > > explanation of why the release/lock is done. Having it in email is > > great, but email is more ephemeral than actually putting it in the > > code. > > > > > > > As I said, I don't think this is a net win, but reasonable people might > > > disagree with me. > > > > The "win" here is that it makes this code self-documenting and > > somewhat less likely to be broken down the road by changes in and > > around this area. Since I'm more forgetful these days I lean towards > > the more obvious kinds of coding solutions. ;-) > > > > Mike, how do you feel about the 3-interface suggestion? > > I dislike expanding from 1 indirect function call to 2 in rapid > succession (3 for the error path, not a problem, just being precise. > But I otherwise like it.. maybe.. heh. > > FYI, I did run with the suggestion to make nfs_to a pointer that just > needs a simple assignment rather than memcpy to initialize. So Neil's > above code becames: > > rcu_read_lock(); > net = rcu_dereference(uuid->net); > if (!net || !nfs_to->nfsd_serv_try_get(net)) { > rcu_read_unlock(); > return ERR_PTR(-ENXIO); > } > rcu_read_unlock(); > /* We have an implied reference to net thanks to nfsd_serv_try_get */ > localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt, > cred, nfs_fh, fmode); > if (IS_ERR(localio)) > nfs_to->nfsd_serv_put(net); > return localio; > > I do think it cleans the code up... full patch is here: > https://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git/commit/?h=nfs-localio-for-next.v15-with-fixups&id=e85306941878a87070176702de687f2779436061 > > But I'm still on the fence.. someone help push me over! I think the new code is unquestionable clearer, and not taking this approach would be a micro-optimisation which would need to be numerically justified. So I'm pushing for the three-interface version (despite what I said before). Unfortunately the new code is not bug-free - not quite. As soon as nfs_to->nfsd_serv_put() calls percpu_ref_put() the nfsd module can be unloaded, and the "return" instruction might not be present. For this to go wrong would require a lot of bad luck, but if the CPU took an interrupt at the wrong time were would be room. [Ever since module_put_and_exit() was added (now ..and_kthread_exit) I've been sensitive to dropping the ref to a module in code running in the module] So I think nfsd_serv_put (and nfsd_serv_try_get() __must_hold(RCU) and nfs_open_local_fh() needs rcu_read_lock() before calling nfs_to->nfsd_serv_put(net). > > Tangent, but in the related business of "what are next steps?": > > I updated headers with various provided Reviewed-by:s and Acked-by:s, > fixed at least 1 commit header, fixed some sparse issues, various > fixes to nfs_to patch (removed EXPORT_SYMBOL_GPL, switched to using > pointer, updated nfs_to callers). Etc... > > But if I fold those changes in I compromise the provided Reviewed-by > and Acked-by.. so I'm leaning toward posting a v16 that has > these incremental fixes/improvements, see the 3 topmost commits here: > https://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git/log/?h=nfs-localio-for-next.v15-with-fixups > > Or if you can review the incremental patches I can fold them in and > preserve the various Reviewed-by and Acked-by... I have reviewed the incremental patches and I'm happy for all my tags to apply to the new versions of the patches. NeilBrown
On Fri, Sep 06, 2024 at 09:34:08AM +1000, NeilBrown wrote: > On Fri, 06 Sep 2024, Mike Snitzer wrote: > > On Wed, Sep 04, 2024 at 09:47:07AM -0400, Chuck Lever wrote: > > > On Wed, Sep 04, 2024 at 03:01:46PM +1000, NeilBrown wrote: > > > > On Wed, 04 Sep 2024, NeilBrown wrote: > > > > > > > > > > I agree that dropping and reclaiming a lock is an anti-pattern and in > > > > > best avoided in general. I cannot see a better alternative in this > > > > > case. > > > > > > > > It occurred to me what I should spell out the alternate that I DO see so > > > > you have the option of disagreeing with my assessment that it isn't > > > > "better". > > > > > > > > We need RCU to call into nfsd, we need a per-cpu ref on the net (which > > > > we can only get inside nfsd) and NOT RCU to call > > > > nfsd_file_acquire_local(). > > > > > > > > The current code combines these (because they are only used together) > > > > and so the need to drop rcu. > > > > > > > > I thought briefly that it could simply drop rcu and leave it dropped > > > > (__releases(rcu)) but not only do I generally like that LESS than > > > > dropping and reclaiming, I think it would be buggy. While in the nfsd > > > > module code we need to be holding either rcu or a ref on the server else > > > > the code could disappear out from under the CPU. So if we exit without > > > > a ref on the server - which we do if nfsd_file_acquire_local() fails - > > > > then we need to reclaim RCU *before* dropping the ref. So the current > > > > code is slightly buggy. > > > > > > > > We could instead split the combined call into multiple nfs_to > > > > interfaces. > > > > > > > > So nfs_open_local_fh() in nfs_common/nfslocalio.c would be something > > > > like: > > > > > > > > rcu_read_lock(); > > > > net = READ_ONCE(uuid->net); > > > > if (!net || !nfs_to.get_net(net)) { > > > > rcu_read_unlock(); > > > > return ERR_PTR(-ENXIO); > > > > } > > > > rcu_read_unlock(); > > > > localio = nfs_to.nfsd_open_local_fh(....); > > > > if (IS_ERR(localio)) > > > > nfs_to.put_net(net); > > > > return localio; > > > > > > > > So we have 3 interfaces instead of 1, but no hidden unlock/lock. > > > > > > Splitting up the function call occurred to me as well, but I didn't > > > come up with a specific bit of surgery. Thanks for the suggestion. > > > > > > At this point, my concern is that we will lose your cogent > > > explanation of why the release/lock is done. Having it in email is > > > great, but email is more ephemeral than actually putting it in the > > > code. > > > > > > > > > > As I said, I don't think this is a net win, but reasonable people might > > > > disagree with me. > > > > > > The "win" here is that it makes this code self-documenting and > > > somewhat less likely to be broken down the road by changes in and > > > around this area. Since I'm more forgetful these days I lean towards > > > the more obvious kinds of coding solutions. ;-) > > > > > > Mike, how do you feel about the 3-interface suggestion? > > > > I dislike expanding from 1 indirect function call to 2 in rapid > > succession (3 for the error path, not a problem, just being precise. > > But I otherwise like it.. maybe.. heh. > > > > FYI, I did run with the suggestion to make nfs_to a pointer that just > > needs a simple assignment rather than memcpy to initialize. So Neil's > > above code becames: > > > > rcu_read_lock(); > > net = rcu_dereference(uuid->net); > > if (!net || !nfs_to->nfsd_serv_try_get(net)) { > > rcu_read_unlock(); > > return ERR_PTR(-ENXIO); > > } > > rcu_read_unlock(); > > /* We have an implied reference to net thanks to nfsd_serv_try_get */ > > localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt, > > cred, nfs_fh, fmode); > > if (IS_ERR(localio)) > > nfs_to->nfsd_serv_put(net); > > return localio; > > > > I do think it cleans the code up... full patch is here: > > https://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git/commit/?h=nfs-localio-for-next.v15-with-fixups&id=e85306941878a87070176702de687f2779436061 > > > > But I'm still on the fence.. someone help push me over! > > I think the new code is unquestionable clearer, and not taking this > approach would be a micro-optimisation which would need to be > numerically justified. So I'm pushing for the three-interface version > (despite what I said before). > > Unfortunately the new code is not bug-free - not quite. > As soon as nfs_to->nfsd_serv_put() calls percpu_ref_put() the nfsd > module can be unloaded, and the "return" instruction might not be > present. For this to go wrong would require a lot of bad luck, but if > the CPU took an interrupt at the wrong time were would be room. > > [Ever since module_put_and_exit() was added (now ..and_kthread_exit) > I've been sensitive to dropping the ref to a module in code running in > the module] > > So I think nfsd_serv_put (and nfsd_serv_try_get() __must_hold(RCU) and > nfs_open_local_fh() needs rcu_read_lock() before calling > nfs_to->nfsd_serv_put(net). OK, yes I can see that, I implemented what you suggested at the end of your reply (see inline patch below)... But I'd just like to point out that something like the below patch wouldn't be needed if we kept my "heavy" approach (nfs reference on nfsd modules via nfs_common using request_symbol): https://marc.info/?l=linux-nfs&m=172499445027800&w=2 (that patch has stuff I since cleaned up, e.g. removed typedefs and EXPORT_SYMBOL_GPLs..) I knew we were going to pay for being too cute with how nfs took its reference on nfsd. So here we are, needing fiddly incremental fixes like this to close a really-small-yet-will-be-deadly race: diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index c29cdf51c458..d124c265b8fd 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -341,7 +341,7 @@ nfs_local_pgio_release(struct nfs_local_kiocb *iocb) { struct nfs_pgio_header *hdr = iocb->hdr; - nfs_to->nfsd_file_put_local(iocb->localio); + nfs_to_nfsd_file_put_local(iocb->localio); nfs_local_iocb_free(iocb); nfs_local_hdr_release(hdr, hdr->task.tk_ops); } @@ -622,7 +622,7 @@ int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, } out: if (status != 0) { - nfs_to->nfsd_file_put_local(localio); + nfs_to_nfsd_file_put_local(localio); hdr->task.tk_status = status; nfs_local_hdr_release(hdr, call_ops); } @@ -673,7 +673,7 @@ nfs_local_release_commit_data(struct nfsd_file *localio, struct nfs_commit_data *data, const struct rpc_call_ops *call_ops) { - nfs_to->nfsd_file_put_local(localio); + nfs_to_nfsd_file_put_local(localio); call_ops->rpc_call_done(&data->task, data); call_ops->rpc_release(data); } diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index 42b479b9191f..5c8ce5066c16 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -142,8 +142,11 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, /* We have an implied reference to net thanks to nfsd_serv_try_get */ localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt, cred, nfs_fh, fmode); - if (IS_ERR(localio)) + if (IS_ERR(localio)) { + rcu_read_lock(); nfs_to->nfsd_serv_put(net); + rcu_read_unlock(); + } return localio; } EXPORT_SYMBOL_GPL(nfs_open_local_fh); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 7ff477b40bcd..0d389051d08d 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -398,7 +398,7 @@ nfsd_file_put(struct nfsd_file *nf) * reference to the associated nn->nfsd_serv. */ void -nfsd_file_put_local(struct nfsd_file *nf) +nfsd_file_put_local(struct nfsd_file *nf) __must_hold(rcu) { struct net *net = nf->nf_net; diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index 291e9c69cae4..f441cb9f74d5 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -53,7 +53,7 @@ void nfsd_localio_ops_init(void) * * On successful return, returned nfsd_file will have its nf_net member * set. Caller (NFS client) is responsible for calling nfsd_serv_put and - * nfsd_file_put (via nfs_to->nfsd_file_put_local). + * nfsd_file_put (via nfs_to_nfsd_file_put_local). */ struct nfsd_file * nfsd_open_local_fh(struct net *net, struct auth_domain *dom, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index e236135ddc63..47172b407be8 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -214,14 +214,14 @@ int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change return 0; } -bool nfsd_serv_try_get(struct net *net) +bool nfsd_serv_try_get(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); return (nn && percpu_ref_tryget_live(&nn->nfsd_serv_ref)); } -void nfsd_serv_put(struct net *net) +void nfsd_serv_put(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index b353abe00357..b0dd9b1eef4f 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -65,10 +65,25 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *, struct rpc_clnt *, const struct cred *, const struct nfs_fh *, const fmode_t); +static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio) +{ + /* + * Once reference to nfsd_serv is dropped, NFSD could be + * unloaded, so ensure safe return from nfsd_file_put_local() + * by always taking RCU. + */ + rcu_read_lock(); + nfs_to->nfsd_file_put_local(localio); + rcu_read_unlock(); +} + #else /* CONFIG_NFS_LOCALIO */ static inline void nfsd_localio_ops_init(void) { } +static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio) +{ +} #endif /* CONFIG_NFS_LOCALIO */ #endif /* __LINUX_NFSLOCALIO_H */ > > > > Tangent, but in the related business of "what are next steps?": > > > > I updated headers with various provided Reviewed-by:s and Acked-by:s, > > fixed at least 1 commit header, fixed some sparse issues, various > > fixes to nfs_to patch (removed EXPORT_SYMBOL_GPL, switched to using > > pointer, updated nfs_to callers). Etc... > > > > But if I fold those changes in I compromise the provided Reviewed-by > > and Acked-by.. so I'm leaning toward posting a v16 that has > > these incremental fixes/improvements, see the 3 topmost commits here: > > https://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git/log/?h=nfs-localio-for-next.v15-with-fixups > > > > Or if you can review the incremental patches I can fold them in and > > preserve the various Reviewed-by and Acked-by... > > I have reviewed the incremental patches and I'm happy for all my tags to > apply to the new versions of the patches. Thanks! Mike
On Fri, Sep 06, 2024 at 11:04:16AM -0400, Mike Snitzer wrote: > On Fri, Sep 06, 2024 at 09:34:08AM +1000, NeilBrown wrote: > > On Fri, 06 Sep 2024, Mike Snitzer wrote: > > > On Wed, Sep 04, 2024 at 09:47:07AM -0400, Chuck Lever wrote: > > > > On Wed, Sep 04, 2024 at 03:01:46PM +1000, NeilBrown wrote: > > > > > On Wed, 04 Sep 2024, NeilBrown wrote: > > > > > > > > > > > > I agree that dropping and reclaiming a lock is an anti-pattern and in > > > > > > best avoided in general. I cannot see a better alternative in this > > > > > > case. > > > > > > > > > > It occurred to me what I should spell out the alternate that I DO see so > > > > > you have the option of disagreeing with my assessment that it isn't > > > > > "better". > > > > > > > > > > We need RCU to call into nfsd, we need a per-cpu ref on the net (which > > > > > we can only get inside nfsd) and NOT RCU to call > > > > > nfsd_file_acquire_local(). > > > > > > > > > > The current code combines these (because they are only used together) > > > > > and so the need to drop rcu. > > > > > > > > > > I thought briefly that it could simply drop rcu and leave it dropped > > > > > (__releases(rcu)) but not only do I generally like that LESS than > > > > > dropping and reclaiming, I think it would be buggy. While in the nfsd > > > > > module code we need to be holding either rcu or a ref on the server else > > > > > the code could disappear out from under the CPU. So if we exit without > > > > > a ref on the server - which we do if nfsd_file_acquire_local() fails - > > > > > then we need to reclaim RCU *before* dropping the ref. So the current > > > > > code is slightly buggy. > > > > > > > > > > We could instead split the combined call into multiple nfs_to > > > > > interfaces. > > > > > > > > > > So nfs_open_local_fh() in nfs_common/nfslocalio.c would be something > > > > > like: > > > > > > > > > > rcu_read_lock(); > > > > > net = READ_ONCE(uuid->net); > > > > > if (!net || !nfs_to.get_net(net)) { > > > > > rcu_read_unlock(); > > > > > return ERR_PTR(-ENXIO); > > > > > } > > > > > rcu_read_unlock(); > > > > > localio = nfs_to.nfsd_open_local_fh(....); > > > > > if (IS_ERR(localio)) > > > > > nfs_to.put_net(net); > > > > > return localio; > > > > > > > > > > So we have 3 interfaces instead of 1, but no hidden unlock/lock. > > > > > > > > Splitting up the function call occurred to me as well, but I didn't > > > > come up with a specific bit of surgery. Thanks for the suggestion. > > > > > > > > At this point, my concern is that we will lose your cogent > > > > explanation of why the release/lock is done. Having it in email is > > > > great, but email is more ephemeral than actually putting it in the > > > > code. > > > > > > > > > > > > > As I said, I don't think this is a net win, but reasonable people might > > > > > disagree with me. > > > > > > > > The "win" here is that it makes this code self-documenting and > > > > somewhat less likely to be broken down the road by changes in and > > > > around this area. Since I'm more forgetful these days I lean towards > > > > the more obvious kinds of coding solutions. ;-) > > > > > > > > Mike, how do you feel about the 3-interface suggestion? > > > > > > I dislike expanding from 1 indirect function call to 2 in rapid > > > succession (3 for the error path, not a problem, just being precise. > > > But I otherwise like it.. maybe.. heh. > > > > > > FYI, I did run with the suggestion to make nfs_to a pointer that just > > > needs a simple assignment rather than memcpy to initialize. So Neil's > > > above code becames: > > > > > > rcu_read_lock(); > > > net = rcu_dereference(uuid->net); > > > if (!net || !nfs_to->nfsd_serv_try_get(net)) { > > > rcu_read_unlock(); > > > return ERR_PTR(-ENXIO); > > > } > > > rcu_read_unlock(); > > > /* We have an implied reference to net thanks to nfsd_serv_try_get */ > > > localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt, > > > cred, nfs_fh, fmode); > > > if (IS_ERR(localio)) > > > nfs_to->nfsd_serv_put(net); > > > return localio; > > > > > > I do think it cleans the code up... full patch is here: > > > https://git.kernel.org/pub/scm/linux/kernel/git/snitzer/linux.git/commit/?h=nfs-localio-for-next.v15-with-fixups&id=e85306941878a87070176702de687f2779436061 > > > > > > But I'm still on the fence.. someone help push me over! > > > > I think the new code is unquestionable clearer, and not taking this > > approach would be a micro-optimisation which would need to be > > numerically justified. So I'm pushing for the three-interface version > > (despite what I said before). > > > > Unfortunately the new code is not bug-free - not quite. > > As soon as nfs_to->nfsd_serv_put() calls percpu_ref_put() the nfsd > > module can be unloaded, and the "return" instruction might not be > > present. For this to go wrong would require a lot of bad luck, but if > > the CPU took an interrupt at the wrong time were would be room. > > > > [Ever since module_put_and_exit() was added (now ..and_kthread_exit) > > I've been sensitive to dropping the ref to a module in code running in > > the module] > > > > So I think nfsd_serv_put (and nfsd_serv_try_get() __must_hold(RCU) and > > nfs_open_local_fh() needs rcu_read_lock() before calling > > nfs_to->nfsd_serv_put(net). > > OK, yes I can see that, I implemented what you suggested at the end of > your reply (see inline patch below)... > > But I'd just like to point out that something like the below patch > wouldn't be needed if we kept my "heavy" approach (nfs reference on > nfsd modules via nfs_common using request_symbol): > https://marc.info/?l=linux-nfs&m=172499445027800&w=2 > (that patch has stuff I since cleaned up, e.g. removed typedefs and > EXPORT_SYMBOL_GPLs..) > > I knew we were going to pay for being too cute with how nfs took its > reference on nfsd. > > So here we are, needing fiddly incremental fixes like this to close a > really-small-yet-will-be-deadly race: <snip required delicate rcu re-locking requirements patch> I prefer this incremental re-implementation of my symbol_request patch that eliminates all concerns about the validity of 'nfs_to' calls: --- fs/nfs/localio.c | 5 +++ fs/nfs_common/nfslocalio.c | 84 +++++++++++++++++++++++++++++++------- fs/nfsd/localio.c | 2 +- include/linux/nfslocalio.h | 7 +++- 4 files changed, 80 insertions(+), 18 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index c29cdf51c458..43520ac0fde8 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -124,6 +124,10 @@ const struct rpc_program nfslocalio_program = { static void nfs_local_enable(struct nfs_client *clp) { spin_lock(&clp->cl_localio_lock); + if (!nfs_to_nfsd_localio_ops_get()) { + spin_unlock(&clp->cl_localio_lock); + return; + } set_bit(NFS_CS_LOCAL_IO, &clp->cl_flags); trace_nfs_local_enable(clp); spin_unlock(&clp->cl_localio_lock); @@ -138,6 +142,7 @@ void nfs_local_disable(struct nfs_client *clp) if (test_and_clear_bit(NFS_CS_LOCAL_IO, &clp->cl_flags)) { trace_nfs_local_disable(clp); nfs_uuid_invalidate_one_client(&clp->cl_uuid); + nfs_to_nfsd_localio_ops_put(); } spin_unlock(&clp->cl_localio_lock); } diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index 42b479b9191f..9039e0f1afa3 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -7,6 +7,7 @@ #include <linux/module.h> #include <linux/rculist.h> #include <linux/nfslocalio.h> +#include <linux/refcount.h> #include <net/netns/generic.h> MODULE_LICENSE("GPL"); @@ -53,11 +54,8 @@ static nfs_uuid_t * nfs_uuid_lookup_locked(const uuid_t *uuid) return NULL; } -static struct module *nfsd_mod; - void nfs_uuid_is_local(const uuid_t *uuid, struct list_head *list, - struct net *net, struct auth_domain *dom, - struct module *mod) + struct net *net, struct auth_domain *dom) { nfs_uuid_t *nfs_uuid; @@ -73,9 +71,6 @@ void nfs_uuid_is_local(const uuid_t *uuid, struct list_head *list, */ list_move(&nfs_uuid->list, list); rcu_assign_pointer(nfs_uuid->net, net); - - __module_get(mod); - nfsd_mod = mod; } spin_unlock(&nfs_uuid_lock); } @@ -83,10 +78,8 @@ EXPORT_SYMBOL_GPL(nfs_uuid_is_local); static void nfs_uuid_put_locked(nfs_uuid_t *nfs_uuid) { - if (nfs_uuid->net) { - module_put(nfsd_mod); - nfs_uuid->net = NULL; - } + if (nfs_uuid->net) + RCU_INIT_POINTER(nfs_uuid->net, NULL); if (nfs_uuid->dom) { auth_domain_put(nfs_uuid->dom); nfs_uuid->dom = NULL; @@ -123,14 +116,14 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, struct nfsd_file *localio; /* - * Not running in nfsd context, so must safely get reference on nfsd_serv. + * NFS has a reference to NFSD and can safely make 'nfs_to' calls. + * + * But not running in NFSD context, so must safely get reference to nfsd_serv. * But the server may already be shutting down, if so disallow new localio. + * * uuid->net is NOT a counted reference, but rcu_read_lock() ensures that * if uuid->net is not NULL, then calling nfsd_serv_try_get() is safe * and if it succeeds we will have an implied reference to the net. - * - * Otherwise NFS may not have ref on NFSD and therefore cannot safely - * make 'nfs_to' calls. */ rcu_read_lock(); net = rcu_dereference(uuid->net); @@ -153,6 +146,7 @@ EXPORT_SYMBOL_GPL(nfs_open_local_fh); * but cannot be statically linked, because that will make the NFS * module always depend on the NFSD module. * + * [FIXME: must adjust following 2 paragraphs] * 'nfs_to' provides NFS access to NFSD functions needed for LOCALIO, * its lifetime is tightly coupled to the NFSD module and will always * be available to NFS LOCALIO because any successful client<->server @@ -170,3 +164,63 @@ EXPORT_SYMBOL_GPL(nfs_open_local_fh); */ const struct nfsd_localio_operations *nfs_to; EXPORT_SYMBOL_GPL(nfs_to); + +static DEFINE_SPINLOCK(nfs_to_nfsd_lock); +static refcount_t nfs_to_ref; + +bool nfs_to_nfsd_localio_ops_get(void) +{ + spin_lock(&nfs_to_nfsd_lock); + + /* Only get nfsd_localio_operations on first reference */ + if (refcount_read(&nfs_to_ref) == 0) { + refcount_set(&nfs_to_ref, 1); + /* fallthru */ + } else { + refcount_inc(&nfs_to_ref); + spin_unlock(&nfs_to_nfsd_lock); + return true; + } + + /* Must drop spinlock before call to symbol_request */ + spin_unlock(&nfs_to_nfsd_lock); + + /* + * If NFSD isn't available LOCALIO isn't possible. + * Use nfsd_open_local_fh symbol as the bellwether, if + * available then nfs_common has NFSD module reference + * on NFS's behalf and can safely call 'nfs_to' functions. + */ + if (!symbol_request(nfsd_open_local_fh)) + return false; + return true; +} +EXPORT_SYMBOL_GPL(nfs_to_nfsd_localio_ops_get); + +void nfs_to_nfsd_localio_ops_put(void) +{ + spin_lock(&nfs_to_nfsd_lock); + + if (!refcount_dec_and_test(&nfs_to_ref)) + goto out; + + symbol_put(nfsd_open_local_fh); + nfs_to = NULL; +out: + spin_unlock(&nfs_to_nfsd_lock); +} +EXPORT_SYMBOL_GPL(nfs_to_nfsd_localio_ops_put); + +static int __init nfslocalio_init(void) +{ + refcount_set(&nfs_to_ref, 0); + + return 0; +} + +static void __exit nfslocalio_exit(void) +{ +} + +module_init(nfslocalio_init); +module_exit(nfslocalio_exit); diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index 291e9c69cae4..291ad916d67a 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -114,7 +114,7 @@ static __be32 localio_proc_uuid_is_local(struct svc_rqst *rqstp) struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs_uuid_is_local(&argp->uuid, &nn->local_clients, - net, rqstp->rq_client, THIS_MODULE); + net, rqstp->rq_client); return rpc_success; } diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index b353abe00357..2e6b9107a7d1 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -35,7 +35,7 @@ typedef struct { void nfs_uuid_begin(nfs_uuid_t *); void nfs_uuid_end(nfs_uuid_t *); void nfs_uuid_is_local(const uuid_t *, struct list_head *, - struct net *, struct auth_domain *, struct module *); + struct net *, struct auth_domain *); void nfs_uuid_invalidate_clients(struct list_head *list); void nfs_uuid_invalidate_one_client(nfs_uuid_t *nfs_uuid); @@ -58,9 +58,12 @@ struct nfsd_localio_operations { struct file *(*nfsd_file_file)(struct nfsd_file *); } ____cacheline_aligned; -extern void nfsd_localio_ops_init(void); extern const struct nfsd_localio_operations *nfs_to; +extern void nfsd_localio_ops_init(void); +bool nfs_to_nfsd_localio_ops_get(void); +void nfs_to_nfsd_localio_ops_put(void); + struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *, struct rpc_clnt *, const struct cred *, const struct nfs_fh *, const fmode_t);
On Sat, 07 Sep 2024, Mike Snitzer wrote: > > But I'd just like to point out that something like the below patch > > wouldn't be needed if we kept my "heavy" approach (nfs reference on > > nfsd modules via nfs_common using symbol_request): > > https://marc.info/?l=linux-nfs&m=172499445027800&w=2 > > (that patch has stuff I since cleaned up, e.g. removed typedefs and > > EXPORT_SYMBOL_GPLs..) > > > > I knew we were going to pay for being too cute with how nfs took its > > reference on nfsd. > > > > So here we are, needing fiddly incremental fixes like this to close a > > really-small-yet-will-be-deadly race: > > <snip required delicate rcu re-locking requirements patch> > > I prefer this incremental re-implementation of my symbol_request patch > that eliminates all concerns about the validity of 'nfs_to' calls: We could achieve the same effect without using symbol_request() (which hardly anyone uses) if we did a __module_get (or try_module_get) at the same place you are calling symbol_request(), and module_put() where you do symbol_put(). This would mean that once NFS LOCALIO had detected a path to the local server, it would hold the nfsd module until the nfs server were shutdown and the nfs client noticed. So you wouldn't be able to unmount the nfsd module immediately after stopping all nfsd servers. Maybe that doesn't matter. I think it is important to be able to completely shut down the NFS server at any time. I think it is important to be able to completely shutdown a network namespace at any time. I am less concerned about being able to rmmod the nfsd module after all obvious users have been disabled. So if others think that the improvements in code maintainability are worth the loss of being able to rmmod nfsd without (potentially) having to unmount all NFS filesystems, then I won't argue against it. But I really would want it to be get/put of the module, not of some symbol. .... BTW you probably wanted to use symbol_get(), not symbol_request(). The latter tries to load the module if it isn't already loaded. Using symbol_get() does have the benefit that you don't need any locking dance to prevent the module unloading while we get the ref. So if we really want to go for less tricky locking that might be a justification - but you don't need much locking for try_module_get()... Thanks, NeilBrown
> On Sep 6, 2024, at 5:56 PM, NeilBrown <neilb@suse.de> wrote: > > We could achieve the same effect without using symbol_request() (which > hardly anyone uses) if we did a __module_get (or try_module_get) at the > same place you are calling symbol_request(), and module_put() where you > do symbol_put(). > > This would mean that once NFS LOCALIO had detected a path to the local > server, it would hold the nfsd module until the nfs server were shutdown > and the nfs client noticed. So you wouldn't be able to unmount the nfsd > module immediately after stopping all nfsd servers. > > Maybe that doesn't matter. I think it is important to be able to > completely shut down the NFS server at any time. I think it is > important to be able to completely shutdown a network namespace at any > time. I am less concerned about being able to rmmod the nfsd module > after all obvious users have been disabled. > > So if others think that the improvements in code maintainability are > worth the loss of being able to rmmod nfsd without (potentially) having > to unmount all NFS filesystems, then I won't argue against it. But I > really would want it to be get/put of the module, not of some symbol. The client and server are potentially in separate containers, administered independently. An NFS mount should not pin either the NFS server's running status, its ability to unexport a shared file system, the ability for the NFS server's administrator to rmmod nfsd.ko, the ability for the administrator to rmmod a network device that is in use by the NFS server, or the ability to destroy the NFS server's namespace once NFSD has shut down. I don't feel that this is a code maintainability issue, but rather this is a usability and security mandate. Remote NFS mounts don't (or, are not supposed to) pin NFSD's resources in any way. That is the behavioral standard, and if we find that is not the case, we treat it as a bug. TL;DR: it does matter. LOCALIO NFS mounts should not indefinitely pin NFSD or its resources. -- Chuck Lever
On Sat, 07 Sep 2024, Chuck Lever III wrote: > > > > On Sep 6, 2024, at 5:56 PM, NeilBrown <neilb@suse.de> wrote: > > > > We could achieve the same effect without using symbol_request() (which > > hardly anyone uses) if we did a __module_get (or try_module_get) at the > > same place you are calling symbol_request(), and module_put() where you > > do symbol_put(). > > > > This would mean that once NFS LOCALIO had detected a path to the local > > server, it would hold the nfsd module until the nfs server were shutdown > > and the nfs client noticed. So you wouldn't be able to unmount the nfsd > > module immediately after stopping all nfsd servers. > > > > Maybe that doesn't matter. I think it is important to be able to > > completely shut down the NFS server at any time. I think it is > > important to be able to completely shutdown a network namespace at any > > time. I am less concerned about being able to rmmod the nfsd module > > after all obvious users have been disabled. > > > > So if others think that the improvements in code maintainability are > > worth the loss of being able to rmmod nfsd without (potentially) having > > to unmount all NFS filesystems, then I won't argue against it. But I > > really would want it to be get/put of the module, not of some symbol. > > The client and server are potentially in separate containers, > administered independently. An NFS mount should not pin either > the NFS server's running status, its ability to unexport a > shared file system, the ability for the NFS server's > administrator to rmmod nfsd.ko, the ability for the > administrator to rmmod a network device that is in use by the > NFS server, or the ability to destroy the NFS server's > namespace once NFSD has shut down. While I mostly agree, I should point out that nfsd.ko is a global resource across all containers. So if the client and server are administer separately, there is no certainty that the server administrator is at all related to the global moderator who controls when nfsd.ko might be unloaded. So preventing the unload of nfsd.ko is quite a different class of problem to preventing the shutdown of the nfsd service or of the container that it runs in. NeilBrown > > I don't feel that this is a code maintainability issue, but > rather this is a usability and security mandate. Remote NFS > mounts don't (or, are not supposed to) pin NFSD's resources > in any way. That is the behavioral standard, and if we find > that is not the case, we treat it as a bug. > > TL;DR: it does matter. LOCALIO NFS mounts should not > indefinitely pin NFSD or its resources. > > > -- > Chuck Lever > > >
On Sat, Sep 07, 2024 at 09:14:57AM +1000, NeilBrown wrote: > On Sat, 07 Sep 2024, Chuck Lever III wrote: > > > > > > > On Sep 6, 2024, at 5:56 PM, NeilBrown <neilb@suse.de> wrote: > > > > > > We could achieve the same effect without using symbol_request() (which > > > hardly anyone uses) if we did a __module_get (or try_module_get) at the > > > same place you are calling symbol_request(), and module_put() where you > > > do symbol_put(). > > > > > > This would mean that once NFS LOCALIO had detected a path to the local > > > server, it would hold the nfsd module until the nfs server were shutdown > > > and the nfs client noticed. So you wouldn't be able to unmount the nfsd > > > module immediately after stopping all nfsd servers. > > > > > > Maybe that doesn't matter. I think it is important to be able to > > > completely shut down the NFS server at any time. I think it is > > > important to be able to completely shutdown a network namespace at any > > > time. I am less concerned about being able to rmmod the nfsd module > > > after all obvious users have been disabled. > > > > > > So if others think that the improvements in code maintainability are > > > worth the loss of being able to rmmod nfsd without (potentially) having > > > to unmount all NFS filesystems, then I won't argue against it. But I > > > really would want it to be get/put of the module, not of some symbol. > > > > The client and server are potentially in separate containers, > > administered independently. An NFS mount should not pin either > > the NFS server's running status, its ability to unexport a > > shared file system, the ability for the NFS server's > > administrator to rmmod nfsd.ko, the ability for the > > administrator to rmmod a network device that is in use by the > > NFS server, or the ability to destroy the NFS server's > > namespace once NFSD has shut down. > > While I mostly agree, I should point out that nfsd.ko is a global > resource across all containers. So if the client and server are > administer separately, there is no certainty that the server > administrator is at all related to the global moderator who controls > when nfsd.ko might be unloaded. So preventing the unload of nfsd.ko is > quite a different class of problem to preventing the shutdown of the > nfsd service or of the container that it runs in. Right, and in practice LOCALIO doesn't prevent any expected NFS client or server shutdown within containers. Proper refcounting of required resources from host is needed, new requirement for LOCALIO is that nfsd be properly refcounted until consuming nfs client code exits. But if LOCALIO client is connected to a server, if/when that server shuts down it isn't blocked from doing so simply because a LOCALIO client is active. Rather than have general concern for LOCALIO doing something wrong, we'd do well to make sure there is proper test coverage for required shutdown sequences (completely indepent of LOCALIO, maybe that already exists?). I'm 99.99% sure LOCALIO will pass all of that testing (informed by manual testing I have done with container shutdown, etc). Mike
On Sat, Sep 07, 2024 at 07:56:47AM +1000, NeilBrown wrote: > On Sat, 07 Sep 2024, Mike Snitzer wrote: > > > > But I'd just like to point out that something like the below patch > > > wouldn't be needed if we kept my "heavy" approach (nfs reference on > > > nfsd modules via nfs_common using symbol_request): > > > https://marc.info/?l=linux-nfs&m=172499445027800&w=2 > > > (that patch has stuff I since cleaned up, e.g. removed typedefs and > > > EXPORT_SYMBOL_GPLs..) > > > > > > I knew we were going to pay for being too cute with how nfs took its > > > reference on nfsd. > > > > > > So here we are, needing fiddly incremental fixes like this to close a > > > really-small-yet-will-be-deadly race: > > > > <snip required delicate rcu re-locking requirements patch> > > > > I prefer this incremental re-implementation of my symbol_request patch > > that eliminates all concerns about the validity of 'nfs_to' calls: > > We could achieve the same effect without using symbol_request() (which > hardly anyone uses) if we did a __module_get (or try_module_get) at the > same place you are calling symbol_request(), and module_put() where you > do symbol_put(). > > This would mean that once NFS LOCALIO had detected a path to the local > server, it would hold the nfsd module until the nfs server were shutdown > and the nfs client noticed. So you wouldn't be able to unmount the nfsd > module immediately after stopping all nfsd servers. s/unmount the nfsd module/remove the nfsd module/ Right, the nfsd module wouldn't be able to be unloaded if LOCALIO client is still active (on host or within some container) with a mount from an export the now shutdown server hosted. With LOCALIO the client has extended the footprint of its kernel code to include portions of the nfsd module (indirectly via nfs_common). That said, we can preserve the fine-grained rcu-based locking dances that I reinforced with the first patch (call it "option 1") I shared in this sub-thread ;) > Maybe that doesn't matter. I think it is important to be able to > completely shut down the NFS server at any time. I think it is > important to be able to completely shutdown a network namespace at any > time. I am less concerned about being able to rmmod the nfsd module > after all obvious users have been disabled. Yes, and even the last case: if all obvious users have been disabled and unmounted then LOCALIO nfs client would have no cause to be holding a reference for nfsd. > So if others think that the improvements in code maintainability are > worth the loss of being able to rmmod nfsd without (potentially) having > to unmount all NFS filesystems, then I won't argue against it. But I > really would want it to be get/put of the module, not of some symbol. I can do that. > .... BTW you probably wanted to use symbol_get(), not symbol_request(). > The latter tries to load the module if it isn't already loaded. Using > symbol_get() does have the benefit that you don't need any locking dance > to prevent the module unloading while we get the ref. So if we really > want to go for less tricky locking that might be a justification - but > you don't need much locking for try_module_get()... Yes that was the entire reason I did it this way (call it "option 2", nicely avoids the bouncing of the rcu locking while putting nfsd_serv ref in nfs_open_local_fh's error path). Will look at switching to try_module_get(). Whichever way we go with fixing nfs_open_local_fh's narrow race with putting the nfsd_serv ref, thankfully this is a pretty minor point that we can quickly fix once Anna and Trond are comfortable with LOCALIO being merged. option 2's coarser nfs reference for nfsd via try_module_get would reduce think-time if/when we make LOCALIO changes in future. But I'd be fine with either fix. Thanks, Mike
> On Sep 7, 2024, at 11:17 AM, Mike Snitzer <snitzer@kernel.org> wrote: > > Rather than have general concern for LOCALIO doing something wrong, > we'd do well to make sure there is proper test coverage for required > shutdown sequences (completely indepent of LOCALIO, maybe that already > exists?). That is on the to-do list for the NFSD kdevops CI infrastructure, but unfortunately implementation has not been started yet. -- Chuck Lever
On Sat, Sep 07, 2024 at 04:09:33PM +0000, Chuck Lever III wrote: > > > On Sep 7, 2024, at 11:17 AM, Mike Snitzer <snitzer@kernel.org> wrote: > > > > Rather than have general concern for LOCALIO doing something wrong, > > we'd do well to make sure there is proper test coverage for required > > shutdown sequences (completely indepent of LOCALIO, maybe that already > > exists?). > > That is on the to-do list for the NFSD kdevops CI infrastructure, > but unfortunately implementation has not been started yet. Could be a good project for me to help with. I'm on the fence between kdevops and ktest, ideally I could come up with something that'd easily hook into both test harnesses. Supporting both would be simple if the new tests were added to a popular testsuite that both can run (e.g. xfstests, or any other separate nfs/nfsd testsuite you may have?). Or is "NFSD kdevops CI" itself what your tests be engineered with? I can contribute anywhere, would just like to kill multiple birds when doing so. Thanks, Mike
> On Sep 7, 2024, at 3:08 PM, Mike Snitzer <snitzer@kernel.org> wrote: > > On Sat, Sep 07, 2024 at 04:09:33PM +0000, Chuck Lever III wrote: >> >>> On Sep 7, 2024, at 11:17 AM, Mike Snitzer <snitzer@kernel.org> wrote: >>> >>> Rather than have general concern for LOCALIO doing something wrong, >>> we'd do well to make sure there is proper test coverage for required >>> shutdown sequences (completely indepent of LOCALIO, maybe that already >>> exists?). >> >> That is on the to-do list for the NFSD kdevops CI infrastructure, >> but unfortunately implementation has not been started yet. > > Could be a good project for me to help with. I'm on the fence between > kdevops and ktest, ideally I could come up with something that'd > easily hook into both test harnesses. > > Supporting both would be simple if the new tests were added to a > popular testsuite that both can run (e.g. xfstests, or any other > separate nfs/nfsd testsuite you may have?). Or is "NFSD kdevops CI" > itself what your tests be engineered with? kdevops is a CI framework; the individual tests are "workflows" that run under that framework. Source: https://github.com/linux-kdevops/kdevops Right now kdevops can run these tests (created elsewhere): - (x)fstests - the git regression suite - ltp - nfstests (from Jorge Borge) - pynfs ... in addition to the kernel self-tests, CXL-related tests, and a system reboot test, among others. We will have to develop something from scratch that is geared specifically towards NFSD on Linux. Probably the closest fit for unit-testing administrative commands on Linux is ltp: Source: https://github.com/linux-test-project/ltp Docs: https://linux-test-project.readthedocs.io/en/latest/ If ktest can run ltp, then new ltp tests could be inserted easily into both kdevops or ktest. Or the NFSD administrative tests might be added to the kernel's self-test suite or to Kunit; such tests would reside under tools/ in the kernel source tree. A third alternative would be to add the tests to the nfs-utils package, where Linux NFS user space tooling lives today; but I don't think there's a lot of test framework in that package right now. -- Chuck Lever
> On Sep 7, 2024, at 5:12 PM, Chuck Lever III <chuck.lever@oracle.com> wrote: > >> >> On Sep 7, 2024, at 3:08 PM, Mike Snitzer <snitzer@kernel.org> wrote: >> >> On Sat, Sep 07, 2024 at 04:09:33PM +0000, Chuck Lever III wrote: >>> >>>> On Sep 7, 2024, at 11:17 AM, Mike Snitzer <snitzer@kernel.org> wrote: >>>> >>>> Rather than have general concern for LOCALIO doing something wrong, >>>> we'd do well to make sure there is proper test coverage for required >>>> shutdown sequences (completely indepent of LOCALIO, maybe that already >>>> exists?). >>> >>> That is on the to-do list for the NFSD kdevops CI infrastructure, >>> but unfortunately implementation has not been started yet. >> >> Could be a good project for me to help with. I'm on the fence between >> kdevops and ktest, ideally I could come up with something that'd >> easily hook into both test harnesses. >> >> Supporting both would be simple if the new tests were added to a >> popular testsuite that both can run (e.g. xfstests, or any other >> separate nfs/nfsd testsuite you may have?). Or is "NFSD kdevops CI" >> itself what your tests be engineered with? > > kdevops is a CI framework; the individual tests are > "workflows" that run under that framework. > > Source: https://github.com/linux-kdevops/kdevops > > Right now kdevops can run these tests (created elsewhere): > > - (x)fstests > - the git regression suite > - ltp > - nfstests (from Jorge Borge) > - pynfs > > ... in addition to the kernel self-tests, CXL-related > tests, and a system reboot test, among others. > > We will have to develop something from scratch that is > geared specifically towards NFSD on Linux. Probably the > closest fit for unit-testing administrative commands on > Linux is ltp: > > Source: https://github.com/linux-test-project/ltp > Docs: https://linux-test-project.readthedocs.io/en/latest/ > > If ktest can run ltp, then new ltp tests could be inserted > easily into both kdevops or ktest. > > Or the NFSD administrative tests might be added to the > kernel's self-test suite or to Kunit; such tests would > reside under tools/ in the kernel source tree. > > A third alternative would be to add the tests to the > nfs-utils package, where Linux NFS user space tooling > lives today; but I don't think there's a lot of test > framework in that package right now. TL;DR: this "to-do" item is far enough down on the list that we haven't begun discussing a plan of action. A good starting place would be to prototype some test cases and then we can see where they fit into the ecosystem. (Or, post what you might already have now to begin the conversation). -- Chuck Lever
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index b8736a82e57c..18cbd3fa7691 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o +nfsd-$(CONFIG_NFS_LOCALIO) += localio.o diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 89ff380ec31e..348c1b97092e 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -52,7 +52,7 @@ #define NFSD_FILE_CACHE_UP (0) /* We only care about NFSD_MAY_READ/WRITE for this cache */ -#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO) static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c new file mode 100644 index 000000000000..75df709c6903 --- /dev/null +++ b/fs/nfsd/localio.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * NFS server support for local clients to bypass network stack + * + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com> + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com> + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com> + * Copyright (C) 2024 NeilBrown <neilb@suse.de> + */ + +#include <linux/exportfs.h> +#include <linux/sunrpc/svcauth.h> +#include <linux/sunrpc/clnt.h> +#include <linux/nfs.h> +#include <linux/nfs_common.h> +#include <linux/nfslocalio.h> +#include <linux/string.h> + +#include "nfsd.h" +#include "vfs.h" +#include "netns.h" +#include "filecache.h" + +static const struct nfsd_localio_operations nfsd_localio_ops = { + .nfsd_open_local_fh = nfsd_open_local_fh, + .nfsd_file_put_local = nfsd_file_put_local, + .nfsd_file_file = nfsd_file_file, +}; + +void nfsd_localio_ops_init(void) +{ + memcpy(&nfs_to, &nfsd_localio_ops, sizeof(nfsd_localio_ops)); +} + +/** + * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file + * + * @uuid: nfs_uuid_t which provides the 'struct net' to get the proper nfsd_net + * and the 'struct auth_domain' required for LOCALIO access + * @rpc_clnt: rpc_clnt that the client established, used for sockaddr and cred + * @cred: cred that the client established + * @nfs_fh: filehandle to lookup + * @fmode: fmode_t to use for open + * + * This function maps a local fh to a path on a local filesystem. + * This is useful when the nfs client has the local server mounted - it can + * avoid all the NFS overhead with reads, writes and commits. + * + * On successful return, returned nfsd_file will have its nf_net member + * set. Caller (NFS client) is responsible for calling nfsd_serv_put and + * nfsd_file_put (via nfs_to.nfsd_file_put_local). + */ +struct nfsd_file * +nfsd_open_local_fh(nfs_uuid_t *uuid, + struct rpc_clnt *rpc_clnt, const struct cred *cred, + const struct nfs_fh *nfs_fh, const fmode_t fmode) + __must_hold(rcu) +{ + int mayflags = NFSD_MAY_LOCALIO; + struct nfsd_net *nn = NULL; + struct net *net; + struct svc_cred rq_cred; + struct svc_fh fh; + struct nfsd_file *localio; + __be32 beres; + + if (nfs_fh->size > NFS4_FHSIZE) + return ERR_PTR(-EINVAL); + + /* + * Not running in nfsd context, so must safely get reference on nfsd_serv. + * But the server may already be shutting down, if so disallow new localio. + * uuid->net is NOT a counted reference, but caller's rcu_read_lock() ensures + * that if uuid->net is not NULL, then calling nfsd_serv_try_get() is safe + * and if it succeeds we will have an implied reference to the net. + */ + net = rcu_dereference(uuid->net); + if (net) + nn = net_generic(net, nfsd_net_id); + if (unlikely(!nn || !nfsd_serv_try_get(nn))) + return ERR_PTR(-ENXIO); + + /* Drop the rcu lock for nfsd_file_acquire_local() */ + rcu_read_unlock(); + + /* nfs_fh -> svc_fh */ + fh_init(&fh, NFS4_FHSIZE); + fh.fh_handle.fh_size = nfs_fh->size; + memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size); + + if (fmode & FMODE_READ) + mayflags |= NFSD_MAY_READ; + if (fmode & FMODE_WRITE) + mayflags |= NFSD_MAY_WRITE; + + svcauth_map_clnt_to_svc_cred_local(rpc_clnt, cred, &rq_cred); + + beres = nfsd_file_acquire_local(uuid->net, &rq_cred, uuid->dom, + &fh, mayflags, &localio); + if (beres) { + localio = ERR_PTR(nfs_stat_to_errno(be32_to_cpu(beres))); + nfsd_serv_put(nn); + } + + fh_put(&fh); + if (rq_cred.cr_group_info) + put_group_info(rq_cred.cr_group_info); + + rcu_read_lock(); + return localio; +} +EXPORT_SYMBOL_GPL(nfsd_open_local_fh); diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index e2d953f21dde..0fd31188a951 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -216,6 +216,10 @@ struct nfsd_net { /* last time an admin-revoke happened for NFSv4.0 */ time64_t nfs40_last_revoke; +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + /* Local clients to be invalidated when net is shut down */ + struct list_head local_clients; +#endif }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 64c1b4d649bc..3adbc05ebaac 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -18,6 +18,7 @@ #include <linux/sunrpc/svc.h> #include <linux/module.h> #include <linux/fsnotify.h> +#include <linux/nfslocalio.h> #include "idmap.h" #include "nfsd.h" @@ -2257,7 +2258,9 @@ static __net_init int nfsd_net_init(struct net *net) get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); nfsd_proc_stat_init(net); - +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + INIT_LIST_HEAD(&nn->local_clients); +#endif return 0; out_repcache_error: @@ -2268,6 +2271,22 @@ static __net_init int nfsd_net_init(struct net *net) return retval; } +#if IS_ENABLED(CONFIG_NFS_LOCALIO) +/** + * nfsd_net_pre_exit - Disconnect localio clients from net namespace + * @net: a network namespace that is about to be destroyed + * + * This invalidated ->net pointers held by localio clients + * while they can still safely access nn->counter. + */ +static __net_exit void nfsd_net_pre_exit(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfs_uuid_invalidate_clients(&nn->local_clients); +} +#endif + /** * nfsd_net_exit - Release the nfsd_net portion of a net namespace * @net: a network namespace that is about to be destroyed @@ -2285,6 +2304,9 @@ static __net_exit void nfsd_net_exit(struct net *net) static struct pernet_operations nfsd_net_ops = { .init = nfsd_net_init, +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + .pre_exit = nfsd_net_pre_exit, +#endif .exit = nfsd_net_exit, .id = &nfsd_net_id, .size = sizeof(struct nfsd_net), @@ -2322,6 +2344,7 @@ static int __init init_nfsd(void) retval = genl_register_family(&nfsd_nl_family); if (retval) goto out_free_all; + nfsd_localio_ops_init(); return 0; out_free_all: diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index d22027e23761..82bcefcd1f21 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -86,7 +86,8 @@ DEFINE_NFSD_XDR_ERR_EVENT(cant_encode); { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }, \ { NFSD_MAY_BYPASS_GSS, "BYPASS_GSS" }, \ { NFSD_MAY_READ_IF_EXEC, "READ_IF_EXEC" }, \ - { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" }) + { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" }, \ + { NFSD_MAY_LOCALIO, "LOCALIO" }) TRACE_EVENT(nfsd_compound, TP_PROTO( diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 01947561d375..3ff146522556 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -33,6 +33,8 @@ #define NFSD_MAY_64BIT_COOKIE 0x1000 /* 64 bit readdir cookies for >= NFSv3 */ +#define NFSD_MAY_LOCALIO 0x2000 /* for tracing, reflects when localio used */ + #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 62419c4bc8f1..61f2c781dd50 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -6,6 +6,8 @@ #ifndef __LINUX_NFSLOCALIO_H #define __LINUX_NFSLOCALIO_H +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + #include <linux/module.h> #include <linux/list.h> #include <linux/uuid.h> @@ -63,4 +65,10 @@ struct nfsd_localio_operations { extern void nfsd_localio_ops_init(void); extern struct nfsd_localio_operations nfs_to; +#else /* CONFIG_NFS_LOCALIO */ +static inline void nfsd_localio_ops_init(void) +{ +} +#endif /* CONFIG_NFS_LOCALIO */ + #endif /* __LINUX_NFSLOCALIO_H */