diff mbox series

[v9,13/19] nfsd: add "localio" support

Message ID 20240628211105.54736-14-snitzer@kernel.org (mailing list archive)
State New
Headers show
Series nfs/nfsd: add support for localio | expand

Commit Message

Mike Snitzer June 28, 2024, 9:10 p.m. UTC
Pass the stored cl_nfssvc_net from the client to the server as
first argument to nfsd_open_local_fh() to ensure the proper network
namespace is used for localio.

Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Lance Shelton <lance.shelton@hammerspace.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 fs/nfsd/Makefile    |   1 +
 fs/nfsd/filecache.c |   2 +-
 fs/nfsd/localio.c   | 239 ++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfssvc.c    |   1 +
 fs/nfsd/trace.h     |   3 +-
 fs/nfsd/vfs.h       |   9 ++
 6 files changed, 253 insertions(+), 2 deletions(-)
 create mode 100644 fs/nfsd/localio.c

Comments

Chuck Lever June 29, 2024, 10:18 p.m. UTC | #1
Sorry, I guess I expected to have more time to learn about these
patches before writing review comments. But if you want them to go
in soon, I had better look more closely at them now.


On Fri, Jun 28, 2024 at 05:10:59PM -0400, Mike Snitzer wrote:
> Pass the stored cl_nfssvc_net from the client to the server as

This is the only mention of cl_nfssvc_net I can find in this
patch. I'm not sure what it is. Patch description should maybe
provide some context.


> first argument to nfsd_open_local_fh() to ensure the proper network
> namespace is used for localio.

Can the patch description say something about the distinct mount 
namespaces -- if the local application is running in a different
container than the NFS server, are we using only the network
namespaces for authorizing the file access? And is that OK to do?
If yes, patch description should explain that NFS local I/O ignores
the boundaries of mount namespaces and why that is OK to do.


> Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
> Signed-off-by: Peng Tao <tao.peng@primarydata.com>
> Signed-off-by: Lance Shelton <lance.shelton@hammerspace.com>
> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> Signed-off-by: Mike Snitzer <snitzer@kernel.org>
> ---
>  fs/nfsd/Makefile    |   1 +
>  fs/nfsd/filecache.c |   2 +-
>  fs/nfsd/localio.c   | 239 ++++++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfssvc.c    |   1 +
>  fs/nfsd/trace.h     |   3 +-
>  fs/nfsd/vfs.h       |   9 ++
>  6 files changed, 253 insertions(+), 2 deletions(-)
>  create mode 100644 fs/nfsd/localio.c
> 
> diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
> index b8736a82e57c..78b421778a79 100644
> --- a/fs/nfsd/Makefile
> +++ b/fs/nfsd/Makefile
> @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
>  nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
>  nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
>  nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
> +nfsd-$(CONFIG_NFSD_LOCALIO) += localio.o
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index ad9083ca144b..99631fa56662 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -52,7 +52,7 @@
>  #define NFSD_FILE_CACHE_UP		     (0)
>  
>  /* We only care about NFSD_MAY_READ/WRITE for this cache */
> -#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE)
> +#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO)
>  
>  static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
>  static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
> diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> new file mode 100644
> index 000000000000..759a5cb79652
> --- /dev/null
> +++ b/fs/nfsd/localio.c
> @@ -0,0 +1,239 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * NFS server support for local clients to bypass network stack
> + *
> + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com>
> + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com>
> + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
> + */
> +
> +#include <linux/exportfs.h>
> +#include <linux/sunrpc/svcauth_gss.h>
> +#include <linux/sunrpc/clnt.h>
> +#include <linux/nfs.h>
> +#include <linux/string.h>
> +
> +#include "nfsd.h"
> +#include "vfs.h"
> +#include "netns.h"
> +#include "filecache.h"
> +
> +#define NFSDDBG_FACILITY		NFSDDBG_FH

With no more dprintk() call sites in this patch, you no longer need
this macro definition.


> +/*
> + * We need to translate between nfs status return values and
> + * the local errno values which may not be the same.
> + * - duplicated from fs/nfs/nfs2xdr.c to avoid needless bloat of
> + *   all compiled nfs objects if it were in include/linux/nfs.h
> + */
> +static const struct {
> +	int stat;
> +	int errno;
> +} nfs_common_errtbl[] = {
> +	{ NFS_OK,		0		},
> +	{ NFSERR_PERM,		-EPERM		},
> +	{ NFSERR_NOENT,		-ENOENT		},
> +	{ NFSERR_IO,		-EIO		},
> +	{ NFSERR_NXIO,		-ENXIO		},
> +/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */
> +	{ NFSERR_ACCES,		-EACCES		},
> +	{ NFSERR_EXIST,		-EEXIST		},
> +	{ NFSERR_XDEV,		-EXDEV		},
> +	{ NFSERR_NODEV,		-ENODEV		},
> +	{ NFSERR_NOTDIR,	-ENOTDIR	},
> +	{ NFSERR_ISDIR,		-EISDIR		},
> +	{ NFSERR_INVAL,		-EINVAL		},
> +	{ NFSERR_FBIG,		-EFBIG		},
> +	{ NFSERR_NOSPC,		-ENOSPC		},
> +	{ NFSERR_ROFS,		-EROFS		},
> +	{ NFSERR_MLINK,		-EMLINK		},
> +	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	},
> +	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	},
> +	{ NFSERR_DQUOT,		-EDQUOT		},
> +	{ NFSERR_STALE,		-ESTALE		},
> +	{ NFSERR_REMOTE,	-EREMOTE	},
> +#ifdef EWFLUSH
> +	{ NFSERR_WFLUSH,	-EWFLUSH	},
> +#endif
> +	{ NFSERR_BADHANDLE,	-EBADHANDLE	},
> +	{ NFSERR_NOT_SYNC,	-ENOTSYNC	},
> +	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
> +	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
> +	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
> +	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
> +	{ NFSERR_BADTYPE,	-EBADTYPE	},
> +	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
> +	{ -1,			-EIO		}
> +};
> +
> +/**
> + * nfs_stat_to_errno - convert an NFS status code to a local errno
> + * @status: NFS status code to convert
> + *
> + * Returns a local errno value, or -EIO if the NFS status code is
> + * not recognized.  nfsd_file_acquire() returns an nfsstat that
> + * needs to be translated to an errno before being returned to a
> + * local client application.
> + */
> +static int nfs_stat_to_errno(enum nfs_stat status)
> +{
> +	int i;
> +
> +	for (i = 0; nfs_common_errtbl[i].stat != -1; i++) {
> +		if (nfs_common_errtbl[i].stat == (int)status)
> +			return nfs_common_errtbl[i].errno;
> +	}
> +	return nfs_common_errtbl[i].errno;
> +}
> +
> +static void
> +nfsd_local_fakerqst_destroy(struct svc_rqst *rqstp)
> +{
> +	if (rqstp->rq_client)
> +		auth_domain_put(rqstp->rq_client);
> +	if (rqstp->rq_cred.cr_group_info)
> +		put_group_info(rqstp->rq_cred.cr_group_info);
> +	/* rpcauth_map_to_svc_cred_local() clears cr_principal */
> +	WARN_ON_ONCE(rqstp->rq_cred.cr_principal != NULL);
> +	kfree(rqstp->rq_xprt);
> +	kfree(rqstp);
> +}
> +
> +static struct svc_rqst *
> +nfsd_local_fakerqst_create(struct net *net, struct rpc_clnt *rpc_clnt,
> +			const struct cred *cred)
> +{
> +	struct svc_rqst *rqstp;
> +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +	int status;
> +
> +	/* FIXME: not running in nfsd context, must get reference on nfsd_serv */
> +	if (unlikely(!READ_ONCE(nn->nfsd_serv)))
> +		return ERR_PTR(-ENXIO);
> +
> +	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
> +	if (!rqstp)
> +		return ERR_PTR(-ENOMEM);
> +
> +	rqstp->rq_xprt = kzalloc(sizeof(*rqstp->rq_xprt), GFP_KERNEL);
> +	if (!rqstp->rq_xprt) {
> +		status = -ENOMEM;
> +		goto out_err;
> +	}

struct svc_rqst is pretty big (like, bigger than a couple of pages).
What happens if this allocation fails?

And how often does it occur -- does that add significant overhead?


> +
> +	rqstp->rq_xprt->xpt_net = net;
> +	__set_bit(RQ_SECURE, &rqstp->rq_flags);
> +	rqstp->rq_proc = 1;
> +	rqstp->rq_vers = 3;

IMO these need to be symbolic constants, not integers. Or, at least
there needs to be some documenting comments that explain these are
fake and why that's OK to do. Or, are there better choices?


> +	rqstp->rq_prot = IPPROTO_TCP;
> +	rqstp->rq_server = nn->nfsd_serv;
> +
> +	/* Note: we're connecting to ourself, so source addr == peer addr */
> +	rqstp->rq_addrlen = rpc_peeraddr(rpc_clnt,
> +			(struct sockaddr *)&rqstp->rq_addr,
> +			sizeof(rqstp->rq_addr));
> +
> +	rpcauth_map_to_svc_cred_local(rpc_clnt->cl_auth, cred, &rqstp->rq_cred);
> +
> +	/*
> +	 * set up enough for svcauth_unix_set_client to be able to wait
> +	 * for the cache downcall. Note that we do _not_ want to allow the
> +	 * request to be deferred for later revisit since this rqst and xprt
> +	 * are not set up to run inside of the normal svc_rqst engine.
> +	 */
> +	INIT_LIST_HEAD(&rqstp->rq_xprt->xpt_deferred);
> +	kref_init(&rqstp->rq_xprt->xpt_ref);
> +	spin_lock_init(&rqstp->rq_xprt->xpt_lock);
> +	rqstp->rq_chandle.thread_wait = 5 * HZ;
> +
> +	status = svcauth_unix_set_client(rqstp);
> +	switch (status) {
> +	case SVC_OK:
> +		break;
> +	case SVC_DENIED:
> +		status = -ENXIO;
> +		goto out_err;
> +	default:
> +		status = -ETIMEDOUT;
> +		goto out_err;
> +	}

Interesting. Why would svcauth_unix_set_client fail for a local I/O
request? Wouldn't it only be because the local application is trying
to open a file it doesn't have permission to?


> +	return rqstp;
> +
> +out_err:
> +	nfsd_local_fakerqst_destroy(rqstp);
> +	return ERR_PTR(status);
> +}
> +
> +/*
> + * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to @file
> + *
> + * This function maps a local fh to a path on a local filesystem.
> + * This is useful when the nfs client has the local server mounted - it can
> + * avoid all the NFS overhead with reads, writes and commits.

Hm. It just occurred to me that there won't be a two-phase commit
here, and possibly no flush-on-close, either? Can someone help
explain whether/how the writeback semantics are different for NFS
local I/O?


> + *
> + * on successful return, caller is responsible for calling path_put. Also
> + * note that this is called from nfs.ko via find_symbol() to avoid an explicit
> + * dependency on knfsd. So, there is no forward declaration in a header file
> + * for it.

Yet I see a declaration added below in fs/nfsd/vfs.h. Is this
comment out of date? Or perhaps you mean there's no declaration
that is shared with the client code?


> + */
> +int nfsd_open_local_fh(struct net *net,

I've been asking that new NFSD code use genuine full-blooded kdoc
comments for new functions. Since this is a global (EXPORTED)
function, please make this a genuine kdoc comment.


> +			 struct rpc_clnt *rpc_clnt,
> +			 const struct cred *cred,
> +			 const struct nfs_fh *nfs_fh,
> +			 const fmode_t fmode,
> +			 struct file **pfilp)
> +{
> +	const struct cred *save_cred;
> +	struct svc_rqst *rqstp;
> +	struct svc_fh fh;
> +	struct nfsd_file *nf;
> +	int status = 0;
> +	int mayflags = NFSD_MAY_LOCALIO;
> +	__be32 beres;

Nit: I've been asking that new NFSD code use reverse-christmas tree
format for variable declarations.


> +
> +	/* Save creds before calling into nfsd */
> +	save_cred = get_current_cred();
> +
> +	rqstp = nfsd_local_fakerqst_create(net, rpc_clnt, cred);
> +	if (IS_ERR(rqstp)) {
> +		status = PTR_ERR(rqstp);
> +		goto out_revertcred;
> +	}

It might be nicer if you had a small pool of svc threads pre-created
for this purpose instead of manufacturing one of these and then
tearing it down for every local open call.

Maybe even better if you had an internal transport on which to queue
these open requests... because this is all pretty bespoke.


> +
> +	/* nfs_fh -> svc_fh */
> +	if (nfs_fh->size > NFS4_FHSIZE) {
> +		status = -EINVAL;
> +		goto out;
> +	}
> +	fh_init(&fh, NFS4_FHSIZE);
> +	fh.fh_handle.fh_size = nfs_fh->size;
> +	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
> +
> +	if (fmode & FMODE_READ)
> +		mayflags |= NFSD_MAY_READ;
> +	if (fmode & FMODE_WRITE)
> +		mayflags |= NFSD_MAY_WRITE;
> +
> +	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> +	if (beres) {
> +		status = nfs_stat_to_errno(be32_to_cpu(beres));
> +		goto out_fh_put;
> +	}

So I'm wondering whether just calling fh_verify() and then
nfsd_open_verified() would be simpler and/or good enough here. Is
there a strong reason to use the file cache for locally opened
files? Jeff, any thoughts? Will there be writeback ramifications for
doing this? Maybe we need a comment here explaining how these files
are garbage collected (just an fput by the local I/O client, I would
guess).


> +
> +	*pfilp = get_file(nf->nf_file);
> +
> +	nfsd_file_put(nf);
> +out_fh_put:
> +	fh_put(&fh);
> +
> +out:
> +	nfsd_local_fakerqst_destroy(rqstp);
> +out_revertcred:
> +	revert_creds(save_cred);
> +	return status;
> +}
> +EXPORT_SYMBOL_GPL(nfsd_open_local_fh);
> +
> +/* Compile time type checking, not used by anything */
> +static nfs_to_nfsd_open_t __maybe_unused nfsd_open_local_fh_typecheck = nfsd_open_local_fh;
> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> index 1222a0a33fe1..a477d2c5088a 100644
> --- a/fs/nfsd/nfssvc.c
> +++ b/fs/nfsd/nfssvc.c
> @@ -431,6 +431,7 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred)
>  #endif
>  #if IS_ENABLED(CONFIG_NFSD_LOCALIO)
>  	INIT_LIST_HEAD(&nn->nfsd_uuid.list);
> +	nn->nfsd_uuid.net = net;
>  	list_add_tail_rcu(&nn->nfsd_uuid.list, &nfsd_uuids);
>  #endif
>  	nn->nfsd_net_up = true;
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index 77bbd23aa150..9c0610fdd11c 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -86,7 +86,8 @@ DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
>  		{ NFSD_MAY_NOT_BREAK_LEASE,	"NOT_BREAK_LEASE" },	\
>  		{ NFSD_MAY_BYPASS_GSS,		"BYPASS_GSS" },		\
>  		{ NFSD_MAY_READ_IF_EXEC,	"READ_IF_EXEC" },	\
> -		{ NFSD_MAY_64BIT_COOKIE,	"64BIT_COOKIE" })
> +		{ NFSD_MAY_64BIT_COOKIE,	"64BIT_COOKIE" },	\
> +		{ NFSD_MAY_LOCALIO,		"LOCALIO" })
>  
>  TRACE_EVENT(nfsd_compound,
>  	TP_PROTO(
> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> index 57cd70062048..5146f0c81752 100644
> --- a/fs/nfsd/vfs.h
> +++ b/fs/nfsd/vfs.h
> @@ -33,6 +33,8 @@
>  
>  #define NFSD_MAY_64BIT_COOKIE		0x1000 /* 64 bit readdir cookies for >= NFSv3 */
>  
> +#define NFSD_MAY_LOCALIO		0x2000
> +
>  #define NFSD_MAY_CREATE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE)
>  #define NFSD_MAY_REMOVE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
>  
> @@ -158,6 +160,13 @@ __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
>  
>  void		nfsd_filp_close(struct file *fp);
>  
> +int		nfsd_open_local_fh(struct net *net,
> +				   struct rpc_clnt *rpc_clnt,
> +				   const struct cred *cred,
> +				   const struct nfs_fh *nfs_fh,
> +				   const fmode_t fmode,
> +				   struct file **pfilp);
> +
>  static inline int fh_want_write(struct svc_fh *fh)
>  {
>  	int ret;
> -- 
> 2.44.0
> 
>
Chuck Lever June 30, 2024, 2:49 p.m. UTC | #2
On Sat, Jun 29, 2024 at 06:18:42PM -0400, Chuck Lever wrote:
> > +
> > +	/* nfs_fh -> svc_fh */
> > +	if (nfs_fh->size > NFS4_FHSIZE) {
> > +		status = -EINVAL;
> > +		goto out;
> > +	}
> > +	fh_init(&fh, NFS4_FHSIZE);
> > +	fh.fh_handle.fh_size = nfs_fh->size;
> > +	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
> > +
> > +	if (fmode & FMODE_READ)
> > +		mayflags |= NFSD_MAY_READ;
> > +	if (fmode & FMODE_WRITE)
> > +		mayflags |= NFSD_MAY_WRITE;
> > +
> > +	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > +	if (beres) {
> > +		status = nfs_stat_to_errno(be32_to_cpu(beres));
> > +		goto out_fh_put;
> > +	}
> 
> So I'm wondering whether just calling fh_verify() and then
> nfsd_open_verified() would be simpler and/or good enough here. Is
> there a strong reason to use the file cache for locally opened
> files? Jeff, any thoughts?

> Will there be writeback ramifications for
> doing this? Maybe we need a comment here explaining how these files
> are garbage collected (just an fput by the local I/O client, I would
> guess).

OK, going back to this: Since right here we immediately call 

	nfsd_file_put(nf);

There are no writeback ramifications nor any need to comment about
garbage collection. But this still seems like a lot of (possibly
unnecessary) overhead for simply obtaining a struct file.


> > +
> > +	*pfilp = get_file(nf->nf_file);
> > +
> > +	nfsd_file_put(nf);
> > +out_fh_put:
> > +	fh_put(&fh);
> > +
> > +out:
> > +	nfsd_local_fakerqst_destroy(rqstp);
> > +out_revertcred:
> > +	revert_creds(save_cred);
> > +	return status;
> > +}
> > +EXPORT_SYMBOL_GPL(nfsd_open_local_fh);
Mike Snitzer June 30, 2024, 7:44 p.m. UTC | #3
On Sun, Jun 30, 2024 at 10:49:51AM -0400, Chuck Lever wrote:
> On Sat, Jun 29, 2024 at 06:18:42PM -0400, Chuck Lever wrote:
> > > +
> > > +	/* nfs_fh -> svc_fh */
> > > +	if (nfs_fh->size > NFS4_FHSIZE) {
> > > +		status = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +	fh_init(&fh, NFS4_FHSIZE);
> > > +	fh.fh_handle.fh_size = nfs_fh->size;
> > > +	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
> > > +
> > > +	if (fmode & FMODE_READ)
> > > +		mayflags |= NFSD_MAY_READ;
> > > +	if (fmode & FMODE_WRITE)
> > > +		mayflags |= NFSD_MAY_WRITE;
> > > +
> > > +	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > +	if (beres) {
> > > +		status = nfs_stat_to_errno(be32_to_cpu(beres));
> > > +		goto out_fh_put;
> > > +	}
> > 
> > So I'm wondering whether just calling fh_verify() and then
> > nfsd_open_verified() would be simpler and/or good enough here. Is
> > there a strong reason to use the file cache for locally opened
> > files? Jeff, any thoughts?
> 
> > Will there be writeback ramifications for
> > doing this? Maybe we need a comment here explaining how these files
> > are garbage collected (just an fput by the local I/O client, I would
> > guess).
> 
> OK, going back to this: Since right here we immediately call 
> 
> 	nfsd_file_put(nf);
> 
> There are no writeback ramifications nor any need to comment about
> garbage collection. But this still seems like a lot of (possibly
> unnecessary) overhead for simply obtaining a struct file.

Easy enough change, probably best to avoid the filecache but would like
to verify with Jeff before switching:

diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
index 1d6508aa931e..85ebf63789fb 100644
--- a/fs/nfsd/localio.c
+++ b/fs/nfsd/localio.c
@@ -197,7 +197,6 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
        const struct cred *save_cred;
        struct svc_rqst *rqstp;
        struct svc_fh fh;
-       struct nfsd_file *nf;
        __be32 beres;

        if (nfs_fh->size > NFS4_FHSIZE)
@@ -235,13 +234,12 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
        if (fmode & FMODE_WRITE)
                mayflags |= NFSD_MAY_WRITE;

-       beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
+       beres = fh_verify(rqstp, &fh, S_IFREG, mayflags);
        if (beres) {
                status = nfs_stat_to_errno(be32_to_cpu(beres));
                goto out_fh_put;
        }
-       *pfilp = get_file(nf->nf_file);
-       nfsd_file_put(nf);
+       status = nfsd_open_verified(rqstp, &fh, mayflags, pfilp);
 out_fh_put:
        fh_put(&fh);
        nfsd_local_fakerqst_destroy(rqstp);

Mike
Jeff Layton June 30, 2024, 7:51 p.m. UTC | #4
On Sat, 2024-06-29 at 18:18 -0400, Chuck Lever wrote:
> Sorry, I guess I expected to have more time to learn about these
> patches before writing review comments. But if you want them to go
> in soon, I had better look more closely at them now.
> 
> 
> On Fri, Jun 28, 2024 at 05:10:59PM -0400, Mike Snitzer wrote:
> > Pass the stored cl_nfssvc_net from the client to the server as
> 
> This is the only mention of cl_nfssvc_net I can find in this
> patch. I'm not sure what it is. Patch description should maybe
> provide some context.
> 
> 
> > first argument to nfsd_open_local_fh() to ensure the proper network
> > namespace is used for localio.
> 
> Can the patch description say something about the distinct mount 
> namespaces -- if the local application is running in a different
> container than the NFS server, are we using only the network
> namespaces for authorizing the file access? And is that OK to do?
> If yes, patch description should explain that NFS local I/O ignores
> the boundaries of mount namespaces and why that is OK to do.
> 
> 
> > Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
> > Signed-off-by: Peng Tao <tao.peng@primarydata.com>
> > Signed-off-by: Lance Shelton <lance.shelton@hammerspace.com>
> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > Signed-off-by: Mike Snitzer <snitzer@kernel.org>
> > ---
> >  fs/nfsd/Makefile    |   1 +
> >  fs/nfsd/filecache.c |   2 +-
> >  fs/nfsd/localio.c   | 239 ++++++++++++++++++++++++++++++++++++++++++++
> >  fs/nfsd/nfssvc.c    |   1 +
> >  fs/nfsd/trace.h     |   3 +-
> >  fs/nfsd/vfs.h       |   9 ++
> >  6 files changed, 253 insertions(+), 2 deletions(-)
> >  create mode 100644 fs/nfsd/localio.c
> > 
> > diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
> > index b8736a82e57c..78b421778a79 100644
> > --- a/fs/nfsd/Makefile
> > +++ b/fs/nfsd/Makefile
> > @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
> >  nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
> >  nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
> >  nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
> > +nfsd-$(CONFIG_NFSD_LOCALIO) += localio.o
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index ad9083ca144b..99631fa56662 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -52,7 +52,7 @@
> >  #define NFSD_FILE_CACHE_UP		     (0)
> >  
> >  /* We only care about NFSD_MAY_READ/WRITE for this cache */
> > -#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE)
> > +#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO)
> >  
> >  static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
> >  static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
> > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> > new file mode 100644
> > index 000000000000..759a5cb79652
> > --- /dev/null
> > +++ b/fs/nfsd/localio.c
> > @@ -0,0 +1,239 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * NFS server support for local clients to bypass network stack
> > + *
> > + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com>
> > + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com>
> > + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
> > + */
> > +
> > +#include <linux/exportfs.h>
> > +#include <linux/sunrpc/svcauth_gss.h>
> > +#include <linux/sunrpc/clnt.h>
> > +#include <linux/nfs.h>
> > +#include <linux/string.h>
> > +
> > +#include "nfsd.h"
> > +#include "vfs.h"
> > +#include "netns.h"
> > +#include "filecache.h"
> > +
> > +#define NFSDDBG_FACILITY		NFSDDBG_FH
> 
> With no more dprintk() call sites in this patch, you no longer need
> this macro definition.
> 
> 
> > +/*
> > + * We need to translate between nfs status return values and
> > + * the local errno values which may not be the same.
> > + * - duplicated from fs/nfs/nfs2xdr.c to avoid needless bloat of
> > + *   all compiled nfs objects if it were in include/linux/nfs.h
> > + */
> > +static const struct {
> > +	int stat;
> > +	int errno;
> > +} nfs_common_errtbl[] = {
> > +	{ NFS_OK,		0		},
> > +	{ NFSERR_PERM,		-EPERM		},
> > +	{ NFSERR_NOENT,		-ENOENT		},
> > +	{ NFSERR_IO,		-EIO		},
> > +	{ NFSERR_NXIO,		-ENXIO		},
> > +/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */
> > +	{ NFSERR_ACCES,		-EACCES		},
> > +	{ NFSERR_EXIST,		-EEXIST		},
> > +	{ NFSERR_XDEV,		-EXDEV		},
> > +	{ NFSERR_NODEV,		-ENODEV		},
> > +	{ NFSERR_NOTDIR,	-ENOTDIR	},
> > +	{ NFSERR_ISDIR,		-EISDIR		},
> > +	{ NFSERR_INVAL,		-EINVAL		},
> > +	{ NFSERR_FBIG,		-EFBIG		},
> > +	{ NFSERR_NOSPC,		-ENOSPC		},
> > +	{ NFSERR_ROFS,		-EROFS		},
> > +	{ NFSERR_MLINK,		-EMLINK		},
> > +	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	},
> > +	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	},
> > +	{ NFSERR_DQUOT,		-EDQUOT		},
> > +	{ NFSERR_STALE,		-ESTALE		},
> > +	{ NFSERR_REMOTE,	-EREMOTE	},
> > +#ifdef EWFLUSH
> > +	{ NFSERR_WFLUSH,	-EWFLUSH	},
> > +#endif
> > +	{ NFSERR_BADHANDLE,	-EBADHANDLE	},
> > +	{ NFSERR_NOT_SYNC,	-ENOTSYNC	},
> > +	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
> > +	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
> > +	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
> > +	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
> > +	{ NFSERR_BADTYPE,	-EBADTYPE	},
> > +	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
> > +	{ -1,			-EIO		}
> > +};
> > +
> > +/**
> > + * nfs_stat_to_errno - convert an NFS status code to a local errno
> > + * @status: NFS status code to convert
> > + *
> > + * Returns a local errno value, or -EIO if the NFS status code is
> > + * not recognized.  nfsd_file_acquire() returns an nfsstat that
> > + * needs to be translated to an errno before being returned to a
> > + * local client application.
> > + */
> > +static int nfs_stat_to_errno(enum nfs_stat status)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; nfs_common_errtbl[i].stat != -1; i++) {
> > +		if (nfs_common_errtbl[i].stat == (int)status)
> > +			return nfs_common_errtbl[i].errno;
> > +	}
> > +	return nfs_common_errtbl[i].errno;
> > +}
> > +
> > +static void
> > +nfsd_local_fakerqst_destroy(struct svc_rqst *rqstp)
> > +{
> > +	if (rqstp->rq_client)
> > +		auth_domain_put(rqstp->rq_client);
> > +	if (rqstp->rq_cred.cr_group_info)
> > +		put_group_info(rqstp->rq_cred.cr_group_info);
> > +	/* rpcauth_map_to_svc_cred_local() clears cr_principal */
> > +	WARN_ON_ONCE(rqstp->rq_cred.cr_principal != NULL);
> > +	kfree(rqstp->rq_xprt);
> > +	kfree(rqstp);
> > +}
> > +
> > +static struct svc_rqst *
> > +nfsd_local_fakerqst_create(struct net *net, struct rpc_clnt *rpc_clnt,
> > +			const struct cred *cred)
> > +{
> > +	struct svc_rqst *rqstp;
> > +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +	int status;
> > +
> > +	/* FIXME: not running in nfsd context, must get reference on nfsd_serv */
> > +	if (unlikely(!READ_ONCE(nn->nfsd_serv)))
> > +		return ERR_PTR(-ENXIO);
> > +
> > +	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
> > +	if (!rqstp)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	rqstp->rq_xprt = kzalloc(sizeof(*rqstp->rq_xprt), GFP_KERNEL);
> > +	if (!rqstp->rq_xprt) {
> > +		status = -ENOMEM;
> > +		goto out_err;
> > +	}
> 
> struct svc_rqst is pretty big (like, bigger than a couple of pages).
> What happens if this allocation fails?
> 
> And how often does it occur -- does that add significant overhead?
> 
> 
> > +
> > +	rqstp->rq_xprt->xpt_net = net;
> > +	__set_bit(RQ_SECURE, &rqstp->rq_flags);
> > +	rqstp->rq_proc = 1;
> > +	rqstp->rq_vers = 3;
> 
> IMO these need to be symbolic constants, not integers. Or, at least
> there needs to be some documenting comments that explain these are
> fake and why that's OK to do. Or, are there better choices?
> 
> 
> > +	rqstp->rq_prot = IPPROTO_TCP;
> > +	rqstp->rq_server = nn->nfsd_serv;
> > +
> > +	/* Note: we're connecting to ourself, so source addr == peer addr */
> > +	rqstp->rq_addrlen = rpc_peeraddr(rpc_clnt,
> > +			(struct sockaddr *)&rqstp->rq_addr,
> > +			sizeof(rqstp->rq_addr));
> > +
> > +	rpcauth_map_to_svc_cred_local(rpc_clnt->cl_auth, cred, &rqstp->rq_cred);
> > +
> > +	/*
> > +	 * set up enough for svcauth_unix_set_client to be able to wait
> > +	 * for the cache downcall. Note that we do _not_ want to allow the
> > +	 * request to be deferred for later revisit since this rqst and xprt
> > +	 * are not set up to run inside of the normal svc_rqst engine.
> > +	 */
> > +	INIT_LIST_HEAD(&rqstp->rq_xprt->xpt_deferred);
> > +	kref_init(&rqstp->rq_xprt->xpt_ref);
> > +	spin_lock_init(&rqstp->rq_xprt->xpt_lock);
> > +	rqstp->rq_chandle.thread_wait = 5 * HZ;
> > +
> > +	status = svcauth_unix_set_client(rqstp);
> > +	switch (status) {
> > +	case SVC_OK:
> > +		break;
> > +	case SVC_DENIED:
> > +		status = -ENXIO;
> > +		goto out_err;
> > +	default:
> > +		status = -ETIMEDOUT;
> > +		goto out_err;
> > +	}
> 
> Interesting. Why would svcauth_unix_set_client fail for a local I/O
> request? Wouldn't it only be because the local application is trying
> to open a file it doesn't have permission to?
> 

I'd think so, yes. This case is not exactly like doing I/O on a local
fs since we don't have a persistent open file. nfsd has to do the
permission check on every READ/WRITE op, and I think we have to follow
suit here, particularly in the case of flexfiles DS traffic, since
fencing requires it.

> 
> > +	return rqstp;
> > +
> > +out_err:
> > +	nfsd_local_fakerqst_destroy(rqstp);
> > +	return ERR_PTR(status);
> > +}
> > +
> > +/*
> > + * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to @file
> > + *
> > + * This function maps a local fh to a path on a local filesystem.
> > + * This is useful when the nfs client has the local server mounted - it can
> > + * avoid all the NFS overhead with reads, writes and commits.
> 
> Hm. It just occurred to me that there won't be a two-phase commit
> here, and possibly no flush-on-close, either? Can someone help
> explain whether/how the writeback semantics are different for NFS
> local I/O?
> 

A localio request is just a replacement for a READ or WRITE RPC. We
don't have flush on close or a COMMIT in the WRITE RPC case either. The
client has to follow up with a COMMIT RPC (or the localio equivalent).

> 
> > + *
> > + * on successful return, caller is responsible for calling path_put. Also
> > + * note that this is called from nfs.ko via find_symbol() to avoid an explicit
> > + * dependency on knfsd. So, there is no forward declaration in a header file
> > + * for it.
> 
> Yet I see a declaration added below in fs/nfsd/vfs.h. Is this
> comment out of date? Or perhaps you mean there's no declaration
> that is shared with the client code?
> 
> 
> > + */
> > +int nfsd_open_local_fh(struct net *net,
> 
> I've been asking that new NFSD code use genuine full-blooded kdoc
> comments for new functions. Since this is a global (EXPORTED)
> function, please make this a genuine kdoc comment.
> 
> 
> > +			 struct rpc_clnt *rpc_clnt,
> > +			 const struct cred *cred,
> > +			 const struct nfs_fh *nfs_fh,
> > +			 const fmode_t fmode,
> > +			 struct file **pfilp)
> > +{
> > +	const struct cred *save_cred;
> > +	struct svc_rqst *rqstp;
> > +	struct svc_fh fh;
> > +	struct nfsd_file *nf;
> > +	int status = 0;
> > +	int mayflags = NFSD_MAY_LOCALIO;
> > +	__be32 beres;
> 
> Nit: I've been asking that new NFSD code use reverse-christmas tree
> format for variable declarations.
> 
> 
> > +
> > +	/* Save creds before calling into nfsd */
> > +	save_cred = get_current_cred();
> > +
> > +	rqstp = nfsd_local_fakerqst_create(net, rpc_clnt, cred);
> > +	if (IS_ERR(rqstp)) {
> > +		status = PTR_ERR(rqstp);
> > +		goto out_revertcred;
> > +	}
> 
> It might be nicer if you had a small pool of svc threads pre-created
> for this purpose instead of manufacturing one of these and then
> tearing it down for every local open call.
> 
> Maybe even better if you had an internal transport on which to queue
> these open requests... because this is all pretty bespoke.
> 
> 
> > +
> > +	/* nfs_fh -> svc_fh */
> > +	if (nfs_fh->size > NFS4_FHSIZE) {
> > +		status = -EINVAL;
> > +		goto out;
> > +	}
> > +	fh_init(&fh, NFS4_FHSIZE);
> > +	fh.fh_handle.fh_size = nfs_fh->size;
> > +	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
> > +
> > +	if (fmode & FMODE_READ)
> > +		mayflags |= NFSD_MAY_READ;
> > +	if (fmode & FMODE_WRITE)
> > +		mayflags |= NFSD_MAY_WRITE;
> > +
> > +	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > +	if (beres) {
> > +		status = nfs_stat_to_errno(be32_to_cpu(beres));
> > +		goto out_fh_put;
> > +	}
> 
> So I'm wondering whether just calling fh_verify() and then
> nfsd_open_verified() would be simpler and/or good enough here. Is
> there a strong reason to use the file cache for locally opened
> files? Jeff, any thoughts? Will there be writeback ramifications for
> doing this? Maybe we need a comment here explaining how these files
> are garbage collected (just an fput by the local I/O client, I would
> guess).
> 

Yes, I think you do want to use the filecache here. The whole point of
the filecache is to optimize away the open and close in v3 calls.
Optimizing those away in the context of a localio op seems even more
valuable, since you're not dealing with network latency.

> 
> > +
> > +	*pfilp = get_file(nf->nf_file);
> > +
> > +	nfsd_file_put(nf);
> > +out_fh_put:
> > +	fh_put(&fh);
> > +
> > +out:
> > +	nfsd_local_fakerqst_destroy(rqstp);
> > +out_revertcred:
> > +	revert_creds(save_cred);
> > +	return status;
> > +}
> > +EXPORT_SYMBOL_GPL(nfsd_open_local_fh);
> > +
> > +/* Compile time type checking, not used by anything */
> > +static nfs_to_nfsd_open_t __maybe_unused nfsd_open_local_fh_typecheck = nfsd_open_local_fh;
> > diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> > index 1222a0a33fe1..a477d2c5088a 100644
> > --- a/fs/nfsd/nfssvc.c
> > +++ b/fs/nfsd/nfssvc.c
> > @@ -431,6 +431,7 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred)
> >  #endif
> >  #if IS_ENABLED(CONFIG_NFSD_LOCALIO)
> >  	INIT_LIST_HEAD(&nn->nfsd_uuid.list);
> > +	nn->nfsd_uuid.net = net;
> >  	list_add_tail_rcu(&nn->nfsd_uuid.list, &nfsd_uuids);
> >  #endif
> >  	nn->nfsd_net_up = true;
> > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > index 77bbd23aa150..9c0610fdd11c 100644
> > --- a/fs/nfsd/trace.h
> > +++ b/fs/nfsd/trace.h
> > @@ -86,7 +86,8 @@ DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
> >  		{ NFSD_MAY_NOT_BREAK_LEASE,	"NOT_BREAK_LEASE" },	\
> >  		{ NFSD_MAY_BYPASS_GSS,		"BYPASS_GSS" },		\
> >  		{ NFSD_MAY_READ_IF_EXEC,	"READ_IF_EXEC" },	\
> > -		{ NFSD_MAY_64BIT_COOKIE,	"64BIT_COOKIE" })
> > +		{ NFSD_MAY_64BIT_COOKIE,	"64BIT_COOKIE" },	\
> > +		{ NFSD_MAY_LOCALIO,		"LOCALIO" })
> >  
> >  TRACE_EVENT(nfsd_compound,
> >  	TP_PROTO(
> > diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> > index 57cd70062048..5146f0c81752 100644
> > --- a/fs/nfsd/vfs.h
> > +++ b/fs/nfsd/vfs.h
> > @@ -33,6 +33,8 @@
> >  
> >  #define NFSD_MAY_64BIT_COOKIE		0x1000 /* 64 bit readdir cookies for >= NFSv3 */
> >  
> > +#define NFSD_MAY_LOCALIO		0x2000
> > +
> >  #define NFSD_MAY_CREATE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE)
> >  #define NFSD_MAY_REMOVE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
> >  
> > @@ -158,6 +160,13 @@ __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
> >  
> >  void		nfsd_filp_close(struct file *fp);
> >  
> > +int		nfsd_open_local_fh(struct net *net,
> > +				   struct rpc_clnt *rpc_clnt,
> > +				   const struct cred *cred,
> > +				   const struct nfs_fh *nfs_fh,
> > +				   const fmode_t fmode,
> > +				   struct file **pfilp);
> > +
> >  static inline int fh_want_write(struct svc_fh *fh)
> >  {
> >  	int ret;
> > -- 
> > 2.44.0
> > 
> > 
>
Jeff Layton June 30, 2024, 7:52 p.m. UTC | #5
On Sun, 2024-06-30 at 15:44 -0400, Mike Snitzer wrote:
> On Sun, Jun 30, 2024 at 10:49:51AM -0400, Chuck Lever wrote:
> > On Sat, Jun 29, 2024 at 06:18:42PM -0400, Chuck Lever wrote:
> > > > +
> > > > +	/* nfs_fh -> svc_fh */
> > > > +	if (nfs_fh->size > NFS4_FHSIZE) {
> > > > +		status = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +	fh_init(&fh, NFS4_FHSIZE);
> > > > +	fh.fh_handle.fh_size = nfs_fh->size;
> > > > +	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
> > > > +
> > > > +	if (fmode & FMODE_READ)
> > > > +		mayflags |= NFSD_MAY_READ;
> > > > +	if (fmode & FMODE_WRITE)
> > > > +		mayflags |= NFSD_MAY_WRITE;
> > > > +
> > > > +	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > > +	if (beres) {
> > > > +		status = nfs_stat_to_errno(be32_to_cpu(beres));
> > > > +		goto out_fh_put;
> > > > +	}
> > > 
> > > So I'm wondering whether just calling fh_verify() and then
> > > nfsd_open_verified() would be simpler and/or good enough here. Is
> > > there a strong reason to use the file cache for locally opened
> > > files? Jeff, any thoughts?
> > 
> > > Will there be writeback ramifications for
> > > doing this? Maybe we need a comment here explaining how these files
> > > are garbage collected (just an fput by the local I/O client, I would
> > > guess).
> > 
> > OK, going back to this: Since right here we immediately call 
> > 
> > 	nfsd_file_put(nf);
> > 
> > There are no writeback ramifications nor any need to comment about
> > garbage collection. But this still seems like a lot of (possibly
> > unnecessary) overhead for simply obtaining a struct file.
> 
> Easy enough change, probably best to avoid the filecache but would like
> to verify with Jeff before switching:
> 
> diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> index 1d6508aa931e..85ebf63789fb 100644
> --- a/fs/nfsd/localio.c
> +++ b/fs/nfsd/localio.c
> @@ -197,7 +197,6 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
>         const struct cred *save_cred;
>         struct svc_rqst *rqstp;
>         struct svc_fh fh;
> -       struct nfsd_file *nf;
>         __be32 beres;
> 
>         if (nfs_fh->size > NFS4_FHSIZE)
> @@ -235,13 +234,12 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
>         if (fmode & FMODE_WRITE)
>                 mayflags |= NFSD_MAY_WRITE;
> 
> -       beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> +       beres = fh_verify(rqstp, &fh, S_IFREG, mayflags);
>         if (beres) {
>                 status = nfs_stat_to_errno(be32_to_cpu(beres));
>                 goto out_fh_put;
>         }
> -       *pfilp = get_file(nf->nf_file);
> -       nfsd_file_put(nf);
> +       status = nfsd_open_verified(rqstp, &fh, mayflags, pfilp);
>  out_fh_put:
>         fh_put(&fh);
>         nfsd_local_fakerqst_destroy(rqstp);
> 

My suggestion would be to _not_ do this. I think you do want to use the
filecache (mostly for performance reasons).
Chuck Lever June 30, 2024, 7:55 p.m. UTC | #6
On Sun, Jun 30, 2024 at 03:52:51PM -0400, Jeff Layton wrote:
> On Sun, 2024-06-30 at 15:44 -0400, Mike Snitzer wrote:
> > On Sun, Jun 30, 2024 at 10:49:51AM -0400, Chuck Lever wrote:
> > > On Sat, Jun 29, 2024 at 06:18:42PM -0400, Chuck Lever wrote:
> > > > > +
> > > > > +	/* nfs_fh -> svc_fh */
> > > > > +	if (nfs_fh->size > NFS4_FHSIZE) {
> > > > > +		status = -EINVAL;
> > > > > +		goto out;
> > > > > +	}
> > > > > +	fh_init(&fh, NFS4_FHSIZE);
> > > > > +	fh.fh_handle.fh_size = nfs_fh->size;
> > > > > +	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
> > > > > +
> > > > > +	if (fmode & FMODE_READ)
> > > > > +		mayflags |= NFSD_MAY_READ;
> > > > > +	if (fmode & FMODE_WRITE)
> > > > > +		mayflags |= NFSD_MAY_WRITE;
> > > > > +
> > > > > +	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > > > +	if (beres) {
> > > > > +		status = nfs_stat_to_errno(be32_to_cpu(beres));
> > > > > +		goto out_fh_put;
> > > > > +	}
> > > > 
> > > > So I'm wondering whether just calling fh_verify() and then
> > > > nfsd_open_verified() would be simpler and/or good enough here. Is
> > > > there a strong reason to use the file cache for locally opened
> > > > files? Jeff, any thoughts?
> > > 
> > > > Will there be writeback ramifications for
> > > > doing this? Maybe we need a comment here explaining how these files
> > > > are garbage collected (just an fput by the local I/O client, I would
> > > > guess).
> > > 
> > > OK, going back to this: Since right here we immediately call 
> > > 
> > > 	nfsd_file_put(nf);
> > > 
> > > There are no writeback ramifications nor any need to comment about
> > > garbage collection. But this still seems like a lot of (possibly
> > > unnecessary) overhead for simply obtaining a struct file.
> > 
> > Easy enough change, probably best to avoid the filecache but would like
> > to verify with Jeff before switching:
> > 
> > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> > index 1d6508aa931e..85ebf63789fb 100644
> > --- a/fs/nfsd/localio.c
> > +++ b/fs/nfsd/localio.c
> > @@ -197,7 +197,6 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
> >         const struct cred *save_cred;
> >         struct svc_rqst *rqstp;
> >         struct svc_fh fh;
> > -       struct nfsd_file *nf;
> >         __be32 beres;
> > 
> >         if (nfs_fh->size > NFS4_FHSIZE)
> > @@ -235,13 +234,12 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
> >         if (fmode & FMODE_WRITE)
> >                 mayflags |= NFSD_MAY_WRITE;
> > 
> > -       beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > +       beres = fh_verify(rqstp, &fh, S_IFREG, mayflags);
> >         if (beres) {
> >                 status = nfs_stat_to_errno(be32_to_cpu(beres));
> >                 goto out_fh_put;
> >         }
> > -       *pfilp = get_file(nf->nf_file);
> > -       nfsd_file_put(nf);
> > +       status = nfsd_open_verified(rqstp, &fh, mayflags, pfilp);
> >  out_fh_put:
> >         fh_put(&fh);
> >         nfsd_local_fakerqst_destroy(rqstp);
> > 
> 
> My suggestion would be to _not_ do this. I think you do want to use the
> filecache (mostly for performance reasons).

But look carefully:

 -- we're not calling nfsd_file_acquire_gc() here

 -- we're immediately calling nfsd_file_put() on the returned nf

There's nothing left in the file cache when nfsd_open_local_fh()
returns. Each call here will do a full file open and a full close.
Jeff Layton June 30, 2024, 7:59 p.m. UTC | #7
On Sun, 2024-06-30 at 15:55 -0400, Chuck Lever wrote:
> On Sun, Jun 30, 2024 at 03:52:51PM -0400, Jeff Layton wrote:
> > On Sun, 2024-06-30 at 15:44 -0400, Mike Snitzer wrote:
> > > On Sun, Jun 30, 2024 at 10:49:51AM -0400, Chuck Lever wrote:
> > > > On Sat, Jun 29, 2024 at 06:18:42PM -0400, Chuck Lever wrote:
> > > > > > +
> > > > > > +	/* nfs_fh -> svc_fh */
> > > > > > +	if (nfs_fh->size > NFS4_FHSIZE) {
> > > > > > +		status = -EINVAL;
> > > > > > +		goto out;
> > > > > > +	}
> > > > > > +	fh_init(&fh, NFS4_FHSIZE);
> > > > > > +	fh.fh_handle.fh_size = nfs_fh->size;
> > > > > > +	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
> > > > > > +
> > > > > > +	if (fmode & FMODE_READ)
> > > > > > +		mayflags |= NFSD_MAY_READ;
> > > > > > +	if (fmode & FMODE_WRITE)
> > > > > > +		mayflags |= NFSD_MAY_WRITE;
> > > > > > +
> > > > > > +	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > > > > +	if (beres) {
> > > > > > +		status = nfs_stat_to_errno(be32_to_cpu(beres));
> > > > > > +		goto out_fh_put;
> > > > > > +	}
> > > > > 
> > > > > So I'm wondering whether just calling fh_verify() and then
> > > > > nfsd_open_verified() would be simpler and/or good enough here. Is
> > > > > there a strong reason to use the file cache for locally opened
> > > > > files? Jeff, any thoughts?
> > > > 
> > > > > Will there be writeback ramifications for
> > > > > doing this? Maybe we need a comment here explaining how these files
> > > > > are garbage collected (just an fput by the local I/O client, I would
> > > > > guess).
> > > > 
> > > > OK, going back to this: Since right here we immediately call 
> > > > 
> > > > 	nfsd_file_put(nf);
> > > > 
> > > > There are no writeback ramifications nor any need to comment about
> > > > garbage collection. But this still seems like a lot of (possibly
> > > > unnecessary) overhead for simply obtaining a struct file.
> > > 
> > > Easy enough change, probably best to avoid the filecache but would like
> > > to verify with Jeff before switching:
> > > 
> > > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> > > index 1d6508aa931e..85ebf63789fb 100644
> > > --- a/fs/nfsd/localio.c
> > > +++ b/fs/nfsd/localio.c
> > > @@ -197,7 +197,6 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
> > >         const struct cred *save_cred;
> > >         struct svc_rqst *rqstp;
> > >         struct svc_fh fh;
> > > -       struct nfsd_file *nf;
> > >         __be32 beres;
> > > 
> > >         if (nfs_fh->size > NFS4_FHSIZE)
> > > @@ -235,13 +234,12 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
> > >         if (fmode & FMODE_WRITE)
> > >                 mayflags |= NFSD_MAY_WRITE;
> > > 
> > > -       beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > +       beres = fh_verify(rqstp, &fh, S_IFREG, mayflags);
> > >         if (beres) {
> > >                 status = nfs_stat_to_errno(be32_to_cpu(beres));
> > >                 goto out_fh_put;
> > >         }
> > > -       *pfilp = get_file(nf->nf_file);
> > > -       nfsd_file_put(nf);
> > > +       status = nfsd_open_verified(rqstp, &fh, mayflags, pfilp);
> > >  out_fh_put:
> > >         fh_put(&fh);
> > >         nfsd_local_fakerqst_destroy(rqstp);
> > > 
> > 
> > My suggestion would be to _not_ do this. I think you do want to use the
> > filecache (mostly for performance reasons).
> 
> But look carefully:
> 
>  -- we're not calling nfsd_file_acquire_gc() here
> 
>  -- we're immediately calling nfsd_file_put() on the returned nf
> 
> There's nothing left in the file cache when nfsd_open_local_fh()
> returns. Each call here will do a full file open and a full close.
> 
> 

Good point. This should be calling nfsd_file_acquire_gc(), IMO.
Chuck Lever June 30, 2024, 8:15 p.m. UTC | #8
On Sun, Jun 30, 2024 at 03:59:58PM -0400, Jeff Layton wrote:
> On Sun, 2024-06-30 at 15:55 -0400, Chuck Lever wrote:
> > On Sun, Jun 30, 2024 at 03:52:51PM -0400, Jeff Layton wrote:
> > > On Sun, 2024-06-30 at 15:44 -0400, Mike Snitzer wrote:
> > > > On Sun, Jun 30, 2024 at 10:49:51AM -0400, Chuck Lever wrote:
> > > > > On Sat, Jun 29, 2024 at 06:18:42PM -0400, Chuck Lever wrote:
> > > > > > > +
> > > > > > > +	/* nfs_fh -> svc_fh */
> > > > > > > +	if (nfs_fh->size > NFS4_FHSIZE) {
> > > > > > > +		status = -EINVAL;
> > > > > > > +		goto out;
> > > > > > > +	}
> > > > > > > +	fh_init(&fh, NFS4_FHSIZE);
> > > > > > > +	fh.fh_handle.fh_size = nfs_fh->size;
> > > > > > > +	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
> > > > > > > +
> > > > > > > +	if (fmode & FMODE_READ)
> > > > > > > +		mayflags |= NFSD_MAY_READ;
> > > > > > > +	if (fmode & FMODE_WRITE)
> > > > > > > +		mayflags |= NFSD_MAY_WRITE;
> > > > > > > +
> > > > > > > +	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > > > > > +	if (beres) {
> > > > > > > +		status = nfs_stat_to_errno(be32_to_cpu(beres));
> > > > > > > +		goto out_fh_put;
> > > > > > > +	}
> > > > > > 
> > > > > > So I'm wondering whether just calling fh_verify() and then
> > > > > > nfsd_open_verified() would be simpler and/or good enough here. Is
> > > > > > there a strong reason to use the file cache for locally opened
> > > > > > files? Jeff, any thoughts?
> > > > > 
> > > > > > Will there be writeback ramifications for
> > > > > > doing this? Maybe we need a comment here explaining how these files
> > > > > > are garbage collected (just an fput by the local I/O client, I would
> > > > > > guess).
> > > > > 
> > > > > OK, going back to this: Since right here we immediately call 
> > > > > 
> > > > > 	nfsd_file_put(nf);
> > > > > 
> > > > > There are no writeback ramifications nor any need to comment about
> > > > > garbage collection. But this still seems like a lot of (possibly
> > > > > unnecessary) overhead for simply obtaining a struct file.
> > > > 
> > > > Easy enough change, probably best to avoid the filecache but would like
> > > > to verify with Jeff before switching:
> > > > 
> > > > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> > > > index 1d6508aa931e..85ebf63789fb 100644
> > > > --- a/fs/nfsd/localio.c
> > > > +++ b/fs/nfsd/localio.c
> > > > @@ -197,7 +197,6 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
> > > >         const struct cred *save_cred;
> > > >         struct svc_rqst *rqstp;
> > > >         struct svc_fh fh;
> > > > -       struct nfsd_file *nf;
> > > >         __be32 beres;
> > > > 
> > > >         if (nfs_fh->size > NFS4_FHSIZE)
> > > > @@ -235,13 +234,12 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
> > > >         if (fmode & FMODE_WRITE)
> > > >                 mayflags |= NFSD_MAY_WRITE;
> > > > 
> > > > -       beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > > +       beres = fh_verify(rqstp, &fh, S_IFREG, mayflags);
> > > >         if (beres) {
> > > >                 status = nfs_stat_to_errno(be32_to_cpu(beres));
> > > >                 goto out_fh_put;
> > > >         }
> > > > -       *pfilp = get_file(nf->nf_file);
> > > > -       nfsd_file_put(nf);
> > > > +       status = nfsd_open_verified(rqstp, &fh, mayflags, pfilp);
> > > >  out_fh_put:
> > > >         fh_put(&fh);
> > > >         nfsd_local_fakerqst_destroy(rqstp);
> > > > 
> > > 
> > > My suggestion would be to _not_ do this. I think you do want to use the
> > > filecache (mostly for performance reasons).
> > 
> > But look carefully:
> > 
> >  -- we're not calling nfsd_file_acquire_gc() here
> > 
> >  -- we're immediately calling nfsd_file_put() on the returned nf
> > 
> > There's nothing left in the file cache when nfsd_open_local_fh()
> > returns. Each call here will do a full file open and a full close.
> 
> Good point. This should be calling nfsd_file_acquire_gc(), IMO. 

So that goes to my point yesterday about writeback ramifications.

If these open files linger in the file cache, then when will they
get written back to storage and by whom? Is it going to be an nfsd
thread writing them back as part of garbage collection?

So, you're saying that the local I/O client will always behave like
NFSv3 in this regard, and open/read/close, open/write/close instead
of hanging on to the open file? That seems... suboptimal... and not
expected for a local file. That needs to be documented in the
LOCALIO design doc.

I'm also concerned about local applications closing a file but
having an open file handle linger in the file cache -- that can
prevent other accesses to the file until the GC ejects that open
file, as we've seen in the field.

IMHO nfsd_file_acquire_gc() is going to have some unwanted side
effects.
Jeff Layton June 30, 2024, 9:07 p.m. UTC | #9
On Sun, 2024-06-30 at 16:15 -0400, Chuck Lever wrote:
> On Sun, Jun 30, 2024 at 03:59:58PM -0400, Jeff Layton wrote:
> > On Sun, 2024-06-30 at 15:55 -0400, Chuck Lever wrote:
> > > On Sun, Jun 30, 2024 at 03:52:51PM -0400, Jeff Layton wrote:
> > > > On Sun, 2024-06-30 at 15:44 -0400, Mike Snitzer wrote:
> > > > > On Sun, Jun 30, 2024 at 10:49:51AM -0400, Chuck Lever wrote:
> > > > > > On Sat, Jun 29, 2024 at 06:18:42PM -0400, Chuck Lever wrote:
> > > > > > > > +
> > > > > > > > +	/* nfs_fh -> svc_fh */
> > > > > > > > +	if (nfs_fh->size > NFS4_FHSIZE) {
> > > > > > > > +		status = -EINVAL;
> > > > > > > > +		goto out;
> > > > > > > > +	}
> > > > > > > > +	fh_init(&fh, NFS4_FHSIZE);
> > > > > > > > +	fh.fh_handle.fh_size = nfs_fh->size;
> > > > > > > > +	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
> > > > > > > > +
> > > > > > > > +	if (fmode & FMODE_READ)
> > > > > > > > +		mayflags |= NFSD_MAY_READ;
> > > > > > > > +	if (fmode & FMODE_WRITE)
> > > > > > > > +		mayflags |= NFSD_MAY_WRITE;
> > > > > > > > +
> > > > > > > > +	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > > > > > > +	if (beres) {
> > > > > > > > +		status = nfs_stat_to_errno(be32_to_cpu(beres));
> > > > > > > > +		goto out_fh_put;
> > > > > > > > +	}
> > > > > > > 
> > > > > > > So I'm wondering whether just calling fh_verify() and then
> > > > > > > nfsd_open_verified() would be simpler and/or good enough here. Is
> > > > > > > there a strong reason to use the file cache for locally opened
> > > > > > > files? Jeff, any thoughts?
> > > > > > 
> > > > > > > Will there be writeback ramifications for
> > > > > > > doing this? Maybe we need a comment here explaining how these files
> > > > > > > are garbage collected (just an fput by the local I/O client, I would
> > > > > > > guess).
> > > > > > 
> > > > > > OK, going back to this: Since right here we immediately call 
> > > > > > 
> > > > > > 	nfsd_file_put(nf);
> > > > > > 
> > > > > > There are no writeback ramifications nor any need to comment about
> > > > > > garbage collection. But this still seems like a lot of (possibly
> > > > > > unnecessary) overhead for simply obtaining a struct file.
> > > > > 
> > > > > Easy enough change, probably best to avoid the filecache but would like
> > > > > to verify with Jeff before switching:
> > > > > 
> > > > > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> > > > > index 1d6508aa931e..85ebf63789fb 100644
> > > > > --- a/fs/nfsd/localio.c
> > > > > +++ b/fs/nfsd/localio.c
> > > > > @@ -197,7 +197,6 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
> > > > >         const struct cred *save_cred;
> > > > >         struct svc_rqst *rqstp;
> > > > >         struct svc_fh fh;
> > > > > -       struct nfsd_file *nf;
> > > > >         __be32 beres;
> > > > > 
> > > > >         if (nfs_fh->size > NFS4_FHSIZE)
> > > > > @@ -235,13 +234,12 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
> > > > >         if (fmode & FMODE_WRITE)
> > > > >                 mayflags |= NFSD_MAY_WRITE;
> > > > > 
> > > > > -       beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > > > +       beres = fh_verify(rqstp, &fh, S_IFREG, mayflags);
> > > > >         if (beres) {
> > > > >                 status = nfs_stat_to_errno(be32_to_cpu(beres));
> > > > >                 goto out_fh_put;
> > > > >         }
> > > > > -       *pfilp = get_file(nf->nf_file);
> > > > > -       nfsd_file_put(nf);
> > > > > +       status = nfsd_open_verified(rqstp, &fh, mayflags, pfilp);
> > > > >  out_fh_put:
> > > > >         fh_put(&fh);
> > > > >         nfsd_local_fakerqst_destroy(rqstp);
> > > > > 
> > > > 
> > > > My suggestion would be to _not_ do this. I think you do want to use the
> > > > filecache (mostly for performance reasons).
> > > 
> > > But look carefully:
> > > 
> > >  -- we're not calling nfsd_file_acquire_gc() here
> > > 
> > >  -- we're immediately calling nfsd_file_put() on the returned nf
> > > 
> > > There's nothing left in the file cache when nfsd_open_local_fh()
> > > returns. Each call here will do a full file open and a full close.
> > 
> > Good point. This should be calling nfsd_file_acquire_gc(), IMO. 
> 
> So that goes to my point yesterday about writeback ramifications.
> 
> If these open files linger in the file cache, then when will they
> get written back to storage and by whom? Is it going to be an nfsd
> thread writing them back as part of garbage collection?
> 

Usually the client is issuing regular COMMITs. If that doesn't happen,
then the flusher threads should get the rest.

Side note: I don't guess COMMIT goes over the localio path yet, does
it? Maybe it should. It would be nice to not tie up an nfsd thread with
writeback.

> So, you're saying that the local I/O client will always behave like
> NFSv3 in this regard, and open/read/close, open/write/close instead
> of hanging on to the open file? That seems... suboptimal... and not
> expected for a local file. That needs to be documented in the
> LOCALIO design doc.
> 

I imagine so, which is why I suggest using the filecache. If we get one
READ or WRITE for the file via localio, we're pretty likely to get
more. Why not amortize that file open over several operations?
 
> I'm also concerned about local applications closing a file but
> having an open file handle linger in the file cache -- that can
> prevent other accesses to the file until the GC ejects that open
> file, as we've seen in the field.
> 
> IMHO nfsd_file_acquire_gc() is going to have some unwanted side
> effects.
> 

Typically, the client issues COMMIT calls when the client-side fd is
closed (for CTO). While I think we do need to be able to deal with
flushing files with dirty data that are left "hanging", that shouldn't
be the common case. Most of the time, the client is going to be issuing
regular COMMITs so that it can clean its pages.

IOW, I don't see how localio is any different than the case of normal
v3 IO in this respect.
NeilBrown June 30, 2024, 9:54 p.m. UTC | #10
On Mon, 01 Jul 2024, Jeff Layton wrote:
> On Sun, 2024-06-30 at 15:55 -0400, Chuck Lever wrote:
> > On Sun, Jun 30, 2024 at 03:52:51PM -0400, Jeff Layton wrote:
> > > On Sun, 2024-06-30 at 15:44 -0400, Mike Snitzer wrote:
> > > > On Sun, Jun 30, 2024 at 10:49:51AM -0400, Chuck Lever wrote:
> > > > > On Sat, Jun 29, 2024 at 06:18:42PM -0400, Chuck Lever wrote:
> > > > > > > +
> > > > > > > +	/* nfs_fh -> svc_fh */
> > > > > > > +	if (nfs_fh->size > NFS4_FHSIZE) {
> > > > > > > +		status = -EINVAL;
> > > > > > > +		goto out;
> > > > > > > +	}
> > > > > > > +	fh_init(&fh, NFS4_FHSIZE);
> > > > > > > +	fh.fh_handle.fh_size = nfs_fh->size;
> > > > > > > +	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
> > > > > > > +
> > > > > > > +	if (fmode & FMODE_READ)
> > > > > > > +		mayflags |= NFSD_MAY_READ;
> > > > > > > +	if (fmode & FMODE_WRITE)
> > > > > > > +		mayflags |= NFSD_MAY_WRITE;
> > > > > > > +
> > > > > > > +	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > > > > > +	if (beres) {
> > > > > > > +		status = nfs_stat_to_errno(be32_to_cpu(beres));
> > > > > > > +		goto out_fh_put;
> > > > > > > +	}
> > > > > > 
> > > > > > So I'm wondering whether just calling fh_verify() and then
> > > > > > nfsd_open_verified() would be simpler and/or good enough here. Is
> > > > > > there a strong reason to use the file cache for locally opened
> > > > > > files? Jeff, any thoughts?
> > > > > 
> > > > > > Will there be writeback ramifications for
> > > > > > doing this? Maybe we need a comment here explaining how these files
> > > > > > are garbage collected (just an fput by the local I/O client, I would
> > > > > > guess).
> > > > > 
> > > > > OK, going back to this: Since right here we immediately call 
> > > > > 
> > > > > 	nfsd_file_put(nf);
> > > > > 
> > > > > There are no writeback ramifications nor any need to comment about
> > > > > garbage collection. But this still seems like a lot of (possibly
> > > > > unnecessary) overhead for simply obtaining a struct file.
> > > > 
> > > > Easy enough change, probably best to avoid the filecache but would like
> > > > to verify with Jeff before switching:
> > > > 
> > > > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> > > > index 1d6508aa931e..85ebf63789fb 100644
> > > > --- a/fs/nfsd/localio.c
> > > > +++ b/fs/nfsd/localio.c
> > > > @@ -197,7 +197,6 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
> > > >         const struct cred *save_cred;
> > > >         struct svc_rqst *rqstp;
> > > >         struct svc_fh fh;
> > > > -       struct nfsd_file *nf;
> > > >         __be32 beres;
> > > > 
> > > >         if (nfs_fh->size > NFS4_FHSIZE)
> > > > @@ -235,13 +234,12 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
> > > >         if (fmode & FMODE_WRITE)
> > > >                 mayflags |= NFSD_MAY_WRITE;
> > > > 
> > > > -       beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > > +       beres = fh_verify(rqstp, &fh, S_IFREG, mayflags);
> > > >         if (beres) {
> > > >                 status = nfs_stat_to_errno(be32_to_cpu(beres));
> > > >                 goto out_fh_put;
> > > >         }
> > > > -       *pfilp = get_file(nf->nf_file);
> > > > -       nfsd_file_put(nf);
> > > > +       status = nfsd_open_verified(rqstp, &fh, mayflags, pfilp);
> > > >  out_fh_put:
> > > >         fh_put(&fh);
> > > >         nfsd_local_fakerqst_destroy(rqstp);
> > > > 
> > > 
> > > My suggestion would be to _not_ do this. I think you do want to use the
> > > filecache (mostly for performance reasons).
> > 
> > But look carefully:
> > 
> >  -- we're not calling nfsd_file_acquire_gc() here
> > 
> >  -- we're immediately calling nfsd_file_put() on the returned nf
> > 
> > There's nothing left in the file cache when nfsd_open_local_fh()
> > returns. Each call here will do a full file open and a full close.
> > 
> > 
> 
> Good point. This should be calling nfsd_file_acquire_gc(), IMO. 

Or the client could do a v4 style acquire, and not call nfsd_file_put()
until it was done with the file.  I don't see a specific problem with
_gc, but avoiding the heuristic it implies seems best where possible.

NeilBrown
NeilBrown June 30, 2024, 9:56 p.m. UTC | #11
On Mon, 01 Jul 2024, Jeff Layton wrote:
> On Sun, 2024-06-30 at 16:15 -0400, Chuck Lever wrote:
> > On Sun, Jun 30, 2024 at 03:59:58PM -0400, Jeff Layton wrote:
> > > On Sun, 2024-06-30 at 15:55 -0400, Chuck Lever wrote:
> > > > On Sun, Jun 30, 2024 at 03:52:51PM -0400, Jeff Layton wrote:
> > > > > On Sun, 2024-06-30 at 15:44 -0400, Mike Snitzer wrote:
> > > > > > On Sun, Jun 30, 2024 at 10:49:51AM -0400, Chuck Lever wrote:
> > > > > > > On Sat, Jun 29, 2024 at 06:18:42PM -0400, Chuck Lever wrote:
> > > > > > > > > +
> > > > > > > > > +	/* nfs_fh -> svc_fh */
> > > > > > > > > +	if (nfs_fh->size > NFS4_FHSIZE) {
> > > > > > > > > +		status = -EINVAL;
> > > > > > > > > +		goto out;
> > > > > > > > > +	}
> > > > > > > > > +	fh_init(&fh, NFS4_FHSIZE);
> > > > > > > > > +	fh.fh_handle.fh_size = nfs_fh->size;
> > > > > > > > > +	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
> > > > > > > > > +
> > > > > > > > > +	if (fmode & FMODE_READ)
> > > > > > > > > +		mayflags |= NFSD_MAY_READ;
> > > > > > > > > +	if (fmode & FMODE_WRITE)
> > > > > > > > > +		mayflags |= NFSD_MAY_WRITE;
> > > > > > > > > +
> > > > > > > > > +	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > > > > > > > +	if (beres) {
> > > > > > > > > +		status = nfs_stat_to_errno(be32_to_cpu(beres));
> > > > > > > > > +		goto out_fh_put;
> > > > > > > > > +	}
> > > > > > > > 
> > > > > > > > So I'm wondering whether just calling fh_verify() and then
> > > > > > > > nfsd_open_verified() would be simpler and/or good enough here. Is
> > > > > > > > there a strong reason to use the file cache for locally opened
> > > > > > > > files? Jeff, any thoughts?
> > > > > > > 
> > > > > > > > Will there be writeback ramifications for
> > > > > > > > doing this? Maybe we need a comment here explaining how these files
> > > > > > > > are garbage collected (just an fput by the local I/O client, I would
> > > > > > > > guess).
> > > > > > > 
> > > > > > > OK, going back to this: Since right here we immediately call 
> > > > > > > 
> > > > > > > 	nfsd_file_put(nf);
> > > > > > > 
> > > > > > > There are no writeback ramifications nor any need to comment about
> > > > > > > garbage collection. But this still seems like a lot of (possibly
> > > > > > > unnecessary) overhead for simply obtaining a struct file.
> > > > > > 
> > > > > > Easy enough change, probably best to avoid the filecache but would like
> > > > > > to verify with Jeff before switching:
> > > > > > 
> > > > > > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> > > > > > index 1d6508aa931e..85ebf63789fb 100644
> > > > > > --- a/fs/nfsd/localio.c
> > > > > > +++ b/fs/nfsd/localio.c
> > > > > > @@ -197,7 +197,6 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
> > > > > >         const struct cred *save_cred;
> > > > > >         struct svc_rqst *rqstp;
> > > > > >         struct svc_fh fh;
> > > > > > -       struct nfsd_file *nf;
> > > > > >         __be32 beres;
> > > > > > 
> > > > > >         if (nfs_fh->size > NFS4_FHSIZE)
> > > > > > @@ -235,13 +234,12 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
> > > > > >         if (fmode & FMODE_WRITE)
> > > > > >                 mayflags |= NFSD_MAY_WRITE;
> > > > > > 
> > > > > > -       beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > > > > +       beres = fh_verify(rqstp, &fh, S_IFREG, mayflags);
> > > > > >         if (beres) {
> > > > > >                 status = nfs_stat_to_errno(be32_to_cpu(beres));
> > > > > >                 goto out_fh_put;
> > > > > >         }
> > > > > > -       *pfilp = get_file(nf->nf_file);
> > > > > > -       nfsd_file_put(nf);
> > > > > > +       status = nfsd_open_verified(rqstp, &fh, mayflags, pfilp);
> > > > > >  out_fh_put:
> > > > > >         fh_put(&fh);
> > > > > >         nfsd_local_fakerqst_destroy(rqstp);
> > > > > > 
> > > > > 
> > > > > My suggestion would be to _not_ do this. I think you do want to use the
> > > > > filecache (mostly for performance reasons).
> > > > 
> > > > But look carefully:
> > > > 
> > > >  -- we're not calling nfsd_file_acquire_gc() here
> > > > 
> > > >  -- we're immediately calling nfsd_file_put() on the returned nf
> > > > 
> > > > There's nothing left in the file cache when nfsd_open_local_fh()
> > > > returns. Each call here will do a full file open and a full close.
> > > 
> > > Good point. This should be calling nfsd_file_acquire_gc(), IMO. 
> > 
> > So that goes to my point yesterday about writeback ramifications.
> > 
> > If these open files linger in the file cache, then when will they
> > get written back to storage and by whom? Is it going to be an nfsd
> > thread writing them back as part of garbage collection?
> > 
> 
> Usually the client is issuing regular COMMITs. If that doesn't happen,
> then the flusher threads should get the rest.
> 
> Side note: I don't guess COMMIT goes over the localio path yet, does
> it? Maybe it should. It would be nice to not tie up an nfsd thread with
> writeback.

The documentation certainly claims that COMMIT uses the localio path.  I
haven't double checked the code but I'd be very surprised if it didn't.

NeilBrown


> 
> > So, you're saying that the local I/O client will always behave like
> > NFSv3 in this regard, and open/read/close, open/write/close instead
> > of hanging on to the open file? That seems... suboptimal... and not
> > expected for a local file. That needs to be documented in the
> > LOCALIO design doc.
> > 
> 
> I imagine so, which is why I suggest using the filecache. If we get one
> READ or WRITE for the file via localio, we're pretty likely to get
> more. Why not amortize that file open over several operations?
>  
> > I'm also concerned about local applications closing a file but
> > having an open file handle linger in the file cache -- that can
> > prevent other accesses to the file until the GC ejects that open
> > file, as we've seen in the field.
> > 
> > IMHO nfsd_file_acquire_gc() is going to have some unwanted side
> > effects.
> > 
> 
> Typically, the client issues COMMIT calls when the client-side fd is
> closed (for CTO). While I think we do need to be able to deal with
> flushing files with dirty data that are left "hanging", that shouldn't
> be the common case. Most of the time, the client is going to be issuing
> regular COMMITs so that it can clean its pages.
> 
> IOW, I don't see how localio is any different than the case of normal
> v3 IO in this respect.
> -- 
> Jeff Layton <jlayton@kernel.org>
>
NeilBrown June 30, 2024, 10:22 p.m. UTC | #12
On Sun, 30 Jun 2024, Chuck Lever wrote:
> Sorry, I guess I expected to have more time to learn about these
> patches before writing review comments. But if you want them to go
> in soon, I had better look more closely at them now.
> 
> 
> On Fri, Jun 28, 2024 at 05:10:59PM -0400, Mike Snitzer wrote:
> > Pass the stored cl_nfssvc_net from the client to the server as
> 
> This is the only mention of cl_nfssvc_net I can find in this
> patch. I'm not sure what it is. Patch description should maybe
> provide some context.
> 
> 
> > first argument to nfsd_open_local_fh() to ensure the proper network
> > namespace is used for localio.
> 
> Can the patch description say something about the distinct mount 
> namespaces -- if the local application is running in a different
> container than the NFS server, are we using only the network
> namespaces for authorizing the file access? And is that OK to do?
> If yes, patch description should explain that NFS local I/O ignores
> the boundaries of mount namespaces and why that is OK to do.
> 
> 
> > Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
> > Signed-off-by: Peng Tao <tao.peng@primarydata.com>
> > Signed-off-by: Lance Shelton <lance.shelton@hammerspace.com>
> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > Signed-off-by: Mike Snitzer <snitzer@kernel.org>
> > ---
> >  fs/nfsd/Makefile    |   1 +
> >  fs/nfsd/filecache.c |   2 +-
> >  fs/nfsd/localio.c   | 239 ++++++++++++++++++++++++++++++++++++++++++++
> >  fs/nfsd/nfssvc.c    |   1 +
> >  fs/nfsd/trace.h     |   3 +-
> >  fs/nfsd/vfs.h       |   9 ++
> >  6 files changed, 253 insertions(+), 2 deletions(-)
> >  create mode 100644 fs/nfsd/localio.c
> > 
> > diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
> > index b8736a82e57c..78b421778a79 100644
> > --- a/fs/nfsd/Makefile
> > +++ b/fs/nfsd/Makefile
> > @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
> >  nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
> >  nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
> >  nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
> > +nfsd-$(CONFIG_NFSD_LOCALIO) += localio.o
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index ad9083ca144b..99631fa56662 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -52,7 +52,7 @@
> >  #define NFSD_FILE_CACHE_UP		     (0)
> >  
> >  /* We only care about NFSD_MAY_READ/WRITE for this cache */
> > -#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE)
> > +#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO)
> >  
> >  static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
> >  static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
> > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> > new file mode 100644
> > index 000000000000..759a5cb79652
> > --- /dev/null
> > +++ b/fs/nfsd/localio.c
> > @@ -0,0 +1,239 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * NFS server support for local clients to bypass network stack
> > + *
> > + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com>
> > + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com>
> > + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
> > + */
> > +
> > +#include <linux/exportfs.h>
> > +#include <linux/sunrpc/svcauth_gss.h>
> > +#include <linux/sunrpc/clnt.h>
> > +#include <linux/nfs.h>
> > +#include <linux/string.h>
> > +
> > +#include "nfsd.h"
> > +#include "vfs.h"
> > +#include "netns.h"
> > +#include "filecache.h"
> > +
> > +#define NFSDDBG_FACILITY		NFSDDBG_FH
> 
> With no more dprintk() call sites in this patch, you no longer need
> this macro definition.
> 
> 
> > +/*
> > + * We need to translate between nfs status return values and
> > + * the local errno values which may not be the same.
> > + * - duplicated from fs/nfs/nfs2xdr.c to avoid needless bloat of
> > + *   all compiled nfs objects if it were in include/linux/nfs.h
> > + */
> > +static const struct {
> > +	int stat;
> > +	int errno;
> > +} nfs_common_errtbl[] = {
> > +	{ NFS_OK,		0		},
> > +	{ NFSERR_PERM,		-EPERM		},
> > +	{ NFSERR_NOENT,		-ENOENT		},
> > +	{ NFSERR_IO,		-EIO		},
> > +	{ NFSERR_NXIO,		-ENXIO		},
> > +/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */
> > +	{ NFSERR_ACCES,		-EACCES		},
> > +	{ NFSERR_EXIST,		-EEXIST		},
> > +	{ NFSERR_XDEV,		-EXDEV		},
> > +	{ NFSERR_NODEV,		-ENODEV		},
> > +	{ NFSERR_NOTDIR,	-ENOTDIR	},
> > +	{ NFSERR_ISDIR,		-EISDIR		},
> > +	{ NFSERR_INVAL,		-EINVAL		},
> > +	{ NFSERR_FBIG,		-EFBIG		},
> > +	{ NFSERR_NOSPC,		-ENOSPC		},
> > +	{ NFSERR_ROFS,		-EROFS		},
> > +	{ NFSERR_MLINK,		-EMLINK		},
> > +	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	},
> > +	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	},
> > +	{ NFSERR_DQUOT,		-EDQUOT		},
> > +	{ NFSERR_STALE,		-ESTALE		},
> > +	{ NFSERR_REMOTE,	-EREMOTE	},
> > +#ifdef EWFLUSH
> > +	{ NFSERR_WFLUSH,	-EWFLUSH	},
> > +#endif
> > +	{ NFSERR_BADHANDLE,	-EBADHANDLE	},
> > +	{ NFSERR_NOT_SYNC,	-ENOTSYNC	},
> > +	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
> > +	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
> > +	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
> > +	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
> > +	{ NFSERR_BADTYPE,	-EBADTYPE	},
> > +	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
> > +	{ -1,			-EIO		}
> > +};
> > +
> > +/**
> > + * nfs_stat_to_errno - convert an NFS status code to a local errno
> > + * @status: NFS status code to convert
> > + *
> > + * Returns a local errno value, or -EIO if the NFS status code is
> > + * not recognized.  nfsd_file_acquire() returns an nfsstat that
> > + * needs to be translated to an errno before being returned to a
> > + * local client application.
> > + */
> > +static int nfs_stat_to_errno(enum nfs_stat status)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; nfs_common_errtbl[i].stat != -1; i++) {
> > +		if (nfs_common_errtbl[i].stat == (int)status)
> > +			return nfs_common_errtbl[i].errno;
> > +	}
> > +	return nfs_common_errtbl[i].errno;
> > +}
> > +
> > +static void
> > +nfsd_local_fakerqst_destroy(struct svc_rqst *rqstp)
> > +{
> > +	if (rqstp->rq_client)
> > +		auth_domain_put(rqstp->rq_client);
> > +	if (rqstp->rq_cred.cr_group_info)
> > +		put_group_info(rqstp->rq_cred.cr_group_info);
> > +	/* rpcauth_map_to_svc_cred_local() clears cr_principal */
> > +	WARN_ON_ONCE(rqstp->rq_cred.cr_principal != NULL);
> > +	kfree(rqstp->rq_xprt);
> > +	kfree(rqstp);
> > +}
> > +
> > +static struct svc_rqst *
> > +nfsd_local_fakerqst_create(struct net *net, struct rpc_clnt *rpc_clnt,
> > +			const struct cred *cred)
> > +{
> > +	struct svc_rqst *rqstp;
> > +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +	int status;
> > +
> > +	/* FIXME: not running in nfsd context, must get reference on nfsd_serv */
> > +	if (unlikely(!READ_ONCE(nn->nfsd_serv)))
> > +		return ERR_PTR(-ENXIO);
> > +
> > +	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
> > +	if (!rqstp)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	rqstp->rq_xprt = kzalloc(sizeof(*rqstp->rq_xprt), GFP_KERNEL);
> > +	if (!rqstp->rq_xprt) {
> > +		status = -ENOMEM;
> > +		goto out_err;
> > +	}
> 
> struct svc_rqst is pretty big (like, bigger than a couple of pages).
> What happens if this allocation fails?
> 
> And how often does it occur -- does that add significant overhead?
> 
> 
> > +
> > +	rqstp->rq_xprt->xpt_net = net;
> > +	__set_bit(RQ_SECURE, &rqstp->rq_flags);
> > +	rqstp->rq_proc = 1;
> > +	rqstp->rq_vers = 3;
> 
> IMO these need to be symbolic constants, not integers. Or, at least
> there needs to be some documenting comments that explain these are
> fake and why that's OK to do. Or, are there better choices?
> 
> 
> > +	rqstp->rq_prot = IPPROTO_TCP;
> > +	rqstp->rq_server = nn->nfsd_serv;
> > +
> > +	/* Note: we're connecting to ourself, so source addr == peer addr */
> > +	rqstp->rq_addrlen = rpc_peeraddr(rpc_clnt,
> > +			(struct sockaddr *)&rqstp->rq_addr,
> > +			sizeof(rqstp->rq_addr));
> > +
> > +	rpcauth_map_to_svc_cred_local(rpc_clnt->cl_auth, cred, &rqstp->rq_cred);
> > +
> > +	/*
> > +	 * set up enough for svcauth_unix_set_client to be able to wait
> > +	 * for the cache downcall. Note that we do _not_ want to allow the
> > +	 * request to be deferred for later revisit since this rqst and xprt
> > +	 * are not set up to run inside of the normal svc_rqst engine.
> > +	 */
> > +	INIT_LIST_HEAD(&rqstp->rq_xprt->xpt_deferred);
> > +	kref_init(&rqstp->rq_xprt->xpt_ref);
> > +	spin_lock_init(&rqstp->rq_xprt->xpt_lock);
> > +	rqstp->rq_chandle.thread_wait = 5 * HZ;
> > +
> > +	status = svcauth_unix_set_client(rqstp);
> > +	switch (status) {
> > +	case SVC_OK:
> > +		break;
> > +	case SVC_DENIED:
> > +		status = -ENXIO;
> > +		goto out_err;
> > +	default:
> > +		status = -ETIMEDOUT;
> > +		goto out_err;
> > +	}
> 
> Interesting. Why would svcauth_unix_set_client fail for a local I/O
> request? Wouldn't it only be because the local application is trying
> to open a file it doesn't have permission to?
> 

I'm beginning to think this section of code is of the sort where you
need to be twice as clever when debugging as you were when writing.  It
is trying to get the client to use interfaces written for server-side
actions, and it isn't a good fit.

I think that instead we should modify fh_verify() so that it takes
explicit net, rq_vers, rq_cred, rq_client as well as the rqstp, and
the localio client passes in a NULL rqstp.

Getting the rq_client is an interesting challenge.
The above code (if I'm reading it correctly) gets the server-side
address of the IP connection, and passes that through to the sunrpc code
as though it is the client address.  So as long as the server is
exporting to itself, and as long as no address translation is happening
on the path, this works.  It feels messy though - and fragile.

I would rather we had some rq_client (struct auth_domain) that was
dedicated to localio.  The client should be able to access it based on
the fact that it could gather the server UUID using the LOCALIO RPC
protocol.

I'm not sure what exactly this would look like, but the 
'struct auth_domain *' should be something that can be accessed
directly, not looked up in a cache.

I can try to knock up a patch to allow fh_verify (and nfsd_file_acquire)
without an rqstp.  I won't try the auth_domain change until I hear what
others think.

NeilBrown
Chuck Lever June 30, 2024, 10:34 p.m. UTC | #13
On Mon, Jul 01, 2024 at 08:22:56AM +1000, NeilBrown wrote:
> On Sun, 30 Jun 2024, Chuck Lever wrote:
> > Sorry, I guess I expected to have more time to learn about these
> > patches before writing review comments. But if you want them to go
> > in soon, I had better look more closely at them now.
> > 
> > 
> > On Fri, Jun 28, 2024 at 05:10:59PM -0400, Mike Snitzer wrote:
> > > Pass the stored cl_nfssvc_net from the client to the server as
> > 
> > This is the only mention of cl_nfssvc_net I can find in this
> > patch. I'm not sure what it is. Patch description should maybe
> > provide some context.
> > 
> > 
> > > first argument to nfsd_open_local_fh() to ensure the proper network
> > > namespace is used for localio.
> > 
> > Can the patch description say something about the distinct mount 
> > namespaces -- if the local application is running in a different
> > container than the NFS server, are we using only the network
> > namespaces for authorizing the file access? And is that OK to do?
> > If yes, patch description should explain that NFS local I/O ignores
> > the boundaries of mount namespaces and why that is OK to do.
> > 
> > 
> > > Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
> > > Signed-off-by: Peng Tao <tao.peng@primarydata.com>
> > > Signed-off-by: Lance Shelton <lance.shelton@hammerspace.com>
> > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > Signed-off-by: Mike Snitzer <snitzer@kernel.org>
> > > ---
> > >  fs/nfsd/Makefile    |   1 +
> > >  fs/nfsd/filecache.c |   2 +-
> > >  fs/nfsd/localio.c   | 239 ++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/nfsd/nfssvc.c    |   1 +
> > >  fs/nfsd/trace.h     |   3 +-
> > >  fs/nfsd/vfs.h       |   9 ++
> > >  6 files changed, 253 insertions(+), 2 deletions(-)
> > >  create mode 100644 fs/nfsd/localio.c
> > > 
> > > diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
> > > index b8736a82e57c..78b421778a79 100644
> > > --- a/fs/nfsd/Makefile
> > > +++ b/fs/nfsd/Makefile
> > > @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
> > >  nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
> > >  nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
> > >  nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
> > > +nfsd-$(CONFIG_NFSD_LOCALIO) += localio.o
> > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > > index ad9083ca144b..99631fa56662 100644
> > > --- a/fs/nfsd/filecache.c
> > > +++ b/fs/nfsd/filecache.c
> > > @@ -52,7 +52,7 @@
> > >  #define NFSD_FILE_CACHE_UP		     (0)
> > >  
> > >  /* We only care about NFSD_MAY_READ/WRITE for this cache */
> > > -#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE)
> > > +#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO)
> > >  
> > >  static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
> > >  static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
> > > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> > > new file mode 100644
> > > index 000000000000..759a5cb79652
> > > --- /dev/null
> > > +++ b/fs/nfsd/localio.c
> > > @@ -0,0 +1,239 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/*
> > > + * NFS server support for local clients to bypass network stack
> > > + *
> > > + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com>
> > > + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com>
> > > + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
> > > + */
> > > +
> > > +#include <linux/exportfs.h>
> > > +#include <linux/sunrpc/svcauth_gss.h>
> > > +#include <linux/sunrpc/clnt.h>
> > > +#include <linux/nfs.h>
> > > +#include <linux/string.h>
> > > +
> > > +#include "nfsd.h"
> > > +#include "vfs.h"
> > > +#include "netns.h"
> > > +#include "filecache.h"
> > > +
> > > +#define NFSDDBG_FACILITY		NFSDDBG_FH
> > 
> > With no more dprintk() call sites in this patch, you no longer need
> > this macro definition.
> > 
> > 
> > > +/*
> > > + * We need to translate between nfs status return values and
> > > + * the local errno values which may not be the same.
> > > + * - duplicated from fs/nfs/nfs2xdr.c to avoid needless bloat of
> > > + *   all compiled nfs objects if it were in include/linux/nfs.h
> > > + */
> > > +static const struct {
> > > +	int stat;
> > > +	int errno;
> > > +} nfs_common_errtbl[] = {
> > > +	{ NFS_OK,		0		},
> > > +	{ NFSERR_PERM,		-EPERM		},
> > > +	{ NFSERR_NOENT,		-ENOENT		},
> > > +	{ NFSERR_IO,		-EIO		},
> > > +	{ NFSERR_NXIO,		-ENXIO		},
> > > +/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */
> > > +	{ NFSERR_ACCES,		-EACCES		},
> > > +	{ NFSERR_EXIST,		-EEXIST		},
> > > +	{ NFSERR_XDEV,		-EXDEV		},
> > > +	{ NFSERR_NODEV,		-ENODEV		},
> > > +	{ NFSERR_NOTDIR,	-ENOTDIR	},
> > > +	{ NFSERR_ISDIR,		-EISDIR		},
> > > +	{ NFSERR_INVAL,		-EINVAL		},
> > > +	{ NFSERR_FBIG,		-EFBIG		},
> > > +	{ NFSERR_NOSPC,		-ENOSPC		},
> > > +	{ NFSERR_ROFS,		-EROFS		},
> > > +	{ NFSERR_MLINK,		-EMLINK		},
> > > +	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	},
> > > +	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	},
> > > +	{ NFSERR_DQUOT,		-EDQUOT		},
> > > +	{ NFSERR_STALE,		-ESTALE		},
> > > +	{ NFSERR_REMOTE,	-EREMOTE	},
> > > +#ifdef EWFLUSH
> > > +	{ NFSERR_WFLUSH,	-EWFLUSH	},
> > > +#endif
> > > +	{ NFSERR_BADHANDLE,	-EBADHANDLE	},
> > > +	{ NFSERR_NOT_SYNC,	-ENOTSYNC	},
> > > +	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
> > > +	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
> > > +	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
> > > +	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
> > > +	{ NFSERR_BADTYPE,	-EBADTYPE	},
> > > +	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
> > > +	{ -1,			-EIO		}
> > > +};
> > > +
> > > +/**
> > > + * nfs_stat_to_errno - convert an NFS status code to a local errno
> > > + * @status: NFS status code to convert
> > > + *
> > > + * Returns a local errno value, or -EIO if the NFS status code is
> > > + * not recognized.  nfsd_file_acquire() returns an nfsstat that
> > > + * needs to be translated to an errno before being returned to a
> > > + * local client application.
> > > + */
> > > +static int nfs_stat_to_errno(enum nfs_stat status)
> > > +{
> > > +	int i;
> > > +
> > > +	for (i = 0; nfs_common_errtbl[i].stat != -1; i++) {
> > > +		if (nfs_common_errtbl[i].stat == (int)status)
> > > +			return nfs_common_errtbl[i].errno;
> > > +	}
> > > +	return nfs_common_errtbl[i].errno;
> > > +}
> > > +
> > > +static void
> > > +nfsd_local_fakerqst_destroy(struct svc_rqst *rqstp)
> > > +{
> > > +	if (rqstp->rq_client)
> > > +		auth_domain_put(rqstp->rq_client);
> > > +	if (rqstp->rq_cred.cr_group_info)
> > > +		put_group_info(rqstp->rq_cred.cr_group_info);
> > > +	/* rpcauth_map_to_svc_cred_local() clears cr_principal */
> > > +	WARN_ON_ONCE(rqstp->rq_cred.cr_principal != NULL);
> > > +	kfree(rqstp->rq_xprt);
> > > +	kfree(rqstp);
> > > +}
> > > +
> > > +static struct svc_rqst *
> > > +nfsd_local_fakerqst_create(struct net *net, struct rpc_clnt *rpc_clnt,
> > > +			const struct cred *cred)
> > > +{
> > > +	struct svc_rqst *rqstp;
> > > +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > > +	int status;
> > > +
> > > +	/* FIXME: not running in nfsd context, must get reference on nfsd_serv */
> > > +	if (unlikely(!READ_ONCE(nn->nfsd_serv)))
> > > +		return ERR_PTR(-ENXIO);
> > > +
> > > +	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
> > > +	if (!rqstp)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	rqstp->rq_xprt = kzalloc(sizeof(*rqstp->rq_xprt), GFP_KERNEL);
> > > +	if (!rqstp->rq_xprt) {
> > > +		status = -ENOMEM;
> > > +		goto out_err;
> > > +	}
> > 
> > struct svc_rqst is pretty big (like, bigger than a couple of pages).
> > What happens if this allocation fails?
> > 
> > And how often does it occur -- does that add significant overhead?
> > 
> > 
> > > +
> > > +	rqstp->rq_xprt->xpt_net = net;
> > > +	__set_bit(RQ_SECURE, &rqstp->rq_flags);
> > > +	rqstp->rq_proc = 1;
> > > +	rqstp->rq_vers = 3;
> > 
> > IMO these need to be symbolic constants, not integers. Or, at least
> > there needs to be some documenting comments that explain these are
> > fake and why that's OK to do. Or, are there better choices?
> > 
> > 
> > > +	rqstp->rq_prot = IPPROTO_TCP;
> > > +	rqstp->rq_server = nn->nfsd_serv;
> > > +
> > > +	/* Note: we're connecting to ourself, so source addr == peer addr */
> > > +	rqstp->rq_addrlen = rpc_peeraddr(rpc_clnt,
> > > +			(struct sockaddr *)&rqstp->rq_addr,
> > > +			sizeof(rqstp->rq_addr));
> > > +
> > > +	rpcauth_map_to_svc_cred_local(rpc_clnt->cl_auth, cred, &rqstp->rq_cred);
> > > +
> > > +	/*
> > > +	 * set up enough for svcauth_unix_set_client to be able to wait
> > > +	 * for the cache downcall. Note that we do _not_ want to allow the
> > > +	 * request to be deferred for later revisit since this rqst and xprt
> > > +	 * are not set up to run inside of the normal svc_rqst engine.
> > > +	 */
> > > +	INIT_LIST_HEAD(&rqstp->rq_xprt->xpt_deferred);
> > > +	kref_init(&rqstp->rq_xprt->xpt_ref);
> > > +	spin_lock_init(&rqstp->rq_xprt->xpt_lock);
> > > +	rqstp->rq_chandle.thread_wait = 5 * HZ;
> > > +
> > > +	status = svcauth_unix_set_client(rqstp);
> > > +	switch (status) {
> > > +	case SVC_OK:
> > > +		break;
> > > +	case SVC_DENIED:
> > > +		status = -ENXIO;
> > > +		goto out_err;
> > > +	default:
> > > +		status = -ETIMEDOUT;
> > > +		goto out_err;
> > > +	}
> > 
> > Interesting. Why would svcauth_unix_set_client fail for a local I/O
> > request? Wouldn't it only be because the local application is trying
> > to open a file it doesn't have permission to?
> > 
> 
> I'm beginning to think this section of code is of the sort where you
> need to be twice as clever when debugging as you were when writing.  It
> is trying to get the client to use interfaces written for server-side
> actions, and it isn't a good fit.
> 
> I think that instead we should modify fh_verify() so that it takes
> explicit net, rq_vers, rq_cred, rq_client as well as the rqstp, and
> the localio client passes in a NULL rqstp.

Nit: I'd rather provide a new fh_verify-like API -- changing the
synopsis of fh_verify() itself will result in a lot of code churn
for only a single call site.


> Getting the rq_client is an interesting challenge.
> The above code (if I'm reading it correctly) gets the server-side
> address of the IP connection, and passes that through to the sunrpc code
> as though it is the client address.  So as long as the server is
> exporting to itself, and as long as no address translation is happening
> on the path, this works.  It feels messy though - and fragile.
> 
> I would rather we had some rq_client (struct auth_domain) that was
> dedicated to localio.  The client should be able to access it based on
> the fact that it could gather the server UUID using the LOCALIO RPC
> protocol.
> 
> I'm not sure what exactly this would look like, but the 
> 'struct auth_domain *' should be something that can be accessed
> directly, not looked up in a cache.

I'd like to mitigate the possibility of having to wait for a
possible cache upcall, and reduce or remove the need for a phony
svc_rqst. It sounds like you are on that path.

Further, this needs to be clearly documented -- it's bypassing
(or perhaps augmenting) the export's usual IP address-based
authorization mechanism, so there are security considerations.


> I can try to knock up a patch to allow fh_verify (and nfsd_file_acquire)
> without an rqstp.  I won't try the auth_domain change until I hear what
> others think.
NeilBrown July 1, 2024, 1:29 a.m. UTC | #14
On Mon, 01 Jul 2024, NeilBrown wrote:
> On Mon, 01 Jul 2024, Jeff Layton wrote:
> > On Sun, 2024-06-30 at 15:55 -0400, Chuck Lever wrote:
> > > On Sun, Jun 30, 2024 at 03:52:51PM -0400, Jeff Layton wrote:
> > > > On Sun, 2024-06-30 at 15:44 -0400, Mike Snitzer wrote:
> > > > > On Sun, Jun 30, 2024 at 10:49:51AM -0400, Chuck Lever wrote:
> > > > > > On Sat, Jun 29, 2024 at 06:18:42PM -0400, Chuck Lever wrote:
> > > > > > > > +
> > > > > > > > +	/* nfs_fh -> svc_fh */
> > > > > > > > +	if (nfs_fh->size > NFS4_FHSIZE) {
> > > > > > > > +		status = -EINVAL;
> > > > > > > > +		goto out;
> > > > > > > > +	}
> > > > > > > > +	fh_init(&fh, NFS4_FHSIZE);
> > > > > > > > +	fh.fh_handle.fh_size = nfs_fh->size;
> > > > > > > > +	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
> > > > > > > > +
> > > > > > > > +	if (fmode & FMODE_READ)
> > > > > > > > +		mayflags |= NFSD_MAY_READ;
> > > > > > > > +	if (fmode & FMODE_WRITE)
> > > > > > > > +		mayflags |= NFSD_MAY_WRITE;
> > > > > > > > +
> > > > > > > > +	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > > > > > > +	if (beres) {
> > > > > > > > +		status = nfs_stat_to_errno(be32_to_cpu(beres));
> > > > > > > > +		goto out_fh_put;
> > > > > > > > +	}
> > > > > > > 
> > > > > > > So I'm wondering whether just calling fh_verify() and then
> > > > > > > nfsd_open_verified() would be simpler and/or good enough here. Is
> > > > > > > there a strong reason to use the file cache for locally opened
> > > > > > > files? Jeff, any thoughts?
> > > > > > 
> > > > > > > Will there be writeback ramifications for
> > > > > > > doing this? Maybe we need a comment here explaining how these files
> > > > > > > are garbage collected (just an fput by the local I/O client, I would
> > > > > > > guess).
> > > > > > 
> > > > > > OK, going back to this: Since right here we immediately call 
> > > > > > 
> > > > > > 	nfsd_file_put(nf);
> > > > > > 
> > > > > > There are no writeback ramifications nor any need to comment about
> > > > > > garbage collection. But this still seems like a lot of (possibly
> > > > > > unnecessary) overhead for simply obtaining a struct file.
> > > > > 
> > > > > Easy enough change, probably best to avoid the filecache but would like
> > > > > to verify with Jeff before switching:
> > > > > 
> > > > > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> > > > > index 1d6508aa931e..85ebf63789fb 100644
> > > > > --- a/fs/nfsd/localio.c
> > > > > +++ b/fs/nfsd/localio.c
> > > > > @@ -197,7 +197,6 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
> > > > >         const struct cred *save_cred;
> > > > >         struct svc_rqst *rqstp;
> > > > >         struct svc_fh fh;
> > > > > -       struct nfsd_file *nf;
> > > > >         __be32 beres;
> > > > > 
> > > > >         if (nfs_fh->size > NFS4_FHSIZE)
> > > > > @@ -235,13 +234,12 @@ int nfsd_open_local_fh(struct net *cl_nfssvc_net,
> > > > >         if (fmode & FMODE_WRITE)
> > > > >                 mayflags |= NFSD_MAY_WRITE;
> > > > > 
> > > > > -       beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > > > > +       beres = fh_verify(rqstp, &fh, S_IFREG, mayflags);
> > > > >         if (beres) {
> > > > >                 status = nfs_stat_to_errno(be32_to_cpu(beres));
> > > > >                 goto out_fh_put;
> > > > >         }
> > > > > -       *pfilp = get_file(nf->nf_file);
> > > > > -       nfsd_file_put(nf);
> > > > > +       status = nfsd_open_verified(rqstp, &fh, mayflags, pfilp);
> > > > >  out_fh_put:
> > > > >         fh_put(&fh);
> > > > >         nfsd_local_fakerqst_destroy(rqstp);
> > > > > 
> > > > 
> > > > My suggestion would be to _not_ do this. I think you do want to use the
> > > > filecache (mostly for performance reasons).
> > > 
> > > But look carefully:
> > > 
> > >  -- we're not calling nfsd_file_acquire_gc() here
> > > 
> > >  -- we're immediately calling nfsd_file_put() on the returned nf
> > > 
> > > There's nothing left in the file cache when nfsd_open_local_fh()
> > > returns. Each call here will do a full file open and a full close.
> > > 
> > > 
> > 
> > Good point. This should be calling nfsd_file_acquire_gc(), IMO. 
> 
> Or the client could do a v4 style acquire, and not call nfsd_file_put()
> until it was done with the file.  I don't see a specific problem with
> _gc, but avoiding the heuristic it implies seems best where possible.
> 

I'm now wondering if this matters at all.
For NFSv4 the client still calls OPEN and CLOSE over the wire, so the
file will be in the cache whenever it is open so the current code is
fine.

For NFSv3 the client will only do the lookup once on the first IO
request.  The struct file is stored in a client data structure and used
subsequently without any interaction with nfsd.
So if the client opens the same file multiple times we might get extra
lookups on the server, but I'm not at all sure that justifies any
complexity.

So my current inclination is to leave this code as is.

NeilBrown
diff mbox series

Patch

diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index b8736a82e57c..78b421778a79 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -23,3 +23,4 @@  nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
 nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
 nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
 nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
+nfsd-$(CONFIG_NFSD_LOCALIO) += localio.o
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ad9083ca144b..99631fa56662 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -52,7 +52,7 @@ 
 #define NFSD_FILE_CACHE_UP		     (0)
 
 /* We only care about NFSD_MAY_READ/WRITE for this cache */
-#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE)
+#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO)
 
 static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
 static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
new file mode 100644
index 000000000000..759a5cb79652
--- /dev/null
+++ b/fs/nfsd/localio.c
@@ -0,0 +1,239 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * NFS server support for local clients to bypass network stack
+ *
+ * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com>
+ * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com>
+ * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
+ */
+
+#include <linux/exportfs.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/string.h>
+
+#include "nfsd.h"
+#include "vfs.h"
+#include "netns.h"
+#include "filecache.h"
+
+#define NFSDDBG_FACILITY		NFSDDBG_FH
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ * - duplicated from fs/nfs/nfs2xdr.c to avoid needless bloat of
+ *   all compiled nfs objects if it were in include/linux/nfs.h
+ */
+static const struct {
+	int stat;
+	int errno;
+} nfs_common_errtbl[] = {
+	{ NFS_OK,		0		},
+	{ NFSERR_PERM,		-EPERM		},
+	{ NFSERR_NOENT,		-ENOENT		},
+	{ NFSERR_IO,		-EIO		},
+	{ NFSERR_NXIO,		-ENXIO		},
+/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */
+	{ NFSERR_ACCES,		-EACCES		},
+	{ NFSERR_EXIST,		-EEXIST		},
+	{ NFSERR_XDEV,		-EXDEV		},
+	{ NFSERR_NODEV,		-ENODEV		},
+	{ NFSERR_NOTDIR,	-ENOTDIR	},
+	{ NFSERR_ISDIR,		-EISDIR		},
+	{ NFSERR_INVAL,		-EINVAL		},
+	{ NFSERR_FBIG,		-EFBIG		},
+	{ NFSERR_NOSPC,		-ENOSPC		},
+	{ NFSERR_ROFS,		-EROFS		},
+	{ NFSERR_MLINK,		-EMLINK		},
+	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	},
+	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	},
+	{ NFSERR_DQUOT,		-EDQUOT		},
+	{ NFSERR_STALE,		-ESTALE		},
+	{ NFSERR_REMOTE,	-EREMOTE	},
+#ifdef EWFLUSH
+	{ NFSERR_WFLUSH,	-EWFLUSH	},
+#endif
+	{ NFSERR_BADHANDLE,	-EBADHANDLE	},
+	{ NFSERR_NOT_SYNC,	-ENOTSYNC	},
+	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
+	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
+	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
+	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
+	{ NFSERR_BADTYPE,	-EBADTYPE	},
+	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
+	{ -1,			-EIO		}
+};
+
+/**
+ * nfs_stat_to_errno - convert an NFS status code to a local errno
+ * @status: NFS status code to convert
+ *
+ * Returns a local errno value, or -EIO if the NFS status code is
+ * not recognized.  nfsd_file_acquire() returns an nfsstat that
+ * needs to be translated to an errno before being returned to a
+ * local client application.
+ */
+static int nfs_stat_to_errno(enum nfs_stat status)
+{
+	int i;
+
+	for (i = 0; nfs_common_errtbl[i].stat != -1; i++) {
+		if (nfs_common_errtbl[i].stat == (int)status)
+			return nfs_common_errtbl[i].errno;
+	}
+	return nfs_common_errtbl[i].errno;
+}
+
+static void
+nfsd_local_fakerqst_destroy(struct svc_rqst *rqstp)
+{
+	if (rqstp->rq_client)
+		auth_domain_put(rqstp->rq_client);
+	if (rqstp->rq_cred.cr_group_info)
+		put_group_info(rqstp->rq_cred.cr_group_info);
+	/* rpcauth_map_to_svc_cred_local() clears cr_principal */
+	WARN_ON_ONCE(rqstp->rq_cred.cr_principal != NULL);
+	kfree(rqstp->rq_xprt);
+	kfree(rqstp);
+}
+
+static struct svc_rqst *
+nfsd_local_fakerqst_create(struct net *net, struct rpc_clnt *rpc_clnt,
+			const struct cred *cred)
+{
+	struct svc_rqst *rqstp;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	int status;
+
+	/* FIXME: not running in nfsd context, must get reference on nfsd_serv */
+	if (unlikely(!READ_ONCE(nn->nfsd_serv)))
+		return ERR_PTR(-ENXIO);
+
+	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
+	if (!rqstp)
+		return ERR_PTR(-ENOMEM);
+
+	rqstp->rq_xprt = kzalloc(sizeof(*rqstp->rq_xprt), GFP_KERNEL);
+	if (!rqstp->rq_xprt) {
+		status = -ENOMEM;
+		goto out_err;
+	}
+
+	rqstp->rq_xprt->xpt_net = net;
+	__set_bit(RQ_SECURE, &rqstp->rq_flags);
+	rqstp->rq_proc = 1;
+	rqstp->rq_vers = 3;
+	rqstp->rq_prot = IPPROTO_TCP;
+	rqstp->rq_server = nn->nfsd_serv;
+
+	/* Note: we're connecting to ourselves, so source addr == peer addr */
+	rqstp->rq_addrlen = rpc_peeraddr(rpc_clnt,
+			(struct sockaddr *)&rqstp->rq_addr,
+			sizeof(rqstp->rq_addr));
+
+	rpcauth_map_to_svc_cred_local(rpc_clnt->cl_auth, cred, &rqstp->rq_cred);
+
+	/*
+	 * set up enough for svcauth_unix_set_client to be able to wait
+	 * for the cache downcall. Note that we do _not_ want to allow the
+	 * request to be deferred for later revisit since this rqst and xprt
+	 * are not set up to run inside of the normal svc_rqst engine.
+	 */
+	INIT_LIST_HEAD(&rqstp->rq_xprt->xpt_deferred);
+	kref_init(&rqstp->rq_xprt->xpt_ref);
+	spin_lock_init(&rqstp->rq_xprt->xpt_lock);
+	rqstp->rq_chandle.thread_wait = 5 * HZ;
+
+	status = svcauth_unix_set_client(rqstp);
+	switch (status) {
+	case SVC_OK:
+		break;
+	case SVC_DENIED:
+		status = -ENXIO;
+		goto out_err;
+	default:
+		status = -ETIMEDOUT;
+		goto out_err;
+	}
+
+	return rqstp;
+
+out_err:
+	nfsd_local_fakerqst_destroy(rqstp);
+	return ERR_PTR(status);
+}
+
+/*
+ * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to @pfilp
+ *
+ * This function maps a local fh to an open struct file on a local filesystem.
+ * This is useful when the nfs client has the local server mounted - it can
+ * avoid all the NFS overhead with reads, writes and commits.
+ *
+ * On successful return, the caller is responsible for calling fput() on the
+ * returned file.  Also note that this is called from nfs.ko via find_symbol()
+ * to avoid an explicit dependency on knfsd, so there is no forward declaration
+ * in a header file for it.
+ */
+int nfsd_open_local_fh(struct net *net,
+			 struct rpc_clnt *rpc_clnt,
+			 const struct cred *cred,
+			 const struct nfs_fh *nfs_fh,
+			 const fmode_t fmode,
+			 struct file **pfilp)
+{
+	const struct cred *save_cred;
+	struct svc_rqst *rqstp;
+	struct svc_fh fh;
+	struct nfsd_file *nf;
+	int status = 0;
+	int mayflags = NFSD_MAY_LOCALIO;
+	__be32 beres;
+
+	/* Save creds before calling into nfsd */
+	save_cred = get_current_cred();
+
+	rqstp = nfsd_local_fakerqst_create(net, rpc_clnt, cred);
+	if (IS_ERR(rqstp)) {
+		status = PTR_ERR(rqstp);
+		goto out_revertcred;
+	}
+
+	/* nfs_fh -> svc_fh */
+	if (nfs_fh->size > NFS4_FHSIZE) {
+		status = -EINVAL;
+		goto out;
+	}
+	fh_init(&fh, NFS4_FHSIZE);
+	fh.fh_handle.fh_size = nfs_fh->size;
+	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
+
+	if (fmode & FMODE_READ)
+		mayflags |= NFSD_MAY_READ;
+	if (fmode & FMODE_WRITE)
+		mayflags |= NFSD_MAY_WRITE;
+
+	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
+	if (beres) {
+		status = nfs_stat_to_errno(be32_to_cpu(beres));
+		goto out_fh_put;
+	}
+
+	*pfilp = get_file(nf->nf_file);
+
+	nfsd_file_put(nf);
+out_fh_put:
+	fh_put(&fh);
+
+out:
+	nfsd_local_fakerqst_destroy(rqstp);
+out_revertcred:
+	revert_creds(save_cred);
+	return status;
+}
+EXPORT_SYMBOL_GPL(nfsd_open_local_fh);
+
+/* Compile time type checking, not used by anything */
+static nfs_to_nfsd_open_t __maybe_unused nfsd_open_local_fh_typecheck = nfsd_open_local_fh;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 1222a0a33fe1..a477d2c5088a 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -431,6 +431,7 @@  static int nfsd_startup_net(struct net *net, const struct cred *cred)
 #endif
 #if IS_ENABLED(CONFIG_NFSD_LOCALIO)
 	INIT_LIST_HEAD(&nn->nfsd_uuid.list);
+	nn->nfsd_uuid.net = net;
 	list_add_tail_rcu(&nn->nfsd_uuid.list, &nfsd_uuids);
 #endif
 	nn->nfsd_net_up = true;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 77bbd23aa150..9c0610fdd11c 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -86,7 +86,8 @@  DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
 		{ NFSD_MAY_NOT_BREAK_LEASE,	"NOT_BREAK_LEASE" },	\
 		{ NFSD_MAY_BYPASS_GSS,		"BYPASS_GSS" },		\
 		{ NFSD_MAY_READ_IF_EXEC,	"READ_IF_EXEC" },	\
-		{ NFSD_MAY_64BIT_COOKIE,	"64BIT_COOKIE" })
+		{ NFSD_MAY_64BIT_COOKIE,	"64BIT_COOKIE" },	\
+		{ NFSD_MAY_LOCALIO,		"LOCALIO" })
 
 TRACE_EVENT(nfsd_compound,
 	TP_PROTO(
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 57cd70062048..5146f0c81752 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -33,6 +33,8 @@ 
 
 #define NFSD_MAY_64BIT_COOKIE		0x1000 /* 64 bit readdir cookies for >= NFSv3 */
 
+#define NFSD_MAY_LOCALIO		0x2000
+
 #define NFSD_MAY_CREATE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE)
 #define NFSD_MAY_REMOVE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
 
@@ -158,6 +160,13 @@  __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
 
 void		nfsd_filp_close(struct file *fp);
 
+int		nfsd_open_local_fh(struct net *net,
+				   struct rpc_clnt *rpc_clnt,
+				   const struct cred *cred,
+				   const struct nfs_fh *nfs_fh,
+				   const fmode_t fmode,
+				   struct file **pfilp);
+
 static inline int fh_want_write(struct svc_fh *fh)
 {
 	int ret;