diff mbox series

[v8,07/18] nfsd: add "localio" support

Message ID 20240626182438.69539-8-snitzer@kernel.org (mailing list archive)
State New
Headers show
Series nfs/nfsd: add support for localio | expand

Commit Message

Mike Snitzer June 26, 2024, 6:24 p.m. UTC
Pass the stored cl_nfssvc_net from the client to the server as
first argument to nfsd_open_local_fh() to ensure the proper network
namespace is used for localio.

Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Lance Shelton <lance.shelton@hammerspace.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 fs/nfsd/Makefile    |   1 +
 fs/nfsd/filecache.c |   2 +-
 fs/nfsd/localio.c   | 246 ++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfssvc.c    |   1 +
 fs/nfsd/trace.h     |   3 +-
 fs/nfsd/vfs.h       |   9 ++
 6 files changed, 260 insertions(+), 2 deletions(-)
 create mode 100644 fs/nfsd/localio.c

Comments

Jeff Layton June 27, 2024, 3:48 p.m. UTC | #1
On Wed, 2024-06-26 at 14:24 -0400, Mike Snitzer wrote:
> Pass the stored cl_nfssvc_net from the client to the server as
> first argument to nfsd_open_local_fh() to ensure the proper network
> namespace is used for localio.
> 
> Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
> Signed-off-by: Peng Tao <tao.peng@primarydata.com>
> Signed-off-by: Lance Shelton <lance.shelton@hammerspace.com>
> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> Signed-off-by: Mike Snitzer <snitzer@kernel.org>
> ---
>  fs/nfsd/Makefile    |   1 +
>  fs/nfsd/filecache.c |   2 +-
>  fs/nfsd/localio.c   | 246 ++++++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfssvc.c    |   1 +
>  fs/nfsd/trace.h     |   3 +-
>  fs/nfsd/vfs.h       |   9 ++
>  6 files changed, 260 insertions(+), 2 deletions(-)
>  create mode 100644 fs/nfsd/localio.c
> 
> diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
> index b8736a82e57c..78b421778a79 100644
> --- a/fs/nfsd/Makefile
> +++ b/fs/nfsd/Makefile
> @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
>  nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
>  nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
>  nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
> +nfsd-$(CONFIG_NFSD_LOCALIO) += localio.o
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index ad9083ca144b..99631fa56662 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -52,7 +52,7 @@
>  #define NFSD_FILE_CACHE_UP		     (0)
>  
>  /* We only care about NFSD_MAY_READ/WRITE for this cache */
> -#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE)
> +#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO)
>  
>  static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
>  static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
> diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> new file mode 100644
> index 000000000000..ba9187735947
> --- /dev/null
> +++ b/fs/nfsd/localio.c
> @@ -0,0 +1,246 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * NFS server support for local clients to bypass network stack
> + *
> + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com>
> + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com>
> + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
> + */
> +
> +#include <linux/exportfs.h>
> +#include <linux/sunrpc/svcauth_gss.h>
> +#include <linux/sunrpc/clnt.h>
> +#include <linux/nfs.h>
> +#include <linux/string.h>
> +
> +#include "nfsd.h"
> +#include "vfs.h"
> +#include "netns.h"
> +#include "filecache.h"
> +
> +#define NFSDDBG_FACILITY		NFSDDBG_FH
> +
> +/*
> + * We need to translate between nfs status return values and
> + * the local errno values which may not be the same.
> + * - duplicated from fs/nfs/nfs2xdr.c to avoid needless bloat of
> + *   all compiled nfs objects if it were in include/linux/nfs.h
> + */
> +static const struct {
> +	int stat;
> +	int errno;
> +} nfs_common_errtbl[] = {
> +	{ NFS_OK,		0		},
> +	{ NFSERR_PERM,		-EPERM		},
> +	{ NFSERR_NOENT,		-ENOENT		},
> +	{ NFSERR_IO,		-EIO		},
> +	{ NFSERR_NXIO,		-ENXIO		},
> +/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */
> +	{ NFSERR_ACCES,		-EACCES		},
> +	{ NFSERR_EXIST,		-EEXIST		},
> +	{ NFSERR_XDEV,		-EXDEV		},
> +	{ NFSERR_NODEV,		-ENODEV		},
> +	{ NFSERR_NOTDIR,	-ENOTDIR	},
> +	{ NFSERR_ISDIR,		-EISDIR		},
> +	{ NFSERR_INVAL,		-EINVAL		},
> +	{ NFSERR_FBIG,		-EFBIG		},
> +	{ NFSERR_NOSPC,		-ENOSPC		},
> +	{ NFSERR_ROFS,		-EROFS		},
> +	{ NFSERR_MLINK,		-EMLINK		},
> +	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	},
> +	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	},
> +	{ NFSERR_DQUOT,		-EDQUOT		},
> +	{ NFSERR_STALE,		-ESTALE		},
> +	{ NFSERR_REMOTE,	-EREMOTE	},
> +#ifdef EWFLUSH
> +	{ NFSERR_WFLUSH,	-EWFLUSH	},
> +#endif
> +	{ NFSERR_BADHANDLE,	-EBADHANDLE	},
> +	{ NFSERR_NOT_SYNC,	-ENOTSYNC	},
> +	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
> +	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
> +	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
> +	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
> +	{ NFSERR_BADTYPE,	-EBADTYPE	},
> +	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
> +	{ -1,			-EIO		}
> +};
> +
> +/**
> + * nfs_stat_to_errno - convert an NFS status code to a local errno
> + * @status: NFS status code to convert
> + *
> + * Returns a local errno value, or -EIO if the NFS status code is
> + * not recognized.  nfsd_file_acquire() returns an nfsstat that
> + * needs to be translated to an errno before being returned to a
> + * local client application.
> + */
> +static int nfs_stat_to_errno(enum nfs_stat status)
> +{
> +	int i;
> +
> +	for (i = 0; nfs_common_errtbl[i].stat != -1; i++) {
> +		if (nfs_common_errtbl[i].stat == (int)status)
> +			return nfs_common_errtbl[i].errno;
> +	}
> +	return nfs_common_errtbl[i].errno;
> +}
> +
> +static void
> +nfsd_local_fakerqst_destroy(struct svc_rqst *rqstp)
> +{
> +	if (rqstp->rq_client)
> +		auth_domain_put(rqstp->rq_client);
> +	if (rqstp->rq_cred.cr_group_info)
> +		put_group_info(rqstp->rq_cred.cr_group_info);
> +	/* rpcauth_map_to_svc_cred_local() clears cr_principal */
> +	WARN_ON_ONCE(rqstp->rq_cred.cr_principal != NULL);
> +	kfree(rqstp->rq_xprt);
> +	kfree(rqstp);
> +}
> +
> +static struct svc_rqst *
> +nfsd_local_fakerqst_create(struct net *net, struct rpc_clnt *rpc_clnt,
> +			const struct cred *cred)
> +{
> +	struct svc_rqst *rqstp;
> +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +	int status;
> +
> +	/* FIXME: not running in nfsd context, must get reference on nfsd_serv */
> +	if (unlikely(!READ_ONCE(nn->nfsd_serv))) {
> +		dprintk("%s: localio denied. Server not running\n", __func__);

Chuck mentioned this earlier, but I don't think we ought to merge the
dprintks. If they're useful for debugging then they should be turned
into tracepoints. This one, I'd probably just drop.

> +		return ERR_PTR(-ENXIO);
> +	}
> +
> +	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
> +	if (!rqstp)
> +		return ERR_PTR(-ENOMEM);
> +
> +	rqstp->rq_xprt = kzalloc(sizeof(*rqstp->rq_xprt), GFP_KERNEL);
> +	if (!rqstp->rq_xprt) {
> +		status = -ENOMEM;
> +		goto out_err;
> +	}
> +
> +	rqstp->rq_xprt->xpt_net = net;
> +	__set_bit(RQ_SECURE, &rqstp->rq_flags);
> +	rqstp->rq_proc = 1;
> +	rqstp->rq_vers = 3;
> +	rqstp->rq_prot = IPPROTO_TCP;
> +	rqstp->rq_server = nn->nfsd_serv;
> +
> +	/* Note: we're connecting to ourself, so source addr == peer addr */
> +	rqstp->rq_addrlen = rpc_peeraddr(rpc_clnt,
> +			(struct sockaddr *)&rqstp->rq_addr,
> +			sizeof(rqstp->rq_addr));
> +
> +	rpcauth_map_to_svc_cred_local(rpc_clnt->cl_auth, cred, &rqstp->rq_cred);
> +
> +	/*
> +	 * set up enough for svcauth_unix_set_client to be able to wait
> +	 * for the cache downcall. Note that we do _not_ want to allow the
> +	 * request to be deferred for later revisit since this rqst and xprt
> +	 * are not set up to run inside of the normal svc_rqst engine.
> +	 */
> +	INIT_LIST_HEAD(&rqstp->rq_xprt->xpt_deferred);
> +	kref_init(&rqstp->rq_xprt->xpt_ref);
> +	spin_lock_init(&rqstp->rq_xprt->xpt_lock);
> +	rqstp->rq_chandle.thread_wait = 5 * HZ;
> +
> +	status = svcauth_unix_set_client(rqstp);
> +	switch (status) {
> +	case SVC_OK:
> +		break;
> +	case SVC_DENIED:
> +		status = -ENXIO;
> +		dprintk("%s: client %pISpc denied localio access\n",
> +				__func__, (struct sockaddr *)&rqstp->rq_addr);
> +		goto out_err;
> +	default:
> +		status = -ETIMEDOUT;
> +		dprintk("%s: client %pISpc temporarily denied localio access\n",
> +				__func__, (struct sockaddr *)&rqstp->rq_addr);
> +		goto out_err;
> +	}
> +
> +	return rqstp;
> +
> +out_err:

The two above can probably be turned into a single tracepoint here,
though it might just be best to have a single tracepoint that always
fires when this function exits.

> +	nfsd_local_fakerqst_destroy(rqstp);
> +	return ERR_PTR(status);
> +}
> +
> +/*
> + * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to @file
> + *
> + * This function maps a local fh to a path on a local filesystem.
> + * This is useful when the nfs client has the local server mounted - it can
> + * avoid all the NFS overhead with reads, writes and commits.
> + *
> + * on successful return, caller is responsible for calling path_put. Also
> + * note that this is called from nfs.ko via find_symbol() to avoid an explicit
> + * dependency on knfsd. So, there is no forward declaration in a header file
> + * for it.
> + */
> +int nfsd_open_local_fh(struct net *net,
> +			 struct rpc_clnt *rpc_clnt,
> +			 const struct cred *cred,
> +			 const struct nfs_fh *nfs_fh,
> +			 const fmode_t fmode,
> +			 struct file **pfilp)
> +{
> +	const struct cred *save_cred;
> +	struct svc_rqst *rqstp;
> +	struct svc_fh fh;
> +	struct nfsd_file *nf;
> +	int status = 0;
> +	int mayflags = NFSD_MAY_LOCALIO;
> +	__be32 beres;
> +
> +	/* Save creds before calling into nfsd */
> +	save_cred = get_current_cred();
> +
> +	rqstp = nfsd_local_fakerqst_create(net, rpc_clnt, cred);
> +	if (IS_ERR(rqstp)) {
> +		status = PTR_ERR(rqstp);
> +		goto out_revertcred;
> +	}
> +
> +	/* nfs_fh -> svc_fh */
> +	if (nfs_fh->size > NFS4_FHSIZE) {
> +		status = -EINVAL;
> +		goto out;
> +	}
> +	fh_init(&fh, NFS4_FHSIZE);
> +	fh.fh_handle.fh_size = nfs_fh->size;
> +	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
> +
> +	if (fmode & FMODE_READ)
> +		mayflags |= NFSD_MAY_READ;
> +	if (fmode & FMODE_WRITE)
> +		mayflags |= NFSD_MAY_WRITE;
> +
> +	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> +	if (beres) {
> +		status = nfs_stat_to_errno(be32_to_cpu(beres));
> +		dprintk("%s: fh_verify failed %d\n", __func__, status);

This should also be a tracepoint.

> +		goto out_fh_put;
> +	}
> +
> +	*pfilp = get_file(nf->nf_file);
> +
> +	nfsd_file_put(nf);
> +out_fh_put:
> +	fh_put(&fh);
> +
> +out:
> +	nfsd_local_fakerqst_destroy(rqstp);
> +out_revertcred:
> +	revert_creds(save_cred);
> +	return status;
> +}
> +EXPORT_SYMBOL_GPL(nfsd_open_local_fh);
> +
> +/* Compile time type checking, not used by anything */
> +static nfs_to_nfsd_open_t __maybe_unused nfsd_open_local_fh_typecheck = nfsd_open_local_fh;
> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> index 1222a0a33fe1..a477d2c5088a 100644
> --- a/fs/nfsd/nfssvc.c
> +++ b/fs/nfsd/nfssvc.c
> @@ -431,6 +431,7 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred)
>  #endif
>  #if IS_ENABLED(CONFIG_NFSD_LOCALIO)
>  	INIT_LIST_HEAD(&nn->nfsd_uuid.list);
> +	nn->nfsd_uuid.net = net;
>  	list_add_tail_rcu(&nn->nfsd_uuid.list, &nfsd_uuids);
>  #endif
>  	nn->nfsd_net_up = true;
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index 77bbd23aa150..9c0610fdd11c 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -86,7 +86,8 @@ DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
>  		{ NFSD_MAY_NOT_BREAK_LEASE,	"NOT_BREAK_LEASE" },	\
>  		{ NFSD_MAY_BYPASS_GSS,		"BYPASS_GSS" },		\
>  		{ NFSD_MAY_READ_IF_EXEC,	"READ_IF_EXEC" },	\
> -		{ NFSD_MAY_64BIT_COOKIE,	"64BIT_COOKIE" })
> +		{ NFSD_MAY_64BIT_COOKIE,	"64BIT_COOKIE" },	\
> +		{ NFSD_MAY_LOCALIO,		"LOCALIO" })
>  
>  TRACE_EVENT(nfsd_compound,
>  	TP_PROTO(
> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> index 57cd70062048..5146f0c81752 100644
> --- a/fs/nfsd/vfs.h
> +++ b/fs/nfsd/vfs.h
> @@ -33,6 +33,8 @@
>  
>  #define NFSD_MAY_64BIT_COOKIE		0x1000 /* 64 bit readdir cookies for >= NFSv3 */
>  
> +#define NFSD_MAY_LOCALIO		0x2000
> +
>  #define NFSD_MAY_CREATE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE)
>  #define NFSD_MAY_REMOVE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
>  
> @@ -158,6 +160,13 @@ __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
>  
>  void		nfsd_filp_close(struct file *fp);
>  
> +int		nfsd_open_local_fh(struct net *net,
> +				   struct rpc_clnt *rpc_clnt,
> +				   const struct cred *cred,
> +				   const struct nfs_fh *nfs_fh,
> +				   const fmode_t fmode,
> +				   struct file **pfilp);
> +
>  static inline int fh_want_write(struct svc_fh *fh)
>  {
>  	int ret;
Chuck Lever June 27, 2024, 4:07 p.m. UTC | #2
On Thu, Jun 27, 2024 at 11:48:09AM -0400, Jeff Layton wrote:
> On Wed, 2024-06-26 at 14:24 -0400, Mike Snitzer wrote:
> > Pass the stored cl_nfssvc_net from the client to the server as
> > first argument to nfsd_open_local_fh() to ensure the proper network
> > namespace is used for localio.
> > 
> > Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
> > Signed-off-by: Peng Tao <tao.peng@primarydata.com>
> > Signed-off-by: Lance Shelton <lance.shelton@hammerspace.com>
> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > Signed-off-by: Mike Snitzer <snitzer@kernel.org>
> > ---
> >  fs/nfsd/Makefile    |   1 +
> >  fs/nfsd/filecache.c |   2 +-
> >  fs/nfsd/localio.c   | 246 ++++++++++++++++++++++++++++++++++++++++++++
> >  fs/nfsd/nfssvc.c    |   1 +
> >  fs/nfsd/trace.h     |   3 +-
> >  fs/nfsd/vfs.h       |   9 ++
> >  6 files changed, 260 insertions(+), 2 deletions(-)
> >  create mode 100644 fs/nfsd/localio.c
> > 
> > diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
> > index b8736a82e57c..78b421778a79 100644
> > --- a/fs/nfsd/Makefile
> > +++ b/fs/nfsd/Makefile
> > @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
> >  nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
> >  nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
> >  nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
> > +nfsd-$(CONFIG_NFSD_LOCALIO) += localio.o
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index ad9083ca144b..99631fa56662 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -52,7 +52,7 @@
> >  #define NFSD_FILE_CACHE_UP		     (0)
> >  
> >  /* We only care about NFSD_MAY_READ/WRITE for this cache */
> > -#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE)
> > +#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO)
> >  
> >  static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
> >  static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
> > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> > new file mode 100644
> > index 000000000000..ba9187735947
> > --- /dev/null
> > +++ b/fs/nfsd/localio.c
> > @@ -0,0 +1,246 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * NFS server support for local clients to bypass network stack
> > + *
> > + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com>
> > + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com>
> > + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
> > + */
> > +
> > +#include <linux/exportfs.h>
> > +#include <linux/sunrpc/svcauth_gss.h>
> > +#include <linux/sunrpc/clnt.h>
> > +#include <linux/nfs.h>
> > +#include <linux/string.h>
> > +
> > +#include "nfsd.h"
> > +#include "vfs.h"
> > +#include "netns.h"
> > +#include "filecache.h"
> > +
> > +#define NFSDDBG_FACILITY		NFSDDBG_FH
> > +
> > +/*
> > + * We need to translate between nfs status return values and
> > + * the local errno values which may not be the same.
> > + * - duplicated from fs/nfs/nfs2xdr.c to avoid needless bloat of
> > + *   all compiled nfs objects if it were in include/linux/nfs.h
> > + */
> > +static const struct {
> > +	int stat;
> > +	int errno;
> > +} nfs_common_errtbl[] = {
> > +	{ NFS_OK,		0		},
> > +	{ NFSERR_PERM,		-EPERM		},
> > +	{ NFSERR_NOENT,		-ENOENT		},
> > +	{ NFSERR_IO,		-EIO		},
> > +	{ NFSERR_NXIO,		-ENXIO		},
> > +/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */
> > +	{ NFSERR_ACCES,		-EACCES		},
> > +	{ NFSERR_EXIST,		-EEXIST		},
> > +	{ NFSERR_XDEV,		-EXDEV		},
> > +	{ NFSERR_NODEV,		-ENODEV		},
> > +	{ NFSERR_NOTDIR,	-ENOTDIR	},
> > +	{ NFSERR_ISDIR,		-EISDIR		},
> > +	{ NFSERR_INVAL,		-EINVAL		},
> > +	{ NFSERR_FBIG,		-EFBIG		},
> > +	{ NFSERR_NOSPC,		-ENOSPC		},
> > +	{ NFSERR_ROFS,		-EROFS		},
> > +	{ NFSERR_MLINK,		-EMLINK		},
> > +	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	},
> > +	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	},
> > +	{ NFSERR_DQUOT,		-EDQUOT		},
> > +	{ NFSERR_STALE,		-ESTALE		},
> > +	{ NFSERR_REMOTE,	-EREMOTE	},
> > +#ifdef EWFLUSH
> > +	{ NFSERR_WFLUSH,	-EWFLUSH	},
> > +#endif
> > +	{ NFSERR_BADHANDLE,	-EBADHANDLE	},
> > +	{ NFSERR_NOT_SYNC,	-ENOTSYNC	},
> > +	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
> > +	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
> > +	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
> > +	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
> > +	{ NFSERR_BADTYPE,	-EBADTYPE	},
> > +	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
> > +	{ -1,			-EIO		}
> > +};
> > +
> > +/**
> > + * nfs_stat_to_errno - convert an NFS status code to a local errno
> > + * @status: NFS status code to convert
> > + *
> > + * Returns a local errno value, or -EIO if the NFS status code is
> > + * not recognized.  nfsd_file_acquire() returns an nfsstat that
> > + * needs to be translated to an errno before being returned to a
> > + * local client application.
> > + */
> > +static int nfs_stat_to_errno(enum nfs_stat status)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; nfs_common_errtbl[i].stat != -1; i++) {
> > +		if (nfs_common_errtbl[i].stat == (int)status)
> > +			return nfs_common_errtbl[i].errno;
> > +	}
> > +	return nfs_common_errtbl[i].errno;
> > +}
> > +
> > +static void
> > +nfsd_local_fakerqst_destroy(struct svc_rqst *rqstp)
> > +{
> > +	if (rqstp->rq_client)
> > +		auth_domain_put(rqstp->rq_client);
> > +	if (rqstp->rq_cred.cr_group_info)
> > +		put_group_info(rqstp->rq_cred.cr_group_info);
> > +	/* rpcauth_map_to_svc_cred_local() clears cr_principal */
> > +	WARN_ON_ONCE(rqstp->rq_cred.cr_principal != NULL);
> > +	kfree(rqstp->rq_xprt);
> > +	kfree(rqstp);
> > +}
> > +
> > +static struct svc_rqst *
> > +nfsd_local_fakerqst_create(struct net *net, struct rpc_clnt *rpc_clnt,
> > +			const struct cred *cred)
> > +{
> > +	struct svc_rqst *rqstp;
> > +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +	int status;
> > +
> > +	/* FIXME: not running in nfsd context, must get reference on nfsd_serv */
> > +	if (unlikely(!READ_ONCE(nn->nfsd_serv))) {
> > +		dprintk("%s: localio denied. Server not running\n", __func__);
> 
> Chuck mentioned this earlier, but I don't think we ought to merge the
> dprintks. If they're useful for debugging then they should be turned
> into tracepoints. This one, I'd probably just drop.

Right; the problem with dprintk() is they can create so much chatter
that the systemd journal will automatically toss those messages and
they are lost. No diagnostic value in that.

Also you probably won't find it useful if lots of debugging info
goes into the trace log but a handful of the stuff you are
looking for is dumped into the system journal; the two use different
timestamps and so are really hard to line up after the fact.

We're trying to transition away from dprintk() in NFSD for these
reasons.


> > +		return ERR_PTR(-ENXIO);
> > +	}
> > +
> > +	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
> > +	if (!rqstp)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	rqstp->rq_xprt = kzalloc(sizeof(*rqstp->rq_xprt), GFP_KERNEL);
> > +	if (!rqstp->rq_xprt) {
> > +		status = -ENOMEM;
> > +		goto out_err;
> > +	}
> > +
> > +	rqstp->rq_xprt->xpt_net = net;
> > +	__set_bit(RQ_SECURE, &rqstp->rq_flags);
> > +	rqstp->rq_proc = 1;
> > +	rqstp->rq_vers = 3;
> > +	rqstp->rq_prot = IPPROTO_TCP;
> > +	rqstp->rq_server = nn->nfsd_serv;
> > +
> > +	/* Note: we're connecting to ourself, so source addr == peer addr */
> > +	rqstp->rq_addrlen = rpc_peeraddr(rpc_clnt,
> > +			(struct sockaddr *)&rqstp->rq_addr,
> > +			sizeof(rqstp->rq_addr));
> > +
> > +	rpcauth_map_to_svc_cred_local(rpc_clnt->cl_auth, cred, &rqstp->rq_cred);
> > +
> > +	/*
> > +	 * set up enough for svcauth_unix_set_client to be able to wait
> > +	 * for the cache downcall. Note that we do _not_ want to allow the
> > +	 * request to be deferred for later revisit since this rqst and xprt
> > +	 * are not set up to run inside of the normal svc_rqst engine.
> > +	 */
> > +	INIT_LIST_HEAD(&rqstp->rq_xprt->xpt_deferred);
> > +	kref_init(&rqstp->rq_xprt->xpt_ref);
> > +	spin_lock_init(&rqstp->rq_xprt->xpt_lock);
> > +	rqstp->rq_chandle.thread_wait = 5 * HZ;
> > +
> > +	status = svcauth_unix_set_client(rqstp);
> > +	switch (status) {
> > +	case SVC_OK:
> > +		break;
> > +	case SVC_DENIED:
> > +		status = -ENXIO;
> > +		dprintk("%s: client %pISpc denied localio access\n",
> > +				__func__, (struct sockaddr *)&rqstp->rq_addr);
> > +		goto out_err;
> > +	default:
> > +		status = -ETIMEDOUT;
> > +		dprintk("%s: client %pISpc temporarily denied localio access\n",
> > +				__func__, (struct sockaddr *)&rqstp->rq_addr);
> > +		goto out_err;
> > +	}
> > +
> > +	return rqstp;
> > +
> > +out_err:
> 
> The two above can probably be turned into a single tracepoint here,
> though it might just be best to have a single tracepoint that always
> fires when this function exits.
> 
> > +	nfsd_local_fakerqst_destroy(rqstp);
> > +	return ERR_PTR(status);
> > +}
> > +
> > +/*
> > + * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to @file
> > + *
> > + * This function maps a local fh to a path on a local filesystem.
> > + * This is useful when the nfs client has the local server mounted - it can
> > + * avoid all the NFS overhead with reads, writes and commits.
> > + *
> > + * on successful return, caller is responsible for calling path_put. Also
> > + * note that this is called from nfs.ko via find_symbol() to avoid an explicit
> > + * dependency on knfsd. So, there is no forward declaration in a header file
> > + * for it.
> > + */
> > +int nfsd_open_local_fh(struct net *net,
> > +			 struct rpc_clnt *rpc_clnt,
> > +			 const struct cred *cred,
> > +			 const struct nfs_fh *nfs_fh,
> > +			 const fmode_t fmode,
> > +			 struct file **pfilp)
> > +{
> > +	const struct cred *save_cred;
> > +	struct svc_rqst *rqstp;
> > +	struct svc_fh fh;
> > +	struct nfsd_file *nf;
> > +	int status = 0;
> > +	int mayflags = NFSD_MAY_LOCALIO;
> > +	__be32 beres;
> > +
> > +	/* Save creds before calling into nfsd */
> > +	save_cred = get_current_cred();
> > +
> > +	rqstp = nfsd_local_fakerqst_create(net, rpc_clnt, cred);
> > +	if (IS_ERR(rqstp)) {
> > +		status = PTR_ERR(rqstp);
> > +		goto out_revertcred;
> > +	}
> > +
> > +	/* nfs_fh -> svc_fh */
> > +	if (nfs_fh->size > NFS4_FHSIZE) {
> > +		status = -EINVAL;
> > +		goto out;
> > +	}
> > +	fh_init(&fh, NFS4_FHSIZE);
> > +	fh.fh_handle.fh_size = nfs_fh->size;
> > +	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
> > +
> > +	if (fmode & FMODE_READ)
> > +		mayflags |= NFSD_MAY_READ;
> > +	if (fmode & FMODE_WRITE)
> > +		mayflags |= NFSD_MAY_WRITE;
> > +
> > +	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
> > +	if (beres) {
> > +		status = nfs_stat_to_errno(be32_to_cpu(beres));
> > +		dprintk("%s: fh_verify failed %d\n", __func__, status);
> 
> This should also be a tracepoint.
> 
> > +		goto out_fh_put;
> > +	}
> > +
> > +	*pfilp = get_file(nf->nf_file);
> > +
> > +	nfsd_file_put(nf);
> > +out_fh_put:
> > +	fh_put(&fh);
> > +
> > +out:
> > +	nfsd_local_fakerqst_destroy(rqstp);
> > +out_revertcred:
> > +	revert_creds(save_cred);
> > +	return status;
> > +}
> > +EXPORT_SYMBOL_GPL(nfsd_open_local_fh);
> > +
> > +/* Compile time type checking, not used by anything */
> > +static nfs_to_nfsd_open_t __maybe_unused nfsd_open_local_fh_typecheck = nfsd_open_local_fh;
> > diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> > index 1222a0a33fe1..a477d2c5088a 100644
> > --- a/fs/nfsd/nfssvc.c
> > +++ b/fs/nfsd/nfssvc.c
> > @@ -431,6 +431,7 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred)
> >  #endif
> >  #if IS_ENABLED(CONFIG_NFSD_LOCALIO)
> >  	INIT_LIST_HEAD(&nn->nfsd_uuid.list);
> > +	nn->nfsd_uuid.net = net;
> >  	list_add_tail_rcu(&nn->nfsd_uuid.list, &nfsd_uuids);
> >  #endif
> >  	nn->nfsd_net_up = true;
> > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > index 77bbd23aa150..9c0610fdd11c 100644
> > --- a/fs/nfsd/trace.h
> > +++ b/fs/nfsd/trace.h
> > @@ -86,7 +86,8 @@ DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
> >  		{ NFSD_MAY_NOT_BREAK_LEASE,	"NOT_BREAK_LEASE" },	\
> >  		{ NFSD_MAY_BYPASS_GSS,		"BYPASS_GSS" },		\
> >  		{ NFSD_MAY_READ_IF_EXEC,	"READ_IF_EXEC" },	\
> > -		{ NFSD_MAY_64BIT_COOKIE,	"64BIT_COOKIE" })
> > +		{ NFSD_MAY_64BIT_COOKIE,	"64BIT_COOKIE" },	\
> > +		{ NFSD_MAY_LOCALIO,		"LOCALIO" })
> >  
> >  TRACE_EVENT(nfsd_compound,
> >  	TP_PROTO(
> > diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> > index 57cd70062048..5146f0c81752 100644
> > --- a/fs/nfsd/vfs.h
> > +++ b/fs/nfsd/vfs.h
> > @@ -33,6 +33,8 @@
> >  
> >  #define NFSD_MAY_64BIT_COOKIE		0x1000 /* 64 bit readdir cookies for >= NFSv3 */
> >  
> > +#define NFSD_MAY_LOCALIO		0x2000
> > +
> >  #define NFSD_MAY_CREATE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE)
> >  #define NFSD_MAY_REMOVE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
> >  
> > @@ -158,6 +160,13 @@ __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
> >  
> >  void		nfsd_filp_close(struct file *fp);
> >  
> > +int		nfsd_open_local_fh(struct net *net,
> > +				   struct rpc_clnt *rpc_clnt,
> > +				   const struct cred *cred,
> > +				   const struct nfs_fh *nfs_fh,
> > +				   const fmode_t fmode,
> > +				   struct file **pfilp);
> > +
> >  static inline int fh_want_write(struct svc_fh *fh)
> >  {
> >  	int ret;
> 
> -- 
> Jeff Layton <jlayton@kernel.org>
Mike Snitzer June 27, 2024, 5:27 p.m. UTC | #3
On Thu, Jun 27, 2024 at 12:07:03PM -0400, Chuck Lever wrote:
> On Thu, Jun 27, 2024 at 11:48:09AM -0400, Jeff Layton wrote:
> > On Wed, 2024-06-26 at 14:24 -0400, Mike Snitzer wrote:
> > > Pass the stored cl_nfssvc_net from the client to the server as
> > > first argument to nfsd_open_local_fh() to ensure the proper network
> > > namespace is used for localio.
> > > 
> > > Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
> > > Signed-off-by: Peng Tao <tao.peng@primarydata.com>
> > > Signed-off-by: Lance Shelton <lance.shelton@hammerspace.com>
> > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > Signed-off-by: Mike Snitzer <snitzer@kernel.org>
> > > ---
> > >  fs/nfsd/Makefile    |   1 +
> > >  fs/nfsd/filecache.c |   2 +-
> > >  fs/nfsd/localio.c   | 246 ++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/nfsd/nfssvc.c    |   1 +
> > >  fs/nfsd/trace.h     |   3 +-
> > >  fs/nfsd/vfs.h       |   9 ++
> > >  6 files changed, 260 insertions(+), 2 deletions(-)
> > >  create mode 100644 fs/nfsd/localio.c
> > > 
> > > diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
> > > index b8736a82e57c..78b421778a79 100644
> > > --- a/fs/nfsd/Makefile
> > > +++ b/fs/nfsd/Makefile
> > > @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
> > >  nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
> > >  nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
> > >  nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
> > > +nfsd-$(CONFIG_NFSD_LOCALIO) += localio.o
> > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > > index ad9083ca144b..99631fa56662 100644
> > > --- a/fs/nfsd/filecache.c
> > > +++ b/fs/nfsd/filecache.c
> > > @@ -52,7 +52,7 @@
> > >  #define NFSD_FILE_CACHE_UP		     (0)
> > >  
> > >  /* We only care about NFSD_MAY_READ/WRITE for this cache */
> > > -#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE)
> > > +#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO)
> > >  
> > >  static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
> > >  static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
> > > diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> > > new file mode 100644
> > > index 000000000000..ba9187735947
> > > --- /dev/null
> > > +++ b/fs/nfsd/localio.c
> > > @@ -0,0 +1,246 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/*
> > > + * NFS server support for local clients to bypass network stack
> > > + *
> > > + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com>
> > > + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com>
> > > + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
> > > + */
> > > +
> > > +#include <linux/exportfs.h>
> > > +#include <linux/sunrpc/svcauth_gss.h>
> > > +#include <linux/sunrpc/clnt.h>
> > > +#include <linux/nfs.h>
> > > +#include <linux/string.h>
> > > +
> > > +#include "nfsd.h"
> > > +#include "vfs.h"
> > > +#include "netns.h"
> > > +#include "filecache.h"
> > > +
> > > +#define NFSDDBG_FACILITY		NFSDDBG_FH
> > > +
> > > +/*
> > > + * We need to translate between nfs status return values and
> > > + * the local errno values which may not be the same.
> > > + * - duplicated from fs/nfs/nfs2xdr.c to avoid needless bloat of
> > > + *   all compiled nfs objects if it were in include/linux/nfs.h
> > > + */
> > > +static const struct {
> > > +	int stat;
> > > +	int errno;
> > > +} nfs_common_errtbl[] = {
> > > +	{ NFS_OK,		0		},
> > > +	{ NFSERR_PERM,		-EPERM		},
> > > +	{ NFSERR_NOENT,		-ENOENT		},
> > > +	{ NFSERR_IO,		-EIO		},
> > > +	{ NFSERR_NXIO,		-ENXIO		},
> > > +/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */
> > > +	{ NFSERR_ACCES,		-EACCES		},
> > > +	{ NFSERR_EXIST,		-EEXIST		},
> > > +	{ NFSERR_XDEV,		-EXDEV		},
> > > +	{ NFSERR_NODEV,		-ENODEV		},
> > > +	{ NFSERR_NOTDIR,	-ENOTDIR	},
> > > +	{ NFSERR_ISDIR,		-EISDIR		},
> > > +	{ NFSERR_INVAL,		-EINVAL		},
> > > +	{ NFSERR_FBIG,		-EFBIG		},
> > > +	{ NFSERR_NOSPC,		-ENOSPC		},
> > > +	{ NFSERR_ROFS,		-EROFS		},
> > > +	{ NFSERR_MLINK,		-EMLINK		},
> > > +	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	},
> > > +	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	},
> > > +	{ NFSERR_DQUOT,		-EDQUOT		},
> > > +	{ NFSERR_STALE,		-ESTALE		},
> > > +	{ NFSERR_REMOTE,	-EREMOTE	},
> > > +#ifdef EWFLUSH
> > > +	{ NFSERR_WFLUSH,	-EWFLUSH	},
> > > +#endif
> > > +	{ NFSERR_BADHANDLE,	-EBADHANDLE	},
> > > +	{ NFSERR_NOT_SYNC,	-ENOTSYNC	},
> > > +	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
> > > +	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
> > > +	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
> > > +	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
> > > +	{ NFSERR_BADTYPE,	-EBADTYPE	},
> > > +	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
> > > +	{ -1,			-EIO		}
> > > +};
> > > +
> > > +/**
> > > + * nfs_stat_to_errno - convert an NFS status code to a local errno
> > > + * @status: NFS status code to convert
> > > + *
> > > + * Returns a local errno value, or -EIO if the NFS status code is
> > > + * not recognized.  nfsd_file_acquire() returns an nfsstat that
> > > + * needs to be translated to an errno before being returned to a
> > > + * local client application.
> > > + */
> > > +static int nfs_stat_to_errno(enum nfs_stat status)
> > > +{
> > > +	int i;
> > > +
> > > +	for (i = 0; nfs_common_errtbl[i].stat != -1; i++) {
> > > +		if (nfs_common_errtbl[i].stat == (int)status)
> > > +			return nfs_common_errtbl[i].errno;
> > > +	}
> > > +	return nfs_common_errtbl[i].errno;
> > > +}
> > > +
> > > +static void
> > > +nfsd_local_fakerqst_destroy(struct svc_rqst *rqstp)
> > > +{
> > > +	if (rqstp->rq_client)
> > > +		auth_domain_put(rqstp->rq_client);
> > > +	if (rqstp->rq_cred.cr_group_info)
> > > +		put_group_info(rqstp->rq_cred.cr_group_info);
> > > +	/* rpcauth_map_to_svc_cred_local() clears cr_principal */
> > > +	WARN_ON_ONCE(rqstp->rq_cred.cr_principal != NULL);
> > > +	kfree(rqstp->rq_xprt);
> > > +	kfree(rqstp);
> > > +}
> > > +
> > > +static struct svc_rqst *
> > > +nfsd_local_fakerqst_create(struct net *net, struct rpc_clnt *rpc_clnt,
> > > +			const struct cred *cred)
> > > +{
> > > +	struct svc_rqst *rqstp;
> > > +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > > +	int status;
> > > +
> > > +	/* FIXME: not running in nfsd context, must get reference on nfsd_serv */
> > > +	if (unlikely(!READ_ONCE(nn->nfsd_serv))) {
> > > +		dprintk("%s: localio denied. Server not running\n", __func__);
> > 
> > Chuck mentioned this earlier, but I don't think we ought to merge the
> > dprintks. If they're useful for debugging then they should be turned
> > into tracepoints. This one, I'd probably just drop.
> 
> Right; the problem with dprintk() is they can create so much chatter
> that the systemd journal will automatically toss those messages and
> they are lost. No diagnostic value in that.
> 
> Also you probably won't find it useful if lots of debugging info
> goes into the trace log but a handful of the stuff you are
> looking for is dumped into the system journal; the two use different
> timestamps and so are really hard to line up after the fact.
> 
> We're trying to transition away from dprintk() in NFSD for these
> reasons.

OK, I understand wanting to not allow new dprintk() to be added.

Meanwhile:
$ grep -ri dprintk fs/nfsd/*.[ch]  | wc -l
     181

So I'm struggling to get motivated to convert to tracepoints.  Feels
like a needless make-work hurdle, these could be converted by others
more proficient with tracepoints if/when needed.

Making everyone have to be proficient at developing debugging via
tracepoints seems misplaced (but I also understand that forcing others
to fish enables "others" to not be doing the conversion work).

This is the end of my mild protest.. I'll shutup and either convert
the dprintk()s or kill them all. ;)

Thanks,
Mike
Chuck Lever June 27, 2024, 5:50 p.m. UTC | #4
> On Jun 27, 2024, at 1:27 PM, Mike Snitzer <snitzer@kernel.org> wrote:
> 
> On Thu, Jun 27, 2024 at 12:07:03PM -0400, Chuck Lever wrote:
>> On Thu, Jun 27, 2024 at 11:48:09AM -0400, Jeff Layton wrote:
>>> On Wed, 2024-06-26 at 14:24 -0400, Mike Snitzer wrote:
>>>> Pass the stored cl_nfssvc_net from the client to the server as
>>>> first argument to nfsd_open_local_fh() to ensure the proper network
>>>> namespace is used for localio.
>>>> 
>>>> Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
>>>> Signed-off-by: Peng Tao <tao.peng@primarydata.com>
>>>> Signed-off-by: Lance Shelton <lance.shelton@hammerspace.com>
>>>> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
>>>> Signed-off-by: Mike Snitzer <snitzer@kernel.org>
>>>> ---
>>>>  fs/nfsd/Makefile    |   1 +
>>>>  fs/nfsd/filecache.c |   2 +-
>>>>  fs/nfsd/localio.c   | 246 ++++++++++++++++++++++++++++++++++++++++++++
>>>>  fs/nfsd/nfssvc.c    |   1 +
>>>>  fs/nfsd/trace.h     |   3 +-
>>>>  fs/nfsd/vfs.h       |   9 ++
>>>>  6 files changed, 260 insertions(+), 2 deletions(-)
>>>>  create mode 100644 fs/nfsd/localio.c
>>>> 
>>>> diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
>>>> index b8736a82e57c..78b421778a79 100644
>>>> --- a/fs/nfsd/Makefile
>>>> +++ b/fs/nfsd/Makefile
>>>> @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
>>>>  nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
>>>>  nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
>>>>  nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
>>>> +nfsd-$(CONFIG_NFSD_LOCALIO) += localio.o
>>>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
>>>> index ad9083ca144b..99631fa56662 100644
>>>> --- a/fs/nfsd/filecache.c
>>>> +++ b/fs/nfsd/filecache.c
>>>> @@ -52,7 +52,7 @@
>>>>  #define NFSD_FILE_CACHE_UP      (0)
>>>>  
>>>>  /* We only care about NFSD_MAY_READ/WRITE for this cache */
>>>> -#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE)
>>>> +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO)
>>>>  
>>>>  static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
>>>>  static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
>>>> diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
>>>> new file mode 100644
>>>> index 000000000000..ba9187735947
>>>> --- /dev/null
>>>> +++ b/fs/nfsd/localio.c
>>>> @@ -0,0 +1,246 @@
>>>> +// SPDX-License-Identifier: GPL-2.0-only
>>>> +/*
>>>> + * NFS server support for local clients to bypass network stack
>>>> + *
>>>> + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com>
>>>> + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com>
>>>> + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
>>>> + */
>>>> +
>>>> +#include <linux/exportfs.h>
>>>> +#include <linux/sunrpc/svcauth_gss.h>
>>>> +#include <linux/sunrpc/clnt.h>
>>>> +#include <linux/nfs.h>
>>>> +#include <linux/string.h>
>>>> +
>>>> +#include "nfsd.h"
>>>> +#include "vfs.h"
>>>> +#include "netns.h"
>>>> +#include "filecache.h"
>>>> +
>>>> +#define NFSDDBG_FACILITY NFSDDBG_FH
>>>> +
>>>> +/*
>>>> + * We need to translate between nfs status return values and
>>>> + * the local errno values which may not be the same.
>>>> + * - duplicated from fs/nfs/nfs2xdr.c to avoid needless bloat of
>>>> + *   all compiled nfs objects if it were in include/linux/nfs.h
>>>> + */
>>>> +static const struct {
>>>> + int stat;
>>>> + int errno;
>>>> +} nfs_common_errtbl[] = {
>>>> + { NFS_OK, 0 },
>>>> + { NFSERR_PERM, -EPERM },
>>>> + { NFSERR_NOENT, -ENOENT },
>>>> + { NFSERR_IO, -EIO },
>>>> + { NFSERR_NXIO, -ENXIO },
>>>> +/* { NFSERR_EAGAIN, -EAGAIN }, */
>>>> + { NFSERR_ACCES, -EACCES },
>>>> + { NFSERR_EXIST, -EEXIST },
>>>> + { NFSERR_XDEV, -EXDEV },
>>>> + { NFSERR_NODEV, -ENODEV },
>>>> + { NFSERR_NOTDIR, -ENOTDIR },
>>>> + { NFSERR_ISDIR, -EISDIR },
>>>> + { NFSERR_INVAL, -EINVAL },
>>>> + { NFSERR_FBIG, -EFBIG },
>>>> + { NFSERR_NOSPC, -ENOSPC },
>>>> + { NFSERR_ROFS, -EROFS },
>>>> + { NFSERR_MLINK, -EMLINK },
>>>> + { NFSERR_NAMETOOLONG, -ENAMETOOLONG },
>>>> + { NFSERR_NOTEMPTY, -ENOTEMPTY },
>>>> + { NFSERR_DQUOT, -EDQUOT },
>>>> + { NFSERR_STALE, -ESTALE },
>>>> + { NFSERR_REMOTE, -EREMOTE },
>>>> +#ifdef EWFLUSH
>>>> + { NFSERR_WFLUSH, -EWFLUSH },
>>>> +#endif
>>>> + { NFSERR_BADHANDLE, -EBADHANDLE },
>>>> + { NFSERR_NOT_SYNC, -ENOTSYNC },
>>>> + { NFSERR_BAD_COOKIE, -EBADCOOKIE },
>>>> + { NFSERR_NOTSUPP, -ENOTSUPP },
>>>> + { NFSERR_TOOSMALL, -ETOOSMALL },
>>>> + { NFSERR_SERVERFAULT, -EREMOTEIO },
>>>> + { NFSERR_BADTYPE, -EBADTYPE },
>>>> + { NFSERR_JUKEBOX, -EJUKEBOX },
>>>> + { -1, -EIO }
>>>> +};
>>>> +
>>>> +/**
>>>> + * nfs_stat_to_errno - convert an NFS status code to a local errno
>>>> + * @status: NFS status code to convert
>>>> + *
>>>> + * Returns a local errno value, or -EIO if the NFS status code is
>>>> + * not recognized.  nfsd_file_acquire() returns an nfsstat that
>>>> + * needs to be translated to an errno before being returned to a
>>>> + * local client application.
>>>> + */
>>>> +static int nfs_stat_to_errno(enum nfs_stat status)
>>>> +{
>>>> + int i;
>>>> +
>>>> + for (i = 0; nfs_common_errtbl[i].stat != -1; i++) {
>>>> + if (nfs_common_errtbl[i].stat == (int)status)
>>>> + return nfs_common_errtbl[i].errno;
>>>> + }
>>>> + return nfs_common_errtbl[i].errno;
>>>> +}
>>>> +
>>>> +static void
>>>> +nfsd_local_fakerqst_destroy(struct svc_rqst *rqstp)
>>>> +{
>>>> + if (rqstp->rq_client)
>>>> + auth_domain_put(rqstp->rq_client);
>>>> + if (rqstp->rq_cred.cr_group_info)
>>>> + put_group_info(rqstp->rq_cred.cr_group_info);
>>>> + /* rpcauth_map_to_svc_cred_local() clears cr_principal */
>>>> + WARN_ON_ONCE(rqstp->rq_cred.cr_principal != NULL);
>>>> + kfree(rqstp->rq_xprt);
>>>> + kfree(rqstp);
>>>> +}
>>>> +
>>>> +static struct svc_rqst *
>>>> +nfsd_local_fakerqst_create(struct net *net, struct rpc_clnt *rpc_clnt,
>>>> + const struct cred *cred)
>>>> +{
>>>> + struct svc_rqst *rqstp;
>>>> + struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>>>> + int status;
>>>> +
>>>> + /* FIXME: not running in nfsd context, must get reference on nfsd_serv */
>>>> + if (unlikely(!READ_ONCE(nn->nfsd_serv))) {
>>>> + dprintk("%s: localio denied. Server not running\n", __func__);
>>> 
>>> Chuck mentioned this earlier, but I don't think we ought to merge the
>>> dprintks. If they're useful for debugging then they should be turned
>>> into tracepoints. This one, I'd probably just drop.
>> 
>> Right; the problem with dprintk() is they can create so much chatter
>> that the systemd journal will automatically toss those messages and
>> they are lost. No diagnostic value in that.
>> 
>> Also you probably won't find it useful if lots of debugging info
>> goes into the trace log but a handful of the stuff you are
>> looking for is dumped into the system journal; the two use different
>> timestamps and so are really hard to line up after the fact.
>> 
>> We're trying to transition away from dprintk() in NFSD for these
>> reasons.
> 
> OK, I understand wanting to not allow new dprintk() to be added.
> 
> Meanwhile:
> $ grep -ri dprintk fs/nfsd/*.[ch]  | wc -l
>     181
> 
> So I'm struggling to get motivated to convert to tracepoints.  Feels
> like a needless make-work hurdle, these could be converted by others
> more proficient with tracepoints if/when needed.
> 
> Making everyone have to be proficient at developing debugging via
> tracepoints seems misplaced (but I also understand that forcing others
> to fish enables "others" to not be doing the conversion work).

Trace points are part of the cost of contributing to NFSD,
just like XDR, RCU, lockdep_assert, and dozens of other
kernel facilities. Not a hurdle, and I don't ask for busy
work changes.


> This is the end of my mild protest.. I'll shutup and either convert
> the dprintk()s or kill them all. ;)

IMO, if a dprintk is killable (or you don't know its needed
yet) then it shouldn't be included in the patch series.


--
Chuck Lever
NeilBrown June 28, 2024, 3:35 a.m. UTC | #5
On Fri, 28 Jun 2024, Chuck Lever III wrote:
> 
> 
> > On Jun 27, 2024, at 1:27 PM, Mike Snitzer <snitzer@kernel.org> wrote:
> > 
> > On Thu, Jun 27, 2024 at 12:07:03PM -0400, Chuck Lever wrote:
> >> On Thu, Jun 27, 2024 at 11:48:09AM -0400, Jeff Layton wrote:
> >>> On Wed, 2024-06-26 at 14:24 -0400, Mike Snitzer wrote:
> >>>> Pass the stored cl_nfssvc_net from the client to the server as
> >>>> first argument to nfsd_open_local_fh() to ensure the proper network
> >>>> namespace is used for localio.
> >>>> 
> >>>> Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
> >>>> Signed-off-by: Peng Tao <tao.peng@primarydata.com>
> >>>> Signed-off-by: Lance Shelton <lance.shelton@hammerspace.com>
> >>>> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> >>>> Signed-off-by: Mike Snitzer <snitzer@kernel.org>
> >>>> ---
> >>>>  fs/nfsd/Makefile    |   1 +
> >>>>  fs/nfsd/filecache.c |   2 +-
> >>>>  fs/nfsd/localio.c   | 246 ++++++++++++++++++++++++++++++++++++++++++++
> >>>>  fs/nfsd/nfssvc.c    |   1 +
> >>>>  fs/nfsd/trace.h     |   3 +-
> >>>>  fs/nfsd/vfs.h       |   9 ++
> >>>>  6 files changed, 260 insertions(+), 2 deletions(-)
> >>>>  create mode 100644 fs/nfsd/localio.c
> >>>> 
> >>>> diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
> >>>> index b8736a82e57c..78b421778a79 100644
> >>>> --- a/fs/nfsd/Makefile
> >>>> +++ b/fs/nfsd/Makefile
> >>>> @@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
> >>>>  nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
> >>>>  nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
> >>>>  nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
> >>>> +nfsd-$(CONFIG_NFSD_LOCALIO) += localio.o
> >>>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> >>>> index ad9083ca144b..99631fa56662 100644
> >>>> --- a/fs/nfsd/filecache.c
> >>>> +++ b/fs/nfsd/filecache.c
> >>>> @@ -52,7 +52,7 @@
> >>>>  #define NFSD_FILE_CACHE_UP      (0)
> >>>>  
> >>>>  /* We only care about NFSD_MAY_READ/WRITE for this cache */
> >>>> -#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE)
> >>>> +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO)
> >>>>  
> >>>>  static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
> >>>>  static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
> >>>> diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
> >>>> new file mode 100644
> >>>> index 000000000000..ba9187735947
> >>>> --- /dev/null
> >>>> +++ b/fs/nfsd/localio.c
> >>>> @@ -0,0 +1,246 @@
> >>>> +// SPDX-License-Identifier: GPL-2.0-only
> >>>> +/*
> >>>> + * NFS server support for local clients to bypass network stack
> >>>> + *
> >>>> + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com>
> >>>> + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com>
> >>>> + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
> >>>> + */
> >>>> +
> >>>> +#include <linux/exportfs.h>
> >>>> +#include <linux/sunrpc/svcauth_gss.h>
> >>>> +#include <linux/sunrpc/clnt.h>
> >>>> +#include <linux/nfs.h>
> >>>> +#include <linux/string.h>
> >>>> +
> >>>> +#include "nfsd.h"
> >>>> +#include "vfs.h"
> >>>> +#include "netns.h"
> >>>> +#include "filecache.h"
> >>>> +
> >>>> +#define NFSDDBG_FACILITY NFSDDBG_FH
> >>>> +
> >>>> +/*
> >>>> + * We need to translate between nfs status return values and
> >>>> + * the local errno values which may not be the same.
> >>>> + * - duplicated from fs/nfs/nfs2xdr.c to avoid needless bloat of
> >>>> + *   all compiled nfs objects if it were in include/linux/nfs.h
> >>>> + */
> >>>> +static const struct {
> >>>> + int stat;
> >>>> + int errno;
> >>>> +} nfs_common_errtbl[] = {
> >>>> + { NFS_OK, 0 },
> >>>> + { NFSERR_PERM, -EPERM },
> >>>> + { NFSERR_NOENT, -ENOENT },
> >>>> + { NFSERR_IO, -EIO },
> >>>> + { NFSERR_NXIO, -ENXIO },
> >>>> +/* { NFSERR_EAGAIN, -EAGAIN }, */
> >>>> + { NFSERR_ACCES, -EACCES },
> >>>> + { NFSERR_EXIST, -EEXIST },
> >>>> + { NFSERR_XDEV, -EXDEV },
> >>>> + { NFSERR_NODEV, -ENODEV },
> >>>> + { NFSERR_NOTDIR, -ENOTDIR },
> >>>> + { NFSERR_ISDIR, -EISDIR },
> >>>> + { NFSERR_INVAL, -EINVAL },
> >>>> + { NFSERR_FBIG, -EFBIG },
> >>>> + { NFSERR_NOSPC, -ENOSPC },
> >>>> + { NFSERR_ROFS, -EROFS },
> >>>> + { NFSERR_MLINK, -EMLINK },
> >>>> + { NFSERR_NAMETOOLONG, -ENAMETOOLONG },
> >>>> + { NFSERR_NOTEMPTY, -ENOTEMPTY },
> >>>> + { NFSERR_DQUOT, -EDQUOT },
> >>>> + { NFSERR_STALE, -ESTALE },
> >>>> + { NFSERR_REMOTE, -EREMOTE },
> >>>> +#ifdef EWFLUSH
> >>>> + { NFSERR_WFLUSH, -EWFLUSH },
> >>>> +#endif
> >>>> + { NFSERR_BADHANDLE, -EBADHANDLE },
> >>>> + { NFSERR_NOT_SYNC, -ENOTSYNC },
> >>>> + { NFSERR_BAD_COOKIE, -EBADCOOKIE },
> >>>> + { NFSERR_NOTSUPP, -ENOTSUPP },
> >>>> + { NFSERR_TOOSMALL, -ETOOSMALL },
> >>>> + { NFSERR_SERVERFAULT, -EREMOTEIO },
> >>>> + { NFSERR_BADTYPE, -EBADTYPE },
> >>>> + { NFSERR_JUKEBOX, -EJUKEBOX },
> >>>> + { -1, -EIO }
> >>>> +};
> >>>> +
> >>>> +/**
> >>>> + * nfs_stat_to_errno - convert an NFS status code to a local errno
> >>>> + * @status: NFS status code to convert
> >>>> + *
> >>>> + * Returns a local errno value, or -EIO if the NFS status code is
> >>>> + * not recognized.  nfsd_file_acquire() returns an nfsstat that
> >>>> + * needs to be translated to an errno before being returned to a
> >>>> + * local client application.
> >>>> + */
> >>>> +static int nfs_stat_to_errno(enum nfs_stat status)
> >>>> +{
> >>>> + int i;
> >>>> +
> >>>> + for (i = 0; nfs_common_errtbl[i].stat != -1; i++) {
> >>>> + if (nfs_common_errtbl[i].stat == (int)status)
> >>>> + return nfs_common_errtbl[i].errno;
> >>>> + }
> >>>> + return nfs_common_errtbl[i].errno;
> >>>> +}
> >>>> +
> >>>> +static void
> >>>> +nfsd_local_fakerqst_destroy(struct svc_rqst *rqstp)
> >>>> +{
> >>>> + if (rqstp->rq_client)
> >>>> + auth_domain_put(rqstp->rq_client);
> >>>> + if (rqstp->rq_cred.cr_group_info)
> >>>> + put_group_info(rqstp->rq_cred.cr_group_info);
> >>>> + /* rpcauth_map_to_svc_cred_local() clears cr_principal */
> >>>> + WARN_ON_ONCE(rqstp->rq_cred.cr_principal != NULL);
> >>>> + kfree(rqstp->rq_xprt);
> >>>> + kfree(rqstp);
> >>>> +}
> >>>> +
> >>>> +static struct svc_rqst *
> >>>> +nfsd_local_fakerqst_create(struct net *net, struct rpc_clnt *rpc_clnt,
> >>>> + const struct cred *cred)
> >>>> +{
> >>>> + struct svc_rqst *rqstp;
> >>>> + struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> >>>> + int status;
> >>>> +
> >>>> + /* FIXME: not running in nfsd context, must get reference on nfsd_serv */
> >>>> + if (unlikely(!READ_ONCE(nn->nfsd_serv))) {
> >>>> + dprintk("%s: localio denied. Server not running\n", __func__);
> >>> 
> >>> Chuck mentioned this earlier, but I don't think we ought to merge the
> >>> dprintks. If they're useful for debugging then they should be turned
> >>> into tracepoints. This one, I'd probably just drop.
> >> 
> >> Right; the problem with dprintk() is they can create so much chatter
> >> that the systemd journal will automatically toss those messages and
> >> they are lost. No diagnostic value in that.
> >> 
> >> Also you probably won't find it useful if lots of debugging info
> >> goes into the trace log but a handful of the stuff you are
> >> looking for is dumped into the system journal; the two use different
> >> timestamps and so are really hard to line up after the fact.
> >> 
> >> We're trying to transition away from dprintk() in NFSD for these
> >> reasons.
> > 
> > OK, I understand wanting to not allow new dprintk() to be added.
> > 
> > Meanwhile:
> > $ grep -ri dprintk fs/nfsd/*.[ch]  | wc -l
> >     181
> > 
> > So I'm struggling to get motivated to convert to tracepoints.  Feels
> > like a needless make-work hurdle, these could be converted by others
> > more proficient with tracepoints if/when needed.
> > 
> > Making everyone have to be proficient at developing debugging via
> > tracepoints seems misplaced (but I also understand that forcing others
> > to fish enables "others" to not be doing the conversion work).
> 
> Trace points are part of the cost of contributing to NFSD,
> just like XDR, RCU, lockdep_assert, and dozens of other
> kernel facilities. Not a hurdle, and I don't ask for busy
> work changes.

I think trace points are quite different from the other facilities you
highlight.
You need to know XDR and RCU etc to get correct performant code.  If you
get it wrong, then the code won't work or (hopefully) a reviewer will
tell you.

But trace points .... when and where are they really useful?  The answer
to that question is no where near as clear cut.

While I'm sure they can be useful, I rarely find them to be so.  I've
certainly had a few positive experiences, but also seen a lot of noise
that doesn't really help me with the particular behaviour that I'm
trying the analyse.  system-tap can be incredibly useful as it is
targeted.  Fixed trace points are (for me) only occasionally useful.

So I would recommend removing all dprintks and not add any trace points
unless we have good reason to think they will be useful - and the person
with the good reason will probably be motivated to do the work (I don't
think I've ever added a new tracepoint, though I have moved them and
revised them slightly).

I think it would be good to know if localio is active - maybe something
in /proc/self/mountinfo could provide that.
I think it might be useful to know what server-uuid each server and each
mount was using.  The client could again have it in
/proc/self/mountinfo.  The server ...  maybe in /proc/fs/nfsd/, maybe
available over netlink...

But these aren't tracing messages, these are status messages.

just fyi, the most valuable part of the dprintk debugging in my
experience is the rpc_show_tasks() call when rpc debugging is turned on
or off.  This view into the current status can be very useful.

> 
> > This is the end of my mild protest.. I'll shutup and either convert
> > the dprintk()s or kill them all. ;)
> 
> IMO, if a dprintk is killable (or you don't know its needed
> yet) then it shouldn't be included in the patch series.

I agree.  Kill them all for now.

NeilBrown


> 
> 
> --
> Chuck Lever
> 
> 
>
Chuck Lever June 28, 2024, 2:40 p.m. UTC | #6
> On Jun 27, 2024, at 11:35 PM, NeilBrown <neilb@suse.de> wrote:
> On Fri, 28 Jun 2024, Chuck Lever III wrote:
>> 
>>> On Jun 27, 2024, at 1:27 PM, Mike Snitzer <snitzer@kernel.org> wrote:
>>> On Thu, Jun 27, 2024 at 12:07:03PM -0400, Chuck Lever wrote:
>>>> On Thu, Jun 27, 2024 at 11:48:09AM -0400, Jeff Layton wrote:
>>>>> 
>>>>> Chuck mentioned this earlier, but I don't think we ought to merge the
>>>>> dprintks. If they're useful for debugging then they should be turned
>>>>> into tracepoints. This one, I'd probably just drop.
>>>> 
>>>> Right; the problem with dprintk() is they can create so much chatter
>>>> that the systemd journal will automatically toss those messages and
>>>> they are lost. No diagnostic value in that.
>>>> 
>>>> Also you probably won't find it useful if lots of debugging info
>>>> goes into the trace log but a handful of the stuff you are
>>>> looking for is dumped into the system journal; the two use different
>>>> timestamps and so are really hard to line up after the fact.
>>>> 
>>>> We're trying to transition away from dprintk() in NFSD for these
>>>> reasons.
>>> 
>>> OK, I understand wanting to not allow new dprintk() to be added.
>>> 
>>> Meanwhile:
>>> $ grep -ri dprintk fs/nfsd/*.[ch]  | wc -l
>>>    181
>>> 
>>> So I'm struggling to get motivated to convert to tracepoints.  Feels
>>> like a needless make-work hurdle, these could be converted by others
>>> more proficient with tracepoints if/when needed.
>>> 
>>> Making everyone have to be proficient at developing debugging via
>>> tracepoints seems misplaced (but I also understand that forcing others
>>> to fish enables "others" to not be doing the conversion work).
>> 
>> Trace points are part of the cost of contributing to NFSD,
>> just like XDR, RCU, lockdep_assert, and dozens of other
>> kernel facilities. Not a hurdle, and I don't ask for busy
>> work changes.
> 
> I think trace points are quite different from the other facilities you
> highlight.
> You need to know XDR and RCU etc to get correct performant code.  If you
> get it wrong, then the code won't work or (hopefully) a reviewer will
> tell you.
> 
> But trace points .... when and where are they really useful?  The answer
> to that question is no where near as clear cut.

I disagree; see below.


> While I'm sure they can be useful, I rarely find them to be so.  I've
> certainly had a few positive experiences, but also seen a lot of noise
> that doesn't really help me with the particular behaviour that I'm
> trying the analyse.  system-tap can be incredibly useful as it is
> targeted.  Fixed trace points are (for me) only occasionally useful.

Some of Oracle's customers, for example, refuse to use out-of-band
debugging facilities like BPF or systemtap because that requires
bespoke case-specific code to be written. They feel that enabling
any lightly-tested code at a kernel privilege level on heavily-used
production systems introduces an unacceptable risk of crashing such
systems. (I'm told by Red Hat support engineers that they have
heard the same concerns).

dprintk impacts thread timing and has a heavy performance penalty.
It can also run the root file system out of space, thus it's not
something that can be left enabled for long periods of time. It
has no mechanisms for data reduction during capture. So it's
simply not a viable player in most live debugging scenarios.

If you prefer systemtap or BPF, you are still free to use those
instead! However, built-in tracing is the only choice for the
above cases, and it has to be part of the source code.


> I think it would be good to know if localio is active - maybe something
> in /proc/self/mountinfo could provide that.
> I think it might be useful to know what server-uuid each server and each
> mount was using.  The client could again have it in
> /proc/self/mountinfo.  The server ...  maybe in /proc/fs/nfsd/, maybe
> available over netlink...

Netlink is where we are adding such things these days.


> just fyi, the most valuable part of the dprintk debugging in my
> experience is the rpc_show_tasks() call when rpc debugging is turned on
> or off.  This view into the current status can be very useful.


NFSD now has a similar facility via netlink.

Note also that the client's "show tasks" mechanism can also be
accessed via /sys.


--
Chuck Lever
diff mbox series

Patch

diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index b8736a82e57c..78b421778a79 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -23,3 +23,4 @@  nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
 nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
 nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
 nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
+nfsd-$(CONFIG_NFSD_LOCALIO) += localio.o
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ad9083ca144b..99631fa56662 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -52,7 +52,7 @@ 
 #define NFSD_FILE_CACHE_UP		     (0)
 
 /* We only care about NFSD_MAY_READ/WRITE for this cache */
-#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE)
+#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO)
 
 static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
 static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
new file mode 100644
index 000000000000..ba9187735947
--- /dev/null
+++ b/fs/nfsd/localio.c
@@ -0,0 +1,246 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * NFS server support for local clients to bypass network stack
+ *
+ * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com>
+ * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com>
+ * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
+ */
+
+#include <linux/exportfs.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/string.h>
+
+#include "nfsd.h"
+#include "vfs.h"
+#include "netns.h"
+#include "filecache.h"
+
+#define NFSDDBG_FACILITY		NFSDDBG_FH
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ * - duplicated from fs/nfs/nfs2xdr.c to avoid needless bloat of
+ *   all compiled nfs objects if it were in include/linux/nfs.h
+ */
+static const struct {
+	int stat;
+	int errno;
+} nfs_common_errtbl[] = {
+	{ NFS_OK,		0		},
+	{ NFSERR_PERM,		-EPERM		},
+	{ NFSERR_NOENT,		-ENOENT		},
+	{ NFSERR_IO,		-EIO		},
+	{ NFSERR_NXIO,		-ENXIO		},
+/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */
+	{ NFSERR_ACCES,		-EACCES		},
+	{ NFSERR_EXIST,		-EEXIST		},
+	{ NFSERR_XDEV,		-EXDEV		},
+	{ NFSERR_NODEV,		-ENODEV		},
+	{ NFSERR_NOTDIR,	-ENOTDIR	},
+	{ NFSERR_ISDIR,		-EISDIR		},
+	{ NFSERR_INVAL,		-EINVAL		},
+	{ NFSERR_FBIG,		-EFBIG		},
+	{ NFSERR_NOSPC,		-ENOSPC		},
+	{ NFSERR_ROFS,		-EROFS		},
+	{ NFSERR_MLINK,		-EMLINK		},
+	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	},
+	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	},
+	{ NFSERR_DQUOT,		-EDQUOT		},
+	{ NFSERR_STALE,		-ESTALE		},
+	{ NFSERR_REMOTE,	-EREMOTE	},
+#ifdef EWFLUSH
+	{ NFSERR_WFLUSH,	-EWFLUSH	},
+#endif
+	{ NFSERR_BADHANDLE,	-EBADHANDLE	},
+	{ NFSERR_NOT_SYNC,	-ENOTSYNC	},
+	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
+	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
+	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
+	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
+	{ NFSERR_BADTYPE,	-EBADTYPE	},
+	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
+	{ -1,			-EIO		}
+};
+
+/**
+ * nfs_stat_to_errno - convert an NFS status code to a local errno
+ * @status: NFS status code to convert
+ *
+ * Returns a local errno value, or -EIO if the NFS status code is
+ * not recognized.  nfsd_file_acquire() returns an nfsstat that
+ * needs to be translated to an errno before being returned to a
+ * local client application.
+ */
+static int nfs_stat_to_errno(enum nfs_stat status)
+{
+	int i;
+
+	for (i = 0; nfs_common_errtbl[i].stat != -1; i++) {
+		if (nfs_common_errtbl[i].stat == (int)status)
+			return nfs_common_errtbl[i].errno;
+	}
+	return nfs_common_errtbl[i].errno;
+}
+
+static void
+nfsd_local_fakerqst_destroy(struct svc_rqst *rqstp)
+{
+	if (rqstp->rq_client)
+		auth_domain_put(rqstp->rq_client);
+	if (rqstp->rq_cred.cr_group_info)
+		put_group_info(rqstp->rq_cred.cr_group_info);
+	/* rpcauth_map_to_svc_cred_local() clears cr_principal */
+	WARN_ON_ONCE(rqstp->rq_cred.cr_principal != NULL);
+	kfree(rqstp->rq_xprt);
+	kfree(rqstp);
+}
+
+static struct svc_rqst *
+nfsd_local_fakerqst_create(struct net *net, struct rpc_clnt *rpc_clnt,
+			const struct cred *cred)
+{
+	struct svc_rqst *rqstp;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	int status;
+
+	/* FIXME: not running in nfsd context, must get reference on nfsd_serv */
+	if (unlikely(!READ_ONCE(nn->nfsd_serv))) {
+		dprintk("%s: localio denied. Server not running\n", __func__);
+		return ERR_PTR(-ENXIO);
+	}
+
+	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
+	if (!rqstp)
+		return ERR_PTR(-ENOMEM);
+
+	rqstp->rq_xprt = kzalloc(sizeof(*rqstp->rq_xprt), GFP_KERNEL);
+	if (!rqstp->rq_xprt) {
+		status = -ENOMEM;
+		goto out_err;
+	}
+
+	rqstp->rq_xprt->xpt_net = net;
+	__set_bit(RQ_SECURE, &rqstp->rq_flags);
+	rqstp->rq_proc = 1;
+	rqstp->rq_vers = 3;
+	rqstp->rq_prot = IPPROTO_TCP;
+	rqstp->rq_server = nn->nfsd_serv;
+
+	/* Note: we're connecting to ourself, so source addr == peer addr */
+	rqstp->rq_addrlen = rpc_peeraddr(rpc_clnt,
+			(struct sockaddr *)&rqstp->rq_addr,
+			sizeof(rqstp->rq_addr));
+
+	rpcauth_map_to_svc_cred_local(rpc_clnt->cl_auth, cred, &rqstp->rq_cred);
+
+	/*
+	 * set up enough for svcauth_unix_set_client to be able to wait
+	 * for the cache downcall. Note that we do _not_ want to allow the
+	 * request to be deferred for later revisit since this rqst and xprt
+	 * are not set up to run inside of the normal svc_rqst engine.
+	 */
+	INIT_LIST_HEAD(&rqstp->rq_xprt->xpt_deferred);
+	kref_init(&rqstp->rq_xprt->xpt_ref);
+	spin_lock_init(&rqstp->rq_xprt->xpt_lock);
+	rqstp->rq_chandle.thread_wait = 5 * HZ;
+
+	status = svcauth_unix_set_client(rqstp);
+	switch (status) {
+	case SVC_OK:
+		break;
+	case SVC_DENIED:
+		status = -ENXIO;
+		dprintk("%s: client %pISpc denied localio access\n",
+				__func__, (struct sockaddr *)&rqstp->rq_addr);
+		goto out_err;
+	default:
+		status = -ETIMEDOUT;
+		dprintk("%s: client %pISpc temporarily denied localio access\n",
+				__func__, (struct sockaddr *)&rqstp->rq_addr);
+		goto out_err;
+	}
+
+	return rqstp;
+
+out_err:
+	nfsd_local_fakerqst_destroy(rqstp);
+	return ERR_PTR(status);
+}
+
+/*
+ * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to @file
+ *
+ * This function maps a local fh to a path on a local filesystem.
+ * This is useful when the nfs client has the local server mounted - it can
+ * avoid all the NFS overhead with reads, writes and commits.
+ *
+ * on successful return, caller is responsible for calling path_put. Also
+ * note that this is called from nfs.ko via find_symbol() to avoid an explicit
+ * dependency on knfsd. So, there is no forward declaration in a header file
+ * for it.
+ */
+int nfsd_open_local_fh(struct net *net,
+			 struct rpc_clnt *rpc_clnt,
+			 const struct cred *cred,
+			 const struct nfs_fh *nfs_fh,
+			 const fmode_t fmode,
+			 struct file **pfilp)
+{
+	const struct cred *save_cred;
+	struct svc_rqst *rqstp;
+	struct svc_fh fh;
+	struct nfsd_file *nf;
+	int status = 0;
+	int mayflags = NFSD_MAY_LOCALIO;
+	__be32 beres;
+
+	/* Save creds before calling into nfsd */
+	save_cred = get_current_cred();
+
+	rqstp = nfsd_local_fakerqst_create(net, rpc_clnt, cred);
+	if (IS_ERR(rqstp)) {
+		status = PTR_ERR(rqstp);
+		goto out_revertcred;
+	}
+
+	/* nfs_fh -> svc_fh */
+	if (nfs_fh->size > NFS4_FHSIZE) {
+		status = -EINVAL;
+		goto out;
+	}
+	fh_init(&fh, NFS4_FHSIZE);
+	fh.fh_handle.fh_size = nfs_fh->size;
+	memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
+
+	if (fmode & FMODE_READ)
+		mayflags |= NFSD_MAY_READ;
+	if (fmode & FMODE_WRITE)
+		mayflags |= NFSD_MAY_WRITE;
+
+	beres = nfsd_file_acquire(rqstp, &fh, mayflags, &nf);
+	if (beres) {
+		status = nfs_stat_to_errno(be32_to_cpu(beres));
+		dprintk("%s: fh_verify failed %d\n", __func__, status);
+		goto out_fh_put;
+	}
+
+	*pfilp = get_file(nf->nf_file);
+
+	nfsd_file_put(nf);
+out_fh_put:
+	fh_put(&fh);
+
+out:
+	nfsd_local_fakerqst_destroy(rqstp);
+out_revertcred:
+	revert_creds(save_cred);
+	return status;
+}
+EXPORT_SYMBOL_GPL(nfsd_open_local_fh);
+
+/* Compile time type checking, not used by anything */
+static nfs_to_nfsd_open_t __maybe_unused nfsd_open_local_fh_typecheck = nfsd_open_local_fh;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 1222a0a33fe1..a477d2c5088a 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -431,6 +431,7 @@  static int nfsd_startup_net(struct net *net, const struct cred *cred)
 #endif
 #if IS_ENABLED(CONFIG_NFSD_LOCALIO)
 	INIT_LIST_HEAD(&nn->nfsd_uuid.list);
+	nn->nfsd_uuid.net = net;
 	list_add_tail_rcu(&nn->nfsd_uuid.list, &nfsd_uuids);
 #endif
 	nn->nfsd_net_up = true;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 77bbd23aa150..9c0610fdd11c 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -86,7 +86,8 @@  DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
 		{ NFSD_MAY_NOT_BREAK_LEASE,	"NOT_BREAK_LEASE" },	\
 		{ NFSD_MAY_BYPASS_GSS,		"BYPASS_GSS" },		\
 		{ NFSD_MAY_READ_IF_EXEC,	"READ_IF_EXEC" },	\
-		{ NFSD_MAY_64BIT_COOKIE,	"64BIT_COOKIE" })
+		{ NFSD_MAY_64BIT_COOKIE,	"64BIT_COOKIE" },	\
+		{ NFSD_MAY_LOCALIO,		"LOCALIO" })
 
 TRACE_EVENT(nfsd_compound,
 	TP_PROTO(
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 57cd70062048..5146f0c81752 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -33,6 +33,8 @@ 
 
 #define NFSD_MAY_64BIT_COOKIE		0x1000 /* 64 bit readdir cookies for >= NFSv3 */
 
+#define NFSD_MAY_LOCALIO		0x2000
+
 #define NFSD_MAY_CREATE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE)
 #define NFSD_MAY_REMOVE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
 
@@ -158,6 +160,13 @@  __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
 
 void		nfsd_filp_close(struct file *fp);
 
+int		nfsd_open_local_fh(struct net *net,
+				   struct rpc_clnt *rpc_clnt,
+				   const struct cred *cred,
+				   const struct nfs_fh *nfs_fh,
+				   const fmode_t fmode,
+				   struct file **pfilp);
+
 static inline int fh_want_write(struct svc_fh *fh)
 {
 	int ret;