diff mbox series

[v8,3/3] NFSD: add rpc_status netlink support

Message ID ac18892ea3f718c63f0a12e39aeaac812c081515.1694436263.git.lorenzo@kernel.org (mailing list archive)
State Not Applicable
Headers show
Series add rpc_status netlink support for NFSD | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Lorenzo Bianconi Sept. 11, 2023, 12:49 p.m. UTC
Introduce rpc_status netlink support for NFSD in order to dump pending
RPC requests debugging information from userspace.

Tested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 fs/nfsd/nfsctl.c           | 192 ++++++++++++++++++++++++++++++++++++-
 fs/nfsd/nfsd.h             |  16 ++++
 fs/nfsd/nfssvc.c           |  15 +++
 fs/nfsd/state.h            |   2 -
 include/linux/sunrpc/svc.h |   1 +
 5 files changed, 222 insertions(+), 4 deletions(-)

Comments

Jeff Layton Sept. 11, 2023, 7:43 p.m. UTC | #1
On Mon, 2023-09-11 at 14:49 +0200, Lorenzo Bianconi wrote:
> Introduce rpc_status netlink support for NFSD in order to dump pending
> RPC requests debugging information from userspace.
> 
> Tested-by: Jeff Layton <jlayton@kernel.org>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  fs/nfsd/nfsctl.c           | 192 ++++++++++++++++++++++++++++++++++++-
>  fs/nfsd/nfsd.h             |  16 ++++
>  fs/nfsd/nfssvc.c           |  15 +++
>  fs/nfsd/state.h            |   2 -
>  include/linux/sunrpc/svc.h |   1 +
>  5 files changed, 222 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> index 1be66088849c..b862a759ea15 100644
> --- a/fs/nfsd/nfsctl.c
> +++ b/fs/nfsd/nfsctl.c
> @@ -26,6 +26,7 @@
>  #include "pnfs.h"
>  #include "filecache.h"
>  #include "trace.h"
> +#include "nfs_netlink_gen.h"
>  
>  /*
>   *	We have a single directory with several nodes in it.
> @@ -1497,17 +1498,199 @@ unsigned int nfsd_net_id;
>  
>  int nfsd_server_nl_rpc_status_get_start(struct netlink_callback *cb)
>  {
> -	return 0;
> +	struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id);
> +	int ret = -ENODEV;
> +
> +	mutex_lock(&nfsd_mutex);
> +	if (nn->nfsd_serv) {
> +		svc_get(nn->nfsd_serv);
> +		ret = 0;
> +	}
> +	mutex_unlock(&nfsd_mutex);
> +
> +	return ret;
>  }
>  
> -int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb)
> +static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
> +					    struct netlink_callback *cb,
> +					    struct nfsd_genl_rqstp *rqstp)
>  {
> +	void *hdr;
> +	int i;
> +
> +	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
> +			  &nfsd_server_nl_family, NLM_F_MULTI,
> +			  NFSD_CMD_RPC_STATUS_GET);
> +	if (!hdr)
> +		return -ENOBUFS;
> +
> +	if (nla_put_be32(skb, NFSD_ATTR_RPC_STATUS_XID, rqstp->rq_xid) ||
> +	    nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_FLAGS, rqstp->rq_flags) ||
> +	    nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROG, rqstp->rq_prog) ||
> +	    nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROC, rqstp->rq_proc) ||
> +	    nla_put_u8(skb, NFSD_ATTR_RPC_STATUS_VERSION, rqstp->rq_vers) ||
> +	    nla_put_s64(skb, NFSD_ATTR_RPC_STATUS_SERVICE_TIME,
> +			ktime_to_us(rqstp->rq_stime),
> +			NFSD_ATTR_RPC_STATUS_PAD))
> +		return -ENOBUFS;
> +
> +	switch (rqstp->saddr.sa_family) {
> +	case AF_INET: {
> +		const struct sockaddr_in *s_in, *d_in;
> +
> +		s_in = (const struct sockaddr_in *)&rqstp->saddr;
> +		d_in = (const struct sockaddr_in *)&rqstp->daddr;
> +		if (nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR4,
> +				    s_in->sin_addr.s_addr) ||
> +		    nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR4,
> +				    d_in->sin_addr.s_addr) ||
> +		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT,
> +				 s_in->sin_port) ||
> +		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT,
> +				 d_in->sin_port))
> +			return -ENOBUFS;
> +		break;
> +	}
> +	case AF_INET6: {
> +		const struct sockaddr_in6 *s_in, *d_in;
> +
> +		s_in = (const struct sockaddr_in6 *)&rqstp->saddr;
> +		d_in = (const struct sockaddr_in6 *)&rqstp->daddr;
> +		if (nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR6,
> +				     &s_in->sin6_addr) ||
> +		    nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR6,
> +				     &d_in->sin6_addr) ||
> +		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT,
> +				 s_in->sin6_port) ||
> +		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT,
> +				 d_in->sin6_port))
> +			return -ENOBUFS;
> +		break;
> +	}
> +	default:
> +		break;
> +	}
> +
> +	if (rqstp->opcnt) {

It may be that we always have an opcount of 0 whenever we're running
something besides NFSv4 COMPOUND, but it may be best not to count on
that. I'd test for prog == NFS_PROGRAM, vers == 4, and that the proc ==
COMPOUND and then for the opcnt.

> +		struct nlattr *attr;
> +
> +		attr = nla_nest_start(skb, NFSD_ATTR_RPC_STATUS_COMPOND_OP);
> +		if (!attr)
> +			return -ENOBUFS;
> +
> +		for (i = 0; i < rqstp->opcnt; i++) {
> +			struct nlattr *op_attr;
> +
> +			op_attr = nla_nest_start(skb, i);
> +			if (!op_attr)
> +				return -ENOBUFS;
> +
> +			if (nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_COMP_OP,
> +					rqstp->opnum[i]))
> +				return -ENOBUFS;
> +
> +			nla_nest_end(skb, op_attr);
> +		}
> +
> +		nla_nest_end(skb, attr);
> +	}
> +
> +	genlmsg_end(skb, hdr);
> +
>  	return 0;
>  }
>  
>  int nfsd_server_nl_rpc_status_get_dumpit(struct sk_buff *skb,
>  					 struct netlink_callback *cb)
>  {
> +	struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id);
> +	int i, ret, rqstp_index;
> +
> +	rcu_read_lock();
> +
> +	for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) {
> +		struct svc_rqst *rqstp;
> +
> +		if (i < cb->args[0]) /* already consumed */
> +			continue;
> +
> +		rqstp_index = 0;
> +		list_for_each_entry_rcu(rqstp,
> +				&nn->nfsd_serv->sv_pools[i].sp_all_threads,
> +				rq_all) {
> +			struct nfsd_genl_rqstp genl_rqstp;
> +			unsigned int status_counter;
> +
> +			if (rqstp_index++ < cb->args[1]) /* already consumed */
> +				continue;
> +			/*
> +			 * Acquire rq_status_counter before parsing the rqst
> +			 * fields. rq_status_counter is set to an odd value in
> +			 * order to notify the consumers the rqstp fields are
> +			 * meaningful.
> +			 */
> +			status_counter =
> +				smp_load_acquire(&rqstp->rq_status_counter);
> +			if (!(status_counter & 1))
> +				continue;
> +
> +			genl_rqstp.rq_xid = rqstp->rq_xid;
> +			genl_rqstp.rq_flags = rqstp->rq_flags;
> +			genl_rqstp.rq_vers = rqstp->rq_vers;
> +			genl_rqstp.rq_prog = rqstp->rq_prog;
> +			genl_rqstp.rq_proc = rqstp->rq_proc;
> +			genl_rqstp.rq_stime = rqstp->rq_stime;
> +			genl_rqstp.opcnt = 0;
> +			memcpy(&genl_rqstp.daddr, svc_daddr(rqstp),
> +			       sizeof(struct sockaddr));
> +			memcpy(&genl_rqstp.saddr, svc_addr(rqstp),
> +			       sizeof(struct sockaddr));
> +
> +#ifdef CONFIG_NFSD_V4
> +			if (rqstp->rq_vers == NFS4_VERSION &&
> +			    rqstp->rq_proc == NFSPROC4_COMPOUND) {
> +				/* NFSv4 compund */
> +				struct nfsd4_compoundargs *args;
> +				int j;
> +
> +				args = rqstp->rq_argp;
> +				genl_rqstp.opcnt = args->opcnt;
> +				for (j = 0; j < genl_rqstp.opcnt; j++)
> +					genl_rqstp.opnum[j] =
> +						args->ops[j].opnum;
> +			}
> +#endif /* CONFIG_NFSD_V4 */
> +
> +			/*
> +			 * Acquire rq_status_counter before reporting the rqst
> +			 * fields to the user.
> +			 */
> +			if (smp_load_acquire(&rqstp->rq_status_counter) !=
> +			    status_counter)
> +				continue;
> +
> +			ret = nfsd_genl_rpc_status_compose_msg(skb, cb,
> +							       &genl_rqstp);
> +			if (ret)
> +				goto out;
> +		}
> +	}
> +
> +	cb->args[0] = i;
> +	cb->args[1] = rqstp_index;
> +	ret = skb->len;
> +out:
> +	rcu_read_unlock();
> +
> +	return ret;
> +}
> +
> +int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb)
> +{
> +	mutex_lock(&nfsd_mutex);
> +	nfsd_put(sock_net(cb->skb->sk));
> +	mutex_unlock(&nfsd_mutex);
> +
>  	return 0;
>  }
>  
> @@ -1605,6 +1788,10 @@ static int __init init_nfsd(void)
>  	retval = register_filesystem(&nfsd_fs_type);
>  	if (retval)
>  		goto out_free_all;
> +	retval = genl_register_family(&nfsd_server_nl_family);
> +	if (retval)
> +		goto out_free_all;
> +
>  	return 0;
>  out_free_all:
>  	nfsd4_destroy_laundry_wq();
> @@ -1629,6 +1816,7 @@ static int __init init_nfsd(void)
>  
>  static void __exit exit_nfsd(void)
>  {
> +	genl_unregister_family(&nfsd_server_nl_family);
>  	unregister_filesystem(&nfsd_fs_type);
>  	nfsd4_destroy_laundry_wq();
>  	unregister_cld_notifier();
> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> index 11c14faa6c67..d787bd38c053 100644
> --- a/fs/nfsd/nfsd.h
> +++ b/fs/nfsd/nfsd.h
> @@ -62,6 +62,22 @@ struct readdir_cd {
>  	__be32			err;	/* 0, nfserr, or nfserr_eof */
>  };
>  
> +/* Maximum number of operations per session compound */
> +#define NFSD_MAX_OPS_PER_COMPOUND	50
> +
> +struct nfsd_genl_rqstp {
> +	struct sockaddr daddr;
> +	struct sockaddr saddr;
> +	unsigned long rq_flags;
> +	ktime_t rq_stime;
> +	__be32 rq_xid;
> +	u32 rq_vers;
> +	u32 rq_prog;
> +	u32 rq_proc;
> +	/* NFSv4 compund */
> +	u32 opnum[NFSD_MAX_OPS_PER_COMPOUND];
> +	u16 opcnt;
> +};
>  

Again, I'm wondering, is there a way to pass down some sort context-
specific value with netlink? It might be nice to make the two NFSv4
specific fields into a union if that can be done with netlink.

>  extern struct svc_program	nfsd_program;
>  extern const struct svc_version	nfsd_version2, nfsd_version3, nfsd_version4;
> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> index 1582af33e204..fad34a7325b3 100644
> --- a/fs/nfsd/nfssvc.c
> +++ b/fs/nfsd/nfssvc.c
> @@ -998,6 +998,15 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
>  	if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
>  		goto out_decode_err;
>  
> +	/*
> +	 * Release rq_status_counter setting it to an odd value after the rpc
> +	 * request has been properly parsed. rq_status_counter is used to
> +	 * notify the consumers if the rqstp fields are stable
> +	 * (rq_status_counter is odd) or not meaningful (rq_status_counter
> +	 * is even).
> +	 */
> +	smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1);
> +
>  	rp = NULL;
>  	switch (nfsd_cache_lookup(rqstp, &rp)) {
>  	case RC_DOIT:
> @@ -1015,6 +1024,12 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
>  	if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
>  		goto out_encode_err;
>  
> +	/*
> +	 * Release rq_status_counter setting it to an even value after the rpc
> +	 * request has been properly processed.
> +	 */
> +	smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1);
> +
>  	nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
>  out_cached_reply:
>  	return 1;
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index cbddcf484dba..41bdc913fa71 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -174,8 +174,6 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
>  
>  /* Maximum number of slots per session. 160 is useful for long haul TCP */
>  #define NFSD_MAX_SLOTS_PER_SESSION     160
> -/* Maximum number of operations per session compound */
> -#define NFSD_MAX_OPS_PER_COMPOUND	50
>  /* Maximum  session per slot cache size */
>  #define NFSD_SLOT_CACHE_SIZE		2048
>  /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
> index dbf5b21feafe..caa20defd255 100644
> --- a/include/linux/sunrpc/svc.h
> +++ b/include/linux/sunrpc/svc.h
> @@ -251,6 +251,7 @@ struct svc_rqst {
>  						 * net namespace
>  						 */
>  	void **			rq_lease_breaker; /* The v4 client breaking a lease */
> +	unsigned int		rq_status_counter; /* RPC processing counter */
>  };
>  
>  /* bits for rq_flags */
Simon Horman Sept. 12, 2023, 3:13 p.m. UTC | #2
On Mon, Sep 11, 2023 at 02:49:46PM +0200, Lorenzo Bianconi wrote:
> Introduce rpc_status netlink support for NFSD in order to dump pending
> RPC requests debugging information from userspace.
> 
> Tested-by: Jeff Layton <jlayton@kernel.org>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
o
Hi Lorenzo,

some minor feedback from my side.

...

>  int nfsd_server_nl_rpc_status_get_dumpit(struct sk_buff *skb,
>  					 struct netlink_callback *cb)
>  {
> +	struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id);
> +	int i, ret, rqstp_index;
> +
> +	rcu_read_lock();
> +
> +	for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) {
> +		struct svc_rqst *rqstp;
> +
> +		if (i < cb->args[0]) /* already consumed */
> +			continue;
> +
> +		rqstp_index = 0;
> +		list_for_each_entry_rcu(rqstp,
> +				&nn->nfsd_serv->sv_pools[i].sp_all_threads,
> +				rq_all) {
> +			struct nfsd_genl_rqstp genl_rqstp;
> +			unsigned int status_counter;
> +
> +			if (rqstp_index++ < cb->args[1]) /* already consumed */
> +				continue;
> +			/*
> +			 * Acquire rq_status_counter before parsing the rqst
> +			 * fields. rq_status_counter is set to an odd value in
> +			 * order to notify the consumers the rqstp fields are
> +			 * meaningful.
> +			 */
> +			status_counter =
> +				smp_load_acquire(&rqstp->rq_status_counter);
> +			if (!(status_counter & 1))
> +				continue;
> +
> +			genl_rqstp.rq_xid = rqstp->rq_xid;
> +			genl_rqstp.rq_flags = rqstp->rq_flags;
> +			genl_rqstp.rq_vers = rqstp->rq_vers;
> +			genl_rqstp.rq_prog = rqstp->rq_prog;
> +			genl_rqstp.rq_proc = rqstp->rq_proc;
> +			genl_rqstp.rq_stime = rqstp->rq_stime;
> +			genl_rqstp.opcnt = 0;
> +			memcpy(&genl_rqstp.daddr, svc_daddr(rqstp),
> +			       sizeof(struct sockaddr));
> +			memcpy(&genl_rqstp.saddr, svc_addr(rqstp),
> +			       sizeof(struct sockaddr));
> +
> +#ifdef CONFIG_NFSD_V4
> +			if (rqstp->rq_vers == NFS4_VERSION &&
> +			    rqstp->rq_proc == NFSPROC4_COMPOUND) {
> +				/* NFSv4 compund */

nit: compound

> +				struct nfsd4_compoundargs *args;
> +				int j;
> +
> +				args = rqstp->rq_argp;
> +				genl_rqstp.opcnt = args->opcnt;
> +				for (j = 0; j < genl_rqstp.opcnt; j++)
> +					genl_rqstp.opnum[j] =
> +						args->ops[j].opnum;
> +			}
> +#endif /* CONFIG_NFSD_V4 */
> +
> +			/*
> +			 * Acquire rq_status_counter before reporting the rqst
> +			 * fields to the user.
> +			 */
> +			if (smp_load_acquire(&rqstp->rq_status_counter) !=
> +			    status_counter)
> +				continue;
> +
> +			ret = nfsd_genl_rpc_status_compose_msg(skb, cb,
> +							       &genl_rqstp);
> +			if (ret)
> +				goto out;
> +		}
> +	}
> +
> +	cb->args[0] = i;
> +	cb->args[1] = rqstp_index;

I'm unsure if this is possible, but if the for loop above iterates zero
times, or for all iterations (i < cb->args[0]), then rqstp_index will
be used uninitialised here.

Flagged by Smatch.

> +	ret = skb->len;
> +out:
> +	rcu_read_unlock();
> +
> +	return ret;
> +}

...

> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> index 11c14faa6c67..d787bd38c053 100644
> --- a/fs/nfsd/nfsd.h
> +++ b/fs/nfsd/nfsd.h
> @@ -62,6 +62,22 @@ struct readdir_cd {
>  	__be32			err;	/* 0, nfserr, or nfserr_eof */
>  };
>  
> +/* Maximum number of operations per session compound */
> +#define NFSD_MAX_OPS_PER_COMPOUND	50
> +
> +struct nfsd_genl_rqstp {
> +	struct sockaddr daddr;
> +	struct sockaddr saddr;
> +	unsigned long rq_flags;
> +	ktime_t rq_stime;
> +	__be32 rq_xid;
> +	u32 rq_vers;
> +	u32 rq_prog;
> +	u32 rq_proc;
> +	/* NFSv4 compund */

nit: compound

> +	u32 opnum[NFSD_MAX_OPS_PER_COMPOUND];
> +	u16 opcnt;
> +};
>  
>  extern struct svc_program	nfsd_program;
>  extern const struct svc_version	nfsd_version2, nfsd_version3, nfsd_version4;

...
Lorenzo Bianconi Sept. 14, 2023, 11:41 a.m. UTC | #3
[...]

> > +
> > +#ifdef CONFIG_NFSD_V4
> > +			if (rqstp->rq_vers == NFS4_VERSION &&
> > +			    rqstp->rq_proc == NFSPROC4_COMPOUND) {
> > +				/* NFSv4 compund */
> 
> nit: compound

ack, I will fix it.

> 
> > +				struct nfsd4_compoundargs *args;
> > +				int j;
> > +
> > +				args = rqstp->rq_argp;
> > +				genl_rqstp.opcnt = args->opcnt;
> > +				for (j = 0; j < genl_rqstp.opcnt; j++)
> > +					genl_rqstp.opnum[j] =
> > +						args->ops[j].opnum;
> > +			}
> > +#endif /* CONFIG_NFSD_V4 */
> > +
> > +			/*
> > +			 * Acquire rq_status_counter before reporting the rqst
> > +			 * fields to the user.
> > +			 */
> > +			if (smp_load_acquire(&rqstp->rq_status_counter) !=
> > +			    status_counter)
> > +				continue;
> > +
> > +			ret = nfsd_genl_rpc_status_compose_msg(skb, cb,
> > +							       &genl_rqstp);
> > +			if (ret)
> > +				goto out;
> > +		}
> > +	}
> > +
> > +	cb->args[0] = i;
> > +	cb->args[1] = rqstp_index;
> 
> I'm unsure if this is possible, but if the for loop above iterates zero
> times, or for all iterations (i < cb->args[0]), then rqstp_index will
> be used uninitialised here.

ack, thx for spotting it, I will fix it.

Regards,
Lorenzo

> 
> Flagged by Smatch.
> 
> > +	ret = skb->len;
> > +out:
> > +	rcu_read_unlock();
> > +
> > +	return ret;
> > +}
> 
> ...
> 
> > diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> > index 11c14faa6c67..d787bd38c053 100644
> > --- a/fs/nfsd/nfsd.h
> > +++ b/fs/nfsd/nfsd.h
> > @@ -62,6 +62,22 @@ struct readdir_cd {
> >  	__be32			err;	/* 0, nfserr, or nfserr_eof */
> >  };
> >  
> > +/* Maximum number of operations per session compound */
> > +#define NFSD_MAX_OPS_PER_COMPOUND	50
> > +
> > +struct nfsd_genl_rqstp {
> > +	struct sockaddr daddr;
> > +	struct sockaddr saddr;
> > +	unsigned long rq_flags;
> > +	ktime_t rq_stime;
> > +	__be32 rq_xid;
> > +	u32 rq_vers;
> > +	u32 rq_prog;
> > +	u32 rq_proc;
> > +	/* NFSv4 compund */
> 
> nit: compound
> 
> > +	u32 opnum[NFSD_MAX_OPS_PER_COMPOUND];
> > +	u16 opcnt;
> > +};
> >  
> >  extern struct svc_program	nfsd_program;
> >  extern const struct svc_version	nfsd_version2, nfsd_version3, nfsd_version4;
> 
> ...
>
Jakub Kicinski Oct. 3, 2023, 6:03 p.m. UTC | #4
On Mon, 11 Sep 2023 14:49:46 +0200 Lorenzo Bianconi wrote:
> +	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
> +			  &nfsd_server_nl_family, NLM_F_MULTI,
> +			  NFSD_CMD_RPC_STATUS_GET);
> +	if (!hdr)
> +		return -ENOBUFS;

Why NLM_F_MULTI? AFAIU that means "I'm splitting one object over
multiple messages". 99% of the time the right thing to do is change 
what we consider to be "an object" rather than do F_MULTI. In theory
user space should re-constitute all the NLM_F_MULTI messages into as
single object, which none of YNL does today :(
Lorenzo Bianconi Oct. 4, 2023, 10:14 a.m. UTC | #5
> On Mon, 11 Sep 2023 14:49:46 +0200 Lorenzo Bianconi wrote:
> > +	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
> > +			  &nfsd_server_nl_family, NLM_F_MULTI,
> > +			  NFSD_CMD_RPC_STATUS_GET);
> > +	if (!hdr)
> > +		return -ENOBUFS;
> 
> Why NLM_F_MULTI? AFAIU that means "I'm splitting one object over
> multiple messages". 99% of the time the right thing to do is change 
> what we consider to be "an object" rather than do F_MULTI. In theory
> user space should re-constitute all the NLM_F_MULTI messages into as
> single object, which none of YNL does today :(
> 
ack, fine. I think we can get rid of it.
@chuck: do you want me to send a patch or are you taking care of it?

Regards,
Lorenzo
Chuck Lever Oct. 4, 2023, 1:27 p.m. UTC | #6
> On Oct 4, 2023, at 6:14 AM, Lorenzo Bianconi <lorenzo.bianconi@redhat.com> wrote:
> 
>> On Mon, 11 Sep 2023 14:49:46 +0200 Lorenzo Bianconi wrote:
>>> + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
>>> +  &nfsd_server_nl_family, NLM_F_MULTI,
>>> +  NFSD_CMD_RPC_STATUS_GET);
>>> + if (!hdr)
>>> + return -ENOBUFS;
>> 
>> Why NLM_F_MULTI? AFAIU that means "I'm splitting one object over
>> multiple messages". 99% of the time the right thing to do is change 
>> what we consider to be "an object" rather than do F_MULTI. In theory
>> user space should re-constitute all the NLM_F_MULTI messages into as
>> single object, which none of YNL does today :(
>> 
> ack, fine. I think we can get rid of it.
> @chuck: do you want me to send a patch or are you taking care of it?

Send a (tested) patch and I can squash it into this one.


--
Chuck Lever
Lorenzo Bianconi Oct. 4, 2023, 2:04 p.m. UTC | #7
> 
> 
> > On Oct 4, 2023, at 6:14 AM, Lorenzo Bianconi <lorenzo.bianconi@redhat.com> wrote:
> > 
> >> On Mon, 11 Sep 2023 14:49:46 +0200 Lorenzo Bianconi wrote:
> >>> + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
> >>> +  &nfsd_server_nl_family, NLM_F_MULTI,
> >>> +  NFSD_CMD_RPC_STATUS_GET);
> >>> + if (!hdr)
> >>> + return -ENOBUFS;
> >> 
> >> Why NLM_F_MULTI? AFAIU that means "I'm splitting one object over
> >> multiple messages". 99% of the time the right thing to do is change 
> >> what we consider to be "an object" rather than do F_MULTI. In theory
> >> user space should re-constitute all the NLM_F_MULTI messages into as
> >> single object, which none of YNL does today :(
> >> 
> > ack, fine. I think we can get rid of it.
> > @chuck: do you want me to send a patch or are you taking care of it?
> 
> Send a (tested) patch and I can squash it into this one.

ack, I will do.

Regards,
Lorenzo

> 
> 
> --
> Chuck Lever
> 
>
Jeff Layton Dec. 11, 2023, 6:56 p.m. UTC | #8
On Mon, 2023-09-11 at 14:49 +0200, Lorenzo Bianconi wrote:
> Introduce rpc_status netlink support for NFSD in order to dump pending
> RPC requests debugging information from userspace.
> 
> Tested-by: Jeff Layton <jlayton@kernel.org>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  fs/nfsd/nfsctl.c           | 192 ++++++++++++++++++++++++++++++++++++-
>  fs/nfsd/nfsd.h             |  16 ++++
>  fs/nfsd/nfssvc.c           |  15 +++
>  fs/nfsd/state.h            |   2 -
>  include/linux/sunrpc/svc.h |   1 +
>  5 files changed, 222 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> index 1be66088849c..b862a759ea15 100644
> --- a/fs/nfsd/nfsctl.c
> +++ b/fs/nfsd/nfsctl.c
> @@ -26,6 +26,7 @@
>  #include "pnfs.h"
>  #include "filecache.h"
>  #include "trace.h"
> +#include "nfs_netlink_gen.h"
>  
>  /*
>   *	We have a single directory with several nodes in it.
> @@ -1497,17 +1498,199 @@ unsigned int nfsd_net_id;
>  
>  int nfsd_server_nl_rpc_status_get_start(struct netlink_callback *cb)
>  {
> -	return 0;
> +	struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id);
> +	int ret = -ENODEV;
> +
> +	mutex_lock(&nfsd_mutex);
> +	if (nn->nfsd_serv) {
> +		svc_get(nn->nfsd_serv);
> +		ret = 0;
> +	}
> +	mutex_unlock(&nfsd_mutex);
> +
> +	return ret;
>  }

I think there is a potential race above. Once you've dropped the
nfsd_mutex, there is no guarantee that the nn->nfsd_serv will still be
set when you come back to put the serv. That means that we could oops
when we hit the _done method below.

Is it possible to stash a pointer to the serv while we hold the
reference?

>  
> -int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb)
> +static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
> +					    struct netlink_callback *cb,
> +					    struct nfsd_genl_rqstp *rqstp)
>  {
> +	void *hdr;
> +	int i;
> +
> +	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
> +			  &nfsd_server_nl_family, NLM_F_MULTI,
> +			  NFSD_CMD_RPC_STATUS_GET);
> +	if (!hdr)
> +		return -ENOBUFS;
> +
> +	if (nla_put_be32(skb, NFSD_ATTR_RPC_STATUS_XID, rqstp->rq_xid) ||
> +	    nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_FLAGS, rqstp->rq_flags) ||
> +	    nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROG, rqstp->rq_prog) ||
> +	    nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROC, rqstp->rq_proc) ||
> +	    nla_put_u8(skb, NFSD_ATTR_RPC_STATUS_VERSION, rqstp->rq_vers) ||
> +	    nla_put_s64(skb, NFSD_ATTR_RPC_STATUS_SERVICE_TIME,
> +			ktime_to_us(rqstp->rq_stime),
> +			NFSD_ATTR_RPC_STATUS_PAD))
> +		return -ENOBUFS;
> +
> +	switch (rqstp->saddr.sa_family) {
> +	case AF_INET: {
> +		const struct sockaddr_in *s_in, *d_in;
> +
> +		s_in = (const struct sockaddr_in *)&rqstp->saddr;
> +		d_in = (const struct sockaddr_in *)&rqstp->daddr;
> +		if (nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR4,
> +				    s_in->sin_addr.s_addr) ||
> +		    nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR4,
> +				    d_in->sin_addr.s_addr) ||
> +		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT,
> +				 s_in->sin_port) ||
> +		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT,
> +				 d_in->sin_port))
> +			return -ENOBUFS;
> +		break;
> +	}
> +	case AF_INET6: {
> +		const struct sockaddr_in6 *s_in, *d_in;
> +
> +		s_in = (const struct sockaddr_in6 *)&rqstp->saddr;
> +		d_in = (const struct sockaddr_in6 *)&rqstp->daddr;
> +		if (nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR6,
> +				     &s_in->sin6_addr) ||
> +		    nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR6,
> +				     &d_in->sin6_addr) ||
> +		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT,
> +				 s_in->sin6_port) ||
> +		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT,
> +				 d_in->sin6_port))
> +			return -ENOBUFS;
> +		break;
> +	}
> +	default:
> +		break;
> +	}
> +
> +	if (rqstp->opcnt) {
> +		struct nlattr *attr;
> +
> +		attr = nla_nest_start(skb, NFSD_ATTR_RPC_STATUS_COMPOND_OP);
> +		if (!attr)
> +			return -ENOBUFS;
> +
> +		for (i = 0; i < rqstp->opcnt; i++) {
> +			struct nlattr *op_attr;
> +
> +			op_attr = nla_nest_start(skb, i);
> +			if (!op_attr)
> +				return -ENOBUFS;
> +
> +			if (nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_COMP_OP,
> +					rqstp->opnum[i]))
> +				return -ENOBUFS;
> +
> +			nla_nest_end(skb, op_attr);
> +		}
> +
> +		nla_nest_end(skb, attr);
> +	}
> +
> +	genlmsg_end(skb, hdr);
> +
>  	return 0;
>  }
>  
>  int nfsd_server_nl_rpc_status_get_dumpit(struct sk_buff *skb,
>  					 struct netlink_callback *cb)
>  {
> +	struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id);
> +	int i, ret, rqstp_index;
> +
> +	rcu_read_lock();
> +
> +	for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) {
> +		struct svc_rqst *rqstp;
> +
> +		if (i < cb->args[0]) /* already consumed */
> +			continue;
> +
> +		rqstp_index = 0;
> +		list_for_each_entry_rcu(rqstp,
> +				&nn->nfsd_serv->sv_pools[i].sp_all_threads,
> +				rq_all) {
> +			struct nfsd_genl_rqstp genl_rqstp;
> +			unsigned int status_counter;
> +
> +			if (rqstp_index++ < cb->args[1]) /* already consumed */
> +				continue;
> +			/*
> +			 * Acquire rq_status_counter before parsing the rqst
> +			 * fields. rq_status_counter is set to an odd value in
> +			 * order to notify the consumers the rqstp fields are
> +			 * meaningful.
> +			 */
> +			status_counter =
> +				smp_load_acquire(&rqstp->rq_status_counter);
> +			if (!(status_counter & 1))
> +				continue;
> +
> +			genl_rqstp.rq_xid = rqstp->rq_xid;
> +			genl_rqstp.rq_flags = rqstp->rq_flags;
> +			genl_rqstp.rq_vers = rqstp->rq_vers;
> +			genl_rqstp.rq_prog = rqstp->rq_prog;
> +			genl_rqstp.rq_proc = rqstp->rq_proc;
> +			genl_rqstp.rq_stime = rqstp->rq_stime;
> +			genl_rqstp.opcnt = 0;
> +			memcpy(&genl_rqstp.daddr, svc_daddr(rqstp),
> +			       sizeof(struct sockaddr));
> +			memcpy(&genl_rqstp.saddr, svc_addr(rqstp),
> +			       sizeof(struct sockaddr));
> +
> +#ifdef CONFIG_NFSD_V4
> +			if (rqstp->rq_vers == NFS4_VERSION &&
> +			    rqstp->rq_proc == NFSPROC4_COMPOUND) {
> +				/* NFSv4 compund */
> +				struct nfsd4_compoundargs *args;
> +				int j;
> +
> +				args = rqstp->rq_argp;
> +				genl_rqstp.opcnt = args->opcnt;
> +				for (j = 0; j < genl_rqstp.opcnt; j++)
> +					genl_rqstp.opnum[j] =
> +						args->ops[j].opnum;
> +			}
> +#endif /* CONFIG_NFSD_V4 */
> +
> +			/*
> +			 * Acquire rq_status_counter before reporting the rqst
> +			 * fields to the user.
> +			 */
> +			if (smp_load_acquire(&rqstp->rq_status_counter) !=
> +			    status_counter)
> +				continue;
> +
> +			ret = nfsd_genl_rpc_status_compose_msg(skb, cb,
> +							       &genl_rqstp);
> +			if (ret)
> +				goto out;
> +		}
> +	}
> +
> +	cb->args[0] = i;
> +	cb->args[1] = rqstp_index;
> +	ret = skb->len;
> +out:
> +	rcu_read_unlock();
> +
> +	return ret;
> +}
> +
> +int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb)
> +{
> +	mutex_lock(&nfsd_mutex);
> +	nfsd_put(sock_net(cb->skb->sk));
> +	mutex_unlock(&nfsd_mutex);
> +
>  	return 0;
>  }
>  

I think there is a potential race above. Once you've 


> @@ -1605,6 +1788,10 @@ static int __init init_nfsd(void)
>  	retval = register_filesystem(&nfsd_fs_type);
>  	if (retval)
>  		goto out_free_all;
> +	retval = genl_register_family(&nfsd_server_nl_family);
> +	if (retval)
> +		goto out_free_all;
> +
>  	return 0;
>  out_free_all:
>  	nfsd4_destroy_laundry_wq();
> @@ -1629,6 +1816,7 @@ static int __init init_nfsd(void)
>  
>  static void __exit exit_nfsd(void)
>  {
> +	genl_unregister_family(&nfsd_server_nl_family);
>  	unregister_filesystem(&nfsd_fs_type);
>  	nfsd4_destroy_laundry_wq();
>  	unregister_cld_notifier();
> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> index 11c14faa6c67..d787bd38c053 100644
> --- a/fs/nfsd/nfsd.h
> +++ b/fs/nfsd/nfsd.h
> @@ -62,6 +62,22 @@ struct readdir_cd {
>  	__be32			err;	/* 0, nfserr, or nfserr_eof */
>  };
>  
> +/* Maximum number of operations per session compound */
> +#define NFSD_MAX_OPS_PER_COMPOUND	50
> +
> +struct nfsd_genl_rqstp {
> +	struct sockaddr daddr;
> +	struct sockaddr saddr;
> +	unsigned long rq_flags;
> +	ktime_t rq_stime;
> +	__be32 rq_xid;
> +	u32 rq_vers;
> +	u32 rq_prog;
> +	u32 rq_proc;
> +	/* NFSv4 compund */
> +	u32 opnum[NFSD_MAX_OPS_PER_COMPOUND];
> +	u16 opcnt;
> +};
>  
>  extern struct svc_program	nfsd_program;
>  extern const struct svc_version	nfsd_version2, nfsd_version3, nfsd_version4;
> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> index 1582af33e204..fad34a7325b3 100644
> --- a/fs/nfsd/nfssvc.c
> +++ b/fs/nfsd/nfssvc.c
> @@ -998,6 +998,15 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
>  	if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
>  		goto out_decode_err;
>  
> +	/*
> +	 * Release rq_status_counter setting it to an odd value after the rpc
> +	 * request has been properly parsed. rq_status_counter is used to
> +	 * notify the consumers if the rqstp fields are stable
> +	 * (rq_status_counter is odd) or not meaningful (rq_status_counter
> +	 * is even).
> +	 */
> +	smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1);
> +
>  	rp = NULL;
>  	switch (nfsd_cache_lookup(rqstp, &rp)) {
>  	case RC_DOIT:
> @@ -1015,6 +1024,12 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
>  	if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
>  		goto out_encode_err;
>  
> +	/*
> +	 * Release rq_status_counter setting it to an even value after the rpc
> +	 * request has been properly processed.
> +	 */
> +	smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1);
> +
>  	nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
>  out_cached_reply:
>  	return 1;
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index cbddcf484dba..41bdc913fa71 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -174,8 +174,6 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
>  
>  /* Maximum number of slots per session. 160 is useful for long haul TCP */
>  #define NFSD_MAX_SLOTS_PER_SESSION     160
> -/* Maximum number of operations per session compound */
> -#define NFSD_MAX_OPS_PER_COMPOUND	50
>  /* Maximum  session per slot cache size */
>  #define NFSD_SLOT_CACHE_SIZE		2048
>  /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
> index dbf5b21feafe..caa20defd255 100644
> --- a/include/linux/sunrpc/svc.h
> +++ b/include/linux/sunrpc/svc.h
> @@ -251,6 +251,7 @@ struct svc_rqst {
>  						 * net namespace
>  						 */
>  	void **			rq_lease_breaker; /* The v4 client breaking a lease */
> +	unsigned int		rq_status_counter; /* RPC processing counter */
>  };
>  
>  /* bits for rq_flags */
Jeff Layton Dec. 12, 2023, 12:07 p.m. UTC | #9
On Mon, 2023-12-11 at 13:56 -0500, Jeff Layton wrote:
> On Mon, 2023-09-11 at 14:49 +0200, Lorenzo Bianconi wrote:
> > Introduce rpc_status netlink support for NFSD in order to dump pending
> > RPC requests debugging information from userspace.
> > 
> > Tested-by: Jeff Layton <jlayton@kernel.org>
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> >  fs/nfsd/nfsctl.c           | 192 ++++++++++++++++++++++++++++++++++++-
> >  fs/nfsd/nfsd.h             |  16 ++++
> >  fs/nfsd/nfssvc.c           |  15 +++
> >  fs/nfsd/state.h            |   2 -
> >  include/linux/sunrpc/svc.h |   1 +
> >  5 files changed, 222 insertions(+), 4 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> > index 1be66088849c..b862a759ea15 100644
> > --- a/fs/nfsd/nfsctl.c
> > +++ b/fs/nfsd/nfsctl.c
> > @@ -26,6 +26,7 @@
> >  #include "pnfs.h"
> >  #include "filecache.h"
> >  #include "trace.h"
> > +#include "nfs_netlink_gen.h"
> >  
> >  /*
> >   *	We have a single directory with several nodes in it.
> > @@ -1497,17 +1498,199 @@ unsigned int nfsd_net_id;
> >  
> >  int nfsd_server_nl_rpc_status_get_start(struct netlink_callback *cb)
> >  {
> > -	return 0;
> > +	struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id);
> > +	int ret = -ENODEV;
> > +
> > +	mutex_lock(&nfsd_mutex);
> > +	if (nn->nfsd_serv) {
> > +		svc_get(nn->nfsd_serv);
> > +		ret = 0;
> > +	}
> > +	mutex_unlock(&nfsd_mutex);
> > +
> > +	return ret;
> >  }
> 
> I think there is a potential race above. Once you've dropped the
> nfsd_mutex, there is no guarantee that the nn->nfsd_serv will still be
> set when you come back to put the serv. That means that we could oops
> when we hit the _done method below.
> 
> Is it possible to stash a pointer to the serv while we hold the
> reference?
> 

Actually, it looks like Neil may have already fixed this in the series
he sent on Oct 29th. See:

    [PATCH 3/5] nfsd: hold nfsd_mutex across entire netlink operation

Another reason to go ahead and get that series in...

> >  
> > -int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb)
> > +static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
> > +					    struct netlink_callback *cb,
> > +					    struct nfsd_genl_rqstp *rqstp)
> >  {
> > +	void *hdr;
> > +	int i;
> > +
> > +	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
> > +			  &nfsd_server_nl_family, NLM_F_MULTI,
> > +			  NFSD_CMD_RPC_STATUS_GET);
> > +	if (!hdr)
> > +		return -ENOBUFS;
> > +
> > +	if (nla_put_be32(skb, NFSD_ATTR_RPC_STATUS_XID, rqstp->rq_xid) ||
> > +	    nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_FLAGS, rqstp->rq_flags) ||
> > +	    nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROG, rqstp->rq_prog) ||
> > +	    nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROC, rqstp->rq_proc) ||
> > +	    nla_put_u8(skb, NFSD_ATTR_RPC_STATUS_VERSION, rqstp->rq_vers) ||
> > +	    nla_put_s64(skb, NFSD_ATTR_RPC_STATUS_SERVICE_TIME,
> > +			ktime_to_us(rqstp->rq_stime),
> > +			NFSD_ATTR_RPC_STATUS_PAD))
> > +		return -ENOBUFS;
> > +
> > +	switch (rqstp->saddr.sa_family) {
> > +	case AF_INET: {
> > +		const struct sockaddr_in *s_in, *d_in;
> > +
> > +		s_in = (const struct sockaddr_in *)&rqstp->saddr;
> > +		d_in = (const struct sockaddr_in *)&rqstp->daddr;
> > +		if (nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR4,
> > +				    s_in->sin_addr.s_addr) ||
> > +		    nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR4,
> > +				    d_in->sin_addr.s_addr) ||
> > +		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT,
> > +				 s_in->sin_port) ||
> > +		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT,
> > +				 d_in->sin_port))
> > +			return -ENOBUFS;
> > +		break;
> > +	}
> > +	case AF_INET6: {
> > +		const struct sockaddr_in6 *s_in, *d_in;
> > +
> > +		s_in = (const struct sockaddr_in6 *)&rqstp->saddr;
> > +		d_in = (const struct sockaddr_in6 *)&rqstp->daddr;
> > +		if (nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR6,
> > +				     &s_in->sin6_addr) ||
> > +		    nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR6,
> > +				     &d_in->sin6_addr) ||
> > +		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT,
> > +				 s_in->sin6_port) ||
> > +		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT,
> > +				 d_in->sin6_port))
> > +			return -ENOBUFS;
> > +		break;
> > +	}
> > +	default:
> > +		break;
> > +	}
> > +
> > +	if (rqstp->opcnt) {
> > +		struct nlattr *attr;
> > +
> > +		attr = nla_nest_start(skb, NFSD_ATTR_RPC_STATUS_COMPOND_OP);
> > +		if (!attr)
> > +			return -ENOBUFS;
> > +
> > +		for (i = 0; i < rqstp->opcnt; i++) {
> > +			struct nlattr *op_attr;
> > +
> > +			op_attr = nla_nest_start(skb, i);
> > +			if (!op_attr)
> > +				return -ENOBUFS;
> > +
> > +			if (nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_COMP_OP,
> > +					rqstp->opnum[i]))
> > +				return -ENOBUFS;
> > +
> > +			nla_nest_end(skb, op_attr);
> > +		}
> > +
> > +		nla_nest_end(skb, attr);
> > +	}
> > +
> > +	genlmsg_end(skb, hdr);
> > +
> >  	return 0;
> >  }
> >  
> >  int nfsd_server_nl_rpc_status_get_dumpit(struct sk_buff *skb,
> >  					 struct netlink_callback *cb)
> >  {
> > +	struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id);
> > +	int i, ret, rqstp_index;
> > +
> > +	rcu_read_lock();
> > +
> > +	for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) {
> > +		struct svc_rqst *rqstp;
> > +
> > +		if (i < cb->args[0]) /* already consumed */
> > +			continue;
> > +
> > +		rqstp_index = 0;
> > +		list_for_each_entry_rcu(rqstp,
> > +				&nn->nfsd_serv->sv_pools[i].sp_all_threads,
> > +				rq_all) {
> > +			struct nfsd_genl_rqstp genl_rqstp;
> > +			unsigned int status_counter;
> > +
> > +			if (rqstp_index++ < cb->args[1]) /* already consumed */
> > +				continue;
> > +			/*
> > +			 * Acquire rq_status_counter before parsing the rqst
> > +			 * fields. rq_status_counter is set to an odd value in
> > +			 * order to notify the consumers the rqstp fields are
> > +			 * meaningful.
> > +			 */
> > +			status_counter =
> > +				smp_load_acquire(&rqstp->rq_status_counter);
> > +			if (!(status_counter & 1))
> > +				continue;
> > +
> > +			genl_rqstp.rq_xid = rqstp->rq_xid;
> > +			genl_rqstp.rq_flags = rqstp->rq_flags;
> > +			genl_rqstp.rq_vers = rqstp->rq_vers;
> > +			genl_rqstp.rq_prog = rqstp->rq_prog;
> > +			genl_rqstp.rq_proc = rqstp->rq_proc;
> > +			genl_rqstp.rq_stime = rqstp->rq_stime;
> > +			genl_rqstp.opcnt = 0;
> > +			memcpy(&genl_rqstp.daddr, svc_daddr(rqstp),
> > +			       sizeof(struct sockaddr));
> > +			memcpy(&genl_rqstp.saddr, svc_addr(rqstp),
> > +			       sizeof(struct sockaddr));
> > +
> > +#ifdef CONFIG_NFSD_V4
> > +			if (rqstp->rq_vers == NFS4_VERSION &&
> > +			    rqstp->rq_proc == NFSPROC4_COMPOUND) {
> > +				/* NFSv4 compund */
> > +				struct nfsd4_compoundargs *args;
> > +				int j;
> > +
> > +				args = rqstp->rq_argp;
> > +				genl_rqstp.opcnt = args->opcnt;
> > +				for (j = 0; j < genl_rqstp.opcnt; j++)
> > +					genl_rqstp.opnum[j] =
> > +						args->ops[j].opnum;
> > +			}
> > +#endif /* CONFIG_NFSD_V4 */
> > +
> > +			/*
> > +			 * Acquire rq_status_counter before reporting the rqst
> > +			 * fields to the user.
> > +			 */
> > +			if (smp_load_acquire(&rqstp->rq_status_counter) !=
> > +			    status_counter)
> > +				continue;
> > +
> > +			ret = nfsd_genl_rpc_status_compose_msg(skb, cb,
> > +							       &genl_rqstp);
> > +			if (ret)
> > +				goto out;
> > +		}
> > +	}
> > +
> > +	cb->args[0] = i;
> > +	cb->args[1] = rqstp_index;
> > +	ret = skb->len;
> > +out:
> > +	rcu_read_unlock();
> > +
> > +	return ret;
> > +}
> > +
> > +int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb)
> > +{
> > +	mutex_lock(&nfsd_mutex);
> > +	nfsd_put(sock_net(cb->skb->sk));
> > +	mutex_unlock(&nfsd_mutex);
> > +
> >  	return 0;
> >  }
> >  
> 
> I think there is a potential race above. Once you've 
> 
> 
> > @@ -1605,6 +1788,10 @@ static int __init init_nfsd(void)
> >  	retval = register_filesystem(&nfsd_fs_type);
> >  	if (retval)
> >  		goto out_free_all;
> > +	retval = genl_register_family(&nfsd_server_nl_family);
> > +	if (retval)
> > +		goto out_free_all;
> > +
> >  	return 0;
> >  out_free_all:
> >  	nfsd4_destroy_laundry_wq();
> > @@ -1629,6 +1816,7 @@ static int __init init_nfsd(void)
> >  
> >  static void __exit exit_nfsd(void)
> >  {
> > +	genl_unregister_family(&nfsd_server_nl_family);
> >  	unregister_filesystem(&nfsd_fs_type);
> >  	nfsd4_destroy_laundry_wq();
> >  	unregister_cld_notifier();
> > diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> > index 11c14faa6c67..d787bd38c053 100644
> > --- a/fs/nfsd/nfsd.h
> > +++ b/fs/nfsd/nfsd.h
> > @@ -62,6 +62,22 @@ struct readdir_cd {
> >  	__be32			err;	/* 0, nfserr, or nfserr_eof */
> >  };
> >  
> > +/* Maximum number of operations per session compound */
> > +#define NFSD_MAX_OPS_PER_COMPOUND	50
> > +
> > +struct nfsd_genl_rqstp {
> > +	struct sockaddr daddr;
> > +	struct sockaddr saddr;
> > +	unsigned long rq_flags;
> > +	ktime_t rq_stime;
> > +	__be32 rq_xid;
> > +	u32 rq_vers;
> > +	u32 rq_prog;
> > +	u32 rq_proc;
> > +	/* NFSv4 compund */
> > +	u32 opnum[NFSD_MAX_OPS_PER_COMPOUND];
> > +	u16 opcnt;
> > +};
> >  
> >  extern struct svc_program	nfsd_program;
> >  extern const struct svc_version	nfsd_version2, nfsd_version3, nfsd_version4;
> > diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> > index 1582af33e204..fad34a7325b3 100644
> > --- a/fs/nfsd/nfssvc.c
> > +++ b/fs/nfsd/nfssvc.c
> > @@ -998,6 +998,15 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
> >  	if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
> >  		goto out_decode_err;
> >  
> > +	/*
> > +	 * Release rq_status_counter setting it to an odd value after the rpc
> > +	 * request has been properly parsed. rq_status_counter is used to
> > +	 * notify the consumers if the rqstp fields are stable
> > +	 * (rq_status_counter is odd) or not meaningful (rq_status_counter
> > +	 * is even).
> > +	 */
> > +	smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1);
> > +
> >  	rp = NULL;
> >  	switch (nfsd_cache_lookup(rqstp, &rp)) {
> >  	case RC_DOIT:
> > @@ -1015,6 +1024,12 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
> >  	if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
> >  		goto out_encode_err;
> >  
> > +	/*
> > +	 * Release rq_status_counter setting it to an even value after the rpc
> > +	 * request has been properly processed.
> > +	 */
> > +	smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1);
> > +
> >  	nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
> >  out_cached_reply:
> >  	return 1;
> > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > index cbddcf484dba..41bdc913fa71 100644
> > --- a/fs/nfsd/state.h
> > +++ b/fs/nfsd/state.h
> > @@ -174,8 +174,6 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
> >  
> >  /* Maximum number of slots per session. 160 is useful for long haul TCP */
> >  #define NFSD_MAX_SLOTS_PER_SESSION     160
> > -/* Maximum number of operations per session compound */
> > -#define NFSD_MAX_OPS_PER_COMPOUND	50
> >  /* Maximum  session per slot cache size */
> >  #define NFSD_SLOT_CACHE_SIZE		2048
> >  /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
> > diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
> > index dbf5b21feafe..caa20defd255 100644
> > --- a/include/linux/sunrpc/svc.h
> > +++ b/include/linux/sunrpc/svc.h
> > @@ -251,6 +251,7 @@ struct svc_rqst {
> >  						 * net namespace
> >  						 */
> >  	void **			rq_lease_breaker; /* The v4 client breaking a lease */
> > +	unsigned int		rq_status_counter; /* RPC processing counter */
> >  };
> >  
> >  /* bits for rq_flags */
>
diff mbox series

Patch

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 1be66088849c..b862a759ea15 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -26,6 +26,7 @@ 
 #include "pnfs.h"
 #include "filecache.h"
 #include "trace.h"
+#include "nfs_netlink_gen.h"
 
 /*
  *	We have a single directory with several nodes in it.
@@ -1497,17 +1498,199 @@  unsigned int nfsd_net_id;
 
 int nfsd_server_nl_rpc_status_get_start(struct netlink_callback *cb)
 {
-	return 0;
+	struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id);
+	int ret = -ENODEV;
+
+	mutex_lock(&nfsd_mutex);
+	if (nn->nfsd_serv) {
+		svc_get(nn->nfsd_serv);
+		ret = 0;
+	}
+	mutex_unlock(&nfsd_mutex);
+
+	return ret;
 }
 
-int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb)
+static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
+					    struct netlink_callback *cb,
+					    struct nfsd_genl_rqstp *rqstp)
 {
+	void *hdr;
+	int i;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			  &nfsd_server_nl_family, NLM_F_MULTI,
+			  NFSD_CMD_RPC_STATUS_GET);
+	if (!hdr)
+		return -ENOBUFS;
+
+	if (nla_put_be32(skb, NFSD_ATTR_RPC_STATUS_XID, rqstp->rq_xid) ||
+	    nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_FLAGS, rqstp->rq_flags) ||
+	    nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROG, rqstp->rq_prog) ||
+	    nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROC, rqstp->rq_proc) ||
+	    nla_put_u8(skb, NFSD_ATTR_RPC_STATUS_VERSION, rqstp->rq_vers) ||
+	    nla_put_s64(skb, NFSD_ATTR_RPC_STATUS_SERVICE_TIME,
+			ktime_to_us(rqstp->rq_stime),
+			NFSD_ATTR_RPC_STATUS_PAD))
+		return -ENOBUFS;
+
+	switch (rqstp->saddr.sa_family) {
+	case AF_INET: {
+		const struct sockaddr_in *s_in, *d_in;
+
+		s_in = (const struct sockaddr_in *)&rqstp->saddr;
+		d_in = (const struct sockaddr_in *)&rqstp->daddr;
+		if (nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR4,
+				    s_in->sin_addr.s_addr) ||
+		    nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR4,
+				    d_in->sin_addr.s_addr) ||
+		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT,
+				 s_in->sin_port) ||
+		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT,
+				 d_in->sin_port))
+			return -ENOBUFS;
+		break;
+	}
+	case AF_INET6: {
+		const struct sockaddr_in6 *s_in, *d_in;
+
+		s_in = (const struct sockaddr_in6 *)&rqstp->saddr;
+		d_in = (const struct sockaddr_in6 *)&rqstp->daddr;
+		if (nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR6,
+				     &s_in->sin6_addr) ||
+		    nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR6,
+				     &d_in->sin6_addr) ||
+		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT,
+				 s_in->sin6_port) ||
+		    nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT,
+				 d_in->sin6_port))
+			return -ENOBUFS;
+		break;
+	}
+	default:
+		break;
+	}
+
+	if (rqstp->opcnt) {
+		struct nlattr *attr;
+
+		attr = nla_nest_start(skb, NFSD_ATTR_RPC_STATUS_COMPOND_OP);
+		if (!attr)
+			return -ENOBUFS;
+
+		for (i = 0; i < rqstp->opcnt; i++) {
+			struct nlattr *op_attr;
+
+			op_attr = nla_nest_start(skb, i);
+			if (!op_attr)
+				return -ENOBUFS;
+
+			if (nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_COMP_OP,
+					rqstp->opnum[i]))
+				return -ENOBUFS;
+
+			nla_nest_end(skb, op_attr);
+		}
+
+		nla_nest_end(skb, attr);
+	}
+
+	genlmsg_end(skb, hdr);
+
 	return 0;
 }
 
 int nfsd_server_nl_rpc_status_get_dumpit(struct sk_buff *skb,
 					 struct netlink_callback *cb)
 {
+	struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id);
+	int i, ret, rqstp_index;
+
+	rcu_read_lock();
+
+	for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) {
+		struct svc_rqst *rqstp;
+
+		if (i < cb->args[0]) /* already consumed */
+			continue;
+
+		rqstp_index = 0;
+		list_for_each_entry_rcu(rqstp,
+				&nn->nfsd_serv->sv_pools[i].sp_all_threads,
+				rq_all) {
+			struct nfsd_genl_rqstp genl_rqstp;
+			unsigned int status_counter;
+
+			if (rqstp_index++ < cb->args[1]) /* already consumed */
+				continue;
+			/*
+			 * Acquire rq_status_counter before parsing the rqst
+			 * fields. rq_status_counter is set to an odd value in
+			 * order to notify the consumers the rqstp fields are
+			 * meaningful.
+			 */
+			status_counter =
+				smp_load_acquire(&rqstp->rq_status_counter);
+			if (!(status_counter & 1))
+				continue;
+
+			genl_rqstp.rq_xid = rqstp->rq_xid;
+			genl_rqstp.rq_flags = rqstp->rq_flags;
+			genl_rqstp.rq_vers = rqstp->rq_vers;
+			genl_rqstp.rq_prog = rqstp->rq_prog;
+			genl_rqstp.rq_proc = rqstp->rq_proc;
+			genl_rqstp.rq_stime = rqstp->rq_stime;
+			genl_rqstp.opcnt = 0;
+			memcpy(&genl_rqstp.daddr, svc_daddr(rqstp),
+			       sizeof(struct sockaddr));
+			memcpy(&genl_rqstp.saddr, svc_addr(rqstp),
+			       sizeof(struct sockaddr));
+
+#ifdef CONFIG_NFSD_V4
+			if (rqstp->rq_vers == NFS4_VERSION &&
+			    rqstp->rq_proc == NFSPROC4_COMPOUND) {
+				/* NFSv4 compund */
+				struct nfsd4_compoundargs *args;
+				int j;
+
+				args = rqstp->rq_argp;
+				genl_rqstp.opcnt = args->opcnt;
+				for (j = 0; j < genl_rqstp.opcnt; j++)
+					genl_rqstp.opnum[j] =
+						args->ops[j].opnum;
+			}
+#endif /* CONFIG_NFSD_V4 */
+
+			/*
+			 * Acquire rq_status_counter before reporting the rqst
+			 * fields to the user.
+			 */
+			if (smp_load_acquire(&rqstp->rq_status_counter) !=
+			    status_counter)
+				continue;
+
+			ret = nfsd_genl_rpc_status_compose_msg(skb, cb,
+							       &genl_rqstp);
+			if (ret)
+				goto out;
+		}
+	}
+
+	cb->args[0] = i;
+	cb->args[1] = rqstp_index;
+	ret = skb->len;
+out:
+	rcu_read_unlock();
+
+	return ret;
+}
+
+int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb)
+{
+	mutex_lock(&nfsd_mutex);
+	nfsd_put(sock_net(cb->skb->sk));
+	mutex_unlock(&nfsd_mutex);
+
 	return 0;
 }
 
@@ -1605,6 +1788,10 @@  static int __init init_nfsd(void)
 	retval = register_filesystem(&nfsd_fs_type);
 	if (retval)
 		goto out_free_all;
+	retval = genl_register_family(&nfsd_server_nl_family);
+	if (retval)
+		goto out_free_all;
+
 	return 0;
 out_free_all:
 	nfsd4_destroy_laundry_wq();
@@ -1629,6 +1816,7 @@  static int __init init_nfsd(void)
 
 static void __exit exit_nfsd(void)
 {
+	genl_unregister_family(&nfsd_server_nl_family);
 	unregister_filesystem(&nfsd_fs_type);
 	nfsd4_destroy_laundry_wq();
 	unregister_cld_notifier();
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 11c14faa6c67..d787bd38c053 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -62,6 +62,22 @@  struct readdir_cd {
 	__be32			err;	/* 0, nfserr, or nfserr_eof */
 };
 
+/* Maximum number of operations per session compound */
+#define NFSD_MAX_OPS_PER_COMPOUND	50
+
+struct nfsd_genl_rqstp {
+	struct sockaddr daddr;
+	struct sockaddr saddr;
+	unsigned long rq_flags;
+	ktime_t rq_stime;
+	__be32 rq_xid;
+	u32 rq_vers;
+	u32 rq_prog;
+	u32 rq_proc;
+	/* NFSv4 compund */
+	u32 opnum[NFSD_MAX_OPS_PER_COMPOUND];
+	u16 opcnt;
+};
 
 extern struct svc_program	nfsd_program;
 extern const struct svc_version	nfsd_version2, nfsd_version3, nfsd_version4;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 1582af33e204..fad34a7325b3 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -998,6 +998,15 @@  int nfsd_dispatch(struct svc_rqst *rqstp)
 	if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
 		goto out_decode_err;
 
+	/*
+	 * Release rq_status_counter setting it to an odd value after the rpc
+	 * request has been properly parsed. rq_status_counter is used to
+	 * notify the consumers if the rqstp fields are stable
+	 * (rq_status_counter is odd) or not meaningful (rq_status_counter
+	 * is even).
+	 */
+	smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1);
+
 	rp = NULL;
 	switch (nfsd_cache_lookup(rqstp, &rp)) {
 	case RC_DOIT:
@@ -1015,6 +1024,12 @@  int nfsd_dispatch(struct svc_rqst *rqstp)
 	if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
 		goto out_encode_err;
 
+	/*
+	 * Release rq_status_counter setting it to an even value after the rpc
+	 * request has been properly processed.
+	 */
+	smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1);
+
 	nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
 out_cached_reply:
 	return 1;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index cbddcf484dba..41bdc913fa71 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -174,8 +174,6 @@  static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
 
 /* Maximum number of slots per session. 160 is useful for long haul TCP */
 #define NFSD_MAX_SLOTS_PER_SESSION     160
-/* Maximum number of operations per session compound */
-#define NFSD_MAX_OPS_PER_COMPOUND	50
 /* Maximum  session per slot cache size */
 #define NFSD_SLOT_CACHE_SIZE		2048
 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index dbf5b21feafe..caa20defd255 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -251,6 +251,7 @@  struct svc_rqst {
 						 * net namespace
 						 */
 	void **			rq_lease_breaker; /* The v4 client breaking a lease */
+	unsigned int		rq_status_counter; /* RPC processing counter */
 };
 
 /* bits for rq_flags */