
[13/14] nfsd: introduce concept of a maximum number of threads.

Message ID 20240715074657.18174-14-neilb@suse.de (mailing list archive)
State New
Headers show
Series support automatic changes to nfsd thread count | expand

Commit Message

NeilBrown July 15, 2024, 7:14 a.m. UTC
A future patch will allow the number of threads in each nfsd pool to
vary dynamically.
The lower bound will be the number explicitly requested via
/proc/fs/nfsd/threads or /proc/fs/nfsd/pool_threads.

The upper bound can be set in each net namespace by writing
/proc/fs/nfsd/max_threads.  This upper bound applies across all pools;
there is no per-pool upper limit.
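
For illustration only (not part of this patch): the new file uses the
same simple_transaction semantics as the other nfsctl transaction files
(note the SIMPLE_TRANSACTION_LIMIT and &transaction_ops usage below), so
it can be exercised by writing a value and reading the reply back on the
same descriptor.  A minimal user-space sketch, where 128 is an arbitrary
example value:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char reply[64];
	ssize_t n;
	int fd = open("/proc/fs/nfsd/max_threads", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Set the per-netns upper bound (128 is only an example). */
	if (write(fd, "128", strlen("128")) < 0) {
		perror("write");
		return 1;
	}
	/* Rewind; the reply is read back on the same descriptor. */
	lseek(fd, 0, SEEK_SET);
	n = read(fd, reply, sizeof(reply) - 1);
	if (n > 0) {
		reply[n] = '\0';
		printf("max_threads: %s", reply);  /* e.g. "128\n" */
	}
	close(fd);
	return 0;
}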

If no upper bound is set, then one is calculated.  A global upper limit
is chosen based on the amount of memory.  This limit only affects dynamic
changes; static configuration can always override it.

For each net namespace we track the configured thread count: the max if
one is set, otherwise the min.  We also track how many net namespaces
have nfsd configured with only a min, not a max.

The difference between the calculated max and the total allocation is
available to be shared among those namespaces which don't have a maximum
configured.  Within a namespace, the available share is distributed
equally across all pools.
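
A worked example with purely illustrative numbers: calculated max 64;
two namespaces, each with a configured min of 8 and no max, so the
total allocation is 16.  The spare 64 - 16 = 48 is shared as 24 extra
threads per namespace; with 2 pools in a namespace, each pool may grow
12 threads above its configured minimum.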

In the common case there is one namespace and one pool.  A small number
of threads are configured as a minimum and no maximum is set.  In this
case the effective maximum will be derived directly from total memory:
approximately 8 threads per gigabyte.
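
As a rough check of that figure (assuming 4 KiB pages and an
NFSSVC_MAXBLKSIZE of 1 MiB), the calculation in nfsd_update_nets()
gives, per gigabyte of free buffer pages:

	(262144 pages >> 7) / (1 MiB >> PAGE_SHIFT) = 2048 / 256 = 8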

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/nfsd/netns.h  |  6 +++++
 fs/nfsd/nfsctl.c | 45 +++++++++++++++++++++++++++++++++++
 fs/nfsd/nfsd.h   |  4 ++++
 fs/nfsd/nfssvc.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/trace.h  | 19 +++++++++++++++
 5 files changed, 135 insertions(+)

Comments

Jeff Layton July 15, 2024, 5:06 p.m. UTC | #1
On Mon, 2024-07-15 at 17:14 +1000, NeilBrown wrote:
> A future patch will allow the number of threads in each nfsd pool to
> vary dynamically.
> The lower bound will be the number explicit requested via
> /proc/fs/nfsd/threads or /proc/fs/nfsd/pool_threads
> 
> The upper bound can be set in each net-namespace by writing
> /proc/fs/nfsd/max_threads.  This upper bound applies across all pools,
> there is no per-pool upper limit.
> 
> If no upper bound is set, then one is calculated.  A global upper limit
> is chosen based on amount of memory.  This limit only affects dynamic
> changes. Static configuration can always over-ride it.
> 
> We track how many threads are configured in each net namespace, with the
> max or the min.  We also track how many net namespaces have nfsd
> configured with only a min, not a max.
> 
> The difference between the calculated max and the total allocation is
> available to be shared among those namespaces which don't have a maximum
> configured.  Within a namespace, the available share is distributed
> equally across all pools.
> 
> In the common case there is one namespace and one pool.  A small number
> of threads are configured as a minimum and no maximum is set.  In this
> case the effective maximum will be directly based on total memory.
> Approximately 8 per gigabyte.
> 


Some of this may come across as bikeshedding, but I'd probably prefer
that this work a bit differently:

1/ I don't think we should enable this universally -- at least not
initially. What I'd prefer to see is a new pool_mode for the dynamic
threadpools (maybe call it "dynamic"). That gives us a clear opt-in
mechanism. Later once we're convinced it's safe, we can make "dynamic"
the default instead of "global".

2/ Rather than specifying a max_threads value separately, why not allow
the old threads/pool_threads interface to set the max and just have a
reasonable minimum setting (like the current default of 8)? Since we're
growing the threadpool dynamically, I don't see why we need to have a
real configurable minimum.

3/ The dynamic pool-mode should probably be layered on top of the
pernode pool mode. IOW, in a NUMA configuration, we should split the
threads across NUMA nodes.


> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>  fs/nfsd/netns.h  |  6 +++++
>  fs/nfsd/nfsctl.c | 45 +++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfsd.h   |  4 ++++
>  fs/nfsd/nfssvc.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/trace.h  | 19 +++++++++++++++
>  5 files changed, 135 insertions(+)
> 
> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> index 0d2ac15a5003..329484696a42 100644
> --- a/fs/nfsd/netns.h
> +++ b/fs/nfsd/netns.h
> @@ -133,6 +133,12 @@ struct nfsd_net {
>  	 */
>  	unsigned int max_connections;
>  
> +	/*
> +	 * Maximum number of threads to auto-adjust up to.  If 0 then a
> +	 * share of nfsd_max_threads will be used.
> +	 */
> +	unsigned int max_threads;
> +
>  	u32 clientid_base;
>  	u32 clientid_counter;
>  	u32 clverifier_counter;
> diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> index d85b6d1fa31f..37e9936567e9 100644
> --- a/fs/nfsd/nfsctl.c
> +++ b/fs/nfsd/nfsctl.c
> @@ -48,6 +48,7 @@ enum {
>  	NFSD_Ports,
>  	NFSD_MaxBlkSize,
>  	NFSD_MaxConnections,
> +	NFSD_MaxThreads,
>  	NFSD_Filecache,
>  	NFSD_Leasetime,
>  	NFSD_Gracetime,
> @@ -68,6 +69,7 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size);
>  static ssize_t write_ports(struct file *file, char *buf, size_t size);
>  static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
>  static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
> +static ssize_t write_maxthreads(struct file *file, char *buf, size_t size);
>  #ifdef CONFIG_NFSD_V4
>  static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
>  static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
> @@ -87,6 +89,7 @@ static ssize_t (*const write_op[])(struct file *, char *, size_t) = {
>  	[NFSD_Ports] = write_ports,
>  	[NFSD_MaxBlkSize] = write_maxblksize,
>  	[NFSD_MaxConnections] = write_maxconn,
> +	[NFSD_MaxThreads] = write_maxthreads,
>  #ifdef CONFIG_NFSD_V4
>  	[NFSD_Leasetime] = write_leasetime,
>  	[NFSD_Gracetime] = write_gracetime,
> @@ -939,6 +942,47 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
>  	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn);
>  }
>  
> +/*
> + * write_maxthreads - Set or report the current max number threads
> + *
> + * Input:
> + *			buf:		ignored
> + *			size:		zero
> + * OR
> + *
> + * Input:
> + *			buf:		C string containing an unsigned
> + *					integer value representing the new
> + *					max number of threads
> + *			size:		non-zero length of C string in @buf
> + * Output:
> + *	On success:	passed-in buffer filled with '\n'-terminated C string
> + *			containing numeric value of max_threads setting
> + *			for this net namespace;
> + *			return code is the size in bytes of the string
> + *	On error:	return code is zero or a negative errno value
> + */
> +static ssize_t write_maxthreads(struct file *file, char *buf, size_t size)
> +{
> +	char *mesg = buf;
> +	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
> +	unsigned int maxthreads = nn->max_threads;
> +
> +	if (size > 0) {
> +		int rv = get_uint(&mesg, &maxthreads);
> +
> +		if (rv)
> +			return rv;
> +		trace_nfsd_ctl_maxthreads(netns(file), maxthreads);
> +		mutex_lock(&nfsd_mutex);
> +		nn->max_threads = maxthreads;
> +		nfsd_update_nets();
> +		mutex_unlock(&nfsd_mutex);
> +	}
> +
> +	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxthreads);
> +}
> +
>  #ifdef CONFIG_NFSD_V4
>  static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
>  				  time64_t *time, struct nfsd_net *nn)
> @@ -1372,6 +1416,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
>  		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
>  		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
>  		[NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
> +		[NFSD_MaxThreads] = {"max_threads", &transaction_ops, S_IWUSR|S_IRUGO},
>  		[NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO},
>  #ifdef CONFIG_NFSD_V4
>  		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> index e4c643255dc9..6874c2de670b 100644
> --- a/fs/nfsd/nfsd.h
> +++ b/fs/nfsd/nfsd.h
> @@ -156,6 +156,10 @@ int nfsd_create_serv(struct net *net);
>  void nfsd_destroy_serv(struct net *net);
>  
>  extern int nfsd_max_blksize;
> +void nfsd_update_nets(void);
> +extern unsigned int	nfsd_max_threads;
> +extern unsigned long	nfsd_net_used;
> +extern unsigned int	nfsd_net_cnt;
>  
>  static inline int nfsd_v4client(struct svc_rqst *rq)
>  {
> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> index b005b2e2e6ad..75d78c17756f 100644
> --- a/fs/nfsd/nfssvc.c
> +++ b/fs/nfsd/nfssvc.c
> @@ -80,6 +80,21 @@ DEFINE_SPINLOCK(nfsd_drc_lock);
>  unsigned long	nfsd_drc_max_mem;
>  unsigned long	nfsd_drc_slotsize_sum;
>  
> +/*
> + * nfsd_max_threads is auto-configured based on available ram.
> + * Each network namespace can configure a minimum number of threads
> + * and optionally a maximum.
> + * nfsd_net_used is the number of the max or min from each net namespace.
> + * nfsd_new_cnt is the number of net namespaces with a configured minimum
> + *    but no configured maximum.
> + * When nfsd_max_threads exceeds nfsd_net_used, the different is divided
> + * by nfsd_net_cnt and this number gives the excess above the configured minimum
> + * for all net namespaces without a configured maximum.
> + */
> +unsigned int	nfsd_max_threads;
> +unsigned long	nfsd_net_used;
> +unsigned int	nfsd_net_cnt;
> +
>  #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
>  static const struct svc_version *nfsd_acl_version[] = {
>  # if defined(CONFIG_NFSD_V2_ACL)
> @@ -130,6 +145,47 @@ struct svc_program		nfsd_program = {
>  	.pg_rpcbind_set		= nfsd_rpcbind_set,
>  };
>  
> +void nfsd_update_nets(void)
> +{
> +	struct net *net;
> +
> +	if (nfsd_max_threads == 0) {
> +		nfsd_max_threads = (nr_free_buffer_pages() >> 7) /
> +			(NFSSVC_MAXBLKSIZE >> PAGE_SHIFT);
> +	}
> +	nfsd_net_used = 0;
> +	nfsd_net_cnt = 0;
> +	down_read(&net_rwsem);
> +	for_each_net(net) {
> +		struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +
> +		if (!nn->nfsd_serv)
> +			continue;
> +		if (nn->max_threads > 0) {
> +			nfsd_net_used += nn->max_threads;
> +		} else {
> +			nfsd_net_used += nn->nfsd_serv->sv_nrthreads;
> +			nfsd_net_cnt += 1;
> +		}
> +	}
> +	up_read(&net_rwsem);
> +}
> +
> +static inline int nfsd_max_pool_threads(struct svc_pool *p, struct nfsd_net *nn)
> +{
> +	int svthreads = nn->nfsd_serv->sv_nrthreads;
> +
> +	if (nn->max_threads > 0)
> +		return nn->max_threads;
> +	if (nfsd_net_cnt == 0 || svthreads == 0)
> +		return 0;
> +	if (nfsd_max_threads < nfsd_net_cnt)
> +		return p->sp_nrthreads;
> +	/* Share nfsd_max_threads among all net, then among pools in this net. */
> +	return p->sp_nrthreads +
> +		nfsd_max_threads / nfsd_net_cnt * p->sp_nrthreads / svthreads;
> +}
> +
>  bool nfsd_support_version(int vers)
>  {
>  	if (vers >= NFSD_MINVERS && vers <= NFSD_MAXVERS)
> @@ -474,6 +530,7 @@ void nfsd_destroy_serv(struct net *net)
>  	spin_lock(&nfsd_notifier_lock);
>  	nn->nfsd_serv = NULL;
>  	spin_unlock(&nfsd_notifier_lock);
> +	nfsd_update_nets();
>  
>  	/* check if the notifier still has clients */
>  	if (atomic_dec_return(&nfsd_notifier_refcount) == 0) {
> @@ -614,6 +671,8 @@ int nfsd_create_serv(struct net *net)
>  	nn->nfsd_serv = serv;
>  	spin_unlock(&nfsd_notifier_lock);
>  
> +	nfsd_update_nets();
> +
>  	set_max_drc();
>  	/* check if the notifier is already set */
>  	if (atomic_inc_return(&nfsd_notifier_refcount) == 1) {
> @@ -720,6 +779,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
>  			goto out;
>  	}
>  out:
> +	nfsd_update_nets();
>  	return err;
>  }
>  
> @@ -759,6 +819,7 @@ nfsd_svc(int n, int *nthreads, struct net *net, const struct cred *cred, const c
>  	error = nfsd_set_nrthreads(n, nthreads, net);
>  	if (error)
>  		goto out_put;
> +	nfsd_update_nets();
>  	error = serv->sv_nrthreads;
>  out_put:
>  	if (serv->sv_nrthreads == 0)
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index 77bbd23aa150..92b888e178e8 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -2054,6 +2054,25 @@ TRACE_EVENT(nfsd_ctl_maxconn,
>  	)
>  );
>  
> +TRACE_EVENT(nfsd_ctl_maxthreads,
> +	TP_PROTO(
> +		const struct net *net,
> +		int maxthreads
> +	),
> +	TP_ARGS(net, maxthreads),
> +	TP_STRUCT__entry(
> +		__field(unsigned int, netns_ino)
> +		__field(int, maxthreads)
> +	),
> +	TP_fast_assign(
> +		__entry->netns_ino = net->ns.inum;
> +		__entry->maxthreads = maxthreads
> +	),
> +	TP_printk("maxthreads=%d",
> +		__entry->maxthreads
> +	)
> +);
> +
>  TRACE_EVENT(nfsd_ctl_time,
>  	TP_PROTO(
>  		const struct net *net,
NeilBrown July 16, 2024, 3:21 a.m. UTC | #2
On Tue, 16 Jul 2024, Jeff Layton wrote:
> On Mon, 2024-07-15 at 17:14 +1000, NeilBrown wrote:
> > A future patch will allow the number of threads in each nfsd pool to
> > vary dynamically.
> > The lower bound will be the number explicit requested via
> > /proc/fs/nfsd/threads or /proc/fs/nfsd/pool_threads
> > 
> > The upper bound can be set in each net-namespace by writing
> > /proc/fs/nfsd/max_threads.  This upper bound applies across all pools,
> > there is no per-pool upper limit.
> > 
> > If no upper bound is set, then one is calculated.  A global upper limit
> > is chosen based on amount of memory.  This limit only affects dynamic
> > changes. Static configuration can always over-ride it.
> > 
> > We track how many threads are configured in each net namespace, with the
> > max or the min.  We also track how many net namespaces have nfsd
> > configured with only a min, not a max.
> > 
> > The difference between the calculated max and the total allocation is
> > available to be shared among those namespaces which don't have a maximum
> > configured.  Within a namespace, the available share is distributed
> > equally across all pools.
> > 
> > In the common case there is one namespace and one pool.  A small number
> > of threads are configured as a minimum and no maximum is set.  In this
> > case the effective maximum will be directly based on total memory.
> > Approximately 8 per gigabyte.
> > 
> 
> 
> Some of this may come across as bikeshedding, but I'd probably prefer
> that this work a bit differently:
> 
> 1/ I don't think we should enable this universally -- at least not
> initially. What I'd prefer to see is a new pool_mode for the dynamic
> threadpools (maybe call it "dynamic"). That gives us a clear opt-in
> mechanism. Later once we're convinced it's safe, we can make "dynamic"
> the default instead of "global".
> 
> 2/ Rather than specifying a max_threads value separately, why not allow
> the old threads/pool_threads interface to set the max and just have a
> reasonable minimum setting (like the current default of 8). Since we're
> growing the threadpool dynamically, I don't see why we need to have a
> real configurable minimum.
> 
> 3/ the dynamic pool-mode should probably be layered on top of the
> pernode pool mode. IOW, in a NUMA configuration, we should split the
> threads across NUMA nodes.

Maybe we should start by discussing the goal.  What do we want
configuration to look like when we finish?

I think we want it to be transparent.  Sysadmin does nothing, and it all
works perfectly.  Or as close to that as we can get.

So I think the "nproc" option to rpc.nfsd should eventually be ignored
except for whether it is zero or non-zero.  If it is non-zero we start 1
thread on each NUMA node and let that grow with limits imposed primarily
by memory pressure.

We probably should have changed

#define SVC_POOL_DEFAULT	SVC_POOL_GLOBAL

to

#define SVC_POOL_DEFAULT	SVC_POOL_AUTO

about 10 years ago, but failing that, maybe change it tomorrow?

I'm not sure how
    /proc/fs/nfsd/{threads,pool_threads}
should be handled.  Like you, I don't think it is really useful to have
a configured minimum, but I don't want them to be imposed as a maximum,
because I want servers with the current default (of 8) to be able to
start more new threads if necessary without needing a config change.
Maybe that outcome can be delayed until rpc.nfsd is updated?

I don't really like the idea of a dynamic pool mode.  I want the pool to
*always* be dynamic.

Thanks,
NeilBrown

> 
> 
> > Signed-off-by: NeilBrown <neilb@suse.de>
> > ---
> >  fs/nfsd/netns.h  |  6 +++++
> >  fs/nfsd/nfsctl.c | 45 +++++++++++++++++++++++++++++++++++
> >  fs/nfsd/nfsd.h   |  4 ++++
> >  fs/nfsd/nfssvc.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/nfsd/trace.h  | 19 +++++++++++++++
> >  5 files changed, 135 insertions(+)
> > 
> > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> > index 0d2ac15a5003..329484696a42 100644
> > --- a/fs/nfsd/netns.h
> > +++ b/fs/nfsd/netns.h
> > @@ -133,6 +133,12 @@ struct nfsd_net {
> >  	 */
> >  	unsigned int max_connections;
> >  
> > +	/*
> > +	 * Maximum number of threads to auto-adjust up to.  If 0 then a
> > +	 * share of nfsd_max_threads will be used.
> > +	 */
> > +	unsigned int max_threads;
> > +
> >  	u32 clientid_base;
> >  	u32 clientid_counter;
> >  	u32 clverifier_counter;
> > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> > index d85b6d1fa31f..37e9936567e9 100644
> > --- a/fs/nfsd/nfsctl.c
> > +++ b/fs/nfsd/nfsctl.c
> > @@ -48,6 +48,7 @@ enum {
> >  	NFSD_Ports,
> >  	NFSD_MaxBlkSize,
> >  	NFSD_MaxConnections,
> > +	NFSD_MaxThreads,
> >  	NFSD_Filecache,
> >  	NFSD_Leasetime,
> >  	NFSD_Gracetime,
> > @@ -68,6 +69,7 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size);
> >  static ssize_t write_ports(struct file *file, char *buf, size_t size);
> >  static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
> >  static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
> > +static ssize_t write_maxthreads(struct file *file, char *buf, size_t size);
> >  #ifdef CONFIG_NFSD_V4
> >  static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
> >  static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
> > @@ -87,6 +89,7 @@ static ssize_t (*const write_op[])(struct file *, char *, size_t) = {
> >  	[NFSD_Ports] = write_ports,
> >  	[NFSD_MaxBlkSize] = write_maxblksize,
> >  	[NFSD_MaxConnections] = write_maxconn,
> > +	[NFSD_MaxThreads] = write_maxthreads,
> >  #ifdef CONFIG_NFSD_V4
> >  	[NFSD_Leasetime] = write_leasetime,
> >  	[NFSD_Gracetime] = write_gracetime,
> > @@ -939,6 +942,47 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
> >  	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn);
> >  }
> >  
> > +/*
> > + * write_maxthreads - Set or report the current max number threads
> > + *
> > + * Input:
> > + *			buf:		ignored
> > + *			size:		zero
> > + * OR
> > + *
> > + * Input:
> > + *			buf:		C string containing an unsigned
> > + *					integer value representing the new
> > + *					max number of threads
> > + *			size:		non-zero length of C string in @buf
> > + * Output:
> > + *	On success:	passed-in buffer filled with '\n'-terminated C string
> > + *			containing numeric value of max_threads setting
> > + *			for this net namespace;
> > + *			return code is the size in bytes of the string
> > + *	On error:	return code is zero or a negative errno value
> > + */
> > +static ssize_t write_maxthreads(struct file *file, char *buf, size_t size)
> > +{
> > +	char *mesg = buf;
> > +	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
> > +	unsigned int maxthreads = nn->max_threads;
> > +
> > +	if (size > 0) {
> > +		int rv = get_uint(&mesg, &maxthreads);
> > +
> > +		if (rv)
> > +			return rv;
> > +		trace_nfsd_ctl_maxthreads(netns(file), maxthreads);
> > +		mutex_lock(&nfsd_mutex);
> > +		nn->max_threads = maxthreads;
> > +		nfsd_update_nets();
> > +		mutex_unlock(&nfsd_mutex);
> > +	}
> > +
> > +	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxthreads);
> > +}
> > +
> >  #ifdef CONFIG_NFSD_V4
> >  static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
> >  				  time64_t *time, struct nfsd_net *nn)
> > @@ -1372,6 +1416,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
> >  		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
> >  		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
> >  		[NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
> > +		[NFSD_MaxThreads] = {"max_threads", &transaction_ops, S_IWUSR|S_IRUGO},
> >  		[NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO},
> >  #ifdef CONFIG_NFSD_V4
> >  		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
> > diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> > index e4c643255dc9..6874c2de670b 100644
> > --- a/fs/nfsd/nfsd.h
> > +++ b/fs/nfsd/nfsd.h
> > @@ -156,6 +156,10 @@ int nfsd_create_serv(struct net *net);
> >  void nfsd_destroy_serv(struct net *net);
> >  
> >  extern int nfsd_max_blksize;
> > +void nfsd_update_nets(void);
> > +extern unsigned int	nfsd_max_threads;
> > +extern unsigned long	nfsd_net_used;
> > +extern unsigned int	nfsd_net_cnt;
> >  
> >  static inline int nfsd_v4client(struct svc_rqst *rq)
> >  {
> > diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> > index b005b2e2e6ad..75d78c17756f 100644
> > --- a/fs/nfsd/nfssvc.c
> > +++ b/fs/nfsd/nfssvc.c
> > @@ -80,6 +80,21 @@ DEFINE_SPINLOCK(nfsd_drc_lock);
> >  unsigned long	nfsd_drc_max_mem;
> >  unsigned long	nfsd_drc_slotsize_sum;
> >  
> > +/*
> > + * nfsd_max_threads is auto-configured based on available ram.
> > + * Each network namespace can configure a minimum number of threads
> > + * and optionally a maximum.
> > + * nfsd_net_used is the number of the max or min from each net namespace.
> > + * nfsd_new_cnt is the number of net namespaces with a configured minimum
> > + *    but no configured maximum.
> > + * When nfsd_max_threads exceeds nfsd_net_used, the different is divided
> > + * by nfsd_net_cnt and this number gives the excess above the configured minimum
> > + * for all net namespaces without a configured maximum.
> > + */
> > +unsigned int	nfsd_max_threads;
> > +unsigned long	nfsd_net_used;
> > +unsigned int	nfsd_net_cnt;
> > +
> >  #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
> >  static const struct svc_version *nfsd_acl_version[] = {
> >  # if defined(CONFIG_NFSD_V2_ACL)
> > @@ -130,6 +145,47 @@ struct svc_program		nfsd_program = {
> >  	.pg_rpcbind_set		= nfsd_rpcbind_set,
> >  };
> >  
> > +void nfsd_update_nets(void)
> > +{
> > +	struct net *net;
> > +
> > +	if (nfsd_max_threads == 0) {
> > +		nfsd_max_threads = (nr_free_buffer_pages() >> 7) /
> > +			(NFSSVC_MAXBLKSIZE >> PAGE_SHIFT);
> > +	}
> > +	nfsd_net_used = 0;
> > +	nfsd_net_cnt = 0;
> > +	down_read(&net_rwsem);
> > +	for_each_net(net) {
> > +		struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +
> > +		if (!nn->nfsd_serv)
> > +			continue;
> > +		if (nn->max_threads > 0) {
> > +			nfsd_net_used += nn->max_threads;
> > +		} else {
> > +			nfsd_net_used += nn->nfsd_serv->sv_nrthreads;
> > +			nfsd_net_cnt += 1;
> > +		}
> > +	}
> > +	up_read(&net_rwsem);
> > +}
> > +
> > +static inline int nfsd_max_pool_threads(struct svc_pool *p, struct nfsd_net *nn)
> > +{
> > +	int svthreads = nn->nfsd_serv->sv_nrthreads;
> > +
> > +	if (nn->max_threads > 0)
> > +		return nn->max_threads;
> > +	if (nfsd_net_cnt == 0 || svthreads == 0)
> > +		return 0;
> > +	if (nfsd_max_threads < nfsd_net_cnt)
> > +		return p->sp_nrthreads;
> > +	/* Share nfsd_max_threads among all net, then among pools in this net. */
> > +	return p->sp_nrthreads +
> > +		nfsd_max_threads / nfsd_net_cnt * p->sp_nrthreads / svthreads;
> > +}
> > +
> >  bool nfsd_support_version(int vers)
> >  {
> >  	if (vers >= NFSD_MINVERS && vers <= NFSD_MAXVERS)
> > @@ -474,6 +530,7 @@ void nfsd_destroy_serv(struct net *net)
> >  	spin_lock(&nfsd_notifier_lock);
> >  	nn->nfsd_serv = NULL;
> >  	spin_unlock(&nfsd_notifier_lock);
> > +	nfsd_update_nets();
> >  
> >  	/* check if the notifier still has clients */
> >  	if (atomic_dec_return(&nfsd_notifier_refcount) == 0) {
> > @@ -614,6 +671,8 @@ int nfsd_create_serv(struct net *net)
> >  	nn->nfsd_serv = serv;
> >  	spin_unlock(&nfsd_notifier_lock);
> >  
> > +	nfsd_update_nets();
> > +
> >  	set_max_drc();
> >  	/* check if the notifier is already set */
> >  	if (atomic_inc_return(&nfsd_notifier_refcount) == 1) {
> > @@ -720,6 +779,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
> >  			goto out;
> >  	}
> >  out:
> > +	nfsd_update_nets();
> >  	return err;
> >  }
> >  
> > @@ -759,6 +819,7 @@ nfsd_svc(int n, int *nthreads, struct net *net, const struct cred *cred, const c
> >  	error = nfsd_set_nrthreads(n, nthreads, net);
> >  	if (error)
> >  		goto out_put;
> > +	nfsd_update_nets();
> >  	error = serv->sv_nrthreads;
> >  out_put:
> >  	if (serv->sv_nrthreads == 0)
> > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > index 77bbd23aa150..92b888e178e8 100644
> > --- a/fs/nfsd/trace.h
> > +++ b/fs/nfsd/trace.h
> > @@ -2054,6 +2054,25 @@ TRACE_EVENT(nfsd_ctl_maxconn,
> >  	)
> >  );
> >  
> > +TRACE_EVENT(nfsd_ctl_maxthreads,
> > +	TP_PROTO(
> > +		const struct net *net,
> > +		int maxthreads
> > +	),
> > +	TP_ARGS(net, maxthreads),
> > +	TP_STRUCT__entry(
> > +		__field(unsigned int, netns_ino)
> > +		__field(int, maxthreads)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->netns_ino = net->ns.inum;
> > +		__entry->maxthreads = maxthreads
> > +	),
> > +	TP_printk("maxthreads=%d",
> > +		__entry->maxthreads
> > +	)
> > +);
> > +
> >  TRACE_EVENT(nfsd_ctl_time,
> >  	TP_PROTO(
> >  		const struct net *net,
> 
> -- 
> Jeff Layton <jlayton@kernel.org>
>
Jeff Layton July 16, 2024, 11 a.m. UTC | #3
On Tue, 2024-07-16 at 13:21 +1000, NeilBrown wrote:
> On Tue, 16 Jul 2024, Jeff Layton wrote:
> > On Mon, 2024-07-15 at 17:14 +1000, NeilBrown wrote:
> > > A future patch will allow the number of threads in each nfsd pool to
> > > vary dynamically.
> > > The lower bound will be the number explicit requested via
> > > /proc/fs/nfsd/threads or /proc/fs/nfsd/pool_threads
> > > 
> > > The upper bound can be set in each net-namespace by writing
> > > /proc/fs/nfsd/max_threads.  This upper bound applies across all pools,
> > > there is no per-pool upper limit.
> > > 
> > > If no upper bound is set, then one is calculated.  A global upper limit
> > > is chosen based on amount of memory.  This limit only affects dynamic
> > > changes. Static configuration can always over-ride it.
> > > 
> > > We track how many threads are configured in each net namespace, with the
> > > max or the min.  We also track how many net namespaces have nfsd
> > > configured with only a min, not a max.
> > > 
> > > The difference between the calculated max and the total allocation is
> > > available to be shared among those namespaces which don't have a maximum
> > > configured.  Within a namespace, the available share is distributed
> > > equally across all pools.
> > > 
> > > In the common case there is one namespace and one pool.  A small number
> > > of threads are configured as a minimum and no maximum is set.  In this
> > > case the effective maximum will be directly based on total memory.
> > > Approximately 8 per gigabyte.
> > > 
> > 
> > 
> > Some of this may come across as bikeshedding, but I'd probably prefer
> > that this work a bit differently:
> > 
> > 1/ I don't think we should enable this universally -- at least not
> > initially. What I'd prefer to see is a new pool_mode for the dynamic
> > threadpools (maybe call it "dynamic"). That gives us a clear opt-in
> > mechanism. Later once we're convinced it's safe, we can make "dynamic"
> > the default instead of "global".
> > 
> > 2/ Rather than specifying a max_threads value separately, why not allow
> > the old threads/pool_threads interface to set the max and just have a
> > reasonable minimum setting (like the current default of 8). Since we're
> > growing the threadpool dynamically, I don't see why we need to have a
> > real configurable minimum.
> > 
> > 3/ the dynamic pool-mode should probably be layered on top of the
> > pernode pool mode. IOW, in a NUMA configuration, we should split the
> > threads across NUMA nodes.
> 
> Maybe we should start by discussing the goal.  What do we want
> configuration to look like when we finish?
> 
> I think we want it to be transparent.  Sysadmin does nothing, and it all
> works perfectly.  Or as close to that as we can get.
> 

That's a nice eventual goal, but what do we do if we make this change
and it's not behaving for them? We need some way for them to revert to
traditional behavior if the new mode isn't working well.

> So I think the "nproc" option to rpc.nfsd should eventually be ignored
> except for whether it is zero or non-zero.  If it is non-zero we start 1
> thread on each NUMA node and let that grow with limits imposed primarily
> by memory pressure.
> 
> We should probably change
> 
> #define SVC_POOL_DEFAULT	SVC_POOL_GLOBAL
> 
> to
> 
> #define SVC_POOL_DEFAULT	SVC_POOL_AUTO
> 
> about 10 years ago, but failing that, maybe change it tomorrow?
> 

At this point, I wouldn't change the defaults until we're ready to make
dynamic mode the default.

> I'm not sure how
>     /proc/fs/nfsd/{threads,pool_threads}
> should be handled. 
> 

Technically, I'm fine with _not_ handling them here. We do have the new
netlink interfaces that are better suited for this. We could make the
opt-in for dynamic mode contingent on using that somehow.

> Like you I don't think it is really useful to have
> a configured minimum but I don't want them to be imposed as a maximum
> because I want servers with the current default (of 8) to be able to
> start more new threads if necessary without needing a config change.
> Maybe that outcome can be delayed until rpc.nfsd is updated?
> 

That's a bit too aggressive for my tastes. I really do think we need to
allow people to opt in to this at first. Once we've grown comfortable
with how it all works, we can consider changing the default then.

> I don't really like the idea of a dynamic pool mode.  I want the pool to
> *always* be dynamic.
> 
> 

I think that's a good eventual goal, but I think we need to proceed
with caution. Given that this is all based around heuristics, we'll
need a way for people to revert to more traditional behavior if it's
not working well for them. Making this into a pool-mode and allowing
people to opt in initially seems like a simple way to do that.

I am fine with eventually discarding the pool-mode settings altogether
if we get dynamic mode working well enough. I'd just prefer a more
incremental approach to getting there.

> 
> > 
> > 
> > > Signed-off-by: NeilBrown <neilb@suse.de>
> > > ---
> > >  fs/nfsd/netns.h  |  6 +++++
> > >  fs/nfsd/nfsctl.c | 45 +++++++++++++++++++++++++++++++++++
> > >  fs/nfsd/nfsd.h   |  4 ++++
> > >  fs/nfsd/nfssvc.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/nfsd/trace.h  | 19 +++++++++++++++
> > >  5 files changed, 135 insertions(+)
> > > 
> > > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> > > index 0d2ac15a5003..329484696a42 100644
> > > --- a/fs/nfsd/netns.h
> > > +++ b/fs/nfsd/netns.h
> > > @@ -133,6 +133,12 @@ struct nfsd_net {
> > >  	 */
> > >  	unsigned int max_connections;
> > >  
> > > +	/*
> > > +	 * Maximum number of threads to auto-adjust up to.  If 0 then a
> > > +	 * share of nfsd_max_threads will be used.
> > > +	 */
> > > +	unsigned int max_threads;
> > > +
> > >  	u32 clientid_base;
> > >  	u32 clientid_counter;
> > >  	u32 clverifier_counter;
> > > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> > > index d85b6d1fa31f..37e9936567e9 100644
> > > --- a/fs/nfsd/nfsctl.c
> > > +++ b/fs/nfsd/nfsctl.c
> > > @@ -48,6 +48,7 @@ enum {
> > >  	NFSD_Ports,
> > >  	NFSD_MaxBlkSize,
> > >  	NFSD_MaxConnections,
> > > +	NFSD_MaxThreads,
> > >  	NFSD_Filecache,
> > >  	NFSD_Leasetime,
> > >  	NFSD_Gracetime,
> > > @@ -68,6 +69,7 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size);
> > >  static ssize_t write_ports(struct file *file, char *buf, size_t size);
> > >  static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
> > >  static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
> > > +static ssize_t write_maxthreads(struct file *file, char *buf, size_t size);
> > >  #ifdef CONFIG_NFSD_V4
> > >  static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
> > >  static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
> > > @@ -87,6 +89,7 @@ static ssize_t (*const write_op[])(struct file *, char *, size_t) = {
> > >  	[NFSD_Ports] = write_ports,
> > >  	[NFSD_MaxBlkSize] = write_maxblksize,
> > >  	[NFSD_MaxConnections] = write_maxconn,
> > > +	[NFSD_MaxThreads] = write_maxthreads,
> > >  #ifdef CONFIG_NFSD_V4
> > >  	[NFSD_Leasetime] = write_leasetime,
> > >  	[NFSD_Gracetime] = write_gracetime,
> > > @@ -939,6 +942,47 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
> > >  	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn);
> > >  }
> > >  
> > > +/*
> > > + * write_maxthreads - Set or report the current max number threads
> > > + *
> > > + * Input:
> > > + *			buf:		ignored
> > > + *			size:		zero
> > > + * OR
> > > + *
> > > + * Input:
> > > + *			buf:		C string containing an unsigned
> > > + *					integer value representing the new
> > > + *					max number of threads
> > > + *			size:		non-zero length of C string in @buf
> > > + * Output:
> > > + *	On success:	passed-in buffer filled with '\n'-terminated C string
> > > + *			containing numeric value of max_threads setting
> > > + *			for this net namespace;
> > > + *			return code is the size in bytes of the string
> > > + *	On error:	return code is zero or a negative errno value
> > > + */
> > > +static ssize_t write_maxthreads(struct file *file, char *buf, size_t size)
> > > +{
> > > +	char *mesg = buf;
> > > +	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
> > > +	unsigned int maxthreads = nn->max_threads;
> > > +
> > > +	if (size > 0) {
> > > +		int rv = get_uint(&mesg, &maxthreads);
> > > +
> > > +		if (rv)
> > > +			return rv;
> > > +		trace_nfsd_ctl_maxthreads(netns(file), maxthreads);
> > > +		mutex_lock(&nfsd_mutex);
> > > +		nn->max_threads = maxthreads;
> > > +		nfsd_update_nets();
> > > +		mutex_unlock(&nfsd_mutex);
> > > +	}
> > > +
> > > +	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxthreads);
> > > +}
> > > +
> > >  #ifdef CONFIG_NFSD_V4
> > >  static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
> > >  				  time64_t *time, struct nfsd_net *nn)
> > > @@ -1372,6 +1416,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
> > >  		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
> > >  		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
> > >  		[NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
> > > +		[NFSD_MaxThreads] = {"max_threads", &transaction_ops, S_IWUSR|S_IRUGO},
> > >  		[NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO},
> > >  #ifdef CONFIG_NFSD_V4
> > >  		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
> > > diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> > > index e4c643255dc9..6874c2de670b 100644
> > > --- a/fs/nfsd/nfsd.h
> > > +++ b/fs/nfsd/nfsd.h
> > > @@ -156,6 +156,10 @@ int nfsd_create_serv(struct net *net);
> > >  void nfsd_destroy_serv(struct net *net);
> > >  
> > >  extern int nfsd_max_blksize;
> > > +void nfsd_update_nets(void);
> > > +extern unsigned int	nfsd_max_threads;
> > > +extern unsigned long	nfsd_net_used;
> > > +extern unsigned int	nfsd_net_cnt;
> > >  
> > >  static inline int nfsd_v4client(struct svc_rqst *rq)
> > >  {
> > > diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> > > index b005b2e2e6ad..75d78c17756f 100644
> > > --- a/fs/nfsd/nfssvc.c
> > > +++ b/fs/nfsd/nfssvc.c
> > > @@ -80,6 +80,21 @@ DEFINE_SPINLOCK(nfsd_drc_lock);
> > >  unsigned long	nfsd_drc_max_mem;
> > >  unsigned long	nfsd_drc_slotsize_sum;
> > >  
> > > +/*
> > > + * nfsd_max_threads is auto-configured based on available ram.
> > > + * Each network namespace can configure a minimum number of threads
> > > + * and optionally a maximum.
> > > + * nfsd_net_used is the number of the max or min from each net namespace.
> > > + * nfsd_new_cnt is the number of net namespaces with a configured minimum
> > > + *    but no configured maximum.
> > > + * When nfsd_max_threads exceeds nfsd_net_used, the different is divided
> > > + * by nfsd_net_cnt and this number gives the excess above the configured minimum
> > > + * for all net namespaces without a configured maximum.
> > > + */
> > > +unsigned int	nfsd_max_threads;
> > > +unsigned long	nfsd_net_used;
> > > +unsigned int	nfsd_net_cnt;
> > > +
> > >  #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
> > >  static const struct svc_version *nfsd_acl_version[] = {
> > >  # if defined(CONFIG_NFSD_V2_ACL)
> > > @@ -130,6 +145,47 @@ struct svc_program		nfsd_program = {
> > >  	.pg_rpcbind_set		= nfsd_rpcbind_set,
> > >  };
> > >  
> > > +void nfsd_update_nets(void)
> > > +{
> > > +	struct net *net;
> > > +
> > > +	if (nfsd_max_threads == 0) {
> > > +		nfsd_max_threads = (nr_free_buffer_pages() >> 7) /
> > > +			(NFSSVC_MAXBLKSIZE >> PAGE_SHIFT);
> > > +	}
> > > +	nfsd_net_used = 0;
> > > +	nfsd_net_cnt = 0;
> > > +	down_read(&net_rwsem);
> > > +	for_each_net(net) {
> > > +		struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > > +
> > > +		if (!nn->nfsd_serv)
> > > +			continue;
> > > +		if (nn->max_threads > 0) {
> > > +			nfsd_net_used += nn->max_threads;
> > > +		} else {
> > > +			nfsd_net_used += nn->nfsd_serv->sv_nrthreads;
> > > +			nfsd_net_cnt += 1;
> > > +		}
> > > +	}
> > > +	up_read(&net_rwsem);
> > > +}
> > > +
> > > +static inline int nfsd_max_pool_threads(struct svc_pool *p, struct nfsd_net *nn)
> > > +{
> > > +	int svthreads = nn->nfsd_serv->sv_nrthreads;
> > > +
> > > +	if (nn->max_threads > 0)
> > > +		return nn->max_threads;
> > > +	if (nfsd_net_cnt == 0 || svthreads == 0)
> > > +		return 0;
> > > +	if (nfsd_max_threads < nfsd_net_cnt)
> > > +		return p->sp_nrthreads;
> > > +	/* Share nfsd_max_threads among all net, then among pools in this net. */
> > > +	return p->sp_nrthreads +
> > > +		nfsd_max_threads / nfsd_net_cnt * p->sp_nrthreads / svthreads;
> > > +}
> > > +
> > >  bool nfsd_support_version(int vers)
> > >  {
> > >  	if (vers >= NFSD_MINVERS && vers <= NFSD_MAXVERS)
> > > @@ -474,6 +530,7 @@ void nfsd_destroy_serv(struct net *net)
> > >  	spin_lock(&nfsd_notifier_lock);
> > >  	nn->nfsd_serv = NULL;
> > >  	spin_unlock(&nfsd_notifier_lock);
> > > +	nfsd_update_nets();
> > >  
> > >  	/* check if the notifier still has clients */
> > >  	if (atomic_dec_return(&nfsd_notifier_refcount) == 0) {
> > > @@ -614,6 +671,8 @@ int nfsd_create_serv(struct net *net)
> > >  	nn->nfsd_serv = serv;
> > >  	spin_unlock(&nfsd_notifier_lock);
> > >  
> > > +	nfsd_update_nets();
> > > +
> > >  	set_max_drc();
> > >  	/* check if the notifier is already set */
> > >  	if (atomic_inc_return(&nfsd_notifier_refcount) == 1) {
> > > @@ -720,6 +779,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
> > >  			goto out;
> > >  	}
> > >  out:
> > > +	nfsd_update_nets();
> > >  	return err;
> > >  }
> > >  
> > > @@ -759,6 +819,7 @@ nfsd_svc(int n, int *nthreads, struct net *net, const struct cred *cred, const c
> > >  	error = nfsd_set_nrthreads(n, nthreads, net);
> > >  	if (error)
> > >  		goto out_put;
> > > +	nfsd_update_nets();
> > >  	error = serv->sv_nrthreads;
> > >  out_put:
> > >  	if (serv->sv_nrthreads == 0)
> > > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > > index 77bbd23aa150..92b888e178e8 100644
> > > --- a/fs/nfsd/trace.h
> > > +++ b/fs/nfsd/trace.h
> > > @@ -2054,6 +2054,25 @@ TRACE_EVENT(nfsd_ctl_maxconn,
> > >  	)
> > >  );
> > >  
> > > +TRACE_EVENT(nfsd_ctl_maxthreads,
> > > +	TP_PROTO(
> > > +		const struct net *net,
> > > +		int maxthreads
> > > +	),
> > > +	TP_ARGS(net, maxthreads),
> > > +	TP_STRUCT__entry(
> > > +		__field(unsigned int, netns_ino)
> > > +		__field(int, maxthreads)
> > > +	),
> > > +	TP_fast_assign(
> > > +		__entry->netns_ino = net->ns.inum;
> > > +		__entry->maxthreads = maxthreads
> > > +	),
> > > +	TP_printk("maxthreads=%d",
> > > +		__entry->maxthreads
> > > +	)
> > > +);
> > > +
> > >  TRACE_EVENT(nfsd_ctl_time,
> > >  	TP_PROTO(
> > >  		const struct net *net,
> > 
> > -- 
> > Jeff Layton <jlayton@kernel.org>
> > 
>
Chuck Lever July 16, 2024, 1:31 p.m. UTC | #4
> On Jul 16, 2024, at 7:00 AM, Jeff Layton <jlayton@kernel.org> wrote:
> 
> On Tue, 2024-07-16 at 13:21 +1000, NeilBrown wrote:
>> On Tue, 16 Jul 2024, Jeff Layton wrote:
>>> On Mon, 2024-07-15 at 17:14 +1000, NeilBrown wrote:
>>>> A future patch will allow the number of threads in each nfsd pool to
>>>> vary dynamically.
>>>> The lower bound will be the number explicit requested via
>>>> /proc/fs/nfsd/threads or /proc/fs/nfsd/pool_threads
>>>> 
>>>> The upper bound can be set in each net-namespace by writing
>>>> /proc/fs/nfsd/max_threads.  This upper bound applies across all pools,
>>>> there is no per-pool upper limit.
>>>> 
>>>> If no upper bound is set, then one is calculated.  A global upper limit
>>>> is chosen based on amount of memory.  This limit only affects dynamic
>>>> changes. Static configuration can always over-ride it.
>>>> 
>>>> We track how many threads are configured in each net namespace, with the
>>>> max or the min.  We also track how many net namespaces have nfsd
>>>> configured with only a min, not a max.
>>>> 
>>>> The difference between the calculated max and the total allocation is
>>>> available to be shared among those namespaces which don't have a maximum
>>>> configured.  Within a namespace, the available share is distributed
>>>> equally across all pools.
>>>> 
>>>> In the common case there is one namespace and one pool.  A small number
>>>> of threads are configured as a minimum and no maximum is set.  In this
>>>> case the effective maximum will be directly based on total memory.
>>>> Approximately 8 per gigabyte.
>>>> 
>>> 
>>> 
>>> Some of this may come across as bikeshedding, but I'd probably prefer
>>> that this work a bit differently:
>>> 
>>> 1/ I don't think we should enable this universally -- at least not
>>> initially. What I'd prefer to see is a new pool_mode for the dynamic
>>> threadpools (maybe call it "dynamic"). That gives us a clear opt-in
>>> mechanism. Later once we're convinced it's safe, we can make "dynamic"
>>> the default instead of "global".
>>> 
>>> 2/ Rather than specifying a max_threads value separately, why not allow
>>> the old threads/pool_threads interface to set the max and just have a
>>> reasonable minimum setting (like the current default of 8). Since we're
>>> growing the threadpool dynamically, I don't see why we need to have a
>>> real configurable minimum.
>>> 
>>> 3/ the dynamic pool-mode should probably be layered on top of the
>>> pernode pool mode. IOW, in a NUMA configuration, we should split the
>>> threads across NUMA nodes.
>> 
>> Maybe we should start by discussing the goal.  What do we want
>> configuration to look like when we finish?
>> 
>> I think we want it to be transparent.  Sysadmin does nothing, and it all
>> works perfectly.  Or as close to that as we can get.
>> 
> 
> That's a nice eventual goal, but what do we do if we make this change
> and it's not behaving for them? We need some way for them to revert to
> traditional behavior if the new mode isn't working well.

As Steve pointed out (privately), there are likely to be cases
where the dynamic thread count adjustment creates too many
threads or somehow triggers a DoS. Admins want the ability to
disable new features that cause trouble, and it is impossible
for us to say truthfully that we have predicted every
misbehavior.

So +1 for having a mechanism for getting back the traditional
behavior, at least until we have confidence it is not going
to have troubling side-effects.

Yes, in a perfect world, fully autonomous thread count
adjustment would be amazing. Let's aim for that, but take
baby steps to get there.

--
Chuck Lever
Tom Talpey July 16, 2024, 6:49 p.m. UTC | #5
On 7/16/2024 9:31 AM, Chuck Lever III wrote:
> 
> 
>> On Jul 16, 2024, at 7:00 AM, Jeff Layton <jlayton@kernel.org> wrote:
>>
>> On Tue, 2024-07-16 at 13:21 +1000, NeilBrown wrote:
>>> On Tue, 16 Jul 2024, Jeff Layton wrote:
>>>> On Mon, 2024-07-15 at 17:14 +1000, NeilBrown wrote:
>>>>> A future patch will allow the number of threads in each nfsd pool to
>>>>> vary dynamically.
>>>>> The lower bound will be the number explicit requested via
>>>>> /proc/fs/nfsd/threads or /proc/fs/nfsd/pool_threads
>>>>>
>>>>> The upper bound can be set in each net-namespace by writing
>>>>> /proc/fs/nfsd/max_threads.  This upper bound applies across all pools,
>>>>> there is no per-pool upper limit.
>>>>>
>>>>> If no upper bound is set, then one is calculated.  A global upper limit
>>>>> is chosen based on amount of memory.  This limit only affects dynamic
>>>>> changes. Static configuration can always over-ride it.
>>>>>
>>>>> We track how many threads are configured in each net namespace, with the
>>>>> max or the min.  We also track how many net namespaces have nfsd
>>>>> configured with only a min, not a max.
>>>>>
>>>>> The difference between the calculated max and the total allocation is
>>>>> available to be shared among those namespaces which don't have a maximum
>>>>> configured.  Within a namespace, the available share is distributed
>>>>> equally across all pools.
>>>>>
>>>>> In the common case there is one namespace and one pool.  A small number
>>>>> of threads are configured as a minimum and no maximum is set.  In this
>>>>> case the effective maximum will be directly based on total memory.
>>>>> Approximately 8 per gigabyte.
>>>>>
>>>>
>>>>
>>>> Some of this may come across as bikeshedding, but I'd probably prefer
>>>> that this work a bit differently:
>>>>
>>>> 1/ I don't think we should enable this universally -- at least not
>>>> initially. What I'd prefer to see is a new pool_mode for the dynamic
>>>> threadpools (maybe call it "dynamic"). That gives us a clear opt-in
>>>> mechanism. Later once we're convinced it's safe, we can make "dynamic"
>>>> the default instead of "global".
>>>>
>>>> 2/ Rather than specifying a max_threads value separately, why not allow
>>>> the old threads/pool_threads interface to set the max and just have a
>>>> reasonable minimum setting (like the current default of 8). Since we're
>>>> growing the threadpool dynamically, I don't see why we need to have a
>>>> real configurable minimum.
>>>>
>>>> 3/ the dynamic pool-mode should probably be layered on top of the
>>>> pernode pool mode. IOW, in a NUMA configuration, we should split the
>>>> threads across NUMA nodes.
>>>
>>> Maybe we should start by discussing the goal.  What do we want
>>> configuration to look like when we finish?
>>>
>>> I think we want it to be transparent.  Sysadmin does nothing, and it all
>>> works perfectly.  Or as close to that as we can get.
>>>
>>
>> That's a nice eventual goal, but what do we do if we make this change
>> and it's not behaving for them? We need some way for them to revert to
>> traditional behavior if the new mode isn't working well.
> 
> As Steve pointed out (privately) there are likely to be cases
> where the dynamic thread count adjustment creates too many
> threads or somehow triggers a DoS. Admins want the ability to
> disable new features that cause trouble, and it is impossible
> for us to to say truthfully that we have predicted every
> misbehavior.
> 
> So +1 for having a mechanism for getting back the traditional
> behavior, at least until we have confidence it is not going
> to have troubling side-effects.

+1 on a configurable maximum as well, but I'll add a concern about
the NUMA node thing.

Not all CPU cores are created equal any more: there are "performance"
and "efficiency" (Atom) cores, and there can be a big difference. Also
there are NUMA nodes with no CPUs at all, memory-only for example.
Then, CXL scrambles the topology again.

Let's not forget that these nfsd threads call into the filesystems,
which may desire very different NUMA affinities; for example, the nfsd
protocol side may prefer to be near the network adapter, while the
filesystem side prefers the storage. And RDMA can bypass memory copy costs.

Thread count only addresses a fraction of these.

> Yes, in a perfect world, fully autonomous thread count
> adjustment would be amazing. Let's aim for that, but take
> baby steps to get there.

Amazing indeed, and just as unlikely to be perfect. Caution is good.

Tom.
Chuck Lever July 17, 2024, 3:24 p.m. UTC | #6
> On Jul 16, 2024, at 2:49 PM, Tom Talpey <tom@talpey.com> wrote:
> 
> On 7/16/2024 9:31 AM, Chuck Lever III wrote:
>>> On Jul 16, 2024, at 7:00 AM, Jeff Layton <jlayton@kernel.org> wrote:
>>> 
>>> On Tue, 2024-07-16 at 13:21 +1000, NeilBrown wrote:
>>>> On Tue, 16 Jul 2024, Jeff Layton wrote:
>>>>> On Mon, 2024-07-15 at 17:14 +1000, NeilBrown wrote:
>>>>>> A future patch will allow the number of threads in each nfsd pool to
>>>>>> vary dynamically.
>>>>>> The lower bound will be the number explicit requested via
>>>>>> /proc/fs/nfsd/threads or /proc/fs/nfsd/pool_threads
>>>>>> 
>>>>>> The upper bound can be set in each net-namespace by writing
>>>>>> /proc/fs/nfsd/max_threads.  This upper bound applies across all pools,
>>>>>> there is no per-pool upper limit.
>>>>>> 
>>>>>> If no upper bound is set, then one is calculated.  A global upper limit
>>>>>> is chosen based on amount of memory.  This limit only affects dynamic
>>>>>> changes. Static configuration can always over-ride it.
>>>>>> 
>>>>>> We track how many threads are configured in each net namespace, with the
>>>>>> max or the min.  We also track how many net namespaces have nfsd
>>>>>> configured with only a min, not a max.
>>>>>> 
>>>>>> The difference between the calculated max and the total allocation is
>>>>>> available to be shared among those namespaces which don't have a maximum
>>>>>> configured.  Within a namespace, the available share is distributed
>>>>>> equally across all pools.
>>>>>> 
>>>>>> In the common case there is one namespace and one pool.  A small number
>>>>>> of threads are configured as a minimum and no maximum is set.  In this
>>>>>> case the effective maximum will be directly based on total memory.
>>>>>> Approximately 8 per gigabyte.
>>>>>> 
>>>>> 
>>>>> 
>>>>> Some of this may come across as bikeshedding, but I'd probably prefer
>>>>> that this work a bit differently:
>>>>> 
>>>>> 1/ I don't think we should enable this universally -- at least not
>>>>> initially. What I'd prefer to see is a new pool_mode for the dynamic
>>>>> threadpools (maybe call it "dynamic"). That gives us a clear opt-in
>>>>> mechanism. Later once we're convinced it's safe, we can make "dynamic"
>>>>> the default instead of "global".
>>>>> 
>>>>> 2/ Rather than specifying a max_threads value separately, why not allow
>>>>> the old threads/pool_threads interface to set the max and just have a
>>>>> reasonable minimum setting (like the current default of 8). Since we're
>>>>> growing the threadpool dynamically, I don't see why we need to have a
>>>>> real configurable minimum.
>>>>> 
>>>>> 3/ the dynamic pool-mode should probably be layered on top of the
>>>>> pernode pool mode. IOW, in a NUMA configuration, we should split the
>>>>> threads across NUMA nodes.
>>>> 
>>>> Maybe we should start by discussing the goal.  What do we want
>>>> configuration to look like when we finish?
>>>> 
>>>> I think we want it to be transparent.  Sysadmin does nothing, and it all
>>>> works perfectly.  Or as close to that as we can get.
>>>> 
>>> 
>>> That's a nice eventual goal, but what do we do if we make this change
>>> and it's not behaving for them? We need some way for them to revert to
>>> traditional behavior if the new mode isn't working well.
>> As Steve pointed out (privately) there are likely to be cases
>> where the dynamic thread count adjustment creates too many
>> threads or somehow triggers a DoS. Admins want the ability to
>> disable new features that cause trouble, and it is impossible
>> for us to to say truthfully that we have predicted every
>> misbehavior.
>> So +1 for having a mechanism for getting back the traditional
>> behavior, at least until we have confidence it is not going
>> to have troubling side-effects.
> 
> +1 on a configurable maximum as well, but I'll add a concern about
> the NUMA node thing.
> 
> Not all CPU cores are created equal any more, there are "performance"
> and "efficiency" (Atom) cores and there can be a big difference. Also
> there are NUMA nodes with no CPUs at all, memory-only for example.
> Then, CXL scrambles the topology again.

I think it wouldn't be difficult to make the svc_pool_map skip
creating svc thread pools on NUMA nodes with no CPUs. And perhaps
the min/max settings need to be per pool?

But the idea with dynamic thread pool sizing is that if a pool
(or node) is not getting NFS traffic, then its thread pool will
not grow.
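
A rough sketch of the node-skipping idea (hypothetical and untested;
it assumes the pool map setup iterates nodes along these lines):

	for_each_online_node(node) {
		/* Memory-only node: no CPUs, so no thread pool. */
		if (!nr_cpus_node(node))
			continue;
		/* ... set up an svc_pool for this node as usual ... */
	}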


> Let's not forget that these nfsd threads call into the filesystems,
> which may desire very different NUMA affinities, for example the nfsd
> protocol side may prefer to be near the network adapter, while the
> filesystem side, the storage. And RDMA can bypass memory copy costs.

Agreed, these issues still require administrator attention when
configuring a high performance system.


> Thread count only addresses a fraction of these.
> 
>> Yes, in a perfect world, fully autonomous thread count
>> adjustment would be amazing. Let's aim for that, but take
>> baby steps to get there.
> 
> Amazing indeed, and just as unlikely to be perfect. Caution is good.
> 
> Tom.


--
Chuck Lever

Patch

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 0d2ac15a5003..329484696a42 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -133,6 +133,12 @@  struct nfsd_net {
 	 */
 	unsigned int max_connections;
 
+	/*
+	 * Maximum number of threads to auto-adjust up to.  If 0 then a
+	 * share of nfsd_max_threads will be used.
+	 */
+	unsigned int max_threads;
+
 	u32 clientid_base;
 	u32 clientid_counter;
 	u32 clverifier_counter;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index d85b6d1fa31f..37e9936567e9 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -48,6 +48,7 @@  enum {
 	NFSD_Ports,
 	NFSD_MaxBlkSize,
 	NFSD_MaxConnections,
+	NFSD_MaxThreads,
 	NFSD_Filecache,
 	NFSD_Leasetime,
 	NFSD_Gracetime,
@@ -68,6 +69,7 @@  static ssize_t write_versions(struct file *file, char *buf, size_t size);
 static ssize_t write_ports(struct file *file, char *buf, size_t size);
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
 static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
+static ssize_t write_maxthreads(struct file *file, char *buf, size_t size);
 #ifdef CONFIG_NFSD_V4
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
 static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
@@ -87,6 +89,7 @@  static ssize_t (*const write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Ports] = write_ports,
 	[NFSD_MaxBlkSize] = write_maxblksize,
 	[NFSD_MaxConnections] = write_maxconn,
+	[NFSD_MaxThreads] = write_maxthreads,
 #ifdef CONFIG_NFSD_V4
 	[NFSD_Leasetime] = write_leasetime,
 	[NFSD_Gracetime] = write_gracetime,
@@ -939,6 +942,47 @@  static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
 	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn);
 }
 
+/*
+ * write_maxthreads - Set or report the current max number of threads
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing the new
+ *					max number of threads
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C string
+ *			containing numeric value of max_threads setting
+ *			for this net namespace;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
+static ssize_t write_maxthreads(struct file *file, char *buf, size_t size)
+{
+	char *mesg = buf;
+	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
+	unsigned int maxthreads = nn->max_threads;
+
+	if (size > 0) {
+		int rv = get_uint(&mesg, &maxthreads);
+
+		if (rv)
+			return rv;
+		trace_nfsd_ctl_maxthreads(netns(file), maxthreads);
+		mutex_lock(&nfsd_mutex);
+		nn->max_threads = maxthreads;
+		nfsd_update_nets();
+		mutex_unlock(&nfsd_mutex);
+	}
+
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxthreads);
+}
+
 #ifdef CONFIG_NFSD_V4
 static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
 				  time64_t *time, struct nfsd_net *nn)
@@ -1372,6 +1416,7 @@  static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
 		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
+		[NFSD_MaxThreads] = {"max_threads", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO},
 #ifdef CONFIG_NFSD_V4
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index e4c643255dc9..6874c2de670b 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -156,6 +156,10 @@  int nfsd_create_serv(struct net *net);
 void nfsd_destroy_serv(struct net *net);
 
 extern int nfsd_max_blksize;
+void nfsd_update_nets(void);
+extern unsigned int	nfsd_max_threads;
+extern unsigned long	nfsd_net_used;
+extern unsigned int	nfsd_net_cnt;
 
 static inline int nfsd_v4client(struct svc_rqst *rq)
 {
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index b005b2e2e6ad..75d78c17756f 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -80,6 +80,21 @@  DEFINE_SPINLOCK(nfsd_drc_lock);
 unsigned long	nfsd_drc_max_mem;
 unsigned long	nfsd_drc_slotsize_sum;
 
+/*
+ * nfsd_max_threads is auto-configured based on available RAM.
+ * Each network namespace can configure a minimum number of threads
+ * and optionally a maximum.
+ * nfsd_net_used is the sum of the max or min from each net namespace.
+ * nfsd_net_cnt is the number of net namespaces with a configured minimum
+ *    but no configured maximum.
+ * When nfsd_max_threads exceeds nfsd_net_used, the difference is divided
+ * by nfsd_net_cnt and this number gives the excess above the configured minimum
+ * for all net namespaces without a configured maximum.
+ */
+unsigned int	nfsd_max_threads;
+unsigned long	nfsd_net_used;
+unsigned int	nfsd_net_cnt;
+
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 static const struct svc_version *nfsd_acl_version[] = {
 # if defined(CONFIG_NFSD_V2_ACL)
@@ -130,6 +145,47 @@  struct svc_program		nfsd_program = {
 	.pg_rpcbind_set		= nfsd_rpcbind_set,
 };
 
+void nfsd_update_nets(void)
+{
+	struct net *net;
+
+	if (nfsd_max_threads == 0) {
+		nfsd_max_threads = (nr_free_buffer_pages() >> 7) /
+			(NFSSVC_MAXBLKSIZE >> PAGE_SHIFT);
+	}
+	nfsd_net_used = 0;
+	nfsd_net_cnt = 0;
+	down_read(&net_rwsem);
+	for_each_net(net) {
+		struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+		if (!nn->nfsd_serv)
+			continue;
+		if (nn->max_threads > 0) {
+			nfsd_net_used += nn->max_threads;
+		} else {
+			nfsd_net_used += nn->nfsd_serv->sv_nrthreads;
+			nfsd_net_cnt += 1;
+		}
+	}
+	up_read(&net_rwsem);
+}
+
+static inline int nfsd_max_pool_threads(struct svc_pool *p, struct nfsd_net *nn)
+{
+	int svthreads = nn->nfsd_serv->sv_nrthreads;
+
+	if (nn->max_threads > 0)
+		return nn->max_threads;
+	if (nfsd_net_cnt == 0 || svthreads == 0)
+		return 0;
+	if (nfsd_max_threads < nfsd_net_cnt)
+		return p->sp_nrthreads;
+	/* Share nfsd_max_threads among all nets, then among pools in this net. */
+	return p->sp_nrthreads +
+		nfsd_max_threads / nfsd_net_cnt * p->sp_nrthreads / svthreads;
+}
+
 bool nfsd_support_version(int vers)
 {
 	if (vers >= NFSD_MINVERS && vers <= NFSD_MAXVERS)
@@ -474,6 +530,7 @@  void nfsd_destroy_serv(struct net *net)
 	spin_lock(&nfsd_notifier_lock);
 	nn->nfsd_serv = NULL;
 	spin_unlock(&nfsd_notifier_lock);
+	nfsd_update_nets();
 
 	/* check if the notifier still has clients */
 	if (atomic_dec_return(&nfsd_notifier_refcount) == 0) {
@@ -614,6 +671,8 @@  int nfsd_create_serv(struct net *net)
 	nn->nfsd_serv = serv;
 	spin_unlock(&nfsd_notifier_lock);
 
+	nfsd_update_nets();
+
 	set_max_drc();
 	/* check if the notifier is already set */
 	if (atomic_inc_return(&nfsd_notifier_refcount) == 1) {
@@ -720,6 +779,7 @@  int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
 			goto out;
 	}
 out:
+	nfsd_update_nets();
 	return err;
 }
 
@@ -759,6 +819,7 @@  nfsd_svc(int n, int *nthreads, struct net *net, const struct cred *cred, const c
 	error = nfsd_set_nrthreads(n, nthreads, net);
 	if (error)
 		goto out_put;
+	nfsd_update_nets();
 	error = serv->sv_nrthreads;
 out_put:
 	if (serv->sv_nrthreads == 0)
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 77bbd23aa150..92b888e178e8 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -2054,6 +2054,25 @@  TRACE_EVENT(nfsd_ctl_maxconn,
 	)
 );
 
+TRACE_EVENT(nfsd_ctl_maxthreads,
+	TP_PROTO(
+		const struct net *net,
+		int maxthreads
+	),
+	TP_ARGS(net, maxthreads),
+	TP_STRUCT__entry(
+		__field(unsigned int, netns_ino)
+		__field(int, maxthreads)
+	),
+	TP_fast_assign(
+		__entry->netns_ino = net->ns.inum;
+		__entry->maxthreads = maxthreads
+	),
+	TP_printk("maxthreads=%d",
+		__entry->maxthreads
+	)
+);
+
 TRACE_EVENT(nfsd_ctl_time,
 	TP_PROTO(
 		const struct net *net,