diff mbox series

[07/14] Change unshare_fs_struct() to never fail.

Message ID 20240715074657.18174-8-neilb@suse.de (mailing list archive)
State New
Headers show
Series support automatic changes to nfsd thread count | expand

Commit Message

NeilBrown July 15, 2024, 7:14 a.m. UTC
nfsd threads must not share the init fs_struct because they need to
manipulate umask independently.  So they call unshare_fs_struct() and
are the only users of that function.

In the unlikely event that unshare_fs_struct() fails, the thread will
exit calling svc_exit_thread() BEFORE svc_thread_should_stop() reports
'true'.

This is a problem because svc_exit_thread() assumes that
svc_stop_threads() is running and consequently (in the nfsd case)
nfsd_mutex is held.  This ensures that the list_del_rcu() call in
svc_exit_thread() cannot race with any other manipulation of
->sp_all_threads.

While it would be possible to add some other exclusion, doing so would
introduce unnecessary complexity.  unshare_fs_struct() does not fail in
practice.  So the simplest solution is to make this explicit.  i.e.  use
__GFP_NOFAIL which is safe on such a small allocation - about 64 bytes.

Change unshare_fs_struct() to not return any error, and remove the error
handling from nfsd().

An alternate approach would be to create a variant of
kthread_create_on_node() which didn't set CLONE_FS.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/fs_struct.c            | 42 ++++++++++++++++++++-------------------
 fs/nfsd/nfssvc.c          |  9 +++------
 include/linux/fs_struct.h |  2 +-
 3 files changed, 26 insertions(+), 27 deletions(-)

Comments

Jeff Layton July 15, 2024, 2:39 p.m. UTC | #1
On Mon, 2024-07-15 at 17:14 +1000, NeilBrown wrote:
> nfsd threads need to not share the init fs_struct as they need to
> manipulate umask independently.  So they call unshare_fs_struct() and
> are the only user of that function.
> 
> In the unlikely event that unshare_fs_struct() fails, the thread will
> exit calling svc_exit_thread() BEFORE svc_thread_should_stop() reports
> 'true'.
> 
> This is a problem because svc_exit_thread() assumes that
> svc_stop_threads() is running and consequently (in the nfsd case)
> nfsd_mutex is held.  This ensures that the list_del_rcu() call in
> svc_exit_thread() cannot race with any other manipulation of
> ->sp_all_threads.
> 
> While it would be possible to add some other exclusion, doing so would
> introduce unnecessary complexity.  unshare_fs_struct() does not fail in
> practice.  So the simplest solution is to make this explicit.  i.e.  use
> __GFP_NOFAIL which is safe on such a small allocation - about 64 bytes.
> 

I know some folks are trying hard to get rid of (or minimize the use
of) __GFP_NOFAIL. This might not be a long term solution.

> Change unshare_fs_struct() to not return any error, and remove the error
> handling from nfsd().
> 
> An alternate approach would be to create a variant of
> kthread_create_on_node() which didn't set CLONE_FS.
> 

This sounds like it might be the better approach. I guess you could
just add a set of CLONE_* flags to struct kthread_create_info and fix
up the callers to set that appropriately?

> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>  fs/fs_struct.c            | 42 ++++++++++++++++++++-------------------
>  fs/nfsd/nfssvc.c          |  9 +++------
>  include/linux/fs_struct.h |  2 +-
>  3 files changed, 26 insertions(+), 27 deletions(-)
> 
> diff --git a/fs/fs_struct.c b/fs/fs_struct.c
> index 64c2d0814ed6..49fba862e408 100644
> --- a/fs/fs_struct.c
> +++ b/fs/fs_struct.c
> @@ -109,35 +109,39 @@ void exit_fs(struct task_struct *tsk)
>  	}
>  }
>  
> +static void init_fs_struct(struct fs_struct *fs, struct fs_struct *old)
> +{
> +	fs->users = 1;
> +	fs->in_exec = 0;
> +	spin_lock_init(&fs->lock);
> +	seqcount_spinlock_init(&fs->seq, &fs->lock);
> +	fs->umask = old->umask;
> +
> +	spin_lock(&old->lock);
> +	fs->root = old->root;
> +	path_get(&fs->root);
> +	fs->pwd = old->pwd;
> +	path_get(&fs->pwd);
> +	spin_unlock(&old->lock);
> +}
> +
>  struct fs_struct *copy_fs_struct(struct fs_struct *old)
>  {
>  	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
>  	/* We don't need to lock fs - think why ;-) */
> -	if (fs) {
> -		fs->users = 1;
> -		fs->in_exec = 0;
> -		spin_lock_init(&fs->lock);
> -		seqcount_spinlock_init(&fs->seq, &fs->lock);
> -		fs->umask = old->umask;
> -
> -		spin_lock(&old->lock);
> -		fs->root = old->root;
> -		path_get(&fs->root);
> -		fs->pwd = old->pwd;
> -		path_get(&fs->pwd);
> -		spin_unlock(&old->lock);
> -	}
> +	if (fs)
> +		init_fs_struct(fs, old);
>  	return fs;
>  }
>  
> -int unshare_fs_struct(void)
> +void unshare_fs_struct(void)
>  {
>  	struct fs_struct *fs = current->fs;
> -	struct fs_struct *new_fs = copy_fs_struct(fs);
> +	struct fs_struct *new_fs = kmem_cache_alloc(fs_cachep,
> +						    GFP_KERNEL| __GFP_NOFAIL);
>  	int kill;
>  
> -	if (!new_fs)
> -		return -ENOMEM;
> +	init_fs_struct(new_fs, fs);
>  
>  	task_lock(current);
>  	spin_lock(&fs->lock);
> @@ -148,8 +152,6 @@ int unshare_fs_struct(void)
>  
>  	if (kill)
>  		free_fs_struct(fs);
> -
> -	return 0;
>  }
>  EXPORT_SYMBOL_GPL(unshare_fs_struct);
>  
> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> index 7377422a34df..f5de04a63c6f 100644
> --- a/fs/nfsd/nfssvc.c
> +++ b/fs/nfsd/nfssvc.c
> @@ -873,11 +873,9 @@ nfsd(void *vrqstp)
>  
>  	/* At this point, the thread shares current->fs
>  	 * with the init process. We need to create files with the
> -	 * umask as defined by the client instead of init's umask. */
> -	if (unshare_fs_struct() < 0) {
> -		printk("Unable to start nfsd thread: out of memory\n");
> -		goto out;
> -	}
> +	 * umask as defined by the client instead of init's umask.
> +	 */
> +	unshare_fs_struct();
>  
>  	current->fs->umask = 0;
>  
> @@ -899,7 +897,6 @@ nfsd(void *vrqstp)
>  
>  	atomic_dec(&nfsd_th_cnt);
>  
> -out:
>  	/* Release the thread */
>  	svc_exit_thread(rqstp);
>  	return 0;
> diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
> index 783b48dedb72..8282e6c7ff29 100644
> --- a/include/linux/fs_struct.h
> +++ b/include/linux/fs_struct.h
> @@ -22,7 +22,7 @@ extern void set_fs_root(struct fs_struct *, const struct path *);
>  extern void set_fs_pwd(struct fs_struct *, const struct path *);
>  extern struct fs_struct *copy_fs_struct(struct fs_struct *);
>  extern void free_fs_struct(struct fs_struct *);
> -extern int unshare_fs_struct(void);
> +extern void unshare_fs_struct(void);
>  
>  static inline void get_fs_root(struct fs_struct *fs, struct path *root)
>  {
NeilBrown July 16, 2024, 1:48 a.m. UTC | #2
On Tue, 16 Jul 2024, Jeff Layton wrote:
> On Mon, 2024-07-15 at 17:14 +1000, NeilBrown wrote:
> > nfsd threads need to not share the init fs_struct as they need to
> > manipulate umask independently.  So they call unshare_fs_struct() and
> > are the only user of that function.
> > 
> > In the unlikely event that unshare_fs_struct() fails, the thread will
> > exit calling svc_exit_thread() BEFORE svc_thread_should_stop() reports
> > 'true'.
> > 
> > This is a problem because svc_exit_thread() assumes that
> > svc_stop_threads() is running and consequently (in the nfsd case)
> > nfsd_mutex is held.  This ensures that the list_del_rcu() call in
> > svc_exit_thread() cannot race with any other manipulation of
> > ->sp_all_threads.
> > 
> > While it would be possible to add some other exclusion, doing so would
> > introduce unnecessary complexity.  unshare_fs_struct() does not fail in
> > practice.  So the simplest solution is to make this explicit.  i.e.  use
> > __GFP_NOFAIL which is safe on such a small allocation - about 64 bytes.
> > 
> 
> I know some folks are trying hard to get rid of (or minimize the use
> of) __GFP_NOFAIL. This might not be a long term solution.

Other folk are trying to make NOFAIL a standard option.

See
  https://lore.kernel.org/all/22363d0a-71db-4ba7-b5e1-8bb515811d1c@moroto.mountain/
and surrounding.  In that email Dan suggests GFP_SMALL as a standard
option that is used for smallish allocations and never fails (and warns
if the allocation is bigger than X).

Also
  https://lwn.net/Articles/964793/

> 
> > Change unshare_fs_struct() to not return any error, and remove the error
> > handling from nfsd().
> > 
> > An alternate approach would be to create a variant of
> > kthread_create_on_node() which didn't set CLONE_FS.
> > 
> 
> This sounds like it might be the better approach. I guess you could
> just add a set of CLONE_* flags to struct kthread_create_info and fix
> up the callers to set that appropriately?

I tried that first.  I didn't like it.  Lots of effort for little gain,
where __GFP_NOFAIL fixed the same problem more cleanly.
For reference (in case I do need it eventually) below is a patch from my
'git stash' history.

NeilBrown


 fs/fs_struct.c             | 23 -----------------------
 fs/nfsd/nfssvc.c           | 14 +++++---------
 include/linux/fs_struct.h  |  1 -
 include/linux/kthread.h    |  8 ++++++++
 include/linux/sunrpc/svc.h |  1 +
 kernel/kthread.c           | 33 +++++++++++++++++++--------------
 net/sunrpc/svc.c           |  6 ++++--
 7 files changed, 37 insertions(+), 49 deletions(-)

diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 64c2d0814ed6..a94764084c8c 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -130,29 +130,6 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 	return fs;
 }
 
-int unshare_fs_struct(void)
-{
-	struct fs_struct *fs = current->fs;
-	struct fs_struct *new_fs = copy_fs_struct(fs);
-	int kill;
-
-	if (!new_fs)
-		return -ENOMEM;
-
-	task_lock(current);
-	spin_lock(&fs->lock);
-	kill = !--fs->users;
-	current->fs = new_fs;
-	spin_unlock(&fs->lock);
-	task_unlock(current);
-
-	if (kill)
-		free_fs_struct(fs);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(unshare_fs_struct);
-
 int current_umask(void)
 {
 	return current->fs->umask;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index c0d17b92b249..d37b9cbbc250 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -666,6 +666,7 @@ int nfsd_create_serv(struct net *net)
 	if (serv == NULL)
 		return -ENOMEM;
 
+	serv->sv_unshare_fs = true;
 	serv->sv_maxconn = nn->max_connections;
 	error = svc_bind(serv, net);
 	if (error < 0) {
@@ -915,14 +916,10 @@ nfsd(void *vrqstp)
 	struct net *net = perm_sock->xpt_net;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
-	/* At this point, the thread shares current->fs
-	 * with the init process. We need to create files with the
-	 * umask as defined by the client instead of init's umask. */
-	if (unshare_fs_struct() < 0) {
-		printk("Unable to start nfsd thread: out of memory\n");
-		goto out;
-	}
-
+	/* Thread was created with CLONE_FS disabled so we have
+	 * a private current->fs in which we can control umask
+	 * for file creation.
+	 */
 	current->fs->umask = 0;
 
 	atomic_inc(&nfsd_th_cnt);
@@ -943,7 +940,6 @@ nfsd(void *vrqstp)
 
 	atomic_dec(&nfsd_th_cnt);
 
-out:
 	/* Release the thread */
 	svc_exit_thread(rqstp);
 	return 0;
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 783b48dedb72..a854bfa4708c 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -22,7 +22,6 @@ extern void set_fs_root(struct fs_struct *, const struct path *);
 extern void set_fs_pwd(struct fs_struct *, const struct path *);
 extern struct fs_struct *copy_fs_struct(struct fs_struct *);
 extern void free_fs_struct(struct fs_struct *);
-extern int unshare_fs_struct(void);
 
 static inline void get_fs_root(struct fs_struct *fs, struct path *root)
 {
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index b11f53c1ba2e..222779a40389 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -24,6 +24,8 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
  * the stopped state.  This is just a helper for kthread_create_on_node();
  * see the documentation there for more details.
  */
+#define kthread_create_on_node(threadfn, data, node, namefmt, arg...) \
+	kthread_create_on_node_flags(threadfn, data, NUMA_NO_NODE, CLONE_FS, namefmt, ##arg)
 #define kthread_create(threadfn, data, namefmt, arg...) \
 	kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
 
@@ -33,6 +35,12 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 					  unsigned int cpu,
 					  const char *namefmt);
 
+struct task_struct *kthread_create_on_node_flags(int (*threadfn)(void *data),
+						 void *data,
+						 int node,
+						 int flags,
+						 const char *namefmt, ...);
+
 void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk);
 bool set_kthread_struct(struct task_struct *p);
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 23617da0e565..405f8ec8a505 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -87,6 +87,7 @@ struct svc_serv {
 	unsigned int		sv_nrpools;	/* number of thread pools */
 	struct svc_pool *	sv_pools;	/* array of thread pools */
 	int			(*sv_threadfn)(void *data);
+	bool			sv_unshare_fs;	/* Does serv need umask? */
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	struct lwq		sv_cb_list;	/* queue for callback requests
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c5e40830c1f2..e97cbab40034 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -42,6 +42,7 @@ struct kthread_create_info
 	int (*threadfn)(void *data);
 	void *data;
 	int node;
+	int clone_flags;
 
 	/* Result passed back to kthread_create() from kthreadd. */
 	struct task_struct *result;
@@ -409,7 +410,7 @@ static void create_kthread(struct kthread_create_info *create)
 #endif
 	/* We want our own signal handler (we take no signals by default). */
 	pid = kernel_thread(kthread, create, create->full_name,
-			    CLONE_FS | CLONE_FILES | SIGCHLD);
+			    create->clone_flags | CLONE_FILES | SIGCHLD);
 	if (pid < 0) {
 		/* Release the structure when caller killed by a fatal signal. */
 		struct completion *done = xchg(&create->done, NULL);
@@ -424,11 +425,12 @@ static void create_kthread(struct kthread_create_info *create)
 	}
 }
 
-static __printf(4, 0)
-struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
-						    void *data, int node,
-						    const char namefmt[],
-						    va_list args)
+static __printf(5, 0)
+struct task_struct *__kthread_create_on_node_flags(int (*threadfn)(void *data),
+						   void *data,
+						   int node, int clone_flags,
+						   const char namefmt[],
+						   va_list args)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct task_struct *task;
@@ -440,6 +442,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 	create->threadfn = threadfn;
 	create->data = data;
 	create->node = node;
+	create->clone_flags = clone_flags;
 	create->done = &done;
 	create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
 	if (!create->full_name) {
@@ -500,21 +503,23 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
  *
  * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
  */
-struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
-					   void *data, int node,
-					   const char namefmt[],
-					   ...)
+struct task_struct *kthread_create_on_node_flags(int (*threadfn)(void *data),
+						 void *data, int node,
+						 int clone_flags,
+						 const char namefmt[],
+						 ...)
 {
 	struct task_struct *task;
 	va_list args;
 
 	va_start(args, namefmt);
-	task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
+	task = __kthread_create_on_node_flags(threadfn, data, node, clone_flags,
+					      namefmt, args);
 	va_end(args);
 
 	return task;
 }
-EXPORT_SYMBOL(kthread_create_on_node);
+EXPORT_SYMBOL(kthread_create_on_node_flags);
 
 static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
 {
@@ -870,8 +875,8 @@ __kthread_create_worker(int cpu, unsigned int flags,
 	if (cpu >= 0)
 		node = cpu_to_node(cpu);
 
-	task = __kthread_create_on_node(kthread_worker_fn, worker,
-						node, namefmt, args);
+	task = __kthread_create_on_node_flags(kthread_worker_fn, worker,
+					      node, CLONE_FS, namefmt, args);
 	if (IS_ERR(task))
 		goto fail_task;
 
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 2b4b1276d4e8..a3c94778b547 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -781,8 +781,10 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 		rqstp = svc_prepare_thread(serv, chosen_pool, node);
 		if (IS_ERR(rqstp))
 			return PTR_ERR(rqstp);
-		task = kthread_create_on_node(serv->sv_threadfn, rqstp,
-					      node, "%s", serv->sv_name);
+		task = kthread_create_on_node_flags(serv->sv_threadfn, rqstp,
+						    node,
+						    serv->sv_unshare_fs ? 0 : CLONE_FS,
+						    "%s", serv->sv_name);
 		if (IS_ERR(task)) {
 			svc_exit_thread(rqstp);
 			return PTR_ERR(task);
diff mbox series

Patch

diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 64c2d0814ed6..49fba862e408 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -109,35 +109,39 @@  void exit_fs(struct task_struct *tsk)
 	}
 }
 
+static void init_fs_struct(struct fs_struct *fs, struct fs_struct *old)
+{
+	fs->users = 1;
+	fs->in_exec = 0;
+	spin_lock_init(&fs->lock);
+	seqcount_spinlock_init(&fs->seq, &fs->lock);
+	fs->umask = old->umask;
+
+	spin_lock(&old->lock);
+	fs->root = old->root;
+	path_get(&fs->root);
+	fs->pwd = old->pwd;
+	path_get(&fs->pwd);
+	spin_unlock(&old->lock);
+}
+
 struct fs_struct *copy_fs_struct(struct fs_struct *old)
 {
 	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
 	/* We don't need to lock fs - think why ;-) */
-	if (fs) {
-		fs->users = 1;
-		fs->in_exec = 0;
-		spin_lock_init(&fs->lock);
-		seqcount_spinlock_init(&fs->seq, &fs->lock);
-		fs->umask = old->umask;
-
-		spin_lock(&old->lock);
-		fs->root = old->root;
-		path_get(&fs->root);
-		fs->pwd = old->pwd;
-		path_get(&fs->pwd);
-		spin_unlock(&old->lock);
-	}
+	if (fs)
+		init_fs_struct(fs, old);
 	return fs;
 }
 
-int unshare_fs_struct(void)
+void unshare_fs_struct(void)
 {
 	struct fs_struct *fs = current->fs;
-	struct fs_struct *new_fs = copy_fs_struct(fs);
+	struct fs_struct *new_fs = kmem_cache_alloc(fs_cachep,
+						    GFP_KERNEL| __GFP_NOFAIL);
 	int kill;
 
-	if (!new_fs)
-		return -ENOMEM;
+	init_fs_struct(new_fs, fs);
 
 	task_lock(current);
 	spin_lock(&fs->lock);
@@ -148,8 +152,6 @@  int unshare_fs_struct(void)
 
 	if (kill)
 		free_fs_struct(fs);
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(unshare_fs_struct);
 
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 7377422a34df..f5de04a63c6f 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -873,11 +873,9 @@  nfsd(void *vrqstp)
 
 	/* At this point, the thread shares current->fs
 	 * with the init process. We need to create files with the
-	 * umask as defined by the client instead of init's umask. */
-	if (unshare_fs_struct() < 0) {
-		printk("Unable to start nfsd thread: out of memory\n");
-		goto out;
-	}
+	 * umask as defined by the client instead of init's umask.
+	 */
+	unshare_fs_struct();
 
 	current->fs->umask = 0;
 
@@ -899,7 +897,6 @@  nfsd(void *vrqstp)
 
 	atomic_dec(&nfsd_th_cnt);
 
-out:
 	/* Release the thread */
 	svc_exit_thread(rqstp);
 	return 0;
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 783b48dedb72..8282e6c7ff29 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -22,7 +22,7 @@  extern void set_fs_root(struct fs_struct *, const struct path *);
 extern void set_fs_pwd(struct fs_struct *, const struct path *);
 extern struct fs_struct *copy_fs_struct(struct fs_struct *);
 extern void free_fs_struct(struct fs_struct *);
-extern int unshare_fs_struct(void);
+extern void unshare_fs_struct(void);
 
 static inline void get_fs_root(struct fs_struct *fs, struct path *root)
 {