| Message ID | 20240729212217.30747-3-neilb@suse.de (mailing list archive) |
|---|---|
| State | New |
| Series | nfsd: fix handling of error from unshare_fs_struct() |
On Tue, 2024-07-30 at 07:19 +1000, NeilBrown wrote:
> If an svc thread needs to perform some initialisation that might fail,
> it has no good way to handle the failure.
>
> Before the thread can exit it must call svc_exit_thread(), but that
> requires the service mutex to be held. The thread cannot simply take
> the mutex as that could deadlock if there is a concurrent attempt to
> shut down all threads (which is unlikely, but not impossible).
>
> nfsd currently calls svc_exit_thread() unprotected in the unlikely event
> that unshare_fs_struct() fails.
>
> We can clean this up by introducing svc_thread_init_status() by which an
> svc thread can report whether initialisation has succeeded. If it has,
> it continues normally into the action loop. If it has not,
> svc_thread_init_status() immediately aborts the thread.
> svc_start_kthread() waits for either of these to happen, and calls
> svc_exit_thread() (under the mutex) if the thread aborted.
>
> Signed-off-by: NeilBrown <neilb@suse.de>
>
> [... diffstat and diff trimmed; the full patch appears below ...]

I hit a hang today on the client while running the nfs-interop test
under kdevops. The client is stuck in the mount syscall while trying to
set up the backchannel:

[ 1693.653257] INFO: task mount.nfs:13243 blocked for more than 120 seconds.
[ 1693.655827]       Not tainted 6.11.0-rc7-gffcadb41b696 #166
[ 1693.657858] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1693.661442] task:mount.nfs  state:D  stack:0  pid:13243  tgid:13243  ppid:13242  flags:0x00004002
[ 1693.664648] Call Trace:
[ 1693.665639]  <TASK>
[ 1693.666482]  __schedule+0xc04/0x2750
[ 1693.668021]  ? __pfx___schedule+0x10/0x10
[ 1693.669562]  ? _raw_spin_lock_irqsave+0x98/0xf0
[ 1693.671213]  ? __pfx__raw_spin_lock_irqsave+0x10/0x10
[ 1693.673109]  ? try_to_wake_up+0x141/0x1210
[ 1693.674763]  ? __pfx_try_to_wake_up+0x10/0x10
[ 1693.676612]  ? _raw_spin_unlock_irqrestore+0x23/0x40
[ 1693.678429]  schedule+0x73/0x2f0
[ 1693.679662]  svc_set_num_threads+0xbc8/0xf80 [sunrpc]
[ 1693.682007]  ? __pfx_svc_set_num_threads+0x10/0x10 [sunrpc]
[ 1693.684163]  ? __pfx_var_wake_function+0x10/0x10
[ 1693.685849]  ? __svc_create+0x6c0/0x980 [sunrpc]
[ 1693.687777]  nfs_callback_up+0x314/0xae0 [nfsv4]
[ 1693.689630]  nfs4_init_client+0x203/0x400 [nfsv4]
[ 1693.691468]  ? __pfx_nfs4_init_client+0x10/0x10 [nfsv4]
[ 1693.693502]  ? _raw_spin_lock_irqsave+0x98/0xf0
[ 1693.695141]  nfs4_set_client+0x2f4/0x520 [nfsv4]
[ 1693.696967]  ? __pfx_nfs4_set_client+0x10/0x10 [nfsv4]
[ 1693.699230]  nfs4_create_server+0x5f2/0xef0 [nfsv4]
[ 1693.701357]  ? _raw_spin_lock+0x85/0xe0
[ 1693.702758]  ? __pfx__raw_spin_lock+0x10/0x10
[ 1693.704344]  ? nfs_get_tree+0x61f/0x16a0 [nfs]
[ 1693.706160]  ? __pfx_nfs4_create_server+0x10/0x10 [nfsv4]
[ 1693.707376]  ? __module_get+0x26/0xc0
[ 1693.708061]  nfs4_try_get_tree+0xcd/0x250 [nfsv4]
[ 1693.708893]  vfs_get_tree+0x83/0x2d0
[ 1693.709534]  path_mount+0x88d/0x19a0
[ 1693.710100]  ? __pfx_path_mount+0x10/0x10
[ 1693.710718]  ? user_path_at+0xa4/0xe0
[ 1693.711303]  ? kmem_cache_free+0x143/0x3e0
[ 1693.711936]  __x64_sys_mount+0x1fb/0x270
[ 1693.712606]  ? __pfx___x64_sys_mount+0x10/0x10
[ 1693.713315]  do_syscall_64+0x4b/0x110
[ 1693.713889]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 1693.714660] RIP: 0033:0x7f05050418fe
[ 1693.715233] RSP: 002b:00007fff4e0f5728 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
[ 1693.716417] RAX: ffffffffffffffda RBX: 00007fff4e0f5900 RCX: 00007f05050418fe
[ 1693.717492] RDX: 00005559cea3ea70 RSI: 00005559cea3fe60 RDI: 00005559cea40140
[ 1693.718558] RBP: 00007fff4e0f5760 R08: 00005559cea41e60 R09: 00007f050510cb20
[ 1693.719620] R10: 0000000000000000 R11: 0000000000000246 R12: 00005559cea41e60
[ 1693.720735] R13: 00007fff4e0f5900 R14: 00005559cea41df0 R15: 0000000000000004

Looking at faddr2line:

$ ./scripts/faddr2line --list net/sunrpc/sunrpc.ko svc_set_num_threads+0xbc8/0xf80
svc_set_num_threads+0xbc8/0xf80:

svc_start_kthreads at /home/jlayton/git/kdevops/linux/net/sunrpc/svc.c:822 (discriminator 17)
 817
 818            svc_sock_update_bufs(serv);
 819            wake_up_process(task);
 820
 821            /* load_acquire ensures we get value stored in svc_thread_init_status() */
>822<           wait_var_event(&rqstp->rq_err, smp_load_acquire(&rqstp->rq_err) != -EAGAIN);
 823            err = rqstp->rq_err;
 824            if (err) {
 825                    svc_exit_thread(rqstp);
 826                    return err;
 827            }

(inlined by) svc_set_num_threads at /home/jlayton/git/kdevops/linux/net/sunrpc/svc.c:877 (discriminator 17)
 872                    nrservs -= serv->sv_nrthreads;
 873            else
 874                    nrservs -= pool->sp_nrthreads;
 875
 876            if (nrservs > 0)
>877<                   return svc_start_kthreads(serv, pool, nrservs);
 878            if (nrservs < 0)
 879                    return svc_stop_kthreads(serv, pool, nrservs);
 880            return 0;
 881    }
 882    EXPORT_SYMBOL_GPL(svc_set_num_threads);

It looks like the callback thread started up properly and is past the
svc_thread_init_status call.

$ sudo cat /proc/13246/stack
[<0>] svc_recv+0xcef/0x2020 [sunrpc]
[<0>] nfs4_callback_svc+0xb0/0x140 [nfsv4]
[<0>] kthread+0x29b/0x360
[<0>] ret_from_fork+0x30/0x70
[<0>] ret_from_fork_asm+0x1a/0x30

...so the status should have been updated properly. Barrier problem?
> On Sep 11, 2024, at 2:33 PM, Jeff Layton <jlayton@kernel.org> wrote:
>
> On Tue, 2024-07-30 at 07:19 +1000, NeilBrown wrote:
>> [... patch quoted in full above, trimmed ...]
>
> I hit a hang today on the client while running the nfs-interop test
> under kdevops. The client is stuck in the mount syscall while trying to
> set up the backchannel:
>
> [... hang report quoted in full above, trimmed ...]
>
> ...so the status should have been updated properly. Barrier problem?

Speaking as NFSD release manager: I don't intend to send the
NFSD v6.12 PR until week of September 23rd.

I will either drop these patches before sending my PR, or I
can squash in a fix if one is found.

--
Chuck Lever
On Sat, 14 Sep 2024, Chuck Lever III wrote:
> > On Sep 11, 2024, at 2:33 PM, Jeff Layton <jlayton@kernel.org> wrote:
> >
> > [... quoted patch and hang report trimmed ...]
> >
> > ...so the status should have been updated properly. Barrier problem?
>
> Speaking as NFSD release manager: I don't intend to send the
> NFSD v6.12 PR until week of September 23rd.
>
> I will either drop these patches before sending my PR, or I
> can squash in a fix if one is found.

When I wrote that patch I thought I understood the barriers needed for
wake/wait, but I was wrong. Quite wrong. I do now (I believe), and I've
been trying to get improved interfaces upstream so we don't need an
explicit memory barrier, but no success yet.

I've posted a patch which should fix this problem.

Thanks,
NeilBrown
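For context, the hazard being alluded to can be sketched as follows. This is an
illustration only, not the fix that was posted, and it assumes wake_up_var()
skips the wakeup when it observes no waiters (the waitqueue_active()
short-circuit in __wake_up_bit() in kernels of this vintage):

    /*
     * Illustrative sketch only -- not the posted fix.
     *
     * smp_store_release() orders earlier writes before the store to rq_err,
     * but it does not order that store before the waker's later read of the
     * waitqueue.  The CPU may therefore perform wake_up_var()'s
     * waitqueue_active() load before the rq_err store is visible; if
     * svc_start_kthreads() enqueues itself and re-checks rq_err in that
     * window, it still sees -EAGAIN, sleeps, and no wakeup ever arrives.
     *
     * The conventional way to close the window is a full barrier between
     * publishing the value and issuing the wakeup, for example:
     */
    static inline void svc_thread_init_status(struct svc_rqst *rqstp, int err)
    {
            smp_store_release(&rqstp->rq_err, err);
            smp_mb();       /* order the store before wake_up_var()'s waiter check */
            wake_up_var(&rqstp->rq_err);
            if (err)
                    kthread_exit(1);
    }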
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 71713309967d..4ec22c2f2ea3 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -124,6 +124,8 @@ lockd(void *vrqstp)
 	struct net *net = &init_net;
 	struct lockd_net *ln = net_generic(net, lockd_net_id);
 
+	svc_thread_init_status(rqstp, 0);
+
 	/* try_to_freeze() is called from svc_recv() */
 	set_freezable();
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 8adfcd4c8c1a..6cf92498a5ac 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -76,6 +76,8 @@ nfs4_callback_svc(void *vrqstp)
 {
 	struct svc_rqst *rqstp = vrqstp;
 
+	svc_thread_init_status(rqstp, 0);
+
 	set_freezable();
 
 	while (!svc_thread_should_stop(rqstp))
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index fca340b3ee91..1cef09a3c78e 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -875,11 +875,9 @@ nfsd(void *vrqstp)
 
 	/* At this point, the thread shares current->fs
 	 * with the init process. We need to create files with the
-	 * umask as defined by the client instead of init's umask. */
-	if (unshare_fs_struct() < 0) {
-		printk("Unable to start nfsd thread: out of memory\n");
-		goto out;
-	}
+	 * umask as defined by the client instead of init's umask.
+	 */
+	svc_thread_init_status(rqstp, unshare_fs_struct());
 
 	current->fs->umask = 0;
 
@@ -901,7 +899,6 @@ nfsd(void *vrqstp)
 
 	atomic_dec(&nfsd_th_cnt);
 
-out:
 	/* Release the thread */
 	svc_exit_thread(rqstp);
 	return 0;
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 99e9345d829e..437672bcaa22 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -21,6 +21,7 @@
 #include <linux/wait.h>
 #include <linux/mm.h>
 #include <linux/pagevec.h>
+#include <linux/kthread.h>
 
 /*
  *
@@ -232,6 +233,11 @@ struct svc_rqst {
 	struct net		*rq_bc_net;	/* pointer to backchannel's
 						 * net namespace
 						 */
+
+	int			rq_err;		/* Thread sets this to inidicate
+						 * initialisation success.
+						 */
+
 	unsigned long		bc_to_initval;
 	unsigned int		bc_to_retries;
 	void **			rq_lease_breaker; /* The v4 client breaking a lease */
@@ -305,6 +311,28 @@ static inline bool svc_thread_should_stop(struct svc_rqst *rqstp)
 	return test_bit(RQ_VICTIM, &rqstp->rq_flags);
 }
 
+/**
+ * svc_thread_init_status - report whether thread has initialised successfully
+ * @rqstp: the thread in question
+ * @err: errno code
+ *
+ * After performing any initialisation that could fail, and before starting
+ * normal work, each sunrpc svc_thread must call svc_thread_init_status()
+ * with an appropriate error, or zero.
+ *
+ * If zero is passed, the thread is ready and must continue until
+ * svc_thread_should_stop() returns true. If a non-zero error is passed
+ * the call will not return - the thread will exit.
+ */
+static inline void svc_thread_init_status(struct svc_rqst *rqstp, int err)
+{
+	/* store_release ensures svc_start_kthreads() sees the error */
+	smp_store_release(&rqstp->rq_err, err);
+	wake_up_var(&rqstp->rq_err);
+	if (err)
+		kthread_exit(1);
+}
+
 struct svc_deferred_req {
 	u32			prot;	/* protocol (UDP or TCP) */
 	struct svc_xprt		*xprt;
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index ae31ffea7a14..1e80fa67d8b7 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -706,6 +706,8 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
 	if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node))
 		goto out_enomem;
 
+	rqstp->rq_err = -EAGAIN;	/* No error yet */
+
 	serv->sv_nrthreads += 1;
 	pool->sp_nrthreads += 1;
 
@@ -792,6 +794,7 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 	struct svc_pool *chosen_pool;
 	unsigned int state = serv->sv_nrthreads-1;
 	int node;
+	int err;
 
 	do {
 		nrservs--;
@@ -814,6 +817,14 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 
 		svc_sock_update_bufs(serv);
 		wake_up_process(task);
+
+		/* load_acquire ensures we get value stored in svc_thread_init_status() */
+		wait_var_event(&rqstp->rq_err, smp_load_acquire(&rqstp->rq_err) != -EAGAIN);
+		err = rqstp->rq_err;
+		if (err) {
+			svc_exit_thread(rqstp);
+			return err;
+		}
 	} while (nrservs > 0);
 
 	return 0;
If an svc thread needs to perform some initialisation that might fail,
it has no good way to handle the failure.

Before the thread can exit it must call svc_exit_thread(), but that
requires the service mutex to be held. The thread cannot simply take
the mutex as that could deadlock if there is a concurrent attempt to
shut down all threads (which is unlikely, but not impossible).

nfsd currently calls svc_exit_thread() unprotected in the unlikely event
that unshare_fs_struct() fails.

We can clean this up by introducing svc_thread_init_status() by which an
svc thread can report whether initialisation has succeeded. If it has,
it continues normally into the action loop. If it has not,
svc_thread_init_status() immediately aborts the thread.
svc_start_kthread() waits for either of these to happen, and calls
svc_exit_thread() (under the mutex) if the thread aborted.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/lockd/svc.c             |  2 ++
 fs/nfs/callback.c          |  2 ++
 fs/nfsd/nfssvc.c           |  9 +++------
 include/linux/sunrpc/svc.h | 28 ++++++++++++++++++++++++++++
 net/sunrpc/svc.c           | 11 +++++++++++
 5 files changed, 46 insertions(+), 6 deletions(-)
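To make the calling convention concrete, here is a minimal sketch of how a svc
thread would use the new helper. The thread function and its fallible init step
below are hypothetical; the real users are lockd(), nfs4_callback_svc() and
nfsd() in the diff above:

    /* Hypothetical svc thread showing the intended use of
     * svc_thread_init_status().  Only the shape matters here;
     * example_fallible_init() is made up for illustration.
     */
    static int example_svc_thread(void *vrqstp)
    {
            struct svc_rqst *rqstp = vrqstp;

            /* Report the outcome of any initialisation that can fail,
             * exactly once.  On success this returns and the thread
             * continues; on failure it never returns -- kthread_exit()
             * is called, and svc_start_kthreads() sees the error, calls
             * svc_exit_thread() under the mutex, and returns the error.
             */
            svc_thread_init_status(rqstp, example_fallible_init());

            while (!svc_thread_should_stop(rqstp))
                    svc_recv(rqstp);

            svc_exit_thread(rqstp);
            return 0;
    }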