diff mbox series

[3/4] nfsd: replace rp_mutex to avoid deadlock in move_to_close_lru()

Message ID 20240304044304.3657-4-neilb@suse.de (mailing list archive)
State New, archived
Headers show
Series nfsd: fix deadlock in move_to_close_lru() | expand

Commit Message

NeilBrown March 4, 2024, 4:40 a.m. UTC
move_to_close_lru() waits for sc_count to become zero while holding
rp_mutex.  This can deadlock if another thread holds a reference and is
waiting for rp_mutex.

By the time we get to move_to_close_lru() the openowner is unhashed and
cannot be found any more.  So code waiting for the mutex can safely
retry the lookup if move_to_close_lru() has started.

So change rp_mutex to an atomic_t with three states:

 RP_UNLOCKED - state is still hashed, not locked for reply
 RP_LOCKED   - state is still hashed, is locked for reply
 RP_UNHASHED - state is not hashed, no code can get a lock.

Use wait_var_event() to wait for either a lock, or for the owner to be
unhashed.  In the latter case, retry the lookup.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/nfsd/nfs4state.c | 38 +++++++++++++++++++++++++++++++-------
 fs/nfsd/state.h     |  2 +-
 2 files changed, 32 insertions(+), 8 deletions(-)

Comments

Jeff Layton March 4, 2024, 12:50 p.m. UTC | #1
On Mon, 2024-03-04 at 15:40 +1100, NeilBrown wrote:
> move_to_close_lru() waits for sc_count to become zero while holding
> rp_mutex.  This can deadlock if another thread holds a reference and is
> waiting for rp_mutex.
> 
> By the time we get to move_to_close_lru() the openowner is unhashed and
> cannot be found any more.  So code waiting for the mutex can safely
> retry the lookup if move_to_close_lru() has started.
> 
> So change rp_mutex to an atomic_t with three states:
> 
>  RP_UNLOCK   - state is still hashed, not locked for reply
>  RP_LOCKED   - state is still hashed, is locked for reply
>  RP_UNHASHED - state is not hashed, no code can get a lock.
> 
> Use wait_var_event() to wait for either a lock, or for the owner to be
> unhashed.  In the latter case, retry the lookup.
> 
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>  fs/nfsd/nfs4state.c | 38 +++++++++++++++++++++++++++++++-------
>  fs/nfsd/state.h     |  2 +-
>  2 files changed, 32 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 690d0e697320..47e879d5d68a 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -4430,21 +4430,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
>  	atomic_set(&nn->nfsd_courtesy_clients, 0);
>  }
>  
> +enum rp_lock {
> +	RP_UNLOCKED,
> +	RP_LOCKED,
> +	RP_UNHASHED,
> +};
> +
>  static void init_nfs4_replay(struct nfs4_replay *rp)
>  {
>  	rp->rp_status = nfserr_serverfault;
>  	rp->rp_buflen = 0;
>  	rp->rp_buf = rp->rp_ibuf;
> -	mutex_init(&rp->rp_mutex);
> +	atomic_set(&rp->rp_locked, RP_UNLOCKED);
>  }
>  
> -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> -		struct nfs4_stateowner *so)
> +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> +				      struct nfs4_stateowner *so)
>  {
>  	if (!nfsd4_has_session(cstate)) {
> -		mutex_lock(&so->so_replay.rp_mutex);
> +		wait_var_event(&so->so_replay.rp_locked,
> +			       atomic_cmpxchg(&so->so_replay.rp_locked,
> +					      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
> +		if (atomic_read(&so->so_replay.rp_locked) == RP_UNHASHED)
> +			return -EAGAIN;
>  		cstate->replay_owner = nfs4_get_stateowner(so);
>  	}
> +	return 0;
>  }
>  
>  void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
> @@ -4453,7 +4464,8 @@ void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
>  
>  	if (so != NULL) {
>  		cstate->replay_owner = NULL;
> -		mutex_unlock(&so->so_replay.rp_mutex);
> +		atomic_set(&so->so_replay.rp_locked, RP_UNLOCKED);
> +		wake_up_var(&so->so_replay.rp_locked);
>  		nfs4_put_stateowner(so);
>  	}
>  }
> @@ -4691,7 +4703,11 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
>  	 * Wait for the refcount to drop to 2. Since it has been unhashed,
>  	 * there should be no danger of the refcount going back up again at
>  	 * this point.
> +	 * Some threads with a reference might be waiting for rp_locked,
> +	 * so tell them to stop waiting.
>  	 */
> +	atomic_set(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED);
> +	wake_up_var(&oo->oo_owner.so_replay.rp_locked);
>  	wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
>  
>  	release_all_access(s);
> @@ -5064,11 +5080,15 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
>  	clp = cstate->clp;
>  
>  	strhashval = ownerstr_hashval(&open->op_owner);
> +retry:
>  	oo = find_or_alloc_open_stateowner(strhashval, open, cstate);
>  	open->op_openowner = oo;
>  	if (!oo)
>  		return nfserr_jukebox;
> -	nfsd4_cstate_assign_replay(cstate, &oo->oo_owner);
> +	if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) {
> +		nfs4_put_stateowner(&oo->oo_owner);
> +		goto retry;
> +	}
>  	status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
>  	if (status)
>  		return status;
> @@ -6828,11 +6848,15 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
>  	trace_nfsd_preprocess(seqid, stateid);
>  
>  	*stpp = NULL;
> +retry:
>  	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
>  	if (status)
>  		return status;
>  	stp = openlockstateid(s);
> -	nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
> +	if (nfsd4_cstate_assign_replay(cstate, stp->st_stateowner) == -EAGAIN) {
> +		nfs4_put_stateowner(stp->st_stateowner);
> +		goto retry;
> +	}
>  
>  	status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
>  	if (!status)
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 41bdc913fa71..6a3becd86112 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -446,7 +446,7 @@ struct nfs4_replay {
>  	unsigned int		rp_buflen;
>  	char			*rp_buf;
>  	struct knfsd_fh		rp_openfh;
> -	struct mutex		rp_mutex;
> +	atomic_t		rp_locked;
>  	char			rp_ibuf[NFSD4_REPLAY_ISIZE];
>  };
>  

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Chuck Lever March 4, 2024, 2:09 p.m. UTC | #2
On Mon, Mar 04, 2024 at 03:40:21PM +1100, NeilBrown wrote:
> move_to_close_lru() waits for sc_count to become zero while holding
> rp_mutex.  This can deadlock if another thread holds a reference and is
> waiting for rp_mutex.
> 
> By the time we get to move_to_close_lru() the openowner is unhashed and
> cannot be found any more.  So code waiting for the mutex can safely
> retry the lookup if move_to_close_lru() has started.
> 
> So change rp_mutex to an atomic_t with three states:
> 
>  RP_UNLOCK   - state is still hashed, not locked for reply
>  RP_LOCKED   - state is still hashed, is locked for reply
>  RP_UNHASHED - state is not hashed, no code can get a lock.
> 
> Use wait_var_event() to wait for either a lock, or for the owner to be
> unhashed.  In the latter case, retry the lookup.
> 
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>  fs/nfsd/nfs4state.c | 38 +++++++++++++++++++++++++++++++-------
>  fs/nfsd/state.h     |  2 +-
>  2 files changed, 32 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 690d0e697320..47e879d5d68a 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -4430,21 +4430,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
>  	atomic_set(&nn->nfsd_courtesy_clients, 0);
>  }
>  
> +enum rp_lock {
> +	RP_UNLOCKED,
> +	RP_LOCKED,
> +	RP_UNHASHED,
> +};
> +
>  static void init_nfs4_replay(struct nfs4_replay *rp)
>  {
>  	rp->rp_status = nfserr_serverfault;
>  	rp->rp_buflen = 0;
>  	rp->rp_buf = rp->rp_ibuf;
> -	mutex_init(&rp->rp_mutex);
> +	atomic_set(&rp->rp_locked, RP_UNLOCKED);
>  }
>  
> -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> -		struct nfs4_stateowner *so)
> +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> +				      struct nfs4_stateowner *so)
>  {
>  	if (!nfsd4_has_session(cstate)) {
> -		mutex_lock(&so->so_replay.rp_mutex);
> +		wait_var_event(&so->so_replay.rp_locked,
> +			       atomic_cmpxchg(&so->so_replay.rp_locked,
> +					      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);

What reliably prevents this from being a "wait forever" ?


> +		if (atomic_read(&so->so_replay.rp_locked) == RP_UNHASHED)
> +			return -EAGAIN;
>  		cstate->replay_owner = nfs4_get_stateowner(so);
>  	}
> +	return 0;
>  }
>  
>  void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
> @@ -4453,7 +4464,8 @@ void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
>  
>  	if (so != NULL) {
>  		cstate->replay_owner = NULL;
> -		mutex_unlock(&so->so_replay.rp_mutex);
> +		atomic_set(&so->so_replay.rp_locked, RP_UNLOCKED);
> +		wake_up_var(&so->so_replay.rp_locked);
>  		nfs4_put_stateowner(so);
>  	}
>  }
> @@ -4691,7 +4703,11 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
>  	 * Wait for the refcount to drop to 2. Since it has been unhashed,
>  	 * there should be no danger of the refcount going back up again at
>  	 * this point.
> +	 * Some threads with a reference might be waiting for rp_locked,
> +	 * so tell them to stop waiting.
>  	 */
> +	atomic_set(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED);
> +	wake_up_var(&oo->oo_owner.so_replay.rp_locked);
>  	wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
>  
>  	release_all_access(s);
> @@ -5064,11 +5080,15 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
>  	clp = cstate->clp;
>  
>  	strhashval = ownerstr_hashval(&open->op_owner);
> +retry:
>  	oo = find_or_alloc_open_stateowner(strhashval, open, cstate);
>  	open->op_openowner = oo;
>  	if (!oo)
>  		return nfserr_jukebox;
> -	nfsd4_cstate_assign_replay(cstate, &oo->oo_owner);
> +	if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) {
> +		nfs4_put_stateowner(&oo->oo_owner);
> +		goto retry;
> +	}
>  	status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
>  	if (status)
>  		return status;
> @@ -6828,11 +6848,15 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
>  	trace_nfsd_preprocess(seqid, stateid);
>  
>  	*stpp = NULL;
> +retry:
>  	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
>  	if (status)
>  		return status;
>  	stp = openlockstateid(s);
> -	nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
> +	if (nfsd4_cstate_assign_replay(cstate, stp->st_stateowner) == -EAGAIN) {
> +		nfs4_put_stateowner(stp->st_stateowner);
> +		goto retry;
> +	}
>  
>  	status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
>  	if (!status)

My tool chain reports that this hunk won't apply to nfsd-next.

In my copy of fs/nfsd/nfs4state.c, nfs4_preprocess_seqid_op() starts
at line 7180, so something is whack-a-doodle.


> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 41bdc913fa71..6a3becd86112 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -446,7 +446,7 @@ struct nfs4_replay {
>  	unsigned int		rp_buflen;
>  	char			*rp_buf;
>  	struct knfsd_fh		rp_openfh;
> -	struct mutex		rp_mutex;
> +	atomic_t		rp_locked;
>  	char			rp_ibuf[NFSD4_REPLAY_ISIZE];
>  };
>  
> -- 
> 2.43.0
>
NeilBrown March 4, 2024, 9:45 p.m. UTC | #3
On Tue, 05 Mar 2024, Chuck Lever wrote:
> On Mon, Mar 04, 2024 at 03:40:21PM +1100, NeilBrown wrote:
> > move_to_close_lru() waits for sc_count to become zero while holding
> > rp_mutex.  This can deadlock if another thread holds a reference and is
> > waiting for rp_mutex.
> > 
> > By the time we get to move_to_close_lru() the openowner is unhashed and
> > cannot be found any more.  So code waiting for the mutex can safely
> > retry the lookup if move_to_close_lru() has started.
> > 
> > So change rp_mutex to an atomic_t with three states:
> > 
> >  RP_UNLOCK   - state is still hashed, not locked for reply
> >  RP_LOCKED   - state is still hashed, is locked for reply
> >  RP_UNHASHED - state is not hashed, no code can get a lock.
> > 
> > Use wait_var_event() to wait for either a lock, or for the owner to be
> > unhashed.  In the latter case, retry the lookup.
> > 
> > Signed-off-by: NeilBrown <neilb@suse.de>
> > ---
> >  fs/nfsd/nfs4state.c | 38 +++++++++++++++++++++++++++++++-------
> >  fs/nfsd/state.h     |  2 +-
> >  2 files changed, 32 insertions(+), 8 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index 690d0e697320..47e879d5d68a 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -4430,21 +4430,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
> >  	atomic_set(&nn->nfsd_courtesy_clients, 0);
> >  }
> >  
> > +enum rp_lock {
> > +	RP_UNLOCKED,
> > +	RP_LOCKED,
> > +	RP_UNHASHED,
> > +};
> > +
> >  static void init_nfs4_replay(struct nfs4_replay *rp)
> >  {
> >  	rp->rp_status = nfserr_serverfault;
> >  	rp->rp_buflen = 0;
> >  	rp->rp_buf = rp->rp_ibuf;
> > -	mutex_init(&rp->rp_mutex);
> > +	atomic_set(&rp->rp_locked, RP_UNLOCKED);
> >  }
> >  
> > -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > -		struct nfs4_stateowner *so)
> > +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > +				      struct nfs4_stateowner *so)
> >  {
> >  	if (!nfsd4_has_session(cstate)) {
> > -		mutex_lock(&so->so_replay.rp_mutex);
> > +		wait_var_event(&so->so_replay.rp_locked,
> > +			       atomic_cmpxchg(&so->so_replay.rp_locked,
> > +					      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
> 
> What reliably prevents this from being a "wait forever" ?

That same thing that reliably prevented the original mutex_lock from
waiting forever.
It waits until rp_locked is set to RP_UNLOCKED, which is precisely when
we previously called mutex_unlock.  But it *also* aborts the wait if
rp_locked is set to RP_UNHASHED - so it is now more reliable.

Does that answer the question?
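
For reference, the acquire side being discussed boils down to the pattern
below.  This is only a minimal sketch with a made-up helper name, not the
actual patch; it just restates the logic of nfsd4_cstate_assign_replay():

static int rp_lock_or_retry(struct nfs4_replay *rp)
{
	/*
	 * Sleep in wait_var_event(), re-checking the condition on each
	 * wake_up_var(), until the cmpxchg stops returning RP_LOCKED:
	 * either we swapped RP_UNLOCKED -> RP_LOCKED and now "own" the
	 * lock, or the owner was unhashed and the caller must retry.
	 */
	wait_var_event(&rp->rp_locked,
		       atomic_cmpxchg(&rp->rp_locked,
				      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
	if (atomic_read(&rp->rp_locked) == RP_UNHASHED)
		return -EAGAIN;		/* owner gone; retry the lookup */
	return 0;			/* behaves like a successful mutex_lock() */
}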

> 
> 
> > +		if (atomic_read(&so->so_replay.rp_locked) == RP_UNHASHED)
> > +			return -EAGAIN;
> >  		cstate->replay_owner = nfs4_get_stateowner(so);
> >  	}
> > +	return 0;
> >  }
> >  
> >  void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
> > @@ -4453,7 +4464,8 @@ void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
> >  
> >  	if (so != NULL) {
> >  		cstate->replay_owner = NULL;
> > -		mutex_unlock(&so->so_replay.rp_mutex);
> > +		atomic_set(&so->so_replay.rp_locked, RP_UNLOCKED);
> > +		wake_up_var(&so->so_replay.rp_locked);
> >  		nfs4_put_stateowner(so);
> >  	}
> >  }
> > @@ -4691,7 +4703,11 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
> >  	 * Wait for the refcount to drop to 2. Since it has been unhashed,
> >  	 * there should be no danger of the refcount going back up again at
> >  	 * this point.
> > +	 * Some threads with a reference might be waiting for rp_locked,
> > +	 * so tell them to stop waiting.
> >  	 */
> > +	atomic_set(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED);
> > +	wake_up_var(&oo->oo_owner.so_replay.rp_locked);
> >  	wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
> >  
> >  	release_all_access(s);
> > @@ -5064,11 +5080,15 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
> >  	clp = cstate->clp;
> >  
> >  	strhashval = ownerstr_hashval(&open->op_owner);
> > +retry:
> >  	oo = find_or_alloc_open_stateowner(strhashval, open, cstate);
> >  	open->op_openowner = oo;
> >  	if (!oo)
> >  		return nfserr_jukebox;
> > -	nfsd4_cstate_assign_replay(cstate, &oo->oo_owner);
> > +	if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) {
> > +		nfs4_put_stateowner(&oo->oo_owner);
> > +		goto retry;
> > +	}
> >  	status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
> >  	if (status)
> >  		return status;
> > @@ -6828,11 +6848,15 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
> >  	trace_nfsd_preprocess(seqid, stateid);
> >  
> >  	*stpp = NULL;
> > +retry:
> >  	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
> >  	if (status)
> >  		return status;
> >  	stp = openlockstateid(s);
> > -	nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
> > +	if (nfsd4_cstate_assign_replay(cstate, stp->st_stateowner) == -EAGAIN) {
> > +		nfs4_put_stateowner(stp->st_stateowner);
> > +		goto retry;
> > +	}
> >  
> >  	status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
> >  	if (!status)
> 
> My tool chain reports that this hunk won't apply to nfsd-next.

You need better tools :-)

> 
> In my copy of fs/nfsd/nfs4state.c, nfs4_preprocess_seqid_op() starts
> at line 7180, so something is whack-a-doodle.

I have been working against master because the old version of the fix
was in nfsd-next.  I should have rebased when you removed it from
nfsd-next.  I have now and will repost once you confirm you are
comfortable with the answer about waiting above.  Would adding a comment
there help?

Thanks,
NeilBrown


> 
> 
> > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > index 41bdc913fa71..6a3becd86112 100644
> > --- a/fs/nfsd/state.h
> > +++ b/fs/nfsd/state.h
> > @@ -446,7 +446,7 @@ struct nfs4_replay {
> >  	unsigned int		rp_buflen;
> >  	char			*rp_buf;
> >  	struct knfsd_fh		rp_openfh;
> > -	struct mutex		rp_mutex;
> > +	atomic_t		rp_locked;
> >  	char			rp_ibuf[NFSD4_REPLAY_ISIZE];
> >  };
> >  
> > -- 
> > 2.43.0
> > 
> 
> -- 
> Chuck Lever
>
Chuck Lever March 4, 2024, 10:03 p.m. UTC | #4
On Tue, Mar 05, 2024 at 08:45:45AM +1100, NeilBrown wrote:
> On Tue, 05 Mar 2024, Chuck Lever wrote:
> > On Mon, Mar 04, 2024 at 03:40:21PM +1100, NeilBrown wrote:
> > > move_to_close_lru() waits for sc_count to become zero while holding
> > > rp_mutex.  This can deadlock if another thread holds a reference and is
> > > waiting for rp_mutex.
> > > 
> > > By the time we get to move_to_close_lru() the openowner is unhashed and
> > > cannot be found any more.  So code waiting for the mutex can safely
> > > retry the lookup if move_to_close_lru() has started.
> > > 
> > > So change rp_mutex to an atomic_t with three states:
> > > 
> > >  RP_UNLOCK   - state is still hashed, not locked for reply
> > >  RP_LOCKED   - state is still hashed, is locked for reply
> > >  RP_UNHASHED - state is not hashed, no code can get a lock.
> > > 
> > > Use wait_var_event() to wait for either a lock, or for the owner to be
> > > unhashed.  In the latter case, retry the lookup.
> > > 
> > > Signed-off-by: NeilBrown <neilb@suse.de>
> > > ---
> > >  fs/nfsd/nfs4state.c | 38 +++++++++++++++++++++++++++++++-------
> > >  fs/nfsd/state.h     |  2 +-
> > >  2 files changed, 32 insertions(+), 8 deletions(-)
> > > 
> > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > index 690d0e697320..47e879d5d68a 100644
> > > --- a/fs/nfsd/nfs4state.c
> > > +++ b/fs/nfsd/nfs4state.c
> > > @@ -4430,21 +4430,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
> > >  	atomic_set(&nn->nfsd_courtesy_clients, 0);
> > >  }
> > >  
> > > +enum rp_lock {
> > > +	RP_UNLOCKED,
> > > +	RP_LOCKED,
> > > +	RP_UNHASHED,
> > > +};
> > > +
> > >  static void init_nfs4_replay(struct nfs4_replay *rp)
> > >  {
> > >  	rp->rp_status = nfserr_serverfault;
> > >  	rp->rp_buflen = 0;
> > >  	rp->rp_buf = rp->rp_ibuf;
> > > -	mutex_init(&rp->rp_mutex);
> > > +	atomic_set(&rp->rp_locked, RP_UNLOCKED);
> > >  }
> > >  
> > > -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > > -		struct nfs4_stateowner *so)
> > > +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > > +				      struct nfs4_stateowner *so)
> > >  {
> > >  	if (!nfsd4_has_session(cstate)) {
> > > -		mutex_lock(&so->so_replay.rp_mutex);
> > > +		wait_var_event(&so->so_replay.rp_locked,
> > > +			       atomic_cmpxchg(&so->so_replay.rp_locked,
> > > +					      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
> > 
> > What reliably prevents this from being a "wait forever" ?
> 
> That same thing that reliably prevented the original mutex_lock from
> waiting forever.
> It waits until rp_locked is set to RP_UNLOCKED, which is precisely when
> we previously called mutex_unlock.  But it *also* aborts the wait if
> rp_locked is set to RP_UNHASHED - so it is now more reliable.
> 
> Does that answer the question?

Hm. I guess then we are no worse off with wait_var_event().

I'm not as familiar with this logic as perhaps I should be. How long
does it take for the wake-up to occur, typically?


> > > +		if (atomic_read(&so->so_replay.rp_locked) == RP_UNHASHED)
> > > +			return -EAGAIN;
> > >  		cstate->replay_owner = nfs4_get_stateowner(so);
> > >  	}
> > > +	return 0;
> > >  }
> > >  
> > >  void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
> > > @@ -4453,7 +4464,8 @@ void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
> > >  
> > >  	if (so != NULL) {
> > >  		cstate->replay_owner = NULL;
> > > -		mutex_unlock(&so->so_replay.rp_mutex);
> > > +		atomic_set(&so->so_replay.rp_locked, RP_UNLOCKED);
> > > +		wake_up_var(&so->so_replay.rp_locked);
> > >  		nfs4_put_stateowner(so);
> > >  	}
> > >  }
> > > @@ -4691,7 +4703,11 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
> > >  	 * Wait for the refcount to drop to 2. Since it has been unhashed,
> > >  	 * there should be no danger of the refcount going back up again at
> > >  	 * this point.
> > > +	 * Some threads with a reference might be waiting for rp_locked,
> > > +	 * so tell them to stop waiting.
> > >  	 */
> > > +	atomic_set(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED);
> > > +	wake_up_var(&oo->oo_owner.so_replay.rp_locked);
> > >  	wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
> > >  
> > >  	release_all_access(s);
> > > @@ -5064,11 +5080,15 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
> > >  	clp = cstate->clp;
> > >  
> > >  	strhashval = ownerstr_hashval(&open->op_owner);
> > > +retry:
> > >  	oo = find_or_alloc_open_stateowner(strhashval, open, cstate);
> > >  	open->op_openowner = oo;
> > >  	if (!oo)
> > >  		return nfserr_jukebox;
> > > -	nfsd4_cstate_assign_replay(cstate, &oo->oo_owner);
> > > +	if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) {
> > > +		nfs4_put_stateowner(&oo->oo_owner);
> > > +		goto retry;
> > > +	}
> > >  	status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
> > >  	if (status)
> > >  		return status;
> > > @@ -6828,11 +6848,15 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
> > >  	trace_nfsd_preprocess(seqid, stateid);
> > >  
> > >  	*stpp = NULL;
> > > +retry:
> > >  	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
> > >  	if (status)
> > >  		return status;
> > >  	stp = openlockstateid(s);
> > > -	nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
> > > +	if (nfsd4_cstate_assign_replay(cstate, stp->st_stateowner) == -EAGAIN) {
> > > +		nfs4_put_stateowner(stp->st_stateowner);
> > > +		goto retry;
> > > +	}
> > >  
> > >  	status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
> > >  	if (!status)
> > 
> > My tool chain reports that this hunk won't apply to nfsd-next.
> 
> You need better tools :-)

<shrug> Essentially it is git, importing an mbox. That kind of thing
is generally the weakest aspect of all these tools.  Do you know of
something more robust?


> > In my copy of fs/nfsd/nfs4state.c, nfs4_preprocess_seqid_op() starts
> > at line 7180, so something is whack-a-doodle.
> 
> I have been working against master because the old version of the fix
> was in nfsd-next.  I should have rebased when you removed it from
> nfsd-next.  I have now and will repost once you confirm you are
> comfortable with the answer about waiting above.  Would adding a comment
> there help?

The mechanism is clear from the code, so a new comment, if you add
one, should spell out what condition(s) mark rp_locked as UNLOCKED.

But I might be missing something that should be obvious, in which
case no comment is necessary.


> Thanks,
> NeilBrown
> 
> 
> > 
> > 
> > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > index 41bdc913fa71..6a3becd86112 100644
> > > --- a/fs/nfsd/state.h
> > > +++ b/fs/nfsd/state.h
> > > @@ -446,7 +446,7 @@ struct nfs4_replay {
> > >  	unsigned int		rp_buflen;
> > >  	char			*rp_buf;
> > >  	struct knfsd_fh		rp_openfh;
> > > -	struct mutex		rp_mutex;
> > > +	atomic_t		rp_locked;
> > >  	char			rp_ibuf[NFSD4_REPLAY_ISIZE];
> > >  };
> > >  
> > > -- 
> > > 2.43.0
> > > 
> > 
> > -- 
> > Chuck Lever
> > 
>
NeilBrown March 4, 2024, 10:36 p.m. UTC | #5
On Tue, 05 Mar 2024, Chuck Lever wrote:
> On Tue, Mar 05, 2024 at 08:45:45AM +1100, NeilBrown wrote:
> > On Tue, 05 Mar 2024, Chuck Lever wrote:
> > > On Mon, Mar 04, 2024 at 03:40:21PM +1100, NeilBrown wrote:
> > > > move_to_close_lru() waits for sc_count to become zero while holding
> > > > rp_mutex.  This can deadlock if another thread holds a reference and is
> > > > waiting for rp_mutex.
> > > > 
> > > > By the time we get to move_to_close_lru() the openowner is unhashed and
> > > > cannot be found any more.  So code waiting for the mutex can safely
> > > > retry the lookup if move_to_close_lru() has started.
> > > > 
> > > > So change rp_mutex to an atomic_t with three states:
> > > > 
> > > >  RP_UNLOCK   - state is still hashed, not locked for reply
> > > >  RP_LOCKED   - state is still hashed, is locked for reply
> > > >  RP_UNHASHED - state is not hashed, no code can get a lock.
> > > > 
> > > > Use wait_var_event() to wait for either a lock, or for the owner to be
> > > > unhashed.  In the latter case, retry the lookup.
> > > > 
> > > > Signed-off-by: NeilBrown <neilb@suse.de>
> > > > ---
> > > >  fs/nfsd/nfs4state.c | 38 +++++++++++++++++++++++++++++++-------
> > > >  fs/nfsd/state.h     |  2 +-
> > > >  2 files changed, 32 insertions(+), 8 deletions(-)
> > > > 
> > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > index 690d0e697320..47e879d5d68a 100644
> > > > --- a/fs/nfsd/nfs4state.c
> > > > +++ b/fs/nfsd/nfs4state.c
> > > > @@ -4430,21 +4430,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
> > > >  	atomic_set(&nn->nfsd_courtesy_clients, 0);
> > > >  }
> > > >  
> > > > +enum rp_lock {
> > > > +	RP_UNLOCKED,
> > > > +	RP_LOCKED,
> > > > +	RP_UNHASHED,
> > > > +};
> > > > +
> > > >  static void init_nfs4_replay(struct nfs4_replay *rp)
> > > >  {
> > > >  	rp->rp_status = nfserr_serverfault;
> > > >  	rp->rp_buflen = 0;
> > > >  	rp->rp_buf = rp->rp_ibuf;
> > > > -	mutex_init(&rp->rp_mutex);
> > > > +	atomic_set(&rp->rp_locked, RP_UNLOCKED);
> > > >  }
> > > >  
> > > > -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > > > -		struct nfs4_stateowner *so)
> > > > +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > > > +				      struct nfs4_stateowner *so)
> > > >  {
> > > >  	if (!nfsd4_has_session(cstate)) {
> > > > -		mutex_lock(&so->so_replay.rp_mutex);
> > > > +		wait_var_event(&so->so_replay.rp_locked,
> > > > +			       atomic_cmpxchg(&so->so_replay.rp_locked,
> > > > +					      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
> > > 
> > > What reliably prevents this from being a "wait forever" ?
> > 
> > That same thing that reliably prevented the original mutex_lock from
> > waiting forever.
> > It waits until rp_locked is set to RP_UNLOCKED, which is precisely when
> > we previously called mutex_unlock.  But it *also* aborts the wait if
> > rp_locked is set to RP_UNHASHED - so it is now more reliable.
> > 
> > Does that answer the question?
> 
> Hm. I guess then we are no worse off with wait_var_event().
> 
> I'm not as familiar with this logic as perhaps I should be. How long
> does it take for the wake-up to occur, typically?

wait_var_event() is paired with wake_up_var().
The wake up happens when wake_up_var() is called, which in this code is
always immediately after atomic_set() updates the variable.
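
The two release paths look like this (again only a sketch, with invented
helper names): the variable is updated first, then wake_up_var() makes
every waiter in wait_var_event() re-evaluate its condition.

static void rp_unlock(struct nfs4_replay *rp)
{
	/* equivalent of the old mutex_unlock() in nfsd4_cstate_clear_replay() */
	atomic_set(&rp->rp_locked, RP_UNLOCKED);
	wake_up_var(&rp->rp_locked);
}

static void rp_unhash(struct nfs4_replay *rp)
{
	/* owner is being retired in move_to_close_lru(): waiters stop and retry */
	atomic_set(&rp->rp_locked, RP_UNHASHED);
	wake_up_var(&rp->rp_locked);
}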

> 
> 
> > > > +		if (atomic_read(&so->so_replay.rp_locked) == RP_UNHASHED)
> > > > +			return -EAGAIN;
> > > >  		cstate->replay_owner = nfs4_get_stateowner(so);
> > > >  	}
> > > > +	return 0;
> > > >  }
> > > >  
> > > >  void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
> > > > @@ -4453,7 +4464,8 @@ void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
> > > >  
> > > >  	if (so != NULL) {
> > > >  		cstate->replay_owner = NULL;
> > > > -		mutex_unlock(&so->so_replay.rp_mutex);
> > > > +		atomic_set(&so->so_replay.rp_locked, RP_UNLOCKED);
> > > > +		wake_up_var(&so->so_replay.rp_locked);
> > > >  		nfs4_put_stateowner(so);
> > > >  	}
> > > >  }
> > > > @@ -4691,7 +4703,11 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
> > > >  	 * Wait for the refcount to drop to 2. Since it has been unhashed,
> > > >  	 * there should be no danger of the refcount going back up again at
> > > >  	 * this point.
> > > > +	 * Some threads with a reference might be waiting for rp_locked,
> > > > +	 * so tell them to stop waiting.
> > > >  	 */
> > > > +	atomic_set(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED);
> > > > +	wake_up_var(&oo->oo_owner.so_replay.rp_locked);
> > > >  	wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
> > > >  
> > > >  	release_all_access(s);
> > > > @@ -5064,11 +5080,15 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
> > > >  	clp = cstate->clp;
> > > >  
> > > >  	strhashval = ownerstr_hashval(&open->op_owner);
> > > > +retry:
> > > >  	oo = find_or_alloc_open_stateowner(strhashval, open, cstate);
> > > >  	open->op_openowner = oo;
> > > >  	if (!oo)
> > > >  		return nfserr_jukebox;
> > > > -	nfsd4_cstate_assign_replay(cstate, &oo->oo_owner);
> > > > +	if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) {
> > > > +		nfs4_put_stateowner(&oo->oo_owner);
> > > > +		goto retry;
> > > > +	}
> > > >  	status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
> > > >  	if (status)
> > > >  		return status;
> > > > @@ -6828,11 +6848,15 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
> > > >  	trace_nfsd_preprocess(seqid, stateid);
> > > >  
> > > >  	*stpp = NULL;
> > > > +retry:
> > > >  	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
> > > >  	if (status)
> > > >  		return status;
> > > >  	stp = openlockstateid(s);
> > > > -	nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
> > > > +	if (nfsd4_cstate_assign_replay(cstate, stp->st_stateowner) == -EAGAIN) {
> > > > +		nfs4_put_stateowner(stp->st_stateowner);
> > > > +		goto retry;
> > > > +	}
> > > >  
> > > >  	status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
> > > >  	if (!status)
> > > 
> > > My tool chain reports that this hunk won't apply to nfsd-next.
> > 
> > You need better tools :-)
> 
> <shrug> Essentially it is git, importing an mbox. That kind of thing
> is generally the weakest aspect of all these tools.  Do you know of
> something more robust?

My tool of choice is "wiggle" - which I wrote.
I just put those 4 patches into an mbox and use "git am < mbox" to apply
to nfsd-next.  It hit a problem at this patch - 3/4.
So I ran
   wiggle -mrp .git/rebase-apply/patch

which worked without complaint.  (you might want --no-backup too).
But you probably want to know what the conflict was that "git am" found
and "wiggle" ignored.  So:

  git diff | diff -u -I '^@@' -I '^index' .git/rebase-apply/patch - 

This shows 

 +retry:
- 	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
+ 	status = nfsd4_lookup_stateid(cstate, stateid,
+ 				      typemask, statusmask, &s, nn);


Once you have had a bit of practice reading diffs of diffs you can see
that the patch inserts "retry:" before a line which has changed
between the original and new bases.  The difference clearly isn't
important to this patch.

So "git add -u; git am --continue" will accept the wiggle and continue
with the "git am".
Patch 4/4 has one conflict:
        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 +      bool need_move_to_close_list;
  
-       dprintk("NFSD: nfsd4_close on file %pd\n", 
+       dprintk("NFSD: nfsd4_close on file %pd\n",
                        cstate->current_fh.fh_dentry);

again - not interesting (white space at end of line changed).

But maybe you would rather avoid that and have submitters base their
patches properly to start with :-)

> 
> 
> > > In my copy of fs/nfsd/nfs4state.c, nfs4_preprocess_seqid_op() starts
> > > at line 7180, so something is whack-a-doodle.
> > 
> > I have been working against master because the old version of the fix
> > was in nfsd-next.  I should have rebased when you removed it from
> > nfsd-next.  I have now and will repost once you confirm you are
> > comfortable with the answer about waiting above.  Would adding a comment
> > there help?
> 
> The mechanism is clear from the code, so a new comment, if you add
> one, should spell out what condition(s) mark rp_locked as UNLOCKED.
> 
> But I might be missing something that should be obvious, in which
> case no comment is necessary.
> 

I've added a comment emphasising that it is much like a mutex, and
pointing to the matching unlocks.

Thanks,
NeilBrown
Chuck Lever March 4, 2024, 10:54 p.m. UTC | #6
On Tue, Mar 05, 2024 at 09:36:29AM +1100, NeilBrown wrote:
> On Tue, 05 Mar 2024, Chuck Lever wrote:
> > On Tue, Mar 05, 2024 at 08:45:45AM +1100, NeilBrown wrote:
> > > On Tue, 05 Mar 2024, Chuck Lever wrote:
> > > > On Mon, Mar 04, 2024 at 03:40:21PM +1100, NeilBrown wrote:
> > > > > move_to_close_lru() waits for sc_count to become zero while holding
> > > > > rp_mutex.  This can deadlock if another thread holds a reference and is
> > > > > waiting for rp_mutex.
> > > > > 
> > > > > By the time we get to move_to_close_lru() the openowner is unhashed and
> > > > > cannot be found any more.  So code waiting for the mutex can safely
> > > > > retry the lookup if move_to_close_lru() has started.
> > > > > 
> > > > > So change rp_mutex to an atomic_t with three states:
> > > > > 
> > > > >  RP_UNLOCK   - state is still hashed, not locked for reply
> > > > >  RP_LOCKED   - state is still hashed, is locked for reply
> > > > >  RP_UNHASHED - state is not hashed, no code can get a lock.
> > > > > 
> > > > > Use wait_var_event() to wait for either a lock, or for the owner to be
> > > > > unhashed.  In the latter case, retry the lookup.
> > > > > 
> > > > > Signed-off-by: NeilBrown <neilb@suse.de>
> > > > > ---
> > > > >  fs/nfsd/nfs4state.c | 38 +++++++++++++++++++++++++++++++-------
> > > > >  fs/nfsd/state.h     |  2 +-
> > > > >  2 files changed, 32 insertions(+), 8 deletions(-)
> > > > > 
> > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > index 690d0e697320..47e879d5d68a 100644
> > > > > --- a/fs/nfsd/nfs4state.c
> > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > @@ -4430,21 +4430,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
> > > > >  	atomic_set(&nn->nfsd_courtesy_clients, 0);
> > > > >  }
> > > > >  
> > > > > +enum rp_lock {
> > > > > +	RP_UNLOCKED,
> > > > > +	RP_LOCKED,
> > > > > +	RP_UNHASHED,
> > > > > +};
> > > > > +
> > > > >  static void init_nfs4_replay(struct nfs4_replay *rp)
> > > > >  {
> > > > >  	rp->rp_status = nfserr_serverfault;
> > > > >  	rp->rp_buflen = 0;
> > > > >  	rp->rp_buf = rp->rp_ibuf;
> > > > > -	mutex_init(&rp->rp_mutex);
> > > > > +	atomic_set(&rp->rp_locked, RP_UNLOCKED);
> > > > >  }
> > > > >  
> > > > > -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > > > > -		struct nfs4_stateowner *so)
> > > > > +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > > > > +				      struct nfs4_stateowner *so)
> > > > >  {
> > > > >  	if (!nfsd4_has_session(cstate)) {
> > > > > -		mutex_lock(&so->so_replay.rp_mutex);
> > > > > +		wait_var_event(&so->so_replay.rp_locked,
> > > > > +			       atomic_cmpxchg(&so->so_replay.rp_locked,
> > > > > +					      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
> > > > 
> > > > What reliably prevents this from being a "wait forever" ?
> > > 
> > > That same thing that reliably prevented the original mutex_lock from
> > > waiting forever.
> > > It waits until rp_locked is set to RP_UNLOCKED, which is precisely when
> > > we previously called mutex_unlock.  But it *also* aborts the wait if
> > > rp_locked is set to RP_UNHASHED - so it is now more reliable.
> > > 
> > > Does that answer the question?
> > 
> > Hm. I guess then we are no worse off with wait_var_event().
> > 
> > I'm not as familiar with this logic as perhaps I should be. How long
> > does it take for the wake-up to occur, typically?
> 
> wait_var_event() is paired with wake_up_var().
> The wake up happens when wake_up_var() is called, which in this code is
> always immediately after atomic_set() updates the variable.

I'm trying to ascertain the actual wall-clock time that the nfsd thread
is sleeping, at most. Is this going to be a possible DoS vector? Can
it impact the ability for the server to shut down without hanging?
NeilBrown March 4, 2024, 11:04 p.m. UTC | #7
On Tue, 05 Mar 2024, Chuck Lever wrote:
> On Tue, Mar 05, 2024 at 09:36:29AM +1100, NeilBrown wrote:
> > On Tue, 05 Mar 2024, Chuck Lever wrote:
> > > On Tue, Mar 05, 2024 at 08:45:45AM +1100, NeilBrown wrote:
> > > > On Tue, 05 Mar 2024, Chuck Lever wrote:
> > > > > On Mon, Mar 04, 2024 at 03:40:21PM +1100, NeilBrown wrote:
> > > > > > move_to_close_lru() waits for sc_count to become zero while holding
> > > > > > rp_mutex.  This can deadlock if another thread holds a reference and is
> > > > > > waiting for rp_mutex.
> > > > > > 
> > > > > > By the time we get to move_to_close_lru() the openowner is unhashed and
> > > > > > cannot be found any more.  So code waiting for the mutex can safely
> > > > > > retry the lookup if move_to_close_lru() has started.
> > > > > > 
> > > > > > So change rp_mutex to an atomic_t with three states:
> > > > > > 
> > > > > >  RP_UNLOCK   - state is still hashed, not locked for reply
> > > > > >  RP_LOCKED   - state is still hashed, is locked for reply
> > > > > >  RP_UNHASHED - state is not hashed, no code can get a lock.
> > > > > > 
> > > > > > Use wait_var_event() to wait for either a lock, or for the owner to be
> > > > > > unhashed.  In the latter case, retry the lookup.
> > > > > > 
> > > > > > Signed-off-by: NeilBrown <neilb@suse.de>
> > > > > > ---
> > > > > >  fs/nfsd/nfs4state.c | 38 +++++++++++++++++++++++++++++++-------
> > > > > >  fs/nfsd/state.h     |  2 +-
> > > > > >  2 files changed, 32 insertions(+), 8 deletions(-)
> > > > > > 
> > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > index 690d0e697320..47e879d5d68a 100644
> > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > @@ -4430,21 +4430,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
> > > > > >  	atomic_set(&nn->nfsd_courtesy_clients, 0);
> > > > > >  }
> > > > > >  
> > > > > > +enum rp_lock {
> > > > > > +	RP_UNLOCKED,
> > > > > > +	RP_LOCKED,
> > > > > > +	RP_UNHASHED,
> > > > > > +};
> > > > > > +
> > > > > >  static void init_nfs4_replay(struct nfs4_replay *rp)
> > > > > >  {
> > > > > >  	rp->rp_status = nfserr_serverfault;
> > > > > >  	rp->rp_buflen = 0;
> > > > > >  	rp->rp_buf = rp->rp_ibuf;
> > > > > > -	mutex_init(&rp->rp_mutex);
> > > > > > +	atomic_set(&rp->rp_locked, RP_UNLOCKED);
> > > > > >  }
> > > > > >  
> > > > > > -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > > > > > -		struct nfs4_stateowner *so)
> > > > > > +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > > > > > +				      struct nfs4_stateowner *so)
> > > > > >  {
> > > > > >  	if (!nfsd4_has_session(cstate)) {
> > > > > > -		mutex_lock(&so->so_replay.rp_mutex);
> > > > > > +		wait_var_event(&so->so_replay.rp_locked,
> > > > > > +			       atomic_cmpxchg(&so->so_replay.rp_locked,
> > > > > > +					      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
> > > > > 
> > > > > What reliably prevents this from being a "wait forever" ?
> > > > 
> > > > That same thing that reliably prevented the original mutex_lock from
> > > > waiting forever.
> > > > It waits until rp_locked is set to RP_UNLOCKED, which is precisely when
> > > > we previously called mutex_unlock.  But it *also* aborts the wait if
> > > > rp_locked is set to RP_UNHASHED - so it is now more reliable.
> > > > 
> > > > Does that answer the question?
> > > 
> > > Hm. I guess then we are no worse off with wait_var_event().
> > > 
> > > I'm not as familiar with this logic as perhaps I should be. How long
> > > does it take for the wake-up to occur, typically?
> > 
> > wait_var_event() is paired with wake_up_var().
> > The wake up happens when wake_up_var() is called, which in this code is
> > always immediately after atomic_set() updates the variable.
> 
> I'm trying to ascertain the actual wall-clock time that the nfsd thread
> is sleeping, at most. Is this going to be a possible DoS vector? Can
> it impact the ability for the server to shut down without hanging?

Certainly the wall-clock sleep time is no more than it was before this
patch.
At most it is the time it takes for some other request running in some
other nfsd thread to complete.

Well.... technically if there were a large number of concurrent requests
from a client that all required claiming this lock, one of the threads
might be blocked while all the other threads get a turn.  There is no
fairness guarantee with this style of waiting.  So one thread might be
blocked indefinitely while other threads take turns taking the lock.
It's not really a DoS vector as the client is only really denying
service to itself by keeping the server excessively busy.  Other
clients will still get a reasonable turn.

NeilBrown

> 
> 
> -- 
> Chuck Lever
>
Jeff Layton March 4, 2024, 11:11 p.m. UTC | #8
On Mon, 2024-03-04 at 17:54 -0500, Chuck Lever wrote:
> On Tue, Mar 05, 2024 at 09:36:29AM +1100, NeilBrown wrote:
> > On Tue, 05 Mar 2024, Chuck Lever wrote:
> > > On Tue, Mar 05, 2024 at 08:45:45AM +1100, NeilBrown wrote:
> > > > On Tue, 05 Mar 2024, Chuck Lever wrote:
> > > > > On Mon, Mar 04, 2024 at 03:40:21PM +1100, NeilBrown wrote:
> > > > > > move_to_close_lru() waits for sc_count to become zero while holding
> > > > > > rp_mutex.  This can deadlock if another thread holds a reference and is
> > > > > > waiting for rp_mutex.
> > > > > > 
> > > > > > By the time we get to move_to_close_lru() the openowner is unhashed and
> > > > > > cannot be found any more.  So code waiting for the mutex can safely
> > > > > > retry the lookup if move_to_close_lru() has started.
> > > > > > 
> > > > > > So change rp_mutex to an atomic_t with three states:
> > > > > > 
> > > > > >  RP_UNLOCK   - state is still hashed, not locked for reply
> > > > > >  RP_LOCKED   - state is still hashed, is locked for reply
> > > > > >  RP_UNHASHED - state is not hashed, no code can get a lock.
> > > > > > 
> > > > > > Use wait_var_event() to wait for either a lock, or for the owner to be
> > > > > > unhashed.  In the latter case, retry the lookup.
> > > > > > 
> > > > > > Signed-off-by: NeilBrown <neilb@suse.de>
> > > > > > ---
> > > > > >  fs/nfsd/nfs4state.c | 38 +++++++++++++++++++++++++++++++-------
> > > > > >  fs/nfsd/state.h     |  2 +-
> > > > > >  2 files changed, 32 insertions(+), 8 deletions(-)
> > > > > > 
> > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > index 690d0e697320..47e879d5d68a 100644
> > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > @@ -4430,21 +4430,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
> > > > > >  	atomic_set(&nn->nfsd_courtesy_clients, 0);
> > > > > >  }
> > > > > >  
> > > > > > +enum rp_lock {
> > > > > > +	RP_UNLOCKED,
> > > > > > +	RP_LOCKED,
> > > > > > +	RP_UNHASHED,
> > > > > > +};
> > > > > > +
> > > > > >  static void init_nfs4_replay(struct nfs4_replay *rp)
> > > > > >  {
> > > > > >  	rp->rp_status = nfserr_serverfault;
> > > > > >  	rp->rp_buflen = 0;
> > > > > >  	rp->rp_buf = rp->rp_ibuf;
> > > > > > -	mutex_init(&rp->rp_mutex);
> > > > > > +	atomic_set(&rp->rp_locked, RP_UNLOCKED);
> > > > > >  }
> > > > > >  
> > > > > > -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > > > > > -		struct nfs4_stateowner *so)
> > > > > > +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > > > > > +				      struct nfs4_stateowner *so)
> > > > > >  {
> > > > > >  	if (!nfsd4_has_session(cstate)) {
> > > > > > -		mutex_lock(&so->so_replay.rp_mutex);
> > > > > > +		wait_var_event(&so->so_replay.rp_locked,
> > > > > > +			       atomic_cmpxchg(&so->so_replay.rp_locked,
> > > > > > +					      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
> > > > > 
> > > > > What reliably prevents this from being a "wait forever" ?
> > > > 
> > > > That same thing that reliably prevented the original mutex_lock from
> > > > waiting forever.
> > > > It waits until rp_locked is set to RP_UNLOCKED, which is precisely when
> > > > we previously called mutex_unlock.  But it *also* aborts the wait if
> > > > rp_locked is set to RP_UNHASHED - so it is now more reliable.
> > > > 
> > > > Does that answer the question?
> > > 
> > > Hm. I guess then we are no worse off with wait_var_event().
> > > 
> > > I'm not as familiar with this logic as perhaps I should be. How long
> > > does it take for the wake-up to occur, typically?
> > 
> > wait_var_event() is paired with wake_up_var().
> > The wake up happens when wake_up_var() is called, which in this code is
> > always immediately after atomic_set() updates the variable.
> 
> I'm trying to ascertain the actual wall-clock time that the nfsd thread
> is sleeping, at most. Is this going to be a possible DoS vector? Can
> it impact the ability for the server to shut down without hanging?
> 
> 

Prior to this patch, there was a mutex in play here and we just released
it to wake up the waiters. This is more or less doing the same thing, it
just indicates the resulting state better.

I doubt this will materially change how long the tasks are waiting.
Chuck Lever March 4, 2024, 11:30 p.m. UTC | #9
On Mon, Mar 04, 2024 at 06:11:24PM -0500, Jeff Layton wrote:
> On Mon, 2024-03-04 at 17:54 -0500, Chuck Lever wrote:
> > On Tue, Mar 05, 2024 at 09:36:29AM +1100, NeilBrown wrote:
> > > On Tue, 05 Mar 2024, Chuck Lever wrote:
> > > > On Tue, Mar 05, 2024 at 08:45:45AM +1100, NeilBrown wrote:
> > > > > On Tue, 05 Mar 2024, Chuck Lever wrote:
> > > > > > On Mon, Mar 04, 2024 at 03:40:21PM +1100, NeilBrown wrote:
> > > > > > > move_to_close_lru() waits for sc_count to become zero while holding
> > > > > > > rp_mutex.  This can deadlock if another thread holds a reference and is
> > > > > > > waiting for rp_mutex.
> > > > > > > 
> > > > > > > By the time we get to move_to_close_lru() the openowner is unhashed and
> > > > > > > cannot be found any more.  So code waiting for the mutex can safely
> > > > > > > retry the lookup if move_to_close_lru() has started.
> > > > > > > 
> > > > > > > So change rp_mutex to an atomic_t with three states:
> > > > > > > 
> > > > > > >  RP_UNLOCK   - state is still hashed, not locked for reply
> > > > > > >  RP_LOCKED   - state is still hashed, is locked for reply
> > > > > > >  RP_UNHASHED - state is not hashed, no code can get a lock.
> > > > > > > 
> > > > > > > Use wait_var_event() to wait for either a lock, or for the owner to be
> > > > > > > unhashed.  In the latter case, retry the lookup.
> > > > > > > 
> > > > > > > Signed-off-by: NeilBrown <neilb@suse.de>
> > > > > > > ---
> > > > > > >  fs/nfsd/nfs4state.c | 38 +++++++++++++++++++++++++++++++-------
> > > > > > >  fs/nfsd/state.h     |  2 +-
> > > > > > >  2 files changed, 32 insertions(+), 8 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > index 690d0e697320..47e879d5d68a 100644
> > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > @@ -4430,21 +4430,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
> > > > > > >  	atomic_set(&nn->nfsd_courtesy_clients, 0);
> > > > > > >  }
> > > > > > >  
> > > > > > > +enum rp_lock {
> > > > > > > +	RP_UNLOCKED,
> > > > > > > +	RP_LOCKED,
> > > > > > > +	RP_UNHASHED,
> > > > > > > +};
> > > > > > > +
> > > > > > >  static void init_nfs4_replay(struct nfs4_replay *rp)
> > > > > > >  {
> > > > > > >  	rp->rp_status = nfserr_serverfault;
> > > > > > >  	rp->rp_buflen = 0;
> > > > > > >  	rp->rp_buf = rp->rp_ibuf;
> > > > > > > -	mutex_init(&rp->rp_mutex);
> > > > > > > +	atomic_set(&rp->rp_locked, RP_UNLOCKED);
> > > > > > >  }
> > > > > > >  
> > > > > > > -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > > > > > > -		struct nfs4_stateowner *so)
> > > > > > > +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > > > > > > +				      struct nfs4_stateowner *so)
> > > > > > >  {
> > > > > > >  	if (!nfsd4_has_session(cstate)) {
> > > > > > > -		mutex_lock(&so->so_replay.rp_mutex);
> > > > > > > +		wait_var_event(&so->so_replay.rp_locked,
> > > > > > > +			       atomic_cmpxchg(&so->so_replay.rp_locked,
> > > > > > > +					      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
> > > > > > 
> > > > > > What reliably prevents this from being a "wait forever" ?
> > > > > 
> > > > > That same thing that reliably prevented the original mutex_lock from
> > > > > waiting forever.

Note that this patch fixes a deadlock here. So clearly, there /were/
situations where "waiting forever" was possible with the mutex version
of this code.


> > > > > It waits until rp_locked is set to RP_UNLOCKED, which is precisely when
> > > > > we previously called mutex_unlock.  But it *also* aborts the wait if
> > > > > rp_locked is set to RP_UNHASHED - so it is now more reliable.
> > > > > 
> > > > > Does that answer the question?
> > > > 
> > > > Hm. I guess then we are no worse off with wait_var_event().
> > > > 
> > > > I'm not as familiar with this logic as perhaps I should be. How long
> > > > does it take for the wake-up to occur, typically?
> > > 
> > > wait_var_event() is paired with wake_up_var().
> > > The wake up happens when wake_up_var() is called, which in this code is
> > > always immediately after atomic_set() updates the variable.
> > 
> > I'm trying to ascertain the actual wall-clock time that the nfsd thread
> > is sleeping, at most. Is this going to be a possible DoS vector? Can
> > it impact the ability for the server to shut down without hanging?
> 
> Prior to this patch, there was a mutex in play here and we just released
> it to wake up the waiters. This is more or less doing the same thing, it
> just indicates the resulting state better.

Well, it adds a third state so that a recovery action can be taken
on wake-up in some cases. That avoids a deadlock, so this does count
as a bug fix.


> I doubt this will materially change how long the tasks are waiting.

It might not be a longer wait, but it still seems difficult to prove
that the wait_var_event() will /always/ be awoken somehow.

Applying for now.
NeilBrown March 5, 2024, 12:05 a.m. UTC | #10
On Tue, 05 Mar 2024, Chuck Lever wrote:
> On Mon, Mar 04, 2024 at 06:11:24PM -0500, Jeff Layton wrote:
> > On Mon, 2024-03-04 at 17:54 -0500, Chuck Lever wrote:
> > > On Tue, Mar 05, 2024 at 09:36:29AM +1100, NeilBrown wrote:
> > > > On Tue, 05 Mar 2024, Chuck Lever wrote:
> > > > > On Tue, Mar 05, 2024 at 08:45:45AM +1100, NeilBrown wrote:
> > > > > > On Tue, 05 Mar 2024, Chuck Lever wrote:
> > > > > > > On Mon, Mar 04, 2024 at 03:40:21PM +1100, NeilBrown wrote:
> > > > > > > > move_to_close_lru() waits for sc_count to become zero while holding
> > > > > > > > rp_mutex.  This can deadlock if another thread holds a reference and is
> > > > > > > > waiting for rp_mutex.
> > > > > > > > 
> > > > > > > > By the time we get to move_to_close_lru() the openowner is unhashed and
> > > > > > > > cannot be found any more.  So code waiting for the mutex can safely
> > > > > > > > retry the lookup if move_to_close_lru() has started.
> > > > > > > > 
> > > > > > > > So change rp_mutex to an atomic_t with three states:
> > > > > > > > 
> > > > > > > >  RP_UNLOCK   - state is still hashed, not locked for reply
> > > > > > > >  RP_LOCKED   - state is still hashed, is locked for reply
> > > > > > > >  RP_UNHASHED - state is not hashed, no code can get a lock.
> > > > > > > > 
> > > > > > > > Use wait_var_event() to wait for either a lock, or for the owner to be
> > > > > > > > unhashed.  In the latter case, retry the lookup.
> > > > > > > > 
> > > > > > > > Signed-off-by: NeilBrown <neilb@suse.de>
> > > > > > > > ---
> > > > > > > >  fs/nfsd/nfs4state.c | 38 +++++++++++++++++++++++++++++++-------
> > > > > > > >  fs/nfsd/state.h     |  2 +-
> > > > > > > >  2 files changed, 32 insertions(+), 8 deletions(-)
> > > > > > > > 
> > > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > > > index 690d0e697320..47e879d5d68a 100644
> > > > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > > > @@ -4430,21 +4430,32 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
> > > > > > > >  	atomic_set(&nn->nfsd_courtesy_clients, 0);
> > > > > > > >  }
> > > > > > > >  
> > > > > > > > +enum rp_lock {
> > > > > > > > +	RP_UNLOCKED,
> > > > > > > > +	RP_LOCKED,
> > > > > > > > +	RP_UNHASHED,
> > > > > > > > +};
> > > > > > > > +
> > > > > > > >  static void init_nfs4_replay(struct nfs4_replay *rp)
> > > > > > > >  {
> > > > > > > >  	rp->rp_status = nfserr_serverfault;
> > > > > > > >  	rp->rp_buflen = 0;
> > > > > > > >  	rp->rp_buf = rp->rp_ibuf;
> > > > > > > > -	mutex_init(&rp->rp_mutex);
> > > > > > > > +	atomic_set(&rp->rp_locked, RP_UNLOCKED);
> > > > > > > >  }
> > > > > > > >  
> > > > > > > > -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > > > > > > > -		struct nfs4_stateowner *so)
> > > > > > > > +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
> > > > > > > > +				      struct nfs4_stateowner *so)
> > > > > > > >  {
> > > > > > > >  	if (!nfsd4_has_session(cstate)) {
> > > > > > > > -		mutex_lock(&so->so_replay.rp_mutex);
> > > > > > > > +		wait_var_event(&so->so_replay.rp_locked,
> > > > > > > > +			       atomic_cmpxchg(&so->so_replay.rp_locked,
> > > > > > > > +					      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
> > > > > > > 
> > > > > > > What reliably prevents this from being a "wait forever" ?
> > > > > > 
> > > > > > That same thing that reliably prevented the original mutex_lock from
> > > > > > waiting forever.
> 
> Note that this patch fixes a deadlock here. So clearly, there /were/
> situations where "waiting forever" was possible with the mutex version
> of this code.
> 
> 
> > > > > > It waits until rp_locked is set to RP_UNLOCKED, which is precisely when
> > > > > > we previously called mutex_unlock.  But it *also* aborts the wait if
> > > > > > rp_locked is set to RP_UNHASHED - so it is now more reliable.
> > > > > > 
> > > > > > Does that answer the question?
> > > > > 
> > > > > Hm. I guess then we are no worse off with wait_var_event().
> > > > > 
> > > > > I'm not as familiar with this logic as perhaps I should be. How long
> > > > > does it take for the wake-up to occur, typically?
> > > > 
> > > > wait_var_event() is paired with wake_up_var().
> > > > The wake up happens when wake_up_var() is called, which in this code is
> > > > always immediately after atomic_set() updates the variable.
> > > 
> > > I'm trying to ascertain the actual wall-clock time that the nfsd thread
> > > is sleeping, at most. Is this going to be a possible DoS vector? Can
> > > it impact the ability for the server to shut down without hanging?
> > 
> > Prior to this patch, there was a mutex in play here and we just released
> > it to wake up the waiters. This is more or less doing the same thing, it
> > just indicates the resulting state better.
> 
> Well, it adds a third state so that a recovery action can be taken
> on wake-up in some cases. That avoids a deadlock, so this does count
> as a bug fix.
> 
> 
> > I doubt this will materially change how long the tasks are waiting.
> 
> It might not be a longer wait, but it still seems difficult to prove
> that the wait_var_event() will /always/ be awoken somehow.

Proving locks are free from possible deadlock is notoriously difficult. 
That is why we have lockdep.
Maybe the first step in this series should have been to add some lockdep
annotations to sc_count so that it could report the possible deadlock.
That would look like a readlock when sc_count is incremented,
readunlock when it is decremented, and a writelock when we wait in
move_to_close_lru.  Or something like that.  If we could demonstrate
that this easily triggers a lockdep warning, then demonstrate the warning is
gone after the patch series, that would be a good thing.

I might try that, but not today.
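
As a rough, untested sketch of that idea (helper and map names invented),
sc_count would be modelled as a read/write lock so lockdep can see the
inversion against rp_mutex:

static struct lock_class_key sc_count_key;
static struct lockdep_map sc_count_dep_map =
	STATIC_LOCKDEP_MAP_INIT("sc_count", &sc_count_key);

/* taking a reference behaves like a shared (read) acquire */
static inline void sc_count_get_annotate(void)
{
	lock_map_acquire_read(&sc_count_dep_map);
}

/* dropping a reference releases the shared hold */
static inline void sc_count_put_annotate(void)
{
	lock_map_release(&sc_count_dep_map);
}

/* waiting for references to drain in move_to_close_lru() is the "write" side */
static inline void sc_count_wait_annotate(void)
{
	lock_map_acquire(&sc_count_dep_map);
	lock_map_release(&sc_count_dep_map);
}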

> 
> Applying for now.
> 

Thanks,
NeilBrown

> 
> -- 
> Chuck Lever
>
Jeff Layton March 5, 2024, 12:11 a.m. UTC | #11
On Mon, 2024-03-04 at 18:30 -0500, Chuck Lever wrote:
> [...]
> 
> On Mon, Mar 04, 2024 at 06:11:24PM -0500, Jeff Layton wrote:
> > I doubt this will materially change how long the tasks are waiting.
> 
> It might not be a longer wait, but it still seems difficult to prove
> that the wait_var_event() will /always/ be awoken somehow.
> 

I'm not sure what other guarantees we can reasonably offer. The v4.0
replay handling requires an exclusive lock of some sort, so you can
always get stuck if the wakeup never comes.

We typically hold this mutex today across seqid-morphing operations in
v4.0, so if one of those gets stuck doing something that blocks (in a
vfs_open(), say), you'll get stuck here too.
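
To illustrate the hold window, here is a condensed sketch of a v4.0
seqid-morphing operation (not code from the patch; real callers redo the
stateowner lookup on -EAGAIN, and the actual paths are
nfsd4_process_open1() and nfs4_preprocess_seqid_op()):

/*
 * Condensed sketch only.  The replay lock is taken before the seqid
 * check and is only dropped by nfsd4_cstate_clear_replay() after the
 * reply has been cached, so any blocking done while processing the
 * operation keeps other users of the same stateowner waiting.
 */
static __be32 example_v40_seqid_op(struct nfsd4_compound_state *cstate,
				   struct nfs4_stateowner *so, u32 seqid)
{
	__be32 status;

	if (nfsd4_cstate_assign_replay(cstate, so) == -EAGAIN)
		return nfserr_jukebox;	/* real callers retry the lookup */

	status = nfsd4_check_seqid(cstate, so, seqid);
	if (status)
		return status;

	/*
	 * ... open/close/lock processing runs here, possibly sleeping
	 * (a vfs_open(), for example), with the replay lock still held.
	 * nfsd4_cstate_clear_replay() later resets rp_locked to
	 * RP_UNLOCKED and calls wake_up_var() once the reply is cached.
	 */
	return nfs_ok;
}
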
diff mbox series

Patch

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 690d0e697320..47e879d5d68a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4430,21 +4430,32 @@  nfsd4_init_leases_net(struct nfsd_net *nn)
 	atomic_set(&nn->nfsd_courtesy_clients, 0);
 }
 
+enum rp_lock {
+	RP_UNLOCKED,
+	RP_LOCKED,
+	RP_UNHASHED,
+};
+
 static void init_nfs4_replay(struct nfs4_replay *rp)
 {
 	rp->rp_status = nfserr_serverfault;
 	rp->rp_buflen = 0;
 	rp->rp_buf = rp->rp_ibuf;
-	mutex_init(&rp->rp_mutex);
+	atomic_set(&rp->rp_locked, RP_UNLOCKED);
 }
 
-static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
-		struct nfs4_stateowner *so)
+static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
+				      struct nfs4_stateowner *so)
 {
 	if (!nfsd4_has_session(cstate)) {
-		mutex_lock(&so->so_replay.rp_mutex);
+		wait_var_event(&so->so_replay.rp_locked,
+			       atomic_cmpxchg(&so->so_replay.rp_locked,
+					      RP_UNLOCKED, RP_LOCKED) != RP_LOCKED);
+		if (atomic_read(&so->so_replay.rp_locked) == RP_UNHASHED)
+			return -EAGAIN;
 		cstate->replay_owner = nfs4_get_stateowner(so);
 	}
+	return 0;
 }
 
 void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
@@ -4453,7 +4464,8 @@  void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
 
 	if (so != NULL) {
 		cstate->replay_owner = NULL;
-		mutex_unlock(&so->so_replay.rp_mutex);
+		atomic_set(&so->so_replay.rp_locked, RP_UNLOCKED);
+		wake_up_var(&so->so_replay.rp_locked);
 		nfs4_put_stateowner(so);
 	}
 }
@@ -4691,7 +4703,11 @@  move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
 	 * Wait for the refcount to drop to 2. Since it has been unhashed,
 	 * there should be no danger of the refcount going back up again at
 	 * this point.
+	 * Some threads with a reference might be waiting for rp_locked,
+	 * so tell them to stop waiting.
 	 */
+	atomic_set(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED);
+	wake_up_var(&oo->oo_owner.so_replay.rp_locked);
 	wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
 
 	release_all_access(s);
@@ -5064,11 +5080,15 @@  nfsd4_process_open1(struct nfsd4_compound_state *cstate,
 	clp = cstate->clp;
 
 	strhashval = ownerstr_hashval(&open->op_owner);
+retry:
 	oo = find_or_alloc_open_stateowner(strhashval, open, cstate);
 	open->op_openowner = oo;
 	if (!oo)
 		return nfserr_jukebox;
-	nfsd4_cstate_assign_replay(cstate, &oo->oo_owner);
+	if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) {
+		nfs4_put_stateowner(&oo->oo_owner);
+		goto retry;
+	}
 	status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
 	if (status)
 		return status;
@@ -6828,11 +6848,15 @@  nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
 	trace_nfsd_preprocess(seqid, stateid);
 
 	*stpp = NULL;
+retry:
 	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
 	if (status)
 		return status;
 	stp = openlockstateid(s);
-	nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
+	if (nfsd4_cstate_assign_replay(cstate, stp->st_stateowner) == -EAGAIN) {
+		nfs4_put_stateowner(stp->st_stateowner);
+		goto retry;
+	}
 
 	status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
 	if (!status)
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 41bdc913fa71..6a3becd86112 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -446,7 +446,7 @@  struct nfs4_replay {
 	unsigned int		rp_buflen;
 	char			*rp_buf;
 	struct knfsd_fh		rp_openfh;
-	struct mutex		rp_mutex;
+	atomic_t		rp_locked;
 	char			rp_ibuf[NFSD4_REPLAY_ISIZE];
 };