
nfsd4: fix a deadlock on state owner replay mutex

Message ID 720b91b1204b4c73be1b6ec2ff44dbab@jd.com (mailing list archive)
State New, archived
Series nfsd4: fix a deadlock on state owner replay mutex

Commit Message

黄乐 June 27, 2019, 6:30 p.m. UTC
From: Huang Le <huangle1@jd.com>

In move_to_close_lru(), which is only called on the nfsd4 CLOSE path,
the code may wait for the stid's refcount to drop to 2 while holding
the state owner's replay mutex.  However, the other stid reference
holder (normally a parallel CLOSE op) that move_to_close_lru() is
waiting for might itself be acquiring the same replay mutex.

This patch fixes the issue by clearing the replay owner before waiting,
and assigning it back afterwards.
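
To make the cycle concrete, here is a minimal userspace analogue of the
deadlock (a sketch only: pthreads instead of kernel primitives, and all
names are hypothetical rather than nfsd code).  Thread A stands in for
the CLOSE that reaches move_to_close_lru() and waits while holding the
replay mutex; thread B stands in for a parallel CLOSE that blocks on the
same mutex before it can drop its reference, so neither makes progress:

/*
 * Illustrative userspace analogue of the deadlock; hypothetical names,
 * not nfsd code.
 */
#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t replay_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t ref_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t ref_cond = PTHREAD_COND_INITIALIZER;
static int refcount = 3;	/* illustrative: one extra ref held by B */

static void *close_a(void *arg)
{
	pthread_mutex_lock(&replay_mutex);	/* A holds the replay mutex... */
	pthread_mutex_lock(&ref_lock);
	while (refcount != 2)			/* ...waiting for B's ref to go away */
		pthread_cond_wait(&ref_cond, &ref_lock);
	pthread_mutex_unlock(&ref_lock);
	pthread_mutex_unlock(&replay_mutex);
	return NULL;
}

static void *close_b(void *arg)
{
	sleep(1);				/* let A win the race for the mutex */
	pthread_mutex_lock(&replay_mutex);	/* blocks forever: A holds it */
	pthread_mutex_lock(&ref_lock);
	refcount--;				/* never reached, so A never wakes */
	pthread_cond_signal(&ref_cond);
	pthread_mutex_unlock(&ref_lock);
	pthread_mutex_unlock(&replay_mutex);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, close_a, NULL);
	pthread_create(&b, NULL, close_b, NULL);
	pthread_join(a, NULL);			/* never returns: A and B deadlock */
	pthread_join(b, NULL);
	return 0;
}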

Signed-off-by: Huang Le <huangle1@jd.com>
---

I guess we should cc this patch to the stable tree, since a malicious
client could craft parallel CLOSE ops to quickly put all nfsd tasks
into the D state.

Comments

J. Bruce Fields July 10, 2019, 12:03 a.m. UTC | #1
On Thu, Jun 27, 2019 at 06:30:27PM +0000, 黄乐 wrote:
> From: Huang Le <huangle1@jd.com>
> 
> In move_to_close_lru(), which is only called on the nfsd4 CLOSE path,
> the code may wait for the stid's refcount to drop to 2 while holding
> the state owner's replay mutex.  However, the other stid reference
> holder (normally a parallel CLOSE op) that move_to_close_lru() is
> waiting for might itself be acquiring the same replay mutex.
> 
> This patch fixes the issue by clearing the replay owner before waiting,
> and assigning it back afterwards.

I don't understand why that's safe.  Maybe it is, but I don't understand
yet.  If we take the mutex, bump the seqid, drop the mutex, someone else
comes in and bumps the seqid again, then we reacquire the mutex... what
happens?

--b.

> 
> Signed-off-by: Huang Le <huangle1@jd.com>
> ---
> 
> I guess we should cc this patch to the stable tree, since a malicious
> client could craft parallel CLOSE ops to quickly put all nfsd tasks
> into the D state.
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 618e660..5f6a48f 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -3829,12 +3829,12 @@ static void nfs4_free_openowner(struct nfs4_stateowner *so)
>   * them before returning however.
>   */
>  static void
> -move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
> +move_to_close_lru(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *s,
> +		struct net *net)
>  {
>  	struct nfs4_ol_stateid *last;
>  	struct nfs4_openowner *oo = openowner(s->st_stateowner);
> -	struct nfsd_net *nn = net_generic(s->st_stid.sc_client->net,
> -						nfsd_net_id);
> +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>  
>  	dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
>  
> @@ -3846,8 +3846,19 @@ static void nfs4_free_openowner(struct nfs4_stateowner *so)
>  	 * Wait for the refcount to drop to 2. Since it has been unhashed,
>  	 * there should be no danger of the refcount going back up again at
>  	 * this point.
> +	 *
> +	 * Before waiting, we clear cstate->replay_owner to release its
> +	 * so_replay.rp_mutex, since another reference holder might be
> +	 * acquiring the same mutex before it can drop its reference.  The
> +	 * replay_owner can safely be assigned back afterwards.
>  	 */
> -	wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
> +	if (refcount_read(&s->st_stid.sc_count) != 2) {
> +		struct nfs4_stateowner *so = cstate->replay_owner;
> +
> +		nfsd4_cstate_clear_replay(cstate);
> +		wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
> +		nfsd4_cstate_assign_replay(cstate, so);
> +	}
>  
>  	release_all_access(s);
>  	if (s->st_stid.sc_file) {
> @@ -5531,7 +5542,8 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac
>  	return status;
>  }
>  
> -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> +static void nfsd4_close_open_stateid(struct nfsd4_compound_state *cstate,
> +		struct nfs4_ol_stateid *s)
>  {
>  	struct nfs4_client *clp = s->st_stid.sc_client;
>  	bool unhashed;
> @@ -5549,7 +5561,7 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>  		spin_unlock(&clp->cl_lock);
>  		free_ol_stateid_reaplist(&reaplist);
>  		if (unhashed)
> -			move_to_close_lru(s, clp->net);
> +			move_to_close_lru(cstate, s, clp->net);
>  	}
>  }
>  
> @@ -5587,7 +5599,7 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>  	 */
>  	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
>  
> -	nfsd4_close_open_stateid(stp);
> +	nfsd4_close_open_stateid(cstate, stp);
>  	mutex_unlock(&stp->st_mutex);
>  
>  	/* v4.1+ suggests that we send a special stateid in here, since the
黄乐 July 10, 2019, 6:43 p.m. UTC | #2
It's safe because when we reach move_to_close_lru(), we know we are the
*logically* last call among all the racing calls; that is the very reason
we do the waiting here in the first place.  It is only because of the race
that we gained the rp_mutex first while holding a *lower* seqid value, so
letting the seqid be bumped by the other racing calls before we continue is
exactly what we need: it ensures that a possible future CLOSE replay is
handled correctly.
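
For concreteness, the ordering the fix relies on looks like this in the
same kind of userspace sketch (again hypothetical names, not nfsd code):
the waiter gives up the replay mutex before sleeping, the racing CLOSE
takes it and bumps the seqid, drops its reference, and only then does the
waiter retake the mutex, so the reply it caches reflects the final seqid:

/*
 * Illustrative userspace analogue of the fixed ordering; hypothetical
 * names, not nfsd code.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t replay_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t ref_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t ref_cond = PTHREAD_COND_INITIALIZER;
static int refcount = 3;	/* illustrative: one extra ref held by B */
static int seqid = 1;

static void *close_a(void *arg)	/* the CLOSE that reaches move_to_close_lru() */
{
	pthread_mutex_lock(&replay_mutex);
	seqid++;					/* A's own seqid bump */
	pthread_mutex_unlock(&replay_mutex);		/* the fix: drop before waiting */

	pthread_mutex_lock(&ref_lock);
	while (refcount != 2)
		pthread_cond_wait(&ref_cond, &ref_lock);
	pthread_mutex_unlock(&ref_lock);

	pthread_mutex_lock(&replay_mutex);		/* reacquire afterwards */
	printf("A caches its reply at seqid %d\n", seqid); /* includes B's bump */
	pthread_mutex_unlock(&replay_mutex);
	return NULL;
}

static void *close_b(void *arg)	/* the racing CLOSE */
{
	sleep(1);					/* let A start waiting first */
	pthread_mutex_lock(&replay_mutex);		/* no longer blocks forever */
	seqid++;					/* racing call bumps the seqid */
	pthread_mutex_unlock(&replay_mutex);

	pthread_mutex_lock(&ref_lock);
	refcount--;					/* drop B's reference... */
	pthread_cond_signal(&ref_cond);			/* ...and wake the waiter */
	pthread_mutex_unlock(&ref_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, close_a, NULL);
	pthread_create(&b, NULL, close_b, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}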


On Wed, Jul 10, 2019, bfields@fieldses.org wrote:
> I don't understand why that's safe.  Maybe it is, but I don't understand
> yet.  If we take the mutex, bump the seqid, drop the mutex, someone else
> comes in and bumps the seqid again, then we reacquire the mutex... what
> happens?
> 
> --b.