
nfsd: serialize state seqid morphing operations

Message ID: 1442490428-29487-1-git-send-email-jeff.layton@primarydata.com
State: New, archived

Commit Message

Jeff Layton Sept. 17, 2015, 11:47 a.m. UTC
Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were
running in parallel. The server would receive the OPEN_DOWNGRADE first
and check its seqid, but then an OPEN would race in and bump it. The
OPEN_DOWNGRADE would then complete and bump the seqid again.  The result
was that the OPEN_DOWNGRADE would be applied after the OPEN, even though
it should have been rejected since the seqid changed.

The only recourse we have here I think is to serialize operations that
bump the seqid in a stateid, particularly when we're given a seqid in
the call. To address this, we add a new rw_semaphore to the
nfs4_ol_stateid struct. We do a down_write prior to checking the seqid
after looking up the stateid to ensure that nothing else is going to
bump it while we're operating on it.

In the case of OPEN, we do a down_read, as the call doesn't contain a
seqid. Those can run in parallel -- we just need to serialize them when
there is a concurrent OPEN_DOWNGRADE or CLOSE.

LOCK and LOCKU however always take the write lock as there is no
opportunity for parallelizing those.

Reported-and-Tested-by: Andrew W Elble <aweits@rit.edu>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/nfsd/nfs4state.c | 33 ++++++++++++++++++++++++++++-----
 fs/nfsd/state.h     | 19 ++++++++++---------
 2 files changed, 38 insertions(+), 14 deletions(-)
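
The locking scheme reduces to a simple pattern: any operation that
carries a seqid (OPEN_DOWNGRADE, CLOSE, LOCK, LOCKU) takes the
stateid's semaphore for write before checking that seqid, while OPEN,
which carries none, takes it for read. A minimal sketch of that
pattern, with simplified types and illustrative names rather than the
actual nfsd code:

	/* Sketch only -- simplified types/names, not the nfsd code. */
	struct ol_stateid {
		struct rw_semaphore rwsem;
		u32 seqid;
	};

	/* A seqid-carrying op: the write lock guarantees the seqid
	 * can't change between the check and the bump. */
	static int seqid_morph_op(struct ol_stateid *stp, u32 given)
	{
		down_write(&stp->rwsem);
		if (stp->seqid != given) {
			up_write(&stp->rwsem);
			return -EINVAL;		/* ~nfserr_old_stateid */
		}
		/* ... apply the operation ... */
		stp->seqid++;
		up_write(&stp->rwsem);
		return 0;
	}

	/* OPEN carries no seqid: take the lock for read so OPENs can
	 * run in parallel while still excluding the morphers above. */
	static void open_op(struct ol_stateid *stp)
	{
		down_read(&stp->rwsem);
		/* ... apply the open ... */
		stp->seqid++;	/* unserialized vs. other OPENs */
		up_read(&stp->rwsem);
	}

The read-side bump in OPEN is exactly what the review below picks at:
readers exclude the write-locked morphers, but not each other.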

Comments

J. Bruce Fields Sept. 29, 2015, 9:11 p.m. UTC | #1
On Thu, Sep 17, 2015 at 07:47:08AM -0400, Jeff Layton wrote:
> Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were
> running in parallel. The server would receive the OPEN_DOWNGRADE first
> and check its seqid, but then an OPEN would race in and bump it. The
> OPEN_DOWNGRADE would then complete and bump the seqid again.  The result
> was that the OPEN_DOWNGRADE would be applied after the OPEN, even though
> it should have been rejected since the seqid changed.
> 
> The only recourse we have here I think is to serialize operations that
> bump the seqid in a stateid, particularly when we're given a seqid in
> the call. To address this, we add a new rw_semaphore to the
> nfs4_ol_stateid struct. We do a down_write prior to checking the seqid
> after looking up the stateid to ensure that nothing else is going to
> bump it while we're operating on it.
> 
> In the case of OPEN, we do a down_read, as the call doesn't contain a
> seqid. Those can run in parallel -- we just need to serialize them when
> there is a concurrent OPEN_DOWNGRADE or CLOSE.
> 
> LOCK and LOCKU however always take the write lock as there is no
> opportunity for parallelizing those.
> 
> Reported-and-Tested-by: Andrew W Elble <aweits@rit.edu>
> Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> ---
>  fs/nfsd/nfs4state.c | 33 ++++++++++++++++++++++++++++-----
>  fs/nfsd/state.h     | 19 ++++++++++---------
>  2 files changed, 38 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 0f1d5691b795..1b39edf10b67 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -3360,6 +3360,7 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
>  	stp->st_access_bmap = 0;
>  	stp->st_deny_bmap = 0;
>  	stp->st_openstp = NULL;
> +	init_rwsem(&stp->st_rwsem);
>  	spin_lock(&oo->oo_owner.so_client->cl_lock);
>  	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
>  	spin_lock(&fp->fi_lock);
> @@ -4187,15 +4188,20 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
>  	 */
>  	if (stp) {
>  		/* Stateid was found, this is an OPEN upgrade */
> +		down_read(&stp->st_rwsem);
>  		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
> -		if (status)
> +		if (status) {
> +			up_read(&stp->st_rwsem);
>  			goto out;
> +		}
>  	} else {
>  		stp = open->op_stp;
>  		open->op_stp = NULL;
>  		init_open_stateid(stp, fp, open);
> +		down_read(&stp->st_rwsem);
>  		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
>  		if (status) {
> +			up_read(&stp->st_rwsem);
>  			release_open_stateid(stp);
>  			goto out;
>  		}
> @@ -4207,6 +4213,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
>  	}
>  	update_stateid(&stp->st_stid.sc_stateid);
>  	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
> +	up_read(&stp->st_rwsem);
>  
>  	if (nfsd4_has_session(&resp->cstate)) {
>  		if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {

The patch looks good, but:

Does it matter that we don't have an exclusive lock over that
update_stateid?

I think there's at least one small bug there:

	static inline void update_stateid(stateid_t *stateid)
	{       
	        stateid->si_generation++;
	        /* Wraparound recommendation from 3530bis-13 9.1.3.2: */
	        if (stateid->si_generation == 0)
	                stateid->si_generation = 1;
	}

The si_generation increment isn't atomic, and even if it were, the wraparound
handling definitely wouldn't be.  That's a pretty small race.
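
Concretely, two OPENs (each holding the semaphore only for read) can
interleave the unlocked increment and lose a bump:

	/* e.g., two OPENs racing on the same stateid, si_generation == 5: */
	CPU A: load  si_generation	(reads 5)
	CPU B: load  si_generation	(reads 5)
	CPU A: store si_generation = 6
	CPU B: store si_generation = 6	/* one bump lost */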

Does it also matter that this si_generation update isn't atomic with respect
to the actual open and upgrade of the share bits?

--b.

> @@ -4819,10 +4826,13 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
>  		 * revoked delegations are kept only for free_stateid.
>  		 */
>  		return nfserr_bad_stateid;
> +	down_write(&stp->st_rwsem);
>  	status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
> -	if (status)
> -		return status;
> -	return nfs4_check_fh(current_fh, &stp->st_stid);
> +	if (status == nfs_ok)
> +		status = nfs4_check_fh(current_fh, &stp->st_stid);
> +	if (status != nfs_ok)
> +		up_write(&stp->st_rwsem);
> +	return status;
>  }
>  
>  /* 
> @@ -4869,6 +4879,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
>  		return status;
>  	oo = openowner(stp->st_stateowner);
>  	if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
> +		up_write(&stp->st_rwsem);
>  		nfs4_put_stid(&stp->st_stid);
>  		return nfserr_bad_stateid;
>  	}
> @@ -4899,11 +4910,14 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  		goto out;
>  	oo = openowner(stp->st_stateowner);
>  	status = nfserr_bad_stateid;
> -	if (oo->oo_flags & NFS4_OO_CONFIRMED)
> +	if (oo->oo_flags & NFS4_OO_CONFIRMED) {
> +		up_write(&stp->st_rwsem);
>  		goto put_stateid;
> +	}
>  	oo->oo_flags |= NFS4_OO_CONFIRMED;
>  	update_stateid(&stp->st_stid.sc_stateid);
>  	memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
> +	up_write(&stp->st_rwsem);
>  	dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
>  		__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
>  
> @@ -4982,6 +4996,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
>  	memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
>  	status = nfs_ok;
>  put_stateid:
> +	up_write(&stp->st_rwsem);
>  	nfs4_put_stid(&stp->st_stid);
>  out:
>  	nfsd4_bump_seqid(cstate, status);
> @@ -5035,6 +5050,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  		goto out; 
>  	update_stateid(&stp->st_stid.sc_stateid);
>  	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
> +	up_write(&stp->st_rwsem);
>  
>  	nfsd4_close_open_stateid(stp);
>  
> @@ -5260,6 +5276,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
>  	stp->st_access_bmap = 0;
>  	stp->st_deny_bmap = open_stp->st_deny_bmap;
>  	stp->st_openstp = open_stp;
> +	init_rwsem(&stp->st_rwsem);
>  	list_add(&stp->st_locks, &open_stp->st_locks);
>  	list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
>  	spin_lock(&fp->fi_lock);
> @@ -5428,6 +5445,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  					&open_stp, nn);
>  		if (status)
>  			goto out;
> +		up_write(&open_stp->st_rwsem);
>  		open_sop = openowner(open_stp->st_stateowner);
>  		status = nfserr_bad_stateid;
>  		if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
> @@ -5435,6 +5453,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  			goto out;
>  		status = lookup_or_create_lock_state(cstate, open_stp, lock,
>  							&lock_stp, &new);
> +		if (status == nfs_ok)
> +			down_write(&lock_stp->st_rwsem);
>  	} else {
>  		status = nfs4_preprocess_seqid_op(cstate,
>  				       lock->lk_old_lock_seqid,
> @@ -5540,6 +5560,8 @@ out:
>  		    seqid_mutating_err(ntohl(status)))
>  			lock_sop->lo_owner.so_seqid++;
>  
> +		up_write(&lock_stp->st_rwsem);
> +
>  		/*
>  		 * If this is a new, never-before-used stateid, and we are
>  		 * returning an error, then just go ahead and release it.
> @@ -5709,6 +5731,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  fput:
>  	fput(filp);
>  put_stateid:
> +	up_write(&stp->st_rwsem);
>  	nfs4_put_stid(&stp->st_stid);
>  out:
>  	nfsd4_bump_seqid(cstate, status);
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 583ffc13cae2..31bde12feefe 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -534,15 +534,16 @@ struct nfs4_file {
>   * Better suggestions welcome.
>   */
>  struct nfs4_ol_stateid {
> -	struct nfs4_stid    st_stid; /* must be first field */
> -	struct list_head              st_perfile;
> -	struct list_head              st_perstateowner;
> -	struct list_head              st_locks;
> -	struct nfs4_stateowner      * st_stateowner;
> -	struct nfs4_clnt_odstate    * st_clnt_odstate;
> -	unsigned char                 st_access_bmap;
> -	unsigned char                 st_deny_bmap;
> -	struct nfs4_ol_stateid         * st_openstp;
> +	struct nfs4_stid		st_stid;
> +	struct list_head		st_perfile;
> +	struct list_head		st_perstateowner;
> +	struct list_head		st_locks;
> +	struct nfs4_stateowner		*st_stateowner;
> +	struct nfs4_clnt_odstate	*st_clnt_odstate;
> +	unsigned char			st_access_bmap;
> +	unsigned char			st_deny_bmap;
> +	struct nfs4_ol_stateid		*st_openstp;
> +	struct rw_semaphore		st_rwsem;
>  };
>  
>  static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
> -- 
> 2.4.3
Jeff Layton Sept. 29, 2015, 9:26 p.m. UTC | #2
On Tue, 29 Sep 2015 17:11:55 -0400
"J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Thu, Sep 17, 2015 at 07:47:08AM -0400, Jeff Layton wrote:
> > Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were
> > running in parallel. The server would receive the OPEN_DOWNGRADE first
> > and check its seqid, but then an OPEN would race in and bump it. The
> > OPEN_DOWNGRADE would then complete and bump the seqid again.  The result
> > was that the OPEN_DOWNGRADE would be applied after the OPEN, even though
> > it should have been rejected since the seqid changed.
> > 
> > The only recourse we have here I think is to serialize operations that
> > bump the seqid in a stateid, particularly when we're given a seqid in
> > the call. To address this, we add a new rw_semaphore to the
> > nfs4_ol_stateid struct. We do a down_write prior to checking the seqid
> > after looking up the stateid to ensure that nothing else is going to
> > bump it while we're operating on it.
> > 
> > In the case of OPEN, we do a down_read, as the call doesn't contain a
> > seqid. Those can run in parallel -- we just need to serialize them when
> > there is a concurrent OPEN_DOWNGRADE or CLOSE.
> > 
> > LOCK and LOCKU however always take the write lock as there is no
> > opportunity for parallelizing those.
> > 
> > Reported-and-Tested-by: Andrew W Elble <aweits@rit.edu>
> > Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> > ---
> >  fs/nfsd/nfs4state.c | 33 ++++++++++++++++++++++++++++-----
> >  fs/nfsd/state.h     | 19 ++++++++++---------
> >  2 files changed, 38 insertions(+), 14 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index 0f1d5691b795..1b39edf10b67 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -3360,6 +3360,7 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
> >  	stp->st_access_bmap = 0;
> >  	stp->st_deny_bmap = 0;
> >  	stp->st_openstp = NULL;
> > +	init_rwsem(&stp->st_rwsem);
> >  	spin_lock(&oo->oo_owner.so_client->cl_lock);
> >  	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
> >  	spin_lock(&fp->fi_lock);
> > @@ -4187,15 +4188,20 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
> >  	 */
> >  	if (stp) {
> >  		/* Stateid was found, this is an OPEN upgrade */
> > +		down_read(&stp->st_rwsem);
> >  		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
> > -		if (status)
> > +		if (status) {
> > +			up_read(&stp->st_rwsem);
> >  			goto out;
> > +		}
> >  	} else {
> >  		stp = open->op_stp;
> >  		open->op_stp = NULL;
> >  		init_open_stateid(stp, fp, open);
> > +		down_read(&stp->st_rwsem);
> >  		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
> >  		if (status) {
> > +			up_read(&stp->st_rwsem);
> >  			release_open_stateid(stp);
> >  			goto out;
> >  		}
> > @@ -4207,6 +4213,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
> >  	}
> >  	update_stateid(&stp->st_stid.sc_stateid);
> >  	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
> > +	up_read(&stp->st_rwsem);
> >  
> >  	if (nfsd4_has_session(&resp->cstate)) {
> >  		if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
> 
> The patch looks good, but:
> 
> Does it matter that we don't have an exclusive lock over that
> update_stateid?
> 
> I think there's at least one small bug there:
> 
> 	static inline void update_stateid(stateid_t *stateid)
> 	{       
> 	        stateid->si_generation++;
> 	        /* Wraparound recommendation from 3530bis-13 9.1.3.2: */
> 	        if (stateid->si_generation == 0)
> 	                stateid->si_generation = 1;
> 	}
> 
> The si_generation increment isn't atomic, and even if it were, the wraparound
> handling definitely wouldn't be.  That's a pretty small race.
> 

Yeah, I eyeballed that some time ago and convinced myself it was ok,
but I think you're right that there is a potential race there. That
counter sort of seems like something that ought to use atomics in some
fashion. The wraparound is tricky, but could be done locklessly with
cmpxchg, I think...
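
A lockless bump along those lines might look something like this
(just a sketch -- the helper name is made up, and it assumes
si_generation can safely be updated with cmpxchg):

	static inline void update_stateid_cmpxchg(stateid_t *stateid)
	{
		u32 old, new;

		do {
			old = READ_ONCE(stateid->si_generation);
			new = old + 1;
			/* Wraparound recommendation from 3530bis-13 9.1.3.2: */
			if (new == 0)
				new = 1;
		} while (cmpxchg(&stateid->si_generation, old, new) != old);
	}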

> Does it also matter that this si_generation update isn't atomic with respect
> to the actual open and upgrade of the share bits?
> 

I don't think so. If you race with a concurrent OPEN upgrade, the
server will either have what you requested or a superset of that. It's
only OPEN_DOWNGRADE and CLOSE that need full serialization.

> --b.
> 
> > @@ -4819,10 +4826,13 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
> >  		 * revoked delegations are kept only for free_stateid.
> >  		 */
> >  		return nfserr_bad_stateid;
> > +	down_write(&stp->st_rwsem);
> >  	status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
> > -	if (status)
> > -		return status;
> > -	return nfs4_check_fh(current_fh, &stp->st_stid);
> > +	if (status == nfs_ok)
> > +		status = nfs4_check_fh(current_fh, &stp->st_stid);
> > +	if (status != nfs_ok)
> > +		up_write(&stp->st_rwsem);
> > +	return status;
> >  }
> >  
> >  /* 
> > @@ -4869,6 +4879,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
> >  		return status;
> >  	oo = openowner(stp->st_stateowner);
> >  	if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
> > +		up_write(&stp->st_rwsem);
> >  		nfs4_put_stid(&stp->st_stid);
> >  		return nfserr_bad_stateid;
> >  	}
> > @@ -4899,11 +4910,14 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >  		goto out;
> >  	oo = openowner(stp->st_stateowner);
> >  	status = nfserr_bad_stateid;
> > -	if (oo->oo_flags & NFS4_OO_CONFIRMED)
> > +	if (oo->oo_flags & NFS4_OO_CONFIRMED) {
> > +		up_write(&stp->st_rwsem);
> >  		goto put_stateid;
> > +	}
> >  	oo->oo_flags |= NFS4_OO_CONFIRMED;
> >  	update_stateid(&stp->st_stid.sc_stateid);
> >  	memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
> > +	up_write(&stp->st_rwsem);
> >  	dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
> >  		__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
> >  
> > @@ -4982,6 +4996,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
> >  	memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
> >  	status = nfs_ok;
> >  put_stateid:
> > +	up_write(&stp->st_rwsem);
> >  	nfs4_put_stid(&stp->st_stid);
> >  out:
> >  	nfsd4_bump_seqid(cstate, status);
> > @@ -5035,6 +5050,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >  		goto out; 
> >  	update_stateid(&stp->st_stid.sc_stateid);
> >  	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
> > +	up_write(&stp->st_rwsem);
> >  
> >  	nfsd4_close_open_stateid(stp);
> >  
> > @@ -5260,6 +5276,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
> >  	stp->st_access_bmap = 0;
> >  	stp->st_deny_bmap = open_stp->st_deny_bmap;
> >  	stp->st_openstp = open_stp;
> > +	init_rwsem(&stp->st_rwsem);
> >  	list_add(&stp->st_locks, &open_stp->st_locks);
> >  	list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
> >  	spin_lock(&fp->fi_lock);
> > @@ -5428,6 +5445,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >  					&open_stp, nn);
> >  		if (status)
> >  			goto out;
> > +		up_write(&open_stp->st_rwsem);
> >  		open_sop = openowner(open_stp->st_stateowner);
> >  		status = nfserr_bad_stateid;
> >  		if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
> > @@ -5435,6 +5453,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >  			goto out;
> >  		status = lookup_or_create_lock_state(cstate, open_stp, lock,
> >  							&lock_stp, &new);
> > +		if (status == nfs_ok)
> > +			down_write(&lock_stp->st_rwsem);
> >  	} else {
> >  		status = nfs4_preprocess_seqid_op(cstate,
> >  				       lock->lk_old_lock_seqid,
> > @@ -5540,6 +5560,8 @@ out:
> >  		    seqid_mutating_err(ntohl(status)))
> >  			lock_sop->lo_owner.so_seqid++;
> >  
> > +		up_write(&lock_stp->st_rwsem);
> > +
> >  		/*
> >  		 * If this is a new, never-before-used stateid, and we are
> >  		 * returning an error, then just go ahead and release it.
> > @@ -5709,6 +5731,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >  fput:
> >  	fput(filp);
> >  put_stateid:
> > +	up_write(&stp->st_rwsem);
> >  	nfs4_put_stid(&stp->st_stid);
> >  out:
> >  	nfsd4_bump_seqid(cstate, status);
> > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > index 583ffc13cae2..31bde12feefe 100644
> > --- a/fs/nfsd/state.h
> > +++ b/fs/nfsd/state.h
> > @@ -534,15 +534,16 @@ struct nfs4_file {
> >   * Better suggestions welcome.
> >   */
> >  struct nfs4_ol_stateid {
> > -	struct nfs4_stid    st_stid; /* must be first field */
> > -	struct list_head              st_perfile;
> > -	struct list_head              st_perstateowner;
> > -	struct list_head              st_locks;
> > -	struct nfs4_stateowner      * st_stateowner;
> > -	struct nfs4_clnt_odstate    * st_clnt_odstate;
> > -	unsigned char                 st_access_bmap;
> > -	unsigned char                 st_deny_bmap;
> > -	struct nfs4_ol_stateid         * st_openstp;
> > +	struct nfs4_stid		st_stid;
> > +	struct list_head		st_perfile;
> > +	struct list_head		st_perstateowner;
> > +	struct list_head		st_locks;
> > +	struct nfs4_stateowner		*st_stateowner;
> > +	struct nfs4_clnt_odstate	*st_clnt_odstate;
> > +	unsigned char			st_access_bmap;
> > +	unsigned char			st_deny_bmap;
> > +	struct nfs4_ol_stateid		*st_openstp;
> > +	struct rw_semaphore		st_rwsem;
> >  };
> >  
> >  static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
> > -- 
> > 2.4.3
J. Bruce Fields Sept. 29, 2015, 11:14 p.m. UTC | #3
On Tue, Sep 29, 2015 at 05:26:38PM -0400, Jeff Layton wrote:
> On Tue, 29 Sep 2015 17:11:55 -0400
> "J. Bruce Fields" <bfields@fieldses.org> wrote:
> 
> > On Thu, Sep 17, 2015 at 07:47:08AM -0400, Jeff Layton wrote:
> > > Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were
> > > running in parallel. The server would receive the OPEN_DOWNGRADE first
> > > and check its seqid, but then an OPEN would race in and bump it. The
> > > OPEN_DOWNGRADE would then complete and bump the seqid again.  The result
> > > was that the OPEN_DOWNGRADE would be applied after the OPEN, even though
> > > it should have been rejected since the seqid changed.
> > > 
> > > The only recourse we have here I think is to serialize operations that
> > > bump the seqid in a stateid, particularly when we're given a seqid in
> > > the call. To address this, we add a new rw_semaphore to the
> > > nfs4_ol_stateid struct. We do a down_write prior to checking the seqid
> > > after looking up the stateid to ensure that nothing else is going to
> > > bump it while we're operating on it.
> > > 
> > > In the case of OPEN, we do a down_read, as the call doesn't contain a
> > > seqid. Those can run in parallel -- we just need to serialize them when
> > > there is a concurrent OPEN_DOWNGRADE or CLOSE.
> > > 
> > > LOCK and LOCKU however always take the write lock as there is no
> > > opportunity for parallelizing those.
> > > 
> > > Reported-and-Tested-by: Andrew W Elble <aweits@rit.edu>
> > > Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> > > ---
> > >  fs/nfsd/nfs4state.c | 33 ++++++++++++++++++++++++++++-----
> > >  fs/nfsd/state.h     | 19 ++++++++++---------
> > >  2 files changed, 38 insertions(+), 14 deletions(-)
> > > 
> > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > index 0f1d5691b795..1b39edf10b67 100644
> > > --- a/fs/nfsd/nfs4state.c
> > > +++ b/fs/nfsd/nfs4state.c
> > > @@ -3360,6 +3360,7 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
> > >  	stp->st_access_bmap = 0;
> > >  	stp->st_deny_bmap = 0;
> > >  	stp->st_openstp = NULL;
> > > +	init_rwsem(&stp->st_rwsem);
> > >  	spin_lock(&oo->oo_owner.so_client->cl_lock);
> > >  	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
> > >  	spin_lock(&fp->fi_lock);
> > > @@ -4187,15 +4188,20 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
> > >  	 */
> > >  	if (stp) {
> > >  		/* Stateid was found, this is an OPEN upgrade */
> > > +		down_read(&stp->st_rwsem);
> > >  		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
> > > -		if (status)
> > > +		if (status) {
> > > +			up_read(&stp->st_rwsem);
> > >  			goto out;
> > > +		}
> > >  	} else {
> > >  		stp = open->op_stp;
> > >  		open->op_stp = NULL;
> > >  		init_open_stateid(stp, fp, open);
> > > +		down_read(&stp->st_rwsem);
> > >  		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
> > >  		if (status) {
> > > +			up_read(&stp->st_rwsem);
> > >  			release_open_stateid(stp);
> > >  			goto out;
> > >  		}
> > > @@ -4207,6 +4213,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
> > >  	}
> > >  	update_stateid(&stp->st_stid.sc_stateid);
> > >  	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
> > > +	up_read(&stp->st_rwsem);
> > >  
> > >  	if (nfsd4_has_session(&resp->cstate)) {
> > >  		if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
> > 
> > The patch looks good, but:
> > 
> > Does it matter that we don't have an exclusive lock over that
> > update_stateid?
> > 
> > I think there's at least one small bug there:
> > 
> > 	static inline void update_stateid(stateid_t *stateid)
> > 	{       
> > 	        stateid->si_generation++;
> > 	        /* Wraparound recommendation from 3530bis-13 9.1.3.2: */
> > 	        if (stateid->si_generation == 0)
> > 	                stateid->si_generation = 1;
> > 	}
> > 
> > The si_generation increment isn't atomic, and even if it were, the wraparound
> > handling definitely wouldn't be.  That's a pretty small race.
> > 
> 
> Yeah, I eyeballed that some time ago and convinced myself it was ok,
> but I think you're right that there is a potential race there. That
> counter sort of seems like something that ought to use atomics in some
> fashion. The wraparound is tricky, but could be done locklessly with
> cmpxchg, I think...
> > Does it also matter that this si_generation update isn't atomic with respect
> > to the actual open and upgrade of the share bits?
> > 
> 
> I don't think so. If you race with a concurrent OPEN upgrade, the
> server will either have what you requested or a superset of that. It's
> only OPEN_DOWNGRADE and CLOSE that need full serialization.

IO also takes stateids, and I think it'd be interesting to think about
scenarios that involve concurrent opens and reads or writes.  For
example:

	- process_open2 upgrades R to RW

				- Write processed with si_generation=1

	- process_open2 bumps si_generation

The write succeeds, but it was performed with a stateid that represented
a read-only open.  I believe that write should have gotten either
OPENMODE (if it happened before the open) or OLD_STATEID (if it happened
after).

I don't know if that's actually a problem in practice.

Hm, I also wonder about the window between the si_generation bump and
memcpy.  Multiple concurrent opens could end up returning the same
stateid:

	- si_generation++
				- si_generation++

	- memcpy new stateid
				- memcpy new stateid

Again, I'm not sure if that will actually cause application-visible
problems.

--b.
Jeff Layton Sept. 30, 2015, 10:53 a.m. UTC | #4
On Tue, 29 Sep 2015 19:14:27 -0400
"J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Tue, Sep 29, 2015 at 05:26:38PM -0400, Jeff Layton wrote:
> > On Tue, 29 Sep 2015 17:11:55 -0400
> > "J. Bruce Fields" <bfields@fieldses.org> wrote:
> > 
> > > On Thu, Sep 17, 2015 at 07:47:08AM -0400, Jeff Layton wrote:
> > > > Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were
> > > > running in parallel. The server would receive the OPEN_DOWNGRADE first
> > > > and check its seqid, but then an OPEN would race in and bump it. The
> > > > OPEN_DOWNGRADE would then complete and bump the seqid again.  The result
> > > > was that the OPEN_DOWNGRADE would be applied after the OPEN, even though
> > > > it should have been rejected since the seqid changed.
> > > > 
> > > > The only recourse we have here I think is to serialize operations that
> > > > bump the seqid in a stateid, particularly when we're given a seqid in
> > > > the call. To address this, we add a new rw_semaphore to the
> > > > nfs4_ol_stateid struct. We do a down_write prior to checking the seqid
> > > > after looking up the stateid to ensure that nothing else is going to
> > > > bump it while we're operating on it.
> > > > 
> > > > In the case of OPEN, we do a down_read, as the call doesn't contain a
> > > > seqid. Those can run in parallel -- we just need to serialize them when
> > > > there is a concurrent OPEN_DOWNGRADE or CLOSE.
> > > > 
> > > > LOCK and LOCKU however always take the write lock as there is no
> > > > opportunity for parallelizing those.
> > > > 
> > > > Reported-and-Tested-by: Andrew W Elble <aweits@rit.edu>
> > > > Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> > > > ---
> > > >  fs/nfsd/nfs4state.c | 33 ++++++++++++++++++++++++++++-----
> > > >  fs/nfsd/state.h     | 19 ++++++++++---------
> > > >  2 files changed, 38 insertions(+), 14 deletions(-)
> > > > 
> > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > index 0f1d5691b795..1b39edf10b67 100644
> > > > --- a/fs/nfsd/nfs4state.c
> > > > +++ b/fs/nfsd/nfs4state.c
> > > > @@ -3360,6 +3360,7 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
> > > >  	stp->st_access_bmap = 0;
> > > >  	stp->st_deny_bmap = 0;
> > > >  	stp->st_openstp = NULL;
> > > > +	init_rwsem(&stp->st_rwsem);
> > > >  	spin_lock(&oo->oo_owner.so_client->cl_lock);
> > > >  	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
> > > >  	spin_lock(&fp->fi_lock);
> > > > @@ -4187,15 +4188,20 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
> > > >  	 */
> > > >  	if (stp) {
> > > >  		/* Stateid was found, this is an OPEN upgrade */
> > > > +		down_read(&stp->st_rwsem);
> > > >  		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
> > > > -		if (status)
> > > > +		if (status) {
> > > > +			up_read(&stp->st_rwsem);
> > > >  			goto out;
> > > > +		}
> > > >  	} else {
> > > >  		stp = open->op_stp;
> > > >  		open->op_stp = NULL;
> > > >  		init_open_stateid(stp, fp, open);
> > > > +		down_read(&stp->st_rwsem);
> > > >  		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
> > > >  		if (status) {
> > > > +			up_read(&stp->st_rwsem);
> > > >  			release_open_stateid(stp);
> > > >  			goto out;
> > > >  		}
> > > > @@ -4207,6 +4213,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
> > > >  	}
> > > >  	update_stateid(&stp->st_stid.sc_stateid);
> > > >  	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
> > > > +	up_read(&stp->st_rwsem);
> > > >  
> > > >  	if (nfsd4_has_session(&resp->cstate)) {
> > > >  		if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
> > > 
> > > The patch looks good, but:
> > > 
> > > Does it matter that we don't have an exclusive lock over that
> > > update_stateid?
> > > 
> > > I think there's at least one small bug there:
> > > 
> > > 	static inline void update_stateid(stateid_t *stateid)
> > > 	{       
> > > 	        stateid->si_generation++;
> > > 	        /* Wraparound recommendation from 3530bis-13 9.1.3.2: */
> > > 	        if (stateid->si_generation == 0)
> > > 	                stateid->si_generation = 1;
> > > 	}
> > > 
> > > The si_generation increment isn't atomic, and even if it were, the wraparound
> > > handling definitely wouldn't be.  That's a pretty small race.
> > > 
> > 
> > Yeah, I eyeballed that some time ago and convinced myself it was ok,
> > but I think you're right that there is a potential race there. That
> > counter sort of seems like something that ought to use atomics in some
> > fashion. The wraparound is tricky, but could be done locklessly with
> > cmpxchg, I think...
> > > Does it also matter that this si_generation update isn't atomic with respect
> > > to the actual open and upgrade of the share bits?
> > > 
> > 
> > I don't think so. If you race with a concurrent OPEN upgrade, the
> > server will either have what you requested or a superset of that. It's
> > only OPEN_DOWNGRADE and CLOSE that need full serialization.
> 
> IO also takes stateids, and I think it'd be interesting to think about
> scenarios that involve concurrent opens and reads or writes.  For
> example:
> 
> 	- process_open2 upgrades R to RW
> 
> 				- Write processed with si_generation=1
> 
> 	- process_open2 bumps si_generation
> 
> The write succeeds, but it was performed with a stateid that represented
> a read-only open.  I believe that write should have gotten either
> OPENMODE (if it happened before the open) or OLD_STATEID (if it happened
> after).
> 
> I don't know if that's actually a problem in practice.
> 

That's a different issue altogether. We're not serializing anything but
operations that morph the seqid in this patch, and WRITE doesn't do
that.

If we need to hold the seqid static over the life of operations that
happened to provide that seqid then that's a much larger effort, and
probably not suitable for a rw semaphore. You'd need to allow for a 3rd
class of "locker" that is able to operate in parallel with one another
but that prevents any changes to the seqid.

> Hm, I also wonder about the window between the si_generation bump and
> memcpy.  Multiple concurrent opens could end up returning the same
> stateid:
> 
> 	- si_generation++
> 				- si_generation++
> 
> 	- memcpy new stateid
> 				- memcpy new stateid
> 
> Again, I'm not sure if that will actually cause application-visible
> problems.
> 

That could be a problem.

One thing to notice is that update_stateid is always followed by the
memcpy of that stateid. Perhaps we should roll the two together --
update the stateid and do the memcpy under a spinlock or something.
That would also make it trivial to fix the wraparound problem.

Do you think we could get away with a global spinlock for that, or do
we need to consider adding one to the nfs4_stid?
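
Something like this, say (a sketch assuming we add a spinlock --
sc_lock here -- to struct nfs4_stid; names illustrative):

	static void nfs4_inc_and_copy_stateid(stateid_t *dst,
					      struct nfs4_stid *stid)
	{
		spin_lock(&stid->sc_lock);
		update_stateid(&stid->sc_stateid);
		memcpy(dst, &stid->sc_stateid, sizeof(*dst));
		spin_unlock(&stid->sc_lock);
	}

That would make the bump and the copy-out atomic with respect to each
other, and the wraparound check in update_stateid would be covered by
the same lock as a side effect.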
J. Bruce Fields Sept. 30, 2015, 2:30 p.m. UTC | #5
On Wed, Sep 30, 2015 at 06:53:38AM -0400, Jeff Layton wrote:
> On Tue, 29 Sep 2015 19:14:27 -0400
> "J. Bruce Fields" <bfields@fieldses.org> wrote:
> 
> > On Tue, Sep 29, 2015 at 05:26:38PM -0400, Jeff Layton wrote:
> > > On Tue, 29 Sep 2015 17:11:55 -0400
> > > "J. Bruce Fields" <bfields@fieldses.org> wrote:
> > > 
> > > > On Thu, Sep 17, 2015 at 07:47:08AM -0400, Jeff Layton wrote:
> > > > > Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were
> > > > > running in parallel. The server would receive the OPEN_DOWNGRADE first
> > > > > and check its seqid, but then an OPEN would race in and bump it. The
> > > > > OPEN_DOWNGRADE would then complete and bump the seqid again.  The result
> > > > > was that the OPEN_DOWNGRADE would be applied after the OPEN, even though
> > > > > it should have been rejected since the seqid changed.
> > > > > 
> > > > > The only recourse we have here I think is to serialize operations that
> > > > > bump the seqid in a stateid, particularly when we're given a seqid in
> > > > > the call. To address this, we add a new rw_semaphore to the
> > > > > nfs4_ol_stateid struct. We do a down_write prior to checking the seqid
> > > > > after looking up the stateid to ensure that nothing else is going to
> > > > > bump it while we're operating on it.
> > > > > 
> > > > > In the case of OPEN, we do a down_read, as the call doesn't contain a
> > > > > seqid. Those can run in parallel -- we just need to serialize them when
> > > > > there is a concurrent OPEN_DOWNGRADE or CLOSE.
> > > > > 
> > > > > LOCK and LOCKU however always take the write lock as there is no
> > > > > opportunity for parallelizing those.
> > > > > 
> > > > > Reported-and-Tested-by: Andrew W Elble <aweits@rit.edu>
> > > > > Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> > > > > ---
> > > > >  fs/nfsd/nfs4state.c | 33 ++++++++++++++++++++++++++++-----
> > > > >  fs/nfsd/state.h     | 19 ++++++++++---------
> > > > >  2 files changed, 38 insertions(+), 14 deletions(-)
> > > > > 
> > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > index 0f1d5691b795..1b39edf10b67 100644
> > > > > --- a/fs/nfsd/nfs4state.c
> > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > @@ -3360,6 +3360,7 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
> > > > >  	stp->st_access_bmap = 0;
> > > > >  	stp->st_deny_bmap = 0;
> > > > >  	stp->st_openstp = NULL;
> > > > > +	init_rwsem(&stp->st_rwsem);
> > > > >  	spin_lock(&oo->oo_owner.so_client->cl_lock);
> > > > >  	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
> > > > >  	spin_lock(&fp->fi_lock);
> > > > > @@ -4187,15 +4188,20 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
> > > > >  	 */
> > > > >  	if (stp) {
> > > > >  		/* Stateid was found, this is an OPEN upgrade */
> > > > > +		down_read(&stp->st_rwsem);
> > > > >  		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
> > > > > -		if (status)
> > > > > +		if (status) {
> > > > > +			up_read(&stp->st_rwsem);
> > > > >  			goto out;
> > > > > +		}
> > > > >  	} else {
> > > > >  		stp = open->op_stp;
> > > > >  		open->op_stp = NULL;
> > > > >  		init_open_stateid(stp, fp, open);
> > > > > +		down_read(&stp->st_rwsem);
> > > > >  		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
> > > > >  		if (status) {
> > > > > +			up_read(&stp->st_rwsem);
> > > > >  			release_open_stateid(stp);
> > > > >  			goto out;
> > > > >  		}
> > > > > @@ -4207,6 +4213,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
> > > > >  	}
> > > > >  	update_stateid(&stp->st_stid.sc_stateid);
> > > > >  	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
> > > > > +	up_read(&stp->st_rwsem);
> > > > >  
> > > > >  	if (nfsd4_has_session(&resp->cstate)) {
> > > > >  		if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
> > > > 
> > > > The patch looks good, but:
> > > > 
> > > > Does it matter that we don't have an exclusive lock over that
> > > > update_stateid?
> > > > 
> > > > I think there's at least one small bug there:
> > > > 
> > > > 	static inline void update_stateid(stateid_t *stateid)
> > > > 	{       
> > > > 	        stateid->si_generation++;
> > > > 	        /* Wraparound recommendation from 3530bis-13 9.1.3.2: */
> > > > 	        if (stateid->si_generation == 0)
> > > > 	                stateid->si_generation = 1;
> > > > 	}
> > > > 
> > > > The si_generation increment isn't atomic, and even if it were, the wraparound
> > > > handling definitely wouldn't be.  That's a pretty small race.
> > > > 
> > > 
> > > Yeah, I eyeballed that some time ago and convinced myself it was ok,
> > > but I think you're right that there is a potential race there. That
> > > counter sort of seems like something that ought to use atomics in some
> > > fashion. The wraparound is tricky, but could be done locklessly with
> > > cmpxchg, I think...
> > > > Does it also matter that this si_generation update isn't atomic with respect
> > > > to the actual open and upgrade of the share bits?
> > > > 
> > > 
> > > I don't think so. If you race with a concurrent OPEN upgrade, the
> > > server will either have what you requested or a superset of that. It's
> > > only OPEN_DOWNGRADE and CLOSE that need full serialization.
> > 
> > IO also takes stateids, and I think it'd be interesting to think about
> > scenarios that involve concurrent opens and reads or writes.  For
> > example:
> > 
> > 	- process_open2 upgrades R to RW
> > 
> > 				- Write processed with si_generation=1
> > 
> > 	- process_open2 bumps si_generation
> > 
> > The write succeeds, but it was performed with a stateid that represented
> > a read-only open.  I believe that write should have gotten either
> > OPENMODE (if it happened before the open) or OLD_STATEID (if it happened
> > after).
> > 
> > I don't know if that's actually a problem in practice.
> > 
> 
> That's a different issue altogether. We're not serializing anything but
> operations that morph the seqid in this patch, and WRITE doesn't do
> that.

Sure.  And this patch is an improvement on its own and should probably
be applied as is.  I just couldn't help wondering about this other stuff
while I was looking at this part of the code.

> If we need to hold the seqid static over the life of operations that
> happened to provide that seqid then that's a much larger effort, and
> probably not suitable for a rw semaphore. You'd need to allow for a 3rd
> class of "locker" that is able to operate in parallel with one another
> but that prevents any changes to the seqid.
> 
> > Hm, I also wonder about the window between the si_generation bump and
> > memcpy.  Multiple concurrent opens could end up returning the same
> > stateid:
> > 
> > 	- si_generation++
> > 				- si_generation++
> > 
> > 	- memcpy new stateid
> > 				- memcpy new stateid
> > 
> > Again, I'm not sure if that will actually cause application-visible
> > problems.
> > 
> 
> That could be a problem.
> 
> One thing to notice is that update_stateid is always followed by the
> memcpy of that stateid. Perhaps we should roll the two together --
> update the stateid and do the memcpy under a spinlock or something.
> That would also make it trivial to fix the wraparound problem.

Sounds reasonable.

> Do you think we could get away with a global spinlock for that, or do
> we need to consider adding one to the nfs4_stid?

It'd be easy enough to do, wouldn't it?

I mean, really making open code scale sounds like a project for another
day, but glomming locks together also seems easier than splitting them
apart, so I'd rather start with the finer locking and then fix it later
if somebody decides it's not optimal.

--b.
Jeff Layton Sept. 30, 2015, 2:35 p.m. UTC | #6
On Wed, 30 Sep 2015 10:30:49 -0400
"J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Wed, Sep 30, 2015 at 06:53:38AM -0400, Jeff Layton wrote:
> > On Tue, 29 Sep 2015 19:14:27 -0400
> > "J. Bruce Fields" <bfields@fieldses.org> wrote:
> > 
> > > On Tue, Sep 29, 2015 at 05:26:38PM -0400, Jeff Layton wrote:
> > > > On Tue, 29 Sep 2015 17:11:55 -0400
> > > > "J. Bruce Fields" <bfields@fieldses.org> wrote:
> > > > 
> > > > > On Thu, Sep 17, 2015 at 07:47:08AM -0400, Jeff Layton wrote:
> > > > > > Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were
> > > > > > running in parallel. The server would receive the OPEN_DOWNGRADE first
> > > > > > and check its seqid, but then an OPEN would race in and bump it. The
> > > > > > OPEN_DOWNGRADE would then complete and bump the seqid again.  The result
> > > > > > was that the OPEN_DOWNGRADE would be applied after the OPEN, even though
> > > > > > it should have been rejected since the seqid changed.
> > > > > > 
> > > > > > The only recourse we have here I think is to serialize operations that
> > > > > > bump the seqid in a stateid, particularly when we're given a seqid in
> > > > > > the call. To address this, we add a new rw_semaphore to the
> > > > > > nfs4_ol_stateid struct. We do a down_write prior to checking the seqid
> > > > > > after looking up the stateid to ensure that nothing else is going to
> > > > > > bump it while we're operating on it.
> > > > > > 
> > > > > > In the case of OPEN, we do a down_read, as the call doesn't contain a
> > > > > > seqid. Those can run in parallel -- we just need to serialize them when
> > > > > > there is a concurrent OPEN_DOWNGRADE or CLOSE.
> > > > > > 
> > > > > > LOCK and LOCKU however always take the write lock as there is no
> > > > > > opportunity for parallelizing those.
> > > > > > 
> > > > > > Reported-and-Tested-by: Andrew W Elble <aweits@rit.edu>
> > > > > > Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> > > > > > ---
> > > > > >  fs/nfsd/nfs4state.c | 33 ++++++++++++++++++++++++++++-----
> > > > > >  fs/nfsd/state.h     | 19 ++++++++++---------
> > > > > >  2 files changed, 38 insertions(+), 14 deletions(-)
> > > > > > 
> > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > > index 0f1d5691b795..1b39edf10b67 100644
> > > > > > --- a/fs/nfsd/nfs4state.c
> > > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > > @@ -3360,6 +3360,7 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
> > > > > >  	stp->st_access_bmap = 0;
> > > > > >  	stp->st_deny_bmap = 0;
> > > > > >  	stp->st_openstp = NULL;
> > > > > > +	init_rwsem(&stp->st_rwsem);
> > > > > >  	spin_lock(&oo->oo_owner.so_client->cl_lock);
> > > > > >  	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
> > > > > >  	spin_lock(&fp->fi_lock);
> > > > > > @@ -4187,15 +4188,20 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
> > > > > >  	 */
> > > > > >  	if (stp) {
> > > > > >  		/* Stateid was found, this is an OPEN upgrade */
> > > > > > +		down_read(&stp->st_rwsem);
> > > > > >  		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
> > > > > > -		if (status)
> > > > > > +		if (status) {
> > > > > > +			up_read(&stp->st_rwsem);
> > > > > >  			goto out;
> > > > > > +		}
> > > > > >  	} else {
> > > > > >  		stp = open->op_stp;
> > > > > >  		open->op_stp = NULL;
> > > > > >  		init_open_stateid(stp, fp, open);
> > > > > > +		down_read(&stp->st_rwsem);
> > > > > >  		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
> > > > > >  		if (status) {
> > > > > > +			up_read(&stp->st_rwsem);
> > > > > >  			release_open_stateid(stp);
> > > > > >  			goto out;
> > > > > >  		}
> > > > > > @@ -4207,6 +4213,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
> > > > > >  	}
> > > > > >  	update_stateid(&stp->st_stid.sc_stateid);
> > > > > >  	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
> > > > > > +	up_read(&stp->st_rwsem);
> > > > > >  
> > > > > >  	if (nfsd4_has_session(&resp->cstate)) {
> > > > > >  		if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
> > > > > 
> > > > > The patch looks good, but:
> > > > > 
> > > > > Does it matter that we don't have an exclusive lock over that
> > > > > update_stateid?
> > > > > 
> > > > > I think there's at least one small bug there:
> > > > > 
> > > > > 	static inline void update_stateid(stateid_t *stateid)
> > > > > 	{       
> > > > > 	        stateid->si_generation++;
> > > > > 	        /* Wraparound recommendation from 3530bis-13 9.1.3.2: */
> > > > > 	        if (stateid->si_generation == 0)
> > > > > 	                stateid->si_generation = 1;
> > > > > 	}
> > > > > 
> > > > > The si_generation increment isn't atomic, and even if it were, the wraparound
> > > > > handling definitely wouldn't be.  That's a pretty small race.
> > > > > 
> > > > 
> > > > Yeah, I eyeballed that some time ago and convinced myself it was ok,
> > > > but I think you're right that there is a potential race there. That
> > > > counter sort of seems like something that ought to use atomics in some
> > > > fashion. The wraparound is tricky, but could be done locklessly with
> > > > cmpxchg, I think...
> > > > > Does it also matter that this si_generation update isn't atomic with respect
> > > > > to the actual open and upgrade of the share bits?
> > > > > 
> > > > 
> > > > I don't think so. If you race with a concurrent OPEN upgrade, the
> > > > server will either have what you requested or a superset of that. It's
> > > > only OPEN_DOWNGRADE and CLOSE that need full serialization.
> > > 
> > > IO also takes stateids, and I think it'd be interesting to think about
> > > scenarios that involve concurrent opens and reads or writes.  For
> > > example:
> > > 
> > > 	- process_open2 upgrades R to RW
> > > 
> > > 				- Write processed with si_generation=1
> > > 
> > > 	- process_open2 bumps si_generation
> > > 
> > > The write succeeds, but it was performed with a stateid that represented
> > > a read-only open.  I believe that write should have gotten either
> > > OPENMODE (if it happened before the open) or OLD_STATEID (if it happened
> > > after).
> > > 
> > > I don't know if that's actually a problem in practice.
> > > 
> > 
> > That's a different issue altogether. We're not serializing anything but
> > operations that morph the seqid in this patch, and WRITE doesn't do
> > that.
> 
> Sure.  And this patch is an improvement on its own and should probably
> be applied as is.  I just couldn't help wondering about this other stuff
> while I was looking at this part of the code.
> 
> > If we need to hold the seqid static over the life of operations that
> > happened to provide that seqid then that's a much larger effort, and
> > probably not suitable for a rw semaphore. You'd need to allow for a 3rd
> > class of "locker" that is able to operate in parallel with one another
> > but that prevents any changes to the seqid.
> > 
> > > Hm, I also wonder about the window between the si_generation bump and
> > > memcpy.  Multiple concurrent opens could end up returning the same
> > > stateid:
> > > 
> > > 	- si_generation++
> > > 				- si_generation++
> > > 
> > > 	- memcpy new stateid
> > > 				- memcpy new stateid
> > > 
> > > Again, I'm not sure if that will actually cause application-visible
> > > problems.
> > > 
> > 
> > That could be a problem.
> > 
> > One thing to notice is that update_stateid is always followed by the
> > memcpy of that stateid. Perhaps we should roll the two together --
> > update the stateid and do the memcpy under a spinlock or something.
> > That would also make it trivial to fix the wraparound problem.
> 
> Sounds reasonable.
> 
> > Do you think we could get away with a global spinlock for that, or do
> > we need to consider adding one to the nfs4_stid?
> 
> It'd be easy enough to do, wouldn't it?
> 
> I mean, really making open code scale sounds like a project for another
> day, but glomming locks together also seems easier than splitting them
> apart, so I'd rather start with the finer locking and then fix it later
> if somebody decides it's not optimal.
> 

Ok, sounds good. I'll take a look at it when I get time, unless you get
there first... ;)
J. Bruce Fields Oct. 1, 2015, 5:51 p.m. UTC | #7
Thanks for figuring this one out; applying for 4.3 and stable.

--b.

On Thu, Sep 17, 2015 at 07:47:08AM -0400, Jeff Layton wrote:
> Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were
> running in parallel. The server would receive the OPEN_DOWNGRADE first
> and check its seqid, but then an OPEN would race in and bump it. The
> OPEN_DOWNGRADE would then complete and bump the seqid again.  The result
> was that the OPEN_DOWNGRADE would be applied after the OPEN, even though
> it should have been rejected since the seqid changed.
> 
> The only recourse we have here I think is to serialize operations that
> bump the seqid in a stateid, particularly when we're given a seqid in
> the call. To address this, we add a new rw_semaphore to the
> nfs4_ol_stateid struct. We do a down_write prior to checking the seqid
> after looking up the stateid to ensure that nothing else is going to
> bump it while we're operating on it.
> 
> In the case of OPEN, we do a down_read, as the call doesn't contain a
> seqid. Those can run in parallel -- we just need to serialize them when
> there is a concurrent OPEN_DOWNGRADE or CLOSE.
> 
> LOCK and LOCKU however always take the write lock as there is no
> opportunity for parallelizing those.
> 
> Reported-and-Tested-by: Andrew W Elble <aweits@rit.edu>
> Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> ---
>  fs/nfsd/nfs4state.c | 33 ++++++++++++++++++++++++++++-----
>  fs/nfsd/state.h     | 19 ++++++++++---------
>  2 files changed, 38 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 0f1d5691b795..1b39edf10b67 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -3360,6 +3360,7 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
>  	stp->st_access_bmap = 0;
>  	stp->st_deny_bmap = 0;
>  	stp->st_openstp = NULL;
> +	init_rwsem(&stp->st_rwsem);
>  	spin_lock(&oo->oo_owner.so_client->cl_lock);
>  	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
>  	spin_lock(&fp->fi_lock);
> @@ -4187,15 +4188,20 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
>  	 */
>  	if (stp) {
>  		/* Stateid was found, this is an OPEN upgrade */
> +		down_read(&stp->st_rwsem);
>  		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
> -		if (status)
> +		if (status) {
> +			up_read(&stp->st_rwsem);
>  			goto out;
> +		}
>  	} else {
>  		stp = open->op_stp;
>  		open->op_stp = NULL;
>  		init_open_stateid(stp, fp, open);
> +		down_read(&stp->st_rwsem);
>  		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
>  		if (status) {
> +			up_read(&stp->st_rwsem);
>  			release_open_stateid(stp);
>  			goto out;
>  		}
> @@ -4207,6 +4213,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
>  	}
>  	update_stateid(&stp->st_stid.sc_stateid);
>  	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
> +	up_read(&stp->st_rwsem);
>  
>  	if (nfsd4_has_session(&resp->cstate)) {
>  		if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
> @@ -4819,10 +4826,13 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
>  		 * revoked delegations are kept only for free_stateid.
>  		 */
>  		return nfserr_bad_stateid;
> +	down_write(&stp->st_rwsem);
>  	status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
> -	if (status)
> -		return status;
> -	return nfs4_check_fh(current_fh, &stp->st_stid);
> +	if (status == nfs_ok)
> +		status = nfs4_check_fh(current_fh, &stp->st_stid);
> +	if (status != nfs_ok)
> +		up_write(&stp->st_rwsem);
> +	return status;
>  }
>  
>  /* 
> @@ -4869,6 +4879,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
>  		return status;
>  	oo = openowner(stp->st_stateowner);
>  	if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
> +		up_write(&stp->st_rwsem);
>  		nfs4_put_stid(&stp->st_stid);
>  		return nfserr_bad_stateid;
>  	}
> @@ -4899,11 +4910,14 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  		goto out;
>  	oo = openowner(stp->st_stateowner);
>  	status = nfserr_bad_stateid;
> -	if (oo->oo_flags & NFS4_OO_CONFIRMED)
> +	if (oo->oo_flags & NFS4_OO_CONFIRMED) {
> +		up_write(&stp->st_rwsem);
>  		goto put_stateid;
> +	}
>  	oo->oo_flags |= NFS4_OO_CONFIRMED;
>  	update_stateid(&stp->st_stid.sc_stateid);
>  	memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
> +	up_write(&stp->st_rwsem);
>  	dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
>  		__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
>  
> @@ -4982,6 +4996,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
>  	memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
>  	status = nfs_ok;
>  put_stateid:
> +	up_write(&stp->st_rwsem);
>  	nfs4_put_stid(&stp->st_stid);
>  out:
>  	nfsd4_bump_seqid(cstate, status);
> @@ -5035,6 +5050,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  		goto out; 
>  	update_stateid(&stp->st_stid.sc_stateid);
>  	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
> +	up_write(&stp->st_rwsem);
>  
>  	nfsd4_close_open_stateid(stp);
>  
> @@ -5260,6 +5276,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
>  	stp->st_access_bmap = 0;
>  	stp->st_deny_bmap = open_stp->st_deny_bmap;
>  	stp->st_openstp = open_stp;
> +	init_rwsem(&stp->st_rwsem);
>  	list_add(&stp->st_locks, &open_stp->st_locks);
>  	list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
>  	spin_lock(&fp->fi_lock);
> @@ -5428,6 +5445,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  					&open_stp, nn);
>  		if (status)
>  			goto out;
> +		up_write(&open_stp->st_rwsem);
>  		open_sop = openowner(open_stp->st_stateowner);
>  		status = nfserr_bad_stateid;
>  		if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
> @@ -5435,6 +5453,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  			goto out;
>  		status = lookup_or_create_lock_state(cstate, open_stp, lock,
>  							&lock_stp, &new);
> +		if (status == nfs_ok)
> +			down_write(&lock_stp->st_rwsem);
>  	} else {
>  		status = nfs4_preprocess_seqid_op(cstate,
>  				       lock->lk_old_lock_seqid,
> @@ -5540,6 +5560,8 @@ out:
>  		    seqid_mutating_err(ntohl(status)))
>  			lock_sop->lo_owner.so_seqid++;
>  
> +		up_write(&lock_stp->st_rwsem);
> +
>  		/*
>  		 * If this is a new, never-before-used stateid, and we are
>  		 * returning an error, then just go ahead and release it.
> @@ -5709,6 +5731,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  fput:
>  	fput(filp);
>  put_stateid:
> +	up_write(&stp->st_rwsem);
>  	nfs4_put_stid(&stp->st_stid);
>  out:
>  	nfsd4_bump_seqid(cstate, status);
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 583ffc13cae2..31bde12feefe 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -534,15 +534,16 @@ struct nfs4_file {
>   * Better suggestions welcome.
>   */
>  struct nfs4_ol_stateid {
> -	struct nfs4_stid    st_stid; /* must be first field */
> -	struct list_head              st_perfile;
> -	struct list_head              st_perstateowner;
> -	struct list_head              st_locks;
> -	struct nfs4_stateowner      * st_stateowner;
> -	struct nfs4_clnt_odstate    * st_clnt_odstate;
> -	unsigned char                 st_access_bmap;
> -	unsigned char                 st_deny_bmap;
> -	struct nfs4_ol_stateid         * st_openstp;
> +	struct nfs4_stid		st_stid;
> +	struct list_head		st_perfile;
> +	struct list_head		st_perstateowner;
> +	struct list_head		st_locks;
> +	struct nfs4_stateowner		*st_stateowner;
> +	struct nfs4_clnt_odstate	*st_clnt_odstate;
> +	unsigned char			st_access_bmap;
> +	unsigned char			st_deny_bmap;
> +	struct nfs4_ol_stateid		*st_openstp;
> +	struct rw_semaphore		st_rwsem;
>  };
>  
>  static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
> -- 
> 2.4.3
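
The subtle piece in the nfs4_seqid_op_checks() hunk above is the asymmetric unlock: on success the function returns with st_rwsem still write-held, and the caller (OPEN_CONFIRM, OPEN_DOWNGRADE, CLOSE, LOCK, LOCKU) releases it only after bumping the stateid. Here is a minimal userspace sketch of that check-then-hold pattern; a pthread rwlock stands in for the kernel's rw_semaphore, and every name in it is an illustrative stand-in, not nfsd's actual API.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct stateid_demo {
	unsigned int seqid;		/* models the stateid generation */
	pthread_rwlock_t rwsem;		/* models st_rwsem */
};

static struct stateid_demo st = {
	.seqid = 1,
	.rwsem = PTHREAD_RWLOCK_INITIALIZER,
};

/* On success, returns with rwsem write-held; on failure, drops it. */
static bool seqid_op_checks(struct stateid_demo *stp, unsigned int seqid_in)
{
	pthread_rwlock_wrlock(&stp->rwsem);	/* the new down_write() */
	if (seqid_in == stp->seqid)
		return true;	/* caller unlocks after bumping the seqid */
	pthread_rwlock_unlock(&stp->rwsem);	/* error-path up_write() */
	return false;				/* stale seqid */
}

static void open_downgrade(struct stateid_demo *stp, unsigned int seqid_in)
{
	if (!seqid_op_checks(stp, seqid_in)) {
		puts("OPEN_DOWNGRADE: seqid changed underneath us, rejected");
		return;
	}
	stp->seqid++;				/* update_stateid() */
	pthread_rwlock_unlock(&stp->rwsem);	/* up_write() at put_stateid: */
	puts("OPEN_DOWNGRADE: applied");
}

int main(void)
{
	open_downgrade(&st, 1);		/* applies; seqid becomes 2 */
	open_downgrade(&st, 1);		/* stale seqid now, rejected */
	return 0;
}

The conditional release mirrors the "if (status != nfs_ok) up_write(...)" error path in the patch: a failed check must not leave the semaphore held, or the next seqid-bearing operation on that stateid would block forever.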

Patch

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0f1d5691b795..1b39edf10b67 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3360,6 +3360,7 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = 0;
 	stp->st_openstp = NULL;
+	init_rwsem(&stp->st_rwsem);
 	spin_lock(&oo->oo_owner.so_client->cl_lock);
 	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
 	spin_lock(&fp->fi_lock);
@@ -4187,15 +4188,20 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	 */
 	if (stp) {
 		/* Stateid was found, this is an OPEN upgrade */
+		down_read(&stp->st_rwsem);
 		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
-		if (status)
+		if (status) {
+			up_read(&stp->st_rwsem);
 			goto out;
+		}
 	} else {
 		stp = open->op_stp;
 		open->op_stp = NULL;
 		init_open_stateid(stp, fp, open);
+		down_read(&stp->st_rwsem);
 		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
 		if (status) {
+			up_read(&stp->st_rwsem);
 			release_open_stateid(stp);
 			goto out;
 		}
@@ -4207,6 +4213,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	}
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	up_read(&stp->st_rwsem);
 
 	if (nfsd4_has_session(&resp->cstate)) {
 		if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
@@ -4819,10 +4826,13 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
 		 * revoked delegations are kept only for free_stateid.
 		 */
 		return nfserr_bad_stateid;
+	down_write(&stp->st_rwsem);
 	status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
-	if (status)
-		return status;
-	return nfs4_check_fh(current_fh, &stp->st_stid);
+	if (status == nfs_ok)
+		status = nfs4_check_fh(current_fh, &stp->st_stid);
+	if (status != nfs_ok)
+		up_write(&stp->st_rwsem);
+	return status;
 }
 
 /* 
@@ -4869,6 +4879,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
 		return status;
 	oo = openowner(stp->st_stateowner);
 	if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
+		up_write(&stp->st_rwsem);
 		nfs4_put_stid(&stp->st_stid);
 		return nfserr_bad_stateid;
 	}
@@ -4899,11 +4910,14 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto out;
 	oo = openowner(stp->st_stateowner);
 	status = nfserr_bad_stateid;
-	if (oo->oo_flags & NFS4_OO_CONFIRMED)
+	if (oo->oo_flags & NFS4_OO_CONFIRMED) {
+		up_write(&stp->st_rwsem);
 		goto put_stateid;
+	}
 	oo->oo_flags |= NFS4_OO_CONFIRMED;
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	up_write(&stp->st_rwsem);
 	dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
 		__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
 
@@ -4982,6 +4996,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 	memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 	status = nfs_ok;
 put_stateid:
+	up_write(&stp->st_rwsem);
 	nfs4_put_stid(&stp->st_stid);
 out:
 	nfsd4_bump_seqid(cstate, status);
@@ -5035,6 +5050,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto out; 
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+	up_write(&stp->st_rwsem);
 
 	nfsd4_close_open_stateid(stp);
 
@@ -5260,6 +5276,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = open_stp->st_deny_bmap;
 	stp->st_openstp = open_stp;
+	init_rwsem(&stp->st_rwsem);
 	list_add(&stp->st_locks, &open_stp->st_locks);
 	list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
 	spin_lock(&fp->fi_lock);
@@ -5428,6 +5445,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 					&open_stp, nn);
 		if (status)
 			goto out;
+		up_write(&open_stp->st_rwsem);
 		open_sop = openowner(open_stp->st_stateowner);
 		status = nfserr_bad_stateid;
 		if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
@@ -5435,6 +5453,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			goto out;
 		status = lookup_or_create_lock_state(cstate, open_stp, lock,
 							&lock_stp, &new);
+		if (status == nfs_ok)
+			down_write(&lock_stp->st_rwsem);
 	} else {
 		status = nfs4_preprocess_seqid_op(cstate,
 				       lock->lk_old_lock_seqid,
@@ -5540,6 +5560,8 @@ out:
 		    seqid_mutating_err(ntohl(status)))
 			lock_sop->lo_owner.so_seqid++;
 
+		up_write(&lock_stp->st_rwsem);
+
 		/*
 		 * If this is a new, never-before-used stateid, and we are
 		 * returning an error, then just go ahead and release it.
@@ -5709,6 +5731,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 fput:
 	fput(filp);
 put_stateid:
+	up_write(&stp->st_rwsem);
 	nfs4_put_stid(&stp->st_stid);
 out:
 	nfsd4_bump_seqid(cstate, status);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 583ffc13cae2..31bde12feefe 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -534,15 +534,16 @@ struct nfs4_file {
  * Better suggestions welcome.
  */
 struct nfs4_ol_stateid {
-	struct nfs4_stid    st_stid; /* must be first field */
-	struct list_head              st_perfile;
-	struct list_head              st_perstateowner;
-	struct list_head              st_locks;
-	struct nfs4_stateowner      * st_stateowner;
-	struct nfs4_clnt_odstate    * st_clnt_odstate;
-	unsigned char                 st_access_bmap;
-	unsigned char                 st_deny_bmap;
-	struct nfs4_ol_stateid         * st_openstp;
+	struct nfs4_stid		st_stid;
+	struct list_head		st_perfile;
+	struct list_head		st_perstateowner;
+	struct list_head		st_locks;
+	struct nfs4_stateowner		*st_stateowner;
+	struct nfs4_clnt_odstate	*st_clnt_odstate;
+	unsigned char			st_access_bmap;
+	unsigned char			st_deny_bmap;
+	struct nfs4_ol_stateid		*st_openstp;
+	struct rw_semaphore		st_rwsem;
 };
 
 static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
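
To see why OPEN takes the read side while the seqid-bearing operations take the write side, a small runnable model helps. This is a userspace sketch under the same assumptions as above -- pthread rwlocks and a C11 atomic standing in for the kernel primitives, all names hypothetical. OPENs carry no seqid, so any number of them may overlap each other, but a CLOSE must see a stable seqid across its whole check-and-bump.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_rwlock_t st_rwsem = PTHREAD_RWLOCK_INITIALIZER;
/* Atomic only so that parallel OPEN "readers" can bump it safely here. */
static atomic_uint seqid = 1;

static void *do_open(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&st_rwsem);	/* down_read(): no seqid in the call */
	printf("OPEN: bumped seqid to %u\n",
	       atomic_fetch_add(&seqid, 1) + 1);	/* update_stateid() */
	pthread_rwlock_unlock(&st_rwsem);	/* up_read() */
	return NULL;
}

static void *do_close(void *arg)
{
	unsigned int expected = *(unsigned int *)arg;

	pthread_rwlock_wrlock(&st_rwsem);	/* down_write(): seqid-bearing op */
	if (atomic_load(&seqid) != expected)
		printf("CLOSE: seqid changed, rejected\n");
	else
		printf("CLOSE: applied, seqid now %u\n",
		       atomic_fetch_add(&seqid, 1) + 1);
	pthread_rwlock_unlock(&st_rwsem);	/* up_write() */
	return NULL;
}

int main(void)
{
	pthread_t topen, tclose;
	unsigned int expect = 1;

	pthread_create(&topen, NULL, do_open, NULL);
	pthread_create(&tclose, NULL, do_close, &expect);
	pthread_join(topen, NULL);
	pthread_join(tclose, NULL);
	return 0;
}

Built with "gcc -pthread", the CLOSE either wins the write lock first and applies, or runs after the OPEN's bump and is rejected for a stale seqid -- it can no longer land between another operation's seqid check and its bump, which is exactly the interleaving the patch closes off.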