
nfsd: Close race between nfsd4_release_lockowner and nfsd4_lock

Message ID 20160713202810.19268.64046.stgit@klimt.1015granger.net (mailing list archive)
State New, archived

Commit Message

Chuck Lever July 13, 2016, 8:40 p.m. UTC
nfsd4_release_lockowner finds a lock owner that has no lock state,
and drops cl_lock. Then release_lockowner picks up cl_lock and
unhashes the lock owner.

During the window where cl_lock is dropped, I don't see anything
preventing a concurrent nfsd4_lock from finding that same lock owner
and adding lock state to it.

Move release_lockowner() into nfsd4_release_lockowner and hang onto
the cl_lock until after the lock owner's state cannot be found
again.

Fixes: 2c41beb0e5cf ("nfsd: reduce cl_lock thrashing in ... ")
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
Hi Bruce-

Noticed this recently. I've been running with this patch on my test
NFS server for over a week. Haven't noticed any issues, but I wonder
if my clients or tests actually exercise this code in parallel.

The reason I was looking at this area is that one of our internal
testers encountered a related problem with NFSv4.1. LOCK and
FREE_STATEID are racing: LOCK returns an existing lock stateid, then
FREE_STATEID frees that stateid (NFS4_OK). Shouldn't FREE_STATEID
return NFS4ERR_LOCKS_HELD in this case?

I have not been able to reproduce this, but our tester is able to
hit it fairly reliably with Oracle's v4.1-based kernel running on
his server. Recent upstream kernels make the issue rare, but it is
still encountered on occasion.


 fs/nfsd/nfs4state.c |   40 +++++++++++++++++-----------------------
 1 file changed, 17 insertions(+), 23 deletions(-)
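
To make the window described in the changelog concrete, here is a
rough sketch of the two operations interleaving against the pre-patch
code (an illustration, not taken from an actual trace):

    nfsd4_release_lockowner              nfsd4_lock
    -----------------------              ----------
    spin_lock(&clp->cl_lock)
    finds the lockowner, sees no
      lock state held, takes a ref
    spin_unlock(&clp->cl_lock)
                                         spin_lock(&clp->cl_lock)
                                         finds the same lockowner,
                                           hashes new lock state on it
                                         spin_unlock(&clp->cl_lock)
    release_lockowner(lo)
      spin_lock(&clp->cl_lock)
      unhash_lockowner_locked(lo)
      reaps lo's stateids, including
        the lock state just handed out
      spin_unlock(&clp->cl_lock)

With the patch, the lockowner lookup and the unhash/reap happen under
a single hold of cl_lock, so a concurrent nfsd4_lock can no longer
find the owner in between.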



Comments

J. Bruce Fields July 14, 2016, 7:55 p.m. UTC | #1
On Wed, Jul 13, 2016 at 04:40:14PM -0400, Chuck Lever wrote:
> nfsd4_release_lockowner finds a lock owner that has no lock state,
> and drops cl_lock. Then release_lockowner picks up cl_lock and
> unhashes the lock owner.
> 
> During the window where cl_lock is dropped, I don't see anything
> preventing a concurrent nfsd4_lock from finding that same lock owner
> and adding lock state to it.
> 
> Move release_lockowner() into nfsd4_release_lockowner and hang onto
> the cl_lock until after the lock owner's state cannot be found
> again.
> 
> Fixes: 2c41beb0e5cf ("nfsd: reduce cl_lock thrashing in ... ")
> Reviewed-by: Jeff Layton <jlayton@redhat.com>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>

Makes sense.  Applying with just one line added to the changelog, in
case it's useful to someone: "Found by inspection, we don't currently
have a reproducer."

> ---
> Hi Bruce-
> 
> Noticed this recently. I've been running with this patch on my test
> NFS server for over a week. Haven't noticed any issues, but I wonder
> if my clients or tests actually exercise this code in parallel.
> 
> The reason I was looking at this area is that one of our internal
> testers encountered a related problem with NFSv4.1. LOCK and
> FREE_STATEID are racing: LOCK returns an existing lock stateid, then
> FREE_STATEID frees that stateid (NFS4_OK). Shouldn't FREE_STATEID
> return NFS4ERR_LOCKS_HELD in this case?

Looks like free_stateid is deciding whether to return LOCKS_HELD by
inspecting the list of vfs locks, but nfsd4_lock creates the lock
stateid before the vfs lock, so maybe there's a race like:

	create lock stateid
				free_stateid
	vfs_lock_file

but I haven't checked that in detail.
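
For context, the LOCK_STID arm of nfsd4_free_stateid does roughly the
following (simplified from fs/nfsd/nfs4state.c of that era; treat the
exact shape as approximate rather than a verbatim quote):

    /* in nfsd4_free_stateid(), with the client's cl_lock held */
    case NFS4_LOCK_STID:
            stp = openlockstateid(s);
            ret = nfserr_locks_held;
            if (check_for_locks(stp->st_stid.sc_file,
                                lockowner(stp->st_stateowner)))
                    break;  /* vfs locks still held -> NFS4ERR_LOCKS_HELD */
            WARN_ON(!unhash_lock_stateid(stp));
            ...             /* otherwise free the stateid, return NFS4_OK */

Since nfsd4_lock hashes the lock stateid before it calls
vfs_lock_file(), a FREE_STATEID that lands in that window finds no vfs
locks for the owner and frees the stateid, which would match the
interleaving above.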

I haven't thought about the protocol side--does hitting this race
require an incorrect client?

--b.

> 
> I have not been able to reproduce this, but our tester is able to
> hit it fairly reliably with Oracle's v4.1-based kernel running on
> his server. Recent upstream kernels make the issue rare, but it is
> still encountered on occasion.
> 
> 
>  fs/nfsd/nfs4state.c |   40 +++++++++++++++++-----------------------
>  1 file changed, 17 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index f5f82e1..31c993f 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -1200,27 +1200,6 @@ free_ol_stateid_reaplist(struct list_head *reaplist)
>  	}
>  }
>  
> -static void release_lockowner(struct nfs4_lockowner *lo)
> -{
> -	struct nfs4_client *clp = lo->lo_owner.so_client;
> -	struct nfs4_ol_stateid *stp;
> -	struct list_head reaplist;
> -
> -	INIT_LIST_HEAD(&reaplist);
> -
> -	spin_lock(&clp->cl_lock);
> -	unhash_lockowner_locked(lo);
> -	while (!list_empty(&lo->lo_owner.so_stateids)) {
> -		stp = list_first_entry(&lo->lo_owner.so_stateids,
> -				struct nfs4_ol_stateid, st_perstateowner);
> -		WARN_ON(!unhash_lock_stateid(stp));
> -		put_ol_stateid_locked(stp, &reaplist);
> -	}
> -	spin_unlock(&clp->cl_lock);
> -	free_ol_stateid_reaplist(&reaplist);
> -	nfs4_put_stateowner(&lo->lo_owner);
> -}
> -
>  static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
>  				       struct list_head *reaplist)
>  {
> @@ -5938,6 +5917,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
>  	__be32 status;
>  	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
>  	struct nfs4_client *clp;
> +	LIST_HEAD (reaplist);
>  
>  	dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
>  		clid->cl_boot, clid->cl_id);
> @@ -5968,9 +5948,23 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
>  		nfs4_get_stateowner(sop);
>  		break;
>  	}
> +	if (!lo) {
> +		spin_unlock(&clp->cl_lock);
> +		return status;
> +	}
> +
> +	unhash_lockowner_locked(lo);
> +	while (!list_empty(&lo->lo_owner.so_stateids)) {
> +		stp = list_first_entry(&lo->lo_owner.so_stateids,
> +				       struct nfs4_ol_stateid,
> +				       st_perstateowner);
> +		WARN_ON(!unhash_lock_stateid(stp));
> +		put_ol_stateid_locked(stp, &reaplist);
> +	}
>  	spin_unlock(&clp->cl_lock);
> -	if (lo)
> -		release_lockowner(lo);
> +	free_ol_stateid_reaplist(&reaplist);
> +	nfs4_put_stateowner(&lo->lo_owner);
> +
>  	return status;
>  }
>  
Chuck Lever July 14, 2016, 8:25 p.m. UTC | #2
> On Jul 14, 2016, at 3:55 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
> 
> On Wed, Jul 13, 2016 at 04:40:14PM -0400, Chuck Lever wrote:
>> 
>> The reason I was looking at this area is that one of our internal
>> testers encountered a related problem with NFSv4.1. LOCK and
>> FREE_STATEID are racing: LOCK returns an existing lock stateid, then
>> FREE_STATEID frees that stateid (NFS4_OK). Shouldn't FREE_STATEID
>> return NFS4ERR_LOCKS_HELD in this case?
> 
> Looks like free_stateid is deciding whether to return LOCKS_HELD by
> inspecting the list of vfs locks, but nfsd4_lock creates the lock
> stateid before the vfs lock, so maybe there's a race like:
> 
> 	create lock stateid
> 				free_stateid
> 	vfs_lock_file
> 
> but I haven't checked that in detail.

> I haven't thought about the protocol side--does hitting this race
> require an incorrect client?

I don't believe so, but there may be some subtlety I'm
missing.

I have a network capture. The test is nfslock01 from LTP.
There are two user processes that lock, write, and unlock
alternating 64-byte pieces of the same file.

(stateid here is represented as "seqno / other")
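
That notation corresponds to the two fields of the wire stateid
(stateid4 in RFC 5661); in C terms, roughly:

    struct stateid4 {
            uint32_t      seqid;     /* bumped by the server as the lock
                                        state changes: 1/A -> 2/A -> 3/A */
            unsigned char other[12]; /* opaque id naming the state itself;
                                        the constant "A" below */
    };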

On the wire, I see:

Frame 115004 C LOCK offset 672000 len 64
Frame 115008 R LOCK NFS4_OK stateid 1/A
Frame 115012 C WRITE stateid 0/A, offset 672000 len 64
Frame 115016 R WRITE NFS4_OK
Frame 115019 C LOCKU stateid 1/A 672000 len 64
Frame 115022 R LOCKU NFS4_OK stateid 2/A
Frame 115025 C FREE_STATEID stateid 2/A
Frame 115026 C LOCK offset 672128 len 64
Frame 115029 R FREE_STATEID NFS4_OK
Frame 115030 R LOCK stateid 3/A
Frame 115034 C WRITE stateid 0/A offset 672128 len 64
Frame 115038 R WRITE NFS4ERR_BAD_STATEID
Frame 115043 C TEST_STATEID stateid 3/A
Frame 115044 R TEST_STATEID NFS4_OK
                The returned stateid list has 3/A marked NFS4ERR_BAD_STATEID

Since the LOCKU and LOCK are accessing data segments
128 bytes apart, I assume these requests come from
the same process on the client. The FREE_STATEID
might be done in the background, which might be why
it is sent concurrently with the LOCK.

IIUC, either the FREE_STATEID should have returned
LOCKS_HELD, or the second LOCK should have returned
a fresh lock state ID.

On my client I can't seem to get FREE_STATEID and
LOCK to be sent concurrently, so the operations
always come out in an order that guarantees the
race is avoided.


> --b.
> 
>> 
>> I have not been able to reproduce this, but our tester is able to
>> hit it fairly reliably with Oracle's v4.1-based kernel running on
>> his server. Recent upstream kernels make the issue rare, but it is
>> still encountered on occasion.

--
Chuck Lever



Chuck Lever July 26, 2016, 9:22 p.m. UTC | #3
> On Jul 14, 2016, at 4:25 PM, Chuck Lever <chuck.lever@oracle.com> wrote:
> 
> 
>> On Jul 14, 2016, at 3:55 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
>> 
>> On Wed, Jul 13, 2016 at 04:40:14PM -0400, Chuck Lever wrote:
>>> 
>>> The reason I was looking at this area is that one of our internal
>>> testers encountered a related problem with NFSv4.1. LOCK and
>>> FREE_STATEID are racing: LOCK returns an existing lock stateid, then
>>> FREE_STATEID frees that stateid (NFS4_OK). Shouldn't FREE_STATEID
>>> return NFS4ERR_LOCKS_HELD in this case?
>> 
>> Looks like free_stateid is deciding whether to return LOCKS_HELD by
>> inspecting the list of vfs locks, but nfsd4_lock creates the lock
>> stateid before the vfs lock, so maybe there's a race like:
>> 
>> 	create lock stateid
>> 				free_stateid
>> 	vfs_lock_file
>> 
>> but I haven't checked that in detail.
> 
>> I haven't thought about the protocol side--does hitting this race
>> require an incorrect client?
> 
> I don't believe so, but there may be some subtlety I'm
> missing.
> 
> I have a network capture. The test is nfslock01 from LTP.
> There are two user processes that lock, write, and unlock
> alternating 64-byte pieces of the same file.
> 
> (stateid here is represented as "seqno / other")
> 
> On the wire, I see:
> 
> Frame 115004 C LOCK offset 672000 len 64
> Frame 115008 R LOCK NFS4_OK stateid 1/A
> Frame 115012 C WRITE stateid 0/A, offset 672000 len 64
> Frame 115016 R WRITE NFS4_OK
> Frame 115019 C LOCKU stateid 1/A 672000 len 64
> Frame 115022 R LOCKU NFS4_OK stateid 2/A
> Frame 115025 C FREE_STATEID stateid 2/A
> Frame 115026 C LOCK offset 672128 len 64
> Frame 115029 R FREE_STATEID NFS4_OK
> Frame 115030 R LOCK stateid 3/A
> Frame 115034 C WRITE stateid 0/A offset 672128 len 64
> Frame 115038 R WRITE NFS4ERR_BAD_STATEID
> Frame 115043 C TEST_STATEID stateid 3/A
> Frame 115044 R TEST_STATEID NFS4_OK
>                The returned stateid list has 3/A marked NFS4ERR_BAD_STATEID
> 
> Since the LOCKU and LOCK are accessing data segments
> 128 bytes apart, I assume these requests come from
> the same process on the client. The FREE_STATEID
> might be done in the background, which might be why
> it is sent concurrently with the LOCK.
> 
> IIUC, either the FREE_STATEID should have returned
> LOCKS_HELD, or the second LOCK should have returned
> a fresh lock state ID.
> 
> On my client I can't seem to get FREE_STATEID and
> LOCK to be sent concurrently, so the operations
> always come out in an order that guarantees the
> race is avoided.

Here's one possible scenario:

LOCK and FREE_STATEID are called simultaneously. The stateid passed
to FREE_STATEID has the same lock owner that is passed to the LOCK
operation.

nfsd4_lock is called. It gets to lookup_or_create_lock_stateid, which
takes cl_lock, finds the same stateid as was passed to FREE_STATEID,
drops cl_lock and returns. For some reason this thread is scheduled out.

This allows nfsd4_free_stateid to grab cl_lock. find_stateid_locked
finds the stateid, finds no lock state associated with the inode,
and decides to unhash the lock stateid. It drops cl_lock and
returns NFS4_OK.

nfsd4_lock resumes processing with the unhashed stateid. It completes
its processing, locks the file, and returns NFS4_OK and the dead
stateid.

Later the client uses that stateid in a WRITE. nfs4_preprocess_stateid_op
invokes nfsd4_lookup_stateid and then find_stateid_by_type. That might
find the stateid, but unhashing it earlier set sc_type to zero, and
now find_stateid_by_type returns NULL. WRITE returns BAD_STATEID.
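
The same scenario in timeline form (scheduling points assumed,
stateids taken from the capture above):

    nfsd thread handling LOCK            nfsd thread handling FREE_STATEID
    -------------------------            ---------------------------------
    lookup_or_create_lock_stateid()
      takes cl_lock, finds the
      existing lock stateid "A",
      drops cl_lock
    (scheduled out)
                                         nfsd4_free_stateid()
                                           takes cl_lock
                                           find_stateid_locked() -> "A"
                                           no vfs lock state on the inode,
                                             so unhash: sc_type = 0
                                           drops cl_lock, replies NFS4_OK
    vfs_lock_file() succeeds
    replies NFS4_OK with stateid 3/A,
      which is already unhashed

    later: WRITE with stateid 0/A; find_stateid_by_type() sees
    sc_type == 0 and returns NULL, so the server replies
    NFS4ERR_BAD_STATEID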


--
Chuck Lever




Patch

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f5f82e1..31c993f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1200,27 +1200,6 @@  free_ol_stateid_reaplist(struct list_head *reaplist)
 	}
 }
 
-static void release_lockowner(struct nfs4_lockowner *lo)
-{
-	struct nfs4_client *clp = lo->lo_owner.so_client;
-	struct nfs4_ol_stateid *stp;
-	struct list_head reaplist;
-
-	INIT_LIST_HEAD(&reaplist);
-
-	spin_lock(&clp->cl_lock);
-	unhash_lockowner_locked(lo);
-	while (!list_empty(&lo->lo_owner.so_stateids)) {
-		stp = list_first_entry(&lo->lo_owner.so_stateids,
-				struct nfs4_ol_stateid, st_perstateowner);
-		WARN_ON(!unhash_lock_stateid(stp));
-		put_ol_stateid_locked(stp, &reaplist);
-	}
-	spin_unlock(&clp->cl_lock);
-	free_ol_stateid_reaplist(&reaplist);
-	nfs4_put_stateowner(&lo->lo_owner);
-}
-
 static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
 				       struct list_head *reaplist)
 {
@@ -5938,6 +5917,7 @@  nfsd4_release_lockowner(struct svc_rqst *rqstp,
 	__be32 status;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 	struct nfs4_client *clp;
+	LIST_HEAD (reaplist);
 
 	dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
 		clid->cl_boot, clid->cl_id);
@@ -5968,9 +5948,23 @@  nfsd4_release_lockowner(struct svc_rqst *rqstp,
 		nfs4_get_stateowner(sop);
 		break;
 	}
+	if (!lo) {
+		spin_unlock(&clp->cl_lock);
+		return status;
+	}
+
+	unhash_lockowner_locked(lo);
+	while (!list_empty(&lo->lo_owner.so_stateids)) {
+		stp = list_first_entry(&lo->lo_owner.so_stateids,
+				       struct nfs4_ol_stateid,
+				       st_perstateowner);
+		WARN_ON(!unhash_lock_stateid(stp));
+		put_ol_stateid_locked(stp, &reaplist);
+	}
 	spin_unlock(&clp->cl_lock);
-	if (lo)
-		release_lockowner(lo);
+	free_ol_stateid_reaplist(&reaplist);
+	nfs4_put_stateowner(&lo->lo_owner);
+
 	return status;
 }