diff mbox

[4/4] NFS: Always wait for I/O completion before unlock

Message ID d81a853ea1ee4ba1294c85bbed7aa7298852b180.1487356965.git.bcodding@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Benjamin Coddington Feb. 17, 2017, 6:46 p.m. UTC
NFS attempts to wait for read and write completion before unlocking in
order to ensure that the data returned was protected by the lock.  When
this waiting is interrupted by a signal, the unlock may never be sent, and
messages similar to the following are seen in the kernel ring buffer:

[20.167876] Leaked locks on dev=0x0:0x2b ino=0x8dd4c3:
[20.168286] POSIX: fl_owner=ffff880078b06940 fl_flags=0x1 fl_type=0x0 fl_pid=20183
[20.168727] POSIX: fl_owner=ffff880078b06680 fl_flags=0x1 fl_type=0x0 fl_pid=20185

For NFSv3, the missing unlock will cause the server to refuse conflicting
locks indefinitely.  For NFSv4, the leftover lock will be removed by the
server after the lease timeout.

This patch fixes this for NFSv3 by skipping the wait in order to
immediately send the unlock if the FL_CLOSE flag is set when signaled.  For
NFSv4, this approach may cause the server to see the I/O as arriving with
an old stateid, so, for the v4 case the fix is different: the wait on I/O
completion is moved into nfs4_locku_ops' rpc_call_prepare().  This will
cause the sleep to happen in rpciod context, and a signal to the originally
waiting process will not cause the unlock to be skipped.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
---
 fs/nfs/file.c     | 13 -------------
 fs/nfs/nfs3proc.c | 13 +++++++++++++
 fs/nfs/nfs4proc.c |  7 +++++++
 fs/nfs/pagelist.c |  1 +
 4 files changed, 21 insertions(+), 13 deletions(-)

Comments

Trond Myklebust Feb. 17, 2017, 7 p.m. UTC | #1
On Fri, 2017-02-17 at 13:46 -0500, Benjamin Coddington wrote:
> NFS attempts to wait for read and write completion before unlocking

> in

> order to ensure that the data returned was protected by the

> lock.  When

> this waiting is interrupted by a signal, the unlock may never be

> sent, and

> messages similar to the following are seen in the kernel ring buffer:

> 

> [20.167876] Leaked locks on dev=0x0:0x2b ino=0x8dd4c3:

> [20.168286] POSIX: fl_owner=ffff880078b06940 fl_flags=0x1 fl_type=0x0

> fl_pid=20183

> [20.168727] POSIX: fl_owner=ffff880078b06680 fl_flags=0x1 fl_type=0x0

> fl_pid=20185

> 

> For NFSv3, the missing unlock will cause the server to refuse

> conflicting

> locks indefinitely.  For NFSv4, the leftover lock will be removed by

> the

> server after the lease timeout.

> 

> This patch fixes this for NFSv3 by skipping the wait in order to

> immediately send the unlock if the FL_CLOSE flag is set when

> signaled.  For

> NFSv4, this approach may cause the server to see the I/O as arriving

> with

> an old stateid, so, for the v4 case the fix is different: the wait on

> I/O

> completion is moved into nfs4_locku_ops' rpc_call_prepare().  This

> will

> cause the sleep to happen in rpciod context, and a signal to the

> originally

> waiting process will not cause the unlock to be skipped.


NACK. I/O waits in rpciod contexts are NOT acceptable. rpciod is part
of the memory reclaim chain, so having it sleep on I/O is deadlock
prone.

Why is there a need to wait for I/O completion in the first place if
the user has killed the task that held the lock? 'kill -9' will cause
corruption; that's a fact that no amount of paper will cover over.

> 

> Signed-off-by: Benjamin Coddington <bcodding@redhat.com>

> ---

>  fs/nfs/file.c     | 13 -------------

>  fs/nfs/nfs3proc.c | 13 +++++++++++++

>  fs/nfs/nfs4proc.c |  7 +++++++

>  fs/nfs/pagelist.c |  1 +

>  4 files changed, 21 insertions(+), 13 deletions(-)

> 

> diff --git a/fs/nfs/file.c b/fs/nfs/file.c

> index a490f45df4db..adc67fe762e3 100644

> --- a/fs/nfs/file.c

> +++ b/fs/nfs/file.c

> @@ -684,7 +684,6 @@ static int

>  do_unlk(struct file *filp, int cmd, struct file_lock *fl, int

> is_local)

>  {

>  	struct inode *inode = filp->f_mapping->host;

> -	struct nfs_lock_context *l_ctx;

>  	int status;

>  

>  	/*

> @@ -693,18 +692,6 @@ do_unlk(struct file *filp, int cmd, struct

> file_lock *fl, int is_local)

>  	 */

>  	vfs_fsync(filp, 0);

>  

> -	l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));

> -	if (!IS_ERR(l_ctx)) {

> -		status = nfs_iocounter_wait(l_ctx);

> -		nfs_put_lock_context(l_ctx);

> -		if (status < 0)

> -			return status;

> -	}

> -

> -	/* NOTE: special case

> -	 * 	If we're signalled while cleaning up locks on

> process exit, we

> -	 * 	still need to complete the unlock.

> -	 */

>  	/*

>  	 * Use local locking if mounted with "-onolock" or with

> appropriate

>  	 * "-olocal_lock="

> diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c

> index dc925b531f32..ec3f12571c82 100644

> --- a/fs/nfs/nfs3proc.c

> +++ b/fs/nfs/nfs3proc.c

> @@ -869,6 +869,19 @@ static int

>  nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)

>  {

>  	struct inode *inode = file_inode(filp);

> +	int status;

> +	struct nfs_lock_context *l_ctx;

> +

> +	/* For v3, always send the unlock on FL_CLOSE */

> +	if (fl->fl_type == F_UNLCK) {

> +		l_ctx =

> nfs_get_lock_context(nfs_file_open_context(filp));

> +		if (!IS_ERR(l_ctx)) {

> +			status = nfs_iocounter_wait(l_ctx);

> +			nfs_put_lock_context(l_ctx);

> +			if (status < 0 && !(fl->fl_flags &

> FL_CLOSE))

> +				return status;

> +		}

> +	}

>  

>  	return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);

>  }

> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c

> index 91f88bfbbe79..052b97fd653d 100644

> --- a/fs/nfs/nfs4proc.c

> +++ b/fs/nfs/nfs4proc.c

> @@ -5906,6 +5906,7 @@ struct nfs4_unlockdata {

>  	struct nfs_locku_res res;

>  	struct nfs4_lock_state *lsp;

>  	struct nfs_open_context *ctx;

> +	struct nfs_lock_context *l_ctx;

>  	struct file_lock fl;

>  	struct nfs_server *server;

>  	unsigned long timestamp;

> @@ -5930,6 +5931,7 @@ static struct nfs4_unlockdata

> *nfs4_alloc_unlockdata(struct file_lock *fl,

>  	atomic_inc(&lsp->ls_count);

>  	/* Ensure we don't close file until we're done freeing

> locks! */

>  	p->ctx = get_nfs_open_context(ctx);

> +	p->l_ctx = nfs_get_lock_context(ctx);

>  	memcpy(&p->fl, fl, sizeof(p->fl));

>  	p->server = NFS_SERVER(inode);

>  	return p;

> @@ -5988,6 +5990,11 @@ static void nfs4_locku_prepare(struct rpc_task

> *task, void *data)

>  		/* Note: exit _without_ running nfs4_locku_done */

>  		goto out_no_action;

>  	}

> +

> +	if (!IS_ERR(calldata->l_ctx)) {

> +		nfs_iocounter_wait(calldata->l_ctx);

> +		nfs_put_lock_context(calldata->l_ctx);

> +	}

>  	calldata->timestamp = jiffies;

>  	if (nfs4_setup_sequence(calldata->server,

>  				&calldata->arg.seq_args,

> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c

> index 6e629b856a00..8bf3cefdaa96 100644

> --- a/fs/nfs/pagelist.c

> +++ b/fs/nfs/pagelist.c

> @@ -114,6 +114,7 @@ nfs_iocounter_wait(struct nfs_lock_context

> *l_ctx)

>  	return wait_on_atomic_t(&l_ctx->io_count,

> nfs_wait_atomic_killable,

>  			TASK_KILLABLE);

>  }

> +EXPORT_SYMBOL(nfs_iocounter_wait);

>  

>  /*

>   * nfs_page_group_lock - lock the head of the page group

-- 



	
	


Trond Myklebust
Principal System Architect
4300 El Camino Real | Suite 100
Los Altos, CA  94022
W: 650-422-3800
C: 801-921-4583 
www.primarydata.com
Benjamin Coddington Feb. 17, 2017, 7:15 p.m. UTC | #2
On 17 Feb 2017, at 14:00, Trond Myklebust wrote:

> On Fri, 2017-02-17 at 13:46 -0500, Benjamin Coddington wrote:
>> NFS attempts to wait for read and write completion before unlocking
>> in
>> order to ensure that the data returned was protected by the
>> lock.  When
>> this waiting is interrupted by a signal, the unlock may never be
>> sent, and
>> messages similar to the following are seen in the kernel ring buffer:
>>
>> [20.167876] Leaked locks on dev=0x0:0x2b ino=0x8dd4c3:
>> [20.168286] POSIX: fl_owner=ffff880078b06940 fl_flags=0x1 fl_type=0x0
>> fl_pid=20183
>> [20.168727] POSIX: fl_owner=ffff880078b06680 fl_flags=0x1 fl_type=0x0
>> fl_pid=20185
>>
>> For NFSv3, the missing unlock will cause the server to refuse
>> conflicting
>> locks indefinitely.  For NFSv4, the leftover lock will be removed by
>> the
>> server after the lease timeout.
>>
>> This patch fixes this for NFSv3 by skipping the wait in order to
>> immediately send the unlock if the FL_CLOSE flag is set when
>> signaled.  For
>> NFSv4, this approach may cause the server to see the I/O as arriving
>> with
>> an old stateid, so, for the v4 case the fix is different: the wait on
>> I/O
>> completion is moved into nfs4_locku_ops' rpc_call_prepare().  This
>> will
>> cause the sleep to happen in rpciod context, and a signal to the
>> originally
>> waiting process will not cause the unlock to be skipped.
>
> NACK. I/O waits in rpciod contexts are NOT acceptable. rpciod is part
> of the memory reclaim chain, so having it sleep on I/O is deadlock
> prone.
>
> Why is there a need to wait for I/O completion in the first place if
> the user has killed the task that held the lock? 'kill -9' will cause
> corruption; that's a fact that no amount of paper will cover over.

To avoid an unnecessary recovery situation where the server asks us to resend
I/O due to an invalid stateid.

I'm fine with skipping the wait on signaled FL_CLOSE if the that's an
acceptable trade-off.  I can send a v3.

Ben

>>
>> Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
>> ---
>>  fs/nfs/file.c     | 13 -------------
>>  fs/nfs/nfs3proc.c | 13 +++++++++++++
>>  fs/nfs/nfs4proc.c |  7 +++++++
>>  fs/nfs/pagelist.c |  1 +
>>  4 files changed, 21 insertions(+), 13 deletions(-)
>>
>> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> index a490f45df4db..adc67fe762e3 100644
>> --- a/fs/nfs/file.c
>> +++ b/fs/nfs/file.c
>> @@ -684,7 +684,6 @@ static int
>>  do_unlk(struct file *filp, int cmd, struct file_lock *fl, int
>> is_local)
>>  {
>>  	struct inode *inode = filp->f_mapping->host;
>> -	struct nfs_lock_context *l_ctx;
>>  	int status;
>>  
>>  	/*
>> @@ -693,18 +692,6 @@ do_unlk(struct file *filp, int cmd, struct
>> file_lock *fl, int is_local)
>>  	 */
>>  	vfs_fsync(filp, 0);
>>  
>> -	l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
>> -	if (!IS_ERR(l_ctx)) {
>> -		status = nfs_iocounter_wait(l_ctx);
>> -		nfs_put_lock_context(l_ctx);
>> -		if (status < 0)
>> -			return status;
>> -	}
>> -
>> -	/* NOTE: special case
>> -	 * 	If we're signalled while cleaning up locks on
>> process exit, we
>> -	 * 	still need to complete the unlock.
>> -	 */
>>  	/*
>>  	 * Use local locking if mounted with "-onolock" or with
>> appropriate
>>  	 * "-olocal_lock="
>> diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
>> index dc925b531f32..ec3f12571c82 100644
>> --- a/fs/nfs/nfs3proc.c
>> +++ b/fs/nfs/nfs3proc.c
>> @@ -869,6 +869,19 @@ static int
>>  nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
>>  {
>>  	struct inode *inode = file_inode(filp);
>> +	int status;
>> +	struct nfs_lock_context *l_ctx;
>> +
>> +	/* For v3, always send the unlock on FL_CLOSE */
>> +	if (fl->fl_type == F_UNLCK) {
>> +		l_ctx =
>> nfs_get_lock_context(nfs_file_open_context(filp));
>> +		if (!IS_ERR(l_ctx)) {
>> +			status = nfs_iocounter_wait(l_ctx);
>> +			nfs_put_lock_context(l_ctx);
>> +			if (status < 0 && !(fl->fl_flags &
>> FL_CLOSE))
>> +				return status;
>> +		}
>> +	}
>>  
>>  	return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
>>  }
>> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
>> index 91f88bfbbe79..052b97fd653d 100644
>> --- a/fs/nfs/nfs4proc.c
>> +++ b/fs/nfs/nfs4proc.c
>> @@ -5906,6 +5906,7 @@ struct nfs4_unlockdata {
>>  	struct nfs_locku_res res;
>>  	struct nfs4_lock_state *lsp;
>>  	struct nfs_open_context *ctx;
>> +	struct nfs_lock_context *l_ctx;
>>  	struct file_lock fl;
>>  	struct nfs_server *server;
>>  	unsigned long timestamp;
>> @@ -5930,6 +5931,7 @@ static struct nfs4_unlockdata
>> *nfs4_alloc_unlockdata(struct file_lock *fl,
>>  	atomic_inc(&lsp->ls_count);
>>  	/* Ensure we don't close file until we're done freeing
>> locks! */
>>  	p->ctx = get_nfs_open_context(ctx);
>> +	p->l_ctx = nfs_get_lock_context(ctx);
>>  	memcpy(&p->fl, fl, sizeof(p->fl));
>>  	p->server = NFS_SERVER(inode);
>>  	return p;
>> @@ -5988,6 +5990,11 @@ static void nfs4_locku_prepare(struct rpc_task
>> *task, void *data)
>>  		/* Note: exit _without_ running nfs4_locku_done */
>>  		goto out_no_action;
>>  	}
>> +
>> +	if (!IS_ERR(calldata->l_ctx)) {
>> +		nfs_iocounter_wait(calldata->l_ctx);
>> +		nfs_put_lock_context(calldata->l_ctx);
>> +	}
>>  	calldata->timestamp = jiffies;
>>  	if (nfs4_setup_sequence(calldata->server,
>>  				&calldata->arg.seq_args,
>> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
>> index 6e629b856a00..8bf3cefdaa96 100644
>> --- a/fs/nfs/pagelist.c
>> +++ b/fs/nfs/pagelist.c
>> @@ -114,6 +114,7 @@ nfs_iocounter_wait(struct nfs_lock_context
>> *l_ctx)
>>  	return wait_on_atomic_t(&l_ctx->io_count,
>> nfs_wait_atomic_killable,
>>  			TASK_KILLABLE);
>>  }
>> +EXPORT_SYMBOL(nfs_iocounter_wait);
>>  
>>  /*
>>   * nfs_page_group_lock - lock the head of the page group
> -- 
>
>
>
> 	
> 	
>
>
> Trond Myklebust
> Principal System Architect
> 4300 El Camino Real | Suite 100
> Los Altos, CA  94022
> W: 650-422-3800
> C: 801-921-4583 
> www.primarydata.com
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Trond Myklebust Feb. 17, 2017, 7:30 p.m. UTC | #3
On Fri, 2017-02-17 at 14:15 -0500, Benjamin Coddington wrote:
> On 17 Feb 2017, at 14:00, Trond Myklebust wrote:

> 

> > On Fri, 2017-02-17 at 13:46 -0500, Benjamin Coddington wrote:

> > > NFS attempts to wait for read and write completion before

> > > unlocking

> > > in

> > > order to ensure that the data returned was protected by the

> > > lock.  When

> > > this waiting is interrupted by a signal, the unlock may never be

> > > sent, and

> > > messages similar to the following are seen in the kernel ring

> > > buffer:

> > > 

> > > [20.167876] Leaked locks on dev=0x0:0x2b ino=0x8dd4c3:

> > > [20.168286] POSIX: fl_owner=ffff880078b06940 fl_flags=0x1

> > > fl_type=0x0

> > > fl_pid=20183

> > > [20.168727] POSIX: fl_owner=ffff880078b06680 fl_flags=0x1

> > > fl_type=0x0

> > > fl_pid=20185

> > > 

> > > For NFSv3, the missing unlock will cause the server to refuse

> > > conflicting

> > > locks indefinitely.  For NFSv4, the leftover lock will be removed

> > > by

> > > the

> > > server after the lease timeout.

> > > 

> > > This patch fixes this for NFSv3 by skipping the wait in order to

> > > immediately send the unlock if the FL_CLOSE flag is set when

> > > signaled.  For

> > > NFSv4, this approach may cause the server to see the I/O as

> > > arriving

> > > with

> > > an old stateid, so, for the v4 case the fix is different: the

> > > wait on

> > > I/O

> > > completion is moved into nfs4_locku_ops'

> > > rpc_call_prepare().  This

> > > will

> > > cause the sleep to happen in rpciod context, and a signal to the

> > > originally

> > > waiting process will not cause the unlock to be skipped.

> > 

> > NACK. I/O waits in rpciod contexts are NOT acceptable. rpciod is

> > part

> > of the memory reclaim chain, so having it sleep on I/O is deadlock

> > prone.

> > 

> > Why is there a need to wait for I/O completion in the first place

> > if

> > the user has killed the task that held the lock? 'kill -9' will

> > cause

> > corruption; that's a fact that no amount of paper will cover over.

> 

> To avoid an unnecessary recovery situation where the server asks us

> to resend

> I/O due to an invalid stateid.

> 


I agree we shouldn't recover in this situation. It would be better to
jettison the failed write, and invalidate the page. Can we make use of
nfs_wb_page_cancel() together with generic_error_remove_page()?

-- 
Trond Myklebust
Linux NFS client maintainer, PrimaryData
trond.myklebust@primarydata.com
Benjamin Coddington Feb. 17, 2017, 8:10 p.m. UTC | #4
On 17 Feb 2017, at 14:30, Trond Myklebust wrote:
> On Fri, 2017-02-17 at 14:15 -0500, Benjamin Coddington wrote:
>> To avoid an unnecessary recovery situation where the server asks us to
>> resend I/O due to an invalid stateid.
>
> I agree we shouldn't recover in this situation. It would be better to
> jettison the failed write, and invalidate the page. Can we make use of
> nfs_wb_page_cancel() together with generic_error_remove_page()?

I'll take a look..

Ben
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Coddington Feb. 21, 2017, 1:56 p.m. UTC | #5
On 17 Feb 2017, at 14:30, Trond Myklebust wrote:

> On Fri, 2017-02-17 at 14:15 -0500, Benjamin Coddington wrote:
>> On 17 Feb 2017, at 14:00, Trond Myklebust wrote:
>>
>>> On Fri, 2017-02-17 at 13:46 -0500, Benjamin Coddington wrote:
>>>> NFS attempts to wait for read and write completion before
>>>> unlocking
>>>> in
>>>> order to ensure that the data returned was protected by the
>>>> lock.  When
>>>> this waiting is interrupted by a signal, the unlock may never be
>>>> sent, and
>>>> messages similar to the following are seen in the kernel ring
>>>> buffer:
>>>>
>>>> [20.167876] Leaked locks on dev=0x0:0x2b ino=0x8dd4c3:
>>>> [20.168286] POSIX: fl_owner=ffff880078b06940 fl_flags=0x1
>>>> fl_type=0x0
>>>> fl_pid=20183
>>>> [20.168727] POSIX: fl_owner=ffff880078b06680 fl_flags=0x1
>>>> fl_type=0x0
>>>> fl_pid=20185
>>>>
>>>> For NFSv3, the missing unlock will cause the server to refuse
>>>> conflicting
>>>> locks indefinitely.  For NFSv4, the leftover lock will be removed
>>>> by
>>>> the
>>>> server after the lease timeout.
>>>>
>>>> This patch fixes this for NFSv3 by skipping the wait in order to
>>>> immediately send the unlock if the FL_CLOSE flag is set when
>>>> signaled.  For
>>>> NFSv4, this approach may cause the server to see the I/O as
>>>> arriving
>>>> with
>>>> an old stateid, so, for the v4 case the fix is different: the
>>>> wait on
>>>> I/O
>>>> completion is moved into nfs4_locku_ops'
>>>> rpc_call_prepare().  This
>>>> will
>>>> cause the sleep to happen in rpciod context, and a signal to the
>>>> originally
>>>> waiting process will not cause the unlock to be skipped.
>>>
>>> NACK. I/O waits in rpciod contexts are NOT acceptable. rpciod is
>>> part
>>> of the memory reclaim chain, so having it sleep on I/O is deadlock
>>> prone.
>>>
>>> Why is there a need to wait for I/O completion in the first place
>>> if
>>> the user has killed the task that held the lock? 'kill -9' will
>>> cause
>>> corruption; that's a fact that no amount of paper will cover over.
>>
>> To avoid an unnecessary recovery situation where the server asks us
>> to resend
>> I/O due to an invalid stateid.
>>
>
> I agree we shouldn't recover in this situation. It would be better to
> jettison the failed write, and invalidate the page. Can we make use of
> nfs_wb_page_cancel() together with generic_error_remove_page()?

Probably we can piggy-back on NFS_LOCK_LOST, then -EIO would get passed up
and the page would make it into generic_error_remove_page().  Any
outstanding writes are likely already transmitted or scheduled to be
tranmitted by now, and the error recovery path for incorrect stateids
doesn't intersect with nfs_wb_page_cancel(), rather it re-schedules the RPC.

But, after looking at this further, I'm not sure how much work should be
done here.  It's a fairly unlikely situation already, and if we assert that
a fatal signal means writes don't have to complete at all, I don't see the
harm in having them complete outside the lock.  Adding extra complexity to
bypass recovery for this specific situation would be optimal, but
unnecessary.

Ben
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index a490f45df4db..adc67fe762e3 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -684,7 +684,6 @@  static int
 do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
 	struct inode *inode = filp->f_mapping->host;
-	struct nfs_lock_context *l_ctx;
 	int status;
 
 	/*
@@ -693,18 +692,6 @@  do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 	 */
 	vfs_fsync(filp, 0);
 
-	l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
-	if (!IS_ERR(l_ctx)) {
-		status = nfs_iocounter_wait(l_ctx);
-		nfs_put_lock_context(l_ctx);
-		if (status < 0)
-			return status;
-	}
-
-	/* NOTE: special case
-	 * 	If we're signalled while cleaning up locks on process exit, we
-	 * 	still need to complete the unlock.
-	 */
 	/*
 	 * Use local locking if mounted with "-onolock" or with appropriate
 	 * "-olocal_lock="
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index dc925b531f32..ec3f12571c82 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -869,6 +869,19 @@  static int
 nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
 	struct inode *inode = file_inode(filp);
+	int status;
+	struct nfs_lock_context *l_ctx;
+
+	/* For v3, always send the unlock on FL_CLOSE */
+	if (fl->fl_type == F_UNLCK) {
+		l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
+		if (!IS_ERR(l_ctx)) {
+			status = nfs_iocounter_wait(l_ctx);
+			nfs_put_lock_context(l_ctx);
+			if (status < 0 && !(fl->fl_flags & FL_CLOSE))
+				return status;
+		}
+	}
 
 	return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 91f88bfbbe79..052b97fd653d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5906,6 +5906,7 @@  struct nfs4_unlockdata {
 	struct nfs_locku_res res;
 	struct nfs4_lock_state *lsp;
 	struct nfs_open_context *ctx;
+	struct nfs_lock_context *l_ctx;
 	struct file_lock fl;
 	struct nfs_server *server;
 	unsigned long timestamp;
@@ -5930,6 +5931,7 @@  static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
 	atomic_inc(&lsp->ls_count);
 	/* Ensure we don't close file until we're done freeing locks! */
 	p->ctx = get_nfs_open_context(ctx);
+	p->l_ctx = nfs_get_lock_context(ctx);
 	memcpy(&p->fl, fl, sizeof(p->fl));
 	p->server = NFS_SERVER(inode);
 	return p;
@@ -5988,6 +5990,11 @@  static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 		/* Note: exit _without_ running nfs4_locku_done */
 		goto out_no_action;
 	}
+
+	if (!IS_ERR(calldata->l_ctx)) {
+		nfs_iocounter_wait(calldata->l_ctx);
+		nfs_put_lock_context(calldata->l_ctx);
+	}
 	calldata->timestamp = jiffies;
 	if (nfs4_setup_sequence(calldata->server,
 				&calldata->arg.seq_args,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 6e629b856a00..8bf3cefdaa96 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -114,6 +114,7 @@  nfs_iocounter_wait(struct nfs_lock_context *l_ctx)
 	return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable,
 			TASK_KILLABLE);
 }
+EXPORT_SYMBOL(nfs_iocounter_wait);
 
 /*
  * nfs_page_group_lock - lock the head of the page group