diff mbox series

[RFC,v6,1/2] fs/lock: add new callback, lm_expire_lock, to lock_manager_operations

Message ID 20211206175942.47326-2-dai.ngo@oracle.com (mailing list archive)
State New, archived
Headers show
Series nfsd: Initial implementation of NFSv4 Courteous Server | expand

Commit Message

Dai Ngo Dec. 6, 2021, 5:59 p.m. UTC
Add new callback, lm_expire_lock, to lock_manager_operations to allow
the lock manager to take appropriate action to resolve the lock conflict
if possible. The callback takes 2 arguments, file_lock of the blocker
and a testonly flag:

testonly = 1  check and return true if lock conflict can be resolved
              else return false.
testonly = 0  resolve the conflict if possible, return true if conflict
              was resolved else return false.

Lock manager, such as NFSv4 courteous server, uses this callback to
resolve conflict by destroying lock owner, or the NFSv4 courtesy client
(client that has expired but is allowed to maintain its states) that owns
the lock.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
---
 fs/locks.c         | 28 +++++++++++++++++++++++++---
 include/linux/fs.h |  1 +
 2 files changed, 26 insertions(+), 3 deletions(-)

Comments

Chuck Lever Dec. 6, 2021, 6:39 p.m. UTC | #1
> On Dec 6, 2021, at 12:59 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> 
> Add new callback, lm_expire_lock, to lock_manager_operations to allow
> the lock manager to take appropriate action to resolve the lock conflict
> if possible. The callback takes 2 arguments, file_lock of the blocker
> and a testonly flag:
> 
> testonly = 1  check and return true if lock conflict can be resolved
>              else return false.
> testonly = 0  resolve the conflict if possible, return true if conflict
>              was resolved esle return false.
> 
> Lock manager, such as NFSv4 courteous server, uses this callback to
> resolve conflict by destroying lock owner, or the NFSv4 courtesy client
> (client that has expired but allowed to maintains its states) that owns
> the lock.
> 
> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>

Al, Jeff, as co-maintainers of record for fs/locks.c, can you give
an Ack or Reviewed-by? I'd like to take this patch through the nfsd
tree for v5.17. Thanks for your time!


> ---
> fs/locks.c         | 28 +++++++++++++++++++++++++---
> include/linux/fs.h |  1 +
> 2 files changed, 26 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/locks.c b/fs/locks.c
> index 3d6fb4ae847b..0fef0a6322c7 100644
> --- a/fs/locks.c
> +++ b/fs/locks.c
> @@ -954,6 +954,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
> 	struct file_lock *cfl;
> 	struct file_lock_context *ctx;
> 	struct inode *inode = locks_inode(filp);
> +	bool ret;
> 
> 	ctx = smp_load_acquire(&inode->i_flctx);
> 	if (!ctx || list_empty_careful(&ctx->flc_posix)) {
> @@ -962,11 +963,20 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
> 	}
> 
> 	spin_lock(&ctx->flc_lock);
> +retry:
> 	list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
> -		if (posix_locks_conflict(fl, cfl)) {
> -			locks_copy_conflock(fl, cfl);
> -			goto out;
> +		if (!posix_locks_conflict(fl, cfl))
> +			continue;
> +		if (cfl->fl_lmops && cfl->fl_lmops->lm_expire_lock &&
> +				cfl->fl_lmops->lm_expire_lock(cfl, 1)) {
> +			spin_unlock(&ctx->flc_lock);
> +			ret = cfl->fl_lmops->lm_expire_lock(cfl, 0);
> +			spin_lock(&ctx->flc_lock);
> +			if (ret)
> +				goto retry;
> 		}
> +		locks_copy_conflock(fl, cfl);
> +		goto out;
> 	}
> 	fl->fl_type = F_UNLCK;
> out:
> @@ -1140,6 +1150,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
> 	int error;
> 	bool added = false;
> 	LIST_HEAD(dispose);
> +	bool ret;
> 
> 	ctx = locks_get_lock_context(inode, request->fl_type);
> 	if (!ctx)
> @@ -1166,9 +1177,20 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
> 	 * blocker's list of waiters and the global blocked_hash.
> 	 */
> 	if (request->fl_type != F_UNLCK) {
> +retry:
> 		list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
> 			if (!posix_locks_conflict(request, fl))
> 				continue;
> +			if (fl->fl_lmops && fl->fl_lmops->lm_expire_lock &&
> +					fl->fl_lmops->lm_expire_lock(fl, 1)) {
> +				spin_unlock(&ctx->flc_lock);
> +				percpu_up_read(&file_rwsem);
> +				ret = fl->fl_lmops->lm_expire_lock(fl, 0);
> +				percpu_down_read(&file_rwsem);
> +				spin_lock(&ctx->flc_lock);
> +				if (ret)
> +					goto retry;
> +			}
> 			if (conflock)
> 				locks_copy_conflock(conflock, fl);
> 			error = -EAGAIN;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e7a633353fd2..1a76b6451398 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1071,6 +1071,7 @@ struct lock_manager_operations {
> 	int (*lm_change)(struct file_lock *, int, struct list_head *);
> 	void (*lm_setup)(struct file_lock *, void **);
> 	bool (*lm_breaker_owns_lease)(struct file_lock *);
> +	bool (*lm_expire_lock)(struct file_lock *fl, bool testonly);
> };
> 
> struct lock_manager {
> -- 
> 2.9.5
> 

--
Chuck Lever
Trond Myklebust Dec. 6, 2021, 7:52 p.m. UTC | #2
On Mon, 2021-12-06 at 18:39 +0000, Chuck Lever III wrote:
> 
> 
> > On Dec 6, 2021, at 12:59 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> > 
> > Add new callback, lm_expire_lock, to lock_manager_operations to
> > allow
> > the lock manager to take appropriate action to resolve the lock
> > conflict
> > if possible. The callback takes 2 arguments, file_lock of the
> > blocker
> > and a testonly flag:
> > 
> > testonly = 1  check and return true if lock conflict can be
> > resolved
> >              else return false.
> > testonly = 0  resolve the conflict if possible, return true if
> > conflict
> >              was resolved esle return false.
> > 
> > Lock manager, such as NFSv4 courteous server, uses this callback to
> > resolve conflict by destroying lock owner, or the NFSv4 courtesy
> > client
> > (client that has expired but allowed to maintains its states) that
> > owns
> > the lock.
> > 
> > Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> 
> Al, Jeff, as co-maintainers of record for fs/locks.c, can you give
> an Ack or Reviewed-by? I'd like to take this patch through the nfsd
> tree for v5.17. Thanks for your time!
> 
> 
> > ---
> > fs/locks.c         | 28 +++++++++++++++++++++++++---
> > include/linux/fs.h |  1 +
> > 2 files changed, 26 insertions(+), 3 deletions(-)
> > 
> > diff --git a/fs/locks.c b/fs/locks.c
> > index 3d6fb4ae847b..0fef0a6322c7 100644
> > --- a/fs/locks.c
> > +++ b/fs/locks.c
> > @@ -954,6 +954,7 @@ posix_test_lock(struct file *filp, struct
> > file_lock *fl)
> >         struct file_lock *cfl;
> >         struct file_lock_context *ctx;
> >         struct inode *inode = locks_inode(filp);
> > +       bool ret;
> > 
> >         ctx = smp_load_acquire(&inode->i_flctx);
> >         if (!ctx || list_empty_careful(&ctx->flc_posix)) {
> > @@ -962,11 +963,20 @@ posix_test_lock(struct file *filp, struct
> > file_lock *fl)
> >         }
> > 
> >         spin_lock(&ctx->flc_lock);
> > +retry:
> >         list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
> > -               if (posix_locks_conflict(fl, cfl)) {
> > -                       locks_copy_conflock(fl, cfl);
> > -                       goto out;
> > +               if (!posix_locks_conflict(fl, cfl))
> > +                       continue;
> > +               if (cfl->fl_lmops && cfl->fl_lmops->lm_expire_lock
> > &&
> > +                               cfl->fl_lmops->lm_expire_lock(cfl,
> > 1)) {
> > +                       spin_unlock(&ctx->flc_lock);
> > +                       ret = cfl->fl_lmops->lm_expire_lock(cfl,
> > 0);
> > +                       spin_lock(&ctx->flc_lock);
> > +                       if (ret)
> > +                               goto retry;
> >                 }
> > +               locks_copy_conflock(fl, cfl);

How do you know 'cfl' still points to a valid object after you've
dropped the spin lock that was protecting the list?

> > +               goto out;
> >         }
> >         fl->fl_type = F_UNLCK;
> > out:
> > @@ -1140,6 +1150,7 @@ static int posix_lock_inode(struct inode
> > *inode, struct file_lock *request,
> >         int error;
> >         bool added = false;
> >         LIST_HEAD(dispose);
> > +       bool ret;
> > 
> >         ctx = locks_get_lock_context(inode, request->fl_type);
> >         if (!ctx)
> > @@ -1166,9 +1177,20 @@ static int posix_lock_inode(struct inode
> > *inode, struct file_lock *request,
> >          * blocker's list of waiters and the global blocked_hash.
> >          */
> >         if (request->fl_type != F_UNLCK) {
> > +retry:
> >                 list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
> >                         if (!posix_locks_conflict(request, fl))
> >                                 continue;
> > +                       if (fl->fl_lmops && fl->fl_lmops-
> > >lm_expire_lock &&
> > +                                       fl->fl_lmops-
> > >lm_expire_lock(fl, 1)) {
> > +                               spin_unlock(&ctx->flc_lock);
> > +                               percpu_up_read(&file_rwsem);
> > +                               ret = fl->fl_lmops-
> > >lm_expire_lock(fl, 0);
> > +                               percpu_down_read(&file_rwsem);
> > +                               spin_lock(&ctx->flc_lock);
> > +                               if (ret)
> > +                                       goto retry;
> > +                       }

ditto.

> >                         if (conflock)
> >                                 locks_copy_conflock(conflock, fl);
> >                         error = -EAGAIN;
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index e7a633353fd2..1a76b6451398 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -1071,6 +1071,7 @@ struct lock_manager_operations {
> >         int (*lm_change)(struct file_lock *, int, struct list_head
> > *);
> >         void (*lm_setup)(struct file_lock *, void **);
> >         bool (*lm_breaker_owns_lease)(struct file_lock *);
> > +       bool (*lm_expire_lock)(struct file_lock *fl, bool
> > testonly);
> > };
> > 
> > struct lock_manager {
> > -- 
> > 2.9.5
> > 
> 
> --
> Chuck Lever
> 
> 
>
J. Bruce Fields Dec. 6, 2021, 8:05 p.m. UTC | #3
On Mon, Dec 06, 2021 at 07:52:29PM +0000, Trond Myklebust wrote:
> On Mon, 2021-12-06 at 18:39 +0000, Chuck Lever III wrote:
> > 
> > 
> > > On Dec 6, 2021, at 12:59 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> > > 
> > > Add new callback, lm_expire_lock, to lock_manager_operations to
> > > allow
> > > the lock manager to take appropriate action to resolve the lock
> > > conflict
> > > if possible. The callback takes 2 arguments, file_lock of the
> > > blocker
> > > and a testonly flag:
> > > 
> > > testonly = 1  check and return true if lock conflict can be
> > > resolved
> > >              else return false.
> > > testonly = 0  resolve the conflict if possible, return true if
> > > conflict
> > >              was resolved esle return false.
> > > 
> > > Lock manager, such as NFSv4 courteous server, uses this callback to
> > > resolve conflict by destroying lock owner, or the NFSv4 courtesy
> > > client
> > > (client that has expired but allowed to maintains its states) that
> > > owns
> > > the lock.
> > > 
> > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> > 
> > Al, Jeff, as co-maintainers of record for fs/locks.c, can you give
> > an Ack or Reviewed-by? I'd like to take this patch through the nfsd
> > tree for v5.17. Thanks for your time!
> > 
> > 
> > > ---
> > > fs/locks.c         | 28 +++++++++++++++++++++++++---
> > > include/linux/fs.h |  1 +
> > > 2 files changed, 26 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/fs/locks.c b/fs/locks.c
> > > index 3d6fb4ae847b..0fef0a6322c7 100644
> > > --- a/fs/locks.c
> > > +++ b/fs/locks.c
> > > @@ -954,6 +954,7 @@ posix_test_lock(struct file *filp, struct
> > > file_lock *fl)
> > >         struct file_lock *cfl;
> > >         struct file_lock_context *ctx;
> > >         struct inode *inode = locks_inode(filp);
> > > +       bool ret;
> > > 
> > >         ctx = smp_load_acquire(&inode->i_flctx);
> > >         if (!ctx || list_empty_careful(&ctx->flc_posix)) {
> > > @@ -962,11 +963,20 @@ posix_test_lock(struct file *filp, struct
> > > file_lock *fl)
> > >         }
> > > 
> > >         spin_lock(&ctx->flc_lock);
> > > +retry:
> > >         list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
> > > -               if (posix_locks_conflict(fl, cfl)) {
> > > -                       locks_copy_conflock(fl, cfl);
> > > -                       goto out;
> > > +               if (!posix_locks_conflict(fl, cfl))
> > > +                       continue;
> > > +               if (cfl->fl_lmops && cfl->fl_lmops->lm_expire_lock
> > > &&
> > > +                               cfl->fl_lmops->lm_expire_lock(cfl,
> > > 1)) {
> > > +                       spin_unlock(&ctx->flc_lock);
> > > +                       ret = cfl->fl_lmops->lm_expire_lock(cfl,
> > > 0);
> > > +                       spin_lock(&ctx->flc_lock);
> > > +                       if (ret)
> > > +                               goto retry;
> > >                 }
> > > +               locks_copy_conflock(fl, cfl);
> 
> How do you know 'cfl' still points to a valid object after you've
> dropped the spin lock that was protecting the list?

Ugh, good point, I should have noticed that when I suggested this
approach....

Maybe the first call could instead return return some reference-counted
object that a second call could wait on.

Better, maybe it could add itself to a list of such things and then we
could do this in one pass.

--b.

> 
> > > +               goto out;
> > >         }
> > >         fl->fl_type = F_UNLCK;
> > > out:
> > > @@ -1140,6 +1150,7 @@ static int posix_lock_inode(struct inode
> > > *inode, struct file_lock *request,
> > >         int error;
> > >         bool added = false;
> > >         LIST_HEAD(dispose);
> > > +       bool ret;
> > > 
> > >         ctx = locks_get_lock_context(inode, request->fl_type);
> > >         if (!ctx)
> > > @@ -1166,9 +1177,20 @@ static int posix_lock_inode(struct inode
> > > *inode, struct file_lock *request,
> > >          * blocker's list of waiters and the global blocked_hash.
> > >          */
> > >         if (request->fl_type != F_UNLCK) {
> > > +retry:
> > >                 list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
> > >                         if (!posix_locks_conflict(request, fl))
> > >                                 continue;
> > > +                       if (fl->fl_lmops && fl->fl_lmops-
> > > >lm_expire_lock &&
> > > +                                       fl->fl_lmops-
> > > >lm_expire_lock(fl, 1)) {
> > > +                               spin_unlock(&ctx->flc_lock);
> > > +                               percpu_up_read(&file_rwsem);
> > > +                               ret = fl->fl_lmops-
> > > >lm_expire_lock(fl, 0);
> > > +                               percpu_down_read(&file_rwsem);
> > > +                               spin_lock(&ctx->flc_lock);
> > > +                               if (ret)
> > > +                                       goto retry;
> > > +                       }
> 
> ditto.
> 
> > >                         if (conflock)
> > >                                 locks_copy_conflock(conflock, fl);
> > >                         error = -EAGAIN;
> > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > index e7a633353fd2..1a76b6451398 100644
> > > --- a/include/linux/fs.h
> > > +++ b/include/linux/fs.h
> > > @@ -1071,6 +1071,7 @@ struct lock_manager_operations {
> > >         int (*lm_change)(struct file_lock *, int, struct list_head
> > > *);
> > >         void (*lm_setup)(struct file_lock *, void **);
> > >         bool (*lm_breaker_owns_lease)(struct file_lock *);
> > > +       bool (*lm_expire_lock)(struct file_lock *fl, bool
> > > testonly);
> > > };
> > > 
> > > struct lock_manager {
> > > -- 
> > > 2.9.5
> > > 
> > 
> > --
> > Chuck Lever
> > 
> > 
> > 
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
> 
>
Dai Ngo Dec. 6, 2021, 8:36 p.m. UTC | #4
On 12/6/21 12:05 PM, bfields@fieldses.org wrote:
> On Mon, Dec 06, 2021 at 07:52:29PM +0000, Trond Myklebust wrote:
>> On Mon, 2021-12-06 at 18:39 +0000, Chuck Lever III wrote:
>>>
>>>> On Dec 6, 2021, at 12:59 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
>>>>
>>>> Add new callback, lm_expire_lock, to lock_manager_operations to
>>>> allow
>>>> the lock manager to take appropriate action to resolve the lock
>>>> conflict
>>>> if possible. The callback takes 2 arguments, file_lock of the
>>>> blocker
>>>> and a testonly flag:
>>>>
>>>> testonly = 1  check and return true if lock conflict can be
>>>> resolved
>>>>               else return false.
>>>> testonly = 0  resolve the conflict if possible, return true if
>>>> conflict
>>>>               was resolved esle return false.
>>>>
>>>> Lock manager, such as NFSv4 courteous server, uses this callback to
>>>> resolve conflict by destroying lock owner, or the NFSv4 courtesy
>>>> client
>>>> (client that has expired but allowed to maintains its states) that
>>>> owns
>>>> the lock.
>>>>
>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>> Al, Jeff, as co-maintainers of record for fs/locks.c, can you give
>>> an Ack or Reviewed-by? I'd like to take this patch through the nfsd
>>> tree for v5.17. Thanks for your time!
>>>
>>>
>>>> ---
>>>> fs/locks.c         | 28 +++++++++++++++++++++++++---
>>>> include/linux/fs.h |  1 +
>>>> 2 files changed, 26 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/fs/locks.c b/fs/locks.c
>>>> index 3d6fb4ae847b..0fef0a6322c7 100644
>>>> --- a/fs/locks.c
>>>> +++ b/fs/locks.c
>>>> @@ -954,6 +954,7 @@ posix_test_lock(struct file *filp, struct
>>>> file_lock *fl)
>>>>          struct file_lock *cfl;
>>>>          struct file_lock_context *ctx;
>>>>          struct inode *inode = locks_inode(filp);
>>>> +       bool ret;
>>>>
>>>>          ctx = smp_load_acquire(&inode->i_flctx);
>>>>          if (!ctx || list_empty_careful(&ctx->flc_posix)) {
>>>> @@ -962,11 +963,20 @@ posix_test_lock(struct file *filp, struct
>>>> file_lock *fl)
>>>>          }
>>>>
>>>>          spin_lock(&ctx->flc_lock);
>>>> +retry:
>>>>          list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
>>>> -               if (posix_locks_conflict(fl, cfl)) {
>>>> -                       locks_copy_conflock(fl, cfl);
>>>> -                       goto out;
>>>> +               if (!posix_locks_conflict(fl, cfl))
>>>> +                       continue;
>>>> +               if (cfl->fl_lmops && cfl->fl_lmops->lm_expire_lock
>>>> &&
>>>> +                               cfl->fl_lmops->lm_expire_lock(cfl,
>>>> 1)) {
>>>> +                       spin_unlock(&ctx->flc_lock);
>>>> +                       ret = cfl->fl_lmops->lm_expire_lock(cfl,
>>>> 0);
>>>> +                       spin_lock(&ctx->flc_lock);
>>>> +                       if (ret)
>>>> +                               goto retry;
>>>>                  }
>>>> +               locks_copy_conflock(fl, cfl);
>> How do you know 'cfl' still points to a valid object after you've
>> dropped the spin lock that was protecting the list?
> Ugh, good point, I should have noticed that when I suggested this
> approach....
>
> Maybe the first call could instead return return some reference-counted
> object that a second call could wait on.
>
> Better, maybe it could add itself to a list of such things and then we
> could do this in one pass.

I think we adjust this logic a little bit to cover race condition:

The 1st call to lm_expire_lock returns the client needs to be expired.

Before we make the 2nd call, we save the 'lm_expire_lock' into a local
variable then drop the spinlock, and use the local variable to make the
2nd call so that we do not reference 'cfl'. The argument of the second
is the opaque return value from the 1st call.

nfsd4_fl_expire_lock also needs some adjustment to support the above.

-Dai

>
> --b.
>
>>>> +               goto out;
>>>>          }
>>>>          fl->fl_type = F_UNLCK;
>>>> out:
>>>> @@ -1140,6 +1150,7 @@ static int posix_lock_inode(struct inode
>>>> *inode, struct file_lock *request,
>>>>          int error;
>>>>          bool added = false;
>>>>          LIST_HEAD(dispose);
>>>> +       bool ret;
>>>>
>>>>          ctx = locks_get_lock_context(inode, request->fl_type);
>>>>          if (!ctx)
>>>> @@ -1166,9 +1177,20 @@ static int posix_lock_inode(struct inode
>>>> *inode, struct file_lock *request,
>>>>           * blocker's list of waiters and the global blocked_hash.
>>>>           */
>>>>          if (request->fl_type != F_UNLCK) {
>>>> +retry:
>>>>                  list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
>>>>                          if (!posix_locks_conflict(request, fl))
>>>>                                  continue;
>>>> +                       if (fl->fl_lmops && fl->fl_lmops-
>>>>> lm_expire_lock &&
>>>> +                                       fl->fl_lmops-
>>>>> lm_expire_lock(fl, 1)) {
>>>> +                               spin_unlock(&ctx->flc_lock);
>>>> +                               percpu_up_read(&file_rwsem);
>>>> +                               ret = fl->fl_lmops-
>>>>> lm_expire_lock(fl, 0);
>>>> +                               percpu_down_read(&file_rwsem);
>>>> +                               spin_lock(&ctx->flc_lock);
>>>> +                               if (ret)
>>>> +                                       goto retry;
>>>> +                       }
>> ditto.
>>
>>>>                          if (conflock)
>>>>                                  locks_copy_conflock(conflock, fl);
>>>>                          error = -EAGAIN;
>>>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>>>> index e7a633353fd2..1a76b6451398 100644
>>>> --- a/include/linux/fs.h
>>>> +++ b/include/linux/fs.h
>>>> @@ -1071,6 +1071,7 @@ struct lock_manager_operations {
>>>>          int (*lm_change)(struct file_lock *, int, struct list_head
>>>> *);
>>>>          void (*lm_setup)(struct file_lock *, void **);
>>>>          bool (*lm_breaker_owns_lease)(struct file_lock *);
>>>> +       bool (*lm_expire_lock)(struct file_lock *fl, bool
>>>> testonly);
>>>> };
>>>>
>>>> struct lock_manager {
>>>> -- 
>>>> 2.9.5
>>>>
>>> --
>>> Chuck Lever
>>>
>>>
>>>
>> -- 
>> Trond Myklebust
>> Linux NFS client maintainer, Hammerspace
>> trond.myklebust@hammerspace.com
>>
>>
Trond Myklebust Dec. 6, 2021, 10:05 p.m. UTC | #5
On Mon, 2021-12-06 at 12:36 -0800, dai.ngo@oracle.com wrote:
> 
> On 12/6/21 12:05 PM, bfields@fieldses.org wrote:
> > On Mon, Dec 06, 2021 at 07:52:29PM +0000, Trond Myklebust wrote:
> > > On Mon, 2021-12-06 at 18:39 +0000, Chuck Lever III wrote:
> > > > 
> > > > > On Dec 6, 2021, at 12:59 PM, Dai Ngo <dai.ngo@oracle.com>
> > > > > wrote:
> > > > > 
> > > > > Add new callback, lm_expire_lock, to lock_manager_operations
> > > > > to
> > > > > allow
> > > > > the lock manager to take appropriate action to resolve the
> > > > > lock
> > > > > conflict
> > > > > if possible. The callback takes 2 arguments, file_lock of the
> > > > > blocker
> > > > > and a testonly flag:
> > > > > 
> > > > > testonly = 1  check and return true if lock conflict can be
> > > > > resolved
> > > > >               else return false.
> > > > > testonly = 0  resolve the conflict if possible, return true
> > > > > if
> > > > > conflict
> > > > >               was resolved esle return false.
> > > > > 
> > > > > Lock manager, such as NFSv4 courteous server, uses this
> > > > > callback to
> > > > > resolve conflict by destroying lock owner, or the NFSv4
> > > > > courtesy
> > > > > client
> > > > > (client that has expired but allowed to maintains its states)
> > > > > that
> > > > > owns
> > > > > the lock.
> > > > > 
> > > > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> > > > Al, Jeff, as co-maintainers of record for fs/locks.c, can you
> > > > give
> > > > an Ack or Reviewed-by? I'd like to take this patch through the
> > > > nfsd
> > > > tree for v5.17. Thanks for your time!
> > > > 
> > > > 
> > > > > ---
> > > > > fs/locks.c         | 28 +++++++++++++++++++++++++---
> > > > > include/linux/fs.h |  1 +
> > > > > 2 files changed, 26 insertions(+), 3 deletions(-)
> > > > > 
> > > > > diff --git a/fs/locks.c b/fs/locks.c
> > > > > index 3d6fb4ae847b..0fef0a6322c7 100644
> > > > > --- a/fs/locks.c
> > > > > +++ b/fs/locks.c
> > > > > @@ -954,6 +954,7 @@ posix_test_lock(struct file *filp, struct
> > > > > file_lock *fl)
> > > > >          struct file_lock *cfl;
> > > > >          struct file_lock_context *ctx;
> > > > >          struct inode *inode = locks_inode(filp);
> > > > > +       bool ret;
> > > > > 
> > > > >          ctx = smp_load_acquire(&inode->i_flctx);
> > > > >          if (!ctx || list_empty_careful(&ctx->flc_posix)) {
> > > > > @@ -962,11 +963,20 @@ posix_test_lock(struct file *filp,
> > > > > struct
> > > > > file_lock *fl)
> > > > >          }
> > > > > 
> > > > >          spin_lock(&ctx->flc_lock);
> > > > > +retry:
> > > > >          list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
> > > > > -               if (posix_locks_conflict(fl, cfl)) {
> > > > > -                       locks_copy_conflock(fl, cfl);
> > > > > -                       goto out;
> > > > > +               if (!posix_locks_conflict(fl, cfl))
> > > > > +                       continue;
> > > > > +               if (cfl->fl_lmops && cfl->fl_lmops-
> > > > > >lm_expire_lock
> > > > > &&
> > > > > +                               cfl->fl_lmops-
> > > > > >lm_expire_lock(cfl,
> > > > > 1)) {
> > > > > +                       spin_unlock(&ctx->flc_lock);
> > > > > +                       ret = cfl->fl_lmops-
> > > > > >lm_expire_lock(cfl,
> > > > > 0);
> > > > > +                       spin_lock(&ctx->flc_lock);
> > > > > +                       if (ret)
> > > > > +                               goto retry;
> > > > >                  }
> > > > > +               locks_copy_conflock(fl, cfl);
> > > How do you know 'cfl' still points to a valid object after you've
> > > dropped the spin lock that was protecting the list?
> > Ugh, good point, I should have noticed that when I suggested this
> > approach....
> > 
> > Maybe the first call could instead return return some reference-
> > counted
> > object that a second call could wait on.
> > 
> > Better, maybe it could add itself to a list of such things and then
> > we
> > could do this in one pass.
> 
> I think we adjust this logic a little bit to cover race condition:
> 
> The 1st call to lm_expire_lock returns the client needs to be
> expired.
> 
> Before we make the 2nd call, we save the 'lm_expire_lock' into a
> local
> variable then drop the spinlock, and use the local variable to make
> the
> 2nd call so that we do not reference 'cfl'. The argument of the
> second
> is the opaque return value from the 1st call.
> 
> nfsd4_fl_expire_lock also needs some adjustment to support the above.
> 

It's not just the fact that you're using 'cfl' in the actual call to
lm_expire_lock(), but you're also using it after retaking the spinlock.
Dai Ngo Dec. 6, 2021, 11:07 p.m. UTC | #6
On 12/6/21 2:05 PM, Trond Myklebust wrote:
> On Mon, 2021-12-06 at 12:36 -0800, dai.ngo@oracle.com wrote:
>> On 12/6/21 12:05 PM, bfields@fieldses.org wrote:
>>> On Mon, Dec 06, 2021 at 07:52:29PM +0000, Trond Myklebust wrote:
>>>> On Mon, 2021-12-06 at 18:39 +0000, Chuck Lever III wrote:
>>>>>> On Dec 6, 2021, at 12:59 PM, Dai Ngo <dai.ngo@oracle.com>
>>>>>> wrote:
>>>>>>
>>>>>> Add new callback, lm_expire_lock, to lock_manager_operations
>>>>>> to
>>>>>> allow
>>>>>> the lock manager to take appropriate action to resolve the
>>>>>> lock
>>>>>> conflict
>>>>>> if possible. The callback takes 2 arguments, file_lock of the
>>>>>> blocker
>>>>>> and a testonly flag:
>>>>>>
>>>>>> testonly = 1  check and return true if lock conflict can be
>>>>>> resolved
>>>>>>                else return false.
>>>>>> testonly = 0  resolve the conflict if possible, return true
>>>>>> if
>>>>>> conflict
>>>>>>                was resolved esle return false.
>>>>>>
>>>>>> Lock manager, such as NFSv4 courteous server, uses this
>>>>>> callback to
>>>>>> resolve conflict by destroying lock owner, or the NFSv4
>>>>>> courtesy
>>>>>> client
>>>>>> (client that has expired but allowed to maintains its states)
>>>>>> that
>>>>>> owns
>>>>>> the lock.
>>>>>>
>>>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>>>> Al, Jeff, as co-maintainers of record for fs/locks.c, can you
>>>>> give
>>>>> an Ack or Reviewed-by? I'd like to take this patch through the
>>>>> nfsd
>>>>> tree for v5.17. Thanks for your time!
>>>>>
>>>>>
>>>>>> ---
>>>>>> fs/locks.c         | 28 +++++++++++++++++++++++++---
>>>>>> include/linux/fs.h |  1 +
>>>>>> 2 files changed, 26 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/locks.c b/fs/locks.c
>>>>>> index 3d6fb4ae847b..0fef0a6322c7 100644
>>>>>> --- a/fs/locks.c
>>>>>> +++ b/fs/locks.c
>>>>>> @@ -954,6 +954,7 @@ posix_test_lock(struct file *filp, struct
>>>>>> file_lock *fl)
>>>>>>           struct file_lock *cfl;
>>>>>>           struct file_lock_context *ctx;
>>>>>>           struct inode *inode = locks_inode(filp);
>>>>>> +       bool ret;
>>>>>>
>>>>>>           ctx = smp_load_acquire(&inode->i_flctx);
>>>>>>           if (!ctx || list_empty_careful(&ctx->flc_posix)) {
>>>>>> @@ -962,11 +963,20 @@ posix_test_lock(struct file *filp,
>>>>>> struct
>>>>>> file_lock *fl)
>>>>>>           }
>>>>>>
>>>>>>           spin_lock(&ctx->flc_lock);
>>>>>> +retry:
>>>>>>           list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
>>>>>> -               if (posix_locks_conflict(fl, cfl)) {
>>>>>> -                       locks_copy_conflock(fl, cfl);
>>>>>> -                       goto out;
>>>>>> +               if (!posix_locks_conflict(fl, cfl))
>>>>>> +                       continue;
>>>>>> +               if (cfl->fl_lmops && cfl->fl_lmops-
>>>>>>> lm_expire_lock
>>>>>> &&
>>>>>> +                               cfl->fl_lmops-
>>>>>>> lm_expire_lock(cfl,
>>>>>> 1)) {
>>>>>> +                       spin_unlock(&ctx->flc_lock);
>>>>>> +                       ret = cfl->fl_lmops-
>>>>>>> lm_expire_lock(cfl,
>>>>>> 0);
>>>>>> +                       spin_lock(&ctx->flc_lock);
>>>>>> +                       if (ret)
>>>>>> +                               goto retry;
>>>>>>                   }
>>>>>> +               locks_copy_conflock(fl, cfl);
>>>> How do you know 'cfl' still points to a valid object after you've
>>>> dropped the spin lock that was protecting the list?
>>> Ugh, good point, I should have noticed that when I suggested this
>>> approach....
>>>
>>> Maybe the first call could instead return return some reference-
>>> counted
>>> object that a second call could wait on.
>>>
>>> Better, maybe it could add itself to a list of such things and then
>>> we
>>> could do this in one pass.
>> I think we adjust this logic a little bit to cover race condition:
>>
>> The 1st call to lm_expire_lock returns the client needs to be
>> expired.
>>
>> Before we make the 2nd call, we save the 'lm_expire_lock' into a
>> local
>> variable then drop the spinlock, and use the local variable to make
>> the
>> 2nd call so that we do not reference 'cfl'. The argument of the
>> second
>> is the opaque return value from the 1st call.
>>
>> nfsd4_fl_expire_lock also needs some adjustment to support the above.
>>
> It's not just the fact that you're using 'cfl' in the actual call to
> lm_expire_lock(), but you're also using it after retaking the spinlock.

I plan to do this:

         checked_cfl = NULL;
         spin_lock(&ctx->flc_lock);
retry:
         list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
                 if (!posix_locks_conflict(fl, cfl))
                         continue;
                if (checked_cfl != cfl && cfl->fl_lmops && cfl->fl_lmops->lm_expire_lock) {
                        void *res_data;

                        res_data = cfl->fl_lmops->lm_expire_lock(cfl, 1);
                        if (res_data) {
                                func = cfl->fl_lmops->lm_expire_lock;
                                spin_unlock(&ctx->flc_lock);
                                func(res_data, 0);
                                spin_lock(&ctx->flc_lock);
                                checked_cfl = cfl;
                                goto retry;
                        }
                }
                locks_copy_conflock(fl, cfl);
                goto out;
         fl->fl_type = F_UNLCK;

Do you still see problems with this?

-Dai
diff mbox series

Patch

diff --git a/fs/locks.c b/fs/locks.c
index 3d6fb4ae847b..0fef0a6322c7 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -954,6 +954,7 @@  posix_test_lock(struct file *filp, struct file_lock *fl)
 	struct file_lock *cfl;
 	struct file_lock_context *ctx;
 	struct inode *inode = locks_inode(filp);
+	bool ret;
 
 	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx || list_empty_careful(&ctx->flc_posix)) {
@@ -962,11 +963,20 @@  posix_test_lock(struct file *filp, struct file_lock *fl)
 	}
 
 	spin_lock(&ctx->flc_lock);
+retry:
 	list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
-		if (posix_locks_conflict(fl, cfl)) {
-			locks_copy_conflock(fl, cfl);
-			goto out;
+		if (!posix_locks_conflict(fl, cfl))
+			continue;
+		if (cfl->fl_lmops && cfl->fl_lmops->lm_expire_lock &&
+				cfl->fl_lmops->lm_expire_lock(cfl, 1)) {
+			spin_unlock(&ctx->flc_lock);
+			ret = cfl->fl_lmops->lm_expire_lock(cfl, 0);
+			spin_lock(&ctx->flc_lock);
+			if (ret)
+				goto retry;
 		}
+		locks_copy_conflock(fl, cfl);
+		goto out;
 	}
 	fl->fl_type = F_UNLCK;
 out:
@@ -1140,6 +1150,7 @@  static int posix_lock_inode(struct inode *inode, struct file_lock *request,
 	int error;
 	bool added = false;
 	LIST_HEAD(dispose);
+	bool ret;
 
 	ctx = locks_get_lock_context(inode, request->fl_type);
 	if (!ctx)
@@ -1166,9 +1177,20 @@  static int posix_lock_inode(struct inode *inode, struct file_lock *request,
 	 * blocker's list of waiters and the global blocked_hash.
 	 */
 	if (request->fl_type != F_UNLCK) {
+retry:
 		list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
 			if (!posix_locks_conflict(request, fl))
 				continue;
+			if (fl->fl_lmops && fl->fl_lmops->lm_expire_lock &&
+					fl->fl_lmops->lm_expire_lock(fl, 1)) {
+				spin_unlock(&ctx->flc_lock);
+				percpu_up_read(&file_rwsem);
+				ret = fl->fl_lmops->lm_expire_lock(fl, 0);
+				percpu_down_read(&file_rwsem);
+				spin_lock(&ctx->flc_lock);
+				if (ret)
+					goto retry;
+			}
 			if (conflock)
 				locks_copy_conflock(conflock, fl);
 			error = -EAGAIN;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e7a633353fd2..1a76b6451398 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1071,6 +1071,7 @@  struct lock_manager_operations {
 	int (*lm_change)(struct file_lock *, int, struct list_head *);
 	void (*lm_setup)(struct file_lock *, void **);
 	bool (*lm_breaker_owns_lease)(struct file_lock *);
+	bool (*lm_expire_lock)(struct file_lock *fl, bool testonly);
 };
 
 struct lock_manager {