[1/2] NFSD: handle GETATTR conflict with write delegation

Message ID	1685122722-18287-2-git-send-email-dai.ngo@oracle.com (mailing list archive)
State	Superseded, archived
Headers	show Return-Path: <linux-fsdevel-owner@vger.kernel.org> From: Dai Ngo <dai.ngo@oracle.com> To: chuck.lever@oracle.com, jlayton@kernel.org Cc: linux-nfs@vger.kernel.org, linux-fsdevel@vger.kernel.org Subject: [PATCH 1/2] NFSD: handle GETATTR conflict with write delegation Date: Fri, 26 May 2023 10:38:41 -0700 Message-Id: <1685122722-18287-2-git-send-email-dai.ngo@oracle.com> In-Reply-To: <1685122722-18287-1-git-send-email-dai.ngo@oracle.com> References: <1685122722-18287-1-git-send-email-dai.ngo@oracle.com> Precedence: bulk
Series	NFSD: recall write delegation on GETATTR conflict \| expand [0/2] NFSD: recall write delegation on GETATTR conflict [1/2] NFSD: handle GETATTR conflict with write delegation [2/2] NFSD: add counter for write delegation recall due to conflict with GETATTR

Dai Ngo May 26, 2023, 5:38 p.m. UTC

If the GETATTR request on a file that has write delegation in effect
and the request attributes include the change info and size attribute
then the write delegation is recalled. The server waits a maximum of
90ms for the delegation to be returned before replying NFS4ERR_DELAY
for the GETATTR.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
---
 fs/nfsd/nfs4state.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs4xdr.c   |  5 +++++
 fs/nfsd/state.h     |  3 +++
 3 files changed, 56 insertions(+)

Chuck Lever May 26, 2023, 6:38 p.m. UTC | #1

On Fri, May 26, 2023 at 10:38:41AM -0700, Dai Ngo wrote:
> If the GETATTR request on a file that has write delegation in effect
> and the request attributes include the change info and size attribute
> then the write delegation is recalled. The server waits a maximum of
> 90ms for the delegation to be returned before replying NFS4ERR_DELAY
> for the GETATTR.
> 
> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> ---
>  fs/nfsd/nfs4state.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfs4xdr.c   |  5 +++++
>  fs/nfsd/state.h     |  3 +++
>  3 files changed, 56 insertions(+)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index b90b74a5e66e..9f551dbf50d6 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -8353,3 +8353,51 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
>  {
>  	get_stateid(cstate, &u->write.wr_stateid);
>  }
> +
> +/**
> + * nfsd4_deleg_getattr_conflict - Trigger recall if GETATTR causes conflict
> + * @rqstp: RPC transaction context
> + * @inode: file to be checked for a conflict
> + *

Let's have this comment explain why this is necessary. At the least,
it needs to cite RFC 8881 Section 18.7.4, which REQUIREs a conflicting
write delegation to be gone before the server can respond to a
change/size GETATTR request.


> + * Returns 0 if there is no conflict; otherwise an nfs_stat
> + * code is returned.
> + */
> +__be32
> +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
> +{
> +	__be32 status;
> +	int cnt;
> +	struct file_lock_context *ctx;
> +	struct file_lock *fl;
> +	struct nfs4_delegation *dp;
> +
> +	ctx = locks_inode_context(inode);
> +	if (!ctx)
> +		return 0;
> +	spin_lock(&ctx->flc_lock);
> +	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
> +		if (fl->fl_flags == FL_LAYOUT ||
> +				fl->fl_lmops != &nfsd_lease_mng_ops)
> +			continue;
> +		if (fl->fl_type == F_WRLCK) {
> +			dp = fl->fl_owner;
> +			if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) {
> +				spin_unlock(&ctx->flc_lock);
> +				return 0;
> +			}
> +			spin_unlock(&ctx->flc_lock);
> +			status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
> +			if (status != nfserr_jukebox)
> +				return status;
> +			for (cnt = 3; cnt > 0; --cnt) {
> +				if (!nfsd_wait_for_delegreturn(rqstp, inode))
> +					continue;
> +				return 0;
> +			}

I'd rather not retry here. Can you can say why a 30ms wait is not
sufficient for this case?


> +			return status;
> +		}
> +		break;
> +	}
> +	spin_unlock(&ctx->flc_lock);
> +	return 0;
> +}
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index b83954fc57e3..4590b893dbc8 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2970,6 +2970,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
>  		if (status)
>  			goto out;
>  	}
> +	if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
> +		status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry));
> +		if (status)
> +			goto out;
> +	}
>  
>  	err = vfs_getattr(&path, &stat,
>  			  STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE,
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index d49d3060ed4f..cbddcf484dba 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -732,4 +732,7 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
>  	cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE);
>  	return clp->cl_state == NFSD4_EXPIRABLE;
>  }
> +
> +extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
> +				struct inode *inode);
>  #endif   /* NFSD4_STATE_H */
> -- 
> 2.9.5
>

Dai Ngo May 26, 2023, 7:34 p.m. UTC | #2

On 5/26/23 11:38 AM, Chuck Lever wrote:
> On Fri, May 26, 2023 at 10:38:41AM -0700, Dai Ngo wrote:
>> If the GETATTR request on a file that has write delegation in effect
>> and the request attributes include the change info and size attribute
>> then the write delegation is recalled. The server waits a maximum of
>> 90ms for the delegation to be returned before replying NFS4ERR_DELAY
>> for the GETATTR.
>>
>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>> ---
>>   fs/nfsd/nfs4state.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
>>   fs/nfsd/nfs4xdr.c   |  5 +++++
>>   fs/nfsd/state.h     |  3 +++
>>   3 files changed, 56 insertions(+)
>>
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index b90b74a5e66e..9f551dbf50d6 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -8353,3 +8353,51 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
>>   {
>>   	get_stateid(cstate, &u->write.wr_stateid);
>>   }
>> +
>> +/**
>> + * nfsd4_deleg_getattr_conflict - Trigger recall if GETATTR causes conflict
>> + * @rqstp: RPC transaction context
>> + * @inode: file to be checked for a conflict
>> + *
> Let's have this comment explain why this is necessary. At the least,
> it needs to cite RFC 8881 Section 18.7.4, which REQUIREs a conflicting
> write delegation to be gone before the server can respond to a
> change/size GETATTR request.

ok, will add the comment.

>
>
>> + * Returns 0 if there is no conflict; otherwise an nfs_stat
>> + * code is returned.
>> + */
>> +__be32
>> +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
>> +{
>> +	__be32 status;
>> +	int cnt;
>> +	struct file_lock_context *ctx;
>> +	struct file_lock *fl;
>> +	struct nfs4_delegation *dp;
>> +
>> +	ctx = locks_inode_context(inode);
>> +	if (!ctx)
>> +		return 0;
>> +	spin_lock(&ctx->flc_lock);
>> +	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
>> +		if (fl->fl_flags == FL_LAYOUT ||
>> +				fl->fl_lmops != &nfsd_lease_mng_ops)
>> +			continue;
>> +		if (fl->fl_type == F_WRLCK) {
>> +			dp = fl->fl_owner;
>> +			if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) {
>> +				spin_unlock(&ctx->flc_lock);
>> +				return 0;
>> +			}
>> +			spin_unlock(&ctx->flc_lock);
>> +			status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
>> +			if (status != nfserr_jukebox)
>> +				return status;
>> +			for (cnt = 3; cnt > 0; --cnt) {
>> +				if (!nfsd_wait_for_delegreturn(rqstp, inode))
>> +					continue;
>> +				return 0;
>> +			}
> I'd rather not retry here. Can you can say why a 30ms wait is not
> sufficient for this case?

on my VMs, it takes about 80ms for the the delegation return to complete.

-Dai

>
>
>> +			return status;
>> +		}
>> +		break;
>> +	}
>> +	spin_unlock(&ctx->flc_lock);
>> +	return 0;
>> +}
>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>> index b83954fc57e3..4590b893dbc8 100644
>> --- a/fs/nfsd/nfs4xdr.c
>> +++ b/fs/nfsd/nfs4xdr.c
>> @@ -2970,6 +2970,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
>>   		if (status)
>>   			goto out;
>>   	}
>> +	if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
>> +		status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry));
>> +		if (status)
>> +			goto out;
>> +	}
>>   
>>   	err = vfs_getattr(&path, &stat,
>>   			  STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE,
>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>> index d49d3060ed4f..cbddcf484dba 100644
>> --- a/fs/nfsd/state.h
>> +++ b/fs/nfsd/state.h
>> @@ -732,4 +732,7 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
>>   	cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE);
>>   	return clp->cl_state == NFSD4_EXPIRABLE;
>>   }
>> +
>> +extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
>> +				struct inode *inode);
>>   #endif   /* NFSD4_STATE_H */
>> -- 
>> 2.9.5
>>

Dai Ngo May 26, 2023, 7:36 p.m. UTC | #3

On 5/26/23 12:34 PM, dai.ngo@oracle.com wrote:
>
> On 5/26/23 11:38 AM, Chuck Lever wrote:
>> On Fri, May 26, 2023 at 10:38:41AM -0700, Dai Ngo wrote:
>>> If the GETATTR request on a file that has write delegation in effect
>>> and the request attributes include the change info and size attribute
>>> then the write delegation is recalled. The server waits a maximum of
>>> 90ms for the delegation to be returned before replying NFS4ERR_DELAY
>>> for the GETATTR.
>>>
>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>> ---
>>>   fs/nfsd/nfs4state.c | 48 
>>> ++++++++++++++++++++++++++++++++++++++++++++++++
>>>   fs/nfsd/nfs4xdr.c   |  5 +++++
>>>   fs/nfsd/state.h     |  3 +++
>>>   3 files changed, 56 insertions(+)
>>>
>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>> index b90b74a5e66e..9f551dbf50d6 100644
>>> --- a/fs/nfsd/nfs4state.c
>>> +++ b/fs/nfsd/nfs4state.c
>>> @@ -8353,3 +8353,51 @@ nfsd4_get_writestateid(struct 
>>> nfsd4_compound_state *cstate,
>>>   {
>>>       get_stateid(cstate, &u->write.wr_stateid);
>>>   }
>>> +
>>> +/**
>>> + * nfsd4_deleg_getattr_conflict - Trigger recall if GETATTR causes 
>>> conflict
>>> + * @rqstp: RPC transaction context
>>> + * @inode: file to be checked for a conflict
>>> + *
>> Let's have this comment explain why this is necessary. At the least,
>> it needs to cite RFC 8881 Section 18.7.4, which REQUIREs a conflicting
>> write delegation to be gone before the server can respond to a
>> change/size GETATTR request.
>
> ok, will add the comment.
>
>>
>>
>>> + * Returns 0 if there is no conflict; otherwise an nfs_stat
>>> + * code is returned.
>>> + */
>>> +__be32
>>> +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode 
>>> *inode)
>>> +{
>>> +    __be32 status;
>>> +    int cnt;
>>> +    struct file_lock_context *ctx;
>>> +    struct file_lock *fl;
>>> +    struct nfs4_delegation *dp;
>>> +
>>> +    ctx = locks_inode_context(inode);
>>> +    if (!ctx)
>>> +        return 0;
>>> +    spin_lock(&ctx->flc_lock);
>>> +    list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
>>> +        if (fl->fl_flags == FL_LAYOUT ||
>>> +                fl->fl_lmops != &nfsd_lease_mng_ops)
>>> +            continue;
>>> +        if (fl->fl_type == F_WRLCK) {
>>> +            dp = fl->fl_owner;
>>> +            if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) {
>>> +                spin_unlock(&ctx->flc_lock);
>>> +                return 0;
>>> +            }
>>> +            spin_unlock(&ctx->flc_lock);
>>> +            status = nfserrno(nfsd_open_break_lease(inode, 
>>> NFSD_MAY_READ));
>>> +            if (status != nfserr_jukebox)
>>> +                return status;
>>> +            for (cnt = 3; cnt > 0; --cnt) {
>>> +                if (!nfsd_wait_for_delegreturn(rqstp, inode))
>>> +                    continue;
>>> +                return 0;
>>> +            }
>> I'd rather not retry here. Can you can say why a 30ms wait is not
>> sufficient for this case?
>
> on my VMs, it takes about 80ms for the the delegation return to complete.

Otherwise it takes about 180ms for the CB_RECALL and DELEGRETURN to complete
before the client can get a successful reply of the GETATTR.

-Dai

>
> -Dai
>
>>
>>
>>> +            return status;
>>> +        }
>>> +        break;
>>> +    }
>>> +    spin_unlock(&ctx->flc_lock);
>>> +    return 0;
>>> +}
>>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>>> index b83954fc57e3..4590b893dbc8 100644
>>> --- a/fs/nfsd/nfs4xdr.c
>>> +++ b/fs/nfsd/nfs4xdr.c
>>> @@ -2970,6 +2970,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, 
>>> struct svc_fh *fhp,
>>>           if (status)
>>>               goto out;
>>>       }
>>> +    if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
>>> +        status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry));
>>> +        if (status)
>>> +            goto out;
>>> +    }
>>>         err = vfs_getattr(&path, &stat,
>>>                 STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE,
>>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>>> index d49d3060ed4f..cbddcf484dba 100644
>>> --- a/fs/nfsd/state.h
>>> +++ b/fs/nfsd/state.h
>>> @@ -732,4 +732,7 @@ static inline bool try_to_expire_client(struct 
>>> nfs4_client *clp)
>>>       cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE);
>>>       return clp->cl_state == NFSD4_EXPIRABLE;
>>>   }
>>> +
>>> +extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
>>> +                struct inode *inode);
>>>   #endif   /* NFSD4_STATE_H */
>>> -- 
>>> 2.9.5
>>>

Chuck Lever May 26, 2023, 7:40 p.m. UTC | #4

On Fri, May 26, 2023 at 12:34:16PM -0700, dai.ngo@oracle.com wrote:
> 
> On 5/26/23 11:38 AM, Chuck Lever wrote:
> > On Fri, May 26, 2023 at 10:38:41AM -0700, Dai Ngo wrote:
> > > If the GETATTR request on a file that has write delegation in effect
> > > and the request attributes include the change info and size attribute
> > > then the write delegation is recalled. The server waits a maximum of
> > > 90ms for the delegation to be returned before replying NFS4ERR_DELAY
> > > for the GETATTR.
> > > 
> > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> > > ---
> > >   fs/nfsd/nfs4state.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
> > >   fs/nfsd/nfs4xdr.c   |  5 +++++
> > >   fs/nfsd/state.h     |  3 +++
> > >   3 files changed, 56 insertions(+)
> > > 
> > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > index b90b74a5e66e..9f551dbf50d6 100644
> > > --- a/fs/nfsd/nfs4state.c
> > > +++ b/fs/nfsd/nfs4state.c
> > > @@ -8353,3 +8353,51 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
> > >   {
> > >   	get_stateid(cstate, &u->write.wr_stateid);
> > >   }
> > > +
> > > +/**
> > > + * nfsd4_deleg_getattr_conflict - Trigger recall if GETATTR causes conflict
> > > + * @rqstp: RPC transaction context
> > > + * @inode: file to be checked for a conflict
> > > + *
> > Let's have this comment explain why this is necessary. At the least,
> > it needs to cite RFC 8881 Section 18.7.4, which REQUIREs a conflicting
> > write delegation to be gone before the server can respond to a
> > change/size GETATTR request.
> 
> ok, will add the comment.
> 
> > 
> > 
> > > + * Returns 0 if there is no conflict; otherwise an nfs_stat
> > > + * code is returned.
> > > + */
> > > +__be32
> > > +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
> > > +{
> > > +	__be32 status;
> > > +	int cnt;
> > > +	struct file_lock_context *ctx;
> > > +	struct file_lock *fl;
> > > +	struct nfs4_delegation *dp;
> > > +
> > > +	ctx = locks_inode_context(inode);
> > > +	if (!ctx)
> > > +		return 0;
> > > +	spin_lock(&ctx->flc_lock);
> > > +	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
> > > +		if (fl->fl_flags == FL_LAYOUT ||
> > > +				fl->fl_lmops != &nfsd_lease_mng_ops)
> > > +			continue;
> > > +		if (fl->fl_type == F_WRLCK) {
> > > +			dp = fl->fl_owner;
> > > +			if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) {
> > > +				spin_unlock(&ctx->flc_lock);
> > > +				return 0;
> > > +			}
> > > +			spin_unlock(&ctx->flc_lock);
> > > +			status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
> > > +			if (status != nfserr_jukebox)
> > > +				return status;
> > > +			for (cnt = 3; cnt > 0; --cnt) {
> > > +				if (!nfsd_wait_for_delegreturn(rqstp, inode))
> > > +					continue;
> > > +				return 0;
> > > +			}
> > I'd rather not retry here. Can you can say why a 30ms wait is not
> > sufficient for this case?
> 
> on my VMs, it takes about 80ms for the the delegation return to complete.

I'd rather not tune for tiny VM guests. How long does it take for a
native client to handle CB_RECALL and return the delegation? It
shouldn't take longer to do so than it would for the other cases the
server already handles in under 30ms.

Even 30ms is a long time to hold up an nfsd thread, IMO.


> > > +			return status;
> > > +		}
> > > +		break;
> > > +	}
> > > +	spin_unlock(&ctx->flc_lock);
> > > +	return 0;
> > > +}
> > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > > index b83954fc57e3..4590b893dbc8 100644
> > > --- a/fs/nfsd/nfs4xdr.c
> > > +++ b/fs/nfsd/nfs4xdr.c
> > > @@ -2970,6 +2970,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
> > >   		if (status)
> > >   			goto out;
> > >   	}
> > > +	if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
> > > +		status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry));
> > > +		if (status)
> > > +			goto out;
> > > +	}
> > >   	err = vfs_getattr(&path, &stat,
> > >   			  STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE,
> > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > index d49d3060ed4f..cbddcf484dba 100644
> > > --- a/fs/nfsd/state.h
> > > +++ b/fs/nfsd/state.h
> > > @@ -732,4 +732,7 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
> > >   	cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE);
> > >   	return clp->cl_state == NFSD4_EXPIRABLE;
> > >   }
> > > +
> > > +extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
> > > +				struct inode *inode);
> > >   #endif   /* NFSD4_STATE_H */
> > > -- 
> > > 2.9.5
> > >

Dai Ngo May 26, 2023, 8:54 p.m. UTC | #5

On 5/26/23 12:40 PM, Chuck Lever wrote:
> On Fri, May 26, 2023 at 12:34:16PM -0700, dai.ngo@oracle.com wrote:
>> On 5/26/23 11:38 AM, Chuck Lever wrote:
>>> On Fri, May 26, 2023 at 10:38:41AM -0700, Dai Ngo wrote:
>>>> If the GETATTR request on a file that has write delegation in effect
>>>> and the request attributes include the change info and size attribute
>>>> then the write delegation is recalled. The server waits a maximum of
>>>> 90ms for the delegation to be returned before replying NFS4ERR_DELAY
>>>> for the GETATTR.
>>>>
>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>>> ---
>>>>    fs/nfsd/nfs4state.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    fs/nfsd/nfs4xdr.c   |  5 +++++
>>>>    fs/nfsd/state.h     |  3 +++
>>>>    3 files changed, 56 insertions(+)
>>>>
>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>>> index b90b74a5e66e..9f551dbf50d6 100644
>>>> --- a/fs/nfsd/nfs4state.c
>>>> +++ b/fs/nfsd/nfs4state.c
>>>> @@ -8353,3 +8353,51 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
>>>>    {
>>>>    	get_stateid(cstate, &u->write.wr_stateid);
>>>>    }
>>>> +
>>>> +/**
>>>> + * nfsd4_deleg_getattr_conflict - Trigger recall if GETATTR causes conflict
>>>> + * @rqstp: RPC transaction context
>>>> + * @inode: file to be checked for a conflict
>>>> + *
>>> Let's have this comment explain why this is necessary. At the least,
>>> it needs to cite RFC 8881 Section 18.7.4, which REQUIREs a conflicting
>>> write delegation to be gone before the server can respond to a
>>> change/size GETATTR request.
>> ok, will add the comment.
>>
>>>
>>>> + * Returns 0 if there is no conflict; otherwise an nfs_stat
>>>> + * code is returned.
>>>> + */
>>>> +__be32
>>>> +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
>>>> +{
>>>> +	__be32 status;
>>>> +	int cnt;
>>>> +	struct file_lock_context *ctx;
>>>> +	struct file_lock *fl;
>>>> +	struct nfs4_delegation *dp;
>>>> +
>>>> +	ctx = locks_inode_context(inode);
>>>> +	if (!ctx)
>>>> +		return 0;
>>>> +	spin_lock(&ctx->flc_lock);
>>>> +	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
>>>> +		if (fl->fl_flags == FL_LAYOUT ||
>>>> +				fl->fl_lmops != &nfsd_lease_mng_ops)
>>>> +			continue;
>>>> +		if (fl->fl_type == F_WRLCK) {
>>>> +			dp = fl->fl_owner;
>>>> +			if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) {
>>>> +				spin_unlock(&ctx->flc_lock);
>>>> +				return 0;
>>>> +			}
>>>> +			spin_unlock(&ctx->flc_lock);
>>>> +			status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
>>>> +			if (status != nfserr_jukebox)
>>>> +				return status;
>>>> +			for (cnt = 3; cnt > 0; --cnt) {
>>>> +				if (!nfsd_wait_for_delegreturn(rqstp, inode))
>>>> +					continue;
>>>> +				return 0;
>>>> +			}
>>> I'd rather not retry here. Can you can say why a 30ms wait is not
>>> sufficient for this case?
>> on my VMs, it takes about 80ms for the the delegation return to complete.
> I'd rather not tune for tiny VM guests. How long does it take for a
> native client to handle CB_RECALL and return the delegation? It
> shouldn't take longer to do so than it would for the other cases the
> server already handles in under 30ms.
>
> Even 30ms is a long time to hold up an nfsd thread, IMO.

If the client takes less than 30ms to return the delegation then the
server will reply to the GETATTR right away, it does not wait for the
whole 90ms.

The 90ms is for the worst case scenario where the client/network is slow
or under load. Even if the server waits for the whole 90ms it's still
faster to reply to the GETATTR than sending CB_RECALL and wait for
DELEGRETURN before the server can reply to the GETATTR.

-Dai

>
>
>>>> +			return status;
>>>> +		}
>>>> +		break;
>>>> +	}
>>>> +	spin_unlock(&ctx->flc_lock);
>>>> +	return 0;
>>>> +}
>>>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>>>> index b83954fc57e3..4590b893dbc8 100644
>>>> --- a/fs/nfsd/nfs4xdr.c
>>>> +++ b/fs/nfsd/nfs4xdr.c
>>>> @@ -2970,6 +2970,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
>>>>    		if (status)
>>>>    			goto out;
>>>>    	}
>>>> +	if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
>>>> +		status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry));
>>>> +		if (status)
>>>> +			goto out;
>>>> +	}
>>>>    	err = vfs_getattr(&path, &stat,
>>>>    			  STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE,
>>>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>>>> index d49d3060ed4f..cbddcf484dba 100644
>>>> --- a/fs/nfsd/state.h
>>>> +++ b/fs/nfsd/state.h
>>>> @@ -732,4 +732,7 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
>>>>    	cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE);
>>>>    	return clp->cl_state == NFSD4_EXPIRABLE;
>>>>    }
>>>> +
>>>> +extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
>>>> +				struct inode *inode);
>>>>    #endif   /* NFSD4_STATE_H */
>>>> -- 
>>>> 2.9.5
>>>>

Chuck Lever May 26, 2023, 11:27 p.m. UTC | #6

On Fri, May 26, 2023 at 01:54:12PM -0700, dai.ngo@oracle.com wrote:
> 
> On 5/26/23 12:40 PM, Chuck Lever wrote:
> > On Fri, May 26, 2023 at 12:34:16PM -0700, dai.ngo@oracle.com wrote:
> > > On 5/26/23 11:38 AM, Chuck Lever wrote:
> > > > On Fri, May 26, 2023 at 10:38:41AM -0700, Dai Ngo wrote:
> > > > > If the GETATTR request on a file that has write delegation in effect
> > > > > and the request attributes include the change info and size attribute
> > > > > then the write delegation is recalled. The server waits a maximum of
> > > > > 90ms for the delegation to be returned before replying NFS4ERR_DELAY
> > > > > for the GETATTR.
> > > > > 
> > > > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> > > > > ---
> > > > >    fs/nfsd/nfs4state.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
> > > > >    fs/nfsd/nfs4xdr.c   |  5 +++++
> > > > >    fs/nfsd/state.h     |  3 +++
> > > > >    3 files changed, 56 insertions(+)
> > > > > 
> > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > > index b90b74a5e66e..9f551dbf50d6 100644
> > > > > --- a/fs/nfsd/nfs4state.c
> > > > > +++ b/fs/nfsd/nfs4state.c
> > > > > @@ -8353,3 +8353,51 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
> > > > >    {
> > > > >    	get_stateid(cstate, &u->write.wr_stateid);
> > > > >    }
> > > > > +
> > > > > +/**
> > > > > + * nfsd4_deleg_getattr_conflict - Trigger recall if GETATTR causes conflict
> > > > > + * @rqstp: RPC transaction context
> > > > > + * @inode: file to be checked for a conflict
> > > > > + *
> > > > Let's have this comment explain why this is necessary. At the least,
> > > > it needs to cite RFC 8881 Section 18.7.4, which REQUIREs a conflicting
> > > > write delegation to be gone before the server can respond to a
> > > > change/size GETATTR request.
> > > ok, will add the comment.
> > > 
> > > > 
> > > > > + * Returns 0 if there is no conflict; otherwise an nfs_stat
> > > > > + * code is returned.
> > > > > + */
> > > > > +__be32
> > > > > +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
> > > > > +{
> > > > > +	__be32 status;
> > > > > +	int cnt;
> > > > > +	struct file_lock_context *ctx;
> > > > > +	struct file_lock *fl;
> > > > > +	struct nfs4_delegation *dp;
> > > > > +
> > > > > +	ctx = locks_inode_context(inode);
> > > > > +	if (!ctx)
> > > > > +		return 0;
> > > > > +	spin_lock(&ctx->flc_lock);
> > > > > +	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
> > > > > +		if (fl->fl_flags == FL_LAYOUT ||
> > > > > +				fl->fl_lmops != &nfsd_lease_mng_ops)
> > > > > +			continue;
> > > > > +		if (fl->fl_type == F_WRLCK) {
> > > > > +			dp = fl->fl_owner;
> > > > > +			if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) {
> > > > > +				spin_unlock(&ctx->flc_lock);
> > > > > +				return 0;
> > > > > +			}
> > > > > +			spin_unlock(&ctx->flc_lock);
> > > > > +			status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
> > > > > +			if (status != nfserr_jukebox)
> > > > > +				return status;
> > > > > +			for (cnt = 3; cnt > 0; --cnt) {
> > > > > +				if (!nfsd_wait_for_delegreturn(rqstp, inode))
> > > > > +					continue;
> > > > > +				return 0;
> > > > > +			}
> > > > I'd rather not retry here. Can you can say why a 30ms wait is not
> > > > sufficient for this case?
> > > on my VMs, it takes about 80ms for the the delegation return to complete.
> > I'd rather not tune for tiny VM guests. How long does it take for a
> > native client to handle CB_RECALL and return the delegation? It
> > shouldn't take longer to do so than it would for the other cases the
> > server already handles in under 30ms.
> > 
> > Even 30ms is a long time to hold up an nfsd thread, IMO.
> 
> If the client takes less than 30ms to return the delegation then the
> server will reply to the GETATTR right away, it does not wait for the
> whole 90ms.
> 
> The 90ms is for the worst case scenario where the client/network is slow
> or under load. Even if the server waits for the whole 90ms it's still
> faster to reply to the GETATTR than sending CB_RECALL and wait for
> DELEGRETURN before the server can reply to the GETATTR.

The reason for the short timeout is we can't tie up nfsd threads for
a long time; that can amount to denial of service. I'm not concerned
about a single slow client, but enough clients that don't respond
quickly to CB_RECALL can prevent the server from making forward
progress, even for a short period, and that will be noticeable.

In Linux, generally we optimize for the fastest case, not the slow
cases like this one. Make the fast clients as fast as possible; do
not penalize everyone for the slow cases.

So, please make this function call nfsd_wait_for_delegreturn() only
once, and leave NFSD_DELEGRETURN_TIMEOUT at 30ms.


> > > > > +			return status;
> > > > > +		}
> > > > > +		break;
> > > > > +	}
> > > > > +	spin_unlock(&ctx->flc_lock);
> > > > > +	return 0;
> > > > > +}
> > > > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > > > > index b83954fc57e3..4590b893dbc8 100644
> > > > > --- a/fs/nfsd/nfs4xdr.c
> > > > > +++ b/fs/nfsd/nfs4xdr.c
> > > > > @@ -2970,6 +2970,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
> > > > >    		if (status)
> > > > >    			goto out;
> > > > >    	}
> > > > > +	if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
> > > > > +		status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry));
> > > > > +		if (status)
> > > > > +			goto out;
> > > > > +	}
> > > > >    	err = vfs_getattr(&path, &stat,
> > > > >    			  STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE,
> > > > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > > > > index d49d3060ed4f..cbddcf484dba 100644
> > > > > --- a/fs/nfsd/state.h
> > > > > +++ b/fs/nfsd/state.h
> > > > > @@ -732,4 +732,7 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
> > > > >    	cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE);
> > > > >    	return clp->cl_state == NFSD4_EXPIRABLE;
> > > > >    }
> > > > > +
> > > > > +extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
> > > > > +				struct inode *inode);
> > > > >    #endif   /* NFSD4_STATE_H */
> > > > > -- 
> > > > > 2.9.5
> > > > >

[1/2] NFSD: handle GETATTR conflict with write delegation

Commit Message

Comments

Patch