
[v4,10/10] nfsd: give block_delegation and delegation_blocked its own spinlock

Message ID 1405696416-32585-11-git-send-email-jlayton@primarydata.com (mailing list archive)
State New, archived

Commit Message

Jeff Layton July 18, 2014, 3:13 p.m. UTC
The state lock can be fairly heavily contended, and there's no reason
that nfs4_file lookups and delegation_blocked should be mutually
exclusive.  Let's give the new block_delegation code its own spinlock.
It does mean that we'll need to take a different lock in the delegation
break code, but that's not generally as critical to performance.

Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Jeff Layton <jlayton@primarydata.com>
---
 fs/nfsd/nfs4state.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

Comments

Christoph Hellwig July 18, 2014, 5:24 p.m. UTC | #1
Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>
NeilBrown July 21, 2014, 7:02 a.m. UTC | #2
On Fri, 18 Jul 2014 11:13:36 -0400 Jeff Layton <jlayton@primarydata.com>
wrote:

> The state lock can be fairly heavily contended, and there's no reason
> that nfs4_file lookups and delegation_blocked should be mutually
> exclusive.  Let's give the new block_delegation code its own spinlock.
> It does mean that we'll need to take a different lock in the delegation
> break code, but that's not generally as critical to performance.
> 
> Cc: Neil Brown <neilb@suse.de>
> Signed-off-by: Jeff Layton <jlayton@primarydata.com>

Makes sense, thanks.
However.....


> ---
>  fs/nfsd/nfs4state.c | 25 +++++++++++++------------
>  1 file changed, 13 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index a2c6c85adfc7..952def00363b 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -506,10 +506,11 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
>   * Each filter is 256 bits.  We hash the filehandle to 32bit and use the
>   * low 3 bytes as hash-table indices.
>   *
> - * 'state_lock', which is always held when block_delegations() is called,
> - * is used to manage concurrent access.  Testing does not need the lock
> - * except when swapping the two filters.
> + * 'blocked_delegations_lock', which is always held when block_delegations()
> + * is called, is used to manage concurrent access.  Testing does not need the
> + * lock except when swapping the two filters.

...this comment is wrong.  blocked_delegations_lock is *not* held when
block_delegations() is called; it is taken when needed (almost) by
block_delegations.

>   */
> +static DEFINE_SPINLOCK(blocked_delegations_lock);
>  static struct bloom_pair {
>  	int	entries, old_entries;
>  	time_t	swap_time;
> @@ -525,7 +526,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
>  	if (bd->entries == 0)
>  		return 0;
>  	if (seconds_since_boot() - bd->swap_time > 30) {
> -		spin_lock(&state_lock);
> +		spin_lock(&blocked_delegations_lock);
>  		if (seconds_since_boot() - bd->swap_time > 30) {
>  			bd->entries -= bd->old_entries;
>  			bd->old_entries = bd->entries;
> @@ -534,7 +535,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
>  			bd->new = 1-bd->new;
>  			bd->swap_time = seconds_since_boot();
>  		}
> -		spin_unlock(&state_lock);
> +		spin_unlock(&blocked_delegations_lock);
>  	}
>  	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
>  	if (test_bit(hash&255, bd->set[0]) &&
> @@ -555,16 +556,16 @@ static void block_delegations(struct knfsd_fh *fh)
>  	u32 hash;
>  	struct bloom_pair *bd = &blocked_delegations;
>  
> -	lockdep_assert_held(&state_lock);
> -
>  	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
>  
>  	__set_bit(hash&255, bd->set[bd->new]);
>  	__set_bit((hash>>8)&255, bd->set[bd->new]);
>  	__set_bit((hash>>16)&255, bd->set[bd->new]);
> +	spin_lock(&blocked_delegations_lock);

__set_bit isn't atomic.  The spin_lock should be taken *before* these
__set_bit() calls.

Otherwise, looks fine.

Thanks,
NeilBrown


>  	if (bd->entries == 0)
>  		bd->swap_time = seconds_since_boot();
>  	bd->entries += 1;
> +	spin_unlock(&blocked_delegations_lock);
>  }
>  
>  static struct nfs4_delegation *
> @@ -3097,16 +3098,16 @@ void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp)
>  	struct nfs4_client *clp = dp->dl_stid.sc_client;
>  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
>  
> -	/*
> -	 * We can't do this in nfsd_break_deleg_cb because it is
> -	 * already holding inode->i_lock
> -	 */
> -	spin_lock(&state_lock);
>  	block_delegations(&dp->dl_fh);
> +
>  	/*
> +	 * We can't do this in nfsd_break_deleg_cb because it is
> +	 * already holding inode->i_lock.
> +	 *
>  	 * If the dl_time != 0, then we know that it has already been
>  	 * queued for a lease break. Don't queue it again.
>  	 */
> +	spin_lock(&state_lock);
>  	if (dp->dl_time == 0) {
>  		dp->dl_time = get_seconds();
>  		list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
Jeff Layton July 21, 2014, 11:44 a.m. UTC | #3
On Mon, 21 Jul 2014 17:02:54 +1000
NeilBrown <neilb@suse.de> wrote:

> On Fri, 18 Jul 2014 11:13:36 -0400 Jeff Layton <jlayton@primarydata.com>
> wrote:
> 
> > The state lock can be fairly heavily contended, and there's no reason
> > that nfs4_file lookups and delegation_blocked should be mutually
> > exclusive.  Let's give the new block_delegation code its own spinlock.
> > It does mean that we'll need to take a different lock in the delegation
> > break code, but that's not generally as critical to performance.
> > 
> > Cc: Neil Brown <neilb@suse.de>
> > Signed-off-by: Jeff Layton <jlayton@primarydata.com>
> 
> Makes sense, thanks.
> However.....
> 
> 
> > ---
> >  fs/nfsd/nfs4state.c | 25 +++++++++++++------------
> >  1 file changed, 13 insertions(+), 12 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index a2c6c85adfc7..952def00363b 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -506,10 +506,11 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
> >   * Each filter is 256 bits.  We hash the filehandle to 32bit and use the
> >   * low 3 bytes as hash-table indices.
> >   *
> > - * 'state_lock', which is always held when block_delegations() is called,
> > - * is used to manage concurrent access.  Testing does not need the lock
> > - * except when swapping the two filters.
> > + * 'blocked_delegations_lock', which is always held when block_delegations()
> > + * is called, is used to manage concurrent access.  Testing does not need the
> > + * lock except when swapping the two filters.
> 
> ...this comment is wrong.  blocked_delegations_lock is *not* held when
> block_delegations() is called; it is taken when needed (almost) by
> block_delegations.
> 

Thanks, fixed.

> >   */
> > +static DEFINE_SPINLOCK(blocked_delegations_lock);
> >  static struct bloom_pair {
> >  	int	entries, old_entries;
> >  	time_t	swap_time;
> > @@ -525,7 +526,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
> >  	if (bd->entries == 0)
> >  		return 0;
> >  	if (seconds_since_boot() - bd->swap_time > 30) {
> > -		spin_lock(&state_lock);
> > +		spin_lock(&blocked_delegations_lock);
> >  		if (seconds_since_boot() - bd->swap_time > 30) {
> >  			bd->entries -= bd->old_entries;
> >  			bd->old_entries = bd->entries;
> > @@ -534,7 +535,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
> >  			bd->new = 1-bd->new;
> >  			bd->swap_time = seconds_since_boot();
> >  		}
> > -		spin_unlock(&state_lock);
> > +		spin_unlock(&blocked_delegations_lock);
> >  	}
> >  	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
> >  	if (test_bit(hash&255, bd->set[0]) &&
> > @@ -555,16 +556,16 @@ static void block_delegations(struct knfsd_fh *fh)
> >  	u32 hash;
> >  	struct bloom_pair *bd = &blocked_delegations;
> >  
> > -	lockdep_assert_held(&state_lock);
> > -
> >  	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
> >  
> >  	__set_bit(hash&255, bd->set[bd->new]);
> >  	__set_bit((hash>>8)&255, bd->set[bd->new]);
> >  	__set_bit((hash>>16)&255, bd->set[bd->new]);
> > +	spin_lock(&blocked_delegations_lock);
> 
> __set_bit isn't atomic.  The spin_lock should be taken *before* these
> __set_bit() calls.
> 
> Otherwise, looks fine.
> 
> Thanks,
> NeilBrown
> 
> 

Ok. I guess the worry is that we could end up setting bits in the
middle of swapping the two fields? Makes sense -- fixed in my repo.
I'll send out the updated set later today (it also includes a few nits
that HCH pointed out last week).

As a side note...I wonder how much we'll get in the way of false
positives with this scheme?
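
(Back of the envelope: treating each 256-bit half of the pair as a
standard bloom filter with 3 hash functions, the per-filter false
positive rate works out to roughly (1 - e^(-3n/256))^3 for n blocked
entries -- about 1% at n=20, 9% at n=50 and 33% at n=100 -- and since a
hit in either half counts as blocked, the effective rate can be up to
about double that. Probably fine as long as the set of recently
recalled filehandles stays small.)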

Given that we'll always have (or will have had) a nfs4_file
corresponding to this inode, perhaps we'd be better off doing something
like storing (and maybe hashing on) the filehandle in the nfs4_file,
and just ensuring that we hold on to it for 30s or so after the last
put?

Not something I'm looking at doing today, but it might be worth
considering for a later delegations rework.

> >  	if (bd->entries == 0)
> >  		bd->swap_time = seconds_since_boot();
> >  	bd->entries += 1;
> > +	spin_unlock(&blocked_delegations_lock);
> >  }
> >  
> >  static struct nfs4_delegation *
> > @@ -3097,16 +3098,16 @@ void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp)
> >  	struct nfs4_client *clp = dp->dl_stid.sc_client;
> >  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> >  
> > -	/*
> > -	 * We can't do this in nfsd_break_deleg_cb because it is
> > -	 * already holding inode->i_lock
> > -	 */
> > -	spin_lock(&state_lock);
> >  	block_delegations(&dp->dl_fh);
> > +
> >  	/*
> > +	 * We can't do this in nfsd_break_deleg_cb because it is
> > +	 * already holding inode->i_lock.
> > +	 *
> >  	 * If the dl_time != 0, then we know that it has already been
> >  	 * queued for a lease break. Don't queue it again.
> >  	 */
> > +	spin_lock(&state_lock);
> >  	if (dp->dl_time == 0) {
> >  		dp->dl_time = get_seconds();
> >  		list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
>
J. Bruce Fields July 21, 2014, 1:11 p.m. UTC | #4
On Mon, Jul 21, 2014 at 07:44:12AM -0400, Jeff Layton wrote:
> On Mon, 21 Jul 2014 17:02:54 +1000
> NeilBrown <neilb@suse.de> wrote:
> 
> > On Fri, 18 Jul 2014 11:13:36 -0400 Jeff Layton <jlayton@primarydata.com>
> > wrote:
> > 
> > > The state lock can be fairly heavily contended, and there's no reason
> > > that nfs4_file lookups and delegation_blocked should be mutually
> > > exclusive.  Let's give the new block_delegation code its own spinlock.
> > > It does mean that we'll need to take a different lock in the delegation
> > > break code, but that's not generally as critical to performance.
> > > 
> > > Cc: Neil Brown <neilb@suse.de>
> > > Signed-off-by: Jeff Layton <jlayton@primarydata.com>
> > 
> > Makes sense, thanks.
> > However.....
> > 
> > 
> > > ---
> > >  fs/nfsd/nfs4state.c | 25 +++++++++++++------------
> > >  1 file changed, 13 insertions(+), 12 deletions(-)
> > > 
> > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > index a2c6c85adfc7..952def00363b 100644
> > > --- a/fs/nfsd/nfs4state.c
> > > +++ b/fs/nfsd/nfs4state.c
> > > @@ -506,10 +506,11 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
> > >   * Each filter is 256 bits.  We hash the filehandle to 32bit and use the
> > >   * low 3 bytes as hash-table indices.
> > >   *
> > > - * 'state_lock', which is always held when block_delegations() is called,
> > > - * is used to manage concurrent access.  Testing does not need the lock
> > > - * except when swapping the two filters.
> > > + * 'blocked_delegations_lock', which is always held when block_delegations()
> > > + * is called, is used to manage concurrent access.  Testing does not need the
> > > + * lock except when swapping the two filters.
> > 
> > ...this comment is wrong.  blocked_delegations_lock is *not* held when
> > block_delegations() is called; it is taken when needed (almost) by
> > block_delegations.
> > 
> 
> Thanks, fixed.
> 
> > >   */
> > > +static DEFINE_SPINLOCK(blocked_delegations_lock);
> > >  static struct bloom_pair {
> > >  	int	entries, old_entries;
> > >  	time_t	swap_time;
> > > @@ -525,7 +526,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
> > >  	if (bd->entries == 0)
> > >  		return 0;
> > >  	if (seconds_since_boot() - bd->swap_time > 30) {
> > > -		spin_lock(&state_lock);
> > > +		spin_lock(&blocked_delegations_lock);
> > >  		if (seconds_since_boot() - bd->swap_time > 30) {
> > >  			bd->entries -= bd->old_entries;
> > >  			bd->old_entries = bd->entries;
> > > @@ -534,7 +535,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
> > >  			bd->new = 1-bd->new;
> > >  			bd->swap_time = seconds_since_boot();
> > >  		}
> > > -		spin_unlock(&state_lock);
> > > +		spin_unlock(&blocked_delegations_lock);
> > >  	}
> > >  	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
> > >  	if (test_bit(hash&255, bd->set[0]) &&
> > > @@ -555,16 +556,16 @@ static void block_delegations(struct knfsd_fh *fh)
> > >  	u32 hash;
> > >  	struct bloom_pair *bd = &blocked_delegations;
> > >  
> > > -	lockdep_assert_held(&state_lock);
> > > -
> > >  	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
> > >  
> > >  	__set_bit(hash&255, bd->set[bd->new]);
> > >  	__set_bit((hash>>8)&255, bd->set[bd->new]);
> > >  	__set_bit((hash>>16)&255, bd->set[bd->new]);
> > > +	spin_lock(&blocked_delegations_lock);
> > 
> > __set_bit isn't atomic.  The spin_lock should be taken *before* these
> > __set_bit() calls.
> > 
> > Otherwise, looks fine.
> > 
> > Thanks,
> > NeilBrown
> > 
> > 
> 
> Ok. I guess the worry is that we could end up setting bits in the
> middle of swapping the two fields? Makes sense -- fixed in my repo.
> I'll send out the updated set later today (it also includes a few nits
> that HCH pointed out last week).
> 
> As a side note...I wonder how much we'll get in the way of false
> positives with this scheme?
> 
> Given that we'll always have (or will have had) a nfs4_file
> corresponding to this inode, perhaps we'd be better off doing something
> like storing (and maybe hashing on) the filehandle in the nfs4_file,
> and just ensuring that we hold on to it for 30s or so after the last
> put?

You don't want to hold a reference to the inode unnecessarily.
(Consider for example the case of a deleted-but-still-opened file, in
which case people can notice if a large file hangs around eating up
space for an extra 30 seconds.)  So I suppose you'd put fi_inode on last
close and just make sure the rest of the code is prepared to deal with
nfs4_files without struct inodes.  That might make sense to do.

Occasional false positives aren't necessarily a big deal, so the current
approach seems a reasonable compromise for now.

--b.

> 
> Not something I'm looking at doing today, but it might be worth
> considering for a later delegations rework.
> 
> > >  	if (bd->entries == 0)
> > >  		bd->swap_time = seconds_since_boot();
> > >  	bd->entries += 1;
> > > +	spin_unlock(&blocked_delegations_lock);
> > >  }
> > >  
> > >  static struct nfs4_delegation *
> > > @@ -3097,16 +3098,16 @@ void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp)
> > >  	struct nfs4_client *clp = dp->dl_stid.sc_client;
> > >  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> > >  
> > > -	/*
> > > -	 * We can't do this in nfsd_break_deleg_cb because it is
> > > -	 * already holding inode->i_lock
> > > -	 */
> > > -	spin_lock(&state_lock);
> > >  	block_delegations(&dp->dl_fh);
> > > +
> > >  	/*
> > > +	 * We can't do this in nfsd_break_deleg_cb because it is
> > > +	 * already holding inode->i_lock.
> > > +	 *
> > >  	 * If the dl_time != 0, then we know that it has already been
> > >  	 * queued for a lease break. Don't queue it again.
> > >  	 */
> > > +	spin_lock(&state_lock);
> > >  	if (dp->dl_time == 0) {
> > >  		dp->dl_time = get_seconds();
> > >  		list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
> > 
> 
> 
> -- 
> Jeff Layton <jlayton@primarydata.com>


Jeff Layton July 21, 2014, 1:23 p.m. UTC | #5
On Mon, 21 Jul 2014 09:11:27 -0400
"J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Mon, Jul 21, 2014 at 07:44:12AM -0400, Jeff Layton wrote:
> > On Mon, 21 Jul 2014 17:02:54 +1000
> > NeilBrown <neilb@suse.de> wrote:
> > 
> > > On Fri, 18 Jul 2014 11:13:36 -0400 Jeff Layton <jlayton@primarydata.com>
> > > wrote:
> > > 
> > > > The state lock can be fairly heavily contended, and there's no reason
> > > > that nfs4_file lookups and delegation_blocked should be mutually
> > > > exclusive.  Let's give the new block_delegation code its own spinlock.
> > > > It does mean that we'll need to take a different lock in the delegation
> > > > break code, but that's not generally as critical to performance.
> > > > 
> > > > Cc: Neil Brown <neilb@suse.de>
> > > > Signed-off-by: Jeff Layton <jlayton@primarydata.com>
> > > 
> > > Makes sense, thanks.
> > > However.....
> > > 
> > > 
> > > > ---
> > > >  fs/nfsd/nfs4state.c | 25 +++++++++++++------------
> > > >  1 file changed, 13 insertions(+), 12 deletions(-)
> > > > 
> > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > index a2c6c85adfc7..952def00363b 100644
> > > > --- a/fs/nfsd/nfs4state.c
> > > > +++ b/fs/nfsd/nfs4state.c
> > > > @@ -506,10 +506,11 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
> > > >   * Each filter is 256 bits.  We hash the filehandle to 32bit and use the
> > > >   * low 3 bytes as hash-table indices.
> > > >   *
> > > > - * 'state_lock', which is always held when block_delegations() is called,
> > > > - * is used to manage concurrent access.  Testing does not need the lock
> > > > - * except when swapping the two filters.
> > > > + * 'blocked_delegations_lock', which is always held when block_delegations()
> > > > + * is called, is used to manage concurrent access.  Testing does not need the
> > > > + * lock except when swapping the two filters.
> > > 
> > > ...this comment is wrong.  blocked_delegations_lock is *not* held when
> > > block_delegations() is called; it is taken when needed (almost) by
> > > block_delegations.
> > > 
> > 
> > Thanks, fixed.
> > 
> > > >   */
> > > > +static DEFINE_SPINLOCK(blocked_delegations_lock);
> > > >  static struct bloom_pair {
> > > >  	int	entries, old_entries;
> > > >  	time_t	swap_time;
> > > > @@ -525,7 +526,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
> > > >  	if (bd->entries == 0)
> > > >  		return 0;
> > > >  	if (seconds_since_boot() - bd->swap_time > 30) {
> > > > -		spin_lock(&state_lock);
> > > > +		spin_lock(&blocked_delegations_lock);
> > > >  		if (seconds_since_boot() - bd->swap_time > 30) {
> > > >  			bd->entries -= bd->old_entries;
> > > >  			bd->old_entries = bd->entries;
> > > > @@ -534,7 +535,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
> > > >  			bd->new = 1-bd->new;
> > > >  			bd->swap_time = seconds_since_boot();
> > > >  		}
> > > > -		spin_unlock(&state_lock);
> > > > +		spin_unlock(&blocked_delegations_lock);
> > > >  	}
> > > >  	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
> > > >  	if (test_bit(hash&255, bd->set[0]) &&
> > > > @@ -555,16 +556,16 @@ static void block_delegations(struct knfsd_fh *fh)
> > > >  	u32 hash;
> > > >  	struct bloom_pair *bd = &blocked_delegations;
> > > >  
> > > > -	lockdep_assert_held(&state_lock);
> > > > -
> > > >  	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
> > > >  
> > > >  	__set_bit(hash&255, bd->set[bd->new]);
> > > >  	__set_bit((hash>>8)&255, bd->set[bd->new]);
> > > >  	__set_bit((hash>>16)&255, bd->set[bd->new]);
> > > > +	spin_lock(&blocked_delegations_lock);
> > > 
> > > __set_bit isn't atomic.  The spin_lock should be taken *before* these
> > > __set_bit() calls.
> > > 
> > > Otherwise, looks fine.
> > > 
> > > Thanks,
> > > NeilBrown
> > > 
> > > 
> > 
> > Ok. I guess the worry is that we could end up setting bits in the
> > middle of swapping the two fields? Makes sense -- fixed in my repo.
> > I'll send out the updated set later today (it also includes a few nits
> > that HCH pointed out last week).
> > 
> > As a side note...I wonder how much we'll get in the way of false
> > positives with this scheme?
> > 
> > Given that we'll always have (or will have had) a nfs4_file
> > corresponding to this inode, perhaps we'd be better off doing something
> > like storing (and maybe hashing on) the filehandle in the nfs4_file,
> > and just ensuring that we hold on to it for 30s or so after the last
> > put?
> 
> You don't want to hold a reference to the inode unnecessarily.
> (Consider for example the case of a deleted-but-still-opened file, in
> which case people can notice if a large file hangs around eating up
> space for an extra 30 seconds.)  So I suppose you'd put fi_inode on last
> close and just make sure the rest of the code is prepared to deal with
> nfs4_files without struct inodes.  That might make sense to do.
> 

Yeah, that's what I was thinking. Change the code to hash the nfs4_file
based on filehandle instead of inode (which may make sense anyway), and
then just keep it around for a little while to handle delegation checks
without pinning down any vfs objects. We could institute some sort of
LRU collection of unused nfs4_files too to ensure the cache doesn't
grow too large.
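
Very roughly, the sort of shape I have in mind (untested sketch, and
fi_unused_since is an invented field) would be to stop unhashing the
nfs4_file when the last reference goes away, and instead let a
laundromat-style job reap stale entries:

	static void put_nfs4_file(struct nfs4_file *fp)
	{
		if (!atomic_dec_and_lock(&fp->fi_ref, &state_lock))
			return;
		/* keep the entry hashed (by filehandle) for delegation
		 * checks, but stop pinning the vfs objects */
		iput(fp->fi_inode);
		fp->fi_inode = NULL;
		fp->fi_unused_since = get_seconds();
		spin_unlock(&state_lock);
	}

...with the periodic job pruning anything whose fi_unused_since is more
than ~30s old.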

> Occasional false positives aren't necessarily a big deal, so the current
> approach seems a reasonable compromise for now.
> 

Right, it may be no big deal at all, but the question is -- "how often
do we hit false positives here?" I imagine it depends on workload to
some degree.

Is there some way we could sanity check the hit/miss rate without
needing to do too much tracking?

Anyway...it's more food for thought for later work in this area...

> > 
> > Not something I'm looking at doing today, but it might be worth
> > considering for a later delegations rework.
> > 
> > > >  	if (bd->entries == 0)
> > > >  		bd->swap_time = seconds_since_boot();
> > > >  	bd->entries += 1;
> > > > +	spin_unlock(&blocked_delegations_lock);
> > > >  }
> > > >  
> > > >  static struct nfs4_delegation *
> > > > @@ -3097,16 +3098,16 @@ void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp)
> > > >  	struct nfs4_client *clp = dp->dl_stid.sc_client;
> > > >  	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> > > >  
> > > > -	/*
> > > > -	 * We can't do this in nfsd_break_deleg_cb because it is
> > > > -	 * already holding inode->i_lock
> > > > -	 */
> > > > -	spin_lock(&state_lock);
> > > >  	block_delegations(&dp->dl_fh);
> > > > +
> > > >  	/*
> > > > +	 * We can't do this in nfsd_break_deleg_cb because it is
> > > > +	 * already holding inode->i_lock.
> > > > +	 *
> > > >  	 * If the dl_time != 0, then we know that it has already been
> > > >  	 * queued for a lease break. Don't queue it again.
> > > >  	 */
> > > > +	spin_lock(&state_lock);
> > > >  	if (dp->dl_time == 0) {
> > > >  		dp->dl_time = get_seconds();
> > > >  		list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
> > > 
> > 
> > 
> > -- 
> > Jeff Layton <jlayton@primarydata.com>
> 
>
NeilBrown July 21, 2014, 8:40 p.m. UTC | #6
On Mon, 21 Jul 2014 07:44:12 -0400 Jeff Layton <jeff.layton@primarydata.com>
wrote:

> On Mon, 21 Jul 2014 17:02:54 +1000
> NeilBrown <neilb@suse.de> wrote:

> > >  	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
> > >  
> > >  	__set_bit(hash&255, bd->set[bd->new]);
> > >  	__set_bit((hash>>8)&255, bd->set[bd->new]);
> > >  	__set_bit((hash>>16)&255, bd->set[bd->new]);
> > > +	spin_lock(&blocked_delegations_lock);
> > 
> > __set_bit isn't atomic.  The spin_lock should be taken *before* these
> > __set_bit() calls.
> > 
> > Otherwise, looks fine.
> > 
> > Thanks,
> > NeilBrown
> > 
> > 
> 
> Ok. I guess the worry is that we could end up setting bits in the
> middle of swapping the two fields? Makes sense -- fixed in my repo.

It is more subtle than that.
__set_bit() will:
  read a value from memory to a register
  set a bit in the register
  write the register back out to memory

If two threads both run __set_bit on the same word of memory at the same
time, one of the updates can get lost.
set_bit() (no underscore) performs an atomic RMW to avoid this, but is more
expensive.
spin_lock() obviously ensures the required exclusion and as we are going to
take the lock anyway we may as well take it before setting bits so we can use
the non-atomic (cheaper) __set_bit function.
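
With that fixed, the tail of block_delegations() would look something
like this (sketch, whitespace approximate):

	spin_lock(&blocked_delegations_lock);
	__set_bit(hash&255, bd->set[bd->new]);
	__set_bit((hash>>8)&255, bd->set[bd->new]);
	__set_bit((hash>>16)&255, bd->set[bd->new]);
	if (bd->entries == 0)
		bd->swap_time = seconds_since_boot();
	bd->entries += 1;
	spin_unlock(&blocked_delegations_lock);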

> I'll send out the updated set later today (it also includes a few nits
> that HCH pointed out last week).
> 
> As a side note...I wonder how much we'll get in the way of false
> positives with this scheme?

If a future version of NFSv4 could allow delegations to be granted while a
file is open (oh, it seems you are the only client using this file at the
moment, you can treat this "open" as a delegation if you like) a few false
positives would be a complete non-issue.  As it is, I think we just have to
hope.

Thanks,
NeilBrown
J. Bruce Fields July 21, 2014, 9:17 p.m. UTC | #7
On Tue, Jul 22, 2014 at 06:40:49AM +1000, NeilBrown wrote:
> On Mon, 21 Jul 2014 07:44:12 -0400 Jeff Layton <jeff.layton@primarydata.com>
> wrote:
> 
> > On Mon, 21 Jul 2014 17:02:54 +1000
> > NeilBrown <neilb@suse.de> wrote:
> 
> > > >  	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
> > > >  
> > > >  	__set_bit(hash&255, bd->set[bd->new]);
> > > >  	__set_bit((hash>>8)&255, bd->set[bd->new]);
> > > >  	__set_bit((hash>>16)&255, bd->set[bd->new]);
> > > > +	spin_lock(&blocked_delegations_lock);
> > > 
> > > __set_bit isn't atomic.  The spin_lock should be taken *before* these
> > > __set_bit() calls.
> > > 
> > > Otherwise, looks fine.
> > > 
> > > Thanks,
> > > NeilBrown
> > > 
> > > 
> > 
> > Ok. I guess the worry is that we could end up setting bits in the
> > middle of swapping the two fields? Makes sense -- fixed in my repo.
> 
> It is more subtle than that.
> __set_bit() will:
>   read a value from memory to a register
>   set a bit in the register
>   write the register back out to memory
> 
> If two threads both run __set_bit on the same word of memory at the same
> time, one of the updates can get lost.
> set_bit() (no underscore) performs an atomic RMW to avoid this, but is more
> expensive.
> spin_lock() obviously ensures the required exclusion and as we are going to
> take the lock anyway we may as well take it before setting bits so we can use
> the non-atomic (cheaper) __set_bit function.
> 
> > I'll send out the updated set later today (it also includes a few nits
> > that HCH pointed out last week).
> > 
> > As a side note...I wonder how much we'll get in the way of false
> > positives with this scheme?
> 
> If a future version of NFSv4 could allow delegations to be granted while a
> file is open (oh, it seems you are the only client using this file at the
> moment, you can treat this "open" as a delegation if you like) a few false
> positives would be a complete non-issue.

For what it's worth, I think 4.1 provides what you're asking for here;
see

	http://tools.ietf.org/html/rfc5661#section-20.7

and the discussion of the various WANT_ flags in

	http://tools.ietf.org/html/rfc5661#section-18.16.3

As far as I know none of that is implemented yet.

--b.
NeilBrown July 21, 2014, 10:50 p.m. UTC | #8
On Mon, 21 Jul 2014 17:17:57 -0400 "J. Bruce Fields" <bfields@fieldses.org>
wrote:

> On Tue, Jul 22, 2014 at 06:40:49AM +1000, NeilBrown wrote:
> > On Mon, 21 Jul 2014 07:44:12 -0400 Jeff Layton <jeff.layton@primarydata.com>
> > wrote:
> > 
> > > On Mon, 21 Jul 2014 17:02:54 +1000
> > > NeilBrown <neilb@suse.de> wrote:
> > 
> > > > >  	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
> > > > >  
> > > > >  	__set_bit(hash&255, bd->set[bd->new]);
> > > > >  	__set_bit((hash>>8)&255, bd->set[bd->new]);
> > > > >  	__set_bit((hash>>16)&255, bd->set[bd->new]);
> > > > > +	spin_lock(&blocked_delegations_lock);
> > > > 
> > > > __set_bit isn't atomic.  The spin_lock should be taken *before* these
> > > > __set_bit() calls.
> > > > 
> > > > Otherwise, looks fine.
> > > > 
> > > > Thanks,
> > > > NeilBrown
> > > > 
> > > > 
> > > 
> > > Ok. I guess the worry is that we could end up setting bits in the
> > > middle of swapping the two fields? Makes sense -- fixed in my repo.
> > 
> > It is more subtle than that.
> > __set_bit() will:
> >   read a value from memory to a register
> >   set a bit in the register
> >   write the register back out to memory
> > 
> > If two threads both run __set_bit on the same word of memory at the same
> > time, one of the updates can get lost.
> > set_bit() (no underscore) performs an atomic RMW to avoid this, but is more
> > expensive.
> > spin_lock() obviously ensures the required exclusion and as we are going to
> > take the lock anyway we may as well take it before setting bits so we can use
> > the non-atomic (cheaper) __set_bit function.
> > 
> > > I'll send out the updated set later today (it also includes a few nits
> > > that HCH pointed out last week).
> > > 
> > > As a side note...I wonder how much we'll get in the way of false
> > > positives with this scheme?
> > 
> > If a future version of NFSv4 could allow delegations to be granted while a
> > file is open (oh, it seems you are the only client using this file at the
> > moment, you can treat this "open" as a delegation if you like) a few false
> > positives would be a complete non-issue.
> 
> For what it's worth, I think 4.1 provides what you're asking for here;
> see
> 
> 	http://tools.ietf.org/html/rfc5661#section-20.7
> 
> and the discussion of the various WANT_ flags in
> 
> 	http://tools.ietf.org/html/rfc5661#section-18.16.3
> 
> As far as I know none of that is implemented yet.
> 
> --b.

I guess I should really read the 4.1 (and 4.2) spec some day....
Though the 20.7 section seems to be about saying "resources in general are
available" rather than "this  specific file that you wanted a delegation for
but didn't get one is how up for delegation"....
But I only had a quick read so I might have missed something.

Thanks,
NeilBrown
J. Bruce Fields July 22, 2014, 3 p.m. UTC | #9
On Tue, Jul 22, 2014 at 08:50:25AM +1000, NeilBrown wrote:
> On Mon, 21 Jul 2014 17:17:57 -0400 "J. Bruce Fields" <bfields@fieldses.org>
> wrote:
> 
> > On Tue, Jul 22, 2014 at 06:40:49AM +1000, NeilBrown wrote:
> > > On Mon, 21 Jul 2014 07:44:12 -0400 Jeff Layton <jeff.layton@primarydata.com>
> > > wrote:
> > > 
> > > > On Mon, 21 Jul 2014 17:02:54 +1000
> > > > NeilBrown <neilb@suse.de> wrote:
> > > 
> > > > > >  	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
> > > > > >  
> > > > > >  	__set_bit(hash&255, bd->set[bd->new]);
> > > > > >  	__set_bit((hash>>8)&255, bd->set[bd->new]);
> > > > > >  	__set_bit((hash>>16)&255, bd->set[bd->new]);
> > > > > > +	spin_lock(&blocked_delegations_lock);
> > > > > 
> > > > > __set_bit isn't atomic.  The spin_lock should be taken *before* these
> > > > > __set_bit() calls.
> > > > > 
> > > > > Otherwise, looks fine.
> > > > > 
> > > > > Thanks,
> > > > > NeilBrown
> > > > > 
> > > > > 
> > > > 
> > > > Ok. I guess the worry is that we could end up setting bits in the
> > > > middle of swapping the two fields? Makes sense -- fixed in my repo.
> > > 
> > > It is more subtle than that.
> > > __set_bit() will:
> > >   read a value from memory to a register
> > >   set a bit in the register
> > >   write the register back out to memory
> > > 
> > > If two threads both run __set_bit on the same word of memory at the same
> > > time, one of the updates can get lost.
> > > set_bit() (no underscore) performs an atomic RMW to avoid this, but is more
> > > expensive.
> > > spin_lock() obviously ensures the required exclusion and as we are going to
> > > take the lock anyway we may as well take it before setting bits so we can use
> > > the non-atomic (cheaper) __set_bit function.
> > > 
> > > > I'll send out the updated set later today (it also includes a few nits
> > > > that HCH pointed out last week).
> > > > 
> > > > As a side note...I wonder how much we'll get in the way of false
> > > > positives with this scheme?
> > > 
> > > If a future version of NFSv4 could allow delegations to be granted while a
> > > file is open (oh, it seems you are the only client using this file at the
> > > moment, you can treat this "open" as a delegation if you like) a few false
> > > positives would be a complete non-issue.
> > 
> > For what it's worth, I think 4.1 provides what you're asking for here;
> > see
> > 
> > 	http://tools.ietf.org/html/rfc5661#section-20.7
> > 
> > and the discussion of the various WANT_ flags in
> > 
> > 	http://tools.ietf.org/html/rfc5661#section-18.16.3
> > 
> > As far as I know none of that is implemented yet.
> > 
> > --b.
> 
> I guess I should really read the 4.1 (and 4.2) spec some day....
> Though the 20.7 section seems to be about saying "resources in general are
> available" rather than "this  specific file that you wanted a delegation for
> but didn't get one is how up for delegation"....
> But I only had a quick read so I might have missed something.

It was me that missed something, looks like CB_PUSH_DELEG is what you
actually want, not CB_RECALL_OBJ_AVAIL:

	http://tools.ietf.org/html/rfc5661#section-20.5
 
NFS4.1: it's the whole bell-and-whistle orchestra.

--b.

Patch

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a2c6c85adfc7..952def00363b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -506,10 +506,11 @@  static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
  * Each filter is 256 bits.  We hash the filehandle to 32bit and use the
  * low 3 bytes as hash-table indices.
  *
- * 'state_lock', which is always held when block_delegations() is called,
- * is used to manage concurrent access.  Testing does not need the lock
- * except when swapping the two filters.
+ * 'blocked_delegations_lock', which is always held when block_delegations()
+ * is called, is used to manage concurrent access.  Testing does not need the
+ * lock except when swapping the two filters.
  */
+static DEFINE_SPINLOCK(blocked_delegations_lock);
 static struct bloom_pair {
 	int	entries, old_entries;
 	time_t	swap_time;
@@ -525,7 +526,7 @@  static int delegation_blocked(struct knfsd_fh *fh)
 	if (bd->entries == 0)
 		return 0;
 	if (seconds_since_boot() - bd->swap_time > 30) {
-		spin_lock(&state_lock);
+		spin_lock(&blocked_delegations_lock);
 		if (seconds_since_boot() - bd->swap_time > 30) {
 			bd->entries -= bd->old_entries;
 			bd->old_entries = bd->entries;
@@ -534,7 +535,7 @@  static int delegation_blocked(struct knfsd_fh *fh)
 			bd->new = 1-bd->new;
 			bd->swap_time = seconds_since_boot();
 		}
-		spin_unlock(&state_lock);
+		spin_unlock(&blocked_delegations_lock);
 	}
 	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
 	if (test_bit(hash&255, bd->set[0]) &&
@@ -555,16 +556,16 @@  static void block_delegations(struct knfsd_fh *fh)
 	u32 hash;
 	struct bloom_pair *bd = &blocked_delegations;
 
-	lockdep_assert_held(&state_lock);
-
 	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
 
 	__set_bit(hash&255, bd->set[bd->new]);
 	__set_bit((hash>>8)&255, bd->set[bd->new]);
 	__set_bit((hash>>16)&255, bd->set[bd->new]);
+	spin_lock(&blocked_delegations_lock);
 	if (bd->entries == 0)
 		bd->swap_time = seconds_since_boot();
 	bd->entries += 1;
+	spin_unlock(&blocked_delegations_lock);
 }
 
 static struct nfs4_delegation *
@@ -3097,16 +3098,16 @@  void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp)
 	struct nfs4_client *clp = dp->dl_stid.sc_client;
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
-	/*
-	 * We can't do this in nfsd_break_deleg_cb because it is
-	 * already holding inode->i_lock
-	 */
-	spin_lock(&state_lock);
 	block_delegations(&dp->dl_fh);
+
 	/*
+	 * We can't do this in nfsd_break_deleg_cb because it is
+	 * already holding inode->i_lock.
+	 *
 	 * If the dl_time != 0, then we know that it has already been
 	 * queued for a lease break. Don't queue it again.
 	 */
+	spin_lock(&state_lock);
 	if (dp->dl_time == 0) {
 		dp->dl_time = get_seconds();
 		list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);