diff mbox

[v4,72/73] xfs: Convert mru cache to XArray

Message ID 20171206004159.3755-73-willy@infradead.org (mailing list archive)
State New, archived
Headers show

Commit Message

Matthew Wilcox Dec. 6, 2017, 12:41 a.m. UTC
From: Matthew Wilcox <mawilcox@microsoft.com>

This eliminates a call to radix_tree_preload().

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
---
 fs/xfs/xfs_mru_cache.c | 72 +++++++++++++++++++++++---------------------------
 1 file changed, 33 insertions(+), 39 deletions(-)

Comments

Dave Chinner Dec. 6, 2017, 1:36 a.m. UTC | #1
On Tue, Dec 05, 2017 at 04:41:58PM -0800, Matthew Wilcox wrote:
> From: Matthew Wilcox <mawilcox@microsoft.com>
> 
> This eliminates a call to radix_tree_preload().

.....

>  void
> @@ -431,24 +424,24 @@ xfs_mru_cache_insert(
>  	unsigned long		key,
>  	struct xfs_mru_cache_elem *elem)
>  {
> +	XA_STATE(xas, &mru->store, key);
>  	int			error;
>  
>  	ASSERT(mru && mru->lists);
>  	if (!mru || !mru->lists)
>  		return -EINVAL;
>  
> -	if (radix_tree_preload(GFP_NOFS))
> -		return -ENOMEM;
> -
>  	INIT_LIST_HEAD(&elem->list_node);
>  	elem->key = key;
>  
> -	spin_lock(&mru->lock);
> -	error = radix_tree_insert(&mru->store, key, elem);
> -	radix_tree_preload_end();
> -	if (!error)
> -		_xfs_mru_cache_list_insert(mru, elem);
> -	spin_unlock(&mru->lock);
> +	do {
> +		xas_lock(&xas);
> +		xas_store(&xas, elem);
> +		error = xas_error(&xas);
> +		if (!error)
> +			_xfs_mru_cache_list_insert(mru, elem);
> +		xas_unlock(&xas);
> +	} while (xas_nomem(&xas, GFP_NOFS));

Ok, so why does this have a retry loop on ENOMEM despite the
existing code handling that error? And why put such a loop in this
code and not any of the other XFS code that used
radix_tree_preload() and is arguably much more important to avoid
ENOMEM on insert (e.g. the inode cache)?

Also, I really don't like the pattern of using xa_lock()/xa_unlock()
to protect access to an external structure. i.e. the mru->lock
context is protecting multiple fields and operations in the MRU
structure, not just the radix tree operations. Turning that around
so that a larger XFS structure and algorithm is now protected by an
opaque internal lock from generic storage structure the forms part
of the larger structure seems like a bad design pattern to me...

Cheers,

Dave.
Matthew Wilcox Dec. 6, 2017, 2:02 a.m. UTC | #2
On Wed, Dec 06, 2017 at 12:36:48PM +1100, Dave Chinner wrote:
> > -	if (radix_tree_preload(GFP_NOFS))
> > -		return -ENOMEM;
> > -
> >  	INIT_LIST_HEAD(&elem->list_node);
> >  	elem->key = key;
> >  
> > -	spin_lock(&mru->lock);
> > -	error = radix_tree_insert(&mru->store, key, elem);
> > -	radix_tree_preload_end();
> > -	if (!error)
> > -		_xfs_mru_cache_list_insert(mru, elem);
> > -	spin_unlock(&mru->lock);
> > +	do {
> > +		xas_lock(&xas);
> > +		xas_store(&xas, elem);
> > +		error = xas_error(&xas);
> > +		if (!error)
> > +			_xfs_mru_cache_list_insert(mru, elem);
> > +		xas_unlock(&xas);
> > +	} while (xas_nomem(&xas, GFP_NOFS));
> 
> Ok, so why does this have a retry loop on ENOMEM despite the
> existing code handling that error? And why put such a loop in this
> code and not any of the other XFS code that used
> radix_tree_preload() and is arguably much more important to avoid
> ENOMEM on insert (e.g. the inode cache)?

If we need more nodes in the tree, xas_store() will try to allocate them
with GFP_NOWAIT | __GFP_NOWARN.  If that fails, it signals it in xas_error().
xas_nomem() will notice that we're in an ENOMEM situation, and allocate
a node using your preferred GFP flags (NOIO in your case).  Then we retry,
guaranteeing forward progress. [1]

The other conversions use the normal API instead of the advanced API, so
all of this gets hidden away.  For example, the inode cache does this:

+       curr = xa_cmpxchg(&pag->pag_ici_xa, agino, NULL, ip, GFP_NOFS);

and xa_cmpxchg internally does:

        do {
                xa_lock_irqsave(xa, flags);
                curr = xas_create(&xas);
                if (curr == old)
                        xas_store(&xas, entry);
                xa_unlock_irqrestore(xa, flags);
        } while (xas_nomem(&xas, gfp));


> Also, I really don't like the pattern of using xa_lock()/xa_unlock()
> to protect access to an external structure. i.e. the mru->lock
> context is protecting multiple fields and operations in the MRU
> structure, not just the radix tree operations. Turning that around
> so that a larger XFS structure and algorithm is now protected by an
> opaque internal lock from generic storage structure the forms part
> of the larger structure seems like a bad design pattern to me...

It's the design pattern I've always intended to use.  Naturally, the
xfs radix trees weren't my initial target; it was the page cache, and
the page cache does the same thing; uses the tree_lock to protect both
the radix tree and several other fields in that same data structure.

I'm open to argument on this though ... particularly if you have a better
design pattern in mind!

[1] I actually have this documented!  It's in the xas_nomem() kernel-doc:

 * If we need to add new nodes to the XArray, we try to allocate memory
 * with GFP_NOWAIT while holding the lock, which will usually succeed.
 * If it fails, @xas is flagged as needing memory to continue.  The caller
 * should drop the lock and call xas_nomem().  If xas_nomem() succeeds,
 * the caller should retry the operation.
 *
 * Forward progress is guaranteed as one node is allocated here and
 * stored in the xa_state where it will be found by xas_alloc().  More
 * nodes will likely be found in the slab allocator, but we do not tie
 * them up here.
 *
 * Return: true if memory was needed, and was successfully allocated.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Dec. 6, 2017, 3:14 a.m. UTC | #3
On Tue, Dec 05, 2017 at 06:02:08PM -0800, Matthew Wilcox wrote:
> On Wed, Dec 06, 2017 at 12:36:48PM +1100, Dave Chinner wrote:
> > > -	if (radix_tree_preload(GFP_NOFS))
> > > -		return -ENOMEM;
> > > -
> > >  	INIT_LIST_HEAD(&elem->list_node);
> > >  	elem->key = key;
> > >  
> > > -	spin_lock(&mru->lock);
> > > -	error = radix_tree_insert(&mru->store, key, elem);
> > > -	radix_tree_preload_end();
> > > -	if (!error)
> > > -		_xfs_mru_cache_list_insert(mru, elem);
> > > -	spin_unlock(&mru->lock);
> > > +	do {
> > > +		xas_lock(&xas);
> > > +		xas_store(&xas, elem);
> > > +		error = xas_error(&xas);
> > > +		if (!error)
> > > +			_xfs_mru_cache_list_insert(mru, elem);
> > > +		xas_unlock(&xas);
> > > +	} while (xas_nomem(&xas, GFP_NOFS));
> > 
> > Ok, so why does this have a retry loop on ENOMEM despite the
> > existing code handling that error? And why put such a loop in this
> > code and not any of the other XFS code that used
> > radix_tree_preload() and is arguably much more important to avoid
> > ENOMEM on insert (e.g. the inode cache)?
> 
> If we need more nodes in the tree, xas_store() will try to allocate them
> with GFP_NOWAIT | __GFP_NOWARN.  If that fails, it signals it in xas_error().
> xas_nomem() will notice that we're in an ENOMEM situation, and allocate
> a node using your preferred GFP flags (NOIO in your case).  Then we retry,
> guaranteeing forward progress. [1]
> 
> The other conversions use the normal API instead of the advanced API, so
> all of this gets hidden away.  For example, the inode cache does this:
> 
> +       curr = xa_cmpxchg(&pag->pag_ici_xa, agino, NULL, ip, GFP_NOFS);
> 
> and xa_cmpxchg internally does:
> 
>         do {
>                 xa_lock_irqsave(xa, flags);
>                 curr = xas_create(&xas);
>                 if (curr == old)
>                         xas_store(&xas, entry);
>                 xa_unlock_irqrestore(xa, flags);
>         } while (xas_nomem(&xas, gfp));

Ah, OK, that's not obvious from the code changes. :/

However, it's probably overkill for XFS. In all the cases, when we
insert there should be no entry in the tree - the
radix tree insert error handling code there was simply catching
"should never happen" cases and handling it without crashing.

Now that I've looked at this, I have to say that having a return
value of NULL meaning "success" is quite counter-intuitive. That's
going to fire my "that looks so wrong" detector every time I look at
the code and notice it's erroring out on a non-null return value
that isn't a PTR_ERR case....

Also, there's no need for irqsave/restore() locking contexts here as
we never access these caches from interrupt contexts. That's just
going to be extra overhead, especially on workloads that run 10^6
inodes inodes through the cache every second. That's a problem
caused by driving the locks into the XA structure and then needing
to support callers that require irq safety....

> > Also, I really don't like the pattern of using xa_lock()/xa_unlock()
> > to protect access to an external structure. i.e. the mru->lock
> > context is protecting multiple fields and operations in the MRU
> > structure, not just the radix tree operations. Turning that around
> > so that a larger XFS structure and algorithm is now protected by an
> > opaque internal lock from generic storage structure the forms part
> > of the larger structure seems like a bad design pattern to me...
> 
> It's the design pattern I've always intended to use.  Naturally, the
> xfs radix trees weren't my initial target; it was the page cache, and
> the page cache does the same thing; uses the tree_lock to protect both
> the radix tree and several other fields in that same data structure.
> 
> I'm open to argument on this though ... particularly if you have a better
> design pattern in mind!

I don't mind structures having internal locking - I have a problem
with leaking them into contexts outside the structure they protect.
That way lies madness - you can't change the internal locking in
future because of external dependencies, and the moment you need
something different externally we've got to go back to an external
lock anyway.

This is demonstrated by the way you converted the XFS dquot tree -
you didn't replace the dquot tree lock with the internal xa_lock
because it's a mutex and we have to sleep holding it. IOWs, we've
added another layer of locking here, not simplified the code.

What I really see here is that  we have inconsistent locking
patterns w.r.t. XA stores inside XFS - some have an external mutex
to cover a wider scope, some use xa_lock/xa_unlock to span multiple
operations, some directly access the internal xa lock via direct
spin_lock/unlock(...xa_lock) calls and non-locking XA call variants.
In some places you remove explicit rcu_read_lock() calls because the
internal xa_lock implies RCU, but in other places we still need them
because we have to protect the objects the tree points to, not just
the tree....

IOWs, there's no consistent pattern to the changes you've made to
the XFS code. The existing radix tree code has clear, consistent
locking, tagging and lookup patterns. In contrast, each conversion
to the XA code has resulted in a different solution for each radix
tree conversion. Yes, there's been a small reduction in the amoutn
of code in converting to the XA API, but it comes at the cost of
consistency and ease of understanding the code that uses the radix
tree API.

Cheers,

Dave.
Matthew Wilcox Dec. 6, 2017, 4:45 a.m. UTC | #4
On Wed, Dec 06, 2017 at 02:14:56PM +1100, Dave Chinner wrote:
> > The other conversions use the normal API instead of the advanced API, so
> > all of this gets hidden away.  For example, the inode cache does this:
> 
> Ah, OK, that's not obvious from the code changes. :/

Yeah, it's a lot easier to understand (I think!) if you build the
docs in that tree and look at
file:///home/willy/kernel/xarray-3/Documentation/output/core-api/xarray.html
(mutatis mutandi).  I've tried to tell a nice story about how to put
all the pieces together from the normal to the advanced API.

> However, it's probably overkill for XFS. In all the cases, when we
> insert there should be no entry in the tree - the
> radix tree insert error handling code there was simply catching
> "should never happen" cases and handling it without crashing.

I thought it was probably overkill to be using xa_cmpxchg() in the
pag_ici patch.  I didn't want to take away your error handling as part
of the conversion, but I think a rational person implementing it today
would just call xa_store() and not even worry about the return value
except to check it for IS_ERR().

That said, using xa_cmpxchg() in the dquot code looked like the right
thing to do?  Since we'd dropped the qi mutex and the ILOCK, it looks
entirely reasonable for another thread to come in and set up the dquot.
But I'm obviously quite ignorant of the XFS internals, so maybe there's
something else going on that makes this essentially a "can't happen".

> Now that I've looked at this, I have to say that having a return
> value of NULL meaning "success" is quite counter-intuitive. That's
> going to fire my "that looks so wrong" detector every time I look at
> the code and notice it's erroring out on a non-null return value
> that isn't a PTR_ERR case....

It's the same convention as cmpxchg().  I think it's triggering your
"looks so wrong" detector because it's fundamentally not the natural
thing to write.  I certainly don't cmpxchg() new entries into an array
and check the result was NULL ;-)

> Also, there's no need for irqsave/restore() locking contexts here as
> we never access these caches from interrupt contexts. That's just
> going to be extra overhead, especially on workloads that run 10^6
> inodes inodes through the cache every second. That's a problem
> caused by driving the locks into the XA structure and then needing
> to support callers that require irq safety....

I'm quite happy to have normal API variants that don't save/restore
interrupts.  Just need to come up with good names ... I don't think
xa_store_noirq() is a good name, but maybe you do?

> > It's the design pattern I've always intended to use.  Naturally, the
> > xfs radix trees weren't my initial target; it was the page cache, and
> > the page cache does the same thing; uses the tree_lock to protect both
> > the radix tree and several other fields in that same data structure.
> > 
> > I'm open to argument on this though ... particularly if you have a better
> > design pattern in mind!
> 
> I don't mind structures having internal locking - I have a problem
> with leaking them into contexts outside the structure they protect.
> That way lies madness - you can't change the internal locking in
> future because of external dependencies, and the moment you need
> something different externally we've got to go back to an external
> lock anyway.
> 
> This is demonstrated by the way you converted the XFS dquot tree -
> you didn't replace the dquot tree lock with the internal xa_lock
> because it's a mutex and we have to sleep holding it. IOWs, we've
> added another layer of locking here, not simplified the code.

I agree the dquot code is no simpler than it was, but it's also no more
complicated from a locking analysis point of view; the xa_lock is just
not providing you with any useful exclusion.

At least, not today.  One of the future plans is to allow xa_nodes to
be allocated from ZONE_MOVABLE.  In order to do that, we have to be
able to tell which lock protects any given node.  With the XArray,
we can find that out (xa_node->root->xa_lock); with the radix tree,
we don't even know what kind of lock protects the tree.

> What I really see here is that  we have inconsistent locking
> patterns w.r.t. XA stores inside XFS - some have an external mutex
> to cover a wider scope, some use xa_lock/xa_unlock to span multiple
> operations, some directly access the internal xa lock via direct
> spin_lock/unlock(...xa_lock) calls and non-locking XA call variants.
> In some places you remove explicit rcu_read_lock() calls because the
> internal xa_lock implies RCU, but in other places we still need them
> because we have to protect the objects the tree points to, not just
> the tree....
> 
> IOWs, there's no consistent pattern to the changes you've made to
> the XFS code. The existing radix tree code has clear, consistent
> locking, tagging and lookup patterns. In contrast, each conversion
> to the XA code has resulted in a different solution for each radix
> tree conversion. Yes, there's been a small reduction in the amoutn
> of code in converting to the XA API, but it comes at the cost of
> consistency and ease of understanding the code that uses the radix
> tree API.

There are other costs to not having a lock.  The lockdep/RCU
analysis done on the radix tree code is none.  Because we have
no idea what lock might protect any individual radix tree, we use
rcu_dereference_raw(), disabling lockdep's ability to protect us.

It's funny that you see the hodgepodge of different locking strategies
in the XFS code base as being a problem with the XArray.  I see it as
being a consequence of XFS's different needs.  No, the XArray can't
solve all of your problems, but it hasn't made your locking more complex.

And I don't agree that the existing radix tree code has clear, consistent
locking patterns.  For example, this use of RCU was unnecessary:

 xfs_queue_eofblocks(
        struct xfs_mount *mp)
 {
-       rcu_read_lock();
-       if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
+       if (xa_tagged(&mp->m_perag_xa, XFS_ICI_EOFBLOCKS_TAG))
                queue_delayed_work(mp->m_eofblocks_workqueue,
                                   &mp->m_eofblocks_work,
                                   msecs_to_jiffies(xfs_eofb_secs * 1000));
-       rcu_read_unlock();
 }

radix_tree_tagged never required the RCU lock (commit 7cf9c2c76c1a).
I think you're just used to the radix tree pattern of "we provide no
locking for you, come up with your own scheme".

What might make more sense for XFS is coming up with something
intermediate between the full on xa_state-based API and the "we handle
everything for you" normal API.  For example, how would you feel about
xfs_mru_cache_insert() looking like this:

	xa_lock(&mru->store);
	error = PTR_ERR_OR_ZERO(__xa_store(&mru->store, key, elem, GFP_NOFS));
	if (!error)
		_xfs_mru_cache_list_insert(mru, elem);
	xa_unlock(&mru->store);

	return error;

xfs_mru_cache_lookup would look like:

	xa_lock(&mru->store);
	elem = __xa_load(&mru->store, key);
	...

There's no real need for the mru code to be using the full-on xa_state
API.  For something like DAX or the page cache, there's a real advantage,
but the mru code is, I think, a great example of a user who has somewhat
more complex locking requirements, but doesn't use the array in a
complex way.

The dquot code is just going to have to live with the fact that there's
additional locking going on that it doesn't need.  I'm open to getting
rid of the irqsafety, but we can't give up the spinlock protection
without giving up the RCU/lockdep analysis and the ability to move nodes.
I don't suppose the dquot code can 


Thanks for spending so much time on this and being so passionate about
making this the best possible code it can be.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matthew Wilcox Dec. 6, 2017, 4:52 a.m. UTC | #5
On Tue, Dec 05, 2017 at 08:45:49PM -0800, Matthew Wilcox wrote:
> The dquot code is just going to have to live with the fact that there's
> additional locking going on that it doesn't need.  I'm open to getting
> rid of the irqsafety, but we can't give up the spinlock protection
> without giving up the RCU/lockdep analysis and the ability to move nodes.
> I don't suppose the dquot code can 

Oops, thought I'd finished writing this paragraph.

I don't suppose the dquot code can be restructured to use the xa_lock to
protect, say, qi_dquots?  I suspect not, since you wouldn't know which
of the three xarray locks to use.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Dec. 6, 2017, 8:44 a.m. UTC | #6
On Tue, Dec 05, 2017 at 08:45:49PM -0800, Matthew Wilcox wrote:
> On Wed, Dec 06, 2017 at 02:14:56PM +1100, Dave Chinner wrote:
> > > The other conversions use the normal API instead of the advanced API, so
> > > all of this gets hidden away.  For example, the inode cache does this:
> > 
> > Ah, OK, that's not obvious from the code changes. :/
> 
> Yeah, it's a lot easier to understand (I think!) if you build the
> docs in that tree and look at
> file:///home/willy/kernel/xarray-3/Documentation/output/core-api/xarray.html
> (mutatis mutandi).  I've tried to tell a nice story about how to put
> all the pieces together from the normal to the advanced API.
> 
> > However, it's probably overkill for XFS. In all the cases, when we
> > insert there should be no entry in the tree - the
> > radix tree insert error handling code there was simply catching
> > "should never happen" cases and handling it without crashing.
> 
> I thought it was probably overkill to be using xa_cmpxchg() in the
> pag_ici patch.  I didn't want to take away your error handling as part
> of the conversion, but I think a rational person implementing it today
> would just call xa_store() and not even worry about the return value
> except to check it for IS_ERR().

*nod*

> That said, using xa_cmpxchg() in the dquot code looked like the right
> thing to do?  Since we'd dropped the qi mutex and the ILOCK, it looks
> entirely reasonable for another thread to come in and set up the dquot.
> But I'm obviously quite ignorant of the XFS internals, so maybe there's
> something else going on that makes this essentially a "can't happen".

It's no different to the inode cache code, which drops the RCU
lock on lookup miss, instantiates the new inode (maybe reading it
off disk), then locks the tree and attempts to insert it. Both cases
use "insert if empty, otherwise retry lookup from start" semantics.

cmpxchg is for replacing a known object in a store - it's not really
intended for doing initial inserts after a lookup tells us there is
nothing in the store.  The radix tree "insert only if empty" makes
sense here, because it naturally takes care of lookup/insert races
via the -EEXIST mechanism.

I think that providing xa_store_excl() (which would return -EEXIST
if the entry is not empty) would be a better interface here, because
it matches the semantics of lookup cache population used all over
the kernel....

> > Now that I've looked at this, I have to say that having a return
> > value of NULL meaning "success" is quite counter-intuitive. That's
> > going to fire my "that looks so wrong" detector every time I look at
> > the code and notice it's erroring out on a non-null return value
> > that isn't a PTR_ERR case....
> 
> It's the same convention as cmpxchg().  I think it's triggering your
> "looks so wrong" detector because it's fundamentally not the natural
> thing to write.

Most definitely the case, and this is why it's a really bad
interface for the semantics we have. This how we end up with code
that makes it easy for programmers to screw up pointer checks in
error handling... :/

> I'm quite happy to have normal API variants that don't save/restore
> interrupts.  Just need to come up with good names ... I don't think
> xa_store_noirq() is a good name, but maybe you do?

I'd prefer not to have to deal with such things at all. :P

How many subsystems actually require irq safety in the XA locking
code? Make them use irqsafe versions, not make everyone else use
"noirq" versions, as is the convention for the rest of the kernel
code....

> > > It's the design pattern I've always intended to use.  Naturally, the
> > > xfs radix trees weren't my initial target; it was the page cache, and
> > > the page cache does the same thing; uses the tree_lock to protect both
> > > the radix tree and several other fields in that same data structure.
> > > 
> > > I'm open to argument on this though ... particularly if you have a better
> > > design pattern in mind!
> > 
> > I don't mind structures having internal locking - I have a problem
> > with leaking them into contexts outside the structure they protect.
> > That way lies madness - you can't change the internal locking in
> > future because of external dependencies, and the moment you need
> > something different externally we've got to go back to an external
> > lock anyway.
> > 
> > This is demonstrated by the way you converted the XFS dquot tree -
> > you didn't replace the dquot tree lock with the internal xa_lock
> > because it's a mutex and we have to sleep holding it. IOWs, we've
> > added another layer of locking here, not simplified the code.
> 
> I agree the dquot code is no simpler than it was, but it's also no more
> complicated from a locking analysis point of view; the xa_lock is just
> not providing you with any useful exclusion.

Sure, that's fine. All I'm doing is pointing out that we can't use
the internal xa_lock to handle everything the indexed objects
require, and so we're going to still need external locks in
many cases.

> At least, not today.  One of the future plans is to allow xa_nodes to
> be allocated from ZONE_MOVABLE.  In order to do that, we have to be
> able to tell which lock protects any given node.  With the XArray,
> we can find that out (xa_node->root->xa_lock); with the radix tree,
> we don't even know what kind of lock protects the tree.

Yup, this is a prime example of why we shouldn't be creating
external dependencies by smearing the locking context outside the XA
structure itself. It's not a stretch to see something like a
ZONE_MOVEABLE dependency because some other object indexed in a XA
is stored in the same page as the xa_node that points to it, and
both require the same xa_lock to move/update...

> There are other costs to not having a lock.  The lockdep/RCU
> analysis done on the radix tree code is none.  Because we have
> no idea what lock might protect any individual radix tree, we use
> rcu_dereference_raw(), disabling lockdep's ability to protect us.

Unfortunately for you, I don't find arguments along the lines of
"lockdep will save us" at all convincing.  lockdep already throws
too many false positives to be useful as a tool that reliably and
accurately points out rare, exciting, complex, intricate locking
problems.

> It's funny that you see the hodgepodge of different locking strategies
> in the XFS code base as being a problem with the XArray.  I see it as
> being a consequence of XFS's different needs.  No, the XArray can't
> solve all of your problems, but it hasn't made your locking more complex.

I'm not worried about changes in locking complexity here because, as
you point out, there isn't a change. What I'm mostly concerned about
is the removal of abstraction, modularity and isolation between
the XFS code and the library infrastructure it uses.

> 
> And I don't agree that the existing radix tree code has clear, consistent
> locking patterns.  For example, this use of RCU was unnecessary:
> 
>  xfs_queue_eofblocks(
>         struct xfs_mount *mp)
>  {
> -       rcu_read_lock();
> -       if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
> +       if (xa_tagged(&mp->m_perag_xa, XFS_ICI_EOFBLOCKS_TAG))
>                 queue_delayed_work(mp->m_eofblocks_workqueue,
>                                    &mp->m_eofblocks_work,
>                                    msecs_to_jiffies(xfs_eofb_secs * 1000));
> -       rcu_read_unlock();
>  }
> 
> radix_tree_tagged never required the RCU lock (commit 7cf9c2c76c1a).
> I think you're just used to the radix tree pattern of "we provide no
> locking for you, come up with your own scheme".

No, I'm used to having no-one really understand how "magic lockless
RCU lookups" actually work.  When i originally wrote the lockless
lookup code, I couldn't find anyone who both understood RCU and the
XFS inode cache to review the code for correctness.  Hence it had to
be dumbed down to the point that it was "stupidly obvious that it's
safe".

That problem has not gone away - very few people who read and have
to maintain this code understandxs all the nasty little intricacies
of RCU lookups.  Hiding /more/ of the locking semantics from the
programmers makes it even harder to explain why the algorithm is
safe. If the rules are basic (e.g. all radix tree lookups use RCU
locking) then it's easier for everyone to understand, review and
keep the code working correctly because there's almost no scope for
getting it wrong.

That's one of the advantages of the "we provide no locking for you,
come up with your own scheme" approach - we can dumb it down to the
point of being understandable and maintainable without anyone
needing to hurt their brain on memory-barriers.txt every time
someone changes the code.

Also, it's worth keeping in mind that this dumb code provides the
fastest and most scalable inode cache infrastructure in the kernel.
i.e. it's the structures and algorithms iused that make the code
fast, but it's the simplicity of the code that makes it
understandable and maintainable. The XArray code is a good
algorithm, we've just got to make the API suitable for dumb idiots
like me to be able to write reliable, maintainable code that uses
it.

> What might make more sense for XFS is coming up with something
> intermediate between the full on xa_state-based API and the "we handle
> everything for you" normal API.  For example, how would you feel about
> xfs_mru_cache_insert() looking like this:
> 
> 	xa_lock(&mru->store);
> 	error = PTR_ERR_OR_ZERO(__xa_store(&mru->store, key, elem, GFP_NOFS));
> 	if (!error)
> 		_xfs_mru_cache_list_insert(mru, elem);
> 	xa_unlock(&mru->store);
> 
> 	return error;
> 
> xfs_mru_cache_lookup would look like:
> 
> 	xa_lock(&mru->store);
> 	elem = __xa_load(&mru->store, key);
> 	....
> There's no real need for the mru code to be using the full-on xa_state
> API.  For something like DAX or the page cache, there's a real advantage,
> but the mru code is, I think, a great example of a user who has somewhat
> more complex locking requirements, but doesn't use the array in a
> complex way.

Yes, that's because the radix tree is not central to it's algorithm
or purpose.  The MRU cache (Most Recently Used Cache) is mostly
about the management of the items on lists in the priority
reclaimation array.  The radix tree is just there to provide a fast
"is there an item for this key already being aged" lookup so we
don't have to scan lists to do this.

i.e. Right now we could just as easily replace the radix tree with a
rbtree or resizing hash table as an XArray - the radix tree was just
a convenient "already implemented" key-based indexing mechanism that
was in the kernel when the MRU cache was implemented. Put simply:
the radix tree is not a primary structure in the MRU cache - it's
only an implementation detail and that's another reason why I'm not
a fan of smearing the internal locking of the replacement structure
all through the MRU code....

/me shrugs

BTW, something else I just noticed: all the comments in XFS that
talk about the radix trees would need updating.

$ git grep radix fs/xfs
fs/xfs/xfs_dquot.c:             /* uninit / unused quota found in radix tree, keep looking  */
fs/xfs/xfs_icache.c:    /* propagate the reclaim tag up into the perag radix tree */
fs/xfs/xfs_icache.c:    /* clear the reclaim tag from the perag radix tree */
fs/xfs/xfs_icache.c: * We set the inode flag atomically with the radix tree tag.
fs/xfs/xfs_icache.c: * Once we get tag lookups on the radix tree, this inode flag
fs/xfs/xfs_icache.c:     * radix tree nodes not being updated yet. We monitor for this by
fs/xfs/xfs_icache.c:     * Because the inode hasn't been added to the radix-tree yet it can't
fs/xfs/xfs_icache.c:     * These values must be set before inserting the inode into the radix
fs/xfs/xfs_icache.c:     * radix tree traversal here.  It assumes this function
fs/xfs/xfs_icache.c: * radix tree lookups to a minimum. The batch size is a trade off between
fs/xfs/xfs_icache.c:     * The radix tree lock here protects a thread in xfs_iget from racing
fs/xfs/xfs_icache.c:     * Remove the inode from the per-AG radix tree.
fs/xfs/xfs_icache.c:     * with inode cache radix tree lookups.  This is because the lookup
fs/xfs/xfs_icache.c:     * Don't bother locking the AG and looking up in the radix trees
fs/xfs/xfs_icache.c:            /* propagate the eofblocks tag up into the perag radix tree */
fs/xfs/xfs_icache.c:            /* clear the eofblocks tag from the perag radix tree */
fs/xfs/xfs_icache.h: * tags for inode radix tree
fs/xfs/xfs_qm.c: * currently is the only interface into the radix tree code that allows
$

Cheers,

Dave.
Matthew Wilcox Dec. 6, 2017, 2:06 p.m. UTC | #7
On Wed, Dec 06, 2017 at 07:44:04PM +1100, Dave Chinner wrote:
> On Tue, Dec 05, 2017 at 08:45:49PM -0800, Matthew Wilcox wrote:
> > That said, using xa_cmpxchg() in the dquot code looked like the right
> > thing to do?  Since we'd dropped the qi mutex and the ILOCK, it looks
> > entirely reasonable for another thread to come in and set up the dquot.
> > But I'm obviously quite ignorant of the XFS internals, so maybe there's
> > something else going on that makes this essentially a "can't happen".
> 
> It's no different to the inode cache code, which drops the RCU
> lock on lookup miss, instantiates the new inode (maybe reading it
> off disk), then locks the tree and attempts to insert it. Both cases
> use "insert if empty, otherwise retry lookup from start" semantics.

Ah.  I had my focus set a little narrow on the inode cache code and didn't
recognise the pattern.

Why do you sleep for one jiffy after encountering a miss, then seeing
someone else insert the inode for you?

> cmpxchg is for replacing a known object in a store - it's not really
> intended for doing initial inserts after a lookup tells us there is
> nothing in the store.  The radix tree "insert only if empty" makes
> sense here, because it naturally takes care of lookup/insert races
> via the -EEXIST mechanism.
> 
> I think that providing xa_store_excl() (which would return -EEXIST
> if the entry is not empty) would be a better interface here, because
> it matches the semantics of lookup cache population used all over
> the kernel....

I'm not thrilled with xa_store_excl(), but I need to think about that
a bit more.

> > I'm quite happy to have normal API variants that don't save/restore
> > interrupts.  Just need to come up with good names ... I don't think
> > xa_store_noirq() is a good name, but maybe you do?
> 
> I'd prefer not to have to deal with such things at all. :P
> 
> How many subsystems actually require irq safety in the XA locking
> code? Make them use irqsafe versions, not make everyone else use
> "noirq" versions, as is the convention for the rest of the kernel
> code....

Hard to say how many existing radix tree users require the irq safety.
Also hard to say how many potential users (people currently using
linked lists, people using resizable arrays, etc) need irq safety.
My thinking was "make it safe by default and let people who know better
have a way to opt out", but there's definitely something to be said for
"make it fast by default and let people who need the unusual behaviour
type those extra few letters".

So, you're arguing for providing xa_store(), xa_store_irq(), xa_store_bh()
and xa_store_irqsafe()?  (at least on demand, as users come to light?)
At least the read side doesn't require any variants; everybody can use
RCU for read side protection.

("safe", not "save" because I wouldn't make the caller provide the
"flags" argument).

> > At least, not today.  One of the future plans is to allow xa_nodes to
> > be allocated from ZONE_MOVABLE.  In order to do that, we have to be
> > able to tell which lock protects any given node.  With the XArray,
> > we can find that out (xa_node->root->xa_lock); with the radix tree,
> > we don't even know what kind of lock protects the tree.
> 
> Yup, this is a prime example of why we shouldn't be creating
> external dependencies by smearing the locking context outside the XA
> structure itself. It's not a stretch to see something like a
> ZONE_MOVEABLE dependency because some other object indexed in a XA
> is stored in the same page as the xa_node that points to it, and
> both require the same xa_lock to move/update...

That is a bit of a stretch.  Christoph Lameter and I had a discussion about it
here: https://www.spinics.net/lists/linux-mm/msg122902.html

There's no situation where you need to acquire two locks in order to
free an object; you'd create odd locking dependencies between objects
if you did that (eg we already have a locking dependency between pag_ici
and perag from __xfs_inode_set_eofblocks_tag).  It'd be a pretty horrible
shrinker design where you had to get all the locks on all the objects,
regardless of what locking order the real code had.

> > There are other costs to not having a lock.  The lockdep/RCU
> > analysis done on the radix tree code is none.  Because we have
> > no idea what lock might protect any individual radix tree, we use
> > rcu_dereference_raw(), disabling lockdep's ability to protect us.
> 
> Unfortunately for you, I don't find arguments along the lines of
> "lockdep will save us" at all convincing.  lockdep already throws
> too many false positives to be useful as a tool that reliably and
> accurately points out rare, exciting, complex, intricate locking
> problems.

But it does reliably and accurately point out "dude, you forgot to take
the lock".  It's caught a number of real problems in my own testing that
you never got to see.

> That problem has not gone away - very few people who read and have
> to maintain this code understandxs all the nasty little intricacies
> of RCU lookups.  Hiding /more/ of the locking semantics from the
> programmers makes it even harder to explain why the algorithm is
> safe. If the rules are basic (e.g. all radix tree lookups use RCU
> locking) then it's easier for everyone to understand, review and
> keep the code working correctly because there's almost no scope for
> getting it wrong.

Couldn't agree more.  Using RCU is subtle, and the parts of the kernel
that use calls like radix_tree_lookup_slot() are frequently buggy,
not least because the sparse annotations were missing until I added
them recently.  That's why the XArray makes sure it has the RCU lock
for you on anything that needs it.

Not that helps you ... you need to hold the RCU lock yourself because
your data are protected by RCU.  I did wonder if you could maybe
improve performance slightly by using something like the page cache's
get_speculative, re-check scheme, but I totally understand your desire
to not make this so hard to understand.

> BTW, something else I just noticed: all the comments in XFS that
> talk about the radix trees would need updating.

I know ... I've been trying to resist the urge to fix comments and spend
more of my time on getting the code working.  It's frustrating to see
people use "radix tree" when what they really mean was "page cache".
Our abstractions leak like sieves.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Dec. 7, 2017, 12:38 a.m. UTC | #8
On Wed, Dec 06, 2017 at 06:06:48AM -0800, Matthew Wilcox wrote:
> On Wed, Dec 06, 2017 at 07:44:04PM +1100, Dave Chinner wrote:
> > On Tue, Dec 05, 2017 at 08:45:49PM -0800, Matthew Wilcox wrote:
> > > That said, using xa_cmpxchg() in the dquot code looked like the right
> > > thing to do?  Since we'd dropped the qi mutex and the ILOCK, it looks
> > > entirely reasonable for another thread to come in and set up the dquot.
> > > But I'm obviously quite ignorant of the XFS internals, so maybe there's
> > > something else going on that makes this essentially a "can't happen".
> > 
> > It's no different to the inode cache code, which drops the RCU
> > lock on lookup miss, instantiates the new inode (maybe reading it
> > off disk), then locks the tree and attempts to insert it. Both cases
> > use "insert if empty, otherwise retry lookup from start" semantics.
> 
> Ah.  I had my focus set a little narrow on the inode cache code and didn't
> recognise the pattern.
> 
> Why do you sleep for one jiffy after encountering a miss, then seeing
> someone else insert the inode for you?

The sleep is a backoff that allows whatever we raced with to
complete, be it a hit that raced with an inode being reclaimed and
removed, or a miss that raced with another insert. Ideally we'd
sleep on the XFS_INEW bit, similar to the vfs I_NEW flag, but it's
not quite that simple with the reclaim side of things...

> > cmpxchg is for replacing a known object in a store - it's not really
> > intended for doing initial inserts after a lookup tells us there is
> > nothing in the store.  The radix tree "insert only if empty" makes
> > sense here, because it naturally takes care of lookup/insert races
> > via the -EEXIST mechanism.
> > 
> > I think that providing xa_store_excl() (which would return -EEXIST
> > if the entry is not empty) would be a better interface here, because
> > it matches the semantics of lookup cache population used all over
> > the kernel....
> 
> I'm not thrilled with xa_store_excl(), but I need to think about that
> a bit more.

Not fussed about the name - I just think we need a function that
matches the insert semantics of the code....

> > > I'm quite happy to have normal API variants that don't save/restore
> > > interrupts.  Just need to come up with good names ... I don't think
> > > xa_store_noirq() is a good name, but maybe you do?
> > 
> > I'd prefer not to have to deal with such things at all. :P
> > 
> > How many subsystems actually require irq safety in the XA locking
> > code? Make them use irqsafe versions, not make everyone else use
> > "noirq" versions, as is the convention for the rest of the kernel
> > code....
> 
> Hard to say how many existing radix tree users require the irq safety.

The mapping tree requires it because it gets called from IO
completion contexts to clear page writeback state, but I don't know
about any of the others.

> Also hard to say how many potential users (people currently using
> linked lists, people using resizable arrays, etc) need irq safety.
> My thinking was "make it safe by default and let people who know better
> have a way to opt out", but there's definitely something to be said for
> "make it fast by default and let people who need the unusual behaviour
> type those extra few letters".
> 
> So, you're arguing for providing xa_store(), xa_store_irq(), xa_store_bh()
> and xa_store_irqsafe()?  (at least on demand, as users come to light?)
> At least the read side doesn't require any variants; everybody can use
> RCU for read side protection.

That would follow the pattern of the rest of the kernel APIs, though
I think it might be cleaner to simply state the locking requirement
to xa_init() and keep all those details completely internal rather
than encoding them into API calls. After all, the "irqsafe-ness" of
the locking needs to be consistent across the entire XA instance....

> ("safe", not "save" because I wouldn't make the caller provide the
> "flags" argument).
> 
> > > At least, not today.  One of the future plans is to allow xa_nodes to
> > > be allocated from ZONE_MOVABLE.  In order to do that, we have to be
> > > able to tell which lock protects any given node.  With the XArray,
> > > we can find that out (xa_node->root->xa_lock); with the radix tree,
> > > we don't even know what kind of lock protects the tree.
> > 
> > Yup, this is a prime example of why we shouldn't be creating
> > external dependencies by smearing the locking context outside the XA
> > structure itself. It's not a stretch to see something like a
> > ZONE_MOVEABLE dependency because some other object indexed in a XA
> > is stored in the same page as the xa_node that points to it, and
> > both require the same xa_lock to move/update...
> 
> That is a bit of a stretch.  Christoph Lameter and I had a discussion about it
> here: https://www.spinics.net/lists/linux-mm/msg122902.html
> 
> There's no situation where you need to acquire two locks in order to
> free an object;

ZONE_MOVEABLE is for moving migratable objects, not freeing
unreferenced objects. i.e. it's used to indicate the active objects
can be moved to a different location whilst it has other objects
pointing to it. This requires atomically swapping all the external
pointer references to the object so everything sees either the old
object before the move or the new object after the move. While the
move is in progress, we have to stall anything that could possibly
reference the object and in general that means we have lock up all
the objects that point to the object being moved.

For things like inodes, we have *lots* of external references to
them, and so we'd have to stall all of those external references
to update them once movement is complete. Lots of locks to hold
there, potentially including the xa_lock for the trees that index
the inode.

Hence if we are trying to migrate multiple objects at a time (i.e.
the bulk slab page clearing case) then we've got to lock up multiple
refrenceing objects and structure that may have overlapping
dependencies and so could end up trying to get the same locks that
other objects in the page already hold.

It's an utter mess - xa_node might be simple, but the general case
for slab objects in ZONE_MOVEABLE is anything but simple.  That's
the reason we've never made any progress on generic slab
defragmentation in the past 12-13 years - we haven't worked out how
to solve this fundamental "atomically update all external references
to the object being moved" problem.

> you'd create odd locking dependencies between objects
> if you did that (eg we already have a locking dependency between pag_ici
> and perag from __xfs_inode_set_eofblocks_tag)

You missed this one: xfs_inode_set_reclaim_tag()

It nests pag->pag_ici_lock - ip->i_flags_lock - mp->m_perag_lock
in one pass because we've got an inode flag and tags in two separate
radix trees we need to update atomically....

Also, that's called in the evict() path so, yeah, we're actually
nesting multiple locks to get the inode into a state where we can
reclaim it...

> It'd be a pretty horrible
> shrinker design where you had to get all the locks on all the objects,
> regardless of what locking order the real code had.

The shrinker (i.e. memory reclaim) doesn't need to do that - only
object migration does. They operate on vastly different object
contexts and should not be conflated.

Cheers,

Dave.
Theodore Ts'o Dec. 7, 2017, 4:06 p.m. UTC | #9
On Wed, Dec 06, 2017 at 06:06:48AM -0800, Matthew Wilcox wrote:
> > Unfortunately for you, I don't find arguments along the lines of
> > "lockdep will save us" at all convincing.  lockdep already throws
> > too many false positives to be useful as a tool that reliably and
> > accurately points out rare, exciting, complex, intricate locking
> > problems.
> 
> But it does reliably and accurately point out "dude, you forgot to take
> the lock".  It's caught a number of real problems in my own testing that
> you never got to see.

The problem is that if it has too many false positives --- and it's
gotten *way* worse with the completion callback "feature", people will
just stop using Lockdep as being too annyoing and a waste of developer
time when trying to figure what is a legitimate locking bug versus
lockdep getting confused.

<Rant>I can't even disable the new Lockdep feature which is throwing
lots of new false positives --- it's just all or nothing.</Rant>

Dave has just said he's already stopped using Lockdep, as a result.

     	      	   			      - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Dec. 7, 2017, 10:22 p.m. UTC | #10
On Thu, Dec 07, 2017 at 11:06:34AM -0500, Theodore Ts'o wrote:
> On Wed, Dec 06, 2017 at 06:06:48AM -0800, Matthew Wilcox wrote:
> > > Unfortunately for you, I don't find arguments along the lines of
> > > "lockdep will save us" at all convincing.  lockdep already throws
> > > too many false positives to be useful as a tool that reliably and
> > > accurately points out rare, exciting, complex, intricate locking
> > > problems.
> > 
> > But it does reliably and accurately point out "dude, you forgot to take
> > the lock".  It's caught a number of real problems in my own testing that
> > you never got to see.
> 
> The problem is that if it has too many false positives --- and it's
> gotten *way* worse with the completion callback "feature", people will
> just stop using Lockdep as being too annyoing and a waste of developer
> time when trying to figure what is a legitimate locking bug versus
> lockdep getting confused.
> 
> <Rant>I can't even disable the new Lockdep feature which is throwing
> lots of new false positives --- it's just all or nothing.</Rant>
> 
> Dave has just said he's already stopped using Lockdep, as a result.

This is compeltely OT, but FYI I stopped using lockdep a long time
ago.  We've spend orders of magnitude more time and effort to shut
up lockdep false positives in the XFS code than we ever have on
locking problems that lockdep has uncovered. And still lockdep
throws too many false positives on XFS workloads to be useful to me.

But it's more than that: I understand just how much lockdep *doesn't
check* and that means *I know I can't rely on lockdep* for potential
deadlock detection. e.g.  it doesn't cover semaphores, which means
it has zero coverage of the entire XFS metadata buffer subsystem and
the complex locking orders we have for metadata updates.

Put simply: lockdep doesn't provide me with any benefit, so I don't
use it...

Cheers,

Dave.
Matthew Wilcox Dec. 7, 2017, 10:38 p.m. UTC | #11
On Thu, Dec 07, 2017 at 11:06:34AM -0500, Theodore Ts'o wrote:
> The problem is that if it has too many false positives --- and it's
> gotten *way* worse with the completion callback "feature", people will
> just stop using Lockdep as being too annyoing and a waste of developer
> time when trying to figure what is a legitimate locking bug versus
> lockdep getting confused.
> 
> <Rant>I can't even disable the new Lockdep feature which is throwing
> lots of new false positives --- it's just all or nothing.</Rant>

You *can* ... but it's way more hacking Kconfig than you ought to have
to do (which is a separate rant ...)

You need to get LOCKDEP_CROSSRELEASE off.  I'd revert patches
e26f34a407aec9c65bce2bc0c838fabe4f051fc6 and
b483cf3bc249d7af706390efa63d6671e80d1c09

I think it was a mistake to force these on for everybody; they have a
much higher false-positive rate than the rest of lockdep, so as you say
forcing them on leads to fewer people using *any* of lockdep.

The bug you're hitting isn't Byungchul's fault; it's an annotation
problem.  The same kind of annotation problem that we used to have with
dozens of other places in the kernel which are now fixed.  If you didn't
have to hack Kconfig to get rid of this problem, you'd be happier, right?
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matthew Wilcox Dec. 7, 2017, 10:39 p.m. UTC | #12
On Thu, Dec 07, 2017 at 02:38:03PM -0800, Matthew Wilcox wrote:
> You need to get LOCKDEP_CROSSRELEASE off.  I'd revert patches
> e26f34a407aec9c65bce2bc0c838fabe4f051fc6 and
> b483cf3bc249d7af706390efa63d6671e80d1c09

Oops.  I meant to revert 2dcd5adfb7401b762ddbe4b86dcacc2f3de6b97b.
Or you could cherry-pick b483cf3bc249d7af706390efa63d6671e80d1c09.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Dec. 8, 2017, 12:14 a.m. UTC | #13
On Thu, Dec 07, 2017 at 02:38:03PM -0800, Matthew Wilcox wrote:
> On Thu, Dec 07, 2017 at 11:06:34AM -0500, Theodore Ts'o wrote:
> > The problem is that if it has too many false positives --- and it's
> > gotten *way* worse with the completion callback "feature", people will
> > just stop using Lockdep as being too annyoing and a waste of developer
> > time when trying to figure what is a legitimate locking bug versus
> > lockdep getting confused.
> > 
> > <Rant>I can't even disable the new Lockdep feature which is throwing
> > lots of new false positives --- it's just all or nothing.</Rant>
> 
> You *can* ... but it's way more hacking Kconfig than you ought to have
> to do (which is a separate rant ...)
> 
> You need to get LOCKDEP_CROSSRELEASE off.  I'd revert patches
> e26f34a407aec9c65bce2bc0c838fabe4f051fc6 and
> b483cf3bc249d7af706390efa63d6671e80d1c09
> 
> I think it was a mistake to force these on for everybody; they have a
> much higher false-positive rate than the rest of lockdep, so as you say
> forcing them on leads to fewer people using *any* of lockdep.
> 
> The bug you're hitting isn't Byungchul's fault; it's an annotation
> problem.  The same kind of annotation problem that we used to have with
> dozens of other places in the kernel which are now fixed.

That's one of the fundamental problem with lockdep - it throws the
difficulty of solving all these new false positives onto the
developers who know nothing about lockdep and don't follow it's
development. And until they do solve them - especially in critical
subsystems that everyone uses like the storage stack - lockdep is
essentially worthless.

> If you didn't
> have to hack Kconfig to get rid of this problem, you'd be happier, right?

I'd be much happier if it wasn't turned on by default in the first
place.  We gave plenty of warnings that there were still unsolved
false positive problems with the new checks in the storage stack.

Cheers,

Dave.
Byungchul Park Dec. 8, 2017, 4:45 a.m. UTC | #14
On Fri, Dec 08, 2017 at 09:22:16AM +1100, Dave Chinner wrote:
> On Thu, Dec 07, 2017 at 11:06:34AM -0500, Theodore Ts'o wrote:
> > On Wed, Dec 06, 2017 at 06:06:48AM -0800, Matthew Wilcox wrote:
> > > > Unfortunately for you, I don't find arguments along the lines of
> > > > "lockdep will save us" at all convincing.  lockdep already throws
> > > > too many false positives to be useful as a tool that reliably and
> > > > accurately points out rare, exciting, complex, intricate locking
> > > > problems.
> > > 
> > > But it does reliably and accurately point out "dude, you forgot to take
> > > the lock".  It's caught a number of real problems in my own testing that
> > > you never got to see.
> > 
> > The problem is that if it has too many false positives --- and it's
> > gotten *way* worse with the completion callback "feature", people will
> > just stop using Lockdep as being too annyoing and a waste of developer
> > time when trying to figure what is a legitimate locking bug versus
> > lockdep getting confused.
> > 
> > <Rant>I can't even disable the new Lockdep feature which is throwing
> > lots of new false positives --- it's just all or nothing.</Rant>
> > 
> > Dave has just said he's already stopped using Lockdep, as a result.
> 
> This is compeltely OT, but FYI I stopped using lockdep a long time
> ago.  We've spend orders of magnitude more time and effort to shut
> up lockdep false positives in the XFS code than we ever have on
> locking problems that lockdep has uncovered. And still lockdep
> throws too many false positives on XFS workloads to be useful to me.
> 
> But it's more than that: I understand just how much lockdep *doesn't
> check* and that means *I know I can't rely on lockdep* for potential
> deadlock detection. e.g.  it doesn't cover semaphores, which means

Hello,

I'm careful in saying the following since you seem to feel not good at
crossrelease and even lockdep. Now that cross-release has been
introduced, semaphores can be covered as you might know. Actually, all
general waiters can.

> it has zero coverage of the entire XFS metadata buffer subsystem and
> the complex locking orders we have for metadata updates.
> 
> Put simply: lockdep doesn't provide me with any benefit, so I don't
> use it...

Sad..

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Dec. 8, 2017, 7:25 a.m. UTC | #15
On Fri, Dec 08, 2017 at 01:45:52PM +0900, Byungchul Park wrote:
> On Fri, Dec 08, 2017 at 09:22:16AM +1100, Dave Chinner wrote:
> > On Thu, Dec 07, 2017 at 11:06:34AM -0500, Theodore Ts'o wrote:
> > > On Wed, Dec 06, 2017 at 06:06:48AM -0800, Matthew Wilcox wrote:
> > > > > Unfortunately for you, I don't find arguments along the lines of
> > > > > "lockdep will save us" at all convincing.  lockdep already throws
> > > > > too many false positives to be useful as a tool that reliably and
> > > > > accurately points out rare, exciting, complex, intricate locking
> > > > > problems.
> > > > 
> > > > But it does reliably and accurately point out "dude, you forgot to take
> > > > the lock".  It's caught a number of real problems in my own testing that
> > > > you never got to see.
> > > 
> > > The problem is that if it has too many false positives --- and it's
> > > gotten *way* worse with the completion callback "feature", people will
> > > just stop using Lockdep as being too annyoing and a waste of developer
> > > time when trying to figure what is a legitimate locking bug versus
> > > lockdep getting confused.
> > > 
> > > <Rant>I can't even disable the new Lockdep feature which is throwing
> > > lots of new false positives --- it's just all or nothing.</Rant>
> > > 
> > > Dave has just said he's already stopped using Lockdep, as a result.
> > 
> > This is compeltely OT, but FYI I stopped using lockdep a long time
> > ago.  We've spend orders of magnitude more time and effort to shut
> > up lockdep false positives in the XFS code than we ever have on
> > locking problems that lockdep has uncovered. And still lockdep
> > throws too many false positives on XFS workloads to be useful to me.
> > 
> > But it's more than that: I understand just how much lockdep *doesn't
> > check* and that means *I know I can't rely on lockdep* for potential
> > deadlock detection. e.g.  it doesn't cover semaphores, which means
> 
> Hello,
> 
> I'm careful in saying the following since you seem to feel not good at
> crossrelease and even lockdep. Now that cross-release has been
> introduced, semaphores can be covered as you might know. Actually, all
> general waiters can.

And all it will do is create a whole bunch more work for us XFS guys
to shut up all the the false positive crap that falls out from it
because the locking model we have is far more complex than any of
the lockdep developers thought was necessary to support, just like
happened with the XFS inode annotations all those years ago.

e.g. nobody has ever bothered to ask us what is needed to describe
XFS's semaphore locking model.  If you did that, you'd know that we
nest *thousands* of locked semaphores in compeltely random lock
order during metadata buffer writeback. And that this lock order
does not reflect the actual locking order rules we have for locking
buffers during transactions.

Oh, and you'd also know that a semaphore's lock order and context
can change multiple times during the life time of the buffer.  Say
we free a block and the reallocate it as something else before it is
reclaimed - that buffer now might have a different lock order. Or
maybe we promote a buffer to be a root btree block as a result of a
join - it's now the first buffer in a lock run, rather than a child.
Or we split a tree, and the root is now a node and so no longer is
the first buffer in a lock run. Or that we walk sideways along the
leaf nodes siblings during searches.  IOWs, there is no well defined
static lock ordering at all for buffers - and therefore semaphores -
in XFS at all.

And knowing that, you wouldn't simply mention that lockdep can
support semaphores now as though that is necessary to "make it work"
for XFS.  It's going to be much simpler for us to just turn off
lockdep and ignore whatever crap it sends our way than it is to
spend unplanned weeks of our time to try to make lockdep sorta work
again. Sure, we might get there in the end, but it's likely to take
months, if not years like it did with the XFS inode annotations.....

> > it has zero coverage of the entire XFS metadata buffer subsystem and
> > the complex locking orders we have for metadata updates.
> > 
> > Put simply: lockdep doesn't provide me with any benefit, so I don't
> > use it...
> 
> Sad..

I don't think you understand. I'll try to explain.

The lockdep infrastructure by itself doesn't make lockdep a useful
tool - it mostly generates false positives because it has no
concept of locking models that don't match it's internal tracking
assumptions and/or limitations.

That means if we can't suppress the false positives, then lockdep is
going to be too noisy to find real problems.  It's taken the XFS
developers months of work over the past 7-8 years to suppress all
the *common* false positives that lockdep throws on XFS. And despite
all that work, there's still too many false positives occuring
because we can't easily suppress them with annotations. IOWs, the
signal to noise ratio is still too low for lockdep to find real
problems.

That's why lockdep isn't useful to me - the noise floor is too high,
and the effort to lower the noise floor further is too great.

This is important, because cross-release just raised the noise floor
by a large margin and so now we have to spend the time to reduce it
again back to where it was before cross-release was added.  IOWs,
adding new detection features to lockdep actually makes lockdep less
useful for a significant period of time. That length of time is
dependent on the rate at which subsystem developers can suppress the
false positives and lower the noise floor back down to an acceptible
level. And there is always the possibility that we can't get the
noise floor low enough for lockdep to be a reliable, useful tool for
some subsystems....

That's what I don't think you understand - that the most important
part of lockdep is /not the core infrastructure/ you work on. The
most important part of lockdep is the annotations that suppress the
noise floor and allow the real problems to stand out.

Cheers,

Dave.
Byungchul Park Dec. 8, 2017, 9:27 a.m. UTC | #16
On 12/8/2017 4:25 PM, Dave Chinner wrote:
> On Fri, Dec 08, 2017 at 01:45:52PM +0900, Byungchul Park wrote:
>> On Fri, Dec 08, 2017 at 09:22:16AM +1100, Dave Chinner wrote:
>>> On Thu, Dec 07, 2017 at 11:06:34AM -0500, Theodore Ts'o wrote:
>>>> On Wed, Dec 06, 2017 at 06:06:48AM -0800, Matthew Wilcox wrote:
>>>>>> Unfortunately for you, I don't find arguments along the lines of
>>>>>> "lockdep will save us" at all convincing.  lockdep already throws
>>>>>> too many false positives to be useful as a tool that reliably and
>>>>>> accurately points out rare, exciting, complex, intricate locking
>>>>>> problems.
>>>>>
>>>>> But it does reliably and accurately point out "dude, you forgot to take
>>>>> the lock".  It's caught a number of real problems in my own testing that
>>>>> you never got to see.
>>>>
>>>> The problem is that if it has too many false positives --- and it's
>>>> gotten *way* worse with the completion callback "feature", people will
>>>> just stop using Lockdep as being too annyoing and a waste of developer
>>>> time when trying to figure what is a legitimate locking bug versus
>>>> lockdep getting confused.
>>>>
>>>> <Rant>I can't even disable the new Lockdep feature which is throwing
>>>> lots of new false positives --- it's just all or nothing.</Rant>
>>>>
>>>> Dave has just said he's already stopped using Lockdep, as a result.
>>>
>>> This is compeltely OT, but FYI I stopped using lockdep a long time
>>> ago.  We've spend orders of magnitude more time and effort to shut
>>> up lockdep false positives in the XFS code than we ever have on
>>> locking problems that lockdep has uncovered. And still lockdep
>>> throws too many false positives on XFS workloads to be useful to me.
>>>
>>> But it's more than that: I understand just how much lockdep *doesn't
>>> check* and that means *I know I can't rely on lockdep* for potential
>>> deadlock detection. e.g.  it doesn't cover semaphores, which means
>>
>> Hello,
>>
>> I'm careful in saying the following since you seem to feel not good at
>> crossrelease and even lockdep. Now that cross-release has been
>> introduced, semaphores can be covered as you might know. Actually, all
>> general waiters can.
> 
> And all it will do is create a whole bunch more work for us XFS guys
> to shut up all the the false positive crap that falls out from it
> because the locking model we have is far more complex than any of
> the lockdep developers thought was necessary to support, just like
> happened with the XFS inode annotations all those years ago.
> 
> e.g. nobody has ever bothered to ask us what is needed to describe
> XFS's semaphore locking model.  If you did that, you'd know that we
> nest *thousands* of locked semaphores in compeltely random lock
> order during metadata buffer writeback. And that this lock order
> does not reflect the actual locking order rules we have for locking
> buffers during transactions.
> 
> Oh, and you'd also know that a semaphore's lock order and context
> can change multiple times during the life time of the buffer.  Say
> we free a block and the reallocate it as something else before it is
> reclaimed - that buffer now might have a different lock order. Or
> maybe we promote a buffer to be a root btree block as a result of a
> join - it's now the first buffer in a lock run, rather than a child.
> Or we split a tree, and the root is now a node and so no longer is
> the first buffer in a lock run. Or that we walk sideways along the
> leaf nodes siblings during searches.  IOWs, there is no well defined
> static lock ordering at all for buffers - and therefore semaphores -
> in XFS at all.
> 
> And knowing that, you wouldn't simply mention that lockdep can
> support semaphores now as though that is necessary to "make it work"
> for XFS.  It's going to be much simpler for us to just turn off
> lockdep and ignore whatever crap it sends our way than it is to
> spend unplanned weeks of our time to try to make lockdep sorta work
> again. Sure, we might get there in the end, but it's likely to take
> months, if not years like it did with the XFS inode annotations.....
> 
>>> it has zero coverage of the entire XFS metadata buffer subsystem and
>>> the complex locking orders we have for metadata updates.
>>>
>>> Put simply: lockdep doesn't provide me with any benefit, so I don't
>>> use it...
>>
>> Sad..
> 
> I don't think you understand. I'll try to explain.
> 
> The lockdep infrastructure by itself doesn't make lockdep a useful
> tool - it mostly generates false positives because it has no
> concept of locking models that don't match it's internal tracking
> assumptions and/or limitations.
> 
> That means if we can't suppress the false positives, then lockdep is
> going to be too noisy to find real problems.  It's taken the XFS
> developers months of work over the past 7-8 years to suppress all
> the *common* false positives that lockdep throws on XFS. And despite
> all that work, there's still too many false positives occuring
> because we can't easily suppress them with annotations. IOWs, the
> signal to noise ratio is still too low for lockdep to find real
> problems.
> 
> That's why lockdep isn't useful to me - the noise floor is too high,
> and the effort to lower the noise floor further is too great.
> 
> This is important, because cross-release just raised the noise floor
> by a large margin and so now we have to spend the time to reduce it
> again back to where it was before cross-release was added.  IOWs,
> adding new detection features to lockdep actually makes lockdep less
> useful for a significant period of time. That length of time is
> dependent on the rate at which subsystem developers can suppress the
> false positives and lower the noise floor back down to an acceptible
> level. And there is always the possibility that we can't get the
> noise floor low enough for lockdep to be a reliable, useful tool for
> some subsystems....
> 
> That's what I don't think you understand - that the most important
> part of lockdep is /not the core infrastructure/ you work on. The
> most important part of lockdep is the annotations that suppress the
> noise floor and allow the real problems to stand out.

I'm sorry to hear that.. If I were you, I would also get
annoyed. And.. thanks for explanation.

But, I think assigning lock classes properly and checking
relationship of the classes to detect deadlocks is reasonable.

In my opinion about the common lockdep stuff, there are 2
problems on it.

1) Firstly, it's hard to assign lock classes *properly*. By
default, it relies on the caller site of lockdep_init_map(),
but we need to assign another class manually, where ordering
rules are complicated so cannot rely on the caller site. That
*only* can be done by experts of the subsystem.

I think if they want to get benifit from lockdep, they have no
choice but to assign classes manually with the domain knowledge,
or use *lockdep_set_novalidate_class()* to invalidate locks
making the developers annoyed and not want to use the checking
for them.

It's a problem of choice between (1) getting benifit from
lockdep by doing something with the domain knowledge, and (2)
giving up the benifit by invalidating locks making them panic.

2) Secondly, I've seen several places where lock_acquire()s
are a little bit wrongly used more than we need. That would add
additional detection capability and make lockdep strong but
increase the possibility to give us more false positives.

If you don't want to work on the additional annotations at the
moment, then I think you can choose an option whatever you
want, and consider locks again you've invalidated, when it
becomes necessary to detect deadlocks involving those locks by
validating those locks back and adding necessary annotations.

Am I missing something?
Theodore Ts'o Dec. 8, 2017, 3:27 p.m. UTC | #17
On Thu, Dec 07, 2017 at 02:38:03PM -0800, Matthew Wilcox wrote:
> I think it was a mistake to force these on for everybody; they have a
> much higher false-positive rate than the rest of lockdep, so as you say
> forcing them on leads to fewer people using *any* of lockdep.
> 
> The bug you're hitting isn't Byungchul's fault; it's an annotation
> problem.  The same kind of annotation problem that we used to have with
> dozens of other places in the kernel which are now fixed.

The question is whose responsibility is it to annotate the code?  On
another thread it was suggested it was ext4's responsibility to add
annotations to avoid the false positives --- never the mind the fact
that every single file system is going to have add annotations.

Also note that the documentation for how to add annotations is
***horrible***.  It's mostly, "try to figure out how other people
added magic cargo cult code which is not well defined (look at the
definitions of lockdep_set_class, lockdep_set_class_and_name,
lockdep_set_class_and_subclass, and lockdep_set_subclass, and weep) in
other subsystems and hope and pray it works for you."

And the explanation when there are failures, either false positives,
or not, are completely opaque.  For example:

[   16.190198] ext4lazyinit/648 is trying to acquire lock:
[   16.191201]  ((gendisk_completion)1 << part_shift(NUMA_NO_NODE)){+.+.}, at: [<8a1ebe9d>] wait_for_completion_io+0x12/0x20

Just try to tell me that:

	((gendisk_completion)1 << part_shift(NUMA_NO_NODE)){+.+.}

is human comprehensible with a straight face.  And since the messages
don't even include the subclass/class/name key annotations, as lockdep
tries to handle things that are more and more complex, I'd argue it
has already crossed the boundary where unless you are a lockdep
developer, good luck trying to understand what is going on or how to
add annotations.

So if you are adding complexity to the kernel with the argument,
"lockdep will save us", I'm with Dave --- it's just not a believable
argument.

Cheers,

						- Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alan Stern Dec. 8, 2017, 5:35 p.m. UTC | #18
On Fri, 8 Dec 2017, Byungchul Park wrote:

> I'm sorry to hear that.. If I were you, I would also get
> annoyed. And.. thanks for explanation.
> 
> But, I think assigning lock classes properly and checking
> relationship of the classes to detect deadlocks is reasonable.
> 
> In my opinion about the common lockdep stuff, there are 2
> problems on it.
> 
> 1) Firstly, it's hard to assign lock classes *properly*. By
> default, it relies on the caller site of lockdep_init_map(),
> but we need to assign another class manually, where ordering
> rules are complicated so cannot rely on the caller site. That
> *only* can be done by experts of the subsystem.
> 
> I think if they want to get benifit from lockdep, they have no
> choice but to assign classes manually with the domain knowledge,
> or use *lockdep_set_novalidate_class()* to invalidate locks
> making the developers annoyed and not want to use the checking
> for them.

Lockdep's no_validate class is used when the locking patterns are too
complicated for lockdep to understand.  Basically, it tells lockdep to
ignore those locks.

The device core uses that class.  The tree of struct devices, each with
its own lock, gets used in many different and complicated ways.  
Lockdep can't understand this -- it doesn't have the ability to
represent an arbitrarily deep hierarchical tree of locks -- so we tell
it to ignore the device locks.

It sounds like XFS may need to do the same thing with its semaphores.

Alan Stern

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matthew Wilcox Dec. 8, 2017, 6:14 p.m. UTC | #19
On Fri, Dec 08, 2017 at 10:27:17AM -0500, Theodore Ts'o wrote:
> So if you are adding complexity to the kernel with the argument,
> "lockdep will save us", I'm with Dave --- it's just not a believable
> argument.

I think that's a gross misrepresentation of what I'm doing.

At the moment, the radix tree actively disables the RCU checking that
enabling lockdep would give us.  It has to, because it has no idea what
lock protects any individual access to the radix tree.  The XArray can
use the RCU checking because it knows that every reference is protected
by either the spinlock or the RCU lock.

Dave was saying that he has a tree which has to be protected by a mutex
because of where it is in the locking hierarchy, and I was vigorously
declining his proposal of allowing him to skip taking the spinlock.

And yes, we have bugs today that I assume we only stumble across every
few billion years (or only on alpha, or only if our compiler gets more
aggressive) because we have missing rcu_dereference annotations.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Dec. 8, 2017, 10:36 p.m. UTC | #20
On Fri, Dec 08, 2017 at 12:35:07PM -0500, Alan Stern wrote:
> On Fri, 8 Dec 2017, Byungchul Park wrote:
> 
> > I'm sorry to hear that.. If I were you, I would also get
> > annoyed. And.. thanks for explanation.
> > 
> > But, I think assigning lock classes properly and checking
> > relationship of the classes to detect deadlocks is reasonable.
> > 
> > In my opinion about the common lockdep stuff, there are 2
> > problems on it.
> > 
> > 1) Firstly, it's hard to assign lock classes *properly*. By
> > default, it relies on the caller site of lockdep_init_map(),
> > but we need to assign another class manually, where ordering
> > rules are complicated so cannot rely on the caller site. That
> > *only* can be done by experts of the subsystem.

Sure, but that's not the issue here. The issue here is the lack of
communication with subsystem experts and that the annotation
complexity warnings given immediately by the subsystem experts were
completely ignored...

> > I think if they want to get benifit from lockdep, they have no
> > choice but to assign classes manually with the domain knowledge,
> > or use *lockdep_set_novalidate_class()* to invalidate locks
> > making the developers annoyed and not want to use the checking
> > for them.
> 
> Lockdep's no_validate class is used when the locking patterns are too
> complicated for lockdep to understand.  Basically, it tells lockdep to
> ignore those locks.

Let me just point out two things here:

	1. Using lockdep_set_novalidate_class() for anything other
	than device->mutex will throw checkpatch warnings. Nice. (*)

	2. lockdep_set_novalidate_class() is completely undocumented
	- it's the first I've ever heard of this functionality. i.e.
	nobody has ever told us there is a mechanism to turn off
	validation of an object; we've *always* been told to "change
	your code and/or fix your annotations" when discussing
	lockdep deficiencies. (**)

> The device core uses that class.  The tree of struct devices, each with
> its own lock, gets used in many different and complicated ways.  
> Lockdep can't understand this -- it doesn't have the ability to
> represent an arbitrarily deep hierarchical tree of locks -- so we tell
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

That largely describes the in-memory structure of XFS, except we
have a forest of lock trees, not just one....

> it to ignore the device locks.
> 
> It sounds like XFS may need to do the same thing with its semaphores.

Who-ever adds semaphore checking to lockdep can add those
annotations. The externalisation of the development cost of new
lockdep functionality is one of the problems here.

-Dave.

(*) checkpatch.pl is considered mostly harmful round here, too,
but that's another rant....

(**) the frequent occurrence of "core code/devs aren't held to the
same rules/standard as everyone else" is another rant I have stored
up for a rainy day.
Dave Chinner Dec. 8, 2017, 10:47 p.m. UTC | #21
On Fri, Dec 08, 2017 at 10:14:38AM -0800, Matthew Wilcox wrote:
> At the moment, the radix tree actively disables the RCU checking that
> enabling lockdep would give us.  It has to, because it has no idea what
> lock protects any individual access to the radix tree.  The XArray can
> use the RCU checking because it knows that every reference is protected
> by either the spinlock or the RCU lock.
> 
> Dave was saying that he has a tree which has to be protected by a mutex
> because of where it is in the locking hierarchy, and I was vigorously
> declining his proposal of allowing him to skip taking the spinlock.

Oh, I wasn't suggesting that you remove the internal tree locking
because we need external locking.

I was trying to point out that the internal locking doesn't remove
the need for external locking,  and that there are cases where
smearing the internal lock outside the XA tree doesn't work, either.
i.e. internal locking doesn't replace all the cases where external
locking is required, and so it's less efficient than the existing
radix tree code.

What I was questioning was the value of replacing the radix tree
code with a less efficient structure just to add lockdep validation
to a tree that doesn't actually need any extra locking validation...

Cheers,

Dave.
Matthew Wilcox Dec. 8, 2017, 11:01 p.m. UTC | #22
On Thu, Dec 07, 2017 at 11:38:43AM +1100, Dave Chinner wrote:
> > > cmpxchg is for replacing a known object in a store - it's not really
> > > intended for doing initial inserts after a lookup tells us there is
> > > nothing in the store.  The radix tree "insert only if empty" makes
> > > sense here, because it naturally takes care of lookup/insert races
> > > via the -EEXIST mechanism.
> > > 
> > > I think that providing xa_store_excl() (which would return -EEXIST
> > > if the entry is not empty) would be a better interface here, because
> > > it matches the semantics of lookup cache population used all over
> > > the kernel....
> > 
> > I'm not thrilled with xa_store_excl(), but I need to think about that
> > a bit more.
> 
> Not fussed about the name - I just think we need a function that
> matches the insert semantics of the code....

I think I have something that works better for you than returning -EEXIST
(because you don't actually want -EEXIST, you want -EAGAIN):

        /* insert the new inode */
-       spin_lock(&pag->pag_ici_lock);
-       error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
-       if (unlikely(error)) {
-               WARN_ON(error != -EEXIST);
-               XFS_STATS_INC(mp, xs_ig_dup);
-               error = -EAGAIN;
-               goto out_preload_end;
-       }
-       spin_unlock(&pag->pag_ici_lock);
-       radix_tree_preload_end();
+       curr = xa_cmpxchg(&pag->pag_ici_xa, agino, NULL, ip, GFP_NOFS);
+       error = __xa_race(curr, -EAGAIN);
+       if (error)
+               goto out_unlock;

...

-out_preload_end:
-       spin_unlock(&pag->pag_ici_lock);
-       radix_tree_preload_end();
+out_unlock:
+       if (error == -EAGAIN)
+               XFS_STATS_INC(mp, xs_ig_dup);

I've changed the behaviour slightly in that returning an -ENOMEM used to
hit a WARN_ON, and I don't think that's the right response -- GFP_NOFS
returning -ENOMEM probably gets you a nice warning already from the
mm code.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Joe Perches Dec. 9, 2017, 5 p.m. UTC | #23
On Sat, 2017-12-09 at 09:36 +1100, Dave Chinner wrote:
> 	1. Using lockdep_set_novalidate_class() for anything other
> 	than device->mutex will throw checkpatch warnings. Nice. (*)
[]
> (*) checkpatch.pl is considered mostly harmful round here, too,
> but that's another rant....

How so?

> (**) the frequent occurrence of "core code/devs aren't held to the
> same rules/standard as everyone else" is another rant I have stored
> up for a rainy day.

Yeah.  I wouldn't mind reading that one...

Rainy season is starting right about now here too.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Dec. 10, 2017, 11:57 p.m. UTC | #24
On Fri, Dec 08, 2017 at 03:01:31PM -0800, Matthew Wilcox wrote:
> On Thu, Dec 07, 2017 at 11:38:43AM +1100, Dave Chinner wrote:
> > > > cmpxchg is for replacing a known object in a store - it's not really
> > > > intended for doing initial inserts after a lookup tells us there is
> > > > nothing in the store.  The radix tree "insert only if empty" makes
> > > > sense here, because it naturally takes care of lookup/insert races
> > > > via the -EEXIST mechanism.
> > > > 
> > > > I think that providing xa_store_excl() (which would return -EEXIST
> > > > if the entry is not empty) would be a better interface here, because
> > > > it matches the semantics of lookup cache population used all over
> > > > the kernel....
> > > 
> > > I'm not thrilled with xa_store_excl(), but I need to think about that
> > > a bit more.
> > 
> > Not fussed about the name - I just think we need a function that
> > matches the insert semantics of the code....
> 
> I think I have something that works better for you than returning -EEXIST
> (because you don't actually want -EEXIST, you want -EAGAIN):
> 
>         /* insert the new inode */
> -       spin_lock(&pag->pag_ici_lock);
> -       error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
> -       if (unlikely(error)) {
> -               WARN_ON(error != -EEXIST);
> -               XFS_STATS_INC(mp, xs_ig_dup);
> -               error = -EAGAIN;
> -               goto out_preload_end;
> -       }
> -       spin_unlock(&pag->pag_ici_lock);
> -       radix_tree_preload_end();
> +       curr = xa_cmpxchg(&pag->pag_ici_xa, agino, NULL, ip, GFP_NOFS);
> +       error = __xa_race(curr, -EAGAIN);
> +       if (error)
> +               goto out_unlock;
> 
> ...
> 
> -out_preload_end:
> -       spin_unlock(&pag->pag_ici_lock);
> -       radix_tree_preload_end();
> +out_unlock:
> +       if (error == -EAGAIN)
> +               XFS_STATS_INC(mp, xs_ig_dup);
> 
> I've changed the behaviour slightly in that returning an -ENOMEM used to
> hit a WARN_ON, and I don't think that's the right response -- GFP_NOFS
> returning -ENOMEM probably gets you a nice warning already from the
> mm code.

It's been a couple of days since I first looked at this, and my
initial reaction of "that's horrible!" hasn't changed.

What you are proposing here might be a perfectly reasonable
*internal implemention* of xa_store_excl(), but it makes for a
terrible external API because the sematics and behaviour are so
vague. e.g. what does "race" mean here with respect to an insert
failure?

i.e.  the fact the cmpxchg failed may not have anything to do with a
race condtion - it failed because the slot wasn't empty like we
expected it to be. There can be any number of reasons the slot isn't
empty - the API should not "document" that the reason the insert
failed was a race condition. It should document the case that we
"couldn't insert because there was an existing entry in the slot".
Let the surrounding code document the reason why that might have
happened - it's not for the API to assume reasons for failure.

i.e. this API and potential internal implementation makes much
more sense:

int
xa_store_iff_empty(...)
{
	curr = xa_cmpxchg(&pag->pag_ici_xa, agino, NULL, ip, GFP_NOFS);
	if (!curr)
		return 0;	/* success! */
	if (!IS_ERR(curr))
		return -EEXIST;	/* failed - slot not empty */
	return PTR_ERR(curr);	/* failed - XA internal issue */
}

as it replaces the existing preload and insert code in the XFS code
paths whilst letting us handle and document the "insert failed
because slot not empty" case however we want. It implies nothing
about *why* the slot wasn't empty, just that we couldn't do the
insert because it wasn't empty.

Cheers,

Dave.
Dave Chinner Dec. 11, 2017, 9:43 p.m. UTC | #25
On Sat, Dec 09, 2017 at 09:00:18AM -0800, Joe Perches wrote:
> On Sat, 2017-12-09 at 09:36 +1100, Dave Chinner wrote:
> > 	1. Using lockdep_set_novalidate_class() for anything other
> > 	than device->mutex will throw checkpatch warnings. Nice. (*)
> []
> > (*) checkpatch.pl is considered mostly harmful round here, too,
> > but that's another rant....
> 
> How so?

Short story is that it barfs all over the slightly non-standard
coding style used in XFS.  It generates enough noise on incidental
things we aren't important that it complicates simple things. e.g. I
just moved a block of defines from one header to another, and
checkpatch throws about 10 warnings on that because of whitespace.
I'm just moving code - I don't want to change it and it doesn't need
to be modified because it is neat and easy to read and is obviously
correct. A bunch of prototypes I added another parameter to gets
warnings because it uses "unsigned" for an existing parameter that
I'm not changing. And so on.

This sort of stuff is just lowest-common-denominator noise - great
for new code and/or inexperienced developers, but not for working
with large bodies of existing code with slightly non-standard
conventions. Churning *lots* of code we otherwise wouldn't touch
just to shut up checkpatch warnings takes time, risks regressions
and makes it harder to trace the history of the code when we are
trying to track down bugs.

Cheers,

Dave.
Joe Perches Dec. 11, 2017, 10:12 p.m. UTC | #26
On Tue, 2017-12-12 at 08:43 +1100, Dave Chinner wrote:
> On Sat, Dec 09, 2017 at 09:00:18AM -0800, Joe Perches wrote:
> > On Sat, 2017-12-09 at 09:36 +1100, Dave Chinner wrote:
> > > 	1. Using lockdep_set_novalidate_class() for anything other
> > > 	than device->mutex will throw checkpatch warnings. Nice. (*)
> > []
> > > (*) checkpatch.pl is considered mostly harmful round here, too,
> > > but that's another rant....
> > 
> > How so?
> 
> Short story is that it barfs all over the slightly non-standard
> coding style used in XFS.
[]
> This sort of stuff is just lowest-common-denominator noise - great
> for new code and/or inexperienced developers, but not for working
> with large bodies of existing code with slightly non-standard
> conventions.

Completely reasonable.  Thanks.

Do you get many checkpatch submitters for fs/xfs?

If so, could probably do something about adding
a checkpatch file flag to the directory or equivalent.

Maybe add something like:

fs/xfs/.checkpatch

where the contents turn off most everything
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matthew Wilcox Dec. 11, 2017, 10:43 p.m. UTC | #27
On Mon, Dec 11, 2017 at 02:12:28PM -0800, Joe Perches wrote:
> Completely reasonable.  Thanks.

If we're doing "completely reasonable" complaints, then ...

 - I don't understand why plain 'unsigned' is deemed bad.

 - The rule about all function parameters in prototypes having a name
   doesn't make sense.  Example:

int ida_get_new_above(struct ida *ida, int starting_id, int *p_id);

   There is zero additional value in naming 'ida'.  I know it's an ida.
   The struct name tells me that.  If there're two struct ida pointers
   in the prototype, then sure, I want to name them so I know which is
   which (maybe 'src' and 'dst').  Having an unadorned 'int' parameter
   to a function should be a firable offence.  But there's no need to
   call 'gfp_t' anything.  We know it's a gfp_t.  Adding 'gfp_mask'
   after it doesn't tell us anything extra.

 - Forcing a blank line after variable declarations sometimes makes for
   some weird-looking code.  For example, there is no problem with this
   code (from a checkpatch PoV):

        if (xa_is_sibling(entry)) {
                offset = xa_to_sibling(entry);
                entry = xa_entry(xas->xa, node, offset);
                /* Move xa_index to the first index of this entry */
                xas->xa_index = (((xas->xa_index >> node->shift) &
                                  ~XA_CHUNK_MASK) | offset) << node->shift;
        }

   but if I decide I don't need 'offset' outside this block, and I want
   to move the declaration inside, it looks like this:

        if (xa_is_sibling(entry)) {
                unsigned int offset = xa_to_sibling(entry);

                entry = xa_entry(xas->xa, node, offset);
                /* Move xa_index to the first index of this entry */
                xas->xa_index = (((xas->xa_index >> node->shift) &
                                  ~XA_CHUNK_MASK) | offset) << node->shift;
        }

   Does that blank line really add anything to your comprehension of the
   block?  It upsets my train of thought.

   Constructively, I think this warning can be suppressed for blocks
   that are under, say, 8 lines.  Or maybe indented blocks is where I don't
   want this warning.  Not sure.

   Here's another example where I don't think the blank line adds anything:

static inline int xa_store_empty(struct xarray *xa, unsigned long index,
                void *entry, gfp_t gfp, int errno)
{
        void *curr = xa_cmpxchg(xa, index, NULL, entry, gfp);
        if (!curr)
                return 0;
        if (xa_is_err(curr))
                return xa_err(curr);
        return errno;
}

   So line count definitely has something to do with it.

 - There's no warning for the first paragraph of section 6:

6) Functions
------------

Functions should be short and sweet, and do just one thing.  They should
fit on one or two screenfuls of text (the ISO/ANSI screen size is 80x24,
as we all know), and do one thing and do that well.

   I'm not expecting you to be able to write a perl script that checks
   the first line, but we have way too many 200-plus line functions in
   the kernel.  I'd like a warning on anything over 200 lines (a factor
   of 4 over Linus's stated goal).

 - I don't understand the error for xa_head here:

struct xarray {
        spinlock_t      xa_lock;
        gfp_t           xa_flags;
        void __rcu *    xa_head;
};

   Do people really think that:

struct xarray {
        spinlock_t      xa_lock;
        gfp_t           xa_flags;
        void __rcu	*xa_head;
};

   is more aesthetically pleasing?  And not just that, but it's an *error*
   so the former is *RIGHT* and this is *WRONG*.  And not just a matter
   of taste?
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Dec. 11, 2017, 11:38 p.m. UTC | #28
On Mon, Dec 11, 2017 at 02:12:28PM -0800, Joe Perches wrote:
> On Tue, 2017-12-12 at 08:43 +1100, Dave Chinner wrote:
> > On Sat, Dec 09, 2017 at 09:00:18AM -0800, Joe Perches wrote:
> > > On Sat, 2017-12-09 at 09:36 +1100, Dave Chinner wrote:
> > > > 	1. Using lockdep_set_novalidate_class() for anything other
> > > > 	than device->mutex will throw checkpatch warnings. Nice. (*)
> > > []
> > > > (*) checkpatch.pl is considered mostly harmful round here, too,
> > > > but that's another rant....
> > > 
> > > How so?
> > 
> > Short story is that it barfs all over the slightly non-standard
> > coding style used in XFS.
> []
> > This sort of stuff is just lowest-common-denominator noise - great
> > for new code and/or inexperienced developers, but not for working
> > with large bodies of existing code with slightly non-standard
> > conventions.
> 
> Completely reasonable.  Thanks.
> 
> Do you get many checkpatch submitters for fs/xfs?

We used to. Not recently, though.

Cheers,

Dave.
Joe Perches Dec. 11, 2017, 11:46 p.m. UTC | #29
On Mon, 2017-12-11 at 14:43 -0800, Matthew Wilcox wrote:
> On Mon, Dec 11, 2017 at 02:12:28PM -0800, Joe Perches wrote:
> > Completely reasonable.  Thanks.
> 
> If we're doing "completely reasonable" complaints, then ...
> 
>  - I don't understand why plain 'unsigned' is deemed bad.

That was a David Miller preference.

>  - The rule about all function parameters in prototypes having a name
>    doesn't make sense.  Example:
> 
> int ida_get_new_above(struct ida *ida, int starting_id, int *p_id);

Improvements to regex welcomed.

>  - Forcing a blank line after variable declarations sometimes makes for
>    some weird-looking code.

True.  I don't care for this one myself.
>    Constructively, I think this warning can be suppressed for blocks
>    that are under, say, 8 lines.

Not easy to do as checkpatch works on patches.

> 6) Functions
> ------------
> 
> Functions should be short and sweet, and do just one thing.  They should
> fit on one or two screenfuls of text (the ISO/ANSI screen size is 80x24,
> as we all know), and do one thing and do that well.
> 
>    I'm not expecting you to be able to write a perl script that checks
>    the first line, but we have way too many 200-plus line functions in
>    the kernel.  I'd like a warning on anything over 200 lines (a factor
>    of 4 over Linus's stated goal).

Maybe reasonable.
Some declaration blocks for things like:

void foo(void)
{
	static const struct foobar array[] = {
		{ long count of lines... };
	[body]
}

might make that warning unreasonable though.

>  - I don't understand the error for xa_head here:
> 
> struct xarray {
>         spinlock_t      xa_lock;
>         gfp_t           xa_flags;
>         void __rcu *    xa_head;
> };
> 
>    Do people really think that:
> 
> struct xarray {
>         spinlock_t      xa_lock;
>         gfp_t           xa_flags;
>         void __rcu	*xa_head;
> };
> 
>    is more aesthetically pleasing?  And not just that, but it's an *error*
>    so the former is *RIGHT* and this is *WRONG*.  And not just a matter
>    of taste?

No opinion really.
That's from Andy Whitcroft's original implementation.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alan Stern Dec. 12, 2017, 3:51 p.m. UTC | #30
On Mon, 11 Dec 2017, Joe Perches wrote:

> >  - I don't understand the error for xa_head here:
> > 
> > struct xarray {
> >         spinlock_t      xa_lock;
> >         gfp_t           xa_flags;
> >         void __rcu *    xa_head;
> > };
> > 
> >    Do people really think that:
> > 
> > struct xarray {
> >         spinlock_t      xa_lock;
> >         gfp_t           xa_flags;
> >         void __rcu	*xa_head;
> > };
> > 
> >    is more aesthetically pleasing?  And not just that, but it's an *error*
> >    so the former is *RIGHT* and this is *WRONG*.  And not just a matter

Not sure what was meant here.  Neither one is *WRONG* in the sense of 
being invalid C code.  In the sense of what checkpatch will accept, the 
former is *WRONG* and the latter is *RIGHT* -- the opposite of what 
was written.

> >    of taste?
> 
> No opinion really.
> That's from Andy Whitcroft's original implementation.

This one, at least, is easy to explain.  The original version tends to
lead to bugs, or easily misunderstood code.  Consider if another
variable was added to the declaration; it could easily turn into:

	void __rcu *	xa_head, xa_head2;

(The compiler will reject this, but it wouldn't if the underlying type
had been int instead of void.)

Doing it the other way makes the meaning a lot more clear:

	void __rcu	*xa_head, *xa_head2;

This is an idiom specifically intended to reduce the likelihood of 
errors.  Rather like avoiding assignments inside conditionals.

Alan Stern

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Knut Omang Dec. 21, 2017, 12:05 p.m. UTC | #31
Joe Perches <joe@perches.com> writes:

> On Tue, 2017-12-12 at 08:43 +1100, Dave Chinner wrote:
>> On Sat, Dec 09, 2017 at 09:00:18AM -0800, Joe Perches wrote:
>> > On Sat, 2017-12-09 at 09:36 +1100, Dave Chinner wrote:
>> > > 	1. Using lockdep_set_novalidate_class() for anything other
>> > > 	than device->mutex will throw checkpatch warnings. Nice. (*)
>> > []
>> > > (*) checkpatch.pl is considered mostly harmful round here, too,
>> > > but that's another rant....
>> > 
>> > How so?
>> 
>> Short story is that it barfs all over the slightly non-standard
>> coding style used in XFS.
> []
>> This sort of stuff is just lowest-common-denominator noise - great
>> for new code and/or inexperienced developers, but not for working
>> with large bodies of existing code with slightly non-standard
>> conventions.
>
> Completely reasonable.  Thanks.
>
> Do you get many checkpatch submitters for fs/xfs?
>
> If so, could probably do something about adding
> a checkpatch file flag to the directory or equivalent.
>
> Maybe add something like:
>
> fs/xfs/.checkpatch
>
> where the contents turn off most everything

I propose a more fine grained and configurable form of this in

   https://lkml.org/lkml/2017/12/16/343

that also handles sparse and other checkers in a similar way.

Thanks,
Knut

> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index f8a674d7f092..2179bede5396 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -101,10 +101,9 @@ 
  * an infinite loop in the code.
  */
 struct xfs_mru_cache {
-	struct radix_tree_root	store;     /* Core storage data structure.  */
+	struct xarray		store;     /* Core storage data structure.  */
 	struct list_head	*lists;    /* Array of lists, one per grp.  */
 	struct list_head	reap_list; /* Elements overdue for reaping. */
-	spinlock_t		lock;      /* Lock to protect this struct.  */
 	unsigned int		grp_count; /* Number of discrete groups.    */
 	unsigned int		grp_time;  /* Time period spanned by grps.  */
 	unsigned int		lru_grp;   /* Group containing time zero.   */
@@ -232,22 +231,21 @@  _xfs_mru_cache_list_insert(
  * data store, removing it from the reap list, calling the client's free
  * function and deleting the element from the element zone.
  *
- * We get called holding the mru->lock, which we drop and then reacquire.
- * Sparse need special help with this to tell it we know what we are doing.
+ * We get called holding the mru->store lock, which we drop and then reacquire.
+ * Sparse needs special help with this to tell it we know what we are doing.
  */
 STATIC void
 _xfs_mru_cache_clear_reap_list(
 	struct xfs_mru_cache	*mru)
-		__releases(mru->lock) __acquires(mru->lock)
+		__releases(mru->store) __acquires(mru->store)
 {
 	struct xfs_mru_cache_elem *elem, *next;
 	struct list_head	tmp;
 
 	INIT_LIST_HEAD(&tmp);
 	list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) {
-
 		/* Remove the element from the data store. */
-		radix_tree_delete(&mru->store, elem->key);
+		__xa_erase(&mru->store, elem->key);
 
 		/*
 		 * remove to temp list so it can be freed without
@@ -255,14 +253,14 @@  _xfs_mru_cache_clear_reap_list(
 		 */
 		list_move(&elem->list_node, &tmp);
 	}
-	spin_unlock(&mru->lock);
+	xa_unlock(&mru->store);
 
 	list_for_each_entry_safe(elem, next, &tmp, list_node) {
 		list_del_init(&elem->list_node);
 		mru->free_func(elem);
 	}
 
-	spin_lock(&mru->lock);
+	xa_lock(&mru->store);
 }
 
 /*
@@ -284,7 +282,7 @@  _xfs_mru_cache_reap(
 	if (!mru || !mru->lists)
 		return;
 
-	spin_lock(&mru->lock);
+	xa_lock(&mru->store);
 	next = _xfs_mru_cache_migrate(mru, jiffies);
 	_xfs_mru_cache_clear_reap_list(mru);
 
@@ -298,7 +296,7 @@  _xfs_mru_cache_reap(
 		queue_delayed_work(xfs_mru_reap_wq, &mru->work, next);
 	}
 
-	spin_unlock(&mru->lock);
+	xa_unlock(&mru->store);
 }
 
 int
@@ -358,13 +356,8 @@  xfs_mru_cache_create(
 	for (grp = 0; grp < mru->grp_count; grp++)
 		INIT_LIST_HEAD(mru->lists + grp);
 
-	/*
-	 * We use GFP_KERNEL radix tree preload and do inserts under a
-	 * spinlock so GFP_ATOMIC is appropriate for the radix tree itself.
-	 */
-	INIT_RADIX_TREE(&mru->store, GFP_ATOMIC);
+	xa_init(&mru->store);
 	INIT_LIST_HEAD(&mru->reap_list);
-	spin_lock_init(&mru->lock);
 	INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap);
 
 	mru->grp_time  = grp_time;
@@ -394,17 +387,17 @@  xfs_mru_cache_flush(
 	if (!mru || !mru->lists)
 		return;
 
-	spin_lock(&mru->lock);
+	xa_lock(&mru->store);
 	if (mru->queued) {
-		spin_unlock(&mru->lock);
+		xa_unlock(&mru->store);
 		cancel_delayed_work_sync(&mru->work);
-		spin_lock(&mru->lock);
+		xa_lock(&mru->store);
 	}
 
 	_xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time);
 	_xfs_mru_cache_clear_reap_list(mru);
 
-	spin_unlock(&mru->lock);
+	xa_unlock(&mru->store);
 }
 
 void
@@ -431,24 +424,24 @@  xfs_mru_cache_insert(
 	unsigned long		key,
 	struct xfs_mru_cache_elem *elem)
 {
+	XA_STATE(xas, &mru->store, key);
 	int			error;
 
 	ASSERT(mru && mru->lists);
 	if (!mru || !mru->lists)
 		return -EINVAL;
 
-	if (radix_tree_preload(GFP_NOFS))
-		return -ENOMEM;
-
 	INIT_LIST_HEAD(&elem->list_node);
 	elem->key = key;
 
-	spin_lock(&mru->lock);
-	error = radix_tree_insert(&mru->store, key, elem);
-	radix_tree_preload_end();
-	if (!error)
-		_xfs_mru_cache_list_insert(mru, elem);
-	spin_unlock(&mru->lock);
+	do {
+		xas_lock(&xas);
+		xas_store(&xas, elem);
+		error = xas_error(&xas);
+		if (!error)
+			_xfs_mru_cache_list_insert(mru, elem);
+		xas_unlock(&xas);
+	} while (xas_nomem(&xas, GFP_NOFS));
 
 	return error;
 }
@@ -470,11 +463,11 @@  xfs_mru_cache_remove(
 	if (!mru || !mru->lists)
 		return NULL;
 
-	spin_lock(&mru->lock);
-	elem = radix_tree_delete(&mru->store, key);
+	xa_lock(&mru->store);
+	elem = __xa_erase(&mru->store, key);
 	if (elem)
 		list_del(&elem->list_node);
-	spin_unlock(&mru->lock);
+	xa_unlock(&mru->store);
 
 	return elem;
 }
@@ -520,20 +513,21 @@  xfs_mru_cache_lookup(
 	struct xfs_mru_cache	*mru,
 	unsigned long		key)
 {
+	XA_STATE(xas, &mru->store, key);
 	struct xfs_mru_cache_elem *elem;
 
 	ASSERT(mru && mru->lists);
 	if (!mru || !mru->lists)
 		return NULL;
 
-	spin_lock(&mru->lock);
-	elem = radix_tree_lookup(&mru->store, key);
+	xas_lock(&xas);
+	elem = xas_load(&xas);
 	if (elem) {
 		list_del(&elem->list_node);
 		_xfs_mru_cache_list_insert(mru, elem);
-		__release(mru_lock); /* help sparse not be stupid */
+		__release(&xas); /* help sparse not be stupid */
 	} else
-		spin_unlock(&mru->lock);
+		xas_unlock(&xas);
 
 	return elem;
 }
@@ -546,7 +540,7 @@  xfs_mru_cache_lookup(
 void
 xfs_mru_cache_done(
 	struct xfs_mru_cache	*mru)
-		__releases(mru->lock)
+		__releases(mru->store)
 {
-	spin_unlock(&mru->lock);
+	xa_unlock(&mru->store);
 }