diff mbox series

[4/5] xfs: convert perag lookup to xarray

Message ID 20240821063901.650776-5-hch@lst.de (mailing list archive)
State New
Headers show
Series [1/5] xfs: use kfree_rcu_mightsleep to free the perag structures | expand

Commit Message

Christoph Hellwig Aug. 21, 2024, 6:38 a.m. UTC
Convert the perag lookup from the legacy radix tree to the xarray,
which allows for much nicer iteration and bulk lookup semantics.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_ag.c | 31 +++++++-------------------
 fs/xfs/xfs_icache.c    | 50 +++++++++++++++++++++---------------------
 fs/xfs/xfs_mount.h     |  3 +--
 fs/xfs/xfs_super.c     |  3 +--
 4 files changed, 35 insertions(+), 52 deletions(-)

Comments

Darrick J. Wong Aug. 21, 2024, 4:28 p.m. UTC | #1
On Wed, Aug 21, 2024 at 08:38:31AM +0200, Christoph Hellwig wrote:
> Convert the perag lookup from the legacy radix tree to the xarray,
> which allows for much nicer iteration and bulk lookup semantics.

Looks like a pretty straightforward covnersion.  Is there a good
justification for converting the ici radix tree too?  Or is it too
sparse to be worth doing?

Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_ag.c | 31 +++++++-------------------
>  fs/xfs/xfs_icache.c    | 50 +++++++++++++++++++++---------------------
>  fs/xfs/xfs_mount.h     |  3 +--
>  fs/xfs/xfs_super.c     |  3 +--
>  4 files changed, 35 insertions(+), 52 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
> index 87f00f0180846f..5f0494702e0b55 100644
> --- a/fs/xfs/libxfs/xfs_ag.c
> +++ b/fs/xfs/libxfs/xfs_ag.c
> @@ -46,7 +46,7 @@ xfs_perag_get(
>  	struct xfs_perag	*pag;
>  
>  	rcu_read_lock();
> -	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
> +	pag = xa_load(&mp->m_perags, agno);
>  	if (pag) {
>  		trace_xfs_perag_get(pag, _RET_IP_);
>  		ASSERT(atomic_read(&pag->pag_ref) >= 0);
> @@ -92,7 +92,7 @@ xfs_perag_grab(
>  	struct xfs_perag	*pag;
>  
>  	rcu_read_lock();
> -	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
> +	pag = xa_load(&mp->m_perags, agno);
>  	if (pag) {
>  		trace_xfs_perag_grab(pag, _RET_IP_);
>  		if (!atomic_inc_not_zero(&pag->pag_active_ref))
> @@ -195,9 +195,7 @@ xfs_free_perag(
>  	xfs_agnumber_t		agno;
>  
>  	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> -		spin_lock(&mp->m_perag_lock);
> -		pag = radix_tree_delete(&mp->m_perag_tree, agno);
> -		spin_unlock(&mp->m_perag_lock);
> +		pag = xa_erase(&mp->m_perags, agno);
>  		ASSERT(pag);
>  		XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
>  		xfs_defer_drain_free(&pag->pag_intents_drain);
> @@ -286,9 +284,7 @@ xfs_free_unused_perag_range(
>  	xfs_agnumber_t		index;
>  
>  	for (index = agstart; index < agend; index++) {
> -		spin_lock(&mp->m_perag_lock);
> -		pag = radix_tree_delete(&mp->m_perag_tree, index);
> -		spin_unlock(&mp->m_perag_lock);
> +		pag = xa_erase(&mp->m_perags, index);
>  		if (!pag)
>  			break;
>  		xfs_buf_cache_destroy(&pag->pag_bcache);
> @@ -329,20 +325,11 @@ xfs_initialize_perag(
>  		pag->pag_agno = index;
>  		pag->pag_mount = mp;
>  
> -		error = radix_tree_preload(GFP_KERNEL | __GFP_RETRY_MAYFAIL);
> -		if (error)
> -			goto out_free_pag;
> -
> -		spin_lock(&mp->m_perag_lock);
> -		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
> -			WARN_ON_ONCE(1);
> -			spin_unlock(&mp->m_perag_lock);
> -			radix_tree_preload_end();
> -			error = -EEXIST;
> +		error = xa_insert(&mp->m_perags, index, pag, GFP_KERNEL);
> +		if (error) {
> +			WARN_ON_ONCE(error == -EBUSY);
>  			goto out_free_pag;
>  		}
> -		spin_unlock(&mp->m_perag_lock);
> -		radix_tree_preload_end();
>  
>  #ifdef __KERNEL__
>  		/* Place kernel structure only init below this point. */
> @@ -390,9 +377,7 @@ xfs_initialize_perag(
>  
>  out_remove_pag:
>  	xfs_defer_drain_free(&pag->pag_intents_drain);
> -	spin_lock(&mp->m_perag_lock);
> -	radix_tree_delete(&mp->m_perag_tree, index);
> -	spin_unlock(&mp->m_perag_lock);
> +	pag = xa_erase(&mp->m_perags, index);
>  out_free_pag:
>  	kfree(pag);
>  out_unwind_new_pags:
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index 4d71fbfe71299a..5bca845e702f1d 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -65,6 +65,18 @@ static int xfs_icwalk_ag(struct xfs_perag *pag,
>  					 XFS_ICWALK_FLAG_RECLAIM_SICK | \
>  					 XFS_ICWALK_FLAG_UNION)
>  
> +/* Marks for the perag xarray */
> +#define XFS_PERAG_RECLAIM_MARK	XA_MARK_0
> +#define XFS_PERAG_BLOCKGC_MARK	XA_MARK_1
> +
> +static inline xa_mark_t ici_tag_to_mark(unsigned int tag)
> +{
> +	if (tag == XFS_ICI_RECLAIM_TAG)
> +		return XFS_PERAG_RECLAIM_MARK;
> +	ASSERT(tag == XFS_ICI_BLOCKGC_TAG);
> +	return XFS_PERAG_BLOCKGC_MARK;
> +}
> +
>  /*
>   * Allocate and initialise an xfs_inode.
>   */
> @@ -191,7 +203,7 @@ xfs_reclaim_work_queue(
>  {
>  
>  	rcu_read_lock();
> -	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
> +	if (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) {
>  		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
>  			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
>  	}
> @@ -241,9 +253,7 @@ xfs_perag_set_inode_tag(
>  		return;
>  
>  	/* propagate the tag up into the perag radix tree */
> -	spin_lock(&mp->m_perag_lock);
> -	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
> -	spin_unlock(&mp->m_perag_lock);
> +	xa_set_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag));
>  
>  	/* start background work */
>  	switch (tag) {
> @@ -285,9 +295,7 @@ xfs_perag_clear_inode_tag(
>  		return;
>  
>  	/* clear the tag from the perag radix tree */
> -	spin_lock(&mp->m_perag_lock);
> -	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
> -	spin_unlock(&mp->m_perag_lock);
> +	xa_clear_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag));
>  
>  	trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
>  }
> @@ -302,7 +310,6 @@ xfs_perag_get_next_tag(
>  	unsigned int		tag)
>  {
>  	unsigned long		index = 0;
> -	int			found;
>  
>  	if (pag) {
>  		index = pag->pag_agno + 1;
> @@ -310,14 +317,11 @@ xfs_perag_get_next_tag(
>  	}
>  
>  	rcu_read_lock();
> -	found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
> -					(void **)&pag, index, 1, tag);
> -	if (found <= 0) {
> -		rcu_read_unlock();
> -		return NULL;
> +	pag = xa_find(&mp->m_perags, &index, ULONG_MAX, ici_tag_to_mark(tag));
> +	if (pag) {
> +		trace_xfs_perag_get_next_tag(pag, _RET_IP_);
> +		atomic_inc(&pag->pag_ref);
>  	}
> -	trace_xfs_perag_get_next_tag(pag, _RET_IP_);
> -	atomic_inc(&pag->pag_ref);
>  	rcu_read_unlock();
>  	return pag;
>  }
> @@ -332,7 +336,6 @@ xfs_perag_grab_next_tag(
>  	int			tag)
>  {
>  	unsigned long		index = 0;
> -	int			found;
>  
>  	if (pag) {
>  		index = pag->pag_agno + 1;
> @@ -340,15 +343,12 @@ xfs_perag_grab_next_tag(
>  	}
>  
>  	rcu_read_lock();
> -	found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
> -					(void **)&pag, index, 1, tag);
> -	if (found <= 0) {
> -		rcu_read_unlock();
> -		return NULL;
> +	pag = xa_find(&mp->m_perags, &index, ULONG_MAX, ici_tag_to_mark(tag));
> +	if (pag) {
> +		trace_xfs_perag_grab_next_tag(pag, _RET_IP_);
> +		if (!atomic_inc_not_zero(&pag->pag_active_ref))
> +			pag = NULL;
>  	}
> -	trace_xfs_perag_grab_next_tag(pag, _RET_IP_);
> -	if (!atomic_inc_not_zero(&pag->pag_active_ref))
> -		pag = NULL;
>  	rcu_read_unlock();
>  	return pag;
>  }
> @@ -1038,7 +1038,7 @@ xfs_reclaim_inodes(
>  	if (xfs_want_reclaim_sick(mp))
>  		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
>  
> -	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
> +	while (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) {
>  		xfs_ail_push_all_sync(mp->m_ail);
>  		xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
>  	}
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index d0567dfbc0368d..dce2d832e1e6d1 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -208,8 +208,7 @@ typedef struct xfs_mount {
>  	 */
>  	atomic64_t		m_allocbt_blks;
>  
> -	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
> -	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
> +	struct xarray		m_perags;	/* per-ag accounting info */
>  	uint64_t		m_resblks;	/* total reserved blocks */
>  	uint64_t		m_resblks_avail;/* available reserved blocks */
>  	uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 27e9f749c4c7fc..c41b543f2e9121 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -2009,8 +2009,7 @@ static int xfs_init_fs_context(
>  		return -ENOMEM;
>  
>  	spin_lock_init(&mp->m_sb_lock);
> -	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
> -	spin_lock_init(&mp->m_perag_lock);
> +	xa_init(&mp->m_perags);
>  	mutex_init(&mp->m_growlock);
>  	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
>  	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
> -- 
> 2.43.0
> 
>
Christoph Hellwig Aug. 22, 2024, 3:45 a.m. UTC | #2
On Wed, Aug 21, 2024 at 09:28:10AM -0700, Darrick J. Wong wrote:
> On Wed, Aug 21, 2024 at 08:38:31AM +0200, Christoph Hellwig wrote:
> > Convert the perag lookup from the legacy radix tree to the xarray,
> > which allows for much nicer iteration and bulk lookup semantics.
> 
> Looks like a pretty straightforward covnersion.  Is there a good
> justification for converting the ici radix tree too?  Or is it too
> sparse to be worth doing?

radix trees and xarrays have pretty similar behavior related to
sparseness or waste of interior nodes due to it.  So unless we find a
better data structure for it, it would be worthwhile.

But the ici radix tree does pretty funny things in terms of also
protecting other fields with the lock synchronizing it, so the conversion
is fairly complicated and I don't feel like doing it right now, at least
no without evaluating if for example a rthashtable might actually be
the better data structure here.  The downside of the rthashtable is
that it doens't support tags/masks and isn't great for iteration, so it
might very much not be very suitable.
Dave Chinner Aug. 27, 2024, 10:51 p.m. UTC | #3
On Thu, Aug 22, 2024 at 05:45:48AM +0200, Christoph Hellwig wrote:
> On Wed, Aug 21, 2024 at 09:28:10AM -0700, Darrick J. Wong wrote:
> > On Wed, Aug 21, 2024 at 08:38:31AM +0200, Christoph Hellwig wrote:
> > > Convert the perag lookup from the legacy radix tree to the xarray,
> > > which allows for much nicer iteration and bulk lookup semantics.
> > 
> > Looks like a pretty straightforward covnersion.  Is there a good
> > justification for converting the ici radix tree too?  Or is it too
> > sparse to be worth doing?
> 
> radix trees and xarrays have pretty similar behavior related to
> sparseness or waste of interior nodes due to it. 

And the node size is still 64 entries, which matches up with inode
chunk size. Hence a fully populated and cached inode chunk fills
xarray nodes completely, just like the radix tree. Hence if our
inode allocation locality decisions work, we end up with good
population characteristics in the in-memory cache index, too.

> So unless we find a
> better data structure for it, it would be worthwhile.

I have prototype patches to convert the ici radix tree to an xarray.
When I wrote it a few months ago I never got the time to actually
test it because other stuff happened....

> But the ici radix tree does pretty funny things in terms of also
> protecting other fields with the lock synchronizing it, so the conversion
> is fairly complicated

The locking isn't a big deal - I just used xa_lock() and xa_unlock()
to use the internal xarray lock to replace the perag->pag_ici_lock.
This gives the same semantics of making external state and tree
state updates atomic.

e.g. this code in xfs_reclaim_inode():

	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
			       XFS_INO_TO_AGINO(ip->i_mount, ino)))
		ASSERT(0);
	xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
	spin_unlock(&pag->pag_ici_lock);

becomes:

	xa_lock(&pag->pag_icache);
	if (__xa_erase(&pag->pag_icache,
			XFS_INO_TO_AGINO(ip->i_mount, ino)) != ip)
		ASSERT(0);
	xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
	xa_unlock(&pag->pag_icache);

so the clearing of the XFS_ICI_RECLAIM_TAG in the mp->m_perag tree
is still atomic w.r.t. the removal of the inode from the icache
xarray.

> and I don't feel like doing it right now, at least
> no without evaluating if for example a rthashtable might actually be
> the better data structure here.  The downside of the rthashtable is
> that it doens't support tags/masks and isn't great for iteration, so it
> might very much not be very suitable.

The rhashtable is not suited to the inode cache at all. A very
common access pattern is iterating all the inodes in an inode
cluster (e.g. in xfs_iflush_cluster() or during an icwalk) and with
a radix tree or xarray, these lookups all hit the same node and
cachelines. We've optimised this into gang lookups, which means
all the inodes in a cluster are fetched at the same time via
sequential memory access.

Move to a rhashtable makes this iteration mechanism impossible
because the rhashtable is unordered. Every inode we look up now
takes at least one cacheline miss because it's in some other
completely random index in the rhashtable and not adjacent to
the last inode we lookuped up. Worse, we have to dereference each
object we find on the chain to do key matching, so it's at least two
cacheline accesses per inode lookup.

So instead of a cluster lookup of 32 inodes only requiring a few
cacheline accesses to walk down the tree and then 4 sequential
cacheline accesses to retreive all the inode pointers, we have at
least 64 individual random cacheline accesses to get the pointers to
the same number of inodes.

IOWs, a hashtable of any kind is way more inefficient than using the
radix tree or xarray when it comes to the sorts of lockless
sequential access patterns we use internally with the XFS inode
cache.

Keep in mind that I went through all this "scalable structure
analysis" back in 2007 before I replaced the hash table based inode
cache implementation with radix trees. Radix trees were a far better
choice than a hash table way back then, and nothing in our inode
cache access patterns and algorithms has really changed since
then....

-Dave.
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 87f00f0180846f..5f0494702e0b55 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -46,7 +46,7 @@  xfs_perag_get(
 	struct xfs_perag	*pag;
 
 	rcu_read_lock();
-	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+	pag = xa_load(&mp->m_perags, agno);
 	if (pag) {
 		trace_xfs_perag_get(pag, _RET_IP_);
 		ASSERT(atomic_read(&pag->pag_ref) >= 0);
@@ -92,7 +92,7 @@  xfs_perag_grab(
 	struct xfs_perag	*pag;
 
 	rcu_read_lock();
-	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+	pag = xa_load(&mp->m_perags, agno);
 	if (pag) {
 		trace_xfs_perag_grab(pag, _RET_IP_);
 		if (!atomic_inc_not_zero(&pag->pag_active_ref))
@@ -195,9 +195,7 @@  xfs_free_perag(
 	xfs_agnumber_t		agno;
 
 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
-		spin_lock(&mp->m_perag_lock);
-		pag = radix_tree_delete(&mp->m_perag_tree, agno);
-		spin_unlock(&mp->m_perag_lock);
+		pag = xa_erase(&mp->m_perags, agno);
 		ASSERT(pag);
 		XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
 		xfs_defer_drain_free(&pag->pag_intents_drain);
@@ -286,9 +284,7 @@  xfs_free_unused_perag_range(
 	xfs_agnumber_t		index;
 
 	for (index = agstart; index < agend; index++) {
-		spin_lock(&mp->m_perag_lock);
-		pag = radix_tree_delete(&mp->m_perag_tree, index);
-		spin_unlock(&mp->m_perag_lock);
+		pag = xa_erase(&mp->m_perags, index);
 		if (!pag)
 			break;
 		xfs_buf_cache_destroy(&pag->pag_bcache);
@@ -329,20 +325,11 @@  xfs_initialize_perag(
 		pag->pag_agno = index;
 		pag->pag_mount = mp;
 
-		error = radix_tree_preload(GFP_KERNEL | __GFP_RETRY_MAYFAIL);
-		if (error)
-			goto out_free_pag;
-
-		spin_lock(&mp->m_perag_lock);
-		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
-			WARN_ON_ONCE(1);
-			spin_unlock(&mp->m_perag_lock);
-			radix_tree_preload_end();
-			error = -EEXIST;
+		error = xa_insert(&mp->m_perags, index, pag, GFP_KERNEL);
+		if (error) {
+			WARN_ON_ONCE(error == -EBUSY);
 			goto out_free_pag;
 		}
-		spin_unlock(&mp->m_perag_lock);
-		radix_tree_preload_end();
 
 #ifdef __KERNEL__
 		/* Place kernel structure only init below this point. */
@@ -390,9 +377,7 @@  xfs_initialize_perag(
 
 out_remove_pag:
 	xfs_defer_drain_free(&pag->pag_intents_drain);
-	spin_lock(&mp->m_perag_lock);
-	radix_tree_delete(&mp->m_perag_tree, index);
-	spin_unlock(&mp->m_perag_lock);
+	pag = xa_erase(&mp->m_perags, index);
 out_free_pag:
 	kfree(pag);
 out_unwind_new_pags:
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 4d71fbfe71299a..5bca845e702f1d 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -65,6 +65,18 @@  static int xfs_icwalk_ag(struct xfs_perag *pag,
 					 XFS_ICWALK_FLAG_RECLAIM_SICK | \
 					 XFS_ICWALK_FLAG_UNION)
 
+/* Marks for the perag xarray */
+#define XFS_PERAG_RECLAIM_MARK	XA_MARK_0
+#define XFS_PERAG_BLOCKGC_MARK	XA_MARK_1
+
+static inline xa_mark_t ici_tag_to_mark(unsigned int tag)
+{
+	if (tag == XFS_ICI_RECLAIM_TAG)
+		return XFS_PERAG_RECLAIM_MARK;
+	ASSERT(tag == XFS_ICI_BLOCKGC_TAG);
+	return XFS_PERAG_BLOCKGC_MARK;
+}
+
 /*
  * Allocate and initialise an xfs_inode.
  */
@@ -191,7 +203,7 @@  xfs_reclaim_work_queue(
 {
 
 	rcu_read_lock();
-	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+	if (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) {
 		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
 			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 	}
@@ -241,9 +253,7 @@  xfs_perag_set_inode_tag(
 		return;
 
 	/* propagate the tag up into the perag radix tree */
-	spin_lock(&mp->m_perag_lock);
-	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
-	spin_unlock(&mp->m_perag_lock);
+	xa_set_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag));
 
 	/* start background work */
 	switch (tag) {
@@ -285,9 +295,7 @@  xfs_perag_clear_inode_tag(
 		return;
 
 	/* clear the tag from the perag radix tree */
-	spin_lock(&mp->m_perag_lock);
-	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
-	spin_unlock(&mp->m_perag_lock);
+	xa_clear_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag));
 
 	trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
 }
@@ -302,7 +310,6 @@  xfs_perag_get_next_tag(
 	unsigned int		tag)
 {
 	unsigned long		index = 0;
-	int			found;
 
 	if (pag) {
 		index = pag->pag_agno + 1;
@@ -310,14 +317,11 @@  xfs_perag_get_next_tag(
 	}
 
 	rcu_read_lock();
-	found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
-					(void **)&pag, index, 1, tag);
-	if (found <= 0) {
-		rcu_read_unlock();
-		return NULL;
+	pag = xa_find(&mp->m_perags, &index, ULONG_MAX, ici_tag_to_mark(tag));
+	if (pag) {
+		trace_xfs_perag_get_next_tag(pag, _RET_IP_);
+		atomic_inc(&pag->pag_ref);
 	}
-	trace_xfs_perag_get_next_tag(pag, _RET_IP_);
-	atomic_inc(&pag->pag_ref);
 	rcu_read_unlock();
 	return pag;
 }
@@ -332,7 +336,6 @@  xfs_perag_grab_next_tag(
 	int			tag)
 {
 	unsigned long		index = 0;
-	int			found;
 
 	if (pag) {
 		index = pag->pag_agno + 1;
@@ -340,15 +343,12 @@  xfs_perag_grab_next_tag(
 	}
 
 	rcu_read_lock();
-	found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
-					(void **)&pag, index, 1, tag);
-	if (found <= 0) {
-		rcu_read_unlock();
-		return NULL;
+	pag = xa_find(&mp->m_perags, &index, ULONG_MAX, ici_tag_to_mark(tag));
+	if (pag) {
+		trace_xfs_perag_grab_next_tag(pag, _RET_IP_);
+		if (!atomic_inc_not_zero(&pag->pag_active_ref))
+			pag = NULL;
 	}
-	trace_xfs_perag_grab_next_tag(pag, _RET_IP_);
-	if (!atomic_inc_not_zero(&pag->pag_active_ref))
-		pag = NULL;
 	rcu_read_unlock();
 	return pag;
 }
@@ -1038,7 +1038,7 @@  xfs_reclaim_inodes(
 	if (xfs_want_reclaim_sick(mp))
 		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
 
-	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+	while (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) {
 		xfs_ail_push_all_sync(mp->m_ail);
 		xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
 	}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d0567dfbc0368d..dce2d832e1e6d1 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -208,8 +208,7 @@  typedef struct xfs_mount {
 	 */
 	atomic64_t		m_allocbt_blks;
 
-	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
-	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
+	struct xarray		m_perags;	/* per-ag accounting info */
 	uint64_t		m_resblks;	/* total reserved blocks */
 	uint64_t		m_resblks_avail;/* available reserved blocks */
 	uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 27e9f749c4c7fc..c41b543f2e9121 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2009,8 +2009,7 @@  static int xfs_init_fs_context(
 		return -ENOMEM;
 
 	spin_lock_init(&mp->m_sb_lock);
-	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
-	spin_lock_init(&mp->m_perag_lock);
+	xa_init(&mp->m_perags);
 	mutex_init(&mp->m_growlock);
 	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);