diff mbox series

[2/3] xfs: convert perag lookup to xarray

Message ID 20240812063143.3806677-3-hch@lst.de (mailing list archive)
State Superseded, archived
Headers show
Series [1/3] xarray: add xa_set | expand

Commit Message

Christoph Hellwig Aug. 12, 2024, 6:31 a.m. UTC
Convert the perag lookup from the legacy radix tree to the xarray,
which allows for much nicer iteration and bulk lookup semantics.

Note that this removes the helpers for tagged get and grab and the
for_each* wrappers built around them and instead uses the xa_for_each*
iteration helpers directly in xfs_icache.c, which simplifies the code
nicely.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_ag.c | 81 +++++-------------------------------------
 fs/xfs/libxfs/xfs_ag.h | 11 ------
 fs/xfs/xfs_icache.c    | 77 +++++++++++++++++++++++++--------------
 fs/xfs/xfs_mount.h     |  3 +-
 fs/xfs/xfs_super.c     |  3 +-
 fs/xfs/xfs_trace.h     |  3 +-
 6 files changed, 61 insertions(+), 117 deletions(-)

Comments

Matthew Wilcox Aug. 12, 2024, 2:39 p.m. UTC | #1
On Mon, Aug 12, 2024 at 08:31:01AM +0200, Christoph Hellwig wrote:
> @@ -1020,14 +1016,16 @@ xfs_reclaim_inodes_count(
>  	struct xfs_mount	*mp)
>  {
>  	struct xfs_perag	*pag;
> -	xfs_agnumber_t		ag = 0;
> +	unsigned long		index = 0;
>  	long			reclaimable = 0;
>  
> -	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
> -		ag = pag->pag_agno + 1;
> +	rcu_read_lock();
> +	xa_for_each_marked(&mp->m_perags, index, pag, XFS_ICI_RECLAIM_TAG) {
> +		trace_xfs_reclaim_inodes_count(pag, _THIS_IP_);
>  		reclaimable += pag->pag_ici_reclaimable;
> -		xfs_perag_put(pag);
>  	}
> +	rcu_read_unlock();

Would you rather use xas_for_each_marked() here?  O(n) rather than
O(n.log(n)).

Other than that, looks like a straightforward and correct conversion.
Christoph Hellwig Aug. 12, 2024, 2:43 p.m. UTC | #2
On Mon, Aug 12, 2024 at 03:39:33PM +0100, Matthew Wilcox wrote:
> > +	rcu_read_lock();
> > +	xa_for_each_marked(&mp->m_perags, index, pag, XFS_ICI_RECLAIM_TAG) {
> > +		trace_xfs_reclaim_inodes_count(pag, _THIS_IP_);
> >  		reclaimable += pag->pag_ici_reclaimable;
> > -		xfs_perag_put(pag);
> >  	}
> > +	rcu_read_unlock();
> 
> Would you rather use xas_for_each_marked() here?  O(n) rather than
> O(n.log(n)).
>
> Other than that, looks like a straightforward and correct conversion.

That probably would be more efficient, but the API feels like awkward
due to the required end argument on top of the slightly cumbersome
but not really horrible caller provided XA_STATE().

Is there any good reason why there's isn't an O(1) version that is
just as simple to use as xa_for_each_marked?
Pankaj Raghav (Samsung) Aug. 12, 2024, 4:39 p.m. UTC | #3
On Mon, Aug 12, 2024 at 08:31:01AM +0200, Christoph Hellwig wrote:
> diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
> index 7e80732cb54708..5efb1e8b4107a9 100644
> --- a/fs/xfs/libxfs/xfs_ag.c
> +++ b/fs/xfs/libxfs/xfs_ag.c
> @@ -46,7 +46,7 @@ xfs_perag_get(
>  	struct xfs_perag	*pag;
>  
>  	rcu_read_lock();
xa_load() already calls rcu_read_lock(). So we can get rid of this I
guess?
> -	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
> -xfs_perag_get_tag(
> -	struct xfs_mount	*mp,
> -	xfs_agnumber_t		first,
> -	unsigned int		tag)
> -{
> -	struct xfs_perag	*pag;
> -	int			found;
> -
> -	rcu_read_lock();
> -	found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
> -					(void **)&pag, first, 1, tag);
> -	if (found <= 0) {
> -		rcu_read_unlock();
> -		return NULL;
> -	}
> -	trace_xfs_perag_get_tag(pag, _RET_IP_);
> -	atomic_inc(&pag->pag_ref);
> -	rcu_read_unlock();
> -	return pag;
> -}
> -
>  /* Get a passive reference to the given perag. */
>  struct xfs_perag *
>  xfs_perag_hold(
> @@ -117,38 +92,13 @@ xfs_perag_grab(
>  	struct xfs_perag	*pag;
>  
>  	rcu_read_lock();
Same here.
Matthew Wilcox Aug. 12, 2024, 7:02 p.m. UTC | #4
On Mon, Aug 12, 2024 at 04:39:56PM +0000, Pankaj Raghav (Samsung) wrote:
> On Mon, Aug 12, 2024 at 08:31:01AM +0200, Christoph Hellwig wrote:
> > diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
> > index 7e80732cb54708..5efb1e8b4107a9 100644
> > --- a/fs/xfs/libxfs/xfs_ag.c
> > +++ b/fs/xfs/libxfs/xfs_ag.c
> > @@ -46,7 +46,7 @@ xfs_perag_get(
> >  	struct xfs_perag	*pag;
> >  
> >  	rcu_read_lock();
> xa_load() already calls rcu_read_lock(). So we can get rid of this I
> guess?

Almost certainly not; I assume pag is RCU-freed, so you'd be introducing
a UAF if the RCU lock is dropped before getting a refcount on the pag.
kernel test robot Aug. 13, 2024, 6:37 a.m. UTC | #5
Hi Christoph,

kernel test robot noticed the following build warnings:

[auto build test WARNING on xfs-linux/for-next]
[also build test WARNING on akpm-mm/mm-nonmm-unstable linus/master v6.11-rc3 next-20240813]
[cannot apply to djwong-xfs/djwong-devel]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Christoph-Hellwig/xfs-convert-perag-lookup-to-xarray/20240812-183447
base:   https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git for-next
patch link:    https://lore.kernel.org/r/20240812063143.3806677-3-hch%40lst.de
patch subject: [PATCH 2/3] xfs: convert perag lookup to xarray
config: x86_64-randconfig-122-20240813 (https://download.01.org/0day-ci/archive/20240813/202408131403.L5SrjEa7-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240813/202408131403.L5SrjEa7-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202408131403.L5SrjEa7-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> fs/xfs/xfs_icache.c:1776:9: sparse: sparse: incorrect type in argument 4 (different base types) @@     expected restricted xa_mark_t [usertype] @@     got unsigned int enum xfs_icwalk_goal goal @@
   fs/xfs/xfs_icache.c:1776:9: sparse:     expected restricted xa_mark_t [usertype]
   fs/xfs/xfs_icache.c:1776:9: sparse:     got unsigned int enum xfs_icwalk_goal goal
>> fs/xfs/xfs_icache.c:1776:9: sparse: sparse: incorrect type in argument 4 (different base types) @@     expected restricted xa_mark_t [usertype] @@     got unsigned int enum xfs_icwalk_goal goal @@
   fs/xfs/xfs_icache.c:1776:9: sparse:     expected restricted xa_mark_t [usertype]
   fs/xfs/xfs_icache.c:1776:9: sparse:     got unsigned int enum xfs_icwalk_goal goal
>> fs/xfs/xfs_icache.c:244:51: sparse: sparse: incorrect type in argument 3 (different base types) @@     expected restricted xa_mark_t [usertype] @@     got unsigned int tag @@
   fs/xfs/xfs_icache.c:244:51: sparse:     expected restricted xa_mark_t [usertype]
   fs/xfs/xfs_icache.c:244:51: sparse:     got unsigned int tag
   fs/xfs/xfs_icache.c:286:53: sparse: sparse: incorrect type in argument 3 (different base types) @@     expected restricted xa_mark_t [usertype] @@     got unsigned int tag @@
   fs/xfs/xfs_icache.c:286:53: sparse:     expected restricted xa_mark_t [usertype]
   fs/xfs/xfs_icache.c:286:53: sparse:     got unsigned int tag
>> fs/xfs/xfs_icache.c:1379:9: sparse: sparse: incorrect type in argument 4 (different base types) @@     expected restricted xa_mark_t [usertype] @@     got int @@
   fs/xfs/xfs_icache.c:1379:9: sparse:     expected restricted xa_mark_t [usertype]
   fs/xfs/xfs_icache.c:1379:9: sparse:     got int
>> fs/xfs/xfs_icache.c:1379:9: sparse: sparse: incorrect type in argument 4 (different base types) @@     expected restricted xa_mark_t [usertype] @@     got int @@
   fs/xfs/xfs_icache.c:1379:9: sparse:     expected restricted xa_mark_t [usertype]
   fs/xfs/xfs_icache.c:1379:9: sparse:     got int
   fs/xfs/xfs_icache.c:1509:9: sparse: sparse: incorrect type in argument 4 (different base types) @@     expected restricted xa_mark_t [usertype] @@     got int @@
   fs/xfs/xfs_icache.c:1509:9: sparse:     expected restricted xa_mark_t [usertype]
   fs/xfs/xfs_icache.c:1509:9: sparse:     got int
   fs/xfs/xfs_icache.c:1509:9: sparse: sparse: incorrect type in argument 4 (different base types) @@     expected restricted xa_mark_t [usertype] @@     got int @@
   fs/xfs/xfs_icache.c:1509:9: sparse:     expected restricted xa_mark_t [usertype]
   fs/xfs/xfs_icache.c:1509:9: sparse:     got int
   fs/xfs/xfs_icache.c:1515:9: sparse: sparse: incorrect type in argument 4 (different base types) @@     expected restricted xa_mark_t [usertype] @@     got int @@
   fs/xfs/xfs_icache.c:1515:9: sparse:     expected restricted xa_mark_t [usertype]
   fs/xfs/xfs_icache.c:1515:9: sparse:     got int
   fs/xfs/xfs_icache.c:1515:9: sparse: sparse: incorrect type in argument 4 (different base types) @@     expected restricted xa_mark_t [usertype] @@     got int @@
   fs/xfs/xfs_icache.c:1515:9: sparse:     expected restricted xa_mark_t [usertype]
   fs/xfs/xfs_icache.c:1515:9: sparse:     got int
   fs/xfs/xfs_icache.c: note: in included file (through include/linux/rbtree.h, include/linux/mm_types.h, include/linux/mmzone.h, ...):
   include/linux/rcupdate.h:869:25: sparse: sparse: context imbalance in 'xfs_iget_recycle' - unexpected unlock
   fs/xfs/xfs_icache.c:580:28: sparse: sparse: context imbalance in 'xfs_iget_cache_hit' - different lock contexts for basic block

vim +1776 fs/xfs/xfs_icache.c

  1762	
  1763	/* Walk all incore inodes to achieve a given goal. */
  1764	static int
  1765	xfs_icwalk(
  1766		struct xfs_mount	*mp,
  1767		enum xfs_icwalk_goal	goal,
  1768		struct xfs_icwalk	*icw)
  1769	{
  1770		struct xfs_perag	*pag;
  1771		int			error = 0;
  1772		int			last_error = 0;
  1773		unsigned long		index = 0;
  1774	
  1775		rcu_read_lock();
> 1776		xa_for_each_marked(&mp->m_perags, index, pag, goal) {
  1777			if (!atomic_inc_not_zero(&pag->pag_active_ref))
  1778				continue;
  1779			rcu_read_unlock();
  1780	
  1781			error = xfs_icwalk_ag(pag, goal, icw);
  1782			xfs_perag_rele(pag);
  1783	
  1784			rcu_read_lock();
  1785			if (error) {
  1786				last_error = error;
  1787				if (error == -EFSCORRUPTED)
  1788					break;
  1789			}
  1790		}
  1791		rcu_read_unlock();
  1792	
  1793		return last_error;
  1794		BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
  1795	}
  1796
Pankaj Raghav (Samsung) Aug. 13, 2024, 8:30 a.m. UTC | #6
On Mon, Aug 12, 2024 at 08:02:36PM +0100, Matthew Wilcox wrote:
> On Mon, Aug 12, 2024 at 04:39:56PM +0000, Pankaj Raghav (Samsung) wrote:
> > On Mon, Aug 12, 2024 at 08:31:01AM +0200, Christoph Hellwig wrote:
> > > diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
> > > index 7e80732cb54708..5efb1e8b4107a9 100644
> > > --- a/fs/xfs/libxfs/xfs_ag.c
> > > +++ b/fs/xfs/libxfs/xfs_ag.c
> > > @@ -46,7 +46,7 @@ xfs_perag_get(
> > >  	struct xfs_perag	*pag;
> > >  
> > >  	rcu_read_lock();
> > xa_load() already calls rcu_read_lock(). So we can get rid of this I
> > guess?
> 
> Almost certainly not; I assume pag is RCU-freed, so you'd be introducing
> a UAF if the RCU lock is dropped before getting a refcount on the pag.

Ah, yes. rcu lock is needed until we get a refcount on pag.
Dave Chinner Aug. 14, 2024, 4:59 a.m. UTC | #7
On Mon, Aug 12, 2024 at 08:31:01AM +0200, Christoph Hellwig wrote:
> Convert the perag lookup from the legacy radix tree to the xarray,
> which allows for much nicer iteration and bulk lookup semantics.
> 
> Note that this removes the helpers for tagged get and grab and the
> for_each* wrappers built around them and instead uses the xa_for_each*
> iteration helpers directly in xfs_icache.c, which simplifies the code
> nicely.

Can we split the implementation change and the API change into two
separate patches, please?

I have no problems with the xarray conversion, I have reservations
about the API changes.

I hav eno idea what the new iteration method that is needed looks
like, but I'd prefer not to be exposing all the perag locking and
reference counting semantics all over the code - the current
iterators were introduced to remove all that stuff from existing
iterators.

This patch makes iteration go back to this model:


	rcu_read_lock()
	xa_for_each....() {
		/* get active or passive ref count */
		....
		rcu_read_unlock();

		/* do work on AG */

		/* put/rele perag */

		/* take RCU lock for next perag lookup */
		rcu_read_lock();
	}
	rcu_read_unlock();


And that feels like a step backward from an API perspective, not
an improvement....

So what's the overall plan for avoiding this sort of mess
everywhere? Can we re-implement the existing iterators more
efficiently with xarray iterators, or does xarray-based iteration
require going back to the old way of open coding all iterations?

> @@ -1493,21 +1497,32 @@ xfs_blockgc_flush_all(
>  	struct xfs_mount	*mp)
>  {
>  	struct xfs_perag	*pag;
> -	xfs_agnumber_t		agno;
> +	unsigned long		index = 0;
>  
>  	trace_xfs_blockgc_flush_all(mp, __return_address);
>  
>  	/*
> -	 * For each blockgc worker, move its queue time up to now.  If it
> -	 * wasn't queued, it will not be requeued.  Then flush whatever's
> -	 * left.
> +	 * For each blockgc worker, move its queue time up to now.  If it wasn't
> +	 * queued, it will not be requeued.  Then flush whatever is left.
>  	 */
> -	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
> -		mod_delayed_work(pag->pag_mount->m_blockgc_wq,
> -				&pag->pag_blockgc_work, 0);
> +	rcu_read_lock();
> +	xa_for_each_marked(&mp->m_perags, index, pag, XFS_ICI_BLOCKGC_TAG)
> +		mod_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work, 0);
> +	rcu_read_unlock();
>
> +	index = 0;
> +	rcu_read_lock();
> +	xa_for_each_marked(&mp->m_perags, index, pag, XFS_ICI_BLOCKGC_TAG) {
> +		if (!atomic_inc_not_zero(&pag->pag_active_ref))
> +			continue;
> +		rcu_read_unlock();
>  
> -	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
>  		flush_delayed_work(&pag->pag_blockgc_work);
> +		xfs_perag_rele(pag);
> +
> +		rcu_read_lock();
> +	}
> +	rcu_read_unlock();

And this is the whole problem with open coding iterators. The first
iterator accesses perag structures and potentially queues them for
work without holding a valid reference to the perag. The second
iteration takes reference counts, so can access the perag safely.

Why are these two iterations different? What makes the first,
non-reference counted iteration safe?

>  
>  	return xfs_inodegc_flush(mp);
>  }
> @@ -1755,18 +1770,26 @@ xfs_icwalk(
>  	struct xfs_perag	*pag;
>  	int			error = 0;
>  	int			last_error = 0;
> -	xfs_agnumber_t		agno;
> +	unsigned long		index = 0;
> +
> +	rcu_read_lock();
> +	xa_for_each_marked(&mp->m_perags, index, pag, goal) {
> +		if (!atomic_inc_not_zero(&pag->pag_active_ref))
> +			continue;
> +		rcu_read_unlock();
>  
> -	for_each_perag_tag(mp, agno, pag, goal) {
>  		error = xfs_icwalk_ag(pag, goal, icw);
> +		xfs_perag_rele(pag);
> +
> +		rcu_read_lock();
>  		if (error) {
>  			last_error = error;
> -			if (error == -EFSCORRUPTED) {
> -				xfs_perag_rele(pag);
> +			if (error == -EFSCORRUPTED)
>  				break;
> -			}
>  		}
>  	}
> +	rcu_read_unlock();
> +

And there's the open coded pattern I talked about earlier that we
introduced the for_each_perag iterators to avoid.

Like I said, converting to xarray - no problems with that. Changing
the iterator API - doesn't seem like a step forwards right now.

-Dave.
Christoph Hellwig Aug. 14, 2024, 5:21 a.m. UTC | #8
On Wed, Aug 14, 2024 at 02:59:23PM +1000, Dave Chinner wrote:
> Can we split the implementation change and the API change into two
> separate patches, please?

I don't think that is entirely possible, but things can be split
a bit more.  I've actually done some of that including more API
changes in the meantime, so this might only need small adjustments
anyway.

> So what's the overall plan for avoiding this sort of mess
> everywhere? Can we re-implement the existing iterators more
> efficiently with xarray iterators, or does xarray-based iteration
> require going back to the old way of open coding all iterations?

We can reimplement them, but they won't be more efficient.  That
being said using the xas iterator for places that don't need to
unlock works really nicely, and I've added a little syntactic sugar
to make this even nicer as in:

	rcu_read_lock();
	for_each_perag_marked(mp, pag, XFS_PERAG_RECLAIM_MARK) {
		/* do stuff */
	}
	rcu_read_unlock();

which is about as good as it gets in terms of efficiency and readability.

For the ones that need to sleep I'm now doing:

	struct xfs_perag        *pag = NULL;

	while ((pag = xfs_iwalk_next_ag(mp, pag, XFS_PERAG_BLOCKGC_MARK))) {
		/* do stuff */
	}

which is also nice, and except for the _MARK stuff due to the sparse
__bitwise annotations can be one in a prep patch.
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 7e80732cb54708..5efb1e8b4107a9 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -46,7 +46,7 @@  xfs_perag_get(
 	struct xfs_perag	*pag;
 
 	rcu_read_lock();
-	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+	pag = xa_load(&mp->m_perags, agno);
 	if (pag) {
 		trace_xfs_perag_get(pag, _RET_IP_);
 		ASSERT(atomic_read(&pag->pag_ref) >= 0);
@@ -56,31 +56,6 @@  xfs_perag_get(
 	return pag;
 }
 
-/*
- * search from @first to find the next perag with the given tag set.
- */
-struct xfs_perag *
-xfs_perag_get_tag(
-	struct xfs_mount	*mp,
-	xfs_agnumber_t		first,
-	unsigned int		tag)
-{
-	struct xfs_perag	*pag;
-	int			found;
-
-	rcu_read_lock();
-	found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
-					(void **)&pag, first, 1, tag);
-	if (found <= 0) {
-		rcu_read_unlock();
-		return NULL;
-	}
-	trace_xfs_perag_get_tag(pag, _RET_IP_);
-	atomic_inc(&pag->pag_ref);
-	rcu_read_unlock();
-	return pag;
-}
-
 /* Get a passive reference to the given perag. */
 struct xfs_perag *
 xfs_perag_hold(
@@ -117,38 +92,13 @@  xfs_perag_grab(
 	struct xfs_perag	*pag;
 
 	rcu_read_lock();
-	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+	pag = xa_load(&mp->m_perags, agno);
 	if (pag) {
 		trace_xfs_perag_grab(pag, _RET_IP_);
 		if (!atomic_inc_not_zero(&pag->pag_active_ref))
 			pag = NULL;
 	}
-	rcu_read_unlock();
-	return pag;
-}
 
-/*
- * search from @first to find the next perag with the given tag set.
- */
-struct xfs_perag *
-xfs_perag_grab_tag(
-	struct xfs_mount	*mp,
-	xfs_agnumber_t		first,
-	int			tag)
-{
-	struct xfs_perag	*pag;
-	int			found;
-
-	rcu_read_lock();
-	found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
-					(void **)&pag, first, 1, tag);
-	if (found <= 0) {
-		rcu_read_unlock();
-		return NULL;
-	}
-	trace_xfs_perag_grab_tag(pag, _RET_IP_);
-	if (!atomic_inc_not_zero(&pag->pag_active_ref))
-		pag = NULL;
 	rcu_read_unlock();
 	return pag;
 }
@@ -256,9 +206,7 @@  xfs_free_perag(
 	xfs_agnumber_t		agno;
 
 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
-		spin_lock(&mp->m_perag_lock);
-		pag = radix_tree_delete(&mp->m_perag_tree, agno);
-		spin_unlock(&mp->m_perag_lock);
+		pag = xa_erase(&mp->m_perags, agno);
 		ASSERT(pag);
 		XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
 		xfs_defer_drain_free(&pag->pag_intents_drain);
@@ -347,9 +295,7 @@  xfs_free_unused_perag_range(
 	xfs_agnumber_t		index;
 
 	for (index = agstart; index < agend; index++) {
-		spin_lock(&mp->m_perag_lock);
-		pag = radix_tree_delete(&mp->m_perag_tree, index);
-		spin_unlock(&mp->m_perag_lock);
+		pag = xa_erase(&mp->m_perags, index);
 		if (!pag)
 			break;
 		xfs_buf_cache_destroy(&pag->pag_bcache);
@@ -390,20 +336,11 @@  xfs_initialize_perag(
 		pag->pag_agno = index;
 		pag->pag_mount = mp;
 
-		error = radix_tree_preload(GFP_KERNEL | __GFP_RETRY_MAYFAIL);
-		if (error)
-			goto out_free_pag;
-
-		spin_lock(&mp->m_perag_lock);
-		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
-			WARN_ON_ONCE(1);
-			spin_unlock(&mp->m_perag_lock);
-			radix_tree_preload_end();
-			error = -EEXIST;
+		error = xa_set(&mp->m_perags, index, pag, GFP_KERNEL);
+		if (error) {
+			WARN_ON_ONCE(error == -EEXIST);
 			goto out_free_pag;
 		}
-		spin_unlock(&mp->m_perag_lock);
-		radix_tree_preload_end();
 
 #ifdef __KERNEL__
 		/* Place kernel structure only init below this point. */
@@ -451,9 +388,7 @@  xfs_initialize_perag(
 
 out_remove_pag:
 	xfs_defer_drain_free(&pag->pag_intents_drain);
-	spin_lock(&mp->m_perag_lock);
-	radix_tree_delete(&mp->m_perag_tree, index);
-	spin_unlock(&mp->m_perag_lock);
+	pag = xa_erase(&mp->m_perags, index);
 out_free_pag:
 	kfree(pag);
 out_unwind_new_pags:
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 35de09a2516c70..b5eee2c787b6b7 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -156,15 +156,11 @@  void xfs_free_perag(struct xfs_mount *mp);
 
 /* Passive AG references */
 struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
-struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
-		unsigned int tag);
 struct xfs_perag *xfs_perag_hold(struct xfs_perag *pag);
 void xfs_perag_put(struct xfs_perag *pag);
 
 /* Active AG references */
 struct xfs_perag *xfs_perag_grab(struct xfs_mount *, xfs_agnumber_t);
-struct xfs_perag *xfs_perag_grab_tag(struct xfs_mount *, xfs_agnumber_t,
-				   int tag);
 void xfs_perag_rele(struct xfs_perag *pag);
 
 /*
@@ -266,13 +262,6 @@  xfs_perag_next(
 	(agno) = 0; \
 	for_each_perag_from((mp), (agno), (pag))
 
-#define for_each_perag_tag(mp, agno, pag, tag) \
-	for ((agno) = 0, (pag) = xfs_perag_grab_tag((mp), 0, (tag)); \
-		(pag) != NULL; \
-		(agno) = (pag)->pag_agno + 1, \
-		xfs_perag_rele(pag), \
-		(pag) = xfs_perag_grab_tag((mp), (agno), (tag)))
-
 static inline struct xfs_perag *
 xfs_perag_next_wrap(
 	struct xfs_perag	*pag,
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index cf629302d48e74..c37f22a4c79f31 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -191,7 +191,7 @@  xfs_reclaim_work_queue(
 {
 
 	rcu_read_lock();
-	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+	if (xa_marked(&mp->m_perags, XFS_ICI_RECLAIM_TAG)) {
 		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
 			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 	}
@@ -241,9 +241,7 @@  xfs_perag_set_inode_tag(
 		return;
 
 	/* propagate the tag up into the perag radix tree */
-	spin_lock(&mp->m_perag_lock);
-	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
-	spin_unlock(&mp->m_perag_lock);
+	xa_set_mark(&mp->m_perags, pag->pag_agno, tag);
 
 	/* start background work */
 	switch (tag) {
@@ -285,9 +283,7 @@  xfs_perag_clear_inode_tag(
 		return;
 
 	/* clear the tag from the perag radix tree */
-	spin_lock(&mp->m_perag_lock);
-	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
-	spin_unlock(&mp->m_perag_lock);
+	xa_clear_mark(&mp->m_perags, pag->pag_agno, tag);
 
 	trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
 }
@@ -977,7 +973,7 @@  xfs_reclaim_inodes(
 	if (xfs_want_reclaim_sick(mp))
 		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
 
-	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+	while (xa_marked(&mp->m_perags, XFS_ICI_RECLAIM_TAG)) {
 		xfs_ail_push_all_sync(mp->m_ail);
 		xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
 	}
@@ -1020,14 +1016,16 @@  xfs_reclaim_inodes_count(
 	struct xfs_mount	*mp)
 {
 	struct xfs_perag	*pag;
-	xfs_agnumber_t		ag = 0;
+	unsigned long		index = 0;
 	long			reclaimable = 0;
 
-	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
-		ag = pag->pag_agno + 1;
+	rcu_read_lock();
+	xa_for_each_marked(&mp->m_perags, index, pag, XFS_ICI_RECLAIM_TAG) {
+		trace_xfs_reclaim_inodes_count(pag, _THIS_IP_);
 		reclaimable += pag->pag_ici_reclaimable;
-		xfs_perag_put(pag);
 	}
+	rcu_read_unlock();
+
 	return reclaimable;
 }
 
@@ -1370,14 +1368,20 @@  xfs_blockgc_start(
 	struct xfs_mount	*mp)
 {
 	struct xfs_perag	*pag;
-	xfs_agnumber_t		agno;
+	unsigned long		index = 0;
 
 	if (xfs_set_blockgc_enabled(mp))
 		return;
 
 	trace_xfs_blockgc_start(mp, __return_address);
-	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+
+	rcu_read_lock();
+	xa_for_each_marked(&mp->m_perags, index, pag, XFS_ICI_BLOCKGC_TAG) {
+		if (!atomic_read(&pag->pag_active_ref))
+			continue;
 		xfs_blockgc_queue(pag);
+	}
+	rcu_read_unlock();
 }
 
 /* Don't try to run block gc on an inode that's in any of these states. */
@@ -1493,21 +1497,32 @@  xfs_blockgc_flush_all(
 	struct xfs_mount	*mp)
 {
 	struct xfs_perag	*pag;
-	xfs_agnumber_t		agno;
+	unsigned long		index = 0;
 
 	trace_xfs_blockgc_flush_all(mp, __return_address);
 
 	/*
-	 * For each blockgc worker, move its queue time up to now.  If it
-	 * wasn't queued, it will not be requeued.  Then flush whatever's
-	 * left.
+	 * For each blockgc worker, move its queue time up to now.  If it wasn't
+	 * queued, it will not be requeued.  Then flush whatever is left.
 	 */
-	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
-		mod_delayed_work(pag->pag_mount->m_blockgc_wq,
-				&pag->pag_blockgc_work, 0);
+	rcu_read_lock();
+	xa_for_each_marked(&mp->m_perags, index, pag, XFS_ICI_BLOCKGC_TAG)
+		mod_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work, 0);
+	rcu_read_unlock();
+
+	index = 0;
+	rcu_read_lock();
+	xa_for_each_marked(&mp->m_perags, index, pag, XFS_ICI_BLOCKGC_TAG) {
+		if (!atomic_inc_not_zero(&pag->pag_active_ref))
+			continue;
+		rcu_read_unlock();
 
-	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
 		flush_delayed_work(&pag->pag_blockgc_work);
+		xfs_perag_rele(pag);
+
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
 
 	return xfs_inodegc_flush(mp);
 }
@@ -1755,18 +1770,26 @@  xfs_icwalk(
 	struct xfs_perag	*pag;
 	int			error = 0;
 	int			last_error = 0;
-	xfs_agnumber_t		agno;
+	unsigned long		index = 0;
+
+	rcu_read_lock();
+	xa_for_each_marked(&mp->m_perags, index, pag, goal) {
+		if (!atomic_inc_not_zero(&pag->pag_active_ref))
+			continue;
+		rcu_read_unlock();
 
-	for_each_perag_tag(mp, agno, pag, goal) {
 		error = xfs_icwalk_ag(pag, goal, icw);
+		xfs_perag_rele(pag);
+
+		rcu_read_lock();
 		if (error) {
 			last_error = error;
-			if (error == -EFSCORRUPTED) {
-				xfs_perag_rele(pag);
+			if (error == -EFSCORRUPTED)
 				break;
-			}
 		}
 	}
+	rcu_read_unlock();
+
 	return last_error;
 	BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
 }
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d0567dfbc0368d..dce2d832e1e6d1 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -208,8 +208,7 @@  typedef struct xfs_mount {
 	 */
 	atomic64_t		m_allocbt_blks;
 
-	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
-	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
+	struct xarray		m_perags;	/* per-ag accounting info */
 	uint64_t		m_resblks;	/* total reserved blocks */
 	uint64_t		m_resblks_avail;/* available reserved blocks */
 	uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 27e9f749c4c7fc..c41b543f2e9121 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2009,8 +2009,7 @@  static int xfs_init_fs_context(
 		return -ENOMEM;
 
 	spin_lock_init(&mp->m_sb_lock);
-	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
-	spin_lock_init(&mp->m_perag_lock);
+	xa_init(&mp->m_perags);
 	mutex_init(&mp->m_growlock);
 	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 180ce697305a92..f3ddee49071c16 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -210,14 +210,13 @@  DEFINE_EVENT(xfs_perag_class, name,	\
 	TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip), \
 	TP_ARGS(pag, caller_ip))
 DEFINE_PERAG_REF_EVENT(xfs_perag_get);
-DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_hold);
 DEFINE_PERAG_REF_EVENT(xfs_perag_put);
 DEFINE_PERAG_REF_EVENT(xfs_perag_grab);
-DEFINE_PERAG_REF_EVENT(xfs_perag_grab_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_rele);
 DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
+DEFINE_PERAG_REF_EVENT(xfs_reclaim_inodes_count);
 
 TRACE_EVENT(xfs_inodegc_worker,
 	TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),