
[2/2] btrfs: use xarray for btrfs_root::delayed_nodes_tree instead of radix-tree

Message ID e283f8d460c7b3288e8eb1d8974d6b5842210167.1701384168.git.dsterba@suse.com (mailing list archive)
State New, archived
Series Convert btrfs_root::delayed_nodes_tree to xarray

Commit Message

David Sterba Nov. 30, 2023, 10:49 p.m. UTC
Port btrfs_root::delayed_nodes_tree to the xarray API. The functionality
is equivalent, the flags are still GFP_ATOMIC as the changes are done
under a spin lock. Using a sleeping allocation would need changing the
lock to mutex.

The conversion is almost direct, btrfs_kill_all_delayed_nodes() uses an
iterator to collect the items to delete.

The patch is almost the same as 253bf57555e451 ("btrfs: turn
delayed_nodes_tree into an XArray"), there are renames, comments and
change of the GFP flags for xa_insert.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h         |  6 +--
 fs/btrfs/delayed-inode.c | 80 ++++++++++++++++++++--------------------
 fs/btrfs/disk-io.c       |  3 +-
 fs/btrfs/inode.c         |  2 +-
 4 files changed, 47 insertions(+), 44 deletions(-)

Comments

Filipe Manana Dec. 1, 2023, 11:03 a.m. UTC | #1
On Thu, Nov 30, 2023 at 10:56 PM David Sterba <dsterba@suse.com> wrote:
>
> Port btrfs_root::delayed_nodes_tree to the xarray API. The functionality
> is equivalent, the flags are still GFP_ATOMIC as the changes are done
> under a spin lock. Using a sleeping allocation would need changing the
> lock to mutex.
>
> The conversion is almost direct, btrfs_kill_all_delayed_nodes() uses an
> iterator to collect the items to delete.
>
> The patch is almost the same as 253bf57555e451 ("btrfs: turn
> delayed_nodes_tree into an XArray"), there are renames, comments and
> change of the GFP flags for xa_insert.

Can we include in the changelog why we are converting from radix tree to xarray?
What do we gain with it? Why not the more recent maple tree for example?

Codewise this is about the same amount of LOCs, and I don't find it
particularly simpler or more complex either.
Performance wise, there's no indication of anything.

It also removed the preallocation using GFP_NOFS and now depends on
atomic allocations on insertions.
This is odd because we've been making efforts over time to eliminate
atomic allocations in order to reduce the chances of allocation
failures,
yet we are now allowing for more atomic allocations without mentioning
what we gain in return.

Thanks.

>
> Signed-off-by: David Sterba <dsterba@suse.com>
> ---
>  fs/btrfs/ctree.h         |  6 +--
>  fs/btrfs/delayed-inode.c | 80 ++++++++++++++++++++--------------------
>  fs/btrfs/disk-io.c       |  3 +-
>  fs/btrfs/inode.c         |  2 +-
>  4 files changed, 47 insertions(+), 44 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 54fd4eb92745..70e828d33177 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -227,10 +227,10 @@ struct btrfs_root {
>         struct rb_root inode_tree;
>
>         /*
> -        * radix tree that keeps track of delayed nodes of every inode,
> -        * protected by inode_lock
> +        * Xarray that keeps track of delayed nodes of every inode, protected
> +        * by @inode_lock.
>          */
> -       struct radix_tree_root delayed_nodes_tree;
> +       struct xarray delayed_nodes;
>         /*
>          * right now this just gets used so that a root has its own devid
>          * for stat.  It may be used for more later
> diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
> index c9c4a53048a1..0437f52ca42c 100644
> --- a/fs/btrfs/delayed-inode.c
> +++ b/fs/btrfs/delayed-inode.c
> @@ -71,7 +71,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
>         }
>
>         spin_lock(&root->inode_lock);
> -       node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
> +       node = xa_load(&root->delayed_nodes, ino);
>
>         if (node) {
>                 if (btrfs_inode->delayed_node) {
> @@ -83,9 +83,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
>
>                 /*
>                  * It's possible that we're racing into the middle of removing
> -                * this node from the radix tree.  In this case, the refcount
> +                * this node from the xarray.  In this case, the refcount
>                  * was zero and it should never go back to one.  Just return
> -                * NULL like it was never in the radix at all; our release
> +                * NULL like it was never in the xarray at all; our release
>                  * function is in the process of removing it.
>                  *
>                  * Some implementations of refcount_inc refuse to bump the
> @@ -93,7 +93,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
>                  * here, refcount_inc() may decide to just WARN_ONCE() instead
>                  * of actually bumping the refcount.
>                  *
> -                * If this node is properly in the radix, we want to bump the
> +                * If this node is properly in the xarray, we want to bump the
>                  * refcount twice, once for the inode and once for this get
>                  * operation.
>                  */
> @@ -121,28 +121,29 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
>         u64 ino = btrfs_ino(btrfs_inode);
>         int ret;
>
> -again:
> -       node = btrfs_get_delayed_node(btrfs_inode);
> -       if (node)
> -               return node;
> +       do {
> +               node = btrfs_get_delayed_node(btrfs_inode);
> +               if (node)
> +                       return node;
>
> -       node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
> -       if (!node)
> -               return ERR_PTR(-ENOMEM);
> -       btrfs_init_delayed_node(node, root, ino);
> +               node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
> +               if (!node)
> +                       return ERR_PTR(-ENOMEM);
> +               btrfs_init_delayed_node(node, root, ino);
>
> -       /* cached in the btrfs inode and can be accessed */
> -       refcount_set(&node->refs, 2);
> +               /* Cached in the inode and can be accessed. */
> +               refcount_set(&node->refs, 2);
>
> -       spin_lock(&root->inode_lock);
> -       ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
> -       if (ret < 0) {
> -               spin_unlock(&root->inode_lock);
> -               kmem_cache_free(delayed_node_cache, node);
> -               if (ret == -EEXIST)
> -                       goto again;
> -               return ERR_PTR(ret);
> -       }
> +               spin_lock(&root->inode_lock);
> +               ret = xa_insert(&root->delayed_nodes, ino, node, GFP_ATOMIC);
> +               if (ret < 0) {
> +                       spin_unlock(&root->inode_lock);
> +                       kmem_cache_free(delayed_node_cache, node);
> +                       if (ret != -EBUSY)
> +                               return ERR_PTR(ret);
> +                       /* Otherwise it's ENOMEM. */
> +               }
> +       } while (ret < 0);
>         btrfs_inode->delayed_node = node;
>         spin_unlock(&root->inode_lock);
>
> @@ -263,8 +264,7 @@ static void __btrfs_release_delayed_node(
>                  * back up.  We can delete it now.
>                  */
>                 ASSERT(refcount_read(&delayed_node->refs) == 0);
> -               radix_tree_delete(&root->delayed_nodes_tree,
> -                                 delayed_node->inode_id);
> +               xa_erase(&root->delayed_nodes, delayed_node->inode_id);
>                 spin_unlock(&root->inode_lock);
>                 kmem_cache_free(delayed_node_cache, delayed_node);
>         }
> @@ -2032,34 +2032,36 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
>
>  void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
>  {
> -       u64 inode_id = 0;
> +       unsigned long index = 0;
>         struct btrfs_delayed_node *delayed_nodes[8];
> -       int i, n;
>
>         while (1) {
> +               struct btrfs_delayed_node *node;
> +               int count;
> +
>                 spin_lock(&root->inode_lock);
> -               n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
> -                                          (void **)delayed_nodes, inode_id,
> -                                          ARRAY_SIZE(delayed_nodes));
> -               if (!n) {
> +               if (xa_empty(&root->delayed_nodes)) {
>                         spin_unlock(&root->inode_lock);
> -                       break;
> +                       return;
>                 }
>
> -               inode_id = delayed_nodes[n - 1]->inode_id + 1;
> -               for (i = 0; i < n; i++) {
> +               count = 0;
> +               xa_for_each_start(&root->delayed_nodes, index, node, index) {
>                         /*
>                          * Don't increase refs in case the node is dead and
>                          * about to be removed from the tree in the loop below
>                          */
> -                       if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
> -                               delayed_nodes[i] = NULL;
> +                       if (refcount_inc_not_zero(&node->refs)) {
> +                               delayed_nodes[count] = node;
> +                               count++;
> +                       }
> +                       if (count >= ARRAY_SIZE(delayed_nodes))
> +                               break;
>                 }
>                 spin_unlock(&root->inode_lock);
> +               index++;
>
> -               for (i = 0; i < n; i++) {
> -                       if (!delayed_nodes[i])
> -                               continue;
> +               for (int i = 0; i < count; i++) {
>                         __btrfs_kill_delayed_node(delayed_nodes[i]);
>                         btrfs_release_delayed_node(delayed_nodes[i]);
>                 }
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 9317606017e2..39810120e9f9 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -655,7 +655,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
>         root->nr_delalloc_inodes = 0;
>         root->nr_ordered_extents = 0;
>         root->inode_tree = RB_ROOT;
> -       INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
> +       /* GFP flags are compatible with XA_FLAGS_*. */
> +       xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
>
>         btrfs_init_root_block_rsv(root);
>
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index f8647d8271b7..41c904530eaa 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -3805,7 +3805,7 @@ static int btrfs_read_locked_inode(struct inode *inode,
>          * cache.
>          *
>          * This is required for both inode re-read from disk and delayed inode
> -        * in delayed_nodes_tree.
> +        * in the delayed_nodes xarray.
>          */
>         if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
>                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
> --
> 2.42.1
>
>
David Sterba Dec. 4, 2023, 3:49 p.m. UTC | #2
On Fri, Dec 01, 2023 at 11:03:25AM +0000, Filipe Manana wrote:
> On Thu, Nov 30, 2023 at 10:56 PM David Sterba <dsterba@suse.com> wrote:
> >
> > Port btrfs_root::delayed_nodes_tree to the xarray API. The functionality
> > is equivalent, the flags are still GFP_ATOMIC as the changes are done
> > under a spin lock. Using a sleeping allocation would need changing the
> > lock to mutex.
> >
> > The conversion is almost direct, btrfs_kill_all_delayed_nodes() uses an
> > iterator to collect the items to delete.
> >
> > The patch is almost the same as 253bf57555e451 ("btrfs: turn
> > delayed_nodes_tree into an XArray"), there are renames, comments and
> > change of the GFP flags for xa_insert.
> 
> Can we include in the changelog why we are converting from radix tree to xarray?
> What do we gain with it? Why not the more recent maple tree for example?

I haven't evaluated maple tree as a replacement. Regarding the
radix-tree, it's actually implemented by xarray, so it's more or less
just an API change. The reason to remove radix-tree is that it's an
obsolete API, https://lwn.net/Articles/745073/

"The current patch set converts a number of radix-tree users to
XArrays, but some still remain. If all goes according to Wilcox's plan,
though, those will be converted in the near future and the radix-tree
API will head toward removal."

> Codewise this is about the same amount of LOCs, and I don't find it
> particularly simpler or more complex either.
> Performance wise, there's no indication of anything.

Yeah, line count should not matter, performance should be the same
(because it's the same underlying data structure).

> It also removed the preallocation using GFP_NOFS and now depends on
> atomic allocations on insertions.
> This is odd because we've been making efforts over time to eliminate
> atomic allocations in order to reduce the chances of allocation
> failures,
> yet we are now allowing for more atomic allocations without mentioning
> what we gain in return.

Right now it's because of the API conversion.  The preload mechanism is
gone from the API; we might be able to add it using the specialized API
or xa_reserve. The preload disables preemption, so we can't use sleeping
allocations like GFP_NOFS in xa_insert(). This would need the spinlock
-> mutex conversion.
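
A minimal sketch of the two insertion patterns, for illustration only
(simplified; not the exact btrfs code before or after the patch):

	/* Radix-tree style: preallocate with a sleeping allocation first. */
	ret = radix_tree_preload(GFP_NOFS);	/* may sleep, disables preemption on success */
	if (ret)
		return ERR_PTR(ret);
	spin_lock(&root->inode_lock);
	ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
	spin_unlock(&root->inode_lock);
	radix_tree_preload_end();		/* re-enables preemption */

	/*
	 * Xarray style without a preload: the internal allocation happens
	 * under the spin lock, so it has to be GFP_ATOMIC and can fail
	 * with -ENOMEM.
	 */
	spin_lock(&root->inode_lock);
	ret = xa_insert(&root->delayed_nodes, ino, node, GFP_ATOMIC);
	spin_unlock(&root->inode_lock);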

I'd like to get rid of the atomic allocations too, but would like to do
it in steps. The radix-tree -> xarray conversion is almost direct;
switching preload to atomic allocations is potentially problematic but
only in some cases. We have other atomic allocations and they do not
seem to be causing problems.

The long term plan:

- radix-tree -> xarray
- spinlock -> mutex
- no atomic allocations for xarrays
David Sterba Dec. 4, 2023, 4:07 p.m. UTC | #3
On Mon, Dec 04, 2023 at 04:49:34PM +0100, David Sterba wrote:
> On Fri, Dec 01, 2023 at 11:03:25AM +0000, Filipe Manana wrote:
> > On Thu, Nov 30, 2023 at 10:56 PM David Sterba <dsterba@suse.com> wrote:
> > >
> > > Port btrfs_root::delayed_nodes_tree to the xarray API. The functionality
> > > is equivalent, the flags are still GFP_ATOMIC as the changes are done
> > > under a spin lock. Using a sleeping allocation would need changing the
> > > lock to mutex.
> > >
> > > The conversion is almost direct, btrfs_kill_all_delayed_nodes() uses an
> > > iterator to collect the items to delete.
> > >
> > > The patch is almost the same as 253bf57555e451 ("btrfs: turn
> > > delayed_nodes_tree into an XArray"), there are renames, comments and
> > > change of the GFP flags for xa_insert.
> > 
> > Can we include in the changelog why we are converting from radix tree to xarray?
> > What do we gain with it? Why not the more recent maple tree for example?
> 
> I haven't evaluated maple tree as a replacement. Regarding the
> radix-tree, it's actually implemented by xarray, so it's more or less
> just an API change. The reason to remove radix-tree is that it's an
> obsolete API, https://lwn.net/Articles/745073/
> 
> "The current patch set converts a number of radix-tree users to
> XArrays, but some still remain. If all goes according to Wilcox's plan,
> though, those will be converted in the near future and the radix-tree
> API will head toward removal."
> 
> > Codewise this is about the same amount of LOCs, and I don't find it
> > particularly simpler or more complex either.
> > Performance wise, there's no indication of anything.
> 
> Yeah, line count should not matter, performance should be the same
> (because it's the same underlying data structure).
> 
> > It also removed the preallocation using GFP_NOFS and now depends on
> > atomic allocations on insertions.
> > This is odd because we've been making efforts over time to eliminate
> > atomic allocations in order to reduce the chances of allocation
> > failures,
> > yet we are now allowing for more atomic allocations without mentioning
> > what we gain in return.
> 
> Right now it's because of the API conversion.  The preload mechanism is
> gone from the API; we might be able to add it using the specialized API
> or xa_reserve. The preload disables preemption, so we can't use sleeping
> allocations like GFP_NOFS in xa_insert(). This would need the spinlock
> -> mutex conversion.
> 
> I'd like to get rid of the atomic allocations too, but would like to do
> it in steps. The radix-tree -> xarray conversion is almost direct;
> switching preload to atomic allocations is potentially problematic but
> only in some cases. We have other atomic allocations and they do not
> seem to be causing problems.
> 
> The long term plan:
> 
> - radix-tree -> xarray
> - spinlock -> mutex
> - no atomic allocations for xarrays

If the lock conversion would not be the right way, xa_reserve can be
done, but it's not as simple as the preload: it inserts a reserved
entry into the tree which is NULL upon read, so we'd have to handle
that everywhere.
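
As a rough illustration of the xa_reserve() semantics in question
(generic xarray usage with a made-up helper name, not btrfs code;
the locking that would serialize the load and store is omitted):

	/*
	 * Reserve a slot with a sleeping allocation so that filling it
	 * later does not need to allocate memory.
	 */
	static int example_reserve_then_store(struct xarray *xa,
					      unsigned long index, void *item)
	{
		void *entry;
		int ret;

		ret = xa_reserve(xa, index, GFP_NOFS);	/* may sleep */
		if (ret)
			return ret;

		/*
		 * The reserved slot still reads back as NULL, the same as
		 * an empty slot, which is what every lookup would have to
		 * tolerate.
		 */
		entry = xa_load(xa, index);
		if (entry)
			return -EEXIST;	/* somebody else filled the slot meanwhile */

		/* Fill the reserved slot; this should not need to allocate. */
		xa_store(xa, index, item, GFP_ATOMIC);

		/* An unused reservation could be dropped with xa_release(). */
		return 0;
	}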
David Sterba Dec. 5, 2023, 7:04 p.m. UTC | #4
On Mon, Dec 04, 2023 at 05:07:31PM +0100, David Sterba wrote:
> On Mon, Dec 04, 2023 at 04:49:34PM +0100, David Sterba wrote:
> > On Fri, Dec 01, 2023 at 11:03:25AM +0000, Filipe Manana wrote:
> > > On Thu, Nov 30, 2023 at 10:56 PM David Sterba <dsterba@suse.com> wrote:

> If the lock conversion would not be the right way, xa_reserve can be
> done, but it's not as simple as the preload: it inserts a reserved
> entry into the tree which is NULL upon read, so we'd have to handle
> that everywhere.

Seems that xa_reserve can emulate the preload. It takes the GFP flags
and will insert a reserved entry; btrfs_get_delayed_node() expects a
NULL, and there's no other xa_load() except the one in
btrfs_get_or_create_delayed_node() that's doing the insert.

The insertion is done to the reserved slot by xa_store() and serialized
by the spin lock; the slot reservation can race, but xarray handles that.
xa_store() also takes the GFP flags but they should not be needed.

I'm running the code below with manual error injection (xa_reserve()
fails every 100th time); so far fstests continue, reporting either
ENOMEM or transaction aborts.

--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -122,6 +122,8 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
 	int ret;
 
 	do {
+		void *ptr;
+
 		node = btrfs_get_delayed_node(btrfs_inode);
 		if (node)
 			return node;
@@ -134,15 +136,25 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
 		/* Cached in the inode and can be accessed. */
 		refcount_set(&node->refs, 2);
 
+		ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
+		if (ret == -ENOMEM) {
+			kmem_cache_free(delayed_node_cache, node);
+			return ERR_PTR(-ENOMEM);
+		}
 		spin_lock(&root->inode_lock);
-		ret = xa_insert(&root->delayed_nodes, ino, node, GFP_ATOMIC);
-		if (ret < 0) {
+		ptr = xa_load(&root->delayed_nodes, ino);
+		if (ptr) {
+			/* Somebody inserted it. */
 			spin_unlock(&root->inode_lock);
 			kmem_cache_free(delayed_node_cache, node);
-			if (ret != -EBUSY)
-				return ERR_PTR(ret);
-			/* Otherwise it's ENOMEM. */
+			ret = -EEXIST;
+			continue;
 		}
+		ptr = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
+		ASSERT(xa_err(ptr) != -EINVAL);
+		ASSERT(xa_err(ptr) != -ENOMEM);
+		ASSERT(ptr == NULL);
+		ret = 0;
 	} while (ret < 0);
 	btrfs_inode->delayed_node = node;
 	spin_unlock(&root->inode_lock);
Filipe Manana Dec. 5, 2023, 7:39 p.m. UTC | #5
On Tue, Dec 5, 2023 at 7:11 PM David Sterba <dsterba@suse.cz> wrote:
>
> On Mon, Dec 04, 2023 at 05:07:31PM +0100, David Sterba wrote:
> > On Mon, Dec 04, 2023 at 04:49:34PM +0100, David Sterba wrote:
> > > On Fri, Dec 01, 2023 at 11:03:25AM +0000, Filipe Manana wrote:
> > > > On Thu, Nov 30, 2023 at 10:56 PM David Sterba <dsterba@suse.com> wrote:
>
> > It the lock conversion would not be the right way, the xa_reserve can be
> > done but it it's not as simple as the preload, it inserts a reserved
> > entry to the tree which is NULL upon read so we'd have to handle that
> > everywhere.
>
> Seems that xa_reserve can emulate the preload. It takes the GFP flags
> and will insert a reserved entry; btrfs_get_delayed_node() expects a
> NULL, and there's no other xa_load() except the one in
> btrfs_get_or_create_delayed_node() that's doing the insert.
>
> The insertion is done to the reserved slot by xa_store() and serialized
> by the spin lock; the slot reservation can race, but xarray handles that.
> xa_store() also takes the GFP flags but they should not be needed.
>
> I'm running the code below with manual error injection (xa_reserve()
> fails every 100th time); so far fstests continue, reporting either
> ENOMEM or transaction aborts.

Oh, that's perfect and seems very promising. Thanks.

>
> --- a/fs/btrfs/delayed-inode.c
> +++ b/fs/btrfs/delayed-inode.c
> @@ -122,6 +122,8 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
>         int ret;
>
>         do {
> +               void *ptr;
> +
>                 node = btrfs_get_delayed_node(btrfs_inode);
>                 if (node)
>                         return node;
> @@ -134,15 +136,25 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
>                 /* Cached in the inode and can be accessed. */
>                 refcount_set(&node->refs, 2);
>
> +               ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
> +               if (ret == -ENOMEM) {
> +                       kmem_cache_free(delayed_node_cache, node);
> +                       return ERR_PTR(-ENOMEM);
> +               }
>                 spin_lock(&root->inode_lock);
> -               ret = xa_insert(&root->delayed_nodes, ino, node, GFP_ATOMIC);
> -               if (ret < 0) {
> +               ptr = xa_load(&root->delayed_nodes, ino);
> +               if (ptr) {
> +                       /* Somebody inserted it. */
>                         spin_unlock(&root->inode_lock);
>                         kmem_cache_free(delayed_node_cache, node);
> -                       if (ret != -EBUSY)
> -                               return ERR_PTR(ret);
> -                       /* Otherwise it's ENOMEM. */
> +                       ret = -EEXIST;
> +                       continue;
>                 }
> +               ptr = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
> +               ASSERT(xa_err(ptr) != -EINVAL);
> +               ASSERT(xa_err(ptr) != -ENOMEM);
> +               ASSERT(ptr == NULL);
> +               ret = 0;
>         } while (ret < 0);
>         btrfs_inode->delayed_node = node;
>         spin_unlock(&root->inode_lock);

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 54fd4eb92745..70e828d33177 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -227,10 +227,10 @@  struct btrfs_root {
 	struct rb_root inode_tree;
 
 	/*
-	 * radix tree that keeps track of delayed nodes of every inode,
-	 * protected by inode_lock
+	 * Xarray that keeps track of delayed nodes of every inode, protected
+	 * by @inode_lock.
 	 */
-	struct radix_tree_root delayed_nodes_tree;
+	struct xarray delayed_nodes;
 	/*
 	 * right now this just gets used so that a root has its own devid
 	 * for stat.  It may be used for more later
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index c9c4a53048a1..0437f52ca42c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -71,7 +71,7 @@  static struct btrfs_delayed_node *btrfs_get_delayed_node(
 	}
 
 	spin_lock(&root->inode_lock);
-	node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+	node = xa_load(&root->delayed_nodes, ino);
 
 	if (node) {
 		if (btrfs_inode->delayed_node) {
@@ -83,9 +83,9 @@  static struct btrfs_delayed_node *btrfs_get_delayed_node(
 
 		/*
 		 * It's possible that we're racing into the middle of removing
-		 * this node from the radix tree.  In this case, the refcount
+		 * this node from the xarray.  In this case, the refcount
 		 * was zero and it should never go back to one.  Just return
-		 * NULL like it was never in the radix at all; our release
+		 * NULL like it was never in the xarray at all; our release
 		 * function is in the process of removing it.
 		 *
 		 * Some implementations of refcount_inc refuse to bump the
@@ -93,7 +93,7 @@  static struct btrfs_delayed_node *btrfs_get_delayed_node(
 		 * here, refcount_inc() may decide to just WARN_ONCE() instead
 		 * of actually bumping the refcount.
 		 *
-		 * If this node is properly in the radix, we want to bump the
+		 * If this node is properly in the xarray, we want to bump the
 		 * refcount twice, once for the inode and once for this get
 		 * operation.
 		 */
@@ -121,28 +121,29 @@  static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
 	u64 ino = btrfs_ino(btrfs_inode);
 	int ret;
 
-again:
-	node = btrfs_get_delayed_node(btrfs_inode);
-	if (node)
-		return node;
+	do {
+		node = btrfs_get_delayed_node(btrfs_inode);
+		if (node)
+			return node;
 
-	node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
-	if (!node)
-		return ERR_PTR(-ENOMEM);
-	btrfs_init_delayed_node(node, root, ino);
+		node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
+		if (!node)
+			return ERR_PTR(-ENOMEM);
+		btrfs_init_delayed_node(node, root, ino);
 
-	/* cached in the btrfs inode and can be accessed */
-	refcount_set(&node->refs, 2);
+		/* Cached in the inode and can be accessed. */
+		refcount_set(&node->refs, 2);
 
-	spin_lock(&root->inode_lock);
-	ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
-	if (ret < 0) {
-		spin_unlock(&root->inode_lock);
-		kmem_cache_free(delayed_node_cache, node);
-		if (ret == -EEXIST)
-			goto again;
-		return ERR_PTR(ret);
-	}
+		spin_lock(&root->inode_lock);
+		ret = xa_insert(&root->delayed_nodes, ino, node, GFP_ATOMIC);
+		if (ret < 0) {
+			spin_unlock(&root->inode_lock);
+			kmem_cache_free(delayed_node_cache, node);
+			if (ret != -EBUSY)
+				return ERR_PTR(ret);
+			/* Otherwise it's ENOMEM. */
+		}
+	} while (ret < 0);
 	btrfs_inode->delayed_node = node;
 	spin_unlock(&root->inode_lock);
 
@@ -263,8 +264,7 @@  static void __btrfs_release_delayed_node(
 		 * back up.  We can delete it now.
 		 */
 		ASSERT(refcount_read(&delayed_node->refs) == 0);
-		radix_tree_delete(&root->delayed_nodes_tree,
-				  delayed_node->inode_id);
+		xa_erase(&root->delayed_nodes, delayed_node->inode_id);
 		spin_unlock(&root->inode_lock);
 		kmem_cache_free(delayed_node_cache, delayed_node);
 	}
@@ -2032,34 +2032,36 @@  void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
 
 void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
 {
-	u64 inode_id = 0;
+	unsigned long index = 0;
 	struct btrfs_delayed_node *delayed_nodes[8];
-	int i, n;
 
 	while (1) {
+		struct btrfs_delayed_node *node;
+		int count;
+
 		spin_lock(&root->inode_lock);
-		n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
-					   (void **)delayed_nodes, inode_id,
-					   ARRAY_SIZE(delayed_nodes));
-		if (!n) {
+		if (xa_empty(&root->delayed_nodes)) {
 			spin_unlock(&root->inode_lock);
-			break;
+			return;
 		}
 
-		inode_id = delayed_nodes[n - 1]->inode_id + 1;
-		for (i = 0; i < n; i++) {
+		count = 0;
+		xa_for_each_start(&root->delayed_nodes, index, node, index) {
 			/*
 			 * Don't increase refs in case the node is dead and
 			 * about to be removed from the tree in the loop below
 			 */
-			if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
-				delayed_nodes[i] = NULL;
+			if (refcount_inc_not_zero(&node->refs)) {
+				delayed_nodes[count] = node;
+				count++;
+			}
+			if (count >= ARRAY_SIZE(delayed_nodes))
+				break;
 		}
 		spin_unlock(&root->inode_lock);
+		index++;
 
-		for (i = 0; i < n; i++) {
-			if (!delayed_nodes[i])
-				continue;
+		for (int i = 0; i < count; i++) {
 			__btrfs_kill_delayed_node(delayed_nodes[i]);
 			btrfs_release_delayed_node(delayed_nodes[i]);
 		}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9317606017e2..39810120e9f9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -655,7 +655,8 @@  static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	root->nr_delalloc_inodes = 0;
 	root->nr_ordered_extents = 0;
 	root->inode_tree = RB_ROOT;
-	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+	/* GFP flags are compatible with XA_FLAGS_*. */
+	xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
 
 	btrfs_init_root_block_rsv(root);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f8647d8271b7..41c904530eaa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3805,7 +3805,7 @@  static int btrfs_read_locked_inode(struct inode *inode,
 	 * cache.
 	 *
 	 * This is required for both inode re-read from disk and delayed inode
-	 * in delayed_nodes_tree.
+	 * in the delayed_nodes xarray.
 	 */
 	if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,