diff mbox series

[2/3] btrfs: skip inodes without loaded extent maps when shrinking extent maps

Message ID 663658b73e3cd9dd5e34e8eee34f4959f6ccb5ec.1739710434.git.fdmanana@suse.com (mailing list archive)
State New
Headers show
Series btrfs: some fixes related to the extent map shrinker | expand

Commit Message

Filipe Manana Feb. 16, 2025, 1:16 p.m. UTC
From: Filipe Manana <fdmanana@suse.com>

If there are inodes that don't have any loaded extent maps, we end up
grabbing a reference on them and later adding a delayed iput, which wakes
up the cleaner and makes it do unnecessary work. This is common when for
example the inodes were open only to run stat(2) or all their extent maps
were already released through the folio release callback
(btrfs_release_folio()) or released by a previous run of the shrinker, or
directories which never have extent maps.

Reported-by: Ivan Shapovalov <intelfx@intelfx.name>
Tested-by: Ivan Shapovalov <intelfx@intelfx.name>
Link: https://lore.kernel.org/linux-btrfs/0414d690ac5680d0d77dfc930606cdc36e42e12f.camel@intelfx.name/
CC: stable@vger.kernel.org # 6.13+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/extent_map.c | 77 +++++++++++++++++++++++++++++++------------
 1 file changed, 56 insertions(+), 21 deletions(-)

Comments

Johannes Thumshirn Feb. 18, 2025, 6:18 a.m. UTC | #1
On 16.02.25 14:16, fdmanana@kernel.org wrote:
> +static struct btrfs_inode *find_first_inode(struct btrfs_root *root, u64 min_ino)

Can we call it something like find_first_inode_for_shrinker() or sth 
like that?

It is very similar to btrfs_find_first_inode() but not the similar 
enough to be the subset of it and I find that quite confusing.

Otherwise
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Filipe Manana Feb. 18, 2025, 10:19 a.m. UTC | #2
On Tue, Feb 18, 2025 at 6:18 AM Johannes Thumshirn
<Johannes.Thumshirn@wdc.com> wrote:
>
> On 16.02.25 14:16, fdmanana@kernel.org wrote:
> > +static struct btrfs_inode *find_first_inode(struct btrfs_root *root, u64 min_ino)
>
> Can we call it something like find_first_inode_for_shrinker() or sth
> like that?

Sure, I'll rename it to find_first_inode_to_shrink() at commit time then.

Thanks.

>
> It is very similar to btrfs_find_first_inode() but not the similar
> enough to be the subset of it and I find that quite confusing.
>
> Otherwise
> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
diff mbox series

Patch

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index bee1b94a1049..820aef514238 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1128,6 +1128,8 @@  static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
 	long nr_dropped = 0;
 	struct rb_node *node;
 
+	lockdep_assert_held_write(&tree->lock);
+
 	/*
 	 * Take the mmap lock so that we serialize with the inode logging phase
 	 * of fsync because we may need to set the full sync flag on the inode,
@@ -1139,28 +1141,12 @@  static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
 	 * to find new extents, which may not be there yet because ordered
 	 * extents haven't completed yet.
 	 *
-	 * We also do a try lock because otherwise we could deadlock. This is
-	 * because the shrinker for this filesystem may be invoked while we are
-	 * in a path that is holding the mmap lock in write mode. For example in
-	 * a reflink operation while COWing an extent buffer, when allocating
-	 * pages for a new extent buffer and under memory pressure, the shrinker
-	 * may be invoked, and therefore we would deadlock by attempting to read
-	 * lock the mmap lock while we are holding already a write lock on it.
+	 * We also do a try lock because we don't want to block for too long and
+	 * we are holding the extent map tree's lock in write mode.
 	 */
 	if (!down_read_trylock(&inode->i_mmap_lock))
 		return 0;
 
-	/*
-	 * We want to be fast so if the lock is busy we don't want to spend time
-	 * waiting for it - either some task is about to do IO for the inode or
-	 * we may have another task shrinking extent maps, here in this code, so
-	 * skip this inode.
-	 */
-	if (!write_trylock(&tree->lock)) {
-		up_read(&inode->i_mmap_lock);
-		return 0;
-	}
-
 	node = rb_first(&tree->root);
 	while (node) {
 		struct rb_node *next = rb_next(node);
@@ -1201,12 +1187,60 @@  static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
 			break;
 		node = next;
 	}
-	write_unlock(&tree->lock);
 	up_read(&inode->i_mmap_lock);
 
 	return nr_dropped;
 }
 
+static struct btrfs_inode *find_first_inode(struct btrfs_root *root, u64 min_ino)
+{
+	struct btrfs_inode *inode;
+	unsigned long from = min_ino;
+
+	xa_lock(&root->inodes);
+	while (true) {
+		struct extent_map_tree *tree;
+
+		inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+		if (!inode)
+			break;
+
+		tree = &inode->extent_tree;
+
+		/*
+		 * We want to be fast so if the lock is busy we don't want to
+		 * spend time waiting for it (some task is about to do IO for
+		 * the inode).
+		 */
+		if (!write_trylock(&tree->lock))
+			goto next;
+
+		/*
+		 * Skip inode if it doesn't have loaded extent maps, so we avoid
+		 * getting a reference and doing an iput later. This includes
+		 * cases like files that were opened for things like stat(2), or
+		 * files with all extent maps previously released through the
+		 * release folio callback (btrfs_release_folio()) or released in
+		 * a previous run, or directories which never have extent maps.
+		 */
+		if (RB_EMPTY_ROOT(&tree->root)) {
+			write_unlock(&tree->lock);
+			goto next;
+		}
+
+		if (igrab(&inode->vfs_inode))
+			break;
+
+		write_unlock(&tree->lock);
+next:
+		from = btrfs_ino(inode) + 1;
+		cond_resched_lock(&root->inodes.xa_lock);
+	}
+	xa_unlock(&root->inodes);
+
+	return inode;
+}
+
 static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1214,9 +1248,10 @@  static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
 	long nr_dropped = 0;
 	u64 min_ino = fs_info->em_shrinker_last_ino + 1;
 
-	inode = btrfs_find_first_inode(root, min_ino);
+	inode = find_first_inode(root, min_ino);
 	while (inode) {
 		nr_dropped += btrfs_scan_inode(inode, ctx);
+		write_unlock(&inode->extent_tree.lock);
 
 		min_ino = btrfs_ino(inode) + 1;
 		fs_info->em_shrinker_last_ino = btrfs_ino(inode);
@@ -1227,7 +1262,7 @@  static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
 
 		cond_resched();
 
-		inode = btrfs_find_first_inode(root, min_ino);
+		inode = find_first_inode(root, min_ino);
 	}
 
 	if (inode) {