
[5/9] btrfs: reduce inode lock critical section when setting and clearing delalloc

Message ID ddec77dee32394687dab1b94904951be7e903222.1707491248.git.fdmanana@suse.com (mailing list archive)
State New, archived
Series btrfs: cleanups and minor performance change to setting/clearing delalloc

Commit Message

Filipe Manana Feb. 9, 2024, 6 p.m. UTC
From: Filipe Manana <fdmanana@suse.com>

When setting and clearing a delalloc range, at btrfs_set_delalloc_extent()
and btrfs_clear_delalloc_extent(), we add/remove the inode to/from the
root's list of delalloc inodes while holding the inode's lock. This is
not needed however: we can add and remove the inode to/from the root's
list without holding the inode's lock, because here we are already under
the protection of the io tree's lock. This reduces the size of the
critical section delimited by the inode's lock, which is used in many
other places, such as when finishing an ordered extent (when calling
btrfs_update_inode_bytes() or btrfs_delalloc_release_metadata(), or when
decreasing the number of outstanding extents) and when reserving space
for a buffered or direct IO write (calls to functions from
delalloc-space.c).

So move the operations that add/remove the inode to/from the root's list
of delalloc inodes outside the critical section delimited by the inode's
lock. This also allows us to get rid of the BTRFS_INODE_IN_DELALLOC_LIST
flag, since we can rely on the inode's delalloc bytes counter to determine
whether or not the inode is in the list.
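
For illustration, the setting side roughly ends up doing the following
(simplified sketch, defrag and EXTENT_DELALLOC_NEW handling omitted; the
clearing side is symmetric):

   spin_lock(&inode->lock);
   prev_delalloc_bytes = inode->delalloc_bytes;
   inode->delalloc_bytes += len;
   spin_unlock(&inode->lock);

   /*
    * The io tree's lock is still held here, so we can not race with a
    * concurrent btrfs_clear_delalloc_extent() for this inode.
    */
   if (do_list && prev_delalloc_bytes == 0)
           btrfs_add_delalloc_inode(inode);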

The following fio based test, which exercises IO to multiple files in the
same subvolume, was used to measure the performance impact:

   $ cat test.sh
   #!/bin/bash

   DEV=/dev/nullb0
   MNT=/mnt/nullb0
   MOUNT_OPTIONS="-o ssd"

   mkfs.btrfs -f $DEV &> /dev/null
   mount $MOUNT_OPTIONS $DEV $MNT

   fio --direct=0 --ioengine=sync --thread --directory=$MNT \
       --invalidate=1 --group_reporting=1 \
       --new_group --rw=randwrite --size=50m --numjobs=200 \
       --bs=4k --fsync_on_close=0 --fallocate=none --end_fsync=0 \
       --name=foo --filename_format=FioWorkloads.\$jobnum

   umount $MNT

The test was run on a non-debug kernel (Debian's default kernel config)
against a 16G null block device.

Result before this patch:

   WRITE: bw=81.9MiB/s (85.9MB/s), 81.9MiB/s-81.9MiB/s (85.9MB/s-85.9MB/s), io=9.77GiB (10.5GB), run=122136-122136msec

Result after this patch:

   WRITE: bw=86.8MiB/s (91.0MB/s), 86.8MiB/s-86.8MiB/s (91.0MB/s-91.0MB/s), io=9.77GiB (10.5GB), run=115180-115180msec

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/btrfs_inode.h |  1 -
 fs/btrfs/inode.c       | 60 ++++++++++++++++++++++++++++--------------
 2 files changed, 40 insertions(+), 21 deletions(-)

Patch

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 32c59c5bd94d..7799d00cc5d6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -41,7 +41,6 @@  enum {
 	  */
 	BTRFS_INODE_NEEDS_FULL_SYNC,
 	BTRFS_INODE_COPY_EVERYTHING,
-	BTRFS_INODE_IN_DELALLOC_LIST,
 	BTRFS_INODE_HAS_PROPS,
 	BTRFS_INODE_SNAPSHOT_FLUSH,
 	/*
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b273fdbd63cd..778bb6754e00 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2391,17 +2391,14 @@  static void btrfs_add_delalloc_inode(struct btrfs_inode *inode)
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	spin_lock(&root->delalloc_lock);
-	if (list_empty(&inode->delalloc_inodes)) {
-		list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
-		set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags);
-		root->nr_delalloc_inodes++;
-		if (root->nr_delalloc_inodes == 1) {
-			spin_lock(&fs_info->delalloc_root_lock);
-			BUG_ON(!list_empty(&root->delalloc_root));
-			list_add_tail(&root->delalloc_root,
-				      &fs_info->delalloc_roots);
-			spin_unlock(&fs_info->delalloc_root_lock);
-		}
+	ASSERT(list_empty(&inode->delalloc_inodes));
+	list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
+	root->nr_delalloc_inodes++;
+	if (root->nr_delalloc_inodes == 1) {
+		spin_lock(&fs_info->delalloc_root_lock);
+		BUG_ON(!list_empty(&root->delalloc_root));
+		list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
+		spin_unlock(&fs_info->delalloc_root_lock);
 	}
 	spin_unlock(&root->delalloc_lock);
 }
@@ -2413,10 +2410,14 @@  void __btrfs_del_delalloc_inode(struct btrfs_inode *inode)
 
 	lockdep_assert_held(&root->delalloc_lock);
 
+	/*
+	 * We may be called after the inode was already deleted from the list,
+	 * namely in the transaction abort path btrfs_destroy_delalloc_inodes(),
+	 * and then later through btrfs_clear_delalloc_extent() while the inode
+	 * still has ->delalloc_bytes > 0.
+	 */
 	if (!list_empty(&inode->delalloc_inodes)) {
 		list_del_init(&inode->delalloc_inodes);
-		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-			  &inode->runtime_flags);
 		root->nr_delalloc_inodes--;
 		if (!root->nr_delalloc_inodes) {
 			ASSERT(list_empty(&root->delalloc_inodes));
@@ -2444,6 +2445,8 @@  void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 
+	lockdep_assert_held(&inode->io_tree.lock);
+
 	if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
 		WARN_ON(1);
 	/*
@@ -2453,6 +2456,7 @@  void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
 	 */
 	if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		u64 len = state->end + 1 - state->start;
+		u64 prev_delalloc_bytes;
 		u32 num_extents = count_max_extents(fs_info, len);
 		bool do_list = !btrfs_is_free_space_inode(inode);
 
@@ -2467,13 +2471,20 @@  void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
 		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
 					 fs_info->delalloc_batch);
 		spin_lock(&inode->lock);
+		prev_delalloc_bytes = inode->delalloc_bytes;
 		inode->delalloc_bytes += len;
 		if (bits & EXTENT_DEFRAG)
 			inode->defrag_bytes += len;
-		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-					 &inode->runtime_flags))
-			btrfs_add_delalloc_inode(inode);
 		spin_unlock(&inode->lock);
+
+		/*
+		 * We don't need to be under the protection of the inode's lock,
+		 * because we are called while holding the inode's io_tree lock
+		 * and are therefore protected against concurrent calls of this
+		 * function and btrfs_clear_delalloc_extent().
+		 */
+		if (do_list && prev_delalloc_bytes == 0)
+			btrfs_add_delalloc_inode(inode);
 	}
 
 	if (!(state->state & EXTENT_DELALLOC_NEW) &&
@@ -2495,6 +2506,8 @@  void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
 	u64 len = state->end + 1 - state->start;
 	u32 num_extents = count_max_extents(fs_info, len);
 
+	lockdep_assert_held(&inode->io_tree.lock);
+
 	if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
 		spin_lock(&inode->lock);
 		inode->defrag_bytes -= len;
@@ -2508,6 +2521,7 @@  void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
 	 */
 	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = inode->root;
+		u64 new_delalloc_bytes;
 		bool do_list = !btrfs_is_free_space_inode(inode);
 
 		spin_lock(&inode->lock);
@@ -2536,11 +2550,17 @@  void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
 					 fs_info->delalloc_batch);
 		spin_lock(&inode->lock);
 		inode->delalloc_bytes -= len;
-		if (do_list && inode->delalloc_bytes == 0 &&
-		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-					&inode->runtime_flags))
-			btrfs_del_delalloc_inode(inode);
+		new_delalloc_bytes = inode->delalloc_bytes;
 		spin_unlock(&inode->lock);
+
+		/*
+		 * We don't need to be under the protection of the inode's lock,
+		 * because we are called while holding the inode's io_tree lock
+		 * and are therefore protected against concurrent calls of this
+		 * function and btrfs_set_delalloc_extent().
+		 */
+		if (do_list && new_delalloc_bytes == 0)
+			btrfs_del_delalloc_inode(inode);
 	}
 
 	if ((state->state & EXTENT_DELALLOC_NEW) &&