diff mbox series

[6/7] xfs: implement sb->iter_vfs_inodes

Message ID 20241002014017.3801899-7-david@fromorbit.com (mailing list archive)
State New
Headers show
Series vfs: improving inode cache iteration scalability | expand

Commit Message

Dave Chinner Oct. 2, 2024, 1:33 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

We can iterate all the in-memory VFS inodes via the xfs_icwalk()
interface, so implement the new superblock operation to walk inodes
in this way.

This removes the dependency XFS has on the sb->s_inodes list and
allows us to avoid the global lock that marshals this list and
must be taken on every VFS inode instantiation and eviction. This
greatly improves the rate at which we can stream inodes through the
VFS inode cache.

Sharded, share-nothing cold cache workload with 100,000 files per
thread in per-thread directories.

Before:

Filesystem      Files  Threads  Create       Walk      Chmod      Unlink     Bulkstat
       xfs     400000     4      4.269      3.225      4.557      7.316      1.306
       xfs     800000     8      4.844      3.227      4.702      7.905      1.908
       xfs    1600000    16      6.286      3.296      5.592      8.838      4.392
       xfs    3200000    32      8.912      5.681      8.505     11.724      7.085
       xfs    6400000    64     15.344     11.144     14.162     18.604     15.494

After:

Filesystem      Files  Threads  Create       Walk      Chmod      Unlink     Bulkstat
       xfs     400000     4      4.140      3.502      4.154      7.242      1.164
       xfs     800000     8      4.637      2.836      4.444      7.896      1.093
       xfs    1600000    16      5.549      3.054      5.213      8.696      1.107
       xfs    3200000    32      8.387      3.218      6.867     10.668      1.125
       xfs    6400000    64     14.112      3.953     10.365     18.620      1.270

Bulkstat shows the real story here - before the patch, we start to
see scalability problems at 16 threads. The patched kernel shows
almost perfect scalability up to 64 threads streaming inodes
through the VFS cache using I_DONTCACHE semantics.

Note: this is an initial, unoptimised implementation that could be
significantly improved and reduced in size by using a radix tree tag
filter for VFS inodes and so use the generic tag-filtered
xfs_icwalk() implementation instead of special casing it like this
patch does.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_icache.c | 151 +++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_icache.h |   3 +
 fs/xfs/xfs_iops.c   |   1 -
 fs/xfs/xfs_super.c  |  11 ++++
 4 files changed, 163 insertions(+), 3 deletions(-)
diff mbox series

Patch

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index a680e5b82672..ee544556cee7 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1614,6 +1614,155 @@  xfs_blockgc_free_quota(
 			xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
 }
 
+/* VFS Inode Cache Walking Code */
+
+/* XFS inodes in these states are not visible to the VFS. */
+#define XFS_ITER_VFS_NOGRAB_IFLAGS	(XFS_INEW | \
+					 XFS_NEED_INACTIVE | \
+					 XFS_INACTIVATING | \
+					 XFS_IRECLAIMABLE | \
+					 XFS_IRECLAIM)
+/*
+ * If the inode we found is visible to the VFS inode cache, then return it to
+ * the caller.
+ *
+ * In the normal case, we need to validate the VFS inode state and take a
+ * reference to it here. We will drop that reference once the VFS inode has been
+ * processed by the ino_iter_fn.
+ *
+ * However, if the INO_ITER_UNSAFE flag is set, we do not take references to the
+ * inode - it is the ino_iter_fn's responsibility to validate the inode is still
+ * a VFS inode once we hand it to them. We do not drop references after
+ * processing these inodes; the processing function may have evicted the VFS
+ * inode from cache as part of its processing.
+ */
+static bool
+xfs_iter_vfs_igrab(
+	struct xfs_inode	*ip,
+	int			flags)
+{
+	struct inode		*inode = VFS_I(ip);
+	bool			ret = false;
+
+	ASSERT(rcu_read_lock_held());
+
+	/* Check for stale RCU freed inode */
+	spin_lock(&ip->i_flags_lock);
+	if (!ip->i_ino)
+		goto out_unlock_noent;
+
+	if (ip->i_flags & XFS_ITER_VFS_NOGRAB_IFLAGS)
+		goto out_unlock_noent;
+
+	if ((flags & INO_ITER_UNSAFE) ||
+	    super_iter_iget(inode, flags))
+		ret = true;
+
+out_unlock_noent:
+	spin_unlock(&ip->i_flags_lock);
+	return ret;
+}
+
+/*
+ * Initial implementation of vfs inode walker. This does not use batched lookups
+ * for initial simplicity and testing, though it could use them quite
+ * efficiently for both safe and unsafe iteration contexts.
+ */
+static int
+xfs_icwalk_vfs_inodes_ag(
+	struct xfs_perag	*pag,
+	ino_iter_fn		iter_fn,
+	void			*private_data,
+	int			flags)
+{
+	struct xfs_mount	*mp = pag->pag_mount;
+	uint32_t		first_index = 0;
+	int			ret = 0;
+	int			nr_found;
+	bool			done = false;
+
+	do {
+		struct xfs_inode *ip;
+
+		rcu_read_lock();
+		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+				(void **)&ip, first_index, 1);
+		if (!nr_found) {
+			rcu_read_unlock();
+			break;
+		}
+
+		/*
+		 * Update the index for the next lookup. Catch
+		 * overflows into the next AG range which can occur if
+		 * we have inodes in the last block of the AG and we
+		 * are currently pointing to the last inode.
+		 */
+		first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+		if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+			done = true;
+
+		if (!xfs_iter_vfs_igrab(ip, flags)) {
+			rcu_read_unlock();
+			continue;
+		}
+
+		/*
+		 * If we are doing an unsafe iteration, we must continue to hold
+		 * the RCU lock across the callback to guarantee the existence
+		 * of inode. We can't hold the rcu lock for reference counted
+		 * inodes because the callback is allowed to block in that case.
+		 */
+		if (!(flags & INO_ITER_UNSAFE))
+			rcu_read_unlock();
+
+		ret = iter_fn(VFS_I(ip), private_data);
+
+		/*
+		 * We've run the callback, so we can drop the existence
+		 * guarantee we hold on the inode now.
+		 */
+		if (!(flags & INO_ITER_UNSAFE))
+			iput(VFS_I(ip));
+		else
+			rcu_read_unlock();
+
+		if (ret == INO_ITER_ABORT) {
+			ret = 0;
+			break;
+		}
+		if (ret < 0)
+			break;
+
+	} while (!done);
+
+	return ret;
+}
+
+int
+xfs_icwalk_vfs_inodes(
+	struct xfs_mount	*mp,
+	ino_iter_fn		iter_fn,
+	void			*private_data,
+	int			flags)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+	int			ret;
+
+	for_each_perag(mp, agno, pag) {
+		ret = xfs_icwalk_vfs_inodes_ag(pag, iter_fn,
+				private_data, flags);
+		if (ret == INO_ITER_ABORT) {
+			ret = 0;
+			break;
+		}
+		if (ret < 0)
+			break;
+	}
+	return ret;
+}
+
 /* XFS Inode Cache Walking Code */
 
 /*
@@ -1624,7 +1773,6 @@  xfs_blockgc_free_quota(
  */
 #define XFS_LOOKUP_BATCH	32
 
-
 /*
  * Decide if we want to grab this inode in anticipation of doing work towards
  * the goal.
@@ -1700,7 +1848,6 @@  xfs_icwalk_ag(
 		int		i;
 
 		rcu_read_lock();
-
 		nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
 				(void **) batch, first_index,
 				XFS_LOOKUP_BATCH, goal);
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 905944dafbe5..c2754ea28a88 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -18,6 +18,9 @@  struct xfs_icwalk {
 	long		icw_scan_limit;
 };
 
+int xfs_icwalk_vfs_inodes(struct xfs_mount *mp, ino_iter_fn iter_fn,
+		void *private_data, int flags);
+
 /* Flags that reflect xfs_fs_eofblocks functionality. */
 #define XFS_ICWALK_FLAG_SYNC		(1U << 0) /* sync/wait mode scan */
 #define XFS_ICWALK_FLAG_UID		(1U << 1) /* filter by uid */
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ee79cf161312..5375c17ed69c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1293,7 +1293,6 @@  xfs_setup_inode(
 	inode->i_ino = ip->i_ino;
 	inode->i_state |= I_NEW;
 
-	inode_sb_list_add(inode);
 	/* make the inode look hashed for the writeback code */
 	inode_fake_hash(inode);
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index fbb3a1594c0d..a2ef1b582066 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1179,6 +1179,16 @@  xfs_fs_shutdown(
 	xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED);
 }
 
+static int
+xfs_fs_iter_vfs_inodes(
+	struct super_block	*sb,
+	ino_iter_fn		iter_fn,
+	void			*private_data,
+	int			flags)
+{
+	return xfs_icwalk_vfs_inodes(XFS_M(sb), iter_fn, private_data, flags);
+}
+
 static const struct super_operations xfs_super_operations = {
 	.alloc_inode		= xfs_fs_alloc_inode,
 	.destroy_inode		= xfs_fs_destroy_inode,
@@ -1193,6 +1203,7 @@  static const struct super_operations xfs_super_operations = {
 	.nr_cached_objects	= xfs_fs_nr_cached_objects,
 	.free_cached_objects	= xfs_fs_free_cached_objects,
 	.shutdown		= xfs_fs_shutdown,
+	.iter_vfs_inodes	= xfs_fs_iter_vfs_inodes,
 };
 
 static int