[3/7] vfs: convert vfs inode iterators to super_iter_inodes_unsafe()

Message ID: 20241002014017.3801899-4-david@fromorbit.com
State: New
Series: vfs: improving inode cache iteration scalability

Commit Message

Dave Chinner Oct. 2, 2024, 1:33 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Convert the VFS internal superblock inode iterators that cannot use
referenced inodes to the new super_iter_inodes_unsafe() iterator.
Inode eviction needs this because it must only act on unreferenced
inodes and so cannot take new references whilst iterating. Dquot
teardown needs it because it must also process I_NEW inodes that
already have their quota pointers initialised. The
nr_blockdev_pages() statistics code needs it as well, as it is
called from si_meminfo() and so can potentially be run from
locations where arbitrary blocking is not allowed or desirable.

New cases using this iterator need careful consideration.
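
The conversions below assume the iterator interface added earlier in
this series looks roughly like the sketch here. The prototype is
inferred from the callers in this patch; the return type and the
constant values are assumptions, not taken from the series:

/*
 * Sketch only: prototype inferred from the callers below; the
 * return type and the constant values are placeholders.
 */
int super_iter_inodes_unsafe(struct super_block *sb,
		int (*iter_fn)(struct inode *inode, void *data),
		void *data);

#define INO_ITER_DONE	0	/* continue, or iteration completed */
#define INO_ITER_ABORT	1	/* stop the iteration immediately */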

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 block/bdev.c     | 24 +++++++++++----
 fs/inode.c       | 79 ++++++++++++++++++++++++++----------------------
 fs/quota/dquot.c | 72 ++++++++++++++++++++++++-------------------
 3 files changed, 102 insertions(+), 73 deletions(-)

Patch

diff --git a/block/bdev.c b/block/bdev.c
index 33f9c4605e3a..b5a362156ca1 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -472,16 +472,28 @@  void bdev_drop(struct block_device *bdev)
 	iput(BD_INODE(bdev));
 }
 
+static int bdev_pages_count(struct inode *inode, void *data)
+{
+	long	*pages = data;
+
+	*pages += inode->i_mapping->nrpages;
+	return INO_ITER_DONE;
+}
+
 long nr_blockdev_pages(void)
 {
-	struct inode *inode;
 	long ret = 0;
 
-	spin_lock(&blockdev_superblock->s_inode_list_lock);
-	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
-		ret += inode->i_mapping->nrpages;
-	spin_unlock(&blockdev_superblock->s_inode_list_lock);
-
+	/*
+	 * We can be called from contexts where blocking is not
+	 * desirable. The count is advisory at best, and we only
+	 * need to access the inode mapping. Hence as long as we
+	 * have an inode existence guarantee, we can safely count
+	 * the cached pages on each inode without needing reference
+	 * counted inodes.
+	 */
+	super_iter_inodes_unsafe(blockdev_superblock,
+			bdev_pages_count, &ret);
 	return ret;
 }
 
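A hypothetical further user following the same pattern (illustrative
only; bdev_dirty_count() and nr_blockdev_dirty() are made-up names,
not part of this patch). The callback must not block or take an
inode reference, since it only has an existence guarantee; racy,
lockless reads are fine here because the result is advisory:

/*
 * Hypothetical example: advisory count of dirty blockdev inodes,
 * mirroring bdev_pages_count() above. The lockless i_state read is
 * racy but acceptable for an advisory statistic.
 */
static int bdev_dirty_count(struct inode *inode, void *data)
{
	long	*dirty = data;

	if (inode->i_state & I_DIRTY)
		(*dirty)++;
	return INO_ITER_DONE;
}

static long nr_blockdev_dirty(void)
{
	long ret = 0;

	super_iter_inodes_unsafe(blockdev_superblock,
			bdev_dirty_count, &ret);
	return ret;
}
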
diff --git a/fs/inode.c b/fs/inode.c
index 0a53d8c34203..3f335f78c5b2 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -761,8 +761,11 @@  static void evict(struct inode *inode)
  * Dispose-list gets a local list with local inodes in it, so it doesn't
  * need to worry about list corruption and SMP locks.
  */
-static void dispose_list(struct list_head *head)
+static bool dispose_list(struct list_head *head)
 {
+	if (list_empty(head))
+		return false;
+
 	while (!list_empty(head)) {
 		struct inode *inode;
 
@@ -772,6 +775,7 @@  static void dispose_list(struct list_head *head)
 		evict(inode);
 		cond_resched();
 	}
+	return true;
 }
 
 /**
@@ -783,47 +787,50 @@  static void dispose_list(struct list_head *head)
  * so any inode reaching zero refcount during or after that call will
  * be immediately evicted.
  */
+static int evict_inode_fn(struct inode *inode, void *data)
+{
+	struct list_head *dispose = data;
+
+	spin_lock(&inode->i_lock);
+	if (atomic_read(&inode->i_count) ||
+	    (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))) {
+		spin_unlock(&inode->i_lock);
+		return INO_ITER_DONE;
+	}
+
+	inode->i_state |= I_FREEING;
+	inode_lru_list_del(inode);
+	spin_unlock(&inode->i_lock);
+	list_add(&inode->i_lru, dispose);
+
+	/*
+	 * If we've run long enough to need rescheduling, abort the
+	 * iteration so we can return to evict_inodes() and dispose of the
+	 * inodes before collecting more inodes to evict.
+	 */
+	if (need_resched())
+		return INO_ITER_ABORT;
+	return INO_ITER_DONE;
+}
+
 void evict_inodes(struct super_block *sb)
 {
-	struct inode *inode, *next;
 	LIST_HEAD(dispose);
 
-again:
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-		if (atomic_read(&inode->i_count))
-			continue;
-
-		spin_lock(&inode->i_lock);
-		if (atomic_read(&inode->i_count)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-
-		inode->i_state |= I_FREEING;
-		inode_lru_list_del(inode);
-		spin_unlock(&inode->i_lock);
-		list_add(&inode->i_lru, &dispose);
-
+	do {
 		/*
-		 * We can have a ton of inodes to evict at unmount time given
-		 * enough memory, check to see if we need to go to sleep for a
-		 * bit so we don't livelock.
+		 * We do not want to take references to inodes whilst iterating
+		 * because we are trying to evict unreferenced inodes from
+		 * the cache. Hence we need to use the unsafe iteration
+		 * mechanism and do all the required inode validity checks in
+		 * evict_inode_fn() to safely queue unreferenced inodes for
+		 * eviction.
+		 *
+		 * We repeat the iteration until it doesn't find any more
+		 * inodes to dispose of.
 		 */
-		if (need_resched()) {
-			spin_unlock(&sb->s_inode_list_lock);
-			cond_resched();
-			dispose_list(&dispose);
-			goto again;
-		}
-	}
-	spin_unlock(&sb->s_inode_list_lock);
-
-	dispose_list(&dispose);
+		super_iter_inodes_unsafe(sb, evict_inode_fn, &dispose);
+	} while (dispose_list(&dispose));
 }
 EXPORT_SYMBOL_GPL(evict_inodes);
 
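As the commit message notes, new cases using this iterator need
careful consideration. A rough template for a prospective callback,
distilled from evict_inode_fn() above (example_iter_fn() is a
hypothetical name, not part of this patch):

/*
 * Hypothetical template: an unsafe-iterator callback only gets an
 * existence guarantee, so it revalidates inode state under i_lock
 * before acting, never blocks, and yields with INO_ITER_ABORT when
 * it has run long enough to need rescheduling.
 */
static int example_iter_fn(struct inode *inode, void *data)
{
	spin_lock(&inode->i_lock);
	if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
		/* Inode is being set up or torn down; skip it. */
		spin_unlock(&inode->i_lock);
		return INO_ITER_DONE;
	}
	/* ... inspect or modify state covered by i_lock here ... */
	spin_unlock(&inode->i_lock);

	if (need_resched())
		return INO_ITER_ABORT;
	return INO_ITER_DONE;
}
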
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index b40410cd39af..ea0bd807fed7 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1075,41 +1075,51 @@  static int add_dquot_ref(struct super_block *sb, int type)
 	return err;
 }
 
+struct dquot_ref_data {
+	int	type;
+	int	reserved;
+};
+
+static int remove_dquot_ref_fn(struct inode *inode, void *data)
+{
+	struct dquot_ref_data *ref = data;
+
+	spin_lock(&dq_data_lock);
+	if (!IS_NOQUOTA(inode)) {
+		struct dquot __rcu **dquots = i_dquot(inode);
+		struct dquot *dquot = srcu_dereference_check(
+			dquots[ref->type], &dquot_srcu,
+			lockdep_is_held(&dq_data_lock));
+
+#ifdef CONFIG_QUOTA_DEBUG
+		if (unlikely(inode_get_rsv_space(inode) > 0))
+			ref->reserved++;
+#endif
+		rcu_assign_pointer(dquots[ref->type], NULL);
+		if (dquot)
+			dqput(dquot);
+	}
+	spin_unlock(&dq_data_lock);
+	return INO_ITER_DONE;
+}
+
 static void remove_dquot_ref(struct super_block *sb, int type)
 {
-	struct inode *inode;
-#ifdef CONFIG_QUOTA_DEBUG
-	int reserved = 0;
-#endif
-
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		/*
-		 *  We have to scan also I_NEW inodes because they can already
-		 *  have quota pointer initialized. Luckily, we need to touch
-		 *  only quota pointers and these have separate locking
-		 *  (dq_data_lock).
-		 */
-		spin_lock(&dq_data_lock);
-		if (!IS_NOQUOTA(inode)) {
-			struct dquot __rcu **dquots = i_dquot(inode);
-			struct dquot *dquot = srcu_dereference_check(
-				dquots[type], &dquot_srcu,
-				lockdep_is_held(&dq_data_lock));
+	struct dquot_ref_data ref = {
+		.type = type,
+	};
 
+	/*
+	 * We have to scan I_NEW inodes because they can already
+	 * have quota pointer initialized. Luckily, we need to touch
+	 * only quota pointers and these have separate locking
+	 * (dq_data_lock), so the existence guarantee that
+	 * super_iter_inodes_unsafe() provides for inodes passed to
+	 * remove_dquot_ref_fn() is sufficient for this operation.
+	 */
+	super_iter_inodes_unsafe(sb, remove_dquot_ref_fn, &ref);
 #ifdef CONFIG_QUOTA_DEBUG
-			if (unlikely(inode_get_rsv_space(inode) > 0))
-				reserved = 1;
-#endif
-			rcu_assign_pointer(dquots[type], NULL);
-			if (dquot)
-				dqput(dquot);
-		}
-		spin_unlock(&dq_data_lock);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
-#ifdef CONFIG_QUOTA_DEBUG
-	if (reserved) {
+	if (ref.reserved) {
 		printk(KERN_WARNING "VFS (%s): Writes happened after quota"
 			" was disabled thus quota information is probably "
 			"inconsistent. Please run quotacheck(8).\n", sb->s_id);