[4/7] vfs: Convert sb->s_inodes iteration to super_iter_inodes()

Message ID 20241002014017.3801899-5-david@fromorbit.com
State New
Series vfs: improving inode cache iteration scalability

Commit Message

Dave Chinner Oct. 2, 2024, 1:33 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Convert all the remaining superblock inode iterators to use
super_iter_inodes(). These are mostly straightforward conversions
for the iterations that use references, and the bdev use cases that
didn't even validate the inode before dereferencing it are now
inherently safe.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
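For reviewers jumping in at this patch: the conversions below rely on
the iterator API introduced earlier in this series.  What follows is a
rough sketch of its shape, inferred from the call sites in this patch
rather than quoted from the earlier patches -- the INO_ITER_DONE and
INO_ITER_REFERENCED values, the ino_iter_fn typedef name, and the exact
set of inode states filtered are assumptions, not the real definitions.

/*
 * Assumed shape of the iteration API (see the earlier patches in this
 * series for the real definitions; values here are illustrative).
 */
#define INO_ITER_DONE		0	/* continue the walk */
					/* negative errno: abort the walk */

/* Only iterate inodes that already hold a reference (i_count > 0),
 * as used by fsnotify_sb_delete() below. */
#define INO_ITER_REFERENCED	(1 << 0)

/* Typedef name is illustrative; every callback in this patch has
 * this signature. */
typedef int (*ino_iter_fn)(struct inode *inode, void *data);

/*
 * Walk the inodes of @sb, calling @fn on each with an inode reference
 * held and no locks held, so @fn may block.  Inodes being freed or not
 * yet initialised (I_FREEING, I_WILL_FREE, I_NEW) are presumably
 * filtered internally, which is why the open-coded i_state checks
 * disappear from every caller converted below.  Returns 0 on a
 * complete walk, or the first negative errno returned by @fn.
 */
int super_iter_inodes(struct super_block *sb, ino_iter_fn fn,
		      void *data, unsigned int flags);

With that shape in mind, each conversion reduces to a small callback
plus a one-line super_iter_inodes() call, as sync_bdevs() shows first.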
 block/bdev.c           |  76 ++++++++--------------
 fs/drop_caches.c       |  38 ++++-------
 fs/gfs2/ops_fstype.c   |  67 ++++++-------------
 fs/notify/fsnotify.c   |  75 ++++++---------------
 fs/quota/dquot.c       |  79 +++++++++--------------
 security/landlock/fs.c | 143 ++++++++++++++---------------------------
 6 files changed, 154 insertions(+), 324 deletions(-)

Patch

diff --git a/block/bdev.c b/block/bdev.c
index b5a362156ca1..5f720e12f731 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -1226,56 +1226,36 @@  void bdev_mark_dead(struct block_device *bdev, bool surprise)
  */
 EXPORT_SYMBOL_GPL(bdev_mark_dead);
 
+static int sync_bdev_fn(struct inode *inode, void *data)
+{
+	struct block_device *bdev;
+	bool wait = *(bool *)data;
+
+	if (inode->i_mapping->nrpages == 0)
+		return INO_ITER_DONE;
+
+	bdev = I_BDEV(inode);
+	mutex_lock(&bdev->bd_disk->open_mutex);
+	if (!atomic_read(&bdev->bd_openers)) {
+		; /* skip */
+	} else if (wait) {
+		/*
+		 * We keep the error status of individual mapping so
+		 * that applications can catch the writeback error using
+		 * fsync(2). See filemap_fdatawait_keep_errors() for
+		 * details.
+		 */
+		filemap_fdatawait_keep_errors(inode->i_mapping);
+	} else {
+		filemap_fdatawrite(inode->i_mapping);
+	}
+	mutex_unlock(&bdev->bd_disk->open_mutex);
+	return INO_ITER_DONE;
+}
+
 void sync_bdevs(bool wait)
 {
-	struct inode *inode, *old_inode = NULL;
-
-	spin_lock(&blockdev_superblock->s_inode_list_lock);
-	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
-		struct address_space *mapping = inode->i_mapping;
-		struct block_device *bdev;
-
-		spin_lock(&inode->i_lock);
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
-		    mapping->nrpages == 0) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		__iget(inode);
-		spin_unlock(&inode->i_lock);
-		spin_unlock(&blockdev_superblock->s_inode_list_lock);
-		/*
-		 * We hold a reference to 'inode' so it couldn't have been
-		 * removed from s_inodes list while we dropped the
-		 * s_inode_list_lock  We cannot iput the inode now as we can
-		 * be holding the last reference and we cannot iput it under
-		 * s_inode_list_lock. So we keep the reference and iput it
-		 * later.
-		 */
-		iput(old_inode);
-		old_inode = inode;
-		bdev = I_BDEV(inode);
-
-		mutex_lock(&bdev->bd_disk->open_mutex);
-		if (!atomic_read(&bdev->bd_openers)) {
-			; /* skip */
-		} else if (wait) {
-			/*
-			 * We keep the error status of individual mapping so
-			 * that applications can catch the writeback error using
-			 * fsync(2). See filemap_fdatawait_keep_errors() for
-			 * details.
-			 */
-			filemap_fdatawait_keep_errors(inode->i_mapping);
-		} else {
-			filemap_fdatawrite(inode->i_mapping);
-		}
-		mutex_unlock(&bdev->bd_disk->open_mutex);
-
-		spin_lock(&blockdev_superblock->s_inode_list_lock);
-	}
-	spin_unlock(&blockdev_superblock->s_inode_list_lock);
-	iput(old_inode);
+	super_iter_inodes(blockdev_superblock, sync_bdev_fn, &wait, 0);
 }
 
 /*
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index d45ef541d848..901cda15537f 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -16,36 +16,20 @@ 
 /* A global variable is a bit ugly, but it keeps the code simple */
 int sysctl_drop_caches;
 
-static void drop_pagecache_sb(struct super_block *sb, void *unused)
+static int invalidate_inode_fn(struct inode *inode, void *data)
 {
-	struct inode *inode, *toput_inode = NULL;
-
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		spin_lock(&inode->i_lock);
-		/*
-		 * We must skip inodes in unusual state. We may also skip
-		 * inodes without pages but we deliberately won't in case
-		 * we need to reschedule to avoid softlockups.
-		 */
-		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
-		    (mapping_empty(inode->i_mapping) && !need_resched())) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		__iget(inode);
-		spin_unlock(&inode->i_lock);
-		spin_unlock(&sb->s_inode_list_lock);
-
+	if (!mapping_empty(inode->i_mapping))
 		invalidate_mapping_pages(inode->i_mapping, 0, -1);
-		iput(toput_inode);
-		toput_inode = inode;
+	return INO_ITER_DONE;
+}
 
-		cond_resched();
-		spin_lock(&sb->s_inode_list_lock);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
-	iput(toput_inode);
+/*
+ * Note: it would be nice to check mapping_empty() before we get a reference on
+ * the inode in super_iter_inodes(), but that's a future optimisation.
+ */
+static void drop_pagecache_sb(struct super_block *sb, void *unused)
+{
+	super_iter_inodes(sb, invalidate_inode_fn, NULL, 0);
 }
 
 int drop_caches_sysctl_handler(const struct ctl_table *table, int write,
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index e83d293c3614..f20862614ad6 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1714,53 +1714,10 @@  static int gfs2_meta_init_fs_context(struct fs_context *fc)
 	return 0;
 }
 
-/**
- * gfs2_evict_inodes - evict inodes cooperatively
- * @sb: the superblock
- *
- * When evicting an inode with a zero link count, we are trying to upgrade the
- * inode's iopen glock from SH to EX mode in order to determine if we can
- * delete the inode.  The other nodes are supposed to evict the inode from
- * their caches if they can, and to poke the inode's inode glock if they cannot
- * do so.  Either behavior allows gfs2_upgrade_iopen_glock() to proceed
- * quickly, but if the other nodes are not cooperating, the lock upgrading
- * attempt will time out.  Since inodes are evicted sequentially, this can add
- * up quickly.
- *
- * Function evict_inodes() tries to keep the s_inode_list_lock list locked over
- * a long time, which prevents other inodes from being evicted concurrently.
- * This precludes the cooperative behavior we are looking for.  This special
- * version of evict_inodes() avoids that.
- *
- * Modeled after drop_pagecache_sb().
- */
-static void gfs2_evict_inodes(struct super_block *sb)
+/* Nothing to do because we just want to bounce the inode through iput() */
+static int gfs2_evict_inode_fn(struct inode *inode, void *data)
 {
-	struct inode *inode, *toput_inode = NULL;
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-
-	set_bit(SDF_EVICTING, &sdp->sd_flags);
-
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		spin_lock(&inode->i_lock);
-		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) &&
-		    !need_resched()) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		atomic_inc(&inode->i_count);
-		spin_unlock(&inode->i_lock);
-		spin_unlock(&sb->s_inode_list_lock);
-
-		iput(toput_inode);
-		toput_inode = inode;
-
-		cond_resched();
-		spin_lock(&sb->s_inode_list_lock);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
-	iput(toput_inode);
+	return INO_ITER_DONE;
 }
 
 static void gfs2_kill_sb(struct super_block *sb)
@@ -1779,7 +1736,23 @@  static void gfs2_kill_sb(struct super_block *sb)
 	sdp->sd_master_dir = NULL;
 	shrink_dcache_sb(sb);
 
-	gfs2_evict_inodes(sb);
+	/*
+	 * When evicting an inode with a zero link count, we are trying to
+	 * upgrade the inode's iopen glock from SH to EX mode in order to
+	 * determine if we can delete the inode.  The other nodes are supposed
+	 * to evict the inode from their caches if they can, and to poke the
+	 * inode's inode glock if they cannot do so.  Either behavior allows
+	 * gfs2_upgrade_iopen_glock() to proceed quickly, but if the other nodes
+	 * are not cooperating, the lock upgrading attempt will time out.  Since
+	 * inodes are evicted sequentially, this can add up quickly.
+	 *
+	 * evict_inodes() tries to keep s_inode_list_lock held over a long
+	 * time, which prevents other inodes from being evicted
+	 * concurrently.  This precludes the cooperative behavior we are looking
+	 * for.
+	 */
+	set_bit(SDF_EVICTING, &sdp->sd_flags);
+	super_iter_inodes(sb, gfs2_evict_inode_fn, NULL, 0);
 
 	/*
 	 * Flush and then drain the delete workqueue here (via
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 272c8a1dab3c..68c34ed94271 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -28,63 +28,14 @@  void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
 	fsnotify_clear_marks_by_mount(mnt);
 }
 
-/**
- * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
- * @sb: superblock being unmounted.
- *
- * Called during unmount with no locks held, so needs to be safe against
- * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
- */
-static void fsnotify_unmount_inodes(struct super_block *sb)
+static int fsnotify_unmount_inode_fn(struct inode *inode, void *data)
 {
-	struct inode *inode, *iput_inode = NULL;
-
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		/*
-		 * We cannot __iget() an inode in state I_FREEING,
-		 * I_WILL_FREE, or I_NEW which is fine because by that point
-		 * the inode cannot have any associated watches.
-		 */
-		spin_lock(&inode->i_lock);
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-
-		/*
-		 * If i_count is zero, the inode cannot have any watches and
-		 * doing an __iget/iput with SB_ACTIVE clear would actually
-		 * evict all inodes with zero i_count from icache which is
-		 * unnecessarily violent and may in fact be illegal to do.
-		 * However, we should have been called /after/ evict_inodes
-		 * removed all zero refcount inodes, in any case.  Test to
-		 * be sure.
-		 */
-		if (!atomic_read(&inode->i_count)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-
-		__iget(inode);
-		spin_unlock(&inode->i_lock);
-		spin_unlock(&sb->s_inode_list_lock);
-
-		iput(iput_inode);
-
-		/* for each watch, send FS_UNMOUNT and then remove it */
-		fsnotify_inode(inode, FS_UNMOUNT);
-
-		fsnotify_inode_delete(inode);
-
-		iput_inode = inode;
-
-		cond_resched();
-		spin_lock(&sb->s_inode_list_lock);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
-
-	iput(iput_inode);
+	/* for each watch, send FS_UNMOUNT and then remove it */
+	fsnotify_inode(inode, FS_UNMOUNT);
+	fsnotify_inode_delete(inode);
+	return INO_ITER_DONE;
 }
 
 void fsnotify_sb_delete(struct super_block *sb)
@@ -95,7 +46,19 @@  void fsnotify_sb_delete(struct super_block *sb)
 	if (!sbinfo)
 		return;
 
-	fsnotify_unmount_inodes(sb);
+	/*
+	 * If i_count is zero, the inode cannot have any watches and
+	 * doing an __iget/iput with SB_ACTIVE clear would actually
+	 * evict all inodes with zero i_count from icache which is
+	 * unnecessarily violent and may in fact be illegal to do.
+	 * However, we should have been called /after/ evict_inodes
+	 * removed all zero refcount inodes, in any case. Hence we use
+	 * INO_ITER_REFERENCED to ensure zero refcount inodes are filtered
+	 * properly.
+	 */
+	super_iter_inodes(sb, fsnotify_unmount_inode_fn, NULL,
+			INO_ITER_REFERENCED);
+
 	fsnotify_clear_marks_by_sb(sb);
 	/* Wait for outstanding object references from connectors */
 	wait_var_event(fsnotify_sb_watched_objects(sb),
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index ea0bd807fed7..ea9fce7acd1b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1017,56 +1017,40 @@  static int dqinit_needed(struct inode *inode, int type)
 	return 0;
 }
 
+struct dquot_ref_data {
+	int	type;
+	int	reserved;
+};
+
+static int add_dquot_ref_fn(struct inode *inode, void *data)
+{
+	struct dquot_ref_data *ref = data;
+	int ret;
+
+	if (!dqinit_needed(inode, ref->type))
+		return INO_ITER_DONE;
+
+#ifdef CONFIG_QUOTA_DEBUG
+	if (unlikely(inode_get_rsv_space(inode) > 0))
+		ref->reserved++;
+#endif
+	ret = __dquot_initialize(inode, ref->type);
+	if (ret < 0)
+		return ret;
+	return INO_ITER_DONE;
+}
+
 /* This routine is guarded by s_umount semaphore */
 static int add_dquot_ref(struct super_block *sb, int type)
 {
-	struct inode *inode, *old_inode = NULL;
-#ifdef CONFIG_QUOTA_DEBUG
-	int reserved = 0;
-#endif
-	int err = 0;
-
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		spin_lock(&inode->i_lock);
-		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
-		    !atomic_read(&inode->i_writecount) ||
-		    !dqinit_needed(inode, type)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		__iget(inode);
-		spin_unlock(&inode->i_lock);
-		spin_unlock(&sb->s_inode_list_lock);
-
-#ifdef CONFIG_QUOTA_DEBUG
-		if (unlikely(inode_get_rsv_space(inode) > 0))
-			reserved = 1;
-#endif
-		iput(old_inode);
-		err = __dquot_initialize(inode, type);
-		if (err) {
-			iput(inode);
-			goto out;
-		}
+	struct dquot_ref_data ref = {
+		.type = type,
+	};
+	int err;
 
-		/*
-		 * We hold a reference to 'inode' so it couldn't have been
-		 * removed from s_inodes list while we dropped the
-		 * s_inode_list_lock. We cannot iput the inode now as we can be
-		 * holding the last reference and we cannot iput it under
-		 * s_inode_list_lock. So we keep the reference and iput it
-		 * later.
-		 */
-		old_inode = inode;
-		cond_resched();
-		spin_lock(&sb->s_inode_list_lock);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
-	iput(old_inode);
-out:
+	err = super_iter_inodes(sb, add_dquot_ref_fn, &ref, 0);
 #ifdef CONFIG_QUOTA_DEBUG
-	if (reserved) {
+	if (ref.reserved) {
 		quota_error(sb, "Writes happened before quota was turned on "
 			"thus quota information is probably inconsistent. "
 			"Please run quotacheck(8)");
@@ -1075,11 +1059,6 @@  static int add_dquot_ref(struct super_block *sb, int type)
 	return err;
 }
 
-struct dquot_ref_data {
-	int	type;
-	int	reserved;
-};
-
 static int remove_dquot_ref_fn(struct inode *inode, void *data)
 {
 	struct dquot_ref_data *ref = data;
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index 7d79fc8abe21..013ec4017ddd 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -1223,109 +1223,60 @@  static void hook_inode_free_security_rcu(void *inode_security)
 
 /*
  * Release the inodes used in a security policy.
- *
- * Cf. fsnotify_unmount_inodes() and invalidate_inodes()
  */
+static int release_inode_fn(struct inode *inode, void *data)
+{
+	struct landlock_object *object;
+
+	rcu_read_lock();
+	object = rcu_dereference(landlock_inode(inode)->object);
+	if (!object) {
+		rcu_read_unlock();
+		return INO_ITER_DONE;
+	}
+
+	/*
+	 * If there is no concurrent release_inode() ongoing, then we
+	 * are in charge of calling iput() on this inode, otherwise we
+	 * will just wait for it to finish.
+	 */
+	spin_lock(&object->lock);
+	if (object->underobj != inode) {
+		spin_unlock(&object->lock);
+		rcu_read_unlock();
+		return INO_ITER_DONE;
+	}
+
+	object->underobj = NULL;
+	spin_unlock(&object->lock);
+	rcu_read_unlock();
+
+	/*
+	 * Because object->underobj was not NULL, release_inode() and
+	 * get_inode_object() guarantee that it is safe to reset
+	 * landlock_inode(inode)->object while it is not NULL.  It is therefore
+	 * not necessary to lock inode->i_lock.
+	 */
+	rcu_assign_pointer(landlock_inode(inode)->object, NULL);
+
+	/*
+	 * At this point, we own the ihold() reference that was originally set
+	 * up by get_inode_object() as well as the reference the inode iterator
+	 * obtained before calling us.  Therefore the following call to iput()
+	 * will not sleep nor drop the inode because there are now at least two
+	 * references to it.
+	 */
+	iput(inode);
+	return INO_ITER_DONE;
+}
+
 static void hook_sb_delete(struct super_block *const sb)
 {
-	struct inode *inode, *prev_inode = NULL;
-
 	if (!landlock_initialized)
 		return;
 
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		struct landlock_object *object;
+	super_iter_inodes(sb, release_inode_fn, NULL, 0);
 
-		/* Only handles referenced inodes. */
-		if (!atomic_read(&inode->i_count))
-			continue;
-
-		/*
-		 * Protects against concurrent modification of inode (e.g.
-		 * from get_inode_object()).
-		 */
-		spin_lock(&inode->i_lock);
-		/*
-		 * Checks I_FREEING and I_WILL_FREE  to protect against a race
-		 * condition when release_inode() just called iput(), which
-		 * could lead to a NULL dereference of inode->security or a
-		 * second call to iput() for the same Landlock object.  Also
-		 * checks I_NEW because such inode cannot be tied to an object.
-		 */
-		if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-
-		rcu_read_lock();
-		object = rcu_dereference(landlock_inode(inode)->object);
-		if (!object) {
-			rcu_read_unlock();
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		/* Keeps a reference to this inode until the next loop walk. */
-		__iget(inode);
-		spin_unlock(&inode->i_lock);
-
-		/*
-		 * If there is no concurrent release_inode() ongoing, then we
-		 * are in charge of calling iput() on this inode, otherwise we
-		 * will just wait for it to finish.
-		 */
-		spin_lock(&object->lock);
-		if (object->underobj == inode) {
-			object->underobj = NULL;
-			spin_unlock(&object->lock);
-			rcu_read_unlock();
-
-			/*
-			 * Because object->underobj was not NULL,
-			 * release_inode() and get_inode_object() guarantee
-			 * that it is safe to reset
-			 * landlock_inode(inode)->object while it is not NULL.
-			 * It is therefore not necessary to lock inode->i_lock.
-			 */
-			rcu_assign_pointer(landlock_inode(inode)->object, NULL);
-			/*
-			 * At this point, we own the ihold() reference that was
-			 * originally set up by get_inode_object() and the
-			 * __iget() reference that we just set in this loop
-			 * walk.  Therefore the following call to iput() will
-			 * not sleep nor drop the inode because there is now at
-			 * least two references to it.
-			 */
-			iput(inode);
-		} else {
-			spin_unlock(&object->lock);
-			rcu_read_unlock();
-		}
-
-		if (prev_inode) {
-			/*
-			 * At this point, we still own the __iget() reference
-			 * that we just set in this loop walk.  Therefore we
-			 * can drop the list lock and know that the inode won't
-			 * disappear from under us until the next loop walk.
-			 */
-			spin_unlock(&sb->s_inode_list_lock);
-			/*
-			 * We can now actually put the inode reference from the
-			 * previous loop walk, which is not needed anymore.
-			 */
-			iput(prev_inode);
-			cond_resched();
-			spin_lock(&sb->s_inode_list_lock);
-		}
-		prev_inode = inode;
-	}
-	spin_unlock(&sb->s_inode_list_lock);
-
-	/* Puts the inode reference from the last loop walk, if any. */
-	if (prev_inode)
-		iput(prev_inode);
-	/* Waits for pending iput() in release_inode(). */
+	/* Waits for pending iput()s in release_inode(). */
 	wait_var_event(&landlock_superblock(sb)->inode_refs,
 		       !atomic_long_read(&landlock_superblock(sb)->inode_refs));
 }