[03/11] vfs: Use dlock list for superblock's inode list

Message ID	20231206060629.2827226-4-david@fromorbit.com (mailing list archive)
State	New, archived
Headers	show Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=fromorbit-com.20230601.gappssmtp.com header.i=@fromorbit-com.20230601.gappssmtp.com header.b="SvdV/jF9" From: Dave Chinner <david@fromorbit.com> To: linux-fsdevel@vger.kernel.org Cc: linux-block@vger.kernel.org, linux-cachefs@redhat.com, dhowells@redhat.com, gfs2@lists.linux.dev, dm-devel@lists.linux.dev, linux-security-module@vger.kernel.org, selinux@vger.kernel.org, linux-kernel@vger.kernel.org Subject: [PATCH 03/11] vfs: Use dlock list for superblock's inode list Date: Wed, 6 Dec 2023 17:05:32 +1100 Message-ID: <20231206060629.2827226-4-david@fromorbit.com> In-Reply-To: <20231206060629.2827226-1-david@fromorbit.com> References: <20231206060629.2827226-1-david@fromorbit.com> Precedence: bulk MIME-Version: 1.0 Content-Transfer-Encoding: 8bit
Series	vfs: inode cache scalability improvements \| expand [0/11] vfs: inode cache scalability improvements [01/11] lib/dlock-list: Distributed and lock-protected lists [02/11] vfs: Remove unnecessary list_for_each_entry_safe() variants [03/11] vfs: Use dlock list for superblock's inode list [04/11] lib/dlock-list: Make sibling CPUs share the same linked list [05/11] selinux: use dlist for isec inode list [06/11] vfs: factor out inode hash head calculation [07/11] hlist-bl: add hlist_bl_fake() [08/11] vfs: inode cache conversion to hash-bl [09/11] hash-bl: explicitly initialise hash-bl heads [10/11] list_bl: don't use bit locks for PREEMPT_RT or lockdep [11/11] hlist-bl: introduced nested locking for dm-snap

diff --git a/block/bdev.c b/block/bdev.c index 750aec178b6a..07135fd6fda4 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -437,11 +437,11 @@ long nr_blockdev_pages(void) { struct inode *inode; long ret = 0; + DEFINE_DLOCK_LIST_ITER(iter, &blockdev_superblock->s_inodes); - spin_lock(&blockdev_superblock->s_inode_list_lock); - list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) + dlist_for_each_entry(inode, &iter, i_sb_list) { ret += inode->i_mapping->nrpages; - spin_unlock(&blockdev_superblock->s_inode_list_lock); + } return ret; } @@ -1032,9 +1032,9 @@ EXPORT_SYMBOL_GPL(bdev_mark_dead); void sync_bdevs(bool wait) { struct inode *inode, *old_inode = NULL; + DEFINE_DLOCK_LIST_ITER(iter, &blockdev_superblock->s_inodes); - spin_lock(&blockdev_superblock->s_inode_list_lock); - list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { + dlist_for_each_entry(inode, &iter, i_sb_list) { struct address_space *mapping = inode->i_mapping; struct block_device *bdev; @@ -1046,15 +1046,8 @@ void sync_bdevs(bool wait) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&blockdev_superblock->s_inode_list_lock); - /* - * We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the - * s_inode_list_lock We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it under - * s_inode_list_lock. So we keep the reference and iput it - * later. - */ + dlock_list_unlock(&iter); + iput(old_inode); old_inode = inode; bdev = I_BDEV(inode); @@ -1075,9 +1068,8 @@ void sync_bdevs(bool wait) } mutex_unlock(&bdev->bd_disk->open_mutex); - spin_lock(&blockdev_superblock->s_inode_list_lock); + dlock_list_relock(&iter); } - spin_unlock(&blockdev_superblock->s_inode_list_lock); iput(old_inode); } diff --git a/fs/drop_caches.c b/fs/drop_caches.c index b9575957a7c2..3596d0a7c0da 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -19,9 +19,9 @@ int sysctl_drop_caches; static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; + DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + dlist_for_each_entry(inode, &iter, i_sb_list) { spin_lock(&inode->i_lock); /* * We must skip inodes in unusual state. We may also skip @@ -35,16 +35,15 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + dlock_list_unlock(&iter); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; cond_resched(); - spin_lock(&sb->s_inode_list_lock); + dlock_list_relock(&iter); } - spin_unlock(&sb->s_inode_list_lock); iput(toput_inode); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b108c5d26839..1105710482e7 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1738,22 +1738,24 @@ static int gfs2_meta_init_fs_context(struct fs_context *fc) * attempt will time out. Since inodes are evicted sequentially, this can add * up quickly. * - * Function evict_inodes() tries to keep the s_inode_list_lock list locked over - * a long time, which prevents other inodes from being evicted concurrently. - * This precludes the cooperative behavior we are looking for. This special - * version of evict_inodes() avoids that. - * * Modeled after drop_pagecache_sb(). + * + * XXX(dgc): this is particularly awful. With the dlist for inodes, concurrent + * access to the inode list can occur and evict_inodes() will drop the per-cpu + * list lock if the CPU needs rescheduling. Hence if this exists just because + * evict_inodes() holds the s_inode_list_lock for long periods preventing + * concurrent inode eviction work from being done, this can probably go away + * entirely now. */ static void gfs2_evict_inodes(struct super_block *sb) { struct inode *inode, *toput_inode = NULL; struct gfs2_sbd *sdp = sb->s_fs_info; + DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes); set_bit(SDF_EVICTING, &sdp->sd_flags); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + dlist_for_each_entry(inode, &iter, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) && !need_resched()) { @@ -1762,15 +1764,14 @@ static void gfs2_evict_inodes(struct super_block *sb) } atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + dlock_list_unlock(&iter); iput(toput_inode); toput_inode = inode; cond_resched(); - spin_lock(&sb->s_inode_list_lock); + dlock_list_relock(&iter); } - spin_unlock(&sb->s_inode_list_lock); iput(toput_inode); } diff --git a/fs/inode.c b/fs/inode.c index 17c50a75514f..3426691fa305 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -30,7 +30,7 @@ * inode->i_state, inode->i_hash, __iget(), inode->i_io_list * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru - * inode->i_sb->s_inode_list_lock protects: + * inode->i_sb->s_inodes->head->lock protects: * inode->i_sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list @@ -39,7 +39,7 @@ * * Lock ordering: * - * inode->i_sb->s_inode_list_lock + * inode->i_sb->s_inodes->head->lock * inode->i_lock * Inode LRU list locks * @@ -47,7 +47,7 @@ * inode->i_lock * * inode_hash_lock - * inode->i_sb->s_inode_list_lock + * inode->i_sb->s_inodes->head->lock * inode->i_lock * * iunique_lock @@ -423,7 +423,7 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); - INIT_LIST_HEAD(&inode->i_sb_list); + init_dlock_list_node(&inode->i_sb_list); __address_space_init_once(&inode->i_data); i_size_ordered_init(inode); } @@ -492,19 +492,14 @@ static void inode_lru_list_del(struct inode *inode) */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_list_lock); - list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); - spin_unlock(&inode->i_sb->s_inode_list_lock); + dlock_lists_add(&inode->i_sb_list, &inode->i_sb->s_inodes); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { - if (!list_empty(&inode->i_sb_list)) { - spin_lock(&inode->i_sb->s_inode_list_lock); - list_del_init(&inode->i_sb_list); - spin_unlock(&inode->i_sb->s_inode_list_lock); - } + if (!list_empty(&inode->i_sb_list.list)) + dlock_lists_del(&inode->i_sb_list); } static unsigned long hash(struct super_block *sb, unsigned long hashval) @@ -713,11 +708,12 @@ static void dispose_list(struct list_head *head) void evict_inodes(struct super_block *sb) { struct inode *inode; + struct dlock_list_iter iter; LIST_HEAD(dispose); again: - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + init_dlock_list_iter(&iter, &sb->s_inodes); + dlist_for_each_entry(inode, &iter, i_sb_list) { if (atomic_read(&inode->i_count)) continue; @@ -738,13 +734,12 @@ void evict_inodes(struct super_block *sb) * bit so we don't livelock. */ if (need_resched()) { - spin_unlock(&sb->s_inode_list_lock); + dlock_list_unlock(&iter); cond_resched(); dispose_list(&dispose); goto again; } } - spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } @@ -759,11 +754,12 @@ EXPORT_SYMBOL_GPL(evict_inodes); void invalidate_inodes(struct super_block *sb) { struct inode *inode; + struct dlock_list_iter iter; LIST_HEAD(dispose); again: - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + init_dlock_list_iter(&iter, &sb->s_inodes); + dlist_for_each_entry(inode, &iter, i_sb_list) { spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); @@ -779,13 +775,12 @@ void invalidate_inodes(struct super_block *sb) spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); if (need_resched()) { - spin_unlock(&sb->s_inode_list_lock); + dlock_list_unlock(&iter); cond_resched(); dispose_list(&dispose); goto again; } } - spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } @@ -1232,7 +1227,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, * Add inode to the sb list if it's not already. It has I_NEW at this * point, so it should be safe to test i_sb_list locklessly. */ - if (list_empty(&inode->i_sb_list)) + if (list_empty(&inode->i_sb_list.list)) inode_sb_list_add(inode); unlock: spin_unlock(&inode_hash_lock); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 7974e91ffe13..15e3769e76f5 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -33,14 +33,15 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) * @sb: superblock being unmounted. * * Called during unmount with no locks held, so needs to be safe against - * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. + * concurrent modifiers. We temporarily drop sb->s_inodes list lock and CAN + * block. */ static void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *iput_inode = NULL; + DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + dlist_for_each_entry(inode, &iter, i_sb_list) { /* * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point @@ -68,7 +69,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + dlock_list_unlock(&iter); iput(iput_inode); @@ -80,9 +81,8 @@ static void fsnotify_unmount_inodes(struct super_block *sb) iput_inode = inode; cond_resched(); - spin_lock(&sb->s_inode_list_lock); + dlock_list_relock(&iter); } - spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 58b5de081b57..e873dcbe6feb 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1024,13 +1024,13 @@ static int dqinit_needed(struct inode *inode, int type) static int add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; + DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes); #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif int err = 0; - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + dlist_for_each_entry(inode, &iter, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || !atomic_read(&inode->i_writecount) || @@ -1040,7 +1040,7 @@ static int add_dquot_ref(struct super_block *sb, int type) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + dlock_list_unlock(&iter); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) @@ -1053,19 +1053,10 @@ static int add_dquot_ref(struct super_block *sb, int type) goto out; } - /* - * We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the - * s_inode_list_lock. We cannot iput the inode now as we can be - * holding the last reference and we cannot iput it under - * s_inode_list_lock. So we keep the reference and iput it - * later. - */ old_inode = inode; cond_resched(); - spin_lock(&sb->s_inode_list_lock); + dlock_list_relock(&iter); } - spin_unlock(&sb->s_inode_list_lock); iput(old_inode); out: #ifdef CONFIG_QUOTA_DEBUG @@ -1081,12 +1072,12 @@ static int add_dquot_ref(struct super_block *sb, int type) static void remove_dquot_ref(struct super_block *sb, int type) { struct inode *inode; + DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes); #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + dlist_for_each_entry(inode, &iter, i_sb_list) { /* * We have to scan also I_NEW inodes because they can already * have quota pointer initialized. Luckily, we need to touch @@ -1108,7 +1099,6 @@ static void remove_dquot_ref(struct super_block *sb, int type) } spin_unlock(&dq_data_lock); } - spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened after quota" diff --git a/fs/super.c b/fs/super.c index 076392396e72..61c19e3f06d8 100644 --- a/fs/super.c +++ b/fs/super.c @@ -303,6 +303,7 @@ static void destroy_unused_super(struct super_block *s) super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); + free_dlock_list_heads(&s->s_inodes); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); @@ -367,8 +368,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_roots); mutex_init(&s->s_sync_lock); - INIT_LIST_HEAD(&s->s_inodes); - spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); @@ -383,6 +382,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; + if (alloc_dlock_list_heads(&s->s_inodes)) + goto fail; + s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "sb-%s", type->name); if (!s->s_shrink) @@ -695,7 +697,7 @@ void generic_shutdown_super(struct super_block *sb) if (sop->put_super) sop->put_super(sb); - if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), + if (CHECK_DATA_CORRUPTION(!dlock_lists_empty(&sb->s_inodes), "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { /* @@ -704,14 +706,13 @@ void generic_shutdown_super(struct super_block *sb) * iput_final() or such crashes cleanly. */ struct inode *inode; + DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + dlist_for_each_entry(inode, &iter, i_sb_list) { inode->i_op = VFS_PTR_POISON; inode->i_sb = VFS_PTR_POISON; inode->i_mapping = VFS_PTR_POISON; } - spin_unlock(&sb->s_inode_list_lock); } } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..bb35591733f1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -43,6 +43,7 @@ #include <linux/cred.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> +#include <linux/dlock-list.h> #include <asm/byteorder.h> #include <uapi/linux/fs.h> @@ -702,7 +703,7 @@ struct inode { u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ - struct list_head i_sb_list; + struct dlock_list_node i_sb_list; struct list_head i_wb_list; /* backing dev writeback list */ union { struct hlist_head i_dentry; @@ -1315,9 +1316,8 @@ struct super_block { */ int s_stack_depth; - /* s_inode_list_lock protects s_inodes */ - spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; - struct list_head s_inodes; /* all inodes */ + /* The internal per-list locks protect s_inodes */ + struct dlock_list_heads s_inodes; /* all inodes */ spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ diff --git a/security/landlock/fs.c b/security/landlock/fs.c index bc7c126deea2..4269d9938c09 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -844,12 +844,12 @@ static void hook_inode_free_security(struct inode *const inode) static void hook_sb_delete(struct super_block *const sb) { struct inode *inode, *prev_inode = NULL; + DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes); if (!landlock_initialized) return; - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + dlist_for_each_entry(inode, &iter, i_sb_list) { struct landlock_object *object; /* Only handles referenced inodes. */ @@ -883,6 +883,7 @@ static void hook_sb_delete(struct super_block *const sb) /* Keeps a reference to this inode until the next loop walk. */ __iget(inode); spin_unlock(&inode->i_lock); + dlock_list_unlock(&iter); /* * If there is no concurrent release_inode() ongoing, then we @@ -917,25 +918,11 @@ static void hook_sb_delete(struct super_block *const sb) rcu_read_unlock(); } - if (prev_inode) { - /* - * At this point, we still own the __iget() reference - * that we just set in this loop walk. Therefore we - * can drop the list lock and know that the inode won't - * disappear from under us until the next loop walk. - */ - spin_unlock(&sb->s_inode_list_lock); - /* - * We can now actually put the inode reference from the - * previous loop walk, which is not needed anymore. - */ - iput(prev_inode); - cond_resched(); - spin_lock(&sb->s_inode_list_lock); - } + iput(prev_inode); prev_inode = inode; + cond_resched(); + dlock_list_relock(&iter); } - spin_unlock(&sb->s_inode_list_lock); /* Puts the inode reference from the last loop walk, if any. */ if (prev_inode)

[03/11] vfs: Use dlock list for superblock's inode list

Commit Message

Comments

Patch