[08/11] vfs: inode cache conversion to hash-bl

Message ID	20231206060629.2827226-9-david@fromorbit.com (mailing list archive)
State	Handled Elsewhere
Delegated to:	Paul Moore
Headers	show Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=fromorbit-com.20230601.gappssmtp.com header.i=@fromorbit-com.20230601.gappssmtp.com header.b="C70rExER" From: Dave Chinner <david@fromorbit.com> To: linux-fsdevel@vger.kernel.org Cc: linux-block@vger.kernel.org, linux-cachefs@redhat.com, dhowells@redhat.com, gfs2@lists.linux.dev, dm-devel@lists.linux.dev, linux-security-module@vger.kernel.org, selinux@vger.kernel.org, linux-kernel@vger.kernel.org Subject: [PATCH 08/11] vfs: inode cache conversion to hash-bl Date: Wed, 6 Dec 2023 17:05:37 +1100 Message-ID: <20231206060629.2827226-9-david@fromorbit.com> In-Reply-To: <20231206060629.2827226-1-david@fromorbit.com> References: <20231206060629.2827226-1-david@fromorbit.com> Precedence: bulk MIME-Version: 1.0 Content-Transfer-Encoding: 8bit
Series	vfs: inode cache scalability improvements \| expand [0/11] vfs: inode cache scalability improvements [01/11] lib/dlock-list: Distributed and lock-protected lists [02/11] vfs: Remove unnecessary list_for_each_entry_safe() variants [03/11] vfs: Use dlock list for superblock's inode list [04/11] lib/dlock-list: Make sibling CPUs share the same linked list [05/11] selinux: use dlist for isec inode list [06/11] vfs: factor out inode hash head calculation [07/11] hlist-bl: add hlist_bl_fake() [08/11] vfs: inode cache conversion to hash-bl [09/11] hash-bl: explicitly initialise hash-bl heads [10/11] list_bl: don't use bit locks for PREEMPT_RT or lockdep [11/11] hlist-bl: introduced nested locking for dm-snap

diff --git a/fs/inode.c b/fs/inode.c index fead81550cf4..3eb9c4e5b279 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -56,8 +56,7 @@ static unsigned int i_hash_mask __ro_after_init; static unsigned int i_hash_shift __ro_after_init; -static struct hlist_head *inode_hashtable __ro_after_init; -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); +static struct hlist_bl_head *inode_hashtable __ro_after_init; static unsigned long hash(struct super_block *sb, unsigned long hashval) { @@ -69,7 +68,7 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) return tmp & i_hash_mask; } -static inline struct hlist_head *i_hash_head(struct super_block *sb, +static inline struct hlist_bl_head *i_hash_head(struct super_block *sb, unsigned int hashval) { return inode_hashtable + hash(sb, hashval); @@ -434,7 +433,7 @@ EXPORT_SYMBOL(address_space_init_once); void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); - INIT_HLIST_NODE(&inode->i_hash); + INIT_HLIST_BL_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); @@ -518,6 +517,17 @@ static inline void inode_sb_list_del(struct inode *inode) dlock_lists_del(&inode->i_sb_list); } +/* + * Ensure that we store the hash head in the inode when we insert the inode into + * the hlist_bl_head... + */ +static inline void +__insert_inode_hash_head(struct inode *inode, struct hlist_bl_head *b) +{ + hlist_bl_add_head_rcu(&inode->i_hash, b); + inode->i_hash_head = b; +} + /** * __insert_inode_hash - hash an inode * @inode: unhashed inode @@ -528,13 +538,13 @@ static inline void inode_sb_list_del(struct inode *inode) */ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { - struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); + struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval); - spin_lock(&inode_hash_lock); + hlist_bl_lock(b); spin_lock(&inode->i_lock); - hlist_add_head_rcu(&inode->i_hash, b); + __insert_inode_hash_head(inode, b); spin_unlock(&inode->i_lock); - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); } EXPORT_SYMBOL(__insert_inode_hash); @@ -546,11 +556,44 @@ EXPORT_SYMBOL(__insert_inode_hash); */ void __remove_inode_hash(struct inode *inode) { - spin_lock(&inode_hash_lock); - spin_lock(&inode->i_lock); - hlist_del_init_rcu(&inode->i_hash); - spin_unlock(&inode->i_lock); - spin_unlock(&inode_hash_lock); + struct hlist_bl_head *b = inode->i_hash_head; + + /* + * There are some callers that come through here without synchronisation + * and potentially with multiple references to the inode. Hence we have + * to handle the case that we might race with a remove and insert to a + * different list. Coda, in particular, seems to have a userspace API + * that can directly trigger "unhash/rehash to different list" behaviour + * without any serialisation at all. + * + * Hence we have to handle the situation where the inode->i_hash_head + * might point to a different list than what we expect, indicating that + * we raced with another unhash and potentially a new insertion. This + * means we have to retest the head once we have everything locked up + * and loop again if it doesn't match. + */ + while (b) { + hlist_bl_lock(b); + spin_lock(&inode->i_lock); + if (b != inode->i_hash_head) { + hlist_bl_unlock(b); + b = inode->i_hash_head; + spin_unlock(&inode->i_lock); + continue; + } + /* + * Need to set the pprev pointer to NULL after list removal so + * that both RCU traversals and hlist_bl_unhashed() work + * correctly at this point. + */ + hlist_bl_del_rcu(&inode->i_hash); + inode->i_hash.pprev = NULL; + inode->i_hash_head = NULL; + spin_unlock(&inode->i_lock); + hlist_bl_unlock(b); + break; + } + } EXPORT_SYMBOL(__remove_inode_hash); @@ -886,26 +929,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } -static void __wait_on_freeing_inode(struct inode *inode); +static void __wait_on_freeing_inode(struct hlist_bl_head *b, + struct inode *inode); /* * Called with the inode lock held. */ static struct inode *find_inode(struct super_block *sb, - struct hlist_head *head, + struct hlist_bl_head *b, int (*test)(struct inode *, void *), void *data) { + struct hlist_bl_node *node; struct inode *inode = NULL; repeat: - hlist_for_each_entry(inode, head, i_hash) { + hlist_bl_for_each_entry(inode, node, b, i_hash) { if (inode->i_sb != sb) continue; if (!test(inode, data)) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { - __wait_on_freeing_inode(inode); + __wait_on_freeing_inode(b, inode); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { @@ -924,19 +969,20 @@ static struct inode *find_inode(struct super_block *sb, * iget_locked for details. */ static struct inode *find_inode_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino) + struct hlist_bl_head *b, unsigned long ino) { + struct hlist_bl_node *node; struct inode *inode = NULL; repeat: - hlist_for_each_entry(inode, head, i_hash) { + hlist_bl_for_each_entry(inode, node, b, i_hash) { if (inode->i_ino != ino) continue; if (inode->i_sb != sb) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { - __wait_on_freeing_inode(inode); + __wait_on_freeing_inode(b, inode); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { @@ -1186,25 +1232,25 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * - * Note both @test and @set are called with the inode_hash_lock held, so can't - * sleep. + * Note both @test and @set are called with the inode hash chain lock held, + * so can't sleep. */ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { - struct hlist_head *head = i_hash_head(inode->i_sb, hashval); + struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval); struct inode *old; again: - spin_lock(&inode_hash_lock); - old = find_inode(inode->i_sb, head, test, data); + hlist_bl_lock(b); + old = find_inode(inode->i_sb, b, test, data); if (unlikely(old)) { /* * Uhhuh, somebody else created the same inode under us. * Use the old inode instead of the preallocated one. */ - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); if (IS_ERR(old)) return NULL; wait_on_inode(old); @@ -1226,7 +1272,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, */ spin_lock(&inode->i_lock); inode->i_state |= I_NEW; - hlist_add_head_rcu(&inode->i_hash, head); + __insert_inode_hash_head(inode, b); spin_unlock(&inode->i_lock); /* @@ -1236,7 +1282,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, if (list_empty(&inode->i_sb_list.list)) inode_sb_list_add(inode); unlock: - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); return inode; } @@ -1297,12 +1343,12 @@ EXPORT_SYMBOL(iget5_locked); */ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { - struct hlist_head *head = i_hash_head(sb, ino); + struct hlist_bl_head *b = i_hash_head(sb, ino); struct inode *inode; again: - spin_lock(&inode_hash_lock); - inode = find_inode_fast(sb, head, ino); - spin_unlock(&inode_hash_lock); + hlist_bl_lock(b); + inode = find_inode_fast(sb, b, ino); + hlist_bl_unlock(b); if (inode) { if (IS_ERR(inode)) return NULL; @@ -1318,17 +1364,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) if (inode) { struct inode *old; - spin_lock(&inode_hash_lock); + hlist_bl_lock(b); /* We released the lock, so.. */ - old = find_inode_fast(sb, head, ino); + old = find_inode_fast(sb, b, ino); if (!old) { inode->i_ino = ino; spin_lock(&inode->i_lock); inode->i_state = I_NEW; - hlist_add_head_rcu(&inode->i_hash, head); + __insert_inode_hash_head(inode, b); spin_unlock(&inode->i_lock); inode_sb_list_add(inode); - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -1341,7 +1387,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) * us. Use the old inode instead of the one we just * allocated. */ - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); destroy_inode(inode); if (IS_ERR(old)) return NULL; @@ -1365,10 +1411,11 @@ EXPORT_SYMBOL(iget_locked); */ static int test_inode_iunique(struct super_block *sb, unsigned long ino) { - struct hlist_head *b = i_hash_head(sb, ino); + struct hlist_bl_head *b = i_hash_head(sb, ino); + struct hlist_bl_node *node; struct inode *inode; - hlist_for_each_entry_rcu(inode, b, i_hash) { + hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb) return 0; } @@ -1452,12 +1499,12 @@ EXPORT_SYMBOL(igrab); struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct hlist_head *head = i_hash_head(sb, hashval); + struct hlist_bl_head *b = i_hash_head(sb, hashval); struct inode *inode; - spin_lock(&inode_hash_lock); - inode = find_inode(sb, head, test, data); - spin_unlock(&inode_hash_lock); + hlist_bl_lock(b); + inode = find_inode(sb, b, test, data); + hlist_bl_unlock(b); return IS_ERR(inode) ? NULL : inode; } @@ -1507,12 +1554,12 @@ EXPORT_SYMBOL(ilookup5); */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { - struct hlist_head *head = i_hash_head(sb, ino); + struct hlist_bl_head *b = i_hash_head(sb, ino); struct inode *inode; again: - spin_lock(&inode_hash_lock); - inode = find_inode_fast(sb, head, ino); - spin_unlock(&inode_hash_lock); + hlist_bl_lock(b); + inode = find_inode_fast(sb, b, ino); + hlist_bl_unlock(b); if (inode) { if (IS_ERR(inode)) @@ -1556,12 +1603,13 @@ struct inode *find_inode_nowait(struct super_block *sb, void *), void *data) { - struct hlist_head *head = i_hash_head(sb, hashval); + struct hlist_bl_head *b = i_hash_head(sb, hashval); + struct hlist_bl_node *node; struct inode *inode, *ret_inode = NULL; int mval; - spin_lock(&inode_hash_lock); - hlist_for_each_entry(inode, head, i_hash) { + hlist_bl_lock(b); + hlist_bl_for_each_entry(inode, node, b, i_hash) { if (inode->i_sb != sb) continue; mval = match(inode, hashval, data); @@ -1572,7 +1620,7 @@ struct inode *find_inode_nowait(struct super_block *sb, goto out; } out: - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); return ret_inode; } EXPORT_SYMBOL(find_inode_nowait); @@ -1601,13 +1649,14 @@ EXPORT_SYMBOL(find_inode_nowait); struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct hlist_head *head = i_hash_head(sb, hashval); + struct hlist_bl_head *b = i_hash_head(sb, hashval); + struct hlist_bl_node *node; struct inode *inode; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious find_inode_rcu() usage"); - hlist_for_each_entry_rcu(inode, head, i_hash) { + hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { if (inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && test(inode, data)) @@ -1639,13 +1688,14 @@ EXPORT_SYMBOL(find_inode_rcu); struct inode *find_inode_by_ino_rcu(struct super_block *sb, unsigned long ino) { - struct hlist_head *head = i_hash_head(sb, ino); + struct hlist_bl_head *b = i_hash_head(sb, ino); + struct hlist_bl_node *node; struct inode *inode; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious find_inode_by_ino_rcu() usage"); - hlist_for_each_entry_rcu(inode, head, i_hash) { + hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) @@ -1659,39 +1709,42 @@ int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; - struct hlist_head *head = i_hash_head(sb, ino); + struct hlist_bl_head *b = i_hash_head(sb, ino); while (1) { - struct inode *old = NULL; - spin_lock(&inode_hash_lock); - hlist_for_each_entry(old, head, i_hash) { - if (old->i_ino != ino) + struct hlist_bl_node *node; + struct inode *old = NULL, *t; + + hlist_bl_lock(b); + hlist_bl_for_each_entry(t, node, b, i_hash) { + if (t->i_ino != ino) continue; - if (old->i_sb != sb) + if (t->i_sb != sb) continue; - spin_lock(&old->i_lock); - if (old->i_state & (I_FREEING|I_WILL_FREE)) { - spin_unlock(&old->i_lock); + spin_lock(&t->i_lock); + if (t->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&t->i_lock); continue; } + old = t; break; } if (likely(!old)) { spin_lock(&inode->i_lock); inode->i_state |= I_NEW | I_CREATING; - hlist_add_head_rcu(&inode->i_hash, head); + __insert_inode_hash_head(inode, b); spin_unlock(&inode->i_lock); - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); return 0; } if (unlikely(old->i_state & I_CREATING)) { spin_unlock(&old->i_lock); - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); return -EBUSY; } __iget(old); spin_unlock(&old->i_lock); - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); @@ -2271,17 +2324,18 @@ EXPORT_SYMBOL(inode_needs_sync); * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. */ -static void __wait_on_freeing_inode(struct inode *inode) +static void __wait_on_freeing_inode(struct hlist_bl_head *b, + struct inode *inode) { wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); schedule(); finish_wait(wq, &wait.wq_entry); - spin_lock(&inode_hash_lock); + hlist_bl_lock(b); } static __initdata unsigned long ihash_entries; @@ -2307,7 +2361,7 @@ void __init inode_init_early(void) inode_hashtable = alloc_large_system_hash("Inode-cache", - sizeof(struct hlist_head), + sizeof(struct hlist_bl_head), ihash_entries, 14, HASH_EARLY | HASH_ZERO, @@ -2333,7 +2387,7 @@ void __init inode_init(void) inode_hashtable = alloc_large_system_hash("Inode-cache", - sizeof(struct hlist_head), + sizeof(struct hlist_bl_head), ihash_entries, 14, HASH_ZERO, diff --git a/include/linux/fs.h b/include/linux/fs.h index bb35591733f1..0ef1b72340c7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -692,7 +692,8 @@ struct inode { unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; - struct hlist_node i_hash; + struct hlist_bl_node i_hash; + struct hlist_bl_head *i_hash_head; struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ @@ -758,7 +759,7 @@ static inline unsigned int i_blocksize(const struct inode *node) static inline int inode_unhashed(struct inode *inode) { - return hlist_unhashed(&inode->i_hash); + return hlist_bl_unhashed(&inode->i_hash); } /* @@ -769,7 +770,7 @@ static inline int inode_unhashed(struct inode *inode) */ static inline void inode_fake_hash(struct inode *inode) { - hlist_add_fake(&inode->i_hash); + hlist_bl_add_fake(&inode->i_hash); } /* @@ -2946,7 +2947,7 @@ static inline void insert_inode_hash(struct inode *inode) extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { - if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) + if (!inode_unhashed(inode) && !hlist_bl_fake(&inode->i_hash)) __remove_inode_hash(inode); }

[08/11] vfs: inode cache conversion to hash-bl

Commit Message

Comments

Patch