diff mbox series

[3/3] vfs: inode cache conversion to hash-bl

Message ID 20210406123343.1739669-4-david@fromorbit.com (mailing list archive)
State New, archived
Headers show
Series vfs: convert inode cache to hlist-bl | expand

Commit Message

Dave Chinner April 6, 2021, 12:33 p.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Because scalability of the global inode_hash_lock really, really
sucks and prevents me from doing scalability characterisation and
analysis of bcachefs algorithms.

Profiles of a 32-way concurrent create of 51.2m inodes with fsmark
on a couple of different filesystems on a 5.10 kernel:

-   52.13%     0.04%  [kernel]            [k] ext4_create
   - 52.09% ext4_create
      - 41.03% __ext4_new_inode
         - 29.92% insert_inode_locked
            - 25.35% _raw_spin_lock
               - do_raw_spin_lock
                  - 24.97% __pv_queued_spin_lock_slowpath


-   72.33%     0.02%  [kernel]            [k] do_filp_open
   - 72.31% do_filp_open
      - 72.28% path_openat
         - 57.03% bch2_create
            - 56.46% __bch2_create
               - 40.43% inode_insert5
                  - 36.07% _raw_spin_lock
                     - do_raw_spin_lock
                          35.86% __pv_queued_spin_lock_slowpath
                    4.02% find_inode

btrfs was tested but it is limited by internal lock contention at
>=2 threads on this workload, so never hammers the inode cache lock
hard enough for this change to matter to it's performance.

However, both bcachefs and ext4 demonstrate poor scaling at >=8
threads on concurrent lookup or create workloads.

Hence convert the inode hash table to a RCU-aware hash-bl table just
like the dentry cache. Note that we need to store a pointer to the
hlist_bl_head the inode has been added to in the inode so that when
it comes to unhash the inode we know what list to lock. We need to
do this because, unlike the dentry cache, the hash value that is
used to hash the inode is not generated from the inode itself. i.e.
filesystems can provide this themselves so we have to either store
the hashval or the hlist head pointer in the inode to be able to
find the right list head for removal...

Concurrent create with variying thread count (files/s):

		vanilla			Patched
threads		ext4	  bcachefs	ext4	  bcachefs
2		117k			112k	   85k
4		185k			190k	  145k
8		303k	  185k		346k	  255k
16		389k	  190k		465k	  420k
32		360k	  142k		437k	  481k

		ext4			bcachefs
threads		vanilla	 patched	vanilla	patched
2		117k	 112k		 80k	 85k
4		185k	 190k		133k	145k
8		303k	 346k		185k	255k
16		389k	 465k		190k	420k
32		360k	 437k		142k	481k

CPU usage for both bcachefs and ext4 at 16 and 32 threads has been
halved on the patched kernel, while performance has increased
marginally on ext4 and massively on bcachefs. Internal filesystem
algorithms now limit performance on these workloads, not the global
inode_hash_lock.

Profile of the workloads on the patched kernels:

-   35.94%     0.07%  [kernel]                  [k] ext4_create
   - 35.87% ext4_create
      - 20.45% __ext4_new_inode
...
           3.36% insert_inode_locked

   - 78.43% do_filp_open
      - 78.36% path_openat
         - 53.95% bch2_create
            - 47.99% __bch2_create
....
              - 7.57% inode_insert5
                    6.94% find_inode

Spinlock contention is largely gone from the inode hash operations
and the filesystems are limited by contention in their internal
algorithms.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/inode.c         | 200 ++++++++++++++++++++++++++++-----------------
 include/linux/fs.h |   9 +-
 2 files changed, 132 insertions(+), 77 deletions(-)

Comments

Matthew Wilcox April 6, 2021, 1:28 p.m. UTC | #1
On Tue, Apr 06, 2021 at 10:33:43PM +1000, Dave Chinner wrote:
> +++ b/fs/inode.c
> @@ -57,8 +57,7 @@
>  
>  static unsigned int i_hash_mask __read_mostly;
>  static unsigned int i_hash_shift __read_mostly;
> -static struct hlist_head *inode_hashtable __read_mostly;
> -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
> +static struct hlist_bl_head *inode_hashtable __read_mostly;

I'm a little concerned that we're losing a lockdep map here.  

Nobody seems to have done this for list_bl yet, and I'd be reluctant
to gate your patch on "Hey, Dave, solve this problem nobody else has
done yet".

But maybe we can figure out how to do this?  I think we want one lockdep
map for the entire hashtable (it's split for performance, not because
they're logically separate locks).  So something like ...

static struct lockdep_map inode_hash_map =
	STATIC_LOCKDEP_MAP_INIT("inode_hash", &inode_hash_map);

and add:

static inline void hlist_bl_lock_map(struct hlist_bl_head *b,
					struct lockdep_map *map)
{
	bit_spin_lock(0, (unsigned long *)b);
	spin_acquire(map, 0, 0, _RET_IP_);
}

then use hlist_bl_lock_map() throughout.  Would that work?
Adding lockdep experts for opinions.

>  static unsigned long hash(struct super_block *sb, unsigned long hashval)
>  {
> @@ -70,7 +69,7 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval)
>  	return tmp & i_hash_mask;
>  }
>  
> -static inline struct hlist_head *i_hash_head(struct super_block *sb,
> +static inline struct hlist_bl_head *i_hash_head(struct super_block *sb,
>  		unsigned int hashval)
>  {
>  	return inode_hashtable + hash(sb, hashval);
> @@ -407,7 +406,7 @@ EXPORT_SYMBOL(address_space_init_once);
>  void inode_init_once(struct inode *inode)
>  {
>  	memset(inode, 0, sizeof(*inode));
> -	INIT_HLIST_NODE(&inode->i_hash);
> +	INIT_HLIST_BL_NODE(&inode->i_hash);
>  	INIT_LIST_HEAD(&inode->i_devices);
>  	INIT_LIST_HEAD(&inode->i_io_list);
>  	INIT_LIST_HEAD(&inode->i_wb_list);
> @@ -491,6 +490,17 @@ static inline void inode_sb_list_del(struct inode *inode)
>  	}
>  }
>  
> +/*
> + * Ensure that we store the hash head in the inode when we insert the inode into
> + * the hlist_bl_head...
> + */
> +static inline void
> +__insert_inode_hash_head(struct inode *inode, struct hlist_bl_head *b)
> +{
> +	hlist_bl_add_head_rcu(&inode->i_hash, b);
> +	inode->i_hash_head = b;
> +}
> +
>  /**
>   *	__insert_inode_hash - hash an inode
>   *	@inode: unhashed inode
> @@ -501,13 +511,13 @@ static inline void inode_sb_list_del(struct inode *inode)
>   */
>  void __insert_inode_hash(struct inode *inode, unsigned long hashval)
>  {
> -	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval);
>  
> -	spin_lock(&inode_hash_lock);
> +	hlist_bl_lock(b);
>  	spin_lock(&inode->i_lock);
> -	hlist_add_head_rcu(&inode->i_hash, b);
> +	__insert_inode_hash_head(inode, b);
>  	spin_unlock(&inode->i_lock);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_unlock(b);
>  }
>  EXPORT_SYMBOL(__insert_inode_hash);
>  
> @@ -519,11 +529,44 @@ EXPORT_SYMBOL(__insert_inode_hash);
>   */
>  void __remove_inode_hash(struct inode *inode)
>  {
> -	spin_lock(&inode_hash_lock);
> -	spin_lock(&inode->i_lock);
> -	hlist_del_init_rcu(&inode->i_hash);
> -	spin_unlock(&inode->i_lock);
> -	spin_unlock(&inode_hash_lock);
> +	struct hlist_bl_head *b = inode->i_hash_head;
> +
> +	/*
> +	 * There are some callers that come through here without synchronisation
> +	 * and potentially with multiple references to the inode. Hence we have
> +	 * to handle the case that we might race with a remove and insert to a
> +	 * different list. Coda, in particular, seems to have a userspace API
> +	 * that can directly trigger "unhash/rehash to different list" behaviour
> +	 * without any serialisation at all.
> +	 *
> +	 * Hence we have to handle the situation where the inode->i_hash_head
> +	 * might point to a different list than what we expect, indicating that
> +	 * we raced with another unhash and potentially a new insertion. This
> +	 * means we have to retest the head once we have everything locked up
> +	 * and loop again if it doesn't match.
> +	 */
> +	while (b) {
> +		hlist_bl_lock(b);
> +		spin_lock(&inode->i_lock);
> +		if (b != inode->i_hash_head) {
> +			hlist_bl_unlock(b);
> +			b = inode->i_hash_head;
> +			spin_unlock(&inode->i_lock);
> +			continue;
> +		}
> +		/*
> +		 * Need to set the pprev pointer to NULL after list removal so
> +		 * that both RCU traversals and hlist_bl_unhashed() work
> +		 * correctly at this point.
> +		 */
> +		hlist_bl_del_rcu(&inode->i_hash);
> +		inode->i_hash.pprev = NULL;
> +		inode->i_hash_head = NULL;
> +		spin_unlock(&inode->i_lock);
> +		hlist_bl_unlock(b);
> +		break;
> +	}
> +
>  }
>  EXPORT_SYMBOL(__remove_inode_hash);
>  
> @@ -813,26 +856,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
>  	return freed;
>  }
>  
> -static void __wait_on_freeing_inode(struct inode *inode);
> +static void __wait_on_freeing_inode(struct hlist_bl_head *b,
> +				struct inode *inode);
>  /*
>   * Called with the inode lock held.
>   */
>  static struct inode *find_inode(struct super_block *sb,
> -				struct hlist_head *head,
> +				struct hlist_bl_head *b,
>  				int (*test)(struct inode *, void *),
>  				void *data)
>  {
> +	struct hlist_bl_node *node;
>  	struct inode *inode = NULL;
>  
>  repeat:
> -	hlist_for_each_entry(inode, head, i_hash) {
> +	hlist_bl_for_each_entry(inode, node, b, i_hash) {
>  		if (inode->i_sb != sb)
>  			continue;
>  		if (!test(inode, data))
>  			continue;
>  		spin_lock(&inode->i_lock);
>  		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
> -			__wait_on_freeing_inode(inode);
> +			__wait_on_freeing_inode(b, inode);
>  			goto repeat;
>  		}
>  		if (unlikely(inode->i_state & I_CREATING)) {
> @@ -851,19 +896,20 @@ static struct inode *find_inode(struct super_block *sb,
>   * iget_locked for details.
>   */
>  static struct inode *find_inode_fast(struct super_block *sb,
> -				struct hlist_head *head, unsigned long ino)
> +				struct hlist_bl_head *b, unsigned long ino)
>  {
> +	struct hlist_bl_node *node;
>  	struct inode *inode = NULL;
>  
>  repeat:
> -	hlist_for_each_entry(inode, head, i_hash) {
> +	hlist_bl_for_each_entry(inode, node, b, i_hash) {
>  		if (inode->i_ino != ino)
>  			continue;
>  		if (inode->i_sb != sb)
>  			continue;
>  		spin_lock(&inode->i_lock);
>  		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
> -			__wait_on_freeing_inode(inode);
> +			__wait_on_freeing_inode(b, inode);
>  			goto repeat;
>  		}
>  		if (unlikely(inode->i_state & I_CREATING)) {
> @@ -1072,26 +1118,26 @@ EXPORT_SYMBOL(unlock_two_nondirectories);
>   * return it locked, hashed, and with the I_NEW flag set. The file system gets
>   * to fill it in before unlocking it via unlock_new_inode().
>   *
> - * Note both @test and @set are called with the inode_hash_lock held, so can't
> - * sleep.
> + * Note both @test and @set are called with the inode hash chain lock held,
> + * so can't sleep.
>   */
>  struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
>  			    int (*test)(struct inode *, void *),
>  			    int (*set)(struct inode *, void *), void *data)
>  {
> -	struct hlist_head *head = i_hash_head(inode->i_sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval);
>  	struct inode *old;
>  	bool creating = inode->i_state & I_CREATING;
>  
>  again:
> -	spin_lock(&inode_hash_lock);
> -	old = find_inode(inode->i_sb, head, test, data);
> +	hlist_bl_lock(b);
> +	old = find_inode(inode->i_sb, b, test, data);
>  	if (unlikely(old)) {
>  		/*
>  		 * Uhhuh, somebody else created the same inode under us.
>  		 * Use the old inode instead of the preallocated one.
>  		 */
> -		spin_unlock(&inode_hash_lock);
> +		hlist_bl_unlock(b);
>  		if (IS_ERR(old))
>  			return NULL;
>  		wait_on_inode(old);
> @@ -1113,12 +1159,12 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
>  	 */
>  	spin_lock(&inode->i_lock);
>  	inode->i_state |= I_NEW;
> -	hlist_add_head_rcu(&inode->i_hash, head);
> +	__insert_inode_hash_head(inode, b);
>  	spin_unlock(&inode->i_lock);
>  	if (!creating)
>  		inode_sb_list_add(inode);
>  unlock:
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_unlock(b);
>  
>  	return inode;
>  }
> @@ -1179,12 +1225,12 @@ EXPORT_SYMBOL(iget5_locked);
>   */
>  struct inode *iget_locked(struct super_block *sb, unsigned long ino)
>  {
> -	struct hlist_head *head = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
>  	struct inode *inode;
>  again:
> -	spin_lock(&inode_hash_lock);
> -	inode = find_inode_fast(sb, head, ino);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_lock(b);
> +	inode = find_inode_fast(sb, b, ino);
> +	hlist_bl_unlock(b);
>  	if (inode) {
>  		if (IS_ERR(inode))
>  			return NULL;
> @@ -1200,17 +1246,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
>  	if (inode) {
>  		struct inode *old;
>  
> -		spin_lock(&inode_hash_lock);
> +		hlist_bl_lock(b);
>  		/* We released the lock, so.. */
> -		old = find_inode_fast(sb, head, ino);
> +		old = find_inode_fast(sb, b, ino);
>  		if (!old) {
>  			inode->i_ino = ino;
>  			spin_lock(&inode->i_lock);
>  			inode->i_state = I_NEW;
> -			hlist_add_head_rcu(&inode->i_hash, head);
> +			__insert_inode_hash_head(inode, b);
>  			spin_unlock(&inode->i_lock);
>  			inode_sb_list_add(inode);
> -			spin_unlock(&inode_hash_lock);
> +			hlist_bl_unlock(b);
>  
>  			/* Return the locked inode with I_NEW set, the
>  			 * caller is responsible for filling in the contents
> @@ -1223,7 +1269,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
>  		 * us. Use the old inode instead of the one we just
>  		 * allocated.
>  		 */
> -		spin_unlock(&inode_hash_lock);
> +		hlist_bl_unlock(b);
>  		destroy_inode(inode);
>  		if (IS_ERR(old))
>  			return NULL;
> @@ -1247,10 +1293,11 @@ EXPORT_SYMBOL(iget_locked);
>   */
>  static int test_inode_iunique(struct super_block *sb, unsigned long ino)
>  {
> -	struct hlist_head *b = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
> +	struct hlist_bl_node *node;
>  	struct inode *inode;
>  
> -	hlist_for_each_entry_rcu(inode, b, i_hash) {
> +	hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
>  		if (inode->i_ino == ino && inode->i_sb == sb)
>  			return 0;
>  	}
> @@ -1334,12 +1381,12 @@ EXPORT_SYMBOL(igrab);
>  struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
>  		int (*test)(struct inode *, void *), void *data)
>  {
> -	struct hlist_head *head = i_hash_head(sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(sb, hashval);
>  	struct inode *inode;
>  
> -	spin_lock(&inode_hash_lock);
> -	inode = find_inode(sb, head, test, data);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_lock(b);
> +	inode = find_inode(sb, b, test, data);
> +	hlist_bl_unlock(b);
>  
>  	return IS_ERR(inode) ? NULL : inode;
>  }
> @@ -1389,12 +1436,12 @@ EXPORT_SYMBOL(ilookup5);
>   */
>  struct inode *ilookup(struct super_block *sb, unsigned long ino)
>  {
> -	struct hlist_head *head = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
>  	struct inode *inode;
>  again:
> -	spin_lock(&inode_hash_lock);
> -	inode = find_inode_fast(sb, head, ino);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_lock(b);
> +	inode = find_inode_fast(sb, b, ino);
> +	hlist_bl_unlock(b);
>  
>  	if (inode) {
>  		if (IS_ERR(inode))
> @@ -1438,12 +1485,13 @@ struct inode *find_inode_nowait(struct super_block *sb,
>  					     void *),
>  				void *data)
>  {
> -	struct hlist_head *head = i_hash_head(sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(sb, hashval);
> +	struct hlist_bl_node *node;
>  	struct inode *inode, *ret_inode = NULL;
>  	int mval;
>  
> -	spin_lock(&inode_hash_lock);
> -	hlist_for_each_entry(inode, head, i_hash) {
> +	hlist_bl_lock(b);
> +	hlist_bl_for_each_entry(inode, node, b, i_hash) {
>  		if (inode->i_sb != sb)
>  			continue;
>  		mval = match(inode, hashval, data);
> @@ -1454,7 +1502,7 @@ struct inode *find_inode_nowait(struct super_block *sb,
>  		goto out;
>  	}
>  out:
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_unlock(b);
>  	return ret_inode;
>  }
>  EXPORT_SYMBOL(find_inode_nowait);
> @@ -1483,13 +1531,14 @@ EXPORT_SYMBOL(find_inode_nowait);
>  struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
>  			     int (*test)(struct inode *, void *), void *data)
>  {
> -	struct hlist_head *head = i_hash_head(sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(sb, hashval);
> +	struct hlist_bl_node *node;
>  	struct inode *inode;
>  
>  	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
>  			 "suspicious find_inode_rcu() usage");
>  
> -	hlist_for_each_entry_rcu(inode, head, i_hash) {
> +	hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
>  		if (inode->i_sb == sb &&
>  		    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) &&
>  		    test(inode, data))
> @@ -1521,13 +1570,14 @@ EXPORT_SYMBOL(find_inode_rcu);
>  struct inode *find_inode_by_ino_rcu(struct super_block *sb,
>  				    unsigned long ino)
>  {
> -	struct hlist_head *head = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
> +	struct hlist_bl_node *node;
>  	struct inode *inode;
>  
>  	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
>  			 "suspicious find_inode_by_ino_rcu() usage");
>  
> -	hlist_for_each_entry_rcu(inode, head, i_hash) {
> +	hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
>  		if (inode->i_ino == ino &&
>  		    inode->i_sb == sb &&
>  		    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)))
> @@ -1541,39 +1591,42 @@ int insert_inode_locked(struct inode *inode)
>  {
>  	struct super_block *sb = inode->i_sb;
>  	ino_t ino = inode->i_ino;
> -	struct hlist_head *head = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
>  
>  	while (1) {
> -		struct inode *old = NULL;
> -		spin_lock(&inode_hash_lock);
> -		hlist_for_each_entry(old, head, i_hash) {
> -			if (old->i_ino != ino)
> +		struct hlist_bl_node *node;
> +		struct inode *old = NULL, *t;
> +
> +		hlist_bl_lock(b);
> +		hlist_bl_for_each_entry(t, node, b, i_hash) {
> +			if (t->i_ino != ino)
>  				continue;
> -			if (old->i_sb != sb)
> +			if (t->i_sb != sb)
>  				continue;
> -			spin_lock(&old->i_lock);
> -			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
> -				spin_unlock(&old->i_lock);
> +			spin_lock(&t->i_lock);
> +			if (t->i_state & (I_FREEING|I_WILL_FREE)) {
> +				spin_unlock(&t->i_lock);
>  				continue;
>  			}
> +			old = t;
>  			break;
>  		}
>  		if (likely(!old)) {
>  			spin_lock(&inode->i_lock);
>  			inode->i_state |= I_NEW | I_CREATING;
> -			hlist_add_head_rcu(&inode->i_hash, head);
> +			__insert_inode_hash_head(inode, b);
>  			spin_unlock(&inode->i_lock);
> -			spin_unlock(&inode_hash_lock);
> +			hlist_bl_unlock(b);
>  			return 0;
>  		}
>  		if (unlikely(old->i_state & I_CREATING)) {
>  			spin_unlock(&old->i_lock);
> -			spin_unlock(&inode_hash_lock);
> +			hlist_bl_unlock(b);
>  			return -EBUSY;
>  		}
>  		__iget(old);
>  		spin_unlock(&old->i_lock);
> -		spin_unlock(&inode_hash_lock);
> +		hlist_bl_unlock(b);
>  		wait_on_inode(old);
>  		if (unlikely(!inode_unhashed(old))) {
>  			iput(old);
> @@ -2046,17 +2099,18 @@ EXPORT_SYMBOL(inode_needs_sync);
>   * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
>   * will DTRT.
>   */
> -static void __wait_on_freeing_inode(struct inode *inode)
> +static void __wait_on_freeing_inode(struct hlist_bl_head *b,
> +				struct inode *inode)
>  {
>  	wait_queue_head_t *wq;
>  	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
>  	wq = bit_waitqueue(&inode->i_state, __I_NEW);
>  	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
>  	spin_unlock(&inode->i_lock);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_unlock(b);
>  	schedule();
>  	finish_wait(wq, &wait.wq_entry);
> -	spin_lock(&inode_hash_lock);
> +	hlist_bl_lock(b);
>  }
>  
>  static __initdata unsigned long ihash_entries;
> @@ -2082,7 +2136,7 @@ void __init inode_init_early(void)
>  
>  	inode_hashtable =
>  		alloc_large_system_hash("Inode-cache",
> -					sizeof(struct hlist_head),
> +					sizeof(struct hlist_bl_head),
>  					ihash_entries,
>  					14,
>  					HASH_EARLY | HASH_ZERO,
> @@ -2108,7 +2162,7 @@ void __init inode_init(void)
>  
>  	inode_hashtable =
>  		alloc_large_system_hash("Inode-cache",
> -					sizeof(struct hlist_head),
> +					sizeof(struct hlist_bl_head),
>  					ihash_entries,
>  					14,
>  					HASH_ZERO,
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index ec8f3ddf4a6a..89c9e49a71a6 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -664,7 +664,8 @@ struct inode {
>  	unsigned long		dirtied_when;	/* jiffies of first dirtying */
>  	unsigned long		dirtied_time_when;
>  
> -	struct hlist_node	i_hash;
> +	struct hlist_bl_node	i_hash;
> +	struct hlist_bl_head	*i_hash_head;
>  	struct list_head	i_io_list;	/* backing dev IO list */
>  #ifdef CONFIG_CGROUP_WRITEBACK
>  	struct bdi_writeback	*i_wb;		/* the associated cgroup wb */
> @@ -730,7 +731,7 @@ static inline unsigned int i_blocksize(const struct inode *node)
>  
>  static inline int inode_unhashed(struct inode *inode)
>  {
> -	return hlist_unhashed(&inode->i_hash);
> +	return hlist_bl_unhashed(&inode->i_hash);
>  }
>  
>  /*
> @@ -741,7 +742,7 @@ static inline int inode_unhashed(struct inode *inode)
>   */
>  static inline void inode_fake_hash(struct inode *inode)
>  {
> -	hlist_add_fake(&inode->i_hash);
> +	hlist_bl_add_fake(&inode->i_hash);
>  }
>  
>  /*
> @@ -3065,7 +3066,7 @@ static inline void insert_inode_hash(struct inode *inode)
>  extern void __remove_inode_hash(struct inode *);
>  static inline void remove_inode_hash(struct inode *inode)
>  {
> -	if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash))
> +	if (!inode_unhashed(inode) && !hlist_bl_fake(&inode->i_hash))
>  		__remove_inode_hash(inode);
>  }
>  
> -- 
> 2.31.0
>
Dave Chinner April 6, 2021, 9:22 p.m. UTC | #2
On Tue, Apr 06, 2021 at 02:28:34PM +0100, Matthew Wilcox wrote:
> On Tue, Apr 06, 2021 at 10:33:43PM +1000, Dave Chinner wrote:
> > +++ b/fs/inode.c
> > @@ -57,8 +57,7 @@
> >  
> >  static unsigned int i_hash_mask __read_mostly;
> >  static unsigned int i_hash_shift __read_mostly;
> > -static struct hlist_head *inode_hashtable __read_mostly;
> > -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
> > +static struct hlist_bl_head *inode_hashtable __read_mostly;
> 
> I'm a little concerned that we're losing a lockdep map here.  
> 
> Nobody seems to have done this for list_bl yet, and I'd be reluctant
> to gate your patch on "Hey, Dave, solve this problem nobody else has
> done yet".

I really don't care about lockdep. Adding lockdep support to
hlist_bl is somebody else's problem - I'm just using infrastructure
that already exists. Also, the dentry cache usage of hlist_bl is
vastly more complex and so if lockdep coverage was really necessary,
it would have already been done....

And, FWIW, I'm also aware of the problems that RT kernels have with
the use of bit spinlocks and being unable to turn them into sleeping
mutexes by preprocessor magic. I don't care about that either,
because dentry cache...

> But maybe we can figure out how to do this?  I think we want one lockdep
> map for the entire hashtable (it's split for performance, not because
> they're logically separate locks).  So something like ...
> 
> static struct lockdep_map inode_hash_map =
> 	STATIC_LOCKDEP_MAP_INIT("inode_hash", &inode_hash_map);
> 
> and add:
> 
> static inline void hlist_bl_lock_map(struct hlist_bl_head *b,
> 					struct lockdep_map *map)
> {
> 	bit_spin_lock(0, (unsigned long *)b);
> 	spin_acquire(map, 0, 0, _RET_IP_);
> }
> 
> then use hlist_bl_lock_map() throughout.  Would that work?
> Adding lockdep experts for opinions.

Maybe, but it's kinda messy to have to carry the lockdep map around
externally to the structure. Not to mention it won't support holding
multiple hash chains locked at once if that is ever needed (e.g.
rehashing an object with a different hashval)...

Cheers,

Dave.
Kent Overstreet April 6, 2021, 10:16 p.m. UTC | #3
On Tue, Apr 06, 2021 at 10:33:43PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Because scalability of the global inode_hash_lock really, really
> sucks and prevents me from doing scalability characterisation and
> analysis of bcachefs algorithms.
> 
> Profiles of a 32-way concurrent create of 51.2m inodes with fsmark
> on a couple of different filesystems on a 5.10 kernel:
> 
> -   52.13%     0.04%  [kernel]            [k] ext4_create
>    - 52.09% ext4_create
>       - 41.03% __ext4_new_inode
>          - 29.92% insert_inode_locked
>             - 25.35% _raw_spin_lock
>                - do_raw_spin_lock
>                   - 24.97% __pv_queued_spin_lock_slowpath
> 
> 
> -   72.33%     0.02%  [kernel]            [k] do_filp_open
>    - 72.31% do_filp_open
>       - 72.28% path_openat
>          - 57.03% bch2_create
>             - 56.46% __bch2_create
>                - 40.43% inode_insert5
>                   - 36.07% _raw_spin_lock
>                      - do_raw_spin_lock
>                           35.86% __pv_queued_spin_lock_slowpath
>                     4.02% find_inode
> 
> btrfs was tested but it is limited by internal lock contention at
> >=2 threads on this workload, so never hammers the inode cache lock
> hard enough for this change to matter to it's performance.
> 
> However, both bcachefs and ext4 demonstrate poor scaling at >=8
> threads on concurrent lookup or create workloads.
> 
> Hence convert the inode hash table to a RCU-aware hash-bl table just
> like the dentry cache. Note that we need to store a pointer to the
> hlist_bl_head the inode has been added to in the inode so that when
> it comes to unhash the inode we know what list to lock. We need to
> do this because, unlike the dentry cache, the hash value that is
> used to hash the inode is not generated from the inode itself. i.e.
> filesystems can provide this themselves so we have to either store
> the hashval or the hlist head pointer in the inode to be able to
> find the right list head for removal...
> 
> Concurrent create with variying thread count (files/s):
> 
> 		vanilla			Patched
> threads		ext4	  bcachefs	ext4	  bcachefs
> 2		117k			112k	   85k
> 4		185k			190k	  145k
> 8		303k	  185k		346k	  255k
> 16		389k	  190k		465k	  420k
> 32		360k	  142k		437k	  481k
> 
> 		ext4			bcachefs
> threads		vanilla	 patched	vanilla	patched
> 2		117k	 112k		 80k	 85k
> 4		185k	 190k		133k	145k
> 8		303k	 346k		185k	255k
> 16		389k	 465k		190k	420k
> 32		360k	 437k		142k	481k
> 
> CPU usage for both bcachefs and ext4 at 16 and 32 threads has been
> halved on the patched kernel, while performance has increased
> marginally on ext4 and massively on bcachefs. Internal filesystem
> algorithms now limit performance on these workloads, not the global
> inode_hash_lock.
> 
> Profile of the workloads on the patched kernels:
> 
> -   35.94%     0.07%  [kernel]                  [k] ext4_create
>    - 35.87% ext4_create
>       - 20.45% __ext4_new_inode
> ...
>            3.36% insert_inode_locked
> 
>    - 78.43% do_filp_open
>       - 78.36% path_openat
>          - 53.95% bch2_create
>             - 47.99% __bch2_create
> ....
>               - 7.57% inode_insert5
>                     6.94% find_inode
> 
> Spinlock contention is largely gone from the inode hash operations
> and the filesystems are limited by contention in their internal
> algorithms.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

Reviewed-and-tested-by: Kent Overstreet <kent.overstreet@gmail.com>

> ---
>  fs/inode.c         | 200 ++++++++++++++++++++++++++++-----------------
>  include/linux/fs.h |   9 +-
>  2 files changed, 132 insertions(+), 77 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index b8d9eb3454dc..867af386177b 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -57,8 +57,7 @@
>  
>  static unsigned int i_hash_mask __read_mostly;
>  static unsigned int i_hash_shift __read_mostly;
> -static struct hlist_head *inode_hashtable __read_mostly;
> -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
> +static struct hlist_bl_head *inode_hashtable __read_mostly;
>  
>  static unsigned long hash(struct super_block *sb, unsigned long hashval)
>  {
> @@ -70,7 +69,7 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval)
>  	return tmp & i_hash_mask;
>  }
>  
> -static inline struct hlist_head *i_hash_head(struct super_block *sb,
> +static inline struct hlist_bl_head *i_hash_head(struct super_block *sb,
>  		unsigned int hashval)
>  {
>  	return inode_hashtable + hash(sb, hashval);
> @@ -407,7 +406,7 @@ EXPORT_SYMBOL(address_space_init_once);
>  void inode_init_once(struct inode *inode)
>  {
>  	memset(inode, 0, sizeof(*inode));
> -	INIT_HLIST_NODE(&inode->i_hash);
> +	INIT_HLIST_BL_NODE(&inode->i_hash);
>  	INIT_LIST_HEAD(&inode->i_devices);
>  	INIT_LIST_HEAD(&inode->i_io_list);
>  	INIT_LIST_HEAD(&inode->i_wb_list);
> @@ -491,6 +490,17 @@ static inline void inode_sb_list_del(struct inode *inode)
>  	}
>  }
>  
> +/*
> + * Ensure that we store the hash head in the inode when we insert the inode into
> + * the hlist_bl_head...
> + */
> +static inline void
> +__insert_inode_hash_head(struct inode *inode, struct hlist_bl_head *b)
> +{
> +	hlist_bl_add_head_rcu(&inode->i_hash, b);
> +	inode->i_hash_head = b;
> +}
> +
>  /**
>   *	__insert_inode_hash - hash an inode
>   *	@inode: unhashed inode
> @@ -501,13 +511,13 @@ static inline void inode_sb_list_del(struct inode *inode)
>   */
>  void __insert_inode_hash(struct inode *inode, unsigned long hashval)
>  {
> -	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval);
>  
> -	spin_lock(&inode_hash_lock);
> +	hlist_bl_lock(b);
>  	spin_lock(&inode->i_lock);
> -	hlist_add_head_rcu(&inode->i_hash, b);
> +	__insert_inode_hash_head(inode, b);
>  	spin_unlock(&inode->i_lock);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_unlock(b);
>  }
>  EXPORT_SYMBOL(__insert_inode_hash);
>  
> @@ -519,11 +529,44 @@ EXPORT_SYMBOL(__insert_inode_hash);
>   */
>  void __remove_inode_hash(struct inode *inode)
>  {
> -	spin_lock(&inode_hash_lock);
> -	spin_lock(&inode->i_lock);
> -	hlist_del_init_rcu(&inode->i_hash);
> -	spin_unlock(&inode->i_lock);
> -	spin_unlock(&inode_hash_lock);
> +	struct hlist_bl_head *b = inode->i_hash_head;
> +
> +	/*
> +	 * There are some callers that come through here without synchronisation
> +	 * and potentially with multiple references to the inode. Hence we have
> +	 * to handle the case that we might race with a remove and insert to a
> +	 * different list. Coda, in particular, seems to have a userspace API
> +	 * that can directly trigger "unhash/rehash to different list" behaviour
> +	 * without any serialisation at all.
> +	 *
> +	 * Hence we have to handle the situation where the inode->i_hash_head
> +	 * might point to a different list than what we expect, indicating that
> +	 * we raced with another unhash and potentially a new insertion. This
> +	 * means we have to retest the head once we have everything locked up
> +	 * and loop again if it doesn't match.
> +	 */
> +	while (b) {
> +		hlist_bl_lock(b);
> +		spin_lock(&inode->i_lock);
> +		if (b != inode->i_hash_head) {
> +			hlist_bl_unlock(b);
> +			b = inode->i_hash_head;
> +			spin_unlock(&inode->i_lock);
> +			continue;
> +		}
> +		/*
> +		 * Need to set the pprev pointer to NULL after list removal so
> +		 * that both RCU traversals and hlist_bl_unhashed() work
> +		 * correctly at this point.
> +		 */
> +		hlist_bl_del_rcu(&inode->i_hash);
> +		inode->i_hash.pprev = NULL;
> +		inode->i_hash_head = NULL;
> +		spin_unlock(&inode->i_lock);
> +		hlist_bl_unlock(b);
> +		break;
> +	}
> +
>  }
>  EXPORT_SYMBOL(__remove_inode_hash);
>  
> @@ -813,26 +856,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
>  	return freed;
>  }
>  
> -static void __wait_on_freeing_inode(struct inode *inode);
> +static void __wait_on_freeing_inode(struct hlist_bl_head *b,
> +				struct inode *inode);
>  /*
>   * Called with the inode lock held.
>   */
>  static struct inode *find_inode(struct super_block *sb,
> -				struct hlist_head *head,
> +				struct hlist_bl_head *b,
>  				int (*test)(struct inode *, void *),
>  				void *data)
>  {
> +	struct hlist_bl_node *node;
>  	struct inode *inode = NULL;
>  
>  repeat:
> -	hlist_for_each_entry(inode, head, i_hash) {
> +	hlist_bl_for_each_entry(inode, node, b, i_hash) {
>  		if (inode->i_sb != sb)
>  			continue;
>  		if (!test(inode, data))
>  			continue;
>  		spin_lock(&inode->i_lock);
>  		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
> -			__wait_on_freeing_inode(inode);
> +			__wait_on_freeing_inode(b, inode);
>  			goto repeat;
>  		}
>  		if (unlikely(inode->i_state & I_CREATING)) {
> @@ -851,19 +896,20 @@ static struct inode *find_inode(struct super_block *sb,
>   * iget_locked for details.
>   */
>  static struct inode *find_inode_fast(struct super_block *sb,
> -				struct hlist_head *head, unsigned long ino)
> +				struct hlist_bl_head *b, unsigned long ino)
>  {
> +	struct hlist_bl_node *node;
>  	struct inode *inode = NULL;
>  
>  repeat:
> -	hlist_for_each_entry(inode, head, i_hash) {
> +	hlist_bl_for_each_entry(inode, node, b, i_hash) {
>  		if (inode->i_ino != ino)
>  			continue;
>  		if (inode->i_sb != sb)
>  			continue;
>  		spin_lock(&inode->i_lock);
>  		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
> -			__wait_on_freeing_inode(inode);
> +			__wait_on_freeing_inode(b, inode);
>  			goto repeat;
>  		}
>  		if (unlikely(inode->i_state & I_CREATING)) {
> @@ -1072,26 +1118,26 @@ EXPORT_SYMBOL(unlock_two_nondirectories);
>   * return it locked, hashed, and with the I_NEW flag set. The file system gets
>   * to fill it in before unlocking it via unlock_new_inode().
>   *
> - * Note both @test and @set are called with the inode_hash_lock held, so can't
> - * sleep.
> + * Note both @test and @set are called with the inode hash chain lock held,
> + * so can't sleep.
>   */
>  struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
>  			    int (*test)(struct inode *, void *),
>  			    int (*set)(struct inode *, void *), void *data)
>  {
> -	struct hlist_head *head = i_hash_head(inode->i_sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval);
>  	struct inode *old;
>  	bool creating = inode->i_state & I_CREATING;
>  
>  again:
> -	spin_lock(&inode_hash_lock);
> -	old = find_inode(inode->i_sb, head, test, data);
> +	hlist_bl_lock(b);
> +	old = find_inode(inode->i_sb, b, test, data);
>  	if (unlikely(old)) {
>  		/*
>  		 * Uhhuh, somebody else created the same inode under us.
>  		 * Use the old inode instead of the preallocated one.
>  		 */
> -		spin_unlock(&inode_hash_lock);
> +		hlist_bl_unlock(b);
>  		if (IS_ERR(old))
>  			return NULL;
>  		wait_on_inode(old);
> @@ -1113,12 +1159,12 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
>  	 */
>  	spin_lock(&inode->i_lock);
>  	inode->i_state |= I_NEW;
> -	hlist_add_head_rcu(&inode->i_hash, head);
> +	__insert_inode_hash_head(inode, b);
>  	spin_unlock(&inode->i_lock);
>  	if (!creating)
>  		inode_sb_list_add(inode);
>  unlock:
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_unlock(b);
>  
>  	return inode;
>  }
> @@ -1179,12 +1225,12 @@ EXPORT_SYMBOL(iget5_locked);
>   */
>  struct inode *iget_locked(struct super_block *sb, unsigned long ino)
>  {
> -	struct hlist_head *head = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
>  	struct inode *inode;
>  again:
> -	spin_lock(&inode_hash_lock);
> -	inode = find_inode_fast(sb, head, ino);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_lock(b);
> +	inode = find_inode_fast(sb, b, ino);
> +	hlist_bl_unlock(b);
>  	if (inode) {
>  		if (IS_ERR(inode))
>  			return NULL;
> @@ -1200,17 +1246,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
>  	if (inode) {
>  		struct inode *old;
>  
> -		spin_lock(&inode_hash_lock);
> +		hlist_bl_lock(b);
>  		/* We released the lock, so.. */
> -		old = find_inode_fast(sb, head, ino);
> +		old = find_inode_fast(sb, b, ino);
>  		if (!old) {
>  			inode->i_ino = ino;
>  			spin_lock(&inode->i_lock);
>  			inode->i_state = I_NEW;
> -			hlist_add_head_rcu(&inode->i_hash, head);
> +			__insert_inode_hash_head(inode, b);
>  			spin_unlock(&inode->i_lock);
>  			inode_sb_list_add(inode);
> -			spin_unlock(&inode_hash_lock);
> +			hlist_bl_unlock(b);
>  
>  			/* Return the locked inode with I_NEW set, the
>  			 * caller is responsible for filling in the contents
> @@ -1223,7 +1269,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
>  		 * us. Use the old inode instead of the one we just
>  		 * allocated.
>  		 */
> -		spin_unlock(&inode_hash_lock);
> +		hlist_bl_unlock(b);
>  		destroy_inode(inode);
>  		if (IS_ERR(old))
>  			return NULL;
> @@ -1247,10 +1293,11 @@ EXPORT_SYMBOL(iget_locked);
>   */
>  static int test_inode_iunique(struct super_block *sb, unsigned long ino)
>  {
> -	struct hlist_head *b = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
> +	struct hlist_bl_node *node;
>  	struct inode *inode;
>  
> -	hlist_for_each_entry_rcu(inode, b, i_hash) {
> +	hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
>  		if (inode->i_ino == ino && inode->i_sb == sb)
>  			return 0;
>  	}
> @@ -1334,12 +1381,12 @@ EXPORT_SYMBOL(igrab);
>  struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
>  		int (*test)(struct inode *, void *), void *data)
>  {
> -	struct hlist_head *head = i_hash_head(sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(sb, hashval);
>  	struct inode *inode;
>  
> -	spin_lock(&inode_hash_lock);
> -	inode = find_inode(sb, head, test, data);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_lock(b);
> +	inode = find_inode(sb, b, test, data);
> +	hlist_bl_unlock(b);
>  
>  	return IS_ERR(inode) ? NULL : inode;
>  }
> @@ -1389,12 +1436,12 @@ EXPORT_SYMBOL(ilookup5);
>   */
>  struct inode *ilookup(struct super_block *sb, unsigned long ino)
>  {
> -	struct hlist_head *head = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
>  	struct inode *inode;
>  again:
> -	spin_lock(&inode_hash_lock);
> -	inode = find_inode_fast(sb, head, ino);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_lock(b);
> +	inode = find_inode_fast(sb, b, ino);
> +	hlist_bl_unlock(b);
>  
>  	if (inode) {
>  		if (IS_ERR(inode))
> @@ -1438,12 +1485,13 @@ struct inode *find_inode_nowait(struct super_block *sb,
>  					     void *),
>  				void *data)
>  {
> -	struct hlist_head *head = i_hash_head(sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(sb, hashval);
> +	struct hlist_bl_node *node;
>  	struct inode *inode, *ret_inode = NULL;
>  	int mval;
>  
> -	spin_lock(&inode_hash_lock);
> -	hlist_for_each_entry(inode, head, i_hash) {
> +	hlist_bl_lock(b);
> +	hlist_bl_for_each_entry(inode, node, b, i_hash) {
>  		if (inode->i_sb != sb)
>  			continue;
>  		mval = match(inode, hashval, data);
> @@ -1454,7 +1502,7 @@ struct inode *find_inode_nowait(struct super_block *sb,
>  		goto out;
>  	}
>  out:
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_unlock(b);
>  	return ret_inode;
>  }
>  EXPORT_SYMBOL(find_inode_nowait);
> @@ -1483,13 +1531,14 @@ EXPORT_SYMBOL(find_inode_nowait);
>  struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
>  			     int (*test)(struct inode *, void *), void *data)
>  {
> -	struct hlist_head *head = i_hash_head(sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(sb, hashval);
> +	struct hlist_bl_node *node;
>  	struct inode *inode;
>  
>  	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
>  			 "suspicious find_inode_rcu() usage");
>  
> -	hlist_for_each_entry_rcu(inode, head, i_hash) {
> +	hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
>  		if (inode->i_sb == sb &&
>  		    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) &&
>  		    test(inode, data))
> @@ -1521,13 +1570,14 @@ EXPORT_SYMBOL(find_inode_rcu);
>  struct inode *find_inode_by_ino_rcu(struct super_block *sb,
>  				    unsigned long ino)
>  {
> -	struct hlist_head *head = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
> +	struct hlist_bl_node *node;
>  	struct inode *inode;
>  
>  	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
>  			 "suspicious find_inode_by_ino_rcu() usage");
>  
> -	hlist_for_each_entry_rcu(inode, head, i_hash) {
> +	hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
>  		if (inode->i_ino == ino &&
>  		    inode->i_sb == sb &&
>  		    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)))
> @@ -1541,39 +1591,42 @@ int insert_inode_locked(struct inode *inode)
>  {
>  	struct super_block *sb = inode->i_sb;
>  	ino_t ino = inode->i_ino;
> -	struct hlist_head *head = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
>  
>  	while (1) {
> -		struct inode *old = NULL;
> -		spin_lock(&inode_hash_lock);
> -		hlist_for_each_entry(old, head, i_hash) {
> -			if (old->i_ino != ino)
> +		struct hlist_bl_node *node;
> +		struct inode *old = NULL, *t;
> +
> +		hlist_bl_lock(b);
> +		hlist_bl_for_each_entry(t, node, b, i_hash) {
> +			if (t->i_ino != ino)
>  				continue;
> -			if (old->i_sb != sb)
> +			if (t->i_sb != sb)
>  				continue;
> -			spin_lock(&old->i_lock);
> -			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
> -				spin_unlock(&old->i_lock);
> +			spin_lock(&t->i_lock);
> +			if (t->i_state & (I_FREEING|I_WILL_FREE)) {
> +				spin_unlock(&t->i_lock);
>  				continue;
>  			}
> +			old = t;
>  			break;
>  		}
>  		if (likely(!old)) {
>  			spin_lock(&inode->i_lock);
>  			inode->i_state |= I_NEW | I_CREATING;
> -			hlist_add_head_rcu(&inode->i_hash, head);
> +			__insert_inode_hash_head(inode, b);
>  			spin_unlock(&inode->i_lock);
> -			spin_unlock(&inode_hash_lock);
> +			hlist_bl_unlock(b);
>  			return 0;
>  		}
>  		if (unlikely(old->i_state & I_CREATING)) {
>  			spin_unlock(&old->i_lock);
> -			spin_unlock(&inode_hash_lock);
> +			hlist_bl_unlock(b);
>  			return -EBUSY;
>  		}
>  		__iget(old);
>  		spin_unlock(&old->i_lock);
> -		spin_unlock(&inode_hash_lock);
> +		hlist_bl_unlock(b);
>  		wait_on_inode(old);
>  		if (unlikely(!inode_unhashed(old))) {
>  			iput(old);
> @@ -2046,17 +2099,18 @@ EXPORT_SYMBOL(inode_needs_sync);
>   * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
>   * will DTRT.
>   */
> -static void __wait_on_freeing_inode(struct inode *inode)
> +static void __wait_on_freeing_inode(struct hlist_bl_head *b,
> +				struct inode *inode)
>  {
>  	wait_queue_head_t *wq;
>  	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
>  	wq = bit_waitqueue(&inode->i_state, __I_NEW);
>  	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
>  	spin_unlock(&inode->i_lock);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_unlock(b);
>  	schedule();
>  	finish_wait(wq, &wait.wq_entry);
> -	spin_lock(&inode_hash_lock);
> +	hlist_bl_lock(b);
>  }
>  
>  static __initdata unsigned long ihash_entries;
> @@ -2082,7 +2136,7 @@ void __init inode_init_early(void)
>  
>  	inode_hashtable =
>  		alloc_large_system_hash("Inode-cache",
> -					sizeof(struct hlist_head),
> +					sizeof(struct hlist_bl_head),
>  					ihash_entries,
>  					14,
>  					HASH_EARLY | HASH_ZERO,
> @@ -2108,7 +2162,7 @@ void __init inode_init(void)
>  
>  	inode_hashtable =
>  		alloc_large_system_hash("Inode-cache",
> -					sizeof(struct hlist_head),
> +					sizeof(struct hlist_bl_head),
>  					ihash_entries,
>  					14,
>  					HASH_ZERO,
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index ec8f3ddf4a6a..89c9e49a71a6 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -664,7 +664,8 @@ struct inode {
>  	unsigned long		dirtied_when;	/* jiffies of first dirtying */
>  	unsigned long		dirtied_time_when;
>  
> -	struct hlist_node	i_hash;
> +	struct hlist_bl_node	i_hash;
> +	struct hlist_bl_head	*i_hash_head;
>  	struct list_head	i_io_list;	/* backing dev IO list */
>  #ifdef CONFIG_CGROUP_WRITEBACK
>  	struct bdi_writeback	*i_wb;		/* the associated cgroup wb */
> @@ -730,7 +731,7 @@ static inline unsigned int i_blocksize(const struct inode *node)
>  
>  static inline int inode_unhashed(struct inode *inode)
>  {
> -	return hlist_unhashed(&inode->i_hash);
> +	return hlist_bl_unhashed(&inode->i_hash);
>  }
>  
>  /*
> @@ -741,7 +742,7 @@ static inline int inode_unhashed(struct inode *inode)
>   */
>  static inline void inode_fake_hash(struct inode *inode)
>  {
> -	hlist_add_fake(&inode->i_hash);
> +	hlist_bl_add_fake(&inode->i_hash);
>  }
>  
>  /*
> @@ -3065,7 +3066,7 @@ static inline void insert_inode_hash(struct inode *inode)
>  extern void __remove_inode_hash(struct inode *);
>  static inline void remove_inode_hash(struct inode *inode)
>  {
> -	if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash))
> +	if (!inode_unhashed(inode) && !hlist_bl_fake(&inode->i_hash))
>  		__remove_inode_hash(inode);
>  }
>  
> -- 
> 2.31.0
>
Thomas Gleixner April 12, 2021, 3:20 p.m. UTC | #4
Dave,

On Wed, Apr 07 2021 at 07:22, Dave Chinner wrote:
> On Tue, Apr 06, 2021 at 02:28:34PM +0100, Matthew Wilcox wrote:
>> On Tue, Apr 06, 2021 at 10:33:43PM +1000, Dave Chinner wrote:
>> > +++ b/fs/inode.c
>> > @@ -57,8 +57,7 @@
>> >  
>> >  static unsigned int i_hash_mask __read_mostly;
>> >  static unsigned int i_hash_shift __read_mostly;
>> > -static struct hlist_head *inode_hashtable __read_mostly;
>> > -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
>> > +static struct hlist_bl_head *inode_hashtable __read_mostly;
>> 
>> I'm a little concerned that we're losing a lockdep map here.  
>> 
>> Nobody seems to have done this for list_bl yet, and I'd be reluctant
>> to gate your patch on "Hey, Dave, solve this problem nobody else has
>> done yet".
>
> I really don't care about lockdep. Adding lockdep support to
> hlist_bl is somebody else's problem - I'm just using infrastructure
> that already exists. Also, the dentry cache usage of hlist_bl is
> vastly more complex and so if lockdep coverage was really necessary,
> it would have already been done....
>
> And, FWIW, I'm also aware of the problems that RT kernels have with
> the use of bit spinlocks and being unable to turn them into sleeping
> mutexes by preprocessor magic. I don't care about that either,
> because dentry cache...

In the dentry cache it's a non-issue.

RT does not have a problem with bit spinlocks per se, it depends on how
they are used and what nests inside. Most of them are just kept as bit
spinlocks because the lock held, and therefore preempt disabled times
are small and no other on RT conflicting operations happen inside.

In the case at hand this is going to be a problem because inode->i_lock
nests inside the bit spinlock and we can't make inode->i_lock a raw
spinlock because it protects way heavier weight code pathes as well.

Thanks,

        tglx
Dave Chinner April 12, 2021, 10:15 p.m. UTC | #5
On Mon, Apr 12, 2021 at 05:20:53PM +0200, Thomas Gleixner wrote:
> Dave,
> 
> On Wed, Apr 07 2021 at 07:22, Dave Chinner wrote:
> > On Tue, Apr 06, 2021 at 02:28:34PM +0100, Matthew Wilcox wrote:
> >> On Tue, Apr 06, 2021 at 10:33:43PM +1000, Dave Chinner wrote:
> >> > +++ b/fs/inode.c
> >> > @@ -57,8 +57,7 @@
> >> >  
> >> >  static unsigned int i_hash_mask __read_mostly;
> >> >  static unsigned int i_hash_shift __read_mostly;
> >> > -static struct hlist_head *inode_hashtable __read_mostly;
> >> > -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
> >> > +static struct hlist_bl_head *inode_hashtable __read_mostly;
> >> 
> >> I'm a little concerned that we're losing a lockdep map here.  
> >> 
> >> Nobody seems to have done this for list_bl yet, and I'd be reluctant
> >> to gate your patch on "Hey, Dave, solve this problem nobody else has
> >> done yet".
> >
> > I really don't care about lockdep. Adding lockdep support to
> > hlist_bl is somebody else's problem - I'm just using infrastructure
> > that already exists. Also, the dentry cache usage of hlist_bl is
> > vastly more complex and so if lockdep coverage was really necessary,
> > it would have already been done....
> >
> > And, FWIW, I'm also aware of the problems that RT kernels have with
> > the use of bit spinlocks and being unable to turn them into sleeping
> > mutexes by preprocessor magic. I don't care about that either,
> > because dentry cache...
> 
> In the dentry cache it's a non-issue.

Incorrect.

> RT does not have a problem with bit spinlocks per se, it depends on how
> they are used and what nests inside. Most of them are just kept as bit
> spinlocks because the lock held, and therefore preempt disabled times
> are small and no other on RT conflicting operations happen inside.
> 
> In the case at hand this is going to be a problem because inode->i_lock
> nests inside the bit spinlock and we can't make inode->i_lock a raw
> spinlock because it protects way heavier weight code pathes as well.

Yes, that's exactly the "problem" I'm refering to. And I don't care,
precisely because, well, dentry cache....

THat is, the dcache calls wake_up_all() from under the
hlist_bl_lock() in __d_lookup_done(). That ends up in
__wake_up_common_lock() which takes a spin lock embedded inside a
wait_queue_head.  That's not a raw spinlock, either, so we already
have this "spinlock inside bit lock" situation with the dcache usage
of hlist_bl.

FYI, this dentry cache behaviour was added to the dentry cache in
2016 by commit d9171b934526 ("parallel lookups machinery, part 4
(and last)"), so it's not like it's a new thing, either.

If you want to make hlist_bl RT safe, then re-implement it behind
the scenes for RT enabled kernels. All it takes is more memory
usage for the hash table + locks, but that's something that non-RT
people should not be burdened with caring about....

Cheers,

Dave.
Thomas Gleixner April 12, 2021, 11:18 p.m. UTC | #6
Dave,

On Tue, Apr 13 2021 at 08:15, Dave Chinner wrote:
> On Mon, Apr 12, 2021 at 05:20:53PM +0200, Thomas Gleixner wrote:
>> On Wed, Apr 07 2021 at 07:22, Dave Chinner wrote:
>> > And, FWIW, I'm also aware of the problems that RT kernels have with
>> > the use of bit spinlocks and being unable to turn them into sleeping
>> > mutexes by preprocessor magic. I don't care about that either,
>> > because dentry cache...
>> 
>> In the dentry cache it's a non-issue.
>
> Incorrect.

I'm impressed about your detailed knowledge of something you do not care
about in the first place.

>> RT does not have a problem with bit spinlocks per se, it depends on how
>> they are used and what nests inside. Most of them are just kept as bit
>> spinlocks because the lock held, and therefore preempt disabled times
>> are small and no other on RT conflicting operations happen inside.
>> 
>> In the case at hand this is going to be a problem because inode->i_lock
>> nests inside the bit spinlock and we can't make inode->i_lock a raw
>> spinlock because it protects way heavier weight code pathes as well.
>
> Yes, that's exactly the "problem" I'm refering to. And I don't care,
> precisely because, well, dentry cache....
>
> THat is, the dcache calls wake_up_all() from under the
> hlist_bl_lock() in __d_lookup_done(). That ends up in
> __wake_up_common_lock() which takes a spin lock embedded inside a
> wait_queue_head.  That's not a raw spinlock, either, so we already
> have this "spinlock inside bit lock" situation with the dcache usage
> of hlist_bl.

Sure, but you are missing that RT solves that by substituting the
wait_queue with a swait_queue, which does not suffer from that. But that
can't be done for the inode::i_lock case for various reasons.

> FYI, this dentry cache behaviour was added to the dentry cache in
> 2016 by commit d9171b934526 ("parallel lookups machinery, part 4
> (and last)"), so it's not like it's a new thing, either.

Really? I wasn't aware of that. Thanks for the education.

> If you want to make hlist_bl RT safe, then re-implement it behind
> the scenes for RT enabled kernels. All it takes is more memory
> usage for the hash table + locks, but that's something that non-RT
> people should not be burdened with caring about....

I'm well aware that anything outside of @fromorbit universe is not
interesting to you, but I neverless want to take the opportunity to
express my appreciation for your truly caring and collaborative attitude
versus interests of others who unfortunately do no share that universe.

Thanks,

        tglx
Dave Chinner April 13, 2021, 9:58 a.m. UTC | #7
On Tue, Apr 13, 2021 at 01:18:35AM +0200, Thomas Gleixner wrote:
> Dave,
> 
> On Tue, Apr 13 2021 at 08:15, Dave Chinner wrote:
> > On Mon, Apr 12, 2021 at 05:20:53PM +0200, Thomas Gleixner wrote:
> >> On Wed, Apr 07 2021 at 07:22, Dave Chinner wrote:
> >> > And, FWIW, I'm also aware of the problems that RT kernels have with
> >> > the use of bit spinlocks and being unable to turn them into sleeping
> >> > mutexes by preprocessor magic. I don't care about that either,
> >> > because dentry cache...
> >> 
> >> In the dentry cache it's a non-issue.
> >
> > Incorrect.
> 
> I'm impressed about your detailed knowledge of something you do not care
> about in the first place.

There's a difference between "don't care because don't understand"
and "don't care because I know how complex real-time is and I know I
can't validate any code I write to be RT safe".

Indeed, just because I work on filesystems now doesn't mean I don't
know what real-time is - I spent the best part of a decade as an
industrial control engineer building systems that provided water and
electricity to populations of millions of people before I started
working on filesystems....

> >> RT does not have a problem with bit spinlocks per se, it depends on how
> >> they are used and what nests inside. Most of them are just kept as bit
> >> spinlocks because the lock held, and therefore preempt disabled times
> >> are small and no other on RT conflicting operations happen inside.
> >> 
> >> In the case at hand this is going to be a problem because inode->i_lock
> >> nests inside the bit spinlock and we can't make inode->i_lock a raw
> >> spinlock because it protects way heavier weight code pathes as well.
> >
> > Yes, that's exactly the "problem" I'm refering to. And I don't care,
> > precisely because, well, dentry cache....
> >
> > THat is, the dcache calls wake_up_all() from under the
> > hlist_bl_lock() in __d_lookup_done(). That ends up in
> > __wake_up_common_lock() which takes a spin lock embedded inside a
> > wait_queue_head.  That's not a raw spinlock, either, so we already
> > have this "spinlock inside bit lock" situation with the dcache usage
> > of hlist_bl.
> 
> Sure, but you are missing that RT solves that by substituting the
> wait_queue with a swait_queue, which does not suffer from that. But that
> can't be done for the inode::i_lock case for various reasons.

I didn't know about that specific forklift replacement. But, really
that simply adds weight to my comment below....

> > FYI, this dentry cache behaviour was added to the dentry cache in
> > 2016 by commit d9171b934526 ("parallel lookups machinery, part 4
> > (and last)"), so it's not like it's a new thing, either.
> 
> Really? I wasn't aware of that. Thanks for the education.
> 
> > If you want to make hlist_bl RT safe, then re-implement it behind
> > the scenes for RT enabled kernels. All it takes is more memory
> > usage for the hash table + locks, but that's something that non-RT
> > people should not be burdened with caring about....

... because if RT devs are willing to forklift replace core kernel
functionality like wait queues to provide RT kernels with a
completely different locking schema to vanilla kernels, then
slightly modifying the hlist-bl structure in a RT compatible way is
child's play....

> I'm well aware that anything outside of @fromorbit universe is not
> interesting to you, but I neverless want to take the opportunity to
> express my appreciation for your truly caring and collaborative attitude
> versus interests of others who unfortunately do no share that universe.

I'm being realistic. I dont' have the time or mental bandwidth to
solve RT kernel problems. I don't have any way to test RT kernels,
and lockdep is a crock of shit for validating RT locking on vanilla
kernels because of the forklift upgrade games like the above that
give the RT kernel a different locking schema.

Because I have sufficient knowledge of the real-time game, I know
*I'm not an RT expert* these days. I know that I don't know all the
games it plays, nor do I have the time (or patience) to learn about
all of them, nor the resources or knowledge to test whether the code
I write follows all the rules I don't know about, whether I
introduced interrupt hold-offs longer than 50us, etc.

IOWs, I chose not to care about RT because I know I don't know
enough about it to write solid, well tested RT compatible kernel
code. I can write untested shit as well as any other programmer, but
I have much higher professional standards than that.

And I also know there are paid professionals who are RT experts who
are supposed to take care of this stuff so random kernel devs like
myself *don't need to care about the details of how RT kernels do
their magic*.

So for solving the inode cache scalability issue with RT in mind,
we're left with these choices:

a) increase memory consumption and cacheline misses for everyone by
   adding a spinlock per hash chain so that RT kernels can do their
   substitution magic and make the memory footprint and scalability
   for RT kernels worse

b) convert the inode hash table to something different (rhashtable,
   radix tree, Xarray, etc) that is more scalable and more "RT
   friendly".

c) have RT kernel substitute hlist-bl with hlist_head and a spinlock
   so that it all works correctly on RT kernels and only RT kernels
   take the memory footprint and cacheline miss penalties...

We rejected a) for the dentry hash table, so it is not an appropriate
soltion for the inode hash table for the same reasons.

There is a lot of downside to b). Firstly there's the time and
resources needed for experimentation to find an appropriate
algorithm for both scalability and RT. Then all the insert, removal
and search facilities will have to be rewritten, along with all the
subtlies like "fake hashing" to allow fielsysetms to provide their
own inode caches.  The changes in behaviour and, potentially, API
semantics will greatly increase the risk of regressions and adverse
behaviour on both vanilla and RT kernels compared to option a) or
c).

It is clear that option c) is of minimal risk to vanilla kernels,
and low risk to RT kernels. It's pretty straight forward to do for
both configs, and only the RT kernels take the memory footprint
penalty.

So a technical analysis points to c) being the most reasonable
resolution of the problem.

Making sure RT kernels work correctly is your job, Thomas, not mine.
Converting hlist-bl to a rt compatible structure should be pretty
simple:


struct hlist_bl_head {
        struct hlist_bl_node *first;
+#idef CONFIG_RT
+	spinlock_t lock;
+#endif
};

.....
static inline void hlist_bl_lock(struct hlist_bl_head *b)
{
+#ifdef CONFIG_RT
+	spin_lock(&b->lock);
+#else
	bit_spin_lock(0, (unsigned long *)b);
+#endif
}

static inline void hlist_bl_unlock(struct hlist_bl_head *b)
{
+#ifdef CONFIG_RT
+	spin_unlock(&b->lock);
+#else
	bit_spin_lock(0, (unsigned long *)b);
+#endif
}

So if you want to run up a patch that converts hlist-bl to be rt
compatible and test it on your RT test farm and send it to me, then
I'll happily include it in my patchset....

Cheers,

Dave.
Thomas Gleixner April 13, 2021, 9:24 p.m. UTC | #8
Dave,

On Tue, Apr 13 2021 at 19:58, Dave Chinner wrote:
> On Tue, Apr 13, 2021 at 01:18:35AM +0200, Thomas Gleixner wrote:
> So for solving the inode cache scalability issue with RT in mind,
> we're left with these choices:
>
> a) increase memory consumption and cacheline misses for everyone by
>    adding a spinlock per hash chain so that RT kernels can do their
>    substitution magic and make the memory footprint and scalability
>    for RT kernels worse
>
> b) convert the inode hash table to something different (rhashtable,
>    radix tree, Xarray, etc) that is more scalable and more "RT
>    friendly".
>
> c) have RT kernel substitute hlist-bl with hlist_head and a spinlock
>    so that it all works correctly on RT kernels and only RT kernels
>    take the memory footprint and cacheline miss penalties...
>
> We rejected a) for the dentry hash table, so it is not an appropriate
> soltion for the inode hash table for the same reasons.
>
> There is a lot of downside to b). Firstly there's the time and
> resources needed for experimentation to find an appropriate
> algorithm for both scalability and RT. Then all the insert, removal
> and search facilities will have to be rewritten, along with all the
> subtlies like "fake hashing" to allow fielsysetms to provide their
> own inode caches.  The changes in behaviour and, potentially, API
> semantics will greatly increase the risk of regressions and adverse
> behaviour on both vanilla and RT kernels compared to option a) or
> c).
>
> It is clear that option c) is of minimal risk to vanilla kernels,
> and low risk to RT kernels. It's pretty straight forward to do for
> both configs, and only the RT kernels take the memory footprint
> penalty.
>
> So a technical analysis points to c) being the most reasonable
> resolution of the problem.

I agree with that analysis for technical reasons and I'm not entirely
unfamiliar how to solve hlist_bl conversions on RT either as you might
have guessed.

Having a technical argument to discuss and agree on is far simpler
than going along with "I don't care".

Thanks for taking the time to put a technical rationale on this!

       tglx
diff mbox series

Patch

diff --git a/fs/inode.c b/fs/inode.c
index b8d9eb3454dc..867af386177b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -57,8 +57,7 @@ 
 
 static unsigned int i_hash_mask __read_mostly;
 static unsigned int i_hash_shift __read_mostly;
-static struct hlist_head *inode_hashtable __read_mostly;
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
+static struct hlist_bl_head *inode_hashtable __read_mostly;
 
 static unsigned long hash(struct super_block *sb, unsigned long hashval)
 {
@@ -70,7 +69,7 @@  static unsigned long hash(struct super_block *sb, unsigned long hashval)
 	return tmp & i_hash_mask;
 }
 
-static inline struct hlist_head *i_hash_head(struct super_block *sb,
+static inline struct hlist_bl_head *i_hash_head(struct super_block *sb,
 		unsigned int hashval)
 {
 	return inode_hashtable + hash(sb, hashval);
@@ -407,7 +406,7 @@  EXPORT_SYMBOL(address_space_init_once);
 void inode_init_once(struct inode *inode)
 {
 	memset(inode, 0, sizeof(*inode));
-	INIT_HLIST_NODE(&inode->i_hash);
+	INIT_HLIST_BL_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_LIST_HEAD(&inode->i_io_list);
 	INIT_LIST_HEAD(&inode->i_wb_list);
@@ -491,6 +490,17 @@  static inline void inode_sb_list_del(struct inode *inode)
 	}
 }
 
+/*
+ * Ensure that we store the hash head in the inode when we insert the inode into
+ * the hlist_bl_head...
+ */
+static inline void
+__insert_inode_hash_head(struct inode *inode, struct hlist_bl_head *b)
+{
+	hlist_bl_add_head_rcu(&inode->i_hash, b);
+	inode->i_hash_head = b;
+}
+
 /**
  *	__insert_inode_hash - hash an inode
  *	@inode: unhashed inode
@@ -501,13 +511,13 @@  static inline void inode_sb_list_del(struct inode *inode)
  */
 void __insert_inode_hash(struct inode *inode, unsigned long hashval)
 {
-	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
+	struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval);
 
-	spin_lock(&inode_hash_lock);
+	hlist_bl_lock(b);
 	spin_lock(&inode->i_lock);
-	hlist_add_head_rcu(&inode->i_hash, b);
+	__insert_inode_hash_head(inode, b);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_hash_lock);
+	hlist_bl_unlock(b);
 }
 EXPORT_SYMBOL(__insert_inode_hash);
 
@@ -519,11 +529,44 @@  EXPORT_SYMBOL(__insert_inode_hash);
  */
 void __remove_inode_hash(struct inode *inode)
 {
-	spin_lock(&inode_hash_lock);
-	spin_lock(&inode->i_lock);
-	hlist_del_init_rcu(&inode->i_hash);
-	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_hash_lock);
+	struct hlist_bl_head *b = inode->i_hash_head;
+
+	/*
+	 * There are some callers that come through here without synchronisation
+	 * and potentially with multiple references to the inode. Hence we have
+	 * to handle the case that we might race with a remove and insert to a
+	 * different list. Coda, in particular, seems to have a userspace API
+	 * that can directly trigger "unhash/rehash to different list" behaviour
+	 * without any serialisation at all.
+	 *
+	 * Hence we have to handle the situation where the inode->i_hash_head
+	 * might point to a different list than what we expect, indicating that
+	 * we raced with another unhash and potentially a new insertion. This
+	 * means we have to retest the head once we have everything locked up
+	 * and loop again if it doesn't match.
+	 */
+	while (b) {
+		hlist_bl_lock(b);
+		spin_lock(&inode->i_lock);
+		if (b != inode->i_hash_head) {
+			hlist_bl_unlock(b);
+			b = inode->i_hash_head;
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		/*
+		 * Need to set the pprev pointer to NULL after list removal so
+		 * that both RCU traversals and hlist_bl_unhashed() work
+		 * correctly at this point.
+		 */
+		hlist_bl_del_rcu(&inode->i_hash);
+		inode->i_hash.pprev = NULL;
+		inode->i_hash_head = NULL;
+		spin_unlock(&inode->i_lock);
+		hlist_bl_unlock(b);
+		break;
+	}
+
 }
 EXPORT_SYMBOL(__remove_inode_hash);
 
@@ -813,26 +856,28 @@  long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
 	return freed;
 }
 
-static void __wait_on_freeing_inode(struct inode *inode);
+static void __wait_on_freeing_inode(struct hlist_bl_head *b,
+				struct inode *inode);
 /*
  * Called with the inode lock held.
  */
 static struct inode *find_inode(struct super_block *sb,
-				struct hlist_head *head,
+				struct hlist_bl_head *b,
 				int (*test)(struct inode *, void *),
 				void *data)
 {
+	struct hlist_bl_node *node;
 	struct inode *inode = NULL;
 
 repeat:
-	hlist_for_each_entry(inode, head, i_hash) {
+	hlist_bl_for_each_entry(inode, node, b, i_hash) {
 		if (inode->i_sb != sb)
 			continue;
 		if (!test(inode, data))
 			continue;
 		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
-			__wait_on_freeing_inode(inode);
+			__wait_on_freeing_inode(b, inode);
 			goto repeat;
 		}
 		if (unlikely(inode->i_state & I_CREATING)) {
@@ -851,19 +896,20 @@  static struct inode *find_inode(struct super_block *sb,
  * iget_locked for details.
  */
 static struct inode *find_inode_fast(struct super_block *sb,
-				struct hlist_head *head, unsigned long ino)
+				struct hlist_bl_head *b, unsigned long ino)
 {
+	struct hlist_bl_node *node;
 	struct inode *inode = NULL;
 
 repeat:
-	hlist_for_each_entry(inode, head, i_hash) {
+	hlist_bl_for_each_entry(inode, node, b, i_hash) {
 		if (inode->i_ino != ino)
 			continue;
 		if (inode->i_sb != sb)
 			continue;
 		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
-			__wait_on_freeing_inode(inode);
+			__wait_on_freeing_inode(b, inode);
 			goto repeat;
 		}
 		if (unlikely(inode->i_state & I_CREATING)) {
@@ -1072,26 +1118,26 @@  EXPORT_SYMBOL(unlock_two_nondirectories);
  * return it locked, hashed, and with the I_NEW flag set. The file system gets
  * to fill it in before unlocking it via unlock_new_inode().
  *
- * Note both @test and @set are called with the inode_hash_lock held, so can't
- * sleep.
+ * Note both @test and @set are called with the inode hash chain lock held,
+ * so can't sleep.
  */
 struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
 			    int (*test)(struct inode *, void *),
 			    int (*set)(struct inode *, void *), void *data)
 {
-	struct hlist_head *head = i_hash_head(inode->i_sb, hashval);
+	struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval);
 	struct inode *old;
 	bool creating = inode->i_state & I_CREATING;
 
 again:
-	spin_lock(&inode_hash_lock);
-	old = find_inode(inode->i_sb, head, test, data);
+	hlist_bl_lock(b);
+	old = find_inode(inode->i_sb, b, test, data);
 	if (unlikely(old)) {
 		/*
 		 * Uhhuh, somebody else created the same inode under us.
 		 * Use the old inode instead of the preallocated one.
 		 */
-		spin_unlock(&inode_hash_lock);
+		hlist_bl_unlock(b);
 		if (IS_ERR(old))
 			return NULL;
 		wait_on_inode(old);
@@ -1113,12 +1159,12 @@  struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
 	 */
 	spin_lock(&inode->i_lock);
 	inode->i_state |= I_NEW;
-	hlist_add_head_rcu(&inode->i_hash, head);
+	__insert_inode_hash_head(inode, b);
 	spin_unlock(&inode->i_lock);
 	if (!creating)
 		inode_sb_list_add(inode);
 unlock:
-	spin_unlock(&inode_hash_lock);
+	hlist_bl_unlock(b);
 
 	return inode;
 }
@@ -1179,12 +1225,12 @@  EXPORT_SYMBOL(iget5_locked);
  */
 struct inode *iget_locked(struct super_block *sb, unsigned long ino)
 {
-	struct hlist_head *head = i_hash_head(sb, ino);
+	struct hlist_bl_head *b = i_hash_head(sb, ino);
 	struct inode *inode;
 again:
-	spin_lock(&inode_hash_lock);
-	inode = find_inode_fast(sb, head, ino);
-	spin_unlock(&inode_hash_lock);
+	hlist_bl_lock(b);
+	inode = find_inode_fast(sb, b, ino);
+	hlist_bl_unlock(b);
 	if (inode) {
 		if (IS_ERR(inode))
 			return NULL;
@@ -1200,17 +1246,17 @@  struct inode *iget_locked(struct super_block *sb, unsigned long ino)
 	if (inode) {
 		struct inode *old;
 
-		spin_lock(&inode_hash_lock);
+		hlist_bl_lock(b);
 		/* We released the lock, so.. */
-		old = find_inode_fast(sb, head, ino);
+		old = find_inode_fast(sb, b, ino);
 		if (!old) {
 			inode->i_ino = ino;
 			spin_lock(&inode->i_lock);
 			inode->i_state = I_NEW;
-			hlist_add_head_rcu(&inode->i_hash, head);
+			__insert_inode_hash_head(inode, b);
 			spin_unlock(&inode->i_lock);
 			inode_sb_list_add(inode);
-			spin_unlock(&inode_hash_lock);
+			hlist_bl_unlock(b);
 
 			/* Return the locked inode with I_NEW set, the
 			 * caller is responsible for filling in the contents
@@ -1223,7 +1269,7 @@  struct inode *iget_locked(struct super_block *sb, unsigned long ino)
 		 * us. Use the old inode instead of the one we just
 		 * allocated.
 		 */
-		spin_unlock(&inode_hash_lock);
+		hlist_bl_unlock(b);
 		destroy_inode(inode);
 		if (IS_ERR(old))
 			return NULL;
@@ -1247,10 +1293,11 @@  EXPORT_SYMBOL(iget_locked);
  */
 static int test_inode_iunique(struct super_block *sb, unsigned long ino)
 {
-	struct hlist_head *b = i_hash_head(sb, ino);
+	struct hlist_bl_head *b = i_hash_head(sb, ino);
+	struct hlist_bl_node *node;
 	struct inode *inode;
 
-	hlist_for_each_entry_rcu(inode, b, i_hash) {
+	hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
 		if (inode->i_ino == ino && inode->i_sb == sb)
 			return 0;
 	}
@@ -1334,12 +1381,12 @@  EXPORT_SYMBOL(igrab);
 struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
 		int (*test)(struct inode *, void *), void *data)
 {
-	struct hlist_head *head = i_hash_head(sb, hashval);
+	struct hlist_bl_head *b = i_hash_head(sb, hashval);
 	struct inode *inode;
 
-	spin_lock(&inode_hash_lock);
-	inode = find_inode(sb, head, test, data);
-	spin_unlock(&inode_hash_lock);
+	hlist_bl_lock(b);
+	inode = find_inode(sb, b, test, data);
+	hlist_bl_unlock(b);
 
 	return IS_ERR(inode) ? NULL : inode;
 }
@@ -1389,12 +1436,12 @@  EXPORT_SYMBOL(ilookup5);
  */
 struct inode *ilookup(struct super_block *sb, unsigned long ino)
 {
-	struct hlist_head *head = i_hash_head(sb, ino);
+	struct hlist_bl_head *b = i_hash_head(sb, ino);
 	struct inode *inode;
 again:
-	spin_lock(&inode_hash_lock);
-	inode = find_inode_fast(sb, head, ino);
-	spin_unlock(&inode_hash_lock);
+	hlist_bl_lock(b);
+	inode = find_inode_fast(sb, b, ino);
+	hlist_bl_unlock(b);
 
 	if (inode) {
 		if (IS_ERR(inode))
@@ -1438,12 +1485,13 @@  struct inode *find_inode_nowait(struct super_block *sb,
 					     void *),
 				void *data)
 {
-	struct hlist_head *head = i_hash_head(sb, hashval);
+	struct hlist_bl_head *b = i_hash_head(sb, hashval);
+	struct hlist_bl_node *node;
 	struct inode *inode, *ret_inode = NULL;
 	int mval;
 
-	spin_lock(&inode_hash_lock);
-	hlist_for_each_entry(inode, head, i_hash) {
+	hlist_bl_lock(b);
+	hlist_bl_for_each_entry(inode, node, b, i_hash) {
 		if (inode->i_sb != sb)
 			continue;
 		mval = match(inode, hashval, data);
@@ -1454,7 +1502,7 @@  struct inode *find_inode_nowait(struct super_block *sb,
 		goto out;
 	}
 out:
-	spin_unlock(&inode_hash_lock);
+	hlist_bl_unlock(b);
 	return ret_inode;
 }
 EXPORT_SYMBOL(find_inode_nowait);
@@ -1483,13 +1531,14 @@  EXPORT_SYMBOL(find_inode_nowait);
 struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
 			     int (*test)(struct inode *, void *), void *data)
 {
-	struct hlist_head *head = i_hash_head(sb, hashval);
+	struct hlist_bl_head *b = i_hash_head(sb, hashval);
+	struct hlist_bl_node *node;
 	struct inode *inode;
 
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
 			 "suspicious find_inode_rcu() usage");
 
-	hlist_for_each_entry_rcu(inode, head, i_hash) {
+	hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
 		if (inode->i_sb == sb &&
 		    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) &&
 		    test(inode, data))
@@ -1521,13 +1570,14 @@  EXPORT_SYMBOL(find_inode_rcu);
 struct inode *find_inode_by_ino_rcu(struct super_block *sb,
 				    unsigned long ino)
 {
-	struct hlist_head *head = i_hash_head(sb, ino);
+	struct hlist_bl_head *b = i_hash_head(sb, ino);
+	struct hlist_bl_node *node;
 	struct inode *inode;
 
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
 			 "suspicious find_inode_by_ino_rcu() usage");
 
-	hlist_for_each_entry_rcu(inode, head, i_hash) {
+	hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
 		if (inode->i_ino == ino &&
 		    inode->i_sb == sb &&
 		    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)))
@@ -1541,39 +1591,42 @@  int insert_inode_locked(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 	ino_t ino = inode->i_ino;
-	struct hlist_head *head = i_hash_head(sb, ino);
+	struct hlist_bl_head *b = i_hash_head(sb, ino);
 
 	while (1) {
-		struct inode *old = NULL;
-		spin_lock(&inode_hash_lock);
-		hlist_for_each_entry(old, head, i_hash) {
-			if (old->i_ino != ino)
+		struct hlist_bl_node *node;
+		struct inode *old = NULL, *t;
+
+		hlist_bl_lock(b);
+		hlist_bl_for_each_entry(t, node, b, i_hash) {
+			if (t->i_ino != ino)
 				continue;
-			if (old->i_sb != sb)
+			if (t->i_sb != sb)
 				continue;
-			spin_lock(&old->i_lock);
-			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
-				spin_unlock(&old->i_lock);
+			spin_lock(&t->i_lock);
+			if (t->i_state & (I_FREEING|I_WILL_FREE)) {
+				spin_unlock(&t->i_lock);
 				continue;
 			}
+			old = t;
 			break;
 		}
 		if (likely(!old)) {
 			spin_lock(&inode->i_lock);
 			inode->i_state |= I_NEW | I_CREATING;
-			hlist_add_head_rcu(&inode->i_hash, head);
+			__insert_inode_hash_head(inode, b);
 			spin_unlock(&inode->i_lock);
-			spin_unlock(&inode_hash_lock);
+			hlist_bl_unlock(b);
 			return 0;
 		}
 		if (unlikely(old->i_state & I_CREATING)) {
 			spin_unlock(&old->i_lock);
-			spin_unlock(&inode_hash_lock);
+			hlist_bl_unlock(b);
 			return -EBUSY;
 		}
 		__iget(old);
 		spin_unlock(&old->i_lock);
-		spin_unlock(&inode_hash_lock);
+		hlist_bl_unlock(b);
 		wait_on_inode(old);
 		if (unlikely(!inode_unhashed(old))) {
 			iput(old);
@@ -2046,17 +2099,18 @@  EXPORT_SYMBOL(inode_needs_sync);
  * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
  * will DTRT.
  */
-static void __wait_on_freeing_inode(struct inode *inode)
+static void __wait_on_freeing_inode(struct hlist_bl_head *b,
+				struct inode *inode)
 {
 	wait_queue_head_t *wq;
 	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
 	wq = bit_waitqueue(&inode->i_state, __I_NEW);
 	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_hash_lock);
+	hlist_bl_unlock(b);
 	schedule();
 	finish_wait(wq, &wait.wq_entry);
-	spin_lock(&inode_hash_lock);
+	hlist_bl_lock(b);
 }
 
 static __initdata unsigned long ihash_entries;
@@ -2082,7 +2136,7 @@  void __init inode_init_early(void)
 
 	inode_hashtable =
 		alloc_large_system_hash("Inode-cache",
-					sizeof(struct hlist_head),
+					sizeof(struct hlist_bl_head),
 					ihash_entries,
 					14,
 					HASH_EARLY | HASH_ZERO,
@@ -2108,7 +2162,7 @@  void __init inode_init(void)
 
 	inode_hashtable =
 		alloc_large_system_hash("Inode-cache",
-					sizeof(struct hlist_head),
+					sizeof(struct hlist_bl_head),
 					ihash_entries,
 					14,
 					HASH_ZERO,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ec8f3ddf4a6a..89c9e49a71a6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -664,7 +664,8 @@  struct inode {
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 	unsigned long		dirtied_time_when;
 
-	struct hlist_node	i_hash;
+	struct hlist_bl_node	i_hash;
+	struct hlist_bl_head	*i_hash_head;
 	struct list_head	i_io_list;	/* backing dev IO list */
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback	*i_wb;		/* the associated cgroup wb */
@@ -730,7 +731,7 @@  static inline unsigned int i_blocksize(const struct inode *node)
 
 static inline int inode_unhashed(struct inode *inode)
 {
-	return hlist_unhashed(&inode->i_hash);
+	return hlist_bl_unhashed(&inode->i_hash);
 }
 
 /*
@@ -741,7 +742,7 @@  static inline int inode_unhashed(struct inode *inode)
  */
 static inline void inode_fake_hash(struct inode *inode)
 {
-	hlist_add_fake(&inode->i_hash);
+	hlist_bl_add_fake(&inode->i_hash);
 }
 
 /*
@@ -3065,7 +3066,7 @@  static inline void insert_inode_hash(struct inode *inode)
 extern void __remove_inode_hash(struct inode *);
 static inline void remove_inode_hash(struct inode *inode)
 {
-	if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash))
+	if (!inode_unhashed(inode) && !hlist_bl_fake(&inode->i_hash))
 		__remove_inode_hash(inode);
 }