[v3,3/3] vfs: Use per-cpu list for superblock's inode list

On Tue 23-02-16 14:04:32, Waiman Long wrote:
> When many threads are trying to add or delete inode to or from
> a superblock's s_inodes list, spinlock contention on the list can
> become a performance bottleneck.
> 
> This patch changes the s_inodes field to become a per-cpu list with
> per-cpu spinlocks. As a result, the following superblock inode list
> (sb->s_inodes) iteration functions in vfs are also being modified:
> 
>  1. iterate_bdevs()
>  2. drop_pagecache_sb()
>  3. wait_sb_inodes()
>  4. evict_inodes()
>  5. invalidate_inodes()
>  6. fsnotify_unmount_inodes()
>  7. add_dquot_ref()
>  8. remove_dquot_ref()
> 
> With an exit microbenchmark that creates a large number of threads,
> attachs many inodes to them and then exits. The runtimes of that
> microbenchmark with 1000 threads before and after the patch on a
> 4-socket Intel E7-4820 v3 system (40 cores, 80 threads) were as
> follows:
> 
>   Kernel            Elapsed Time    System Time
>   ------            ------------    -----------
>   Vanilla 4.5-rc4      65.29s         82m14s
>   Patched 4.5-rc4      22.81s         23m03s
> 
> Before the patch, spinlock contention at the inode_sb_list_add()
> function at the startup phase and the inode_sb_list_del() function at
> the exit phase were about 79% and 93% of total CPU time respectively
> (as measured by perf). After the patch, the percpu_list_add()
> function consumed only about 0.04% of CPU time at startup phase. The
> percpu_list_del() function consumed about 0.4% of CPU time at exit
> phase. There were still some spinlock contention, but they happened
> elsewhere.

While looking through this patch, I have noticed that the
list_for_each_entry_safe() iterations in evict_inodes() and
invalidate_inodes() are actually unnecessary. So if you first apply the
attached patch, you don't have to implement safe iteration variants at all.

As a second comment, I'd note that this patch grows struct inode by 1
pointer. It is probably acceptable for large machines given the speedup but
it should be noted in the changelog. Furthermore for UP or even small SMP
systems this is IMHO undesired bloat since the speedup won't be noticeable.

So for these small systems it would be good if per-cpu list magic would just
fall back to single linked list with a spinlock. Do you think that is
reasonably doable?

								Honza

> 
> Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
> ---
>  fs/block_dev.c         |   13 +++++++------
>  fs/drop_caches.c       |   10 +++++-----
>  fs/fs-writeback.c      |   13 +++++++------
>  fs/inode.c             |   40 +++++++++++++++++-----------------------
>  fs/notify/inode_mark.c |   10 +++++-----
>  fs/quota/dquot.c       |   16 ++++++++--------
>  fs/super.c             |    7 ++++---
>  include/linux/fs.h     |    8 ++++----
>  8 files changed, 57 insertions(+), 60 deletions(-)
> 
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 39b3a17..759d9b6 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1865,11 +1865,13 @@ EXPORT_SYMBOL(__invalidate_device);
>  void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
>  {
>  	struct inode *inode, *old_inode = NULL;
> +	DEFINE_PCPU_LIST_STATE(state);
>  
> -	spin_lock(&blockdev_superblock->s_inode_list_lock);
> -	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
> -		struct address_space *mapping = inode->i_mapping;
> +	while (pcpu_list_iterate(blockdev_superblock->s_inodes, &state)) {
> +		struct address_space *mapping;
>  
> +		inode   = list_entry(state.curr, struct inode, i_sb_list);
> +		mapping = inode->i_mapping;
>  		spin_lock(&inode->i_lock);
>  		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
>  		    mapping->nrpages == 0) {
> @@ -1878,7 +1880,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
>  		}
>  		__iget(inode);
>  		spin_unlock(&inode->i_lock);
> -		spin_unlock(&blockdev_superblock->s_inode_list_lock);
> +		spin_unlock(state.lock);
>  		/*
>  		 * We hold a reference to 'inode' so it couldn't have been
>  		 * removed from s_inodes list while we dropped the
> @@ -1892,8 +1894,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
>  
>  		func(I_BDEV(inode), arg);
>  
> -		spin_lock(&blockdev_superblock->s_inode_list_lock);
> +		spin_lock(state.lock);
>  	}
> -	spin_unlock(&blockdev_superblock->s_inode_list_lock);
>  	iput(old_inode);
>  }
> diff --git a/fs/drop_caches.c b/fs/drop_caches.c
> index d72d52b..ec272ed 100644
> --- a/fs/drop_caches.c
> +++ b/fs/drop_caches.c
> @@ -16,9 +16,10 @@ int sysctl_drop_caches;
>  static void drop_pagecache_sb(struct super_block *sb, void *unused)
>  {
>  	struct inode *inode, *toput_inode = NULL;
> +	DEFINE_PCPU_LIST_STATE(state);
>  
> -	spin_lock(&sb->s_inode_list_lock);
> -	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
> +	while (pcpu_list_iterate(sb->s_inodes, &state)) {
> +		inode = list_entry(state.curr, struct inode, i_sb_list);
>  		spin_lock(&inode->i_lock);
>  		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
>  		    (inode->i_mapping->nrpages == 0)) {
> @@ -27,15 +28,14 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
>  		}
>  		__iget(inode);
>  		spin_unlock(&inode->i_lock);
> -		spin_unlock(&sb->s_inode_list_lock);
> +		spin_unlock(state.lock);
>  
>  		invalidate_mapping_pages(inode->i_mapping, 0, -1);
>  		iput(toput_inode);
>  		toput_inode = inode;
>  
> -		spin_lock(&sb->s_inode_list_lock);
> +		spin_lock(state.lock);
>  	}
> -	spin_unlock(&sb->s_inode_list_lock);
>  	iput(toput_inode);
>  }
>  
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 6915c95..05b3f85 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -2107,6 +2107,7 @@ EXPORT_SYMBOL(__mark_inode_dirty);
>  static void wait_sb_inodes(struct super_block *sb)
>  {
>  	struct inode *inode, *old_inode = NULL;
> +	DEFINE_PCPU_LIST_STATE(state);
>  
>  	/*
>  	 * We need to be protected against the filesystem going from
> @@ -2115,7 +2116,6 @@ static void wait_sb_inodes(struct super_block *sb)
>  	WARN_ON(!rwsem_is_locked(&sb->s_umount));
>  
>  	mutex_lock(&sb->s_sync_lock);
> -	spin_lock(&sb->s_inode_list_lock);
>  
>  	/*
>  	 * Data integrity sync. Must wait for all pages under writeback,
> @@ -2124,9 +2124,11 @@ static void wait_sb_inodes(struct super_block *sb)
>  	 * In which case, the inode may not be on the dirty list, but
>  	 * we still have to wait for that writeout.
>  	 */
> -	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
> -		struct address_space *mapping = inode->i_mapping;
> +	while (pcpu_list_iterate(sb->s_inodes, &state)) {
> +		struct address_space *mapping;
>  
> +		inode   = list_entry(state.curr, struct inode, i_sb_list);
> +		mapping = inode->i_mapping;
>  		spin_lock(&inode->i_lock);
>  		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
>  		    (mapping->nrpages == 0)) {
> @@ -2135,7 +2137,7 @@ static void wait_sb_inodes(struct super_block *sb)
>  		}
>  		__iget(inode);
>  		spin_unlock(&inode->i_lock);
> -		spin_unlock(&sb->s_inode_list_lock);
> +		spin_unlock(state.lock);
>  
>  		/*
>  		 * We hold a reference to 'inode' so it couldn't have been
> @@ -2157,9 +2159,8 @@ static void wait_sb_inodes(struct super_block *sb)
>  
>  		cond_resched();
>  
> -		spin_lock(&sb->s_inode_list_lock);
> +		spin_lock(state.lock);
>  	}
> -	spin_unlock(&sb->s_inode_list_lock);
>  	iput(old_inode);
>  	mutex_unlock(&sb->s_sync_lock);
>  }
> diff --git a/fs/inode.c b/fs/inode.c
> index 9f62db3..e6e41ef 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -28,7 +28,7 @@
>   *   inode->i_state, inode->i_hash, __iget()
>   * Inode LRU list locks protect:
>   *   inode->i_sb->s_inode_lru, inode->i_lru
> - * inode->i_sb->s_inode_list_lock protects:
> + * inode->i_sb->s_inodes->lock protects:
>   *   inode->i_sb->s_inodes, inode->i_sb_list
>   * bdi->wb.list_lock protects:
>   *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
> @@ -37,7 +37,7 @@
>   *
>   * Lock ordering:
>   *
> - * inode->i_sb->s_inode_list_lock
> + * inode->i_sb->s_inodes->lock
>   *   inode->i_lock
>   *     Inode LRU list locks
>   *
> @@ -45,7 +45,7 @@
>   *   inode->i_lock
>   *
>   * inode_hash_lock
> - *   inode->i_sb->s_inode_list_lock
> + *   inode->i_sb->s_inodes->lock
>   *   inode->i_lock
>   *
>   * iunique_lock
> @@ -424,19 +424,14 @@ static void inode_lru_list_del(struct inode *inode)
>   */
>  void inode_sb_list_add(struct inode *inode)
>  {
> -	spin_lock(&inode->i_sb->s_inode_list_lock);
> -	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
> -	spin_unlock(&inode->i_sb->s_inode_list_lock);
> +	pcpu_list_add(&inode->i_sb_list, inode->i_sb->s_inodes);
>  }
>  EXPORT_SYMBOL_GPL(inode_sb_list_add);
>  
>  static inline void inode_sb_list_del(struct inode *inode)
>  {
> -	if (!list_empty(&inode->i_sb_list)) {
> -		spin_lock(&inode->i_sb->s_inode_list_lock);
> -		list_del_init(&inode->i_sb_list);
> -		spin_unlock(&inode->i_sb->s_inode_list_lock);
> -	}
> +	if (!list_empty(&inode->i_sb_list.list))
> +		pcpu_list_del(&inode->i_sb_list);
>  }
>  
>  static unsigned long hash(struct super_block *sb, unsigned long hashval)
> @@ -590,12 +585,14 @@ static void dispose_list(struct list_head *head)
>   */
>  void evict_inodes(struct super_block *sb)
>  {
> -	struct inode *inode, *next;
> +	struct inode *inode;
> +	struct pcpu_list_state state;
>  	LIST_HEAD(dispose);
>  
>  again:
> -	spin_lock(&sb->s_inode_list_lock);
> -	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
> +	init_pcpu_list_state(&state);
> +	while (pcpu_list_iterate_safe(sb->s_inodes, &state)) {
> +		inode = list_entry(state.curr, struct inode, i_sb_list);
>  		if (atomic_read(&inode->i_count))
>  			continue;
>  
> @@ -616,13 +613,12 @@ again:
>  		 * bit so we don't livelock.
>  		 */
>  		if (need_resched()) {
> -			spin_unlock(&sb->s_inode_list_lock);
> +			spin_unlock(state.lock);
>  			cond_resched();
>  			dispose_list(&dispose);
>  			goto again;
>  		}
>  	}
> -	spin_unlock(&sb->s_inode_list_lock);
>  
>  	dispose_list(&dispose);
>  }
> @@ -640,11 +636,12 @@ again:
>  int invalidate_inodes(struct super_block *sb, bool kill_dirty)
>  {
>  	int busy = 0;
> -	struct inode *inode, *next;
> +	struct inode *inode;
>  	LIST_HEAD(dispose);
> +	DEFINE_PCPU_LIST_STATE(state);
>  
> -	spin_lock(&sb->s_inode_list_lock);
> -	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
> +	while (pcpu_list_iterate_safe(sb->s_inodes, &state)) {
> +		inode = list_entry(state.curr, struct inode, i_sb_list);
>  		spin_lock(&inode->i_lock);
>  		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
>  			spin_unlock(&inode->i_lock);
> @@ -666,7 +663,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
>  		spin_unlock(&inode->i_lock);
>  		list_add(&inode->i_lru, &dispose);
>  	}
> -	spin_unlock(&sb->s_inode_list_lock);
>  
>  	dispose_list(&dispose);
>  
> @@ -881,7 +877,7 @@ struct inode *new_inode_pseudo(struct super_block *sb)
>  		spin_lock(&inode->i_lock);
>  		inode->i_state = 0;
>  		spin_unlock(&inode->i_lock);
> -		INIT_LIST_HEAD(&inode->i_sb_list);
> +		init_pcpu_list_node(&inode->i_sb_list);
>  	}
>  	return inode;
>  }
> @@ -902,8 +898,6 @@ struct inode *new_inode(struct super_block *sb)
>  {
>  	struct inode *inode;
>  
> -	spin_lock_prefetch(&sb->s_inode_list_lock);
> -
>  	inode = new_inode_pseudo(sb);
>  	if (inode)
>  		inode_sb_list_add(inode);
> diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
> index a364524..12515a4 100644
> --- a/fs/notify/inode_mark.c
> +++ b/fs/notify/inode_mark.c
> @@ -151,14 +151,15 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
>  void fsnotify_unmount_inodes(struct super_block *sb)
>  {
>  	struct inode *inode, *iput_inode = NULL;
> +	DEFINE_PCPU_LIST_STATE(state);
>  
> -	spin_lock(&sb->s_inode_list_lock);
> -	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
> +	while (pcpu_list_iterate(sb->s_inodes, &state)) {
>  		/*
>  		 * We cannot __iget() an inode in state I_FREEING,
>  		 * I_WILL_FREE, or I_NEW which is fine because by that point
>  		 * the inode cannot have any associated watches.
>  		 */
> +		inode = list_entry(state.curr, struct inode, i_sb_list);
>  		spin_lock(&inode->i_lock);
>  		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
>  			spin_unlock(&inode->i_lock);
> @@ -178,7 +179,7 @@ void fsnotify_unmount_inodes(struct super_block *sb)
>  
>  		__iget(inode);
>  		spin_unlock(&inode->i_lock);
> -		spin_unlock(&sb->s_inode_list_lock);
> +		spin_unlock(state.lock);
>  
>  		if (iput_inode)
>  			iput(iput_inode);
> @@ -190,9 +191,8 @@ void fsnotify_unmount_inodes(struct super_block *sb)
>  
>  		iput_inode = inode;
>  
> -		spin_lock(&sb->s_inode_list_lock);
> +		spin_lock(state.lock);
>  	}
> -	spin_unlock(&sb->s_inode_list_lock);
>  
>  	if (iput_inode)
>  		iput(iput_inode);
> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> index 3c3b81b..2564271 100644
> --- a/fs/quota/dquot.c
> +++ b/fs/quota/dquot.c
> @@ -924,12 +924,13 @@ static int dqinit_needed(struct inode *inode, int type)
>  static void add_dquot_ref(struct super_block *sb, int type)
>  {
>  	struct inode *inode, *old_inode = NULL;
> +	DEFINE_PCPU_LIST_STATE(state);
>  #ifdef CONFIG_QUOTA_DEBUG
>  	int reserved = 0;
>  #endif
>  
> -	spin_lock(&sb->s_inode_list_lock);
> -	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
> +	while (pcpu_list_iterate(sb->s_inodes, &state)) {
> +		inode = list_entry(state.curr, struct inode, i_sb_list);
>  		spin_lock(&inode->i_lock);
>  		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
>  		    !atomic_read(&inode->i_writecount) ||
> @@ -939,7 +940,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
>  		}
>  		__iget(inode);
>  		spin_unlock(&inode->i_lock);
> -		spin_unlock(&sb->s_inode_list_lock);
> +		spin_unlock(state.lock);
>  
>  #ifdef CONFIG_QUOTA_DEBUG
>  		if (unlikely(inode_get_rsv_space(inode) > 0))
> @@ -957,9 +958,8 @@ static void add_dquot_ref(struct super_block *sb, int type)
>  		 * later.
>  		 */
>  		old_inode = inode;
> -		spin_lock(&sb->s_inode_list_lock);
> +		spin_lock(state.lock);
>  	}
> -	spin_unlock(&sb->s_inode_list_lock);
>  	iput(old_inode);
>  
>  #ifdef CONFIG_QUOTA_DEBUG
> @@ -1027,15 +1027,16 @@ static void remove_dquot_ref(struct super_block *sb, int type,
>  {
>  	struct inode *inode;
>  	int reserved = 0;
> +	DEFINE_PCPU_LIST_STATE(state);
>  
> -	spin_lock(&sb->s_inode_list_lock);
> -	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
> +	while (pcpu_list_iterate(sb->s_inodes, &state)) {
>  		/*
>  		 *  We have to scan also I_NEW inodes because they can already
>  		 *  have quota pointer initialized. Luckily, we need to touch
>  		 *  only quota pointers and these have separate locking
>  		 *  (dq_data_lock).
>  		 */
> +		inode = list_entry(state.curr, struct inode, i_sb_list);
>  		spin_lock(&dq_data_lock);
>  		if (!IS_NOQUOTA(inode)) {
>  			if (unlikely(inode_get_rsv_space(inode) > 0))
> @@ -1044,7 +1045,6 @@ static void remove_dquot_ref(struct super_block *sb, int type,
>  		}
>  		spin_unlock(&dq_data_lock);
>  	}
> -	spin_unlock(&sb->s_inode_list_lock);
>  #ifdef CONFIG_QUOTA_DEBUG
>  	if (reserved) {
>  		printk(KERN_WARNING "VFS (%s): Writes happened after quota"
> diff --git a/fs/super.c b/fs/super.c
> index 1182af8..7d44fad 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -163,6 +163,7 @@ static void destroy_super(struct super_block *s)
>  {
>  	list_lru_destroy(&s->s_dentry_lru);
>  	list_lru_destroy(&s->s_inode_lru);
> +	free_pcpu_list_head(&s->s_inodes);
>  	security_sb_free(s);
>  	WARN_ON(!list_empty(&s->s_mounts));
>  	kfree(s->s_subtype);
> @@ -204,9 +205,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
>  	INIT_HLIST_NODE(&s->s_instances);
>  	INIT_HLIST_BL_HEAD(&s->s_anon);
>  	mutex_init(&s->s_sync_lock);
> -	INIT_LIST_HEAD(&s->s_inodes);
> -	spin_lock_init(&s->s_inode_list_lock);
>  
> +	if (init_pcpu_list_head(&s->s_inodes))
> +		goto fail;
>  	if (list_lru_init_memcg(&s->s_dentry_lru))
>  		goto fail;
>  	if (list_lru_init_memcg(&s->s_inode_lru))
> @@ -426,7 +427,7 @@ void generic_shutdown_super(struct super_block *sb)
>  		if (sop->put_super)
>  			sop->put_super(sb);
>  
> -		if (!list_empty(&sb->s_inodes)) {
> +		if (!pcpu_list_empty(sb->s_inodes)) {
>  			printk("VFS: Busy inodes after unmount of %s. "
>  			   "Self-destruct in 5 seconds.  Have a nice day...\n",
>  			   sb->s_id);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index ae68100..b533bda 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -28,6 +28,7 @@
>  #include <linux/uidgid.h>
>  #include <linux/lockdep.h>
>  #include <linux/percpu-rwsem.h>
> +#include <linux/percpu-list.h>
>  #include <linux/blk_types.h>
>  #include <linux/workqueue.h>
>  #include <linux/percpu-rwsem.h>
> @@ -648,7 +649,7 @@ struct inode {
>  	u16			i_wb_frn_history;
>  #endif
>  	struct list_head	i_lru;		/* inode LRU list */
> -	struct list_head	i_sb_list;
> +	struct pcpu_list_node	i_sb_list;
>  	union {
>  		struct hlist_head	i_dentry;
>  		struct rcu_head		i_rcu;
> @@ -1397,9 +1398,8 @@ struct super_block {
>  	 */
>  	int s_stack_depth;
>  
> -	/* s_inode_list_lock protects s_inodes */
> -	spinlock_t		s_inode_list_lock ____cacheline_aligned_in_smp;
> -	struct list_head	s_inodes;	/* all inodes */
> +	/* The percpu locks protect s_inodes */
> +	struct pcpu_list_head __percpu *s_inodes;	/* all inodes */
>  };
>  
>  extern struct timespec current_fs_time(struct super_block *sb);
> -- 
> 1.7.1
> 
>

[v3,3/3] vfs: Use per-cpu list for superblock's inode list

Commit Message

Comments

Patch