
[v2,3/3] vfs: Use per-cpu list for superblock's inode list

Message ID 1455916245-32707-4-git-send-email-Waiman.Long@hpe.com (mailing list archive)
State New, archived

Commit Message

Waiman Long Feb. 19, 2016, 9:10 p.m. UTC
When many threads are adding inodes to or deleting inodes from
a superblock's s_inodes list, spinlock contention on the list can
become a performance bottleneck.

This patch changes the s_inodes field to become a per-cpu list with
per-cpu spinlocks.
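
In caller terms, the change boils down to replacing the single
superblock-wide lock/list pair with a per-cpu list node operation, as
in this minimal sketch of inode_sb_list_add() (the pcpu_list_* helpers
come from an earlier patch in this series; their internals are assumed
here, with each per-cpu sublist protected by its own spinlock):

	/* Before: one global list under one superblock-wide lock */
	spin_lock(&inode->i_sb->s_inode_list_lock);
	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
	spin_unlock(&inode->i_sb->s_inode_list_lock);

	/* After: the inode is added to the current CPU's sublist,
	 * taking only that sublist's spinlock */
	pcpu_list_add(&inode->i_sb_list, inode->i_sb->s_inodes);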

An exit microbenchmark creates a large number of threads, attaches
many inodes to them, and then exits. The runtimes of that
microbenchmark with 1000 threads, before and after the patch, on a
4-socket Intel E7-4820 v3 system (40 cores, 80 threads) were as
follows:

  Kernel            Elapsed Time    System Time
  ------            ------------    -----------
  Vanilla 4.5-rc4      65.29s         82m14s
  Patched 4.5-rc4      22.81s         23m03s

Before the patch, spinlock contention in the inode_sb_list_add()
function during the startup phase and in the inode_sb_list_del()
function during the exit phase accounted for about 79% and 93% of
total CPU time respectively (as measured by perf). After the patch,
the percpu_list_add() function consumed only about 0.04% of CPU time
during the startup phase, and the percpu_list_del() function consumed
about 0.4% of CPU time during the exit phase. There was still some
spinlock contention, but it happened elsewhere.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
---
 fs/block_dev.c         |   40 +++++++++++--------
 fs/drop_caches.c       |   31 ++++++++-------
 fs/fs-writeback.c      |   30 ++++++++------
 fs/inode.c             |   99 ++++++++++++++++++++++++-----------------------
 fs/notify/inode_mark.c |   43 ++++++++++-----------
 fs/quota/dquot.c       |   83 ++++++++++++++++++++++------------------
 fs/super.c             |    7 ++-
 include/linux/fs.h     |   37 ++++++++++++++++--
 8 files changed, 211 insertions(+), 159 deletions(-)

Comments

Dave Chinner Feb. 21, 2016, 9:34 p.m. UTC | #1
On Fri, Feb 19, 2016 at 04:10:45PM -0500, Waiman Long wrote:
> +/*
> + * Superblock's inode list iterator function and arguments macros
> + */
> +#define SB_INODES_ITER_FUNC(name, lock, struct_fields)			\
> +	struct name ## _arg {						\
> +		struct_fields;						\
> +	};								\
> +	static int name ## _iter(struct pcpu_list_node *_node,		\
> +				 struct pcpu_list_node **_pnext,	\
> +				 spinlock_t *lock, void *_arg)
> +
> +#define SB_INODES_ITER_ARGS(name, i, a)					\
> +	struct inode *i = container_of(_node, struct inode, i_sb_list);	\
> +	struct name ## _arg *a = (struct name ## _arg *)_arg
> +
> +#define SB_INODES_ITER_ARGS_SAFE(name, i, n, a)				\
> +	struct inode *i = container_of(_node, struct inode, i_sb_list);	\
> +	struct inode *n = container_of(*_pnext, struct inode, i_sb_list);\
> +	struct name ## _arg *a = (struct name ## _arg *)_arg
> +
> +#define SB_INODES_ITER_SET_PCPU_LIST_NEXT(n)				\
> +	{ *_pnext = &(n)->i_sb_list; }
> +
> +#define SB_INODES_ITER_CALL(name, sb)					\
> +	pcpu_list_iterate(sb->s_inodes, false, NULL, name ## _iter, &arg)
> +
> +#define SB_INODES_ITER_CALL_SAFE(name, sb, phead)			\
> +	pcpu_list_iterate(sb->s_inodes, true, phead, name ## _iter, &arg)
> +

No, just no.

Ungreppable, breaks cscope, obfuscates everything, shouts a lot,
code using the API looks completely broken (e.g. semi-colons in
"function declarations"), and it reminds me of the worst of the
worst unmaintainable code in an exceedingly buggy and undebuggable
proprietary filesystem I've previously had the "joy" of working
with.

Just fix the bug in the previous version; it's so much cleaner than
this .... mess.

Cheers,

Dave.
Waiman Long Feb. 23, 2016, 6:56 p.m. UTC | #2
On 02/21/2016 04:34 PM, Dave Chinner wrote:
> On Fri, Feb 19, 2016 at 04:10:45PM -0500, Waiman Long wrote:
>> +/*
>> + * Superblock's inode list iterator function and arguments macros
>> + */
>> +#define SB_INODES_ITER_FUNC(name, lock, struct_fields)			\
>> +	struct name ## _arg {						\
>> +		struct_fields;						\
>> +	};								\
>> +	static int name ## _iter(struct pcpu_list_node *_node,		\
>> +				 struct pcpu_list_node **_pnext,	\
>> +				 spinlock_t *lock, void *_arg)
>> +
>> +#define SB_INODES_ITER_ARGS(name, i, a)					\
>> +	struct inode *i = container_of(_node, struct inode, i_sb_list);	\
>> +	struct name ## _arg *a = (struct name ## _arg *)_arg
>> +
>> +#define SB_INODES_ITER_ARGS_SAFE(name, i, n, a)				\
>> +	struct inode *i = container_of(_node, struct inode, i_sb_list);	\
>> +	struct inode *n = container_of(*_pnext, struct inode, i_sb_list);\
>> +	struct name ## _arg *a = (struct name ## _arg *)_arg
>> +
>> +#define SB_INODES_ITER_SET_PCPU_LIST_NEXT(n)				\
>> +	{ *_pnext = &(n)->i_sb_list; }
>> +
>> +#define SB_INODES_ITER_CALL(name, sb)					\
>> +	pcpu_list_iterate(sb->s_inodes, false, NULL, name ## _iter, &arg)
>> +
>> +#define SB_INODES_ITER_CALL_SAFE(name, sb, phead)			\
>> +	pcpu_list_iterate(sb->s_inodes, true, phead, name ## _iter, &arg)
>> +
> No, just no.
>
> Ungreppable, breaks cscope, obfuscates everything, shouts a lot,
> code using the API looks completely broken (e.g. semi-colons in
> "function declarations"), and it reminds me of the worst of the
> worst unmaintainable code in an exceedingly buggy and undebuggable
> proprietary filesystem I've previously had the "joy" of working
> with.
>
> Just fix the bug in the previous version; it's so much cleaner than
> this .... mess.
>
> Cheers,
>
> Dave.

Sorry for that. I will scrap the current approach and use another way to 
iterate the list instead. I will send out an updated patch soon.

Cheers,
Longman

Patch

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6eaeedf..5992a1f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1862,21 +1862,27 @@  int __invalidate_device(struct block_device *bdev, bool kill_dirty)
 }
 EXPORT_SYMBOL(__invalidate_device);
 
-static inline void
-__iterate_bdev(spinlock_t *lock, struct inode *inode, struct inode **old_inode,
-	       void (*func)(struct block_device *, void *), void *arg)
+/*
+ * iterate_bdev_iter  - iteration function for each inode of a block
+ *			device superblock
+ */
+SB_INODES_ITER_FUNC(iterate_bdev, pcpu_lock,
+		    struct inode *old_inode;
+		    void (*func)(struct block_device *, void *);
+		    void *arg)
 {
+	SB_INODES_ITER_ARGS(iterate_bdev, inode, arg);
 	struct address_space *mapping = inode->i_mapping;
 
 	spin_lock(&inode->i_lock);
 	if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
 	    mapping->nrpages == 0) {
 		spin_unlock(&inode->i_lock);
-		return;
+		return 0;
 	}
 	__iget(inode);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(lock);
+	spin_unlock(pcpu_lock);
 	/*
 	 * We hold a reference to 'inode' so it couldn't have been
 	 * removed from s_inodes list while we dropped the
@@ -1884,23 +1890,23 @@  __iterate_bdev(spinlock_t *lock, struct inode *inode, struct inode **old_inode,
 	 * be holding the last reference and we cannot iput it under
 	 * pcpu_lock. So we keep the reference and iput it later.
 	 */
-	iput(*old_inode);
-	*old_inode = inode;
+	iput(arg->old_inode);
+	arg->old_inode = inode;
 
-	func(I_BDEV(inode), arg);
+	arg->func(I_BDEV(inode), arg->arg);
 
-	spin_lock(lock);
+	spin_lock(pcpu_lock);
+	return 0;
 }
 
-void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
+void iterate_bdevs(void (*func)(struct block_device *, void *), void *f_arg)
 {
-	struct inode *inode, *old_inode = NULL;
+	struct iterate_bdev_arg arg;
 
-	spin_lock(&blockdev_superblock->s_inode_list_lock);
-	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
-		__iterate_bdev(&blockdev_superblock->s_inode_list_lock,
-			       inode, &old_inode, func, arg);
+	arg.arg = f_arg;
+	arg.func = func;
+	arg.old_inode = NULL;
 
-	spin_unlock(&blockdev_superblock->s_inode_list_lock);
-	iput(old_inode);
+	SB_INODES_ITER_CALL(iterate_bdev, blockdev_superblock);
+	iput(arg.old_inode);
 }
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index d3449d5..63b1842 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -13,37 +13,40 @@ 
 /* A global variable is a bit ugly, but it keeps the code simple */
 int sysctl_drop_caches;
 
-static inline void __drop_pagecache_sb(spinlock_t *lock, struct inode *inode,
-				       struct inode **toput_inode)
+/*
+ * drop_pagecache_iter - iteration function for each inode of a superblock
+ */
+SB_INODES_ITER_FUNC(drop_pagecache, pcpu_lock,
+		    struct inode *toput_inode)
 {
+	SB_INODES_ITER_ARGS(drop_pagecache, inode, arg);
+
 	spin_lock(&inode->i_lock);
 	if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
 	    (inode->i_mapping->nrpages == 0)) {
 		spin_unlock(&inode->i_lock);
-		return;
+		return 0;
 	}
 	__iget(inode);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(lock);
+	spin_unlock(pcpu_lock);
 
 	invalidate_mapping_pages(inode->i_mapping, 0, -1);
-	iput(*toput_inode);
-	*toput_inode = inode;
+	iput(arg->toput_inode);
+	arg->toput_inode = inode;
 
-	spin_lock(lock);
+	spin_lock(pcpu_lock);
+	return 0;
 }
 
 static void drop_pagecache_sb(struct super_block *sb, void *unused)
 {
-	struct inode *inode, *toput_inode = NULL;
+	struct drop_pagecache_arg arg;
 
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list)
-		__drop_pagecache_sb(&sb->s_inode_list_lock, inode,
-				    &toput_inode);
+	arg.toput_inode = NULL;
 
-	spin_unlock(&sb->s_inode_list_lock);
-	iput(toput_inode);
+	SB_INODES_ITER_CALL(drop_pagecache, sb);
+	iput(arg.toput_inode);
 }
 
 int drop_caches_sysctl_handler(struct ctl_table *table, int write,
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5ad6eda..080bae5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2095,20 +2095,24 @@  out_unlock_inode:
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
-static inline void __wait_sb_inode(spinlock_t *lock, struct inode *inode,
-				   struct inode **old_inode)
+/*
+ * wait_sb_inode_iter - iteration function for each inode of a superblock
+ */
+SB_INODES_ITER_FUNC(wait_sb_inode, pcpu_lock,
+		    struct inode *old_inode)
 {
+	SB_INODES_ITER_ARGS(wait_sb_inode, inode, arg);
 	struct address_space *mapping = inode->i_mapping;
 
 	spin_lock(&inode->i_lock);
 	if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
 	    (mapping->nrpages == 0)) {
 		spin_unlock(&inode->i_lock);
-		return;
+		return 0;
 	}
 	__iget(inode);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(lock);
+	spin_unlock(pcpu_lock);
 
 	/*
 	 * We hold a reference to 'inode' so it couldn't have been
@@ -2117,8 +2121,8 @@  static inline void __wait_sb_inode(spinlock_t *lock, struct inode *inode,
 	 * be holding the last reference and we cannot iput it under
 	 * pcpu_lock. So we keep the reference and iput it later.
 	 */
-	iput(*old_inode);
-	*old_inode = inode;
+	iput(arg->old_inode);
+	arg->old_inode = inode;
 
 	/*
 	 * We keep the error status of individual mapping so that
@@ -2129,7 +2133,8 @@  static inline void __wait_sb_inode(spinlock_t *lock, struct inode *inode,
 
 	cond_resched();
 
-	spin_lock(lock);
+	spin_lock(pcpu_lock);
+	return 0;
 }
 
 /*
@@ -2143,7 +2148,9 @@  static inline void __wait_sb_inode(spinlock_t *lock, struct inode *inode,
  */
 static void wait_sb_inodes(struct super_block *sb)
 {
-	struct inode *inode, *old_inode = NULL;
+	struct wait_sb_inode_arg arg;
+
+	arg.old_inode = NULL;
 
 	/*
 	 * We need to be protected against the filesystem going from
@@ -2152,7 +2159,6 @@  static void wait_sb_inodes(struct super_block *sb)
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
 	mutex_lock(&sb->s_sync_lock);
-	spin_lock(&sb->s_inode_list_lock);
 
 	/*
 	 * Data integrity sync. Must wait for all pages under writeback,
@@ -2161,11 +2167,9 @@  static void wait_sb_inodes(struct super_block *sb)
 	 * In which case, the inode may not be on the dirty list, but
 	 * we still have to wait for that writeout.
 	 */
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list)
-		__wait_sb_inode(&sb->s_inode_list_lock, inode, &old_inode);
+	SB_INODES_ITER_CALL(wait_sb_inode, sb);
 
-	spin_unlock(&sb->s_inode_list_lock);
-	iput(old_inode);
+	iput(arg.old_inode);
 	mutex_unlock(&sb->s_sync_lock);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index 6dd609e..0f2fba4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -28,7 +28,7 @@ 
  *   inode->i_state, inode->i_hash, __iget()
  * Inode LRU list locks protect:
  *   inode->i_sb->s_inode_lru, inode->i_lru
- * inode->i_sb->s_inode_list_lock protects:
+ * inode->i_sb->s_inodes->lock protects:
  *   inode->i_sb->s_inodes, inode->i_sb_list
  * bdi->wb.list_lock protects:
  *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
@@ -37,7 +37,7 @@ 
  *
  * Lock ordering:
  *
- * inode->i_sb->s_inode_list_lock
+ * inode->i_sb->s_inodes->lock
  *   inode->i_lock
  *     Inode LRU list locks
  *
@@ -45,7 +45,7 @@ 
  *   inode->i_lock
  *
  * inode_hash_lock
- *   inode->i_sb->s_inode_list_lock
+ *   inode->i_sb->s_inodes->lock
  *   inode->i_lock
  *
  * iunique_lock
@@ -424,19 +424,14 @@  static void inode_lru_list_del(struct inode *inode)
  */
 void inode_sb_list_add(struct inode *inode)
 {
-	spin_lock(&inode->i_sb->s_inode_list_lock);
-	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
-	spin_unlock(&inode->i_sb->s_inode_list_lock);
+	pcpu_list_add(&inode->i_sb_list, inode->i_sb->s_inodes);
 }
 EXPORT_SYMBOL_GPL(inode_sb_list_add);
 
 static inline void inode_sb_list_del(struct inode *inode)
 {
-	if (!list_empty(&inode->i_sb_list)) {
-		spin_lock(&inode->i_sb->s_inode_list_lock);
-		list_del_init(&inode->i_sb_list);
-		spin_unlock(&inode->i_sb->s_inode_list_lock);
-	}
+	if (!list_empty(&inode->i_sb_list.list))
+		pcpu_list_del(&inode->i_sb_list);
 }
 
 static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -579,9 +574,15 @@  static void dispose_list(struct list_head *head)
 	}
 }
 
-static inline int __evict_inode(spinlock_t *lock, struct inode *inode,
-				struct list_head *dispose)
+/*
+ * evict_inode_iter - iteration function for each inode of a superblock
+ */
+SB_INODES_ITER_FUNC(evict_inode, pcpu_lock,
+		    struct list_head dispose;
+		    bool iter_again)
 {
+	SB_INODES_ITER_ARGS(evict_inode, inode, arg);
+
 	if (atomic_read(&inode->i_count))
 		return 0;
 
@@ -594,7 +595,7 @@  static inline int __evict_inode(spinlock_t *lock, struct inode *inode,
 	inode->i_state |= I_FREEING;
 	inode_lru_list_del(inode);
 	spin_unlock(&inode->i_lock);
-	list_add(&inode->i_lru, dispose);
+	list_add(&inode->i_lru, &arg->dispose);
 
 	/*
 	 * We can have a ton of inodes to evict at unmount time given
@@ -602,9 +603,10 @@  static inline int __evict_inode(spinlock_t *lock, struct inode *inode,
 	 * bit so we don't livelock.
 	 */
 	if (need_resched()) {
-		spin_unlock(lock);
+		spin_unlock(pcpu_lock);
 		cond_resched();
-		dispose_list(dispose);
+		dispose_list(&arg->dispose);
+		arg->iter_again = true;
 		return 1;	/* Redo it again */
 	}
 	return 0;
@@ -621,47 +623,53 @@  static inline int __evict_inode(spinlock_t *lock, struct inode *inode,
  */
 void evict_inodes(struct super_block *sb)
 {
-	struct inode *inode, *next;
-	LIST_HEAD(dispose);
+	struct evict_inode_arg arg;
+
+	INIT_LIST_HEAD(&arg.dispose);
 
 again:
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-		if (__evict_inode(&sb->s_inode_list_lock, inode, &dispose))
-			goto again;
-	}
-	spin_unlock(&sb->s_inode_list_lock);
+	arg.iter_again = false;
+	SB_INODES_ITER_CALL_SAFE(evict_inode, sb, NULL);
+	if (arg.iter_again)
+		goto again;
 
-	dispose_list(&dispose);
+	dispose_list(&arg.dispose);
 }
 
-static inline void __invalidate_inode(struct inode *inode, bool kill_dirty,
-				      struct list_head *dispose, int *busy)
+/*
+ * invalidate_inode_iter - attempt to free an inode on a superblock
+ */
+SB_INODES_ITER_FUNC(invalidate_inode, pcpu_lock,
+		    struct list_head dispose;
+		    bool busy;
+		    bool kill_dirty)
 {
+	SB_INODES_ITER_ARGS(invalidate_inode, inode, arg);
+
 	spin_lock(&inode->i_lock);
 	if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
 		spin_unlock(&inode->i_lock);
-		return;
+		return 0;
 	}
 
-	if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
+	if (inode->i_state & I_DIRTY_ALL && !arg->kill_dirty) {
 		spin_unlock(&inode->i_lock);
-		*busy = 1;
-		return;
+		arg->busy = 1;
+		return 0;
 	}
 
 	if (atomic_read(&inode->i_count)) {
 		spin_unlock(&inode->i_lock);
-		*busy = 1;
-		return;
+		arg->busy = 1;
+		return 0;
 	}
 
 	inode->i_state |= I_FREEING;
 	inode_lru_list_del(inode);
 	spin_unlock(&inode->i_lock);
-	list_add(&inode->i_lru, dispose);
+	list_add(&inode->i_lru, &arg->dispose);
+	return 0;
 }
-
 /**
  * invalidate_inodes	- attempt to free all inodes on a superblock
  * @sb:		superblock to operate on
@@ -674,19 +682,16 @@  static inline void __invalidate_inode(struct inode *inode, bool kill_dirty,
  */
 int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 {
-	int busy = 0;
-	struct inode *inode, *next;
-	LIST_HEAD(dispose);
+	struct invalidate_inode_arg arg;
 
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list)
-		__invalidate_inode(inode, kill_dirty, &dispose, &busy);
+	arg.kill_dirty = kill_dirty;
+	arg.busy = 0;
+	INIT_LIST_HEAD(&arg.dispose);
 
-	spin_unlock(&sb->s_inode_list_lock);
+	SB_INODES_ITER_CALL_SAFE(invalidate_inode, sb, NULL);
+	dispose_list(&arg.dispose);
 
-	dispose_list(&dispose);
-
-	return busy;
+	return arg.busy;
 }
 
 /*
@@ -897,7 +902,7 @@  struct inode *new_inode_pseudo(struct super_block *sb)
 		spin_lock(&inode->i_lock);
 		inode->i_state = 0;
 		spin_unlock(&inode->i_lock);
-		INIT_LIST_HEAD(&inode->i_sb_list);
+		init_pcpu_list_node(&inode->i_sb_list);
 	}
 	return inode;
 }
@@ -918,8 +923,6 @@  struct inode *new_inode(struct super_block *sb)
 {
 	struct inode *inode;
 
-	spin_lock_prefetch(&sb->s_inode_list_lock);
-
 	inode = new_inode_pseudo(sb);
 	if (inode)
 		inode_sb_list_add(inode);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index ec52dcb..1fd1daf 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -141,13 +141,15 @@  int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 	return ret;
 }
 
-static inline void
-__fsnotify_unmount_inode(spinlock_t *lock, struct inode *inode,
-			 struct list_head *head, struct inode **pnext,
-			 struct inode **need_iput)
+/*
+ * unmount_inode_iter - iteration function for each inode of a SB
+ */
+SB_INODES_ITER_FUNC(unmount_inode, pcpu_lock,
+		    struct inode *need_iput;
+		    struct list_head *percpu_head)
 {
+	SB_INODES_ITER_ARGS_SAFE(unmount_inode, inode, next_i, arg);
 	struct inode *need_iput_tmp;
-	struct inode *next_i = *pnext;
 
 	/*
 	 * We cannot __iget() an inode in state I_FREEING,
@@ -157,7 +159,7 @@  __fsnotify_unmount_inode(spinlock_t *lock, struct inode *inode,
 	spin_lock(&inode->i_lock);
 	if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
 		spin_unlock(&inode->i_lock);
-		return;
+		return 0;
 	}
 
 	/*
@@ -168,11 +170,11 @@  __fsnotify_unmount_inode(spinlock_t *lock, struct inode *inode,
 	 */
 	if (!atomic_read(&inode->i_count)) {
 		spin_unlock(&inode->i_lock);
-		return;
+		return 0;
 	}
 
-	need_iput_tmp = *need_iput;
-	*need_iput = NULL;
+	need_iput_tmp = arg->need_iput;
+	arg->need_iput = NULL;
 
 	/* In case fsnotify_inode_delete() drops a reference. */
 	if (inode != need_iput_tmp)
@@ -182,19 +184,19 @@  __fsnotify_unmount_inode(spinlock_t *lock, struct inode *inode,
 	spin_unlock(&inode->i_lock);
 
 	/* In case the dropping of a reference would nuke next_i. */
-	while (&next_i->i_sb_list != head) {
+	while (&next_i->i_sb_list.list != arg->percpu_head) {
 		spin_lock(&next_i->i_lock);
 		if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
 					atomic_read(&next_i->i_count)) {
 			__iget(next_i);
-			*need_iput = next_i;
+			arg->need_iput = next_i;
 			spin_unlock(&next_i->i_lock);
 			break;
 		}
 		spin_unlock(&next_i->i_lock);
-		next_i = list_next_entry(next_i, i_sb_list);
+		next_i = pcpu_list_next_entry(next_i, i_sb_list);
 	}
-	*pnext = next_i;
+	SB_INODES_ITER_SET_PCPU_LIST_NEXT(next_i);
 
 	/*
 	 * We can safely drop pcpu_lock  here because either
@@ -202,7 +204,7 @@  __fsnotify_unmount_inode(spinlock_t *lock, struct inode *inode,
 	 * end of list.  Also no new inodes will be added since the
 	 * umount has begun.
 	 */
-	spin_unlock(lock);
+	spin_unlock(pcpu_lock);
 
 	if (need_iput_tmp)
 		iput(need_iput_tmp);
@@ -214,7 +216,8 @@  __fsnotify_unmount_inode(spinlock_t *lock, struct inode *inode,
 
 	iput(inode);
 
-	spin_lock(lock);
+	spin_lock(pcpu_lock);
+	return 0;
 }
 
 /**
@@ -227,12 +230,8 @@  __fsnotify_unmount_inode(spinlock_t *lock, struct inode *inode,
  */
 void fsnotify_unmount_inodes(struct super_block *sb)
 {
-	struct inode *inode, *next_i, *need_iput = NULL;
-
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list)
-		__fsnotify_unmount_inode(&sb->s_inode_list_lock, inode,
-					 &sb->s_inodes, &next_i, &need_iput);
+	struct unmount_inode_arg arg;
 
-	spin_unlock(&sb->s_inode_list_lock);
+	arg.need_iput = NULL;
+	SB_INODES_ITER_CALL_SAFE(unmount_inode, sb, &arg.percpu_head);
 }
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 143183b..6aa593e 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -920,30 +920,33 @@  static int dqinit_needed(struct inode *inode, int type)
 	return 0;
 }
 
-static inline void
-__add_dquot_ref(spinlock_t *lock, struct inode *inode, int type,
-#ifdef CONFIG_QUOTA_DEBUG
-		int *reserved,
-#endif
-		struct inode **old_inode)
+/*
+ * add_dquot_iter - iteration function for each inode of a superblock
+ */
+SB_INODES_ITER_FUNC(add_dquot, pcpu_lock,
+		    struct inode *old_inode;
+		    int type;
+		    int reserved)
 {
+	SB_INODES_ITER_ARGS(add_dquot, inode, arg);
+
 	spin_lock(&inode->i_lock);
 	if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
 	    !atomic_read(&inode->i_writecount) ||
-	    !dqinit_needed(inode, type)) {
+	    !dqinit_needed(inode, arg->type)) {
 		spin_unlock(&inode->i_lock);
-		return;
+		return 0;
 	}
 	__iget(inode);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(lock);
+	spin_unlock(pcpu_lock);
 
 #ifdef CONFIG_QUOTA_DEBUG
 	if (unlikely(inode_get_rsv_space(inode) > 0))
-		*reserved = 1;
+		arg->reserved = 1;
 #endif
-	iput(*old_inode);
-	__dquot_initialize(inode, type);
+	iput(arg->old_inode);
+	__dquot_initialize(inode, arg->type);
 
 	/*
 	 * We hold a reference to 'inode' so it couldn't have been
@@ -952,30 +955,27 @@  __add_dquot_ref(spinlock_t *lock, struct inode *inode, int type,
 	 * holding the last reference and we cannot iput it under
 	 * pcpu_lock. So we keep the reference and iput it later.
 	 */
-	*old_inode = inode;
-	spin_lock(lock);
+	arg->old_inode = inode;
+	spin_lock(pcpu_lock);
+	return 0;
 }
 
 /* This routine is guarded by dqonoff_mutex mutex */
 static void add_dquot_ref(struct super_block *sb, int type)
 {
-	struct inode *inode, *old_inode = NULL;
-#ifdef CONFIG_QUOTA_DEBUG
-	int reserved = 0;
-#endif
+	struct add_dquot_arg arg;
 
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list)
-		__add_dquot_ref(&sb->s_inode_list_lock, inode, type,
 #ifdef CONFIG_QUOTA_DEBUG
-				&reserved,
+	arg.reserved = 0;
 #endif
-				&old_inode);
-	spin_unlock(&sb->s_inode_list_lock);
-	iput(old_inode);
+	arg.old_inode = NULL;
+	arg.type = type;
+
+	SB_INODES_ITER_CALL(add_dquot, sb);
+	iput(arg.old_inode);
 
 #ifdef CONFIG_QUOTA_DEBUG
-	if (reserved) {
+	if (arg.reserved) {
 		quota_error(sb, "Writes happened before quota was turned on "
 			"thus quota information is probably inconsistent. "
 			"Please run quotacheck(8)");
@@ -1034,10 +1034,16 @@  static void put_dquot_list(struct list_head *tofree_head)
 	}
 }
 
-static inline void
-__remove_dquot_ref(struct inode *inode, int type,
-		   struct list_head *tofree_head, int *reserved)
+/*
+ * remove_dquot_iter - iteration function for each inode of a superblock
+ */
+SB_INODES_ITER_FUNC(remove_dquot, pcpu_lock,
+		    struct list_head *tofree_head;
+		    int type;
+		    int reserved)
 {
+	SB_INODES_ITER_ARGS(remove_dquot, inode, arg);
+
 	/*
 	 *  We have to scan also I_NEW inodes because they can already
 	 *  have quota pointer initialized. Luckily, we need to touch
@@ -1047,25 +1053,26 @@  __remove_dquot_ref(struct inode *inode, int type,
 	spin_lock(&dq_data_lock);
 	if (!IS_NOQUOTA(inode)) {
 		if (unlikely(inode_get_rsv_space(inode) > 0))
-			*reserved = 1;
-		remove_inode_dquot_ref(inode, type, tofree_head);
+			arg->reserved = 1;
+		remove_inode_dquot_ref(inode, arg->type, arg->tofree_head);
 	}
 	spin_unlock(&dq_data_lock);
+	return 0;
 }
 
 static void remove_dquot_ref(struct super_block *sb, int type,
 		struct list_head *tofree_head)
 {
-	struct inode *inode;
-	int reserved = 0;
+	struct remove_dquot_arg arg;
+
+	arg.reserved = 0;
+	arg.type = type;
+	arg.tofree_head = tofree_head;
 
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list)
-		__remove_dquot_ref(inode, type, tofree_head, &reserved);
+	SB_INODES_ITER_CALL(remove_dquot, sb);
 
-	spin_unlock(&sb->s_inode_list_lock);
 #ifdef CONFIG_QUOTA_DEBUG
-	if (reserved) {
+	if (arg.reserved) {
 		printk(KERN_WARNING "VFS (%s): Writes happened after quota"
 			" was disabled thus quota information is probably "
 			"inconsistent. Please run quotacheck(8).\n", sb->s_id);
diff --git a/fs/super.c b/fs/super.c
index 1182af8..7d44fad 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -163,6 +163,7 @@  static void destroy_super(struct super_block *s)
 {
 	list_lru_destroy(&s->s_dentry_lru);
 	list_lru_destroy(&s->s_inode_lru);
+	free_pcpu_list_head(&s->s_inodes);
 	security_sb_free(s);
 	WARN_ON(!list_empty(&s->s_mounts));
 	kfree(s->s_subtype);
@@ -204,9 +205,9 @@  static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
 	mutex_init(&s->s_sync_lock);
-	INIT_LIST_HEAD(&s->s_inodes);
-	spin_lock_init(&s->s_inode_list_lock);
 
+	if (init_pcpu_list_head(&s->s_inodes))
+		goto fail;
 	if (list_lru_init_memcg(&s->s_dentry_lru))
 		goto fail;
 	if (list_lru_init_memcg(&s->s_inode_lru))
@@ -426,7 +427,7 @@  void generic_shutdown_super(struct super_block *sb)
 		if (sop->put_super)
 			sop->put_super(sb);
 
-		if (!list_empty(&sb->s_inodes)) {
+		if (!pcpu_list_empty(sb->s_inodes)) {
 			printk("VFS: Busy inodes after unmount of %s. "
 			   "Self-destruct in 5 seconds.  Have a nice day...\n",
 			   sb->s_id);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ae68100..c30cdb6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -27,6 +27,7 @@ 
 #include <linux/migrate_mode.h>
 #include <linux/uidgid.h>
 #include <linux/lockdep.h>
+#include <linux/percpu-list.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/blk_types.h>
 #include <linux/workqueue.h>
@@ -648,7 +649,7 @@  struct inode {
 	u16			i_wb_frn_history;
 #endif
 	struct list_head	i_lru;		/* inode LRU list */
-	struct list_head	i_sb_list;
+	struct pcpu_list_node	i_sb_list;
 	union {
 		struct hlist_head	i_dentry;
 		struct rcu_head		i_rcu;
@@ -1397,11 +1398,39 @@  struct super_block {
 	 */
 	int s_stack_depth;
 
-	/* s_inode_list_lock protects s_inodes */
-	spinlock_t		s_inode_list_lock ____cacheline_aligned_in_smp;
-	struct list_head	s_inodes;	/* all inodes */
+	/* The percpu locks protect s_inodes */
+	struct pcpu_list_head __percpu *s_inodes;	/* all inodes */
 };
 
+/*
+ * Superblock's inode list iterator function and arguments macros
+ */
+#define SB_INODES_ITER_FUNC(name, lock, struct_fields)			\
+	struct name ## _arg {						\
+		struct_fields;						\
+	};								\
+	static int name ## _iter(struct pcpu_list_node *_node,		\
+				 struct pcpu_list_node **_pnext,	\
+				 spinlock_t *lock, void *_arg)
+
+#define SB_INODES_ITER_ARGS(name, i, a)					\
+	struct inode *i = container_of(_node, struct inode, i_sb_list);	\
+	struct name ## _arg *a = (struct name ## _arg *)_arg
+
+#define SB_INODES_ITER_ARGS_SAFE(name, i, n, a)				\
+	struct inode *i = container_of(_node, struct inode, i_sb_list);	\
+	struct inode *n = container_of(*_pnext, struct inode, i_sb_list);\
+	struct name ## _arg *a = (struct name ## _arg *)_arg
+
+#define SB_INODES_ITER_SET_PCPU_LIST_NEXT(n)				\
+	{ *_pnext = &(n)->i_sb_list; }
+
+#define SB_INODES_ITER_CALL(name, sb)					\
+	pcpu_list_iterate(sb->s_inodes, false, NULL, name ## _iter, &arg)
+
+#define SB_INODES_ITER_CALL_SAFE(name, sb, phead)			\
+	pcpu_list_iterate(sb->s_inodes, true, phead, name ## _iter, &arg)
+
 extern struct timespec current_fs_time(struct super_block *sb);
 
 /*