
[8/9] inode: convert per-sb inode list to a list_lru

Message ID 1426016724-23912-9-git-send-email-jbacik@fb.com (mailing list archive)

Commit Message

Josef Bacik March 10, 2015, 7:45 p.m. UTC
From: Dave Chinner <dchinner@redhat.com>

The per-superblock inode list and lock is a bottleneck for systems
that cycle inodes in and out of cache concurrently. The global lock
is a limiting factor.

Most of the additions to the sb inode list occur on the CPU that
allocated the inode, and most of the removals occur during evict()
calls as a result of memory reclaim. Both of these events are local
to the node that the inode belongs to, so it maps to the per-node
lists that the list_lru uses.

There are several places where the inode list is walked. These can
be converted easily to use list_lru_walk() to do their work on each
inode on the list.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/block_dev.c         |  76 ++++++++++++--------
 fs/drop_caches.c       |  58 ++++++++++-----
 fs/inode.c             | 136 ++++++++++++++++++-----------------
 fs/notify/inode_mark.c | 121 +++++++++++++-------------------
 fs/quota/dquot.c       | 187 ++++++++++++++++++++++++++++++++-----------------
 fs/super.c             |   8 ++-
 include/linux/fs.h     |   9 ++-
 7 files changed, 340 insertions(+), 255 deletions(-)
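
For readers new to the list_lru walk API the patch builds on, the
conversion pattern repeated throughout the hunks below looks roughly like
the following simplified sketch (the callback signature and the lru_status
values are from include/linux/list_lru.h; s_inode_list is the field this
patch adds, and everything else is illustrative rather than a hunk from
the patch):

#include <linux/fs.h>
#include <linux/list_lru.h>

/*
 * The walker invokes the callback on each item with the per-node list
 * lock held; the return value reports what happened to the item:
 *
 *   LRU_REMOVED - the callback took the item off the list
 *   LRU_ROTATE  - the item was moved to the tail of the list
 *   LRU_SKIP    - the item was left where it is
 *   LRU_RETRY   - restart the walk of this per-node list
 */
static enum lru_status
example_walk_cb(struct list_head *item, struct list_lru_one *lru,
		spinlock_t *lock, void *cb_arg)
{
	struct inode *inode = container_of(item, struct inode, i_sb_list);

	/* ... per-inode work, possibly dropping and retaking *lock ... */
	return LRU_SKIP;
}

static void example_walk(struct super_block *sb, void *arg)
{
	/* visit up to ULONG_MAX inodes across all per-node lists */
	list_lru_walk(&sb->s_inode_list, example_walk_cb, arg, ULONG_MAX);
}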

Comments

Jan Kara March 16, 2015, 12:27 p.m. UTC | #1
Hello,

On Tue 10-03-15 15:45:23, Josef Bacik wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> The per-superblock inode list and lock is a bottleneck for systems
> that cycle inodes in and out of cache concurrently. The global lock
> is a limiting factor.
> 
> Most of the additions to the sb inode list occur on the CPU that
> allocated the inode, and most of the removals occur during evict()
> calls as a result of memory reclaim. Both of these events are local
> to the node that the inode belongs to, so it maps to the per-node
> lists that the list_lru uses.
> 
> There are several places where the inode list is walked. These can
> be converted easily to use list_lru_walk() to do their work on each
> inode on the list.
  Err, after looking at the patch I'm not so sure list_lru is such a great
fit for the inode sb list. We don't use any of its LRU features, so
arguably it just makes things less readable, and there's some memcg code
which I hope isn't activated for the inode list - but how do I tell? We
just use it as general per-numa-node linked lists, so if we go down the
path of per-numa-node linked lists I'd prefer an abstraction built just
for that. Also there are some lifetime issues I'll comment on in the code.

And do we really want per-numa-node lists? Why not per-cpu? Is it the
memory cost we are concerned about? Or is it the cost of iterating all
CPUs when wanting to traverse all inodes? The changelog doesn't say. The
traversal is used on unmount (inode eviction, fsnotify mark eviction),
quota on/off (adding / removing dquot pointers to / from inodes),
drop_caches, and sync (iterating all block devices). Sync is IMO the only
one where CPU overhead might be a concern, but we could create a better
method for iterating all block devices - after all, creating / removing
block devices could happily use just ordinary lists. So CPU overhead
shouldn't be a huge problem. Memory overhead is (list_head + spinlock)
per-sb per-cpu. That doesn't sound too terrible either.

When speaking about this with Andi Kleen, he suggested that maybe we do
not want per-cpu lists but rather a list per batch of CPUs (say 8; the
actual number would be tuned during boot based on the number of actual
CPUs). That may be an option as well.
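
A hypothetical sketch of such a batched list, to make the idea concrete
(all of the batch_list names below are invented for illustration; nothing
like this exists in the tree):

#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/spinlock.h>

struct batch_list_node {
	spinlock_t		lock;
	struct list_head	list;
} ____cacheline_aligned_in_smp;

struct batch_list {
	unsigned int		cpus_per_batch;	/* tuned during boot */
	unsigned int		nr_batches;
	struct batch_list_node	*nodes;
};

static int batch_list_init(struct batch_list *bl, unsigned int cpus_per_batch)
{
	unsigned int i;

	bl->cpus_per_batch = cpus_per_batch;
	bl->nr_batches = DIV_ROUND_UP(num_possible_cpus(), cpus_per_batch);
	bl->nodes = kcalloc(bl->nr_batches, sizeof(*bl->nodes), GFP_KERNEL);
	if (!bl->nodes)
		return -ENOMEM;
	for (i = 0; i < bl->nr_batches; i++) {
		spin_lock_init(&bl->nodes[i].lock);
		INIT_LIST_HEAD(&bl->nodes[i].list);
	}
	return 0;
}

/* adjacent CPU ids share a batch, so additions stay cache-local */
static void batch_list_add(struct batch_list *bl, struct list_head *item)
{
	struct batch_list_node *n;

	/*
	 * raw_smp_processor_id() is fine here: if we migrate CPUs the item
	 * merely lands in a neighbouring batch, affecting locality rather
	 * than correctness.
	 */
	n = &bl->nodes[raw_smp_processor_id() / bl->cpus_per_batch];
	spin_lock(&n->lock);
	list_add(item, &n->list);
	spin_unlock(&n->lock);
}

Deletion and traversal would need each item to remember which batch it is
on (or to walk every batch), which is exactly the bookkeeping a dedicated
abstraction could hide.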

I wouldn't like to block this series just for this patch since all the
others are fine (modulo some minor comments). So could you repost the
series without this patch so that it can get queued for the coming merge
window? It should be easy to rebase patch 9/9 to not need this patch.

> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 2eb2436..d23ce6f 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1749,38 +1749,56 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty)
>  }
>  EXPORT_SYMBOL(__invalidate_device);
>  
> -void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
> -{
> -	struct inode *inode, *old_inode = NULL;
> +struct bdev_iter {
> +	void (*func)(struct block_device *, void *);
> +	void *arg;
> +	struct inode *toput_inode;
> +};
>  
> -	spin_lock(&blockdev_superblock->s_inode_list_lock);
> -	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
> -		struct address_space *mapping = inode->i_mapping;
> +static enum lru_status
> +bdev_iter_cb(struct list_head *item, struct list_lru_one *lru,
> +	     spinlock_t *lock, void *cb_arg)
> +{
> +	struct bdev_iter *iter = cb_arg;
> +	struct inode *inode = container_of(item, struct inode, i_sb_list);
>  
> -		spin_lock(&inode->i_lock);
> -		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
> -		    mapping->nrpages == 0) {
> -			spin_unlock(&inode->i_lock);
> -			continue;
> -		}
> -		__iget(inode);
> +	spin_lock(&inode->i_lock);
> +	if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
> +	    inode->i_mapping->nrpages == 0) {
>  		spin_unlock(&inode->i_lock);
> -		spin_unlock(&blockdev_superblock->s_inode_list_lock);
> -		/*
> -		 * We hold a reference to 'inode' so it couldn't have been
> -		 * removed from s_inodes list while we dropped the
> -		 * s_inode_list_lock  We cannot iput the inode now as we can
> -		 * be holding the last reference and we cannot iput it under
> -		 * s_inode_list_lock. So we keep the reference and iput it
> -		 * later.
> -		 */
> -		iput(old_inode);
> -		old_inode = inode;
> +		return LRU_SKIP;
> +	}
> +	__iget(inode);
> +	spin_unlock(&inode->i_lock);
> +	spin_unlock(lock);
>  
> -		func(I_BDEV(inode), arg);
> +	iput(iter->toput_inode);
> +	iter->toput_inode = inode;
>  
> -		spin_lock(&blockdev_superblock->s_inode_list_lock);
> -	}
> -	spin_unlock(&blockdev_superblock->s_inode_list_lock);
> -	iput(old_inode);
> +	iter->func(I_BDEV(inode), iter->arg);
> +
> +	/*
> +	 * Even though we have dropped the lock here, we can return LRU_SKIP as
> +	 * we have a reference to the current inode and so its next pointer is
> +	 * guaranteed to be valid even though we dropped the list lock.
> +	 */
> +	spin_lock(lock);
> +	return LRU_SKIP;
> +}
  So I actually think doing the iteration like this is buggy with the
list_lru code. When we pin an inode by grabbing i_count, we are only sure
the inode won't go away under us. However there's nothing which protects
against list modifications. So this method is only safe when combined with
list_for_each[_entry](). Specifically, it does *not* work when combined
with list_for_each_safe(), which __list_lru_walk_one() uses, because the
next pointer that is cached may become invalid once we drop the lock. And
I'd really hate to try to make these two work together - that would seem
really fragile, since subtle changes to how list_lru iteration works could
break inode iteration.

This is true for all the iterations over inodes that play with i_count,
not just this particular case of blockdev iteration.
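
To make the hazard concrete, the walker in mm/list_lru.c has roughly the
shape sketched below (heavily simplified; the item counting and nr_to_walk
bookkeeping are omitted):

static void walk_one_sketch(struct list_lru_one *l, spinlock_t *lock,
			    list_lru_walk_cb isolate, void *cb_arg)
{
	struct list_head *item, *n;

	spin_lock(lock);
restart:
	list_for_each_safe(item, n, &l->list) {
		/* 'n' caches item->next before the callback runs */
		switch (isolate(item, l, lock, cb_arg)) {
		case LRU_RETRY:
			/* restart from the head; safe against mutation */
			goto restart;
		default:
			break;
		}
		/*
		 * If isolate() dropped 'lock' (as the callbacks in this
		 * patch do around iput()/fsnotify()/__dquot_initialize()),
		 * its inode reference pins only 'item'. The inode that 'n'
		 * points to may have been evicted and freed in the
		 * meantime, so the next iteration can step through freed
		 * memory.
		 */
	}
	spin_unlock(lock);
}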

> diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
> index a4e1a8f..a0cdc66 100644
> --- a/fs/notify/inode_mark.c
> +++ b/fs/notify/inode_mark.c
> @@ -161,87 +161,60 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
>  	return ret;
>  }
>  
> -/**
> - * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
> - * @sb: superblock being unmounted.
> - *
> - * Called during unmount with no locks held, so needs to be safe against
> - * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
> - */
> -void fsnotify_unmount_inodes(struct super_block *sb)
> -{
> -	struct inode *inode, *next_i, *need_iput = NULL;
> -
> -	spin_lock(&sb->s_inode_list_lock);
> -	list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) {
> -		struct inode *need_iput_tmp;
> -
> -		/*
> -		 * We cannot __iget() an inode in state I_FREEING,
> -		 * I_WILL_FREE, or I_NEW which is fine because by that point
> -		 * the inode cannot have any associated watches.
> -		 */
> -		spin_lock(&inode->i_lock);
> -		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
> -			spin_unlock(&inode->i_lock);
> -			continue;
> -		}
> +static enum lru_status
> +fsnotify_unmount_inode(struct list_head *item, struct list_lru_one *lru,
> +		       spinlock_t *lock, void *cb_arg)
> +{
> +	struct inode **toput_inode = cb_arg;
> +	struct inode *inode = container_of(item, struct inode, i_sb_list);
> +
> +	/* New or being freed inodes cannot have any associated watches. */
> +	spin_lock(&inode->i_lock);
> +	if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
> +		spin_unlock(&inode->i_lock);
> +		return LRU_SKIP;
> +	}
>  
> -		/*
> -		 * If i_count is zero, the inode cannot have any watches and
> -		 * doing an __iget/iput with MS_ACTIVE clear would actually
> -		 * evict all inodes with zero i_count from icache which is
> -		 * unnecessarily violent and may in fact be illegal to do.
> -		 */
> -		if (!atomic_read(&inode->i_count)) {
> -			spin_unlock(&inode->i_lock);
> -			continue;
> -		}
> -
> -		need_iput_tmp = need_iput;
> -		need_iput = NULL;
> -
> -		/* In case fsnotify_inode_delete() drops a reference. */
> -		if (inode != need_iput_tmp)
> -			__iget(inode);
> -		else
> -			need_iput_tmp = NULL;
> +	/* If i_count is zero, the inode cannot have any watches */
> +	if (!atomic_read(&inode->i_count)) {
>  		spin_unlock(&inode->i_lock);
> +		return LRU_SKIP;
> +	}
>  
> -		/* In case the dropping of a reference would nuke next_i. */
> -		while (&next_i->i_sb_list != &sb->s_inodes) {
> -			spin_lock(&next_i->i_lock);
> -			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
> -						atomic_read(&next_i->i_count)) {
> -				__iget(next_i);
> -				need_iput = next_i;
> -				spin_unlock(&next_i->i_lock);
> -				break;
> -			}
> -			spin_unlock(&next_i->i_lock);
> -			next_i = list_entry(next_i->i_sb_list.next,
> -						struct inode, i_sb_list);
> -		}
> +	__iget(inode);
> +	spin_unlock(&inode->i_lock);
> +	spin_unlock(lock);
>  
> -		/*
> -		 * We can safely drop s_inode_list_lock here because either
> -		 * we actually hold references on both inode and next_i or
> -		 * end of list.  Also no new inodes will be added since the
> -		 * umount has begun.
> -		 */
> -		spin_unlock(&sb->s_inode_list_lock);
> +	iput(*toput_inode);
> +	*toput_inode = inode;
>  
> -		if (need_iput_tmp)
> -			iput(need_iput_tmp);
> +	/* for each watch, send FS_UNMOUNT and then remove it */
> +	fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
> +	fsnotify_inode_delete(inode);
>  
> -		/* for each watch, send FS_UNMOUNT and then remove it */
> -		fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
> +	/*
> +	 * Even though we have dropped the lock here, we can return LRU_SKIP as
> +	 * we have a reference to the current inode and so its next pointer is
> +	 * guaranteed to be valid even though we dropped the list lock.
> +	 */
> +	spin_lock(lock);
> +	return LRU_SKIP;
> +}
  So in the original code of fsnotify_unmount_inodes() you can see the
hoops you have to jump through when iterating the inode list with
list_for_each_entry_safe(). Deleting that code without deeper thought
wasn't a good idea ;).

As a side note, it should be possible to convert this code to use just
list_for_each() and avoid all that crap, but it will require me to delve
into fsnotify magic again to verify it's correct. I guess I'll leave that
until after your patches are queued so that I don't make life harder for
you.

								Honza
Josef Bacik March 16, 2015, 3:34 p.m. UTC | #2
On 03/16/2015 08:27 AM, Jan Kara wrote:
>    Hello,
>
> On Tue 10-03-15 15:45:23, Josef Bacik wrote:
>> From: Dave Chinner <dchinner@redhat.com>
>>
>> The per-superblock inode list and lock is a bottleneck for systems
>> that cycle inodes in and out of cache concurrently. The global lock
>> is a limiting factor.
>>
>> Most of the additions to the sb inode list occur on the CPU that
>> allocated the inode, and most of the removals occur during evict()
>> calls as a result of memory reclaim. Both of these events are local
>> to the node that the inode belongs to, so it maps to the per-node
>> lists that the list_lru uses.
>>
>> There are several places where the inode list is walked. These can
>> be converted easily to use list_lru_walk() to do their work on each
>> inode on the list.
>   Err, after looking at the patch I'm not so sure list_lru is such a great
> fit for the inode sb list. We don't use any of its LRU features, so
> arguably it just makes things less readable, and there's some memcg code
> which I hope isn't activated for the inode list - but how do I tell? We
> just use it as general per-numa-node linked lists, so if we go down the
> path of per-numa-node linked lists I'd prefer an abstraction built just
> for that. Also there are some lifetime issues I'll comment on in the code.
>
> And do we really want per-numa-node lists? Why not per-cpu? Is it the
> memory cost we are concerned about? Or is it the cost of iterating all
> CPUs when wanting to traverse all inodes? The changelog doesn't say. The
> traversal is used on unmount (inode eviction, fsnotify mark eviction),
> quota on/off (adding / removing dquot pointers to / from inodes),
> drop_caches, and sync (iterating all block devices). Sync is IMO the only
> one where CPU overhead might be a concern, but we could create a better
> method for iterating all block devices - after all, creating / removing
> block devices could happily use just ordinary lists. So CPU overhead
> shouldn't be a huge problem. Memory overhead is (list_head + spinlock)
> per-sb per-cpu. That doesn't sound too terrible either.

Yeah sorry, I had it in my head that the list was already per-cpu, but
you're right, it's per-numa.

>
> When speaking about this with Andi Kleen, he suggested that maybe we do
> not want per-cpu lists but rather a list per batch of CPUs (say 8; the
> actual number would be tuned during boot based on the number of actual
> CPUs). That may be an option as well.
>
> I wouldn't like to block this series just for this patch since all the
> others are fine (modulo some minor comments). So could you repost the
> series without this patch so that it can get queued for the coming merge
> window? It should be easy to rebase patch 9/9 to not need this patch.
>

Yeah I can do that.

>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>> index 2eb2436..d23ce6f 100644
>> --- a/fs/block_dev.c
>> +++ b/fs/block_dev.c
>> @@ -1749,38 +1749,56 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty)
>>   }
>>   EXPORT_SYMBOL(__invalidate_device);
>>
>> -void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
>> -{
>> -	struct inode *inode, *old_inode = NULL;
>> +struct bdev_iter {
>> +	void (*func)(struct block_device *, void *);
>> +	void *arg;
>> +	struct inode *toput_inode;
>> +};
>>
>> -	spin_lock(&blockdev_superblock->s_inode_list_lock);
>> -	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
>> -		struct address_space *mapping = inode->i_mapping;
>> +static enum lru_status
>> +bdev_iter_cb(struct list_head *item, struct list_lru_one *lru,
>> +	     spinlock_t *lock, void *cb_arg)
>> +{
>> +	struct bdev_iter *iter = cb_arg;
>> +	struct inode *inode = container_of(item, struct inode, i_sb_list);
>>
>> -		spin_lock(&inode->i_lock);
>> -		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
>> -		    mapping->nrpages == 0) {
>> -			spin_unlock(&inode->i_lock);
>> -			continue;
>> -		}
>> -		__iget(inode);
>> +	spin_lock(&inode->i_lock);
>> +	if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
>> +	    inode->i_mapping->nrpages == 0) {
>>   		spin_unlock(&inode->i_lock);
>> -		spin_unlock(&blockdev_superblock->s_inode_list_lock);
>> -		/*
>> -		 * We hold a reference to 'inode' so it couldn't have been
>> -		 * removed from s_inodes list while we dropped the
>> -		 * s_inode_list_lock  We cannot iput the inode now as we can
>> -		 * be holding the last reference and we cannot iput it under
>> -		 * s_inode_list_lock. So we keep the reference and iput it
>> -		 * later.
>> -		 */
>> -		iput(old_inode);
>> -		old_inode = inode;
>> +		return LRU_SKIP;
>> +	}
>> +	__iget(inode);
>> +	spin_unlock(&inode->i_lock);
>> +	spin_unlock(lock);
>>
>> -		func(I_BDEV(inode), arg);
>> +	iput(iter->toput_inode);
>> +	iter->toput_inode = inode;
>>
>> -		spin_lock(&blockdev_superblock->s_inode_list_lock);
>> -	}
>> -	spin_unlock(&blockdev_superblock->s_inode_list_lock);
>> -	iput(old_inode);
>> +	iter->func(I_BDEV(inode), iter->arg);
>> +
>> +	/*
>> +	 * Even though we have dropped the lock here, we can return LRU_SKIP as
>> +	 * we have a reference to the current inode and so its next pointer is
>> +	 * guaranteed to be valid even though we dropped the list lock.
>> +	 */
>> +	spin_lock(lock);
>> +	return LRU_SKIP;
>> +}
>   So I actually think doing the iteration like this is buggy with the
> list_lru code. When we pin an inode by grabbing i_count, we are only sure
> the inode won't go away under us. However there's nothing which protects
> against list modifications. So this method is only safe when combined with
> list_for_each[_entry](). Specifically, it does *not* work when combined
> with list_for_each_safe(), which __list_lru_walk_one() uses, because the
> next pointer that is cached may become invalid once we drop the lock. And
> I'd really hate to try to make these two work together - that would seem
> really fragile, since subtle changes to how list_lru iteration works could
> break inode iteration.
>
> This is true for all the iterations over inodes that play with i_count,
> not just this particular case of blockdev iteration.
>
>> diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
>> index a4e1a8f..a0cdc66 100644
>> --- a/fs/notify/inode_mark.c
>> +++ b/fs/notify/inode_mark.c
>> @@ -161,87 +161,60 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
>>   	return ret;
>>   }
>>
>> -/**
>> - * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
>> - * @sb: superblock being unmounted.
>> - *
>> - * Called during unmount with no locks held, so needs to be safe against
>> - * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
>> - */
>> -void fsnotify_unmount_inodes(struct super_block *sb)
>> -{
>> -	struct inode *inode, *next_i, *need_iput = NULL;
>> -
>> -	spin_lock(&sb->s_inode_list_lock);
>> -	list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) {
>> -		struct inode *need_iput_tmp;
>> -
>> -		/*
>> -		 * We cannot __iget() an inode in state I_FREEING,
>> -		 * I_WILL_FREE, or I_NEW which is fine because by that point
>> -		 * the inode cannot have any associated watches.
>> -		 */
>> -		spin_lock(&inode->i_lock);
>> -		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
>> -			spin_unlock(&inode->i_lock);
>> -			continue;
>> -		}
>> +static enum lru_status
>> +fsnotify_unmount_inode(struct list_head *item, struct list_lru_one *lru,
>> +		       spinlock_t *lock, void *cb_arg)
>> +{
>> +	struct inode **toput_inode = cb_arg;
>> +	struct inode *inode = container_of(item, struct inode, i_sb_list);
>> +
>> +	/* New or being freed inodes cannot have any associated watches. */
>> +	spin_lock(&inode->i_lock);
>> +	if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
>> +		spin_unlock(&inode->i_lock);
>> +		return LRU_SKIP;
>> +	}
>>
>> -		/*
>> -		 * If i_count is zero, the inode cannot have any watches and
>> -		 * doing an __iget/iput with MS_ACTIVE clear would actually
>> -		 * evict all inodes with zero i_count from icache which is
>> -		 * unnecessarily violent and may in fact be illegal to do.
>> -		 */
>> -		if (!atomic_read(&inode->i_count)) {
>> -			spin_unlock(&inode->i_lock);
>> -			continue;
>> -		}
>> -
>> -		need_iput_tmp = need_iput;
>> -		need_iput = NULL;
>> -
>> -		/* In case fsnotify_inode_delete() drops a reference. */
>> -		if (inode != need_iput_tmp)
>> -			__iget(inode);
>> -		else
>> -			need_iput_tmp = NULL;
>> +	/* If i_count is zero, the inode cannot have any watches */
>> +	if (!atomic_read(&inode->i_count)) {
>>   		spin_unlock(&inode->i_lock);
>> +		return LRU_SKIP;
>> +	}
>>
>> -		/* In case the dropping of a reference would nuke next_i. */
>> -		while (&next_i->i_sb_list != &sb->s_inodes) {
>> -			spin_lock(&next_i->i_lock);
>> -			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
>> -						atomic_read(&next_i->i_count)) {
>> -				__iget(next_i);
>> -				need_iput = next_i;
>> -				spin_unlock(&next_i->i_lock);
>> -				break;
>> -			}
>> -			spin_unlock(&next_i->i_lock);
>> -			next_i = list_entry(next_i->i_sb_list.next,
>> -						struct inode, i_sb_list);
>> -		}
>> +	__iget(inode);
>> +	spin_unlock(&inode->i_lock);
>> +	spin_unlock(lock);
>>
>> -		/*
>> -		 * We can safely drop s_inode_list_lock here because either
>> -		 * we actually hold references on both inode and next_i or
>> -		 * end of list.  Also no new inodes will be added since the
>> -		 * umount has begun.
>> -		 */
>> -		spin_unlock(&sb->s_inode_list_lock);
>> +	iput(*toput_inode);
>> +	*toput_inode = inode;
>>
>> -		if (need_iput_tmp)
>> -			iput(need_iput_tmp);
>> +	/* for each watch, send FS_UNMOUNT and then remove it */
>> +	fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
>> +	fsnotify_inode_delete(inode);
>>
>> -		/* for each watch, send FS_UNMOUNT and then remove it */
>> -		fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
>> +	/*
>> +	 * Even though we have dropped the lock here, we can return LRU_SKIP as
>> +	 * we have a reference to the current inode and so its next pointer is
>> +	 * guaranteed to be valid even though we dropped the list lock.
>> +	 */
>> +	spin_lock(lock);
>> +	return LRU_SKIP;
>> +}
>   So in the original code of fsnotify_unmount_inodes() you can see the
> hoops you have to jump through when iterating the inode list with
> list_for_each_entry_safe(). Deleting that code without deeper thought
> wasn't a good idea ;).
>
> As a side note, it should be possible to convert this code to use just
> list_for_each() and avoid all that crap, but it will require me to delve
> into fsnotify magic again to verify it's correct. I guess I'll leave that
> until after your patches are queued so that I don't make life harder for
> you.
>

Well these two cases are just bugs; we are supposed to return LRU_RETRY
every time we drop the lru list lock so that we loop back to the beginning.
But I agree with your other points, so I'll just drop this one, find/create
a batched per-cpu list to use instead, and do that separately so we can
still get the meat of this patchset in.  Thanks,

Josef

Jan Kara March 16, 2015, 3:48 p.m. UTC | #3
On Mon 16-03-15 11:34:41, Josef Bacik wrote:
...
> >>diff --git a/fs/block_dev.c b/fs/block_dev.c
> >>index 2eb2436..d23ce6f 100644
> >>--- a/fs/block_dev.c
> >>+++ b/fs/block_dev.c
> >>@@ -1749,38 +1749,56 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty)
> >>  }
> >>  EXPORT_SYMBOL(__invalidate_device);
> >>
> >>-void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
> >>-{
> >>-	struct inode *inode, *old_inode = NULL;
> >>+struct bdev_iter {
> >>+	void (*func)(struct block_device *, void *);
> >>+	void *arg;
> >>+	struct inode *toput_inode;
> >>+};
> >>
> >>-	spin_lock(&blockdev_superblock->s_inode_list_lock);
> >>-	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
> >>-		struct address_space *mapping = inode->i_mapping;
> >>+static enum lru_status
> >>+bdev_iter_cb(struct list_head *item, struct list_lru_one *lru,
> >>+	     spinlock_t *lock, void *cb_arg)
> >>+{
> >>+	struct bdev_iter *iter = cb_arg;
> >>+	struct inode *inode = container_of(item, struct inode, i_sb_list);
> >>
> >>-		spin_lock(&inode->i_lock);
> >>-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
> >>-		    mapping->nrpages == 0) {
> >>-			spin_unlock(&inode->i_lock);
> >>-			continue;
> >>-		}
> >>-		__iget(inode);
> >>+	spin_lock(&inode->i_lock);
> >>+	if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
> >>+	    inode->i_mapping->nrpages == 0) {
> >>  		spin_unlock(&inode->i_lock);
> >>-		spin_unlock(&blockdev_superblock->s_inode_list_lock);
> >>-		/*
> >>-		 * We hold a reference to 'inode' so it couldn't have been
> >>-		 * removed from s_inodes list while we dropped the
> >>-		 * s_inode_list_lock  We cannot iput the inode now as we can
> >>-		 * be holding the last reference and we cannot iput it under
> >>-		 * s_inode_list_lock. So we keep the reference and iput it
> >>-		 * later.
> >>-		 */
> >>-		iput(old_inode);
> >>-		old_inode = inode;
> >>+		return LRU_SKIP;
> >>+	}
> >>+	__iget(inode);
> >>+	spin_unlock(&inode->i_lock);
> >>+	spin_unlock(lock);
> >>
> >>-		func(I_BDEV(inode), arg);
> >>+	iput(iter->toput_inode);
> >>+	iter->toput_inode = inode;
> >>
> >>-		spin_lock(&blockdev_superblock->s_inode_list_lock);
> >>-	}
> >>-	spin_unlock(&blockdev_superblock->s_inode_list_lock);
> >>-	iput(old_inode);
> >>+	iter->func(I_BDEV(inode), iter->arg);
> >>+
> >>+	/*
> >>+	 * Even though we have dropped the lock here, we can return LRU_SKIP as
> >>+	 * we have a reference to the current inode and so its next pointer is
> >>+	 * guaranteed to be valid even though we dropped the list lock.
> >>+	 */
> >>+	spin_lock(lock);
> >>+	return LRU_SKIP;
> >>+}
> >   So I actually think doing the iteration like this is buggy with the
> >list_lru code. When we pin an inode by grabbing i_count, we are only sure
> >the inode won't go away under us. However there's nothing which protects
> >against list modifications. So this method is only safe when combined with
> >list_for_each[_entry](). Specifically, it does *not* work when combined
> >with list_for_each_safe(), which __list_lru_walk_one() uses, because the
> >next pointer that is cached may become invalid once we drop the lock. And
> >I'd really hate to try to make these two work together - that would seem
> >really fragile, since subtle changes to how list_lru iteration works could
> >break inode iteration.
> >
> >This is true for all the iterations over inodes that play with i_count,
> >not just this particular case of blockdev iteration.
> >
> >>diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
> >>index a4e1a8f..a0cdc66 100644
> >>--- a/fs/notify/inode_mark.c
> >>+++ b/fs/notify/inode_mark.c
> >>@@ -161,87 +161,60 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
> >>  	return ret;
> >>  }
> >>
> >>-/**
> >>- * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
> >>- * @sb: superblock being unmounted.
> >>- *
> >>- * Called during unmount with no locks held, so needs to be safe against
> >>- * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
> >>- */
> >>-void fsnotify_unmount_inodes(struct super_block *sb)
> >>-{
> >>-	struct inode *inode, *next_i, *need_iput = NULL;
> >>-
> >>-	spin_lock(&sb->s_inode_list_lock);
> >>-	list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) {
> >>-		struct inode *need_iput_tmp;
> >>-
> >>-		/*
> >>-		 * We cannot __iget() an inode in state I_FREEING,
> >>-		 * I_WILL_FREE, or I_NEW which is fine because by that point
> >>-		 * the inode cannot have any associated watches.
> >>-		 */
> >>-		spin_lock(&inode->i_lock);
> >>-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
> >>-			spin_unlock(&inode->i_lock);
> >>-			continue;
> >>-		}
> >>+static enum lru_status
> >>+fsnotify_unmount_inode(struct list_head *item, struct list_lru_one *lru,
> >>+		       spinlock_t *lock, void *cb_arg)
> >>+{
> >>+	struct inode **toput_inode = cb_arg;
> >>+	struct inode *inode = container_of(item, struct inode, i_sb_list);
> >>+
> >>+	/* New or being freed inodes cannot have any associated watches. */
> >>+	spin_lock(&inode->i_lock);
> >>+	if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
> >>+		spin_unlock(&inode->i_lock);
> >>+		return LRU_SKIP;
> >>+	}
> >>
> >>-		/*
> >>-		 * If i_count is zero, the inode cannot have any watches and
> >>-		 * doing an __iget/iput with MS_ACTIVE clear would actually
> >>-		 * evict all inodes with zero i_count from icache which is
> >>-		 * unnecessarily violent and may in fact be illegal to do.
> >>-		 */
> >>-		if (!atomic_read(&inode->i_count)) {
> >>-			spin_unlock(&inode->i_lock);
> >>-			continue;
> >>-		}
> >>-
> >>-		need_iput_tmp = need_iput;
> >>-		need_iput = NULL;
> >>-
> >>-		/* In case fsnotify_inode_delete() drops a reference. */
> >>-		if (inode != need_iput_tmp)
> >>-			__iget(inode);
> >>-		else
> >>-			need_iput_tmp = NULL;
> >>+	/* If i_count is zero, the inode cannot have any watches */
> >>+	if (!atomic_read(&inode->i_count)) {
> >>  		spin_unlock(&inode->i_lock);
> >>+		return LRU_SKIP;
> >>+	}
> >>
> >>-		/* In case the dropping of a reference would nuke next_i. */
> >>-		while (&next_i->i_sb_list != &sb->s_inodes) {
> >>-			spin_lock(&next_i->i_lock);
> >>-			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
> >>-						atomic_read(&next_i->i_count)) {
> >>-				__iget(next_i);
> >>-				need_iput = next_i;
> >>-				spin_unlock(&next_i->i_lock);
> >>-				break;
> >>-			}
> >>-			spin_unlock(&next_i->i_lock);
> >>-			next_i = list_entry(next_i->i_sb_list.next,
> >>-						struct inode, i_sb_list);
> >>-		}
> >>+	__iget(inode);
> >>+	spin_unlock(&inode->i_lock);
> >>+	spin_unlock(lock);
> >>
> >>-		/*
> >>-		 * We can safely drop s_inode_list_lock here because either
> >>-		 * we actually hold references on both inode and next_i or
> >>-		 * end of list.  Also no new inodes will be added since the
> >>-		 * umount has begun.
> >>-		 */
> >>-		spin_unlock(&sb->s_inode_list_lock);
> >>+	iput(*toput_inode);
> >>+	*toput_inode = inode;
> >>
> >>-		if (need_iput_tmp)
> >>-			iput(need_iput_tmp);
> >>+	/* for each watch, send FS_UNMOUNT and then remove it */
> >>+	fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
> >>+	fsnotify_inode_delete(inode);
> >>
> >>-		/* for each watch, send FS_UNMOUNT and then remove it */
> >>-		fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
> >>+	/*
> >>+	 * Even though we have dropped the lock here, we can return LRU_SKIP as
> >>+	 * we have a reference to the current inode and so its next pointer is
> >>+	 * guaranteed to be valid even though we dropped the list lock.
> >>+	 */
> >>+	spin_lock(lock);
> >>+	return LRU_SKIP;
> >>+}
> >   So in the original code of fsnotify_unmount_inodes() you can see the
> >hoops you have to jump through when iterating the inode list with
> >list_for_each_entry_safe(). Deleting that code without deeper thought
> >wasn't a good idea ;).
> >
> >As a side note, it should be possible to convert this code to use just
> >list_for_each() and avoid all that crap, but it will require me to delve
> >into fsnotify magic again to verify it's correct. I guess I'll leave that
> >until after your patches are queued so that I don't make life harder for
> >you.
> >
> 
> Well these two cases are just bugs; we are supposed to return
> LRU_RETRY every time we drop the lru list lock so that we loop back to
> the beginning. But I agree with your other points, so I'll just drop
> this one, find/create a batched per-cpu list to use instead, and do
> that separately so we can still get the meat of this patchset in.
  I don't think you can just return LRU_RETRY. That way e.g. iterating
all bdevs to wait for IO would become an infinite loop. The trouble with
inode iteration is that we need to iterate all of them and sleep on each
inode, but we don't remove the inode from the list (at least not in all
callers), so forward progress isn't guaranteed by that.
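
For contrast, the walks in the patch that may legitimately loop are the
ones that remove every inode they isolate, so each pass shrinks the list
and the loop terminates; a sketch of the difference, modelled on
evict_inodes() from the patch below:

static void walk_until_empty_sketch(struct super_block *sb)
{
	long freed;

	do {
		LIST_HEAD(dispose);

		/*
		 * evict_inodes_isolate() answers LRU_REMOVED for every
		 * inode it takes off the list, so 'freed' drops to zero
		 * once the list is drained.
		 */
		freed = list_lru_walk(&sb->s_inode_list,
				      evict_inodes_isolate, &dispose,
				      ULONG_MAX);
		dispose_list(&dispose);
	} while (freed > 0);
}

A callback that only visits inodes and answers LRU_RETRY never shrinks the
list, so the restart inside __list_lru_walk_one() would revisit the same
inodes forever.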

								Honza

Patch

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2eb2436..d23ce6f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1749,38 +1749,56 @@  int __invalidate_device(struct block_device *bdev, bool kill_dirty)
 }
 EXPORT_SYMBOL(__invalidate_device);
 
-void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
-{
-	struct inode *inode, *old_inode = NULL;
+struct bdev_iter {
+	void (*func)(struct block_device *, void *);
+	void *arg;
+	struct inode *toput_inode;
+};
 
-	spin_lock(&blockdev_superblock->s_inode_list_lock);
-	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
-		struct address_space *mapping = inode->i_mapping;
+static enum lru_status
+bdev_iter_cb(struct list_head *item, struct list_lru_one *lru,
+	     spinlock_t *lock, void *cb_arg)
+{
+	struct bdev_iter *iter = cb_arg;
+	struct inode *inode = container_of(item, struct inode, i_sb_list);
 
-		spin_lock(&inode->i_lock);
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
-		    mapping->nrpages == 0) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		__iget(inode);
+	spin_lock(&inode->i_lock);
+	if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
+	    inode->i_mapping->nrpages == 0) {
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&blockdev_superblock->s_inode_list_lock);
-		/*
-		 * We hold a reference to 'inode' so it couldn't have been
-		 * removed from s_inodes list while we dropped the
-		 * s_inode_list_lock  We cannot iput the inode now as we can
-		 * be holding the last reference and we cannot iput it under
-		 * s_inode_list_lock. So we keep the reference and iput it
-		 * later.
-		 */
-		iput(old_inode);
-		old_inode = inode;
+		return LRU_SKIP;
+	}
+	__iget(inode);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(lock);
 
-		func(I_BDEV(inode), arg);
+	iput(iter->toput_inode);
+	iter->toput_inode = inode;
 
-		spin_lock(&blockdev_superblock->s_inode_list_lock);
-	}
-	spin_unlock(&blockdev_superblock->s_inode_list_lock);
-	iput(old_inode);
+	iter->func(I_BDEV(inode), iter->arg);
+
+	/*
+	 * Even though we have dropped the lock here, we can return LRU_SKIP as
+	 * we have a reference to the current inode and so its next pointer is
+	 * guaranteed to be valid even though we dropped the list lock.
+	 */
+	spin_lock(lock);
+	return LRU_SKIP;
+}
+
+/*
+ * iterate_bdevs - run a callback across all block devices
+ */
+void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
+{
+	struct bdev_iter iter = {
+		.func = func,
+		.arg = arg,
+	};
+
+	list_lru_walk(&blockdev_superblock->s_inode_list, bdev_iter_cb, &iter,
+		      ULONG_MAX);
+
+	/* the list walk doesn't release the last inode it sees! */
+	iput(iter.toput_inode);
 }
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index d72d52b..ee381e1 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -13,29 +13,51 @@ 
 /* A global variable is a bit ugly, but it keeps the code simple */
 int sysctl_drop_caches;
 
-static void drop_pagecache_sb(struct super_block *sb, void *unused)
+static enum lru_status
+drop_pagecache_inode(struct list_head *item, struct list_lru_one *lru,
+		     spinlock_t *lock, void *cb_arg)
 {
-	struct inode *inode, *toput_inode = NULL;
+	struct inode **toput_inode = cb_arg;
+	struct inode *inode = container_of(item, struct inode, i_sb_list);
 
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		spin_lock(&inode->i_lock);
-		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
-		    (inode->i_mapping->nrpages == 0)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		__iget(inode);
+	spin_lock(&inode->i_lock);
+	if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+	    (inode->i_mapping->nrpages == 0)) {
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&sb->s_inode_list_lock);
+		return LRU_SKIP;
+	}
+	__iget(inode);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(lock);
 
-		invalidate_mapping_pages(inode->i_mapping, 0, -1);
-		iput(toput_inode);
-		toput_inode = inode;
+	iput(*toput_inode);
+	*toput_inode = inode;
 
-		spin_lock(&sb->s_inode_list_lock);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
+	invalidate_mapping_pages(inode->i_mapping, 0, -1);
+
+	/*
+	 * Even though we have dropped the lock here, we can return LRU_SKIP as
+	 * we have a reference to the current inode and so its next pointer is
+	 * guaranteed to be valid even though we dropped the list lock.
+	 */
+	spin_lock(lock);
+	return LRU_SKIP;
+}
+
+
+/*
+ * This is a best effort scan, so we don't need to be absolutely sure we hit all
+ * inodes on the superblock. Hence a single pass is sufficient to catch them
+ * all.
+ */
+static void drop_pagecache_sb(struct super_block *sb, void *unused)
+{
+	struct inode *toput_inode = NULL;
+
+	list_lru_walk(&sb->s_inode_list, drop_pagecache_inode, &toput_inode,
+		      ULONG_MAX);
+
+	/* the list walk doesn't release the last inode it sees! */
 	iput(toput_inode);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index b961e5a..17da8801 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -28,8 +28,8 @@ 
  *   inode->i_state, inode->i_hash, __iget()
  * Inode LRU list locks protect:
  *   inode->i_sb->s_inode_lru, inode->i_lru
- * inode->i_sb->s_inode_list_lock protects:
- *   inode->i_sb->s_inodes, inode->i_sb_list
+ * Inode list locks protect:
+ *   inode->i_sb->s_inode_list, inode->i_sb_list
  * bdi->wb.list_lock protects:
  *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
  * inode_hash_lock protects:
@@ -37,7 +37,7 @@ 
  *
  * Lock ordering:
  *
- * inode->i_sb->s_inode_list_lock
+ * Inode list lock
  *   inode->i_lock
  *     Inode LRU list locks
  *
@@ -45,7 +45,7 @@ 
  *   inode->i_lock
  *
  * inode_hash_lock
- *   inode->i_sb->s_inode_list_lock
+ *   Inode list lock
  *   inode->i_lock
  *
  * iunique_lock
@@ -357,6 +357,7 @@  void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_LIST_HEAD(&inode->i_io_list);
 	INIT_LIST_HEAD(&inode->i_wb_list);
+	INIT_LIST_HEAD(&inode->i_sb_list);
 	INIT_LIST_HEAD(&inode->i_lru);
 	address_space_init_once(&inode->i_data);
 	i_size_ordered_init(inode);
@@ -423,19 +424,13 @@  static void inode_lru_list_del(struct inode *inode)
  */
 void inode_sb_list_add(struct inode *inode)
 {
-	spin_lock(&inode->i_sb->s_inode_list_lock);
-	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
-	spin_unlock(&inode->i_sb->s_inode_list_lock);
+	list_lru_add(&inode->i_sb->s_inode_list, &inode->i_sb_list);
 }
 EXPORT_SYMBOL_GPL(inode_sb_list_add);
 
 static inline void inode_sb_list_del(struct inode *inode)
 {
-	if (!list_empty(&inode->i_sb_list)) {
-		spin_lock(&inode->i_sb->s_inode_list_lock);
-		list_del_init(&inode->i_sb_list);
-		spin_unlock(&inode->i_sb->s_inode_list_lock);
-	}
+	list_lru_del(&inode->i_sb->s_inode_list, &inode->i_sb_list);
 }
 
 static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -577,6 +572,50 @@  static void dispose_list(struct list_head *head)
 	}
 }
 
+static enum lru_status
+__evict_inodes_isolate(struct list_head *item, struct list_lru_one *lru,
+		       spinlock_t *lock, void *cb_arg, bool kill_dirty)
+{
+	struct list_head *dispose = cb_arg;
+	struct inode	*inode = container_of(item, struct inode, i_sb_list);
+
+	if (atomic_read(&inode->i_count))
+		return LRU_SKIP;
+
+	spin_lock(&inode->i_lock);
+	if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+		spin_unlock(&inode->i_lock);
+		return LRU_SKIP;
+	}
+
+	if ((inode->i_state & I_DIRTY) && !kill_dirty) {
+		spin_unlock(&inode->i_lock);
+		return LRU_SKIP;
+	}
+
+	inode->i_state |= I_FREEING;
+	inode_lru_list_del(inode);
+	list_add(&inode->i_lru, dispose);
+
+	list_lru_isolate(lru, item);
+	spin_unlock(&inode->i_lock);
+	return LRU_REMOVED;
+}
+
+static enum lru_status
+evict_inodes_isolate(struct list_head *item, struct list_lru_one *lru,
+		     spinlock_t *lock, void *cb_arg)
+{
+	return __evict_inodes_isolate(item, lru, lock, cb_arg, true);
+}
+
+static enum lru_status
+invalidate_inodes_isolate(struct list_head *item, struct list_lru_one *lru,
+			  spinlock_t *lock, void *cb_arg)
+{
+	return __evict_inodes_isolate(item, lru, lock, cb_arg, false);
+}
+
 /**
  * evict_inodes	- evict all evictable inodes for a superblock
  * @sb:		superblock to operate on
@@ -588,28 +627,15 @@  static void dispose_list(struct list_head *head)
  */
 void evict_inodes(struct super_block *sb)
 {
-	struct inode *inode, *next;
-	LIST_HEAD(dispose);
-
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-		if (atomic_read(&inode->i_count))
-			continue;
-
-		spin_lock(&inode->i_lock);
-		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
+	long freed;
 
-		inode->i_state |= I_FREEING;
-		inode_lru_list_del(inode);
-		spin_unlock(&inode->i_lock);
-		list_add(&inode->i_lru, &dispose);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
+	do {
+		LIST_HEAD(dispose);
 
-	dispose_list(&dispose);
+		freed = list_lru_walk(&sb->s_inode_list, evict_inodes_isolate,
+				      &dispose, ULONG_MAX);
+		dispose_list(&dispose);
+	} while (freed > 0);
 }
 
 /**
@@ -624,38 +650,24 @@  void evict_inodes(struct super_block *sb)
  */
 int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 {
-	int busy = 0;
-	struct inode *inode, *next;
-	LIST_HEAD(dispose);
+	list_lru_walk_cb isolate;
+	long freed;
 
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-		spin_lock(&inode->i_lock);
-		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
-			spin_unlock(&inode->i_lock);
-			busy = 1;
-			continue;
-		}
-		if (atomic_read(&inode->i_count)) {
-			spin_unlock(&inode->i_lock);
-			busy = 1;
-			continue;
-		}
+	isolate = kill_dirty ? evict_inodes_isolate : invalidate_inodes_isolate;
 
-		inode->i_state |= I_FREEING;
-		inode_lru_list_del(inode);
-		spin_unlock(&inode->i_lock);
-		list_add(&inode->i_lru, &dispose);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
+	do {
+		LIST_HEAD(dispose);
 
-	dispose_list(&dispose);
+		freed = list_lru_walk(&sb->s_inode_list, isolate,
+				      &dispose, ULONG_MAX);
+		dispose_list(&dispose);
+	} while (freed > 0);
 
-	return busy;
+	/*
+	 * if we skipped any inodes because we couldn't isolate them, tell the
+	 * caller there are still active inodes.
+	 */
+	return !!list_lru_count(&sb->s_inode_list);
 }
 
 /*
@@ -849,7 +861,7 @@  EXPORT_SYMBOL(get_next_ino);
  *	@sb: superblock
  *
  *	Allocates a new inode for given superblock.
- *	Inode wont be chained in superblock s_inodes list
+ *	Inode won't be chained in the superblock s_inode_list
  *	This means :
  *	- fs can't be unmount
  *	- quotas, fsnotify, writeback can't work
@@ -883,8 +895,6 @@  struct inode *new_inode(struct super_block *sb)
 {
 	struct inode *inode;
 
-	spin_lock_prefetch(&sb->s_inode_list_lock);
-
 	inode = new_inode_pseudo(sb);
 	if (inode)
 		inode_sb_list_add(inode);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index a4e1a8f..a0cdc66 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -161,87 +161,60 @@  int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 	return ret;
 }
 
-/**
- * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
- * @sb: superblock being unmounted.
- *
- * Called during unmount with no locks held, so needs to be safe against
- * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
- */
-void fsnotify_unmount_inodes(struct super_block *sb)
-{
-	struct inode *inode, *next_i, *need_iput = NULL;
-
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) {
-		struct inode *need_iput_tmp;
-
-		/*
-		 * We cannot __iget() an inode in state I_FREEING,
-		 * I_WILL_FREE, or I_NEW which is fine because by that point
-		 * the inode cannot have any associated watches.
-		 */
-		spin_lock(&inode->i_lock);
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
+static enum lru_status
+fsnotify_unmount_inode(struct list_head *item, struct list_lru_one *lru,
+		       spinlock_t *lock, void *cb_arg)
+{
+	struct inode **toput_inode = cb_arg;
+	struct inode *inode = container_of(item, struct inode, i_sb_list);
+
+	/* New or being freed inodes cannot have any associated watches. */
+	spin_lock(&inode->i_lock);
+	if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+		spin_unlock(&inode->i_lock);
+		return LRU_SKIP;
+	}
 
-		/*
-		 * If i_count is zero, the inode cannot have any watches and
-		 * doing an __iget/iput with MS_ACTIVE clear would actually
-		 * evict all inodes with zero i_count from icache which is
-		 * unnecessarily violent and may in fact be illegal to do.
-		 */
-		if (!atomic_read(&inode->i_count)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-
-		need_iput_tmp = need_iput;
-		need_iput = NULL;
-
-		/* In case fsnotify_inode_delete() drops a reference. */
-		if (inode != need_iput_tmp)
-			__iget(inode);
-		else
-			need_iput_tmp = NULL;
+	/* If i_count is zero, the inode cannot have any watches */
+	if (!atomic_read(&inode->i_count)) {
 		spin_unlock(&inode->i_lock);
+		return LRU_SKIP;
+	}
 
-		/* In case the dropping of a reference would nuke next_i. */
-		while (&next_i->i_sb_list != &sb->s_inodes) {
-			spin_lock(&next_i->i_lock);
-			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
-						atomic_read(&next_i->i_count)) {
-				__iget(next_i);
-				need_iput = next_i;
-				spin_unlock(&next_i->i_lock);
-				break;
-			}
-			spin_unlock(&next_i->i_lock);
-			next_i = list_entry(next_i->i_sb_list.next,
-						struct inode, i_sb_list);
-		}
+	__iget(inode);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(lock);
 
-		/*
-		 * We can safely drop s_inode_list_lock here because either
-		 * we actually hold references on both inode and next_i or
-		 * end of list.  Also no new inodes will be added since the
-		 * umount has begun.
-		 */
-		spin_unlock(&sb->s_inode_list_lock);
+	iput(*toput_inode);
+	*toput_inode = inode;
 
-		if (need_iput_tmp)
-			iput(need_iput_tmp);
+	/* for each watch, send FS_UNMOUNT and then remove it */
+	fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+	fsnotify_inode_delete(inode);
 
-		/* for each watch, send FS_UNMOUNT and then remove it */
-		fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+	/*
+	 * Even though we have dropped the lock here, we can return LRU_SKIP as
+	 * we have a reference to the current inode and so its next pointer is
+	 * guaranteed to be valid even though we dropped the list lock.
+	 */
+	spin_lock(lock);
+	return LRU_SKIP;
+}
 
-		fsnotify_inode_delete(inode);
+/**
+ * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
+ * @sb: superblock being unmounted.
+ *
+ * Called during unmount with sb->s_umount held exclusively, so the inode
+ * list will not grow and a single pass will catch all inodes.
+ */
+void fsnotify_unmount_inodes(struct super_block *sb)
+{
+	struct inode *toput_inode = NULL;
 
-		iput(inode);
+	list_lru_walk(&sb->s_inode_list, fsnotify_unmount_inode, &toput_inode,
+		      ULONG_MAX);
 
-		spin_lock(&sb->s_inode_list_lock);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
+	/* the list walk doesn't release the last inode it sees! */
+	iput(toput_inode);
 }
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 68c7ae3..20c97c7 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -912,55 +912,81 @@  static int dqinit_needed(struct inode *inode, int type)
 	return 0;
 }
 
-/* This routine is guarded by dqonoff_mutex mutex */
-static void add_dquot_ref(struct super_block *sb, int type)
+static enum lru_status
+add_dquot_ref_type(struct list_head *item, struct list_lru_one *lru,
+		   spinlock_t *lock, void *cb_arg, int type)
 {
-	struct inode *inode, *old_inode = NULL;
-#ifdef CONFIG_QUOTA_DEBUG
-	int reserved = 0;
-#endif
+	struct inode **toput_inode = cb_arg;
+	struct inode *inode = container_of(item, struct inode, i_sb_list);
 
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		spin_lock(&inode->i_lock);
-		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
-		    !atomic_read(&inode->i_writecount) ||
-		    !dqinit_needed(inode, type)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		__iget(inode);
+	spin_lock(&inode->i_lock);
+	if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+	    !atomic_read(&inode->i_writecount) ||
+	    !dqinit_needed(inode, type)) {
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&sb->s_inode_list_lock);
+		return LRU_SKIP;
+	}
+
+	__iget(inode);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(lock);
 
 #ifdef CONFIG_QUOTA_DEBUG
-		if (unlikely(inode_get_rsv_space(inode) > 0))
-			reserved = 1;
+	if (unlikely(inode_get_rsv_space(inode) > 0))
+		quota_error(inode->i_sb, "Writes happened before quota was "
+			    "turned on thus quota information is probably "
+			    "inconsistent. Please run quotacheck(8)");
 #endif
-		iput(old_inode);
-		__dquot_initialize(inode, type);
 
-		/*
-		 * We hold a reference to 'inode' so it couldn't have been
-		 * removed from s_inodes list while we dropped the
-		 * s_inode_list_lock. We cannot iput the inode now as we can be
-		 * holding the last reference and we cannot iput it under
-		 * s_inode_list_lock. So we keep the reference and iput it
-		 * later.
-		 */
-		old_inode = inode;
-		spin_lock(&sb->s_inode_list_lock);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
-	iput(old_inode);
+	iput(*toput_inode);
+	*toput_inode = inode;
 
-#ifdef CONFIG_QUOTA_DEBUG
-	if (reserved) {
-		quota_error(sb, "Writes happened before quota was turned on "
-			"thus quota information is probably inconsistent. "
-			"Please run quotacheck(8)");
+	__dquot_initialize(inode, type);
+
+	/*
+	 * Even though we have dropped the lock here, we can return LRU_SKIP as
+	 * we have a reference to the current inode and so its next pointer is
+	 * guaranteed to be valid even though we dropped the list lock.
+	 */
+	spin_lock(lock);
+	return LRU_SKIP;
+}
+
+static enum lru_status
+add_dquot_ref_usr(struct list_head *item, struct list_lru_one *lru,
+		  spinlock_t *lock, void *cb_arg)
+{
+	return add_dquot_ref_type(item, lru, lock, cb_arg, USRQUOTA);
+}
+
+static enum lru_status
+add_dquot_ref_grp(struct list_head *item, struct list_lru_one *lru,
+		  spinlock_t *lock, void *cb_arg)
+{
+	return add_dquot_ref_type(item, lru, lock, cb_arg, GRPQUOTA);
+}
+
+/* add_dquot_ref is protected by the dqonoff_mutex mutex */
+void add_dquot_ref(struct super_block *sb, int type)
+{
+	struct inode *toput_inode = NULL;
+	list_lru_walk_cb isolate;
+
+	switch (type) {
+	case USRQUOTA:
+		isolate = add_dquot_ref_usr;
+		break;
+	case GRPQUOTA:
+		isolate = add_dquot_ref_grp;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return;
 	}
-#endif
+	list_lru_walk(&sb->s_inode_list, isolate, &toput_inode, ULONG_MAX);
+
+	/* the list walk doesn't release the last inode it sees! */
+	iput(toput_inode);
 }
 
 /*
@@ -1013,36 +1039,67 @@  static void put_dquot_list(struct list_head *tofree_head)
 	}
 }
 
-static void remove_dquot_ref(struct super_block *sb, int type,
-		struct list_head *tofree_head)
+static enum lru_status
+remove_dquot_ref_type(struct list_head *item, struct list_lru_one *lru,
+		      spinlock_t *lock, void *cb_arg, int type)
 {
-	struct inode *inode;
-	int reserved = 0;
+	struct list_head *tofree_head = cb_arg;
+	struct inode *inode = container_of(item, struct inode, i_sb_list);
 
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		/*
-		 *  We have to scan also I_NEW inodes because they can already
-		 *  have quota pointer initialized. Luckily, we need to touch
-		 *  only quota pointers and these have separate locking
-		 *  (dq_data_lock).
-		 */
-		spin_lock(&dq_data_lock);
-		if (!IS_NOQUOTA(inode)) {
-			if (unlikely(inode_get_rsv_space(inode) > 0))
-				reserved = 1;
-			remove_inode_dquot_ref(inode, type, tofree_head);
-		}
-		spin_unlock(&dq_data_lock);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
+	/*
+	 *  We have to scan also I_NEW inodes because they can already
+	 *  have quota pointer initialized. Luckily, we need to touch
+	 *  only quota pointers and these have separate locking
+	 *  (dq_data_lock).
+	 */
+	spin_lock(&dq_data_lock);
+	if (!IS_NOQUOTA(inode)) {
 #ifdef CONFIG_QUOTA_DEBUG
-	if (reserved) {
-		printk(KERN_WARNING "VFS (%s): Writes happened after quota"
+		if (unlikely(inode_get_rsv_space(inode) > 0)) {
+			printk_ratelimited(KERN_WARNING
+			"VFS (%s): Writes happened after quota"
 			" was disabled thus quota information is probably "
-			"inconsistent. Please run quotacheck(8).\n", sb->s_id);
-	}
+			"inconsistent. Please run quotacheck(8).\n",
+			inode->i_sb->s_id);
+		}
 #endif
+		remove_inode_dquot_ref(inode, type, tofree_head);
+	}
+	spin_unlock(&dq_data_lock);
+	return LRU_SKIP;
+}
+
+static enum lru_status
+remove_dquot_ref_usr(struct list_head *item, struct list_lru_one *lru,
+		     spinlock_t *lock, void *cb_arg)
+{
+	return remove_dquot_ref_type(item, lru, lock, cb_arg, USRQUOTA);
+}
+
+static enum lru_status
+remove_dquot_ref_grp(struct list_head *item, struct list_lru_one *lru,
+		     spinlock_t *lock, void *cb_arg)
+{
+	return remove_dquot_ref_type(item, lru, lock, cb_arg, GRPQUOTA);
+}
+
+static void remove_dquot_ref(struct super_block *sb, int type,
+		struct list_head *tofree_head)
+{
+	list_lru_walk_cb isolate;
+
+	switch (type) {
+	case USRQUOTA:
+		isolate = remove_dquot_ref_usr;
+		break;
+	case GRPQUOTA:
+		isolate = remove_dquot_ref_grp;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return;
+	}
+	list_lru_walk(&sb->s_inode_list, isolate, tofree_head, ULONG_MAX);
 }
 
 /* Gather all references from inodes and drop them */
diff --git a/fs/super.c b/fs/super.c
index 6a05d94..721584a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -146,6 +146,7 @@  static void destroy_super(struct super_block *s)
 	int i;
 	list_lru_destroy(&s->s_dentry_lru);
 	list_lru_destroy(&s->s_inode_lru);
+	list_lru_destroy(&s->s_inode_list);
 	for (i = 0; i < SB_FREEZE_LEVELS; i++)
 		percpu_counter_destroy(&s->s_writers.counter[i]);
 	security_sb_free(s);
@@ -191,13 +192,13 @@  static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
 	mutex_init(&s->s_sync_lock);
-	INIT_LIST_HEAD(&s->s_inodes);
-	spin_lock_init(&s->s_inode_list_lock);
 
 	if (list_lru_init_memcg(&s->s_dentry_lru))
 		goto fail;
 	if (list_lru_init_memcg(&s->s_inode_lru))
 		goto fail;
+	if (list_lru_init(&s->s_inode_list))
+		goto fail;
 
 	init_rwsem(&s->s_umount);
 	lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -294,6 +295,7 @@  void deactivate_locked_super(struct super_block *s)
 		 */
 		list_lru_destroy(&s->s_dentry_lru);
 		list_lru_destroy(&s->s_inode_lru);
+		list_lru_destroy(&s->s_inode_list);
 
 		put_filesystem(fs);
 		put_super(s);
@@ -413,7 +415,7 @@  void generic_shutdown_super(struct super_block *sb)
 		if (sop->put_super)
 			sop->put_super(sb);
 
-		if (!list_empty(&sb->s_inodes)) {
+		if (list_lru_count(&sb->s_inode_list)) {
 			printk("VFS: Busy inodes after unmount of %s. "
 			   "Self-destruct in 5 seconds.  Have a nice day...\n",
 			   sb->s_id);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 66284fe..964aba3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1319,9 +1319,12 @@  struct super_block {
 	 */
 	int s_stack_depth;
 
-	/* s_inode_list_lock protects s_inodes */
-	spinlock_t		s_inode_list_lock ____cacheline_aligned_in_smp;
-	struct list_head	s_inodes;	/* all inodes */
+	/*
+	 * The inode list is not strictly used as an LRU, but uses the list_lru
+	 * construct to provide a scalable list implementation for adding,
+	 * removing and walking the inodes cached in memory.
+	 */
+	struct list_lru		s_inode_list ____cacheline_aligned_in_smp;
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);