
[23/24] xfs: reclaim inodes from the LRU

Message ID 20190801021752.4986-24-david@fromorbit.com (mailing list archive)
State New, archived
Series mm, xfs: non-blocking inode reclaim

Commit Message

Dave Chinner Aug. 1, 2019, 2:17 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Replace the AG radix tree walking reclaim code with a list_lru
walker, giving us both node-aware and memcg-aware inode reclaim
at the XFS level. This requires adding an inode isolation function to
determine if the inode can be reclaimed, and a list walker to
dispose of the inodes that were isolated.

We want the isolation function to be non-blocking. If we can't
grab an inode then we either skip it or rotate it. If it's clean
then we skip it; if it's dirty then we rotate it to give it time to be
cleaned before it is scanned again.

This congregates the dirty inodes at the tail of the LRU, which
means that if we start hitting a majority of dirty inodes either
there are lots of unlinked inodes in the reclaim list or we've
reclaimed all the clean inodes and we're looped back on the dirty
inodes. Either way, this is an indication we should tell kswapd to
back off.

The non-blocking isolation function introduces a complexity for the
filesystem shutdown case. When the filesystem is shut down, we want
to free the inode even if it is dirty, and this may require
blocking. We already hold the locks needed to do this blocking, so
what we do is leave inodes locked - both the ILOCK and the
flush lock - while they are sitting on the dispose list to be freed
after the LRU walk completes.  This allows us to process the
shutdown state outside the LRU walk where we can block safely.

Keep in mind we don't have to care about inode lock order or
blocking with inode locks held here because a) we are using
trylocks, and b) once marked with XFS_IRECLAIM they can't be found
via the LRU and inode cache lookups will abort and retry. Hence
nobody will try to lock them in any other context that might also be
holding other inode locks.

Also convert xfs_reclaim_inodes() to use an LRU walk to free all
the reclaimable inodes in the filesystem.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_icache.c | 199 ++++++++++++++++++++++++++++++++++++++------
 fs/xfs/xfs_icache.h |  10 ++-
 fs/xfs/xfs_inode.h  |   8 ++
 fs/xfs/xfs_super.c  |  50 +++++++++--
 4 files changed, 232 insertions(+), 35 deletions(-)

Comments

Brian Foster Aug. 8, 2019, 4:39 p.m. UTC | #1
On Thu, Aug 01, 2019 at 12:17:51PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Replace the AG radix tree walking reclaim code with a list_lru
> walker, giving us both node-aware and memcg-aware inode reclaim
> at the XFS level. This requires adding an inode isolation function to
> determine if the inode can be reclaim, and a list walker to
> dispose of the inodes that were isolated.
> 
> We want the isolation function to be non-blocking. If we can't
> grab an inode then we either skip it or rotate it. If it's clean
> then we skip it, if it's dirty then we rotate to give it time to be

Do you mean we remove it if it's clean?

> cleaned before it is scanned again.
> 
> This congregates the dirty inodes at the tail of the LRU, which
> means that if we start hitting a majority of dirty inodes either
> there are lots of unlinked inodes in the reclaim list or we've
> reclaimed all the clean inodes and we're looped back on the dirty
> inodes. Either way, this is an indication we should tell kswapd to
> back off.
> 
> The non-blocking isolation function introduces a complexity for the
> filesystem shutdown case. When the filesystem is shut down, we want
> to free the inode even if it is dirty, and this may require
> blocking. We already hold the locks needed to do this blocking, so
> what we do is that we leave inodes locked - both the ILOCK and the
> flush lock - while they are sitting on the dispose list to be freed
> after the LRU walk completes.  This allows us to process the
> shutdown state outside the LRU walk where we can block safely.
> 
> Keep in mind we don't have to care about inode lock order or
> blocking with inode locks held here because a) we are using
> trylocks, and b) once marked with XFS_IRECLAIM they can't be found
> via the LRU and inode cache lookups will abort and retry. Hence
> nobody will try to lock them in any other context that might also be
> holding other inode locks.
> 
> Also convert xfs_reclaim_inodes() to use a LRU walk to free all
> the reclaimable inodes in the filesystem.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_icache.c | 199 ++++++++++++++++++++++++++++++++++++++------
>  fs/xfs/xfs_icache.h |  10 ++-
>  fs/xfs/xfs_inode.h  |   8 ++
>  fs/xfs/xfs_super.c  |  50 +++++++++--
>  4 files changed, 232 insertions(+), 35 deletions(-)
> 
...
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index b5c4c1b6fd19..e3e898a2896c 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
...
> @@ -1810,23 +1811,58 @@ xfs_fs_mount(
>  }
>  
>  static long
> -xfs_fs_nr_cached_objects(
> +xfs_fs_free_cached_objects(
>  	struct super_block	*sb,
>  	struct shrink_control	*sc)
>  {
> -	/* Paranoia: catch incorrect calls during mount setup or teardown */
> -	if (WARN_ON_ONCE(!sb->s_fs_info))
> -		return 0;
> +	struct xfs_mount	*mp = XFS_M(sb);
> +        struct xfs_ireclaim_args ra;

^ whitespace damage

> +	long freed;
>  
> -	return list_lru_shrink_count(&XFS_M(sb)->m_inode_lru, sc);
> +	INIT_LIST_HEAD(&ra.freeable);
> +	ra.lowest_lsn = NULLCOMMITLSN;
> +	ra.dirty_skipped = 0;
> +
> +	freed = list_lru_shrink_walk(&mp->m_inode_lru, sc,
> +					xfs_inode_reclaim_isolate, &ra);

This is more related to the locking discussion on the earlier patch, but
this looks like it has more similar serialization to the example patch I
posted than the one without locking at all. IIUC, this walk has an
internal lock per node lru that is held across the walk and passed into
the callback. We never cycle it, so for any given node we only allow one
reclaimer through here at a time.

That seems to be Ok given we don't do much in the isolation handler, the
lock isn't held across the dispose sequence and we're still batching in
the shrinker core on top of that. We're still serialized over the lru
fixups such that concurrent reclaimers aren't processing the same
inodes, however.
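
For reference, here's roughly what I understand the per-node walk to do
(paraphrased and simplified from __list_lru_walk_one() in mm/list_lru.c,
not the literal upstream code):

/*
 * Simplified sketch of the per-node LRU walk: the nlru->lock is taken
 * once and held across the whole batch of isolate callbacks unless a
 * callback drops it and returns LRU_RETRY (or LRU_REMOVED_RETRY), in
 * which case the traversal restarts.
 */
static unsigned long
lru_walk_one_sketch(
	struct list_lru_node	*nlru,
	struct list_lru_one	*l,
	list_lru_walk_cb	isolate,
	void			*cb_arg,
	unsigned long		*nr_to_walk)
{
	struct list_head	*item, *n;
	unsigned long		isolated = 0;

	spin_lock(&nlru->lock);
restart:
	list_for_each_safe(item, n, &l->list) {
		if (!*nr_to_walk)
			break;
		--*nr_to_walk;

		switch (isolate(item, l, &nlru->lock, cb_arg)) {
		case LRU_REMOVED:
			isolated++;
			nlru->nr_items--;
			break;
		case LRU_ROTATE:
			/* dirty inodes end up here - move to the MRU end */
			list_move_tail(item, &l->list);
			break;
		case LRU_SKIP:
			break;
		case LRU_RETRY:
			/* callback cycled nlru->lock, traversal is stale */
			goto restart;
		default:
			break;
		}
	}
	spin_unlock(&nlru->lock);
	return isolated;
}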

BTW I got a lockdep splat[1] for some reason on a straight mount/unmount
cycle with this patch.

Brian

[1] dmesg output:

[   39.011867] ============================================
[   39.012643] WARNING: possible recursive locking detected
[   39.013422] 5.3.0-rc2+ #205 Not tainted
[   39.014623] --------------------------------------------
[   39.015636] umount/871 is trying to acquire lock:
[   39.016122] 00000000ea09de26 (&xfs_nondir_ilock_class){+.+.}, at: xfs_ilock+0xd2/0x280 [xfs]
[   39.017072] 
[   39.017072] but task is already holding lock:
[   39.017832] 000000001a5b5707 (&xfs_nondir_ilock_class){+.+.}, at: xfs_ilock_nowait+0xcb/0x320 [xfs]
[   39.018909] 
[   39.018909] other info that might help us debug this:
[   39.019570]  Possible unsafe locking scenario:
[   39.019570] 
[   39.020248]        CPU0
[   39.020512]        ----
[   39.020778]   lock(&xfs_nondir_ilock_class);
[   39.021246]   lock(&xfs_nondir_ilock_class);
[   39.021705] 
[   39.021705]  *** DEADLOCK ***
[   39.021705] 
[   39.022338]  May be due to missing lock nesting notation
[   39.022338] 
[   39.023070] 3 locks held by umount/871:
[   39.023481]  #0: 000000004d39d244 (&type->s_umount_key#61){+.+.}, at: deactivate_super+0x43/0x50
[   39.024462]  #1: 0000000011270366 (&xfs_dir_ilock_class){++++}, at: xfs_ilock_nowait+0xcb/0x320 [xfs]
[   39.025488]  #2: 000000001a5b5707 (&xfs_nondir_ilock_class){+.+.}, at: xfs_ilock_nowait+0xcb/0x320 [xfs]
[   39.027163] 
[   39.027163] stack backtrace:
[   39.027681] CPU: 3 PID: 871 Comm: umount Not tainted 5.3.0-rc2+ #205
[   39.028534] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[   39.029152] Call Trace:
[   39.029428]  dump_stack+0x67/0x90
[   39.029889]  __lock_acquire.cold.67+0x121/0x1f9
[   39.030519]  lock_acquire+0x90/0x170
[   39.031170]  ? xfs_ilock+0xd2/0x280 [xfs]
[   39.031603]  down_write_nested+0x4f/0xb0
[   39.032064]  ? xfs_ilock+0xd2/0x280 [xfs]
[   39.032684]  ? xfs_dispose_inodes+0x124/0x320 [xfs]
[   39.033575]  xfs_ilock+0xd2/0x280 [xfs]
[   39.034058]  xfs_dispose_inodes+0x124/0x320 [xfs]
[   39.034656]  xfs_reclaim_inodes+0x149/0x190 [xfs]
[   39.035381]  ? finish_wait+0x80/0x80
[   39.035855]  xfs_unmountfs+0x81/0x190 [xfs]
[   39.036443]  xfs_fs_put_super+0x35/0x90 [xfs]
[   39.037016]  generic_shutdown_super+0x6c/0x100
[   39.037554]  kill_block_super+0x21/0x50
[   39.037986]  deactivate_locked_super+0x34/0x70
[   39.038477]  cleanup_mnt+0xb8/0x140
[   39.038879]  task_work_run+0x9e/0xd0
[   39.039302]  exit_to_usermode_loop+0xb3/0xc0
[   39.039774]  do_syscall_64+0x206/0x210
[   39.040591]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   39.041771] RIP: 0033:0x7fcd4ec0536b
[   39.042627] Code: 0b 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 90 f3 0f 1e fa 31 f6 e9 05 00 00 00 0f 1f 44 00 00 f3 0f 1e fa b8 a6 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ed 0a 0c 00 f7 d8 64 89 01 48
[   39.045336] RSP: 002b:00007ffdedf686c8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
[   39.046119] RAX: 0000000000000000 RBX: 00007fcd4ed2f1e4 RCX: 00007fcd4ec0536b
[   39.047506] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000055ad23f2ad90
[   39.048295] RBP: 000055ad23f2ab80 R08: 0000000000000000 R09: 00007ffdedf67440
[   39.049062] R10: 000055ad23f2adb0 R11: 0000000000000246 R12: 000055ad23f2ad90
[   39.049869] R13: 0000000000000000 R14: 000055ad23f2ac78 R15: 0000000000000000
Dave Chinner Aug. 9, 2019, 1:20 a.m. UTC | #2
On Thu, Aug 08, 2019 at 12:39:05PM -0400, Brian Foster wrote:
> On Thu, Aug 01, 2019 at 12:17:51PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Replace the AG radix tree walking reclaim code with a list_lru
> > walker, giving us both node-aware and memcg-aware inode reclaim
> > at the XFS level. This requires adding an inode isolation function to
> > determine if the inode can be reclaim, and a list walker to
> > dispose of the inodes that were isolated.
> > 
> > We want the isolation function to be non-blocking. If we can't
> > grab an inode then we either skip it or rotate it. If it's clean
> > then we skip it, if it's dirty then we rotate to give it time to be
> 
> Do you mean we remove it if it's clean?

No, I mean if we can't grab it and it's clean, then we just skip it,
leaving it at the head of the LRU for the next scanner to
immediately try to reclaim it. If it's dirty, we rotate it so that
time passes before we try to reclaim it again in the hope that it is
already clean by the time we've scanned through the entire LRU...
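
In isolate-callback terms, that decision boils down to something like
this condensed sketch (the dirty-check helper is an illustrative
stand-in, not a real XFS function - see xfs_inode_reclaim_isolate() in
the patch for the real logic):

/*
 * Condensed sketch of the isolate decisions. xfs_inode_is_dirty_sketch()
 * stands in for the pin count/inode log item checks.
 */
static enum lru_status
xfs_reclaim_isolate_sketch(
	struct xfs_inode	*ip)
{
	/* can't even look at it safely: skip, the next scanner retries */
	if (!spin_trylock(&ip->i_flags_lock))
		return LRU_SKIP;

	if (xfs_inode_is_dirty_sketch(ip)) {
		/* rotate so it has time to be cleaned before the next scan */
		spin_unlock(&ip->i_flags_lock);
		return LRU_ROTATE;
	}

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
		/* clean but contended: skip, it'll be reclaimed next pass */
		spin_unlock(&ip->i_flags_lock);
		return LRU_SKIP;
	}

	/* clean and locked: mark XFS_IRECLAIM and move to the dispose list */
	return LRU_REMOVED;
}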

> > +++ b/fs/xfs/xfs_super.c
> ...
> > @@ -1810,23 +1811,58 @@ xfs_fs_mount(
> >  }
> >  
> >  static long
> > -xfs_fs_nr_cached_objects(
> > +xfs_fs_free_cached_objects(
> >  	struct super_block	*sb,
> >  	struct shrink_control	*sc)
> >  {
> > -	/* Paranoia: catch incorrect calls during mount setup or teardown */
> > -	if (WARN_ON_ONCE(!sb->s_fs_info))
> > -		return 0;
> > +	struct xfs_mount	*mp = XFS_M(sb);
> > +        struct xfs_ireclaim_args ra;
> 
> ^ whitespace damage

Already fixed.

> > +	long freed;
> >  
> > -	return list_lru_shrink_count(&XFS_M(sb)->m_inode_lru, sc);
> > +	INIT_LIST_HEAD(&ra.freeable);
> > +	ra.lowest_lsn = NULLCOMMITLSN;
> > +	ra.dirty_skipped = 0;
> > +
> > +	freed = list_lru_shrink_walk(&mp->m_inode_lru, sc,
> > +					xfs_inode_reclaim_isolate, &ra);
> 
> This is more related to the locking discussion on the earlier patch, but
> this looks like it has more similar serialization to the example patch I
> posted than the one without locking at all. IIUC, this walk has an
> internal lock per node lru that is held across the walk and passed into
> the callback. We never cycle it, so for any given node we only allow one
> reclaimer through here at a time.

That's not a guarantee that list_lru gives us. It could drop its
internal lock at any time during that walk and we would be
blissfully unaware that it has done this. And at that point, the
reclaim context is completely unaware that other reclaim contexts
may be scanning the same LRU at the same time and are interleaving
with it.

And, really, that does not matter one iota. If multiple scanners are
interleaving, the reclaim traversal order and the decisions made are
no different from what a single reclaimer does.  i.e. we just don't
have to care if reclaim contexts interleave or not, because they
will not repeat work that has already been done unnecessarily.
That's one of the reasons for moving to IO-less LRU ordered reclaim
- it removes all the gross hacks we've had to implement to guarantee
reclaim scanning progress in one nice neat package of generic
infrastructure.

> That seems to be Ok given we don't do much in the isolation handler, the
> lock isn't held across the dispose sequence and we're still batching in
> the shrinker core on top of that. We're still serialized over the lru
> fixups such that concurrent reclaimers aren't processing the same
> inodes, however.

The only thing that we may need here is need_resched() checks if it
turns out that holding a lock for 1024 items to be scanned proves to
be too long to hold on to a single CPU. If we do that we'd cycle the
LRU lock and return RETRY or RETRY_REMOVE, hence enabling
finer-grained interleaving between reclaimers....
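
Something like this at the top of the isolate callback would do it.
Sketch only, not in the posted patch; the actual enum values are
LRU_RETRY and LRU_REMOVED_RETRY, and the list_lru contract is that the
lru lock must be held again when we return:

static enum lru_status
xfs_inode_reclaim_isolate_sketch(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)
{
	if (need_resched()) {
		spin_unlock(lru_lock);
		cond_resched();
		spin_lock(lru_lock);
		/* we cycled the lock, so the walker's traversal is stale */
		return LRU_RETRY;
	}

	/* ... normal isolation logic as in xfs_inode_reclaim_isolate() ... */
	return LRU_SKIP;
}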

> BTW I got a lockdep splat[1] for some reason on a straight mount/unmount
> cycle with this patch.
....
> [   39.030519]  lock_acquire+0x90/0x170
> [   39.031170]  ? xfs_ilock+0xd2/0x280 [xfs]
> [   39.031603]  down_write_nested+0x4f/0xb0
> [   39.032064]  ? xfs_ilock+0xd2/0x280 [xfs]
> [   39.032684]  ? xfs_dispose_inodes+0x124/0x320 [xfs]
> [   39.033575]  xfs_ilock+0xd2/0x280 [xfs]
> [   39.034058]  xfs_dispose_inodes+0x124/0x320 [xfs]

False positive, AFAICT. It's complaining about the final xfs_ilock()
call we do before freeing the inode because we have other inodes
locked. I don't think this can deadlock because the inodes under
reclaim should not be usable by anyone else at this point because
they have the I_RECLAIM flag set.

I did notice this - I added an XXX comment to the case being
complained about to note that I need to resolve this locking issue.

+        * Here we do an (almost) spurious inode lock in order to coordinate
+        * with inode cache radix tree lookups.  This is because the lookup
+        * can reference the inodes in the cache without taking references.
+        *
+        * We make that OK here by ensuring that we wait until the inode is
+        * unlocked after the lookup before we go ahead and free it. 
+        *
+        * XXX: need to check this is still true. Not sure it is.
         */

I added that last line in this patch. In more detail....

The comment is suggesting that we need to take the ILOCK to
co-ordinate with RCU protected lookups in progress before we RCU
free the inode. That's what RCU is supposed to do, so I'm not at all
sure what this is actually serialising against any more.

i.e. any racing radix tree lookup from this point in time is going
to see the XFS_IRECLAIM flag and ip->i_ino == 0 while under the
rcu_read_lock, and they will go try again after dropping all lock
context and waiting for a bit. The inode may remain visible until
the next rcu grace period expires, but all lookups will abort long
before they get anywhere near the ILOCK. And once the RCU grace
period expires, lookups will be locked out by the rcu_read_lock(),
the radix tree moves to a state where the removal of the inode is
guaranteed visible to all CPUs, and then the object is freed.
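
The lookup side looks something like this (paraphrased from
xfs_iget_cache_hit() and heavily simplified - not the literal code):

/*
 * The point is that a racing lookup bails out under i_flags_lock long
 * before it would take the ILOCK on an inode we've marked XFS_IRECLAIM.
 */
static int
xfs_iget_hit_check_sketch(
	struct xfs_perag	*pag,
	xfs_ino_t		ino,
	xfs_agino_t		agino)
{
	struct xfs_inode	*ip;

	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
	if (!ip) {
		rcu_read_unlock();
		return -ENOENT;		/* cache miss */
	}

	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino ||		/* reclaimed and zeroed */
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		return -EAGAIN;		/* drop locks, delay, retry lookup */
	}

	/* ... the normal "grab the inode" path continues from here ... */
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return 0;
}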

So the ILOCK should have no part in lookup serialisation, and I need
to go look at the history of the code to determine where and why
this was added, and whether the condition it protects against is
still a valid concern or not....

Cheers,

Dave.
Brian Foster Aug. 9, 2019, 12:36 p.m. UTC | #3
On Fri, Aug 09, 2019 at 11:20:22AM +1000, Dave Chinner wrote:
> On Thu, Aug 08, 2019 at 12:39:05PM -0400, Brian Foster wrote:
> > On Thu, Aug 01, 2019 at 12:17:51PM +1000, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > Replace the AG radix tree walking reclaim code with a list_lru
> > > walker, giving us both node-aware and memcg-aware inode reclaim
> > > at the XFS level. This requires adding an inode isolation function to
> > > determine if the inode can be reclaim, and a list walker to
> > > dispose of the inodes that were isolated.
> > > 
> > > We want the isolation function to be non-blocking. If we can't
> > > grab an inode then we either skip it or rotate it. If it's clean
> > > then we skip it, if it's dirty then we rotate to give it time to be
> > 
> > Do you mean we remove it if it's clean?
> 
> No, I mean if we can't grab it and it's clean, then we just skip it,
> leaving it at the head of the LRU for the next scanner to
> immediately try to reclaim it. If it's dirty, we rotate it so that
> time passes before we try to reclaim it again in the hope that it is
> already clean by the time we've scanned through the entire LRU...
> 

Ah, Ok. That could probably be worded more explicitly. E.g.:

"If we can't grab an inode, we skip it if it is clean or rotate it if
dirty. Dirty inode rotation gives the inode time to be cleaned before
it's scanned again. ..."

> > > +++ b/fs/xfs/xfs_super.c
> > ...
> > > @@ -1810,23 +1811,58 @@ xfs_fs_mount(
...
> > > +	long freed;
> > >  
> > > -	return list_lru_shrink_count(&XFS_M(sb)->m_inode_lru, sc);
> > > +	INIT_LIST_HEAD(&ra.freeable);
> > > +	ra.lowest_lsn = NULLCOMMITLSN;
> > > +	ra.dirty_skipped = 0;
> > > +
> > > +	freed = list_lru_shrink_walk(&mp->m_inode_lru, sc,
> > > +					xfs_inode_reclaim_isolate, &ra);
> > 
> > This is more related to the locking discussion on the earlier patch, but
> > this looks like it has more similar serialization to the example patch I
> > posted than the one without locking at all. IIUC, this walk has an
> > internal lock per node lru that is held across the walk and passed into
> > the callback. We never cycle it, so for any given node we only allow one
> > reclaimer through here at a time.
> 
> That's not a guarantee that list_lru gives us. It could drop it's
> internal lock at any time during that walk and we would be
> blissfully unaware that it has done this. And at that point, the
> reclaim context is completely unaware that other reclaim contexts
> may be scanning the same LRU at the same time and are interleaving
> with it.
> 

What is not a guarantee? I'm not following your point here. I suppose it
technically could drop the lock, but then it would have to restart the
iteration and wouldn't exactly provide predictable batching capability
to users.

This internal lock protects the integrity of the list from external
adds/removes, etc., but it's also passed into the callback so of course
it can be cycled at any point. The callback just has to notify the
caller to restart the walk. E.g., from __list_lru_walk_one():

        /*
         * The lru lock has been dropped, our list traversal is
         * now invalid and so we have to restart from scratch.
         */

> And, really, that does not matter one iota. If multiple scanners are
> interleaving, the reclaim traversal order and the decisions made are
> no different from what a single reclaimer does.  i.e. we just don't
> have to care if reclaim contexts interleave or not, because they
> will not repeat work that has already been done unnecessarily.
> That's one of the reasons for moving to IO-less LRU ordered reclaim
> - it removes all the gross hacks we've had to implement to guarantee
> reclaim scanning progress in one nice neat package of generic
> infrastructure.
> 

Yes, exactly. The difference I'm pointing out is that with the earlier
patch that drops locking from the perag based scan mechanism, the
interleaving of multiple reclaim scanners over the same range is not
exactly the same as a single reclaimer. That sort of behavior is the
intent of the example patch I appended to change the locking instead of
removing it.

> > That seems to be Ok given we don't do much in the isolation handler, the
> > lock isn't held across the dispose sequence and we're still batching in
> > the shrinker core on top of that. We're still serialized over the lru
> > fixups such that concurrent reclaimers aren't processing the same
> > inodes, however.
> 
> The only thing that we may need here is need_resched() checks if it
> turns out that holding a lock for 1024 items to be scanned proved to
> be too long to hold on to a single CPU. If we do that we'd cycle the
> LRU lock and return RETRY or RETRY_REMOVE, hence enabling reclaimers
> more finer-grained interleaving....
> 

Sure, with the caveat that we restart the traversal..

> > BTW I got a lockdep splat[1] for some reason on a straight mount/unmount
> > cycle with this patch.
> ....
> > [   39.030519]  lock_acquire+0x90/0x170
> > [   39.031170]  ? xfs_ilock+0xd2/0x280 [xfs]
> > [   39.031603]  down_write_nested+0x4f/0xb0
> > [   39.032064]  ? xfs_ilock+0xd2/0x280 [xfs]
> > [   39.032684]  ? xfs_dispose_inodes+0x124/0x320 [xfs]
> > [   39.033575]  xfs_ilock+0xd2/0x280 [xfs]
> > [   39.034058]  xfs_dispose_inodes+0x124/0x320 [xfs]
> 
> False positive, AFAICT. It's complaining about the final xfs_ilock()
> call we do before freeing the inode because we have other inodes
> locked. I don't think this can deadlock because the inodes under
> reclaim should not be usable by anyone else at this point because
> they have the I_RECLAIM flag set.
> 

Ok. The code looked sane to me at a glance, but lockdep tends to confuse
the hell out of me.

Brian

> I did notice this - I added a XXX comment I added to the case being
> complained about to note I needed to resolve this locking issue.
> 
> +        * Here we do an (almost) spurious inode lock in order to coordinate
> +        * with inode cache radix tree lookups.  This is because the lookup
> +        * can reference the inodes in the cache without taking references.
> +        *
> +        * We make that OK here by ensuring that we wait until the inode is
> +        * unlocked after the lookup before we go ahead and free it. 
> +        *
> +        * XXX: need to check this is still true. Not sure it is.
>          */
> 
> I added that last line in this patch. In more detail....
> 
> The comment is suggesting that we need to take the ILOCK to
> co-ordinate with RCU protected lookups in progress before we RCU
> free the inode. That's waht RCU is supposed to do, so I'm not at all
> sure what this is actually serialising against any more.
> 
> i.e. any racing radix tree lookup from this point in time is going
> to see the XFS_IRECLAIM flag and ip->i_ino == 0 while under the
> rcu_read_lock, and they will go try again after dropping all lock
> context and waiting for a bit. The inode may remain visibile until
> the next rcu grace period expires, but all lookups will abort long
> before the get anywhere near the ILOCK. And once the RCU grace
> period expires, lookups will be locked out by the rcu_read_lock(),
> the raidx tree moves to a state where the removal of the inode is
> guaranteed visibile to all CPUs, and then the object is freed.
> 
> So the ILOCK should have no part in lookup serialisation, and I need
> to go look at the history of the code to determine where and why
> this was added, and whether the condition it protects against is
> still a valid concern or not....
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner Aug. 11, 2019, 2:17 a.m. UTC | #4
On Fri, Aug 09, 2019 at 08:36:32AM -0400, Brian Foster wrote:
> On Fri, Aug 09, 2019 at 11:20:22AM +1000, Dave Chinner wrote:
> > On Thu, Aug 08, 2019 at 12:39:05PM -0400, Brian Foster wrote:
> > > On Thu, Aug 01, 2019 at 12:17:51PM +1000, Dave Chinner wrote:
> > > > From: Dave Chinner <dchinner@redhat.com>
> > > > 
> > > > Replace the AG radix tree walking reclaim code with a list_lru
> > > > walker, giving us both node-aware and memcg-aware inode reclaim
> > > > at the XFS level. This requires adding an inode isolation function to
> > > > determine if the inode can be reclaim, and a list walker to
> > > > dispose of the inodes that were isolated.
> > > > 
> > > > We want the isolation function to be non-blocking. If we can't
> > > > grab an inode then we either skip it or rotate it. If it's clean
> > > > then we skip it, if it's dirty then we rotate to give it time to be
> > > 
> > > Do you mean we remove it if it's clean?
> > 
> > No, I mean if we can't grab it and it's clean, then we just skip it,
> > leaving it at the head of the LRU for the next scanner to
> > immediately try to reclaim it. If it's dirty, we rotate it so that
> > time passes before we try to reclaim it again in the hope that it is
> > already clean by the time we've scanned through the entire LRU...
> > 
> 
> Ah, Ok. That could probably be worded more explicitly. E.g.:
> 
> "If we can't grab an inode, we skip it if it is clean or rotate it if
> dirty. Dirty inode rotation gives the inode time to be cleaned before
> it's scanned again. ..."

*nod*

> > > > +++ b/fs/xfs/xfs_super.c
> > > ...
> > > > @@ -1810,23 +1811,58 @@ xfs_fs_mount(
> ...
> > > > +	long freed;
> > > >  
> > > > -	return list_lru_shrink_count(&XFS_M(sb)->m_inode_lru, sc);
> > > > +	INIT_LIST_HEAD(&ra.freeable);
> > > > +	ra.lowest_lsn = NULLCOMMITLSN;
> > > > +	ra.dirty_skipped = 0;
> > > > +
> > > > +	freed = list_lru_shrink_walk(&mp->m_inode_lru, sc,
> > > > +					xfs_inode_reclaim_isolate, &ra);
> > > 
> > > This is more related to the locking discussion on the earlier patch, but
> > > this looks like it has more similar serialization to the example patch I
> > > posted than the one without locking at all. IIUC, this walk has an
> > > internal lock per node lru that is held across the walk and passed into
> > > the callback. We never cycle it, so for any given node we only allow one
> > > reclaimer through here at a time.
> > 
> > That's not a guarantee that list_lru gives us. It could drop it's
> > internal lock at any time during that walk and we would be
> > blissfully unaware that it has done this. And at that point, the
> > reclaim context is completely unaware that other reclaim contexts
> > may be scanning the same LRU at the same time and are interleaving
> > with it.
> > 
> 
> What is not a guarantee? I'm not following your point here. I suppose it
> technically could drop the lock, but then it would have to restart the
> iteration and wouldn't exactly provide predictable batching capability
> to users.

There is no guarantee that list_lru_shrink_walk() provides a
single list walker at a time, or that it provides predictable
batching capability to users.

> This internal lock protects the integrity of the list from external
> adds/removes, etc., but it's also passed into the callback so of course
> it can be cycled at any point. The callback just has to notify the
> caller to restart the walk. E.g., from __list_lru_walk_one():
> 
>         /*
>          * The lru lock has been dropped, our list traversal is
>          * now invalid and so we have to restart from scratch.
>          */

As the designer and author of the list_lru code, I do know how it
works. I also know exactly what problem this behaviour was
intended to solve, because I had to solve it to meet the
requirements I had for the infrastructure.

The isolation walk lock batching currently done is an optimisation
to minimise lru lock contention - it amortises the cost of getting
the lock over a substantial batch of work. If we drop the lock on
every item we try to isolate - my initial implementations did this -
then the lru lock thrashes badly against concurrent inserts and
deletes and scalability is not much better than the global lock it
was replacing.

IOWs, the behaviour we have now is a result of lock contention
optimisation to meet scalability requirements, not because of some
"predictable batching" requirement. If we were to rework the
traversal mechanism such that the lru lock was not necessary to
protect the state of the LRU list across the batch of isolate
callbacks, then we'd get the scalability we need but we'd completely
change the concurrency behaviour. The list would still do LRU
reclaim, and the isolate functions still work exactly as they
currently do (i.e. they work on just the item passed to them) but
we'd have concurrent reclaim contexts isolating items on the same
LRU concurrently rather than being serialised. And that's perfectly
fine, because the isolate/dispose architecture just doesn't care
how the items on the LRU are isolated for disposal.....

What I'm trying to say is that the "isolation batching" we have is
not desirable but it is necessary, and because that's internal to
the list_lru implementation, we can change that behaviour however
we want and it won't affect the subsystems that own the objects
being reclaimed. They still just get handed a list of items to
dispose, and they all come from the reclaim end of the LRU list...

Indeed, the new XFS inode shrinker is not dependent on any specific
batching order, it's not dependent on isolation being serialised,
and it's not dependent on the lru_lock being held across the
isolation function. IOWs, it's set up just right to take advantage
of any increases in isolation concurrency that the list_lru
infrastructure could provide...

> > > That seems to be Ok given we don't do much in the isolation handler, the
> > > lock isn't held across the dispose sequence and we're still batching in
> > > the shrinker core on top of that. We're still serialized over the lru
> > > fixups such that concurrent reclaimers aren't processing the same
> > > inodes, however.
> > 
> > The only thing that we may need here is need_resched() checks if it
> > turns out that holding a lock for 1024 items to be scanned proved to
> > be too long to hold on to a single CPU. If we do that we'd cycle the
> > LRU lock and return RETRY or RETRY_REMOVE, hence enabling reclaimers
> > more finer-grained interleaving....
> > 
> 
> Sure, with the caveat that we restart the traversal..

Which only re-traverses the inodes we skipped because they were
locked at the time. IOWs, skipping inodes is rare because if an inode
is in reclaim then the only things that can be contending are a radix
tree lookup in progress or an inode clustering operation
(write/free) in progress. Either way, they will be relatively rare
and very short term lock holds, so if we have to restart the scan
after dropping the lru lock then it's likely we'll restart at the next
inode in line for reclaim, anyway....

Hence I don't think having to restart a traversal would really
matter all that much....

Cheers,

Dave.
Brian Foster Aug. 11, 2019, 12:46 p.m. UTC | #5
On Sun, Aug 11, 2019 at 12:17:47PM +1000, Dave Chinner wrote:
> On Fri, Aug 09, 2019 at 08:36:32AM -0400, Brian Foster wrote:
> > On Fri, Aug 09, 2019 at 11:20:22AM +1000, Dave Chinner wrote:
> > > On Thu, Aug 08, 2019 at 12:39:05PM -0400, Brian Foster wrote:
> > > > On Thu, Aug 01, 2019 at 12:17:51PM +1000, Dave Chinner wrote:
> > > > > From: Dave Chinner <dchinner@redhat.com>
...
> > > > > +	long freed;
> > > > >  
> > > > > -	return list_lru_shrink_count(&XFS_M(sb)->m_inode_lru, sc);
> > > > > +	INIT_LIST_HEAD(&ra.freeable);
> > > > > +	ra.lowest_lsn = NULLCOMMITLSN;
> > > > > +	ra.dirty_skipped = 0;
> > > > > +
> > > > > +	freed = list_lru_shrink_walk(&mp->m_inode_lru, sc,
> > > > > +					xfs_inode_reclaim_isolate, &ra);
> > > > 
> > > > This is more related to the locking discussion on the earlier patch, but
> > > > this looks like it has more similar serialization to the example patch I
> > > > posted than the one without locking at all. IIUC, this walk has an
> > > > internal lock per node lru that is held across the walk and passed into
> > > > the callback. We never cycle it, so for any given node we only allow one
> > > > reclaimer through here at a time.
> > > 
> > > That's not a guarantee that list_lru gives us. It could drop it's
> > > internal lock at any time during that walk and we would be
> > > blissfully unaware that it has done this. And at that point, the
> > > reclaim context is completely unaware that other reclaim contexts
> > > may be scanning the same LRU at the same time and are interleaving
> > > with it.
> > > 
> > 
> > What is not a guarantee? I'm not following your point here. I suppose it
> > technically could drop the lock, but then it would have to restart the
> > iteration and wouldn't exactly provide predictable batching capability
> > to users.
> 
> There is no guarantee that the list_lru_shrink_walk() provides a
> single list walker at a time or that it provides predictable
> batching capability to users.
> 

Hm, Ok. I suppose the code could change if that's your point. But how is
that relevant to how the XFS reclaim behavior as of this patch compares
to that of the intermediate patch 20? Note again that this is the only
reason I bring it up in this patch (which seems mostly sane to me).
Perhaps I should have just replied again to patch 20 to avoid confusion.
Apologies for that, but that ship has sailed...

To reiterate, I suggested not dropping the lock in patch 20 for
$reasons. You replied generally that doing so might cause more problems
than it solves. I replied with a compromise patch that leaves the lock,
but only acquires it across inode grabbing instead of the entire reclaim
operation. This essentially implements "serialized isolation batching"
as we've termed it here (assuming the patch is correct).

I eventually made it to this patch and observe that you end up with an
implementation that does serialized isolation batching. Of course the
locking and implementation is quite different with this being a whole
new mechanism, so performance and things could be quite different too.
But my point is that if serialized isolation batching is acceptable
behavior for XFS inode reclaim right now, then it seems reasonable to me
that it be acceptable in patch 20 for what is ultimately just an
intermediate state.

I appreciate all of the discussion and background information that
follows, but it gets way far off from the original feedback. I've still
seen no direct response to the thoughts above either here or in patch
20. Are you planning to incorporate that example patch or something
similar? If so, then I think we're on the same page.

> > This internal lock protects the integrity of the list from external
> > adds/removes, etc., but it's also passed into the callback so of course
> > it can be cycled at any point. The callback just has to notify the
> > caller to restart the walk. E.g., from __list_lru_walk_one():
> > 
> >         /*
> >          * The lru lock has been dropped, our list traversal is
> >          * now invalid and so we have to restart from scratch.
> >          */
> 
> As the designer and author of the list_lru code, I do know how it
> works. I also know exactly what this problem this behaviour was
> intended to solve, because I had to solve it to meet the
> requirements I had for the infrastructure.
> 
> The isolation walk lock batching currently done is an optimisation
> to minimise lru lock contention - it amortise the cost of getting
> the lock over a substantial batch of work. If we drop the lock on
> every item we try to isolate - my initial implementations did this -
> then the lru lock thrashes badly against concurrent inserts and
> deletes and scalability is not much better than the global lock it
> was replacing.
> 

Ok.

> IOWs, the behaviour we have now is a result of lock contention
> optimisation to meet scalability requirements, not because of some
> "predictable batching" requirement. If we were to rework the
> traversal mechanism such that the lru lock was not necessary to
> protect the state of the LRU list across the batch of isolate
> callbacks, then we'd get the scalability we need but we'd completely
> change the concurrency behaviour. The list would still do LRU
> reclaim, and the isolate functions still work exactly as tehy
> currently do (i.e. they work on just the item passed to them) but
> we'd have concurrent reclaim contexts isolating items on the same
> LRU concurrently rather than being serialised. And that's perfectly
> fine, because the isolate/dispose architecture just doesn't care
> how the items on the LRU are isolated for disposal.....
> 

Sure, that mostly makes sense. We're free to similarly rework the old
mechanism to not require locking just as well. My point is that the
patch to move I/O submission to the AIL doesn't do that sufficiently,
despite that being the initial purpose of the lock.

BTW, the commit[1] that introduced the pag reclaim lock back in 2010
also introduced the cursor at the same time, and it actually mentions some
of the things I'm pointing out as potential problems with patch 20. It
refers to potential for "massive contention" and prevention of
reclaimers "continually scanning the same inodes in each AG." Again, I'm
not worried about that for this series because we ultimately rework the
shrinker past those issues, but it appears that it wasn't purely a
matter of I/O submission ordering that motivated the lock (though I'm
sure that was part of it).

Given that, I think another reasonable option for patch 20 is to remove
the cursor and locking together. That at least matches some form of
historical reclaim behavior in XFS. If we took that approach, it might
make sense to split patch 20 into one patch that shifts I/O to xfsaild
(new behavior) and a subsequent one that undoes [1].

[1] 69b491c214d7 ("xfs: serialise inode reclaim within an AG")

> What I'm trying to say is that the "isolation batching" we have is
> not desirable but it is necessary, and we because that's internal to
> the list_lru implementation, we can change that behaviour however
> we want and it won't affect the subsystems that own the objects
> being reclaimed. They still just get handed a list of items to
> dispose, and they all come from the reclaim end of the LRU list...
> 
> Indeed, the new XFS inode shrinker is not dependent on any specific
> batching order, it's not dependent on isolation being serialised,
> and it's not dependent on the lru_lock being held across the
> isolation function. IOWs, it's set up just right to take advantage
> of any increases in isolation concurrency that the list_lru
> infrastructure could provide...
>

Yep, it's a nice abstraction. This has no bearing on the old
implementation which does have XFS specific locking, however.
 
> > > > That seems to be Ok given we don't do much in the isolation handler, the
> > > > lock isn't held across the dispose sequence and we're still batching in
> > > > the shrinker core on top of that. We're still serialized over the lru
> > > > fixups such that concurrent reclaimers aren't processing the same
> > > > inodes, however.
> > > 
> > > The only thing that we may need here is need_resched() checks if it
> > > turns out that holding a lock for 1024 items to be scanned proved to
> > > be too long to hold on to a single CPU. If we do that we'd cycle the
> > > LRU lock and return RETRY or RETRY_REMOVE, hence enabling reclaimers
> > > more finer-grained interleaving....
> > > 
> > 
> > Sure, with the caveat that we restart the traversal..
> 
> Which only re-traverses the inodes we skipped because they were
> locked at the time. IOWs, Skipping inodes is rare because if it is
> in reclaim then the only things that can be contending is a radix
> tree lookup in progress or an inode clustering operation
> (write/free) in progress. Either way, they will be relatively rare
> and very short term lock holds, so if we have to restart the scan
> after dropping the lru lock then it's likely we'll restart at next
> inode in line for reclaim, anyway....
> 
> Hence I don't think having to restart a traversal would really
> matter all that much....
> 

Perhaps, that seems fairly reasonable given that we rotate dirty inodes.
I'm not totally convinced we won't thrash on an LRU with a high
population of dirty inodes, or that a restart is even the simplest
approach to dealing with large batch sizes, but I'll reserve judgement
on that until there's code to review.

Brian

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

Patch

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 610f643df9f6..891fe3795c8f 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1195,7 +1195,7 @@  xfs_reclaim_inode(
  *
  * Return the number of inodes freed.
  */
-STATIC int
+int
 xfs_reclaim_inodes_ag(
 	struct xfs_mount	*mp,
 	int			flags,
@@ -1297,40 +1297,185 @@  xfs_reclaim_inodes_ag(
 	return freed;
 }
 
-void
-xfs_reclaim_inodes(
-	struct xfs_mount	*mp)
+enum lru_status
+xfs_inode_reclaim_isolate(
+	struct list_head	*item,
+	struct list_lru_one	*lru,
+	spinlock_t		*lru_lock,
+	void			*arg)
 {
-	xfs_reclaim_inodes_ag(mp, SYNC_WAIT, INT_MAX);
+        struct xfs_ireclaim_args *ra = arg;
+        struct inode		*inode = container_of(item, struct inode, i_lru);
+        struct xfs_inode	*ip = XFS_I(inode);
+	enum lru_status		ret;
+	xfs_lsn_t		lsn = 0;
+
+	/* Careful: inversion of iflags_lock and everything else here */
+	if (!spin_trylock(&ip->i_flags_lock))
+		return LRU_SKIP;
+
+	ret = LRU_ROTATE;
+	if (!xfs_inode_clean(ip) && !__xfs_iflags_test(ip, XFS_ISTALE)) {
+		lsn = ip->i_itemp->ili_item.li_lsn;
+		ra->dirty_skipped++;
+		goto out_unlock_flags;
+	}
+
+	ret = LRU_SKIP;
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+		goto out_unlock_flags;
+
+	if (!__xfs_iflock_nowait(ip)) {
+		lsn = ip->i_itemp->ili_item.li_lsn;
+		ra->dirty_skipped++;
+		goto out_unlock_inode;
+	}
+
+	/* if we are in shutdown, we'll reclaim it even if dirty */
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		goto reclaim;
+
+	/*
+	 * Now the inode is locked, we can actually determine if it is dirty
+	 * without racing with anything.
+	 */
+	ret = LRU_ROTATE;
+	if (xfs_ipincount(ip)) {
+		ra->dirty_skipped++;
+		goto out_ifunlock;
+	}
+	if (!xfs_inode_clean(ip) && !__xfs_iflags_test(ip, XFS_ISTALE)) {
+		lsn = ip->i_itemp->ili_item.li_lsn;
+		ra->dirty_skipped++;
+		goto out_ifunlock;
+	}
+
+reclaim:
+	/*
+	 * Once we mark the inode with XFS_IRECLAIM, no-one will grab it again.
+	 * RCU lookups will still find the inode, but they'll stop when they set
+	 * the IRECLAIM flag. Hence we can leave the inode locked as we move it
+	 * to the dispose list so we can deal with shutdown cleanup there
+	 * outside the LRU lock context.
+	 */
+	__xfs_iflags_set(ip, XFS_IRECLAIM);
+	list_lru_isolate_move(lru, &inode->i_lru, &ra->freeable);
+	spin_unlock(&ip->i_flags_lock);
+	return LRU_REMOVED;
+
+out_ifunlock:
+	xfs_ifunlock(ip);
+out_unlock_inode:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_unlock_flags:
+	spin_unlock(&ip->i_flags_lock);
+
+	if (lsn && XFS_LSN_CMP(lsn, ra->lowest_lsn) < 0)
+		ra->lowest_lsn = lsn;
+	return ret;
 }
 
-/*
- * Scan a certain number of inodes for reclaim.
- *
- * When called we make sure that there is a background (fast) inode reclaim in
- * progress, while we will throttle the speed of reclaim via doing synchronous
- * reclaim of inodes. That means if we come across dirty inodes, we wait for
- * them to be cleaned, which we hope will not be very long due to the
- * background walker having already kicked the IO off on those dirty inodes.
- */
-long
-xfs_reclaim_inodes_nr(
-	struct xfs_mount	*mp,
-	int			nr_to_scan)
+static void
+xfs_dispose_inode(
+	struct xfs_inode	*ip)
 {
-	int			sync_mode = 0;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_perag	*pag;
+	xfs_ino_t		ino;
+
+	ASSERT(xfs_isiflocked(ip));
+	ASSERT(xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE));
+	ASSERT(ip->i_ino != 0);
 
 	/*
-	 * For kswapd, we kick background inode writeback. For direct
-	 * reclaim, we issue and wait on inode writeback to throttle
-	 * reclaim rates and avoid shouty OOM-death.
+	 * Process the shutdown reclaim work we deferred from the LRU isolation
+	 * callback before we go any further.
 	 */
-	if (current_is_kswapd())
-		xfs_ail_push_all(mp->m_ail);
-	else
-		sync_mode |= SYNC_WAIT;
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		xfs_iunpin_wait(ip);
+		xfs_iflush_abort(ip, false);
+	} else {
+		xfs_ifunlock(ip);
+	}
 
-	return xfs_reclaim_inodes_ag(mp, sync_mode, nr_to_scan);
+	/*
+	 * Because we use RCU freeing we need to ensure the inode always appears
+	 * to be reclaimed with an invalid inode number when in the free state.
+	 * We do this as early as possible under the ILOCK so that
+	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
+	 * detect races with us here. By doing this, we guarantee that once
+	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
+	 * it will see either a valid inode that will serialise correctly, or it
+	 * will see an invalid inode that it can skip.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ino = ip->i_ino; /* for radix_tree_delete */
+	ip->i_flags = XFS_IRECLAIM;
+	ip->i_ino = 0;
+	spin_unlock(&ip->i_flags_lock);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	XFS_STATS_INC(mp, xs_ig_reclaims);
+	/*
+	 * Remove the inode from the per-AG radix tree.
+	 *
+	 * Because radix_tree_delete won't complain even if the item was never
+	 * added to the tree assert that it's been there before to catch
+	 * problems with the inode life time early on.
+	 */
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
+	spin_lock(&pag->pag_ici_lock);
+	if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino)))
+		ASSERT(0);
+	spin_unlock(&pag->pag_ici_lock);
+	xfs_perag_put(pag);
+
+	/*
+	 * Here we do an (almost) spurious inode lock in order to coordinate
+	 * with inode cache radix tree lookups.  This is because the lookup
+	 * can reference the inodes in the cache without taking references.
+	 *
+	 * We make that OK here by ensuring that we wait until the inode is
+	 * unlocked after the lookup before we go ahead and free it.
+	 *
+	 * XXX: need to check this is still true. Not sure it is.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_qm_dqdetach(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	__xfs_inode_free(ip);
+}
+
+void
+xfs_dispose_inodes(
+	struct list_head	*freeable)
+{
+	while (!list_empty(freeable)) {
+		struct inode *inode;
+
+		inode = list_first_entry(freeable, struct inode, i_lru);
+		list_del_init(&inode->i_lru);
+
+		xfs_dispose_inode(XFS_I(inode));
+		cond_resched();
+	}
+}
+void
+xfs_reclaim_inodes(
+	struct xfs_mount	*mp)
+{
+	while (list_lru_count(&mp->m_inode_lru)) {
+		struct xfs_ireclaim_args ra;
+
+		INIT_LIST_HEAD(&ra.freeable);
+		ra.lowest_lsn = NULLCOMMITLSN;
+		list_lru_walk(&mp->m_inode_lru, xfs_inode_reclaim_isolate,
+				&ra, LONG_MAX);
+		xfs_dispose_inodes(&ra.freeable);
+		if (ra.lowest_lsn != NULLCOMMITLSN)
+			xfs_ail_push_sync(mp->m_ail, ra.lowest_lsn);
+	}
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 0ab08b58cd45..dadc69a30f33 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -49,8 +49,16 @@  int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
 struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino);
 void xfs_inode_free(struct xfs_inode *ip);
 
+struct xfs_ireclaim_args {
+	struct list_head	freeable;
+	xfs_lsn_t		lowest_lsn;
+	unsigned long		dirty_skipped;
+};
+
+enum lru_status xfs_inode_reclaim_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg);
+void xfs_dispose_inodes(struct list_head *freeable);
 void xfs_reclaim_inodes(struct xfs_mount *mp);
-long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
 
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 558173f95a03..463170dc4c02 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -263,6 +263,14 @@  static inline int xfs_isiflocked(struct xfs_inode *ip)
 
 extern void __xfs_iflock(struct xfs_inode *ip);
 
+static inline int __xfs_iflock_nowait(struct xfs_inode *ip)
+{
+	if (ip->i_flags & XFS_IFLOCK)
+		return false;
+	ip->i_flags |= XFS_IFLOCK;
+	return true;
+}
+
 static inline int xfs_iflock_nowait(struct xfs_inode *ip)
 {
 	return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b5c4c1b6fd19..e3e898a2896c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -17,6 +17,7 @@ 
 #include "xfs_alloc.h"
 #include "xfs_fsops.h"
 #include "xfs_trans.h"
+#include "xfs_trans_priv.h"
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
@@ -1810,23 +1811,58 @@  xfs_fs_mount(
 }
 
 static long
-xfs_fs_nr_cached_objects(
+xfs_fs_free_cached_objects(
 	struct super_block	*sb,
 	struct shrink_control	*sc)
 {
-	/* Paranoia: catch incorrect calls during mount setup or teardown */
-	if (WARN_ON_ONCE(!sb->s_fs_info))
-		return 0;
+	struct xfs_mount	*mp = XFS_M(sb);
+        struct xfs_ireclaim_args ra;
+	long freed;
 
-	return list_lru_shrink_count(&XFS_M(sb)->m_inode_lru, sc);
+	INIT_LIST_HEAD(&ra.freeable);
+	ra.lowest_lsn = NULLCOMMITLSN;
+	ra.dirty_skipped = 0;
+
+	freed = list_lru_shrink_walk(&mp->m_inode_lru, sc,
+					xfs_inode_reclaim_isolate, &ra);
+	xfs_dispose_inodes(&ra.freeable);
+
+	/*
+	 * Deal with dirty inodes if we skipped any. We will have the LSN of
+	 * the oldest dirty inode in our reclaim args if we skipped any.
+	 *
+	 * For direct reclaim, wait on an AIL push to clean some inodes.
+	 *
+	 * For kswapd, if we skipped too many dirty inodes (i.e. more dirty than
+	 * we freed) then we need kswapd to back off once its scan has been
+	 * completed. That way it will have some clean inodes once it comes back
+	 * and can make progress, but make sure we have inode cleaning in
+	 * progress....
+	 */
+	if (current_is_kswapd()) {
+		if (ra.dirty_skipped >= freed) {
+			if (current->reclaim_state)
+				current->reclaim_state->need_backoff = true;
+			if (ra.lowest_lsn != NULLCOMMITLSN)
+				xfs_ail_push(mp->m_ail, ra.lowest_lsn);
+		}
+	} else if (ra.lowest_lsn != NULLCOMMITLSN) {
+		xfs_ail_push_sync(mp->m_ail, ra.lowest_lsn);
+	}
+
+	return freed;
 }
 
 static long
-xfs_fs_free_cached_objects(
+xfs_fs_nr_cached_objects(
 	struct super_block	*sb,
 	struct shrink_control	*sc)
 {
-	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
+	/* Paranoia: catch incorrect calls during mount setup or teardown */
+	if (WARN_ON_ONCE(!sb->s_fs_info))
+		return 0;
+
+	return list_lru_shrink_count(&XFS_M(sb)->m_inode_lru, sc);
 }
 
 static const struct super_operations xfs_super_operations = {