@@ -102,8 +102,14 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
freed += prune_icache_sb(sb, sc);
if (fs_objects) {
+ unsigned long ret;
+
sc->nr_to_scan = fs_objects + 1;
- freed += sb->s_op->free_cached_objects(sb, sc);
+ ret = sb->s_op->free_cached_objects(sb, sc);
+ if (ret == SHRINK_STOP)
+ freed = SHRINK_STOP;
+ else
+ freed += ret;
}
up_read(&sb->s_umount);
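
This change is only safe if every consumer treats SHRINK_STOP as a sentinel rather than a count. Below is a minimal userspace model of that rule; MODEL_SHRINK_STOP stands in for the kernel's ~0UL SHRINK_STOP and model_free_cached() for sb->s_op->free_cached_objects(). Both names are hypothetical, this is not kernel code:

#include <stdio.h>

#define MODEL_SHRINK_STOP (~0UL)

/* Hypothetical stand-in for sb->s_op->free_cached_objects(). */
static unsigned long model_free_cached(int congested)
{
	return congested ? MODEL_SHRINK_STOP : 42;
}

/* Mirrors the super_cache_scan() hunk: test the sentinel before
 * accumulating, because adding ~0UL into freed would wrap the total
 * into a garbage count. */
static unsigned long model_cache_scan(int congested)
{
	unsigned long freed = 100;	/* dentries/inodes already pruned */
	unsigned long ret = model_free_cached(congested);

	if (ret == MODEL_SHRINK_STOP)
		freed = MODEL_SHRINK_STOP;
	else
		freed += ret;
	return freed;
}

int main(void)
{
	printf("uncongested: freed=%lu\n", model_cache_scan(0));
	printf("congested: stop=%d\n",
	       model_cache_scan(1) == MODEL_SHRINK_STOP);
	return 0;
}

Note that the freed count accumulated so far is deliberately discarded on the stop path; the shrinker core only cares that it should back off.
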
@@ -975,7 +975,7 @@ xfs_reclaim_inode(
error = 0;
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!xfs_iflock_nowait(ip)) {
- if (!(sync_mode & SYNC_WAIT))
+ if (sync_mode & SYNC_TRYLOCK)
goto out;
xfs_iflock(ip);
}
@@ -987,7 +987,7 @@ xfs_reclaim_inode(
goto reclaim;
}
if (xfs_ipincount(ip)) {
- if (!(sync_mode & SYNC_WAIT))
+ if (sync_mode & SYNC_TRYLOCK)
goto out_ifunlock;
xfs_iunpin_wait(ip);
}
@@ -1103,7 +1103,7 @@ xfs_reclaim_inode(
* then a shutdown during the filesystem unmount reclaim walk will leak
* all the unreclaimed inodes.
*/
-STATIC int
+STATIC long
xfs_reclaim_inodes_ag(
struct xfs_mount *mp,
int flags,
@@ -1113,18 +1113,22 @@ xfs_reclaim_inodes_ag(
int error = 0;
int last_error = 0;
xfs_agnumber_t ag;
- int trylock = flags & SYNC_TRYLOCK;
+ int trylock;
int skipped;
+ int dirty_ags;
restart:
+ trylock = flags & SYNC_TRYLOCK;
ag = 0;
skipped = 0;
+ dirty_ags = 0;
while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
unsigned long first_index = 0;
int done = 0;
int nr_found = 0;
ag = pag->pag_agno + 1;
+ dirty_ags++;
if (trylock) {
if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
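
dirty_ags counts every AG visited that still has reclaimable inodes, and trylock is now recomputed under the restart label because the retry path below clears SYNC_TRYLOCK from flags. The per-AG pag_ici_reclaim_lock acts as a concurrency throttle: a failed trylock means another reclaimer already owns that AG. A sketch of that trylock-and-count pattern, using pthreads in place of kernel mutexes (NR_AGS and all names are invented):

#include <pthread.h>
#include <stdio.h>

#define NR_AGS 4

/* One reclaim lock per AG; holding it means "this AG is already being
 * reclaimed", so trylock failures are counted, not waited on. */
static pthread_mutex_t ag_lock[NR_AGS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

static int scan_ags(int trylock)
{
	int skipped = 0;

	for (int ag = 0; ag < NR_AGS; ag++) {
		if (trylock) {
			if (pthread_mutex_trylock(&ag_lock[ag])) {
				skipped++;	/* busy: another reclaimer */
				continue;
			}
		} else {
			pthread_mutex_lock(&ag_lock[ag]);
		}
		/* ... reclaim a batch of inodes from this AG ... */
		pthread_mutex_unlock(&ag_lock[ag]);
	}
	return skipped;
}

int main(void)
{
	pthread_mutex_lock(&ag_lock[1]);	/* simulate a busy AG */
	printf("skipped %d of %d AGs\n", scan_ags(1), NR_AGS);
	pthread_mutex_unlock(&ag_lock[1]);
	return 0;
}
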
@@ -1132,10 +1136,16 @@ xfs_reclaim_inodes_ag(
xfs_perag_put(pag);
continue;
}
- first_index = pag->pag_ici_reclaim_cursor;
} else
mutex_lock(&pag->pag_ici_reclaim_lock);
+ /*
+ * Always start from the last scanned inode so that we don't
+ * block on inodes that a previous iteration just flushed.
+ * Iterate over the entire inode range before coming back to
+ * skipped/dirty inodes.
+ */
+ first_index = pag->pag_ici_reclaim_cursor;
do {
struct xfs_inode *batch[XFS_LOOKUP_BATCH];
int i;
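
With the cursor read moved out of the trylock branch, blocking reclaim now also resumes from wherever the previous pass stopped instead of restarting at inode zero. A toy model of that resumable walk (batch size, range and names are illustrative only):

#include <stdio.h>

#define NR_INODES	16
#define BATCH		4

static unsigned long cursor;	/* pag->pag_ici_reclaim_cursor stand-in */

/* Scan one batch starting at *first_index; flag completion when the
 * end of the index range is reached. */
static void scan_batch(unsigned long *first_index, int *done)
{
	unsigned long end = *first_index + BATCH;

	for (unsigned long i = *first_index; i < end && i < NR_INODES; i++)
		printf("scan inode %lu\n", i);
	if (end >= NR_INODES)
		*done = 1;
	else
		*first_index = end;
}

/* Each pass resumes at the saved cursor, so repeated passes cover the
 * whole index range before revisiting inodes an earlier pass just
 * flushed or skipped. */
static void reclaim_pass(void)
{
	unsigned long first_index = cursor;
	int done = 0;

	scan_batch(&first_index, &done);
	if (done)
		first_index = 0;	/* wrapped: restart from zero */
	cursor = first_index;
}

int main(void)
{
	for (int pass = 0; pass < 5; pass++)
		reclaim_pass();
	return 0;
}
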
@@ -1201,23 +1211,31 @@ xfs_reclaim_inodes_ag(
} while (nr_found && !done && *nr_to_scan > 0);
- if (trylock && !done)
- pag->pag_ici_reclaim_cursor = first_index;
- else
- pag->pag_ici_reclaim_cursor = 0;
+ if (done)
+ first_index = 0;
+ pag->pag_ici_reclaim_cursor = first_index;
mutex_unlock(&pag->pag_ici_reclaim_lock);
xfs_perag_put(pag);
}
/*
- * if we skipped any AG, and we still have scan count remaining, do
+ * If we skipped all AGs because they are locked, we've reached maximum
+ * reclaim concurrency. There is nothing more to be gained by further
+ * attempts to shrink the cache from this context, so tell the shrinker
+ * to stop and defer the reclaim work till later.
+ */
+ if (skipped && skipped == dirty_ags)
+ return SHRINK_STOP;
+
+ /*
+ * If we skipped any AG, and we still have scan count remaining, do
* another pass, this time using blocking reclaim semantics (i.e.
* waiting on the reclaim locks and ignoring the reclaim cursors). This
* ensures that when we get more reclaimers than AGs we block rather
* than spin trying to execute reclaim.
*/
if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
- trylock = 0;
+ flags &= ~SYNC_TRYLOCK;
goto restart;
}
return last_error;
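
The tail of the function now makes a three-way decision: every dirty AG skipped means reclaim concurrency is maxed out (back off with SHRINK_STOP), some AGs skipped with SYNC_WAIT set means retry in blocking mode, otherwise we are done. A compact model of just that decision (the sentinel value and flag bits are stand-ins):

#include <stdio.h>

#define SHRINK_STOP_MODEL (~0UL)
#define SYNC_WAIT	(1 << 0)
#define SYNC_TRYLOCK	(1 << 1)

/* Returns the sentinel to defer, 1 to model "goto restart", 0 for done. */
static unsigned long reclaim_result(int skipped, int dirty_ags, int *flags,
				    int nr_to_scan)
{
	if (skipped && skipped == dirty_ags)
		return SHRINK_STOP_MODEL;	/* all AGs busy: defer */
	if (skipped && (*flags & SYNC_WAIT) && nr_to_scan > 0) {
		*flags &= ~SYNC_TRYLOCK;	/* blocking second pass */
		return 1;
	}
	return 0;
}

int main(void)
{
	int flags = SYNC_TRYLOCK | SYNC_WAIT;

	printf("all skipped:  %lu\n", reclaim_result(4, 4, &flags, 10));
	printf("some skipped: %lu\n", reclaim_result(2, 4, &flags, 10));
	printf("flags now:    %d\n", flags);	/* SYNC_TRYLOCK cleared */
	return 0;
}
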
@@ -1229,8 +1247,12 @@ xfs_reclaim_inodes(
int mode)
{
int nr_to_scan = INT_MAX;
+ long ret;
- return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+ ret = xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+ if (ret == SHRINK_STOP)
+ return 0;
+ return ret;
}
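
xfs_reclaim_inodes() is called from unmount and quiesce paths that expect an errno-style return, so the shrinker-only sentinel is filtered out here. A sketch of why internal callers must never see it (the model_* names are hypothetical):

#include <stdio.h>

#define SHRINK_STOP_MODEL (~0UL)

/* Inner walk: 0 on success, or the sentinel when every AG was busy. */
static long model_reclaim_ag(int all_busy)
{
	return all_busy ? (long)SHRINK_STOP_MODEL : 0;
}

/* Unmount-path wrapper: the sentinel only tells the shrinker to back
 * off; an internal caller would misread it as an error, so map it to
 * "no error". */
static int model_reclaim_inodes(int all_busy)
{
	long ret = model_reclaim_ag(all_busy);

	if (ret == (long)SHRINK_STOP_MODEL)
		return 0;
	return (int)ret;
}

int main(void)
{
	printf("busy -> %d\n", model_reclaim_inodes(1));	/* prints 0 */
	return 0;
}
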
/*
@@ -1241,17 +1263,28 @@ xfs_reclaim_inodes(
* reclaim of inodes. That means if we come across dirty inodes, we wait for
* them to be cleaned, which we hope will not be very long due to the
* background walker having already kicked the IO off on those dirty inodes.
+ *
+ * Also, treat kswapd specially - we really want it to run asynchronously and
+ * not block on dirty inodes, unlike direct reclaim, where we can tolerate
+ * blocking and some amount of IO latency. If we start to overload the reclaim
+ * subsystem with too many direct reclaimers, it will start returning
+ * SHRINK_STOP to tell the mm subsystem to defer the work rather than continuing
+ * to call us and forcing us to block.
*/
long
xfs_reclaim_inodes_nr(
struct xfs_mount *mp,
int nr_to_scan)
{
+ int flags = SYNC_TRYLOCK;
+
/* kick background reclaimer and push the AIL */
xfs_reclaim_work_queue(mp);
xfs_ail_push_all(mp->m_ail);
- return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+ if (!current_is_kswapd())
+ flags |= SYNC_WAIT;
+ return xfs_reclaim_inodes_ag(mp, flags, &nr_to_scan);
}
/*
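
The flag selection above can be modeled in isolation. In the kernel, current_is_kswapd() tests the PF_KSWAPD task flag; the sketch below replaces it with a plain parameter, and the flag bit values are invented:

#include <stdbool.h>
#include <stdio.h>

#define SYNC_WAIT	(1 << 0)
#define SYNC_TRYLOCK	(1 << 1)

/* kswapd must keep making progress on other work, so it never gets
 * SYNC_WAIT; direct reclaimers can tolerate blocking on IO. */
static int pick_reclaim_flags(bool is_kswapd)
{
	int flags = SYNC_TRYLOCK;	/* everyone starts non-blocking */

	if (!is_kswapd)
		flags |= SYNC_WAIT;
	return flags;
}

int main(void)
{
	printf("kswapd flags: %d\n", pick_reclaim_flags(true));
	printf("direct flags: %d\n", pick_reclaim_flags(false));
	return 0;
}
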