From patchwork Fri Mar 26 00:21:54 2021
Subject: [PATCH 1/9] xfs: refactor the inode recycling code
From: "Darrick J. Wong"
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org
Date: Thu, 25 Mar 2021 17:21:54 -0700
Message-ID: <161671811446.622901.13774973473200855377.stgit@magnolia>
In-Reply-To: <161671810866.622901.16520335819131743716.stgit@magnolia>
References: <161671810866.622901.16520335819131743716.stgit@magnolia>

From: Darrick J. Wong

Hoist the code in xfs_iget_cache_hit that restores the VFS inode state
to an xfs_inode that was previously vfs-destroyed.  The next patch will
add a new set of state flags, so we need the helper to avoid
duplication.

Signed-off-by: Darrick J. Wong
---
 fs/xfs/xfs_icache.c |  144 +++++++++++++++++++++++++++++----------------------
 1 file changed, 83 insertions(+), 61 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 4c124bc98f39..79f61a7f40b2 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -286,19 +286,19 @@ xfs_inew_wait(
 * need to retain across reinitialisation, and rewrite them into the VFS inode
 * after reinitialisation even if it fails.
*/ -static int +static inline int xfs_reinit_inode( struct xfs_mount *mp, struct inode *inode) { - int error; - uint32_t nlink = inode->i_nlink; - uint32_t generation = inode->i_generation; - uint64_t version = inode_peek_iversion(inode); - umode_t mode = inode->i_mode; - dev_t dev = inode->i_rdev; - kuid_t uid = inode->i_uid; - kgid_t gid = inode->i_gid; + int error; + uint32_t nlink = inode->i_nlink; + uint32_t generation = inode->i_generation; + uint64_t version = inode_peek_iversion(inode); + umode_t mode = inode->i_mode; + dev_t dev = inode->i_rdev; + kuid_t uid = inode->i_uid; + kgid_t gid = inode->i_gid; error = inode_init_always(mp->m_super, inode); @@ -312,6 +312,73 @@ xfs_reinit_inode( return error; } +/* + * Carefully nudge an inode whose VFS state has been torn down back into a + * usable state. + */ +static int +xfs_iget_recycle( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + int error; + + /* + * We need to make it look like the inode is being reclaimed to prevent + * the actual reclaim workers from stomping over us while we recycle + * the inode. We can't clear the radix tree tag yet as it requires + * pag_ici_lock to be held exclusive. + */ + ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); + error = xfs_reinit_inode(mp, inode); + if (error) { + bool wake; + + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the inactive list. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); + wake = !!__xfs_iflags_test(ip, XFS_INEW); + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + if (wake) + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + return error; + } + + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + /* + * Clear the per-lifetime state in the inode as we are now effectively + * a new inode and need to return to the initial state before reuse + * occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; + ip->i_flags |= XFS_INEW; + xfs_perag_clear_ici_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); + init_rwsem(&inode->i_rwsem); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + + return 0; +} + /* * If we are allocating a new inode, then check what was returned is * actually a free, empty inode. If we are not allocating an inode, @@ -386,7 +453,7 @@ xfs_iget_cache_hit( /* * If we are racing with another cache hit that is currently * instantiating this inode or currently recycling it out of - * reclaimabe state, wait for the initialisation to complete + * reclaimable state, wait for the initialisation to complete * before continuing. * * XXX(hch): eventually we should do something equivalent to @@ -408,11 +475,11 @@ xfs_iget_cache_hit( if (error) goto out_error; - /* - * If IRECLAIMABLE is set, we've torn down the VFS inode already. - * Need to carefully get it back into useable state. - */ if (ip->i_flags & XFS_IRECLAIMABLE) { + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode + * already, and must carefully restore it to usable state. + */ trace_xfs_iget_reclaim(ip); if (flags & XFS_IGET_INCORE) { @@ -420,55 +487,11 @@ xfs_iget_cache_hit( goto out_error; } - /* - * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode - * from stomping over us while we recycle the inode. 
We can't
- * clear the radix tree reclaimable tag yet as it requires
- * pag_ici_lock to be held exclusive.
- */
-	ip->i_flags |= XFS_IRECLAIM;
-
-	spin_unlock(&ip->i_flags_lock);
-	rcu_read_unlock();
-
-	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
-	error = xfs_reinit_inode(mp, inode);
+	error = xfs_iget_recycle(pag, ip);
 	if (error) {
-		bool wake;
-		/*
-		 * Re-initializing the inode failed, and we are in deep
-		 * trouble.  Try to re-add it to the reclaim list.
-		 */
-		rcu_read_lock();
-		spin_lock(&ip->i_flags_lock);
-		wake = !!__xfs_iflags_test(ip, XFS_INEW);
-		ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
-		if (wake)
-			wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
-		ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
 		trace_xfs_iget_reclaim_fail(ip);
-		goto out_error;
+		return error;
 	}
-
-	spin_lock(&pag->pag_ici_lock);
-	spin_lock(&ip->i_flags_lock);
-
-	/*
-	 * Clear the per-lifetime state in the inode as we are now
-	 * effectively a new inode and need to return to the initial
-	 * state before reuse occurs.
-	 */
-	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
-	ip->i_flags |= XFS_INEW;
-	xfs_perag_clear_ici_tag(pag,
-			XFS_INO_TO_AGINO(pag->pag_mount, ino),
-			XFS_ICI_RECLAIM_TAG);
-	inode->i_state = I_NEW;
-	ip->i_sick = 0;
-	ip->i_checked = 0;
-
-	spin_unlock(&ip->i_flags_lock);
-	spin_unlock(&pag->pag_ici_lock);
 	} else {
 		/* If the VFS inode is being torn down, pause and try again. */
 		if (!igrab(inode)) {
@@ -498,7 +521,6 @@ xfs_iget_cache_hit(
 	return error;
 }

-
 static int
 xfs_iget_cache_miss(
 	struct xfs_mount	*mp,
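Worth spelling out before the next patch builds on it: xfs_iget_recycle
encapsulates a strict lock ordering.  The sketch below is a condensed,
annotated view of that ordering taken from the helper above, not new code:

	/*
	 * Flag the inode while still under rcu_read_lock + i_flags_lock,
	 * drop both to do blocking reinitialisation work, then retake
	 * pag_ici_lock before i_flags_lock to finish the state change.
	 */
	ip->i_flags |= XFS_IRECLAIM;		/* fend off reclaim workers */
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();

	error = xfs_reinit_inode(mp, inode);	/* may block; no locks held */

	spin_lock(&pag->pag_ici_lock);		/* per-AG tree lock first... */
	spin_lock(&ip->i_flags_lock);		/* ...then the inode flag lock */

From patchwork Fri Mar 26 00:22:00 2021
Subject: [PATCH 2/9] xfs: deferred inode inactivation
From: "Darrick J.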
Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org Date: Thu, 25 Mar 2021 17:22:00 -0700 Message-ID: <161671811998.622901.3111903657408177382.stgit@magnolia> In-Reply-To: <161671810866.622901.16520335819131743716.stgit@magnolia> References: <161671810866.622901.16520335819131743716.stgit@magnolia> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Instead of calling xfs_inactive directly from xfs_fs_destroy_inode, defer the inactivation phase to a separate workqueue. With this we avoid blocking memory reclaim on filesystem metadata updates that are necessary to free an in-core inode, such as post-eof block freeing, COW staging extent freeing, and truncating and freeing unlinked inodes. Now that work is deferred to a workqueue where we can do the freeing in batches. We introduce two new inode flags -- NEEDS_INACTIVE and INACTIVATING. The first flag helps our worker find inodes needing inactivation, and the second flag marks inodes that are in the process of being inactivated. A concurrent xfs_iget on the inode can still resurrect the inode by clearing NEEDS_INACTIVE (or bailing if INACTIVATING is set). Unfortunately, deferring the inactivation has one huge downside -- eventual consistency. Since all the freeing is deferred to a worker thread, one can rm a file but the space doesn't come back immediately. This can cause some odd side effects with quota accounting and statfs, so we also force inactivation scans in order to maintain the existing behaviors, at least outwardly. For this patch we'll set the delay to zero to mimic the old timing as much as possible; in the next patch we'll play with different delay settings. Signed-off-by: Darrick J. Wong --- Documentation/admin-guide/xfs.rst | 3 fs/xfs/scrub/common.c | 2 fs/xfs/xfs_fsops.c | 4 fs/xfs/xfs_icache.c | 313 ++++++++++++++++++++++++++++++++++++- fs/xfs/xfs_icache.h | 36 ++++ fs/xfs/xfs_inode.c | 60 +++++++ fs/xfs/xfs_inode.h | 15 ++ fs/xfs/xfs_log_recover.c | 7 + fs/xfs/xfs_mount.c | 25 +++ fs/xfs/xfs_mount.h | 9 + fs/xfs/xfs_qm_syscalls.c | 22 +++ fs/xfs/xfs_super.c | 83 +++++++++- fs/xfs/xfs_trace.h | 14 +- 13 files changed, 560 insertions(+), 33 deletions(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index 8de008c0c5ad..f9b109bfc6a6 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -524,7 +524,8 @@ and the short name of the data device. They all can be found in: mount time quotacheck. xfs-gc Background garbage collection of disk space that have been speculatively allocated beyond EOF or for staging copy on - write operations. + write operations; and files that are no longer linked into + the directory tree. ================ =========== For example, the knobs for the quotacheck workqueue for /dev/nvme0n1 would be diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index d8da0ea772bc..93593861aa6e 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -886,6 +886,7 @@ xchk_stop_reaping( { sc->flags |= XCHK_REAPING_DISABLED; xfs_blockgc_stop(sc->mp); + xfs_inodegc_stop(sc->mp); } /* Restart background reaping of resources. 
*/ @@ -893,6 +894,7 @@ void xchk_start_reaping( struct xfs_scrub *sc) { + xfs_inodegc_start(sc->mp); xfs_blockgc_start(sc->mp); sc->flags &= ~XCHK_REAPING_DISABLED; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index b33c894b6cf3..85978de7a15f 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -19,6 +19,8 @@ #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" +#include "xfs_inode.h" +#include "xfs_icache.h" /* * Write new AG headers to disk. Non-transactional, but need to be @@ -345,6 +347,8 @@ xfs_fs_counts( xfs_mount_t *mp, xfs_fsop_counts_t *cnt) { + xfs_inodegc_summary_flush(mp); + cnt->allocino = percpu_counter_read_positive(&mp->m_icount); cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 79f61a7f40b2..287a1500b9b2 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -32,6 +32,9 @@ static int xfs_blockgc_scan_inode(struct xfs_inode *ip, static inline void xfs_blockgc_queue(struct xfs_perag *pag); static bool xfs_reclaim_inode_grab(struct xfs_inode *ip); static void xfs_reclaim_inode(struct xfs_inode *ip, struct xfs_perag *pag); +static inline void xfs_inodegc_queue(struct xfs_mount *mp); +static int xfs_inodegc_inactivate(struct xfs_inode *ip, struct xfs_perag *pag, + struct xfs_eofblocks *eofb); /* * Allocate and initialise an xfs_inode. @@ -198,6 +201,9 @@ xfs_perag_set_ici_tag( case XFS_ICI_BLOCKGC_TAG: xfs_blockgc_queue(pag); break; + case XFS_ICI_INODEGC_TAG: + xfs_inodegc_queue(mp); + break; } trace_xfs_perag_set_ici_tag(mp, pag->pag_agno, tag, _RET_IP_); @@ -244,18 +250,28 @@ xfs_perag_clear_ici_tag( */ void xfs_inode_destroy( - struct xfs_inode *ip) + struct xfs_inode *ip, + bool need_inactive) { struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; + unsigned int tag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); - xfs_perag_set_ici_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + if (need_inactive) { + trace_xfs_inode_set_need_inactive(ip); + ip->i_flags |= XFS_NEED_INACTIVE; + tag = XFS_ICI_INODEGC_TAG; + } else { + trace_xfs_inode_set_reclaimable(ip); + ip->i_flags |= XFS_IRECLAIMABLE; + tag = XFS_ICI_RECLAIM_TAG; + } + + xfs_perag_set_ici_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), tag); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); @@ -278,6 +294,26 @@ xfs_inew_wait( finish_wait(wq, &wait.wq_entry); } +static inline bool +xfs_inodegc_running(struct xfs_mount *mp) +{ + return test_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags); +} + +/* Queue a new inode gc pass if there are inodes needing inactivation. */ +static void +xfs_inodegc_queue( + struct xfs_mount *mp) +{ + if (!xfs_inodegc_running(mp)) + return; + + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) + queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, 0); + rcu_read_unlock(); +} + /* * When we recycle a reclaimable inode, we need to re-initialise the VFS inode * part of the structure. 
This is made more complex by the fact we store @@ -319,19 +355,26 @@ xfs_reinit_inode( static int xfs_iget_recycle( struct xfs_perag *pag, - struct xfs_inode *ip) + struct xfs_inode *ip, + unsigned long iflag) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); + unsigned int tag; int error; + ASSERT(iflag == XFS_IRECLAIM || iflag == XFS_INACTIVATING); + + tag = (iflag == XFS_INACTIVATING) ? XFS_ICI_INODEGC_TAG : + XFS_ICI_RECLAIM_TAG; + /* * We need to make it look like the inode is being reclaimed to prevent * the actual reclaim workers from stomping over us while we recycle * the inode. We can't clear the radix tree tag yet as it requires * pag_ici_lock to be held exclusive. */ - ip->i_flags |= XFS_IRECLAIM; + ip->i_flags |= iflag; spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); @@ -348,7 +391,7 @@ xfs_iget_recycle( rcu_read_lock(); spin_lock(&ip->i_flags_lock); wake = !!__xfs_iflags_test(ip, XFS_INEW); - ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + ip->i_flags &= ~(XFS_INEW | iflag); if (wake) wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); spin_unlock(&ip->i_flags_lock); @@ -366,8 +409,7 @@ xfs_iget_recycle( */ ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; ip->i_flags |= XFS_INEW; - xfs_perag_clear_ici_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), - XFS_ICI_RECLAIM_TAG); + xfs_perag_clear_ici_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), tag); inode->i_state = I_NEW; ASSERT(!rwsem_is_locked(&inode->i_rwsem)); @@ -394,6 +436,13 @@ xfs_iget_check_free_state( struct xfs_inode *ip, int flags) { + /* + * Unlinked inodes awaiting inactivation must not be reused until we + * have a chance to clear the on-disk metadata. + */ + if (VFS_I(ip)->i_nlink == 0 && (ip->i_flags & XFS_NEED_INACTIVE)) + return -ENOENT; + if (flags & XFS_IGET_CREATE) { /* should be a free inode */ if (VFS_I(ip)->i_mode != 0) { @@ -460,7 +509,7 @@ xfs_iget_cache_hit( * wait_on_inode to wait for these flags to be cleared * instead of polling for it. */ - if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { + if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING)) { trace_xfs_iget_skip(ip); XFS_STATS_INC(mp, xs_ig_frecycle); error = -EAGAIN; @@ -487,11 +536,28 @@ xfs_iget_cache_hit( goto out_error; } - error = xfs_iget_recycle(pag, ip); + error = xfs_iget_recycle(pag, ip, XFS_IRECLAIM); if (error) { trace_xfs_iget_reclaim_fail(ip); return error; } + } else if (ip->i_flags & XFS_NEED_INACTIVE) { + /* + * If NEED_INACTIVE is set, we've torn down the VFS inode + * already, and must carefully restore it to usable state. + */ + trace_xfs_iget_inactive(ip); + + if (flags & XFS_IGET_INCORE) { + error = -EAGAIN; + goto out_error; + } + + error = xfs_iget_recycle(pag, ip, XFS_INACTIVATING); + if (error) { + trace_xfs_iget_inactive_fail(ip); + return error; + } } else { /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { @@ -774,10 +840,51 @@ xfs_icache_inode_is_allocated( */ #define XFS_LOOKUP_BATCH 32 +/* + * Decide if the given @ip is eligible for inactivation, and grab it if so. + * Returns true if it's ready to go or false if we should just ignore it. + */ +STATIC bool +xfs_inodegc_grab( + struct xfs_inode *ip) +{ + ASSERT(rcu_read_lock_held()); + + /* Check for stale RCU freed inode */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock_noent; + + /* + * Skip inodes that don't need inactivation or are being inactivated + * (or reactivated) by another thread. Inodes should not be tagged + * for inactivation while also in INEW or any reclaim state. 
+	 */
+	if (!(ip->i_flags & XFS_NEED_INACTIVE) ||
+	    (ip->i_flags & XFS_INACTIVATING))
+		goto out_unlock_noent;
+
+	/*
+	 * Mark this inode as being inactivated even if the fs is shut down
+	 * because we need xfs_inodegc_inactivate to push this inode into the
+	 * reclaim state.
+	 */
+	ip->i_flags |= XFS_INACTIVATING;
+	spin_unlock(&ip->i_flags_lock);
+	return true;
+
+out_unlock_noent:
+	spin_unlock(&ip->i_flags_lock);
+	return false;
+}
+
 /* Don't try to run block gc on an inode that's in any of these states. */
 #define XFS_BLOCKGC_INELIGIBLE_IFLAGS	(XFS_INEW | \
+					 XFS_NEED_INACTIVE | \
+					 XFS_INACTIVATING | \
					 XFS_IRECLAIMABLE | \
					 XFS_IRECLAIM)
+
 /*
  * Decide if the given @ip is eligible for garbage collection of speculative
  * preallocations, and grab it if so.  Returns true if it's ready to go or
@@ -826,6 +933,8 @@ selected_for_walk(
 		return xfs_blockgc_grab(ip);
 	case XFS_ICI_RECLAIM_TAG:
 		return xfs_reclaim_inode_grab(ip);
+	case XFS_ICI_INODEGC_TAG:
+		return xfs_inodegc_grab(ip);
 	default:
 		return false;
 	}
@@ -917,6 +1026,10 @@ xfs_inode_walk_ag(
 			xfs_reclaim_inode(batch[i], pag);
 			error = 0;
 			break;
+		case XFS_ICI_INODEGC_TAG:
+			error = xfs_inodegc_inactivate(batch[i], pag,
+					eofb);
+			break;
 		}
 		if (error == -EAGAIN) {
 			skipped++;
@@ -1053,6 +1166,7 @@ xfs_reclaim_inode(
 	xfs_iflags_clear(ip, XFS_IFLUSHING);
 reclaim:
+	trace_xfs_inode_reclaiming(ip);

 	/*
	 * Because we use RCU freeing we need to ensure the inode always appears
@@ -1645,3 +1759,180 @@ xfs_blockgc_free_quota(
 			xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
 			xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags);
 }
+
+/*
+ * Inode Inactivation and Reclamation
+ * ==================================
+ *
+ * Sometimes, inodes need to have work done on them once the last program has
+ * closed the file.  Typically this means cleaning out any leftover speculative
+ * preallocations after EOF or in the CoW fork.  For inodes that have been
+ * totally unlinked, this means unmapping data/attr/cow blocks, removing the
+ * inode from the unlinked buckets, and marking it free in the inobt and inode
+ * table.
+ *
+ * This process can generate many metadata updates, which shows up as close()
+ * and unlink() calls that take a long time.  We defer all that work to a
+ * workqueue which means that we can batch a lot of work and do it in inode
+ * order for better performance.  Furthermore, we can control the workqueue,
+ * which means that we can avoid doing inactivation work at a bad time, such as
+ * when the fs is frozen.
+ *
+ * Deferred inactivation introduces new inode flag states (NEED_INACTIVE and
+ * INACTIVATING) and adds a new INODEGC radix tree tag for fast access.
We
+ * maintain separate perag counters for both types, and move counts as inodes
+ * wander the state machine, which now works as follows:
+ *
+ * If the inode needs inactivation, we:
+ *   - Set the NEED_INACTIVE inode flag
+ *   - Increment the per-AG inactive count
+ *   - Set the ICI_INODEGC tag in the per-AG inode tree
+ *   - Set the ICI_INODEGC tag in the per-fs AG tree
+ *   - Schedule background inode inactivation
+ *
+ * If the inode does not need inactivation, we:
+ *   - Set the IRECLAIMABLE inode flag
+ *   - Increment the per-AG reclaim count
+ *   - Set the ICI_RECLAIM tag in the per-AG inode tree
+ *   - Set the ICI_RECLAIM tag in the per-fs AG tree
+ *   - Schedule background inode reclamation
+ *
+ * When it is time for background inode inactivation, we:
+ *   - Set the INACTIVATING inode flag
+ *   - Make all the on-disk updates
+ *   - Clear both INACTIVATING and NEED_INACTIVE inode flags
+ *   - Decrement the per-AG inactive count
+ *   - Clear the ICI_INODEGC tag in the per-AG inode tree
+ *   - Clear the ICI_INODEGC tag in the per-fs AG tree if we just inactivated
+ *     the last inode in the AG
+ *   - Kick the inode into reclamation per the previous paragraph
+ *
+ * When it is time for background inode reclamation, we:
+ *   - Set the IRECLAIM inode flag
+ *   - Detach all the resources and remove the inode from the per-AG inode tree
+ *   - Clear both IRECLAIM and RECLAIMABLE inode flags
+ *   - Decrement the per-AG reclaim count
+ *   - Clear the ICI_RECLAIM tag from the per-AG inode tree
+ *   - Clear the ICI_RECLAIM tag from the per-fs AG tree if we just reclaimed
+ *     the last inode in the AG
+ *
+ * When these state transitions occur, the caller must have taken the per-AG
+ * incore inode tree lock and then the inode i_flags lock, in that order.
+ */
+
+/*
+ * Free all speculative preallocations and possibly even the inode itself.
+ * This is the last chance to make changes to an otherwise unreferenced file
+ * before incore reclamation happens.
+ */
+static int
+xfs_inodegc_inactivate(
+	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
+	struct xfs_eofblocks	*eofb)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+
+	/*
+	 * Inactivation isn't supposed to run when the fs is frozen because
+	 * we don't want kernel threads to block on transaction allocation.
+	 */
+	ASSERT(mp->m_super->s_writers.frozen < SB_FREEZE_FS);
+
+	/*
+	 * Foreground threads that have hit ENOSPC or EDQUOT are allowed to
+	 * pass in an eofb structure to look for inodes to inactivate
+	 * immediately to free some resources.  If this inode isn't a match,
+	 * put it back on the shelf and move on.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!xfs_inode_matches_eofb(ip, eofb)) {
+		ip->i_flags &= ~XFS_INACTIVATING;
+		spin_unlock(&ip->i_flags_lock);
+		return 0;
+	}
+	spin_unlock(&ip->i_flags_lock);
+
+	trace_xfs_inode_inactivating(ip);
+
+	xfs_inactive(ip);
+	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+
+	/*
+	 * Move the inode from the inactivation phase to the reclamation phase
+	 * by clearing both inactivation inode state flags and marking the
+	 * inode reclaimable.  Schedule background reclaim to run later.
+ */ + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING); + ip->i_flags |= XFS_IRECLAIMABLE; + + xfs_perag_clear_ici_tag(pag, agino, XFS_ICI_INODEGC_TAG); + xfs_perag_set_ici_tag(pag, agino, XFS_ICI_RECLAIM_TAG); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + + return 0; +} + +/* Walk the fs and inactivate the inodes in them. */ +int +xfs_inodegc_free_space( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + trace_xfs_inodegc_free_space(mp, eofb, _RET_IP_); + + return xfs_inode_walk(mp, XFS_ICI_INODEGC_TAG, eofb); +} + +/* Background inode inactivation worker. */ +void +xfs_inodegc_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_inodegc_work); + int error; + + error = xfs_inodegc_free_space(mp, NULL); + if (error && error != -EAGAIN) + xfs_err(mp, "inode inactivation failed, error %d", error); + + xfs_inodegc_queue(mp); +} + +/* Force all currently queued inode inactivation work to run immediately. */ +void +xfs_inodegc_flush( + struct xfs_mount *mp) +{ + if (!xfs_inodegc_running(mp) || + !radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) + return; + + mod_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, 0); + flush_delayed_work(&mp->m_inodegc_work); +} + +/* Stop all queued inactivation work. */ +void +xfs_inodegc_stop( + struct xfs_mount *mp) +{ + clear_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags); + cancel_delayed_work_sync(&mp->m_inodegc_work); +} + +/* Schedule deferred inode inactivation work. */ +void +xfs_inodegc_start( + struct xfs_mount *mp) +{ + set_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags); + xfs_inodegc_queue(mp); +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 987267797fc4..f43a3dad5652 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -26,6 +26,8 @@ struct xfs_eofblocks { #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ /* Inode has speculative preallocations (posteof or cow) to clean. */ #define XFS_ICI_BLOCKGC_TAG 1 +/* Inode can be inactivated. */ +#define XFS_ICI_INODEGC_TAG 2 /* * Flags for xfs_iget() @@ -48,7 +50,7 @@ void xfs_reclaim_inodes(struct xfs_mount *mp); int xfs_reclaim_inodes_count(struct xfs_mount *mp); long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); -void xfs_inode_destroy(struct xfs_inode *ip); +void xfs_inode_destroy(struct xfs_inode *ip, bool need_inactive); int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, @@ -72,4 +74,36 @@ void xfs_blockgc_start(struct xfs_mount *mp); void xfs_inew_wait(struct xfs_inode *ip); +void xfs_inodegc_worker(struct work_struct *work); +void xfs_inodegc_flush(struct xfs_mount *mp); +void xfs_inodegc_stop(struct xfs_mount *mp); +void xfs_inodegc_start(struct xfs_mount *mp); +int xfs_inodegc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb); + +/* + * Process all pending inode inactivations immediately (sort of) so that a + * resource usage report will be mostly accurate with regards to files that + * have been unlinked recently. + * + * It isn't practical to maintain a count of the resources used by unlinked + * inodes to adjust the values reported by this function. Resources that are + * shared (e.g. 
reflink) when an inode is queued for inactivation cannot be + * counted towards the adjustment, and cross referencing data extents with the + * refcount btree is the only way to decide if a resource is shared. Worse, + * unsharing of any data blocks in the system requires either a second + * consultation with the refcount btree, or training users to deal with the + * free space counts possibly fluctuating upwards as inactivations occur. + * + * Hence we guard the inactivation flush with a ratelimiter so that the counts + * are not way out of whack while ignoring workloads that hammer us with statfs + * calls. + */ +static inline void +xfs_inodegc_summary_flush( + struct xfs_mount *mp) +{ + if (__ratelimit(&mp->m_inodegc_ratelimit)) + xfs_inodegc_flush(mp); +} + #endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 28873453c0ea..52a7a0a5894c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1678,6 +1678,55 @@ xfs_inactive_ifree( return 0; } +/* + * Returns true if we need to update the on-disk metadata before we can free + * the memory used by this inode. Updates include freeing post-eof + * preallocations; freeing COW staging extents; and marking the inode free in + * the inobt if it is on the unlinked list. + */ +bool +xfs_inode_needs_inactivation( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + + /* + * If the inode is already free, then there can be nothing + * to clean up here. + */ + if (VFS_I(ip)->i_mode == 0) + return false; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return false; + + /* Metadata inodes require explicit resource cleanup. */ + if (xfs_is_metadata_inode(ip)) + return false; + + /* Try to clean out the cow blocks if there are any. */ + if (cow_ifp && cow_ifp->if_bytes > 0) + return true; + + /* Unlinked files must be freed. */ + if (VFS_I(ip)->i_nlink == 0) + return true; + + /* + * This file isn't being freed, so check if there are post-eof blocks + * to free. @force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with broken + * free space accounting. + * + * Note: don't bother with iolock here since lockdep complains about + * acquiring it in reclaim context. We have the only reference to the + * inode at this point anyways. + */ + return xfs_can_free_eofblocks(ip, true); +} + /* * xfs_inactive * @@ -1688,7 +1737,7 @@ xfs_inactive_ifree( */ void xfs_inactive( - xfs_inode_t *ip) + struct xfs_inode *ip) { struct xfs_mount *mp; int error; @@ -1714,6 +1763,11 @@ xfs_inactive( if (xfs_is_metadata_inode(ip)) return; + /* Ensure dquots are attached prior to making changes to this file. */ + error = xfs_qm_dqattach(ip); + if (error) + return; + /* Try to clean out the cow blocks if there are any. 
*/ if (xfs_inode_has_cow_data(ip)) xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); @@ -1739,10 +1793,6 @@ xfs_inactive( ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; - error = xfs_qm_dqattach(ip); - if (error) - return; - if (S_ISLNK(VFS_I(ip)->i_mode)) error = xfs_inactive_symlink(ip); else if (truncate) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index d3df82795154..8beca572adf8 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -234,6 +234,7 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) #define XFS_IDMAPI (1 << 4) /* has legacy DMAPI fields set */ #define __XFS_INEW_BIT 3 /* inode has just been allocated */ #define XFS_INEW (1 << __XFS_INEW_BIT) +#define XFS_NEED_INACTIVE (1 << 4) /* see XFS_INACTIVATING below */ #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ #define XFS_IFLUSHING (1 << 7) /* inode is being flushed */ @@ -248,6 +249,15 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) #define XFS_IRECOVERY (1 << 11) #define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */ +/* + * If we need to update on-disk metadata before this IRECLAIMABLE inode can be + * freed, then NEED_INACTIVE will be set. Once we start the updates, the + * INACTIVATING bit will be set to keep iget away from this inode. After the + * inactivation completes, both flags will be cleared and the inode is a + * plain old IRECLAIMABLE inode. + */ +#define XFS_INACTIVATING (1 << 13) + /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during * inode lookup. This prevents unintended behaviour on the new inode from @@ -255,7 +265,8 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) */ #define XFS_IRECLAIM_RESET_FLAGS \ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ - XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) + XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \ + XFS_INACTIVATING) /* * Flags for inode locking. @@ -494,6 +505,8 @@ extern struct kmem_zone *xfs_inode_zone; /* The default CoW extent size hint. */ #define XFS_DEFAULT_COWEXTSZ_HINT 32 +bool xfs_inode_needs_inactivation(struct xfs_inode *ip); + int xfs_iunlink_init(struct xfs_perag *pag); void xfs_iunlink_destroy(struct xfs_perag *pag); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e5dd1c0c2f03..dd89eb5743b4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2787,6 +2787,13 @@ xlog_recover_process_iunlinks( } xfs_buf_rele(agibp); } + + /* + * Flush the pending unlinked inodes to ensure that the inactivations + * are fully completed on disk and the incore inodes can be reclaimed + * before we signal that recovery is complete. + */ + xfs_inodegc_flush(mp); } STATIC void diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 1c97b155a8ee..f95f913bff38 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -640,6 +640,10 @@ xfs_check_summary_counts( * so we need to unpin them, write them back and/or reclaim them before unmount * can proceed. * + * The caller should start the process by flushing queued inactivation work so + * that all file updates to on-disk metadata can be flushed with the log. + * After the AIL push, all inodes should be ready for reclamation. + * * An inode cluster that has been freed can have its buffer still pinned in * memory because the transaction is still sitting in a iclog. 
The stale inodes * on that buffer will be pinned to the buffer until the transaction hits the @@ -670,6 +674,7 @@ xfs_unmount_flush_inodes( mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_ail_push_all_sync(mp->m_ail); + xfs_inodegc_stop(mp); cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp); xfs_health_unmount(mp); @@ -894,6 +899,13 @@ xfs_mountfs( if (error) goto out_log_dealloc; + /* Don't allow statfs to hammer us with inodegc flushes. */ + ratelimit_state_init(&mp->m_inodegc_ratelimit, 5 * HZ, 10); + ratelimit_set_flags(&mp->m_inodegc_ratelimit, RATELIMIT_MSG_ON_RELEASE); + + /* Enable background workers. */ + xfs_inodegc_start(mp); + /* * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. @@ -1049,9 +1061,9 @@ xfs_mountfs( /* Clean out dquots that might be in memory after quotacheck. */ xfs_qm_unmount(mp); /* - * Flush all inode reclamation work and flush the log. - * We have to do this /after/ rtunmount and qm_unmount because those - * two will have scheduled delayed reclaim for the rt/quota inodes. + * Flush all inode reclamation work and flush inodes to the log. Do + * this after rtunmount and qm_unmount because those two will have + * released the rt and quota inodes. * * This is slightly different from the unmountfs call sequence * because we could be tearing down a partially set up mount. In @@ -1059,6 +1071,7 @@ xfs_mountfs( * qm_unmount_quotas and therefore rely on qm_unmount to release the * quota inodes. */ + xfs_inodegc_flush(mp); xfs_unmount_flush_inodes(mp); out_log_dealloc: xfs_log_mount_cancel(mp); @@ -1095,6 +1108,12 @@ xfs_unmountfs( uint64_t resblks; int error; + /* + * Flush all the queued inode inactivation work to disk before tearing + * down rt metadata and quotas. + */ + xfs_inodegc_flush(mp); + xfs_blockgc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 81829d19596e..83b8f86448f4 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -153,6 +153,8 @@ typedef struct xfs_mount { uint8_t m_rt_checked; uint8_t m_rt_sick; + unsigned long m_opflags; + /* * End of read-mostly variables. Frequently written variables and locks * should be placed below this comment from now on. The first variable @@ -177,6 +179,7 @@ typedef struct xfs_mount { uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct delayed_work m_inodegc_work; /* background inode inactive */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; struct xfs_kobj m_error_meta_kobj; @@ -213,6 +216,7 @@ typedef struct xfs_mount { unsigned int *m_errortag; struct xfs_kobj m_errortag_kobj; #endif + struct ratelimit_state m_inodegc_ratelimit; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) @@ -251,6 +255,11 @@ typedef struct xfs_mount { #define XFS_MOUNT_DAX_ALWAYS (1ULL << 26) #define XFS_MOUNT_DAX_NEVER (1ULL << 27) +#define XFS_OPFLAG_INODEGC_RUNNING_BIT (0) /* are we allowed to start the + inode inactivation worker? */ +#define XFS_OPFLAG_INACTIVATE_NOW_BIT (1) /* force foreground inactivation + of unlinked inodes */ + /* * Max and min values for mount-option defined I/O * preallocation sizes. 
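The two m_opflags bits defined above gate all of the background machinery:
they are flipped with atomic bitops while holding no locks, and every
queueing path rechecks them.  Condensed from this patch (not new code), the
gating pattern is:

	/* Queue gc work only while the RUNNING bit is set. */
	static inline bool
	xfs_inodegc_running(struct xfs_mount *mp)
	{
		return test_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags);
	}

	/*
	 * Stopping clears the bit first so that a racing xfs_inodegc_queue
	 * call backs off, then synchronously cancels any work that already
	 * made it onto the workqueue.
	 */
	clear_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags);
	cancel_delayed_work_sync(&mp->m_inodegc_work);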
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 76efae956fa8..678013bffc14 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -113,6 +113,12 @@ xfs_qm_scall_quotaoff( return -EEXIST; error = 0; + /* + * Flush the inactive list before we turn quota off to reduce the + * amount of quotaoff work we have to do with the quotaoff mutex held. + */ + xfs_inodegc_flush(mp); + flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); /* @@ -178,6 +184,15 @@ xfs_qm_scall_quotaoff( if (error) goto out_unlock; + /* + * Force inactivations to be done in the foreground and flush any + * background inactivations that may have been scheduled since the + * previous flush. Once this is done, there won't be any inodes with + * dquots attached. + */ + set_bit(XFS_OPFLAG_INACTIVATE_NOW_BIT, &mp->m_opflags); + xfs_inodegc_flush(mp); + /* * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct * to take care of the race between dqget and quotaoff. We don't take @@ -216,6 +231,9 @@ xfs_qm_scall_quotaoff( */ xfs_qm_dqpurge_all(mp, dqtype); + /* Restart background inactivation now that all the dquots are gone. */ + clear_bit(XFS_OPFLAG_INACTIVATE_NOW_BIT, &mp->m_opflags); + /* * Transactions that had started before ACTIVE state bit was cleared * could have logged many dquots, so they'd have higher LSNs than @@ -697,6 +715,8 @@ xfs_qm_scall_getquota( struct xfs_dquot *dqp; int error; + xfs_inodegc_summary_flush(mp); + /* * Try to get the dquot. We don't want it allocated on disk, so don't * set doalloc. If it doesn't exist, we'll get ENOENT back. @@ -735,6 +755,8 @@ xfs_qm_scall_getquota_next( struct xfs_dquot *dqp; int error; + xfs_inodegc_summary_flush(mp); + error = xfs_qm_dqget_next(mp, *id, type, &dqp); if (error) return error; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f5bbfd13a956..b1b01091b673 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -628,6 +628,11 @@ xfs_check_delalloc( #define xfs_check_delalloc(ip, whichfork) do { } while (0) #endif +/* iflags that we shouldn't see before scheduling reclaim or inactivation. */ +#define XFS_IDESTROY_BAD_IFLAGS (XFS_IRECLAIMABLE | \ + XFS_IRECLAIM | \ + XFS_NEED_INACTIVE | \ + XFS_INACTIVATING) /* * Now that the generic code is guaranteed not to be accessing * the linux inode, we can inactivate and reclaim the inode. @@ -637,28 +642,45 @@ xfs_fs_destroy_inode( struct inode *inode) { struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + bool need_inactive; trace_xfs_destroy_inode(ip); ASSERT(!rwsem_is_locked(&inode->i_rwsem)); - XFS_STATS_INC(ip->i_mount, vn_rele); - XFS_STATS_INC(ip->i_mount, vn_remove); + XFS_STATS_INC(mp, vn_rele); + XFS_STATS_INC(mp, vn_remove); - xfs_inactive(ip); + need_inactive = xfs_inode_needs_inactivation(ip); + if (need_inactive && + test_bit(XFS_OPFLAG_INACTIVATE_NOW_BIT, &mp->m_opflags)) { + /* + * If we're doing foreground inactivation (i.e. quotaoff), + * take care of the inode right now and push it straight to + * reclaim. + */ + xfs_inactive(ip); + need_inactive = false; + } - if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) { + /* + * If we didn't need inactivation to free all the CoW extents and + * revoke the delalloc reservations, we shouldn't have any delalloc + * blocks left. 
+ */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && !need_inactive && + ip->i_delayed_blks) { xfs_check_delalloc(ip, XFS_DATA_FORK); xfs_check_delalloc(ip, XFS_COW_FORK); ASSERT(0); } - XFS_STATS_INC(ip->i_mount, vn_reclaim); + XFS_STATS_INC(mp, vn_reclaim); /* - * We should never get here with one of the reclaim flags already set. + * We should never get here with any of the reclaim flags already set. */ - ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IDESTROY_BAD_IFLAGS)); /* * We always use background reclaim here because even if the inode is @@ -667,7 +689,7 @@ xfs_fs_destroy_inode( * reclaim path handles this more efficiently than we can here, so * simply let background reclaim tear down all inodes. */ - xfs_inode_destroy(ip); + xfs_inode_destroy(ip, need_inactive); } static void @@ -779,6 +801,21 @@ xfs_fs_sync_fs( flush_delayed_work(&mp->m_log->l_work); } + /* + * If the fs is at FREEZE_PAGEFAULTS, that means the VFS holds the + * umount mutex and is syncing the filesystem just before setting the + * state to FREEZE_FS. We are not allowed to run transactions on a + * filesystem that is in FREEZE_FS state, so deactivate the background + * workers before we get there, and leave them off for the duration of + * the freeze. + * + * We can't do this in xfs_fs_freeze_super because freeze_super takes + * s_umount, which means we can't lock out a concurrent thaw request + * without adding another layer of locks to the freeze process. + */ + if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) + xfs_inodegc_stop(mp); + return 0; } @@ -797,6 +834,8 @@ xfs_fs_statfs( xfs_extlen_t lsize; int64_t ffree; + xfs_inodegc_summary_flush(mp); + statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; @@ -907,10 +946,27 @@ xfs_fs_unfreeze( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); + xfs_inodegc_start(mp); xfs_blockgc_start(mp); return 0; } +STATIC int +xfs_fs_freeze_super( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + /* + * Before we take s_umount to get to FREEZE_WRITE, flush all the + * accumulated background work so that there's less recovery work + * to do if we crash during the freeze. + */ + xfs_inodegc_flush(mp); + + return freeze_super(sb); +} + /* * This function fills in xfs_mount_t fields based on mount args. * Note: the superblock _has_ now been read in. @@ -1089,6 +1145,7 @@ static const struct super_operations xfs_super_operations = { .show_options = xfs_fs_show_options, .nr_cached_objects = xfs_fs_nr_cached_objects, .free_cached_objects = xfs_fs_free_cached_objects, + .freeze_super = xfs_fs_freeze_super, }; static int @@ -1736,6 +1793,13 @@ xfs_remount_ro( return error; } + /* + * Perform all on-disk metadata updates required to inactivate inodes. + * Since this can involve finobt updates, do it now before we lose the + * per-AG space reservations to guarantee that we won't fail there. + */ + xfs_inodegc_flush(mp); + /* Free the per-AG metadata reservation pool. 
*/ error = xfs_fs_unreserve_ag_blocks(mp); if (error) { @@ -1859,6 +1923,7 @@ static int xfs_init_fs_context( mutex_init(&mp->m_growlock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + INIT_DELAYED_WORK(&mp->m_inodegc_work, xfs_inodegc_worker); mp->m_kobj.kobject.kset = xfs_kset; /* * We don't create the finobt per-ag space reservation until after log diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a929ebef89ec..81f65a77d5ce 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -615,14 +615,17 @@ DECLARE_EVENT_CLASS(xfs_inode_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) + __field(unsigned long, iflags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; + __entry->iflags = ip->i_flags; ), - TP_printk("dev %d:%d ino 0x%llx", + TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino) + __entry->ino, + __entry->iflags) ) #define DEFINE_INODE_EVENT(name) \ @@ -632,6 +635,8 @@ DEFINE_EVENT(xfs_inode_class, name, \ DEFINE_INODE_EVENT(xfs_iget_skip); DEFINE_INODE_EVENT(xfs_iget_reclaim); DEFINE_INODE_EVENT(xfs_iget_reclaim_fail); +DEFINE_INODE_EVENT(xfs_iget_inactive); +DEFINE_INODE_EVENT(xfs_iget_inactive_fail); DEFINE_INODE_EVENT(xfs_iget_hit); DEFINE_INODE_EVENT(xfs_iget_miss); @@ -666,6 +671,10 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); +DEFINE_INODE_EVENT(xfs_inode_set_reclaimable); +DEFINE_INODE_EVENT(xfs_inode_reclaiming); +DEFINE_INODE_EVENT(xfs_inode_set_need_inactive); +DEFINE_INODE_EVENT(xfs_inode_inactivating); /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -3925,6 +3934,7 @@ DEFINE_EVENT(xfs_eofblocks_class, name, \ TP_ARGS(mp, eofb, caller_ip)) DEFINE_EOFBLOCKS_EVENT(xfs_ioc_free_eofblocks); DEFINE_EOFBLOCKS_EVENT(xfs_blockgc_free_space); +DEFINE_EOFBLOCKS_EVENT(xfs_inodegc_free_space); #endif /* _TRACE_XFS_H */ From patchwork Fri Mar 26 00:22:05 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 12165273 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 7078AC433E2 for ; Fri, 26 Mar 2021 00:22:45 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 4A3BB61A43 for ; Fri, 26 Mar 2021 00:22:45 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S230158AbhCZAWP (ORCPT ); Thu, 25 Mar 2021 20:22:15 -0400 Received: from mail.kernel.org ([198.145.29.99]:35320 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S230174AbhCZAWG (ORCPT ); Thu, 25 Mar 2021 20:22:06 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id E6476619F3; Fri, 26 Mar 2021 00:22:05 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1616718126; bh=esGM0xUAgACj770/bAMJgwSaga6rT5uwVs+5joG4Pic=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=GBlFFO8XGhmKEGJ88YaJYlrelG/i8jZwyRLY50grnrQ5YN/uXHmPD6q2LUCDZJZhH GmZi+UrJJyhXoTC+LUArVzZ7/q3rsMvMMjyeoGD6zYvXuVPbE0vhV0o5g6CqW1SviO 72BPgrjsOG4LrYHhvtjZFAhfKSq5gsXmAp1mS3wfpEgbIs8WYsHCIQHFB0UQo6QocH YCM5yz72qK/EU+rkiSI64gWpehtt7xA69XwiT7ps2XEZjHn0iC5Lqek37QhphUA6A0 i1ZyZBEbrC79O2Oo1kIcJCuSABuza1lsKYMAL41HW6Ld4bjzSaHhCiTa1C9UxiiL6j mRnwVO4H1t2ZA== Subject: [PATCH 3/9] xfs: expose sysfs knob to control inode inactivation delay From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org Date: Thu, 25 Mar 2021 17:22:05 -0700 Message-ID: <161671812555.622901.16473251954083643080.stgit@magnolia> In-Reply-To: <161671810866.622901.16520335819131743716.stgit@magnolia> References: <161671810866.622901.16520335819131743716.stgit@magnolia> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Allow administrators to control the length that we defer inode inactivation. By default we'll set the delay to 2 seconds, as an arbitrary choice between allowing for some batching of a deltree operation, and not letting too many inodes pile up in memory. Signed-off-by: Darrick J. Wong --- Documentation/admin-guide/xfs.rst | 7 +++++++ fs/xfs/xfs_globals.c | 3 +++ fs/xfs/xfs_icache.c | 3 ++- fs/xfs/xfs_linux.h | 1 + fs/xfs/xfs_sysctl.c | 9 +++++++++ fs/xfs/xfs_sysctl.h | 1 + 6 files changed, 23 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index f9b109bfc6a6..9dd62b155fda 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -277,6 +277,13 @@ The following sysctls are available for the XFS filesystem: references and returns timed-out AGs back to the free stream pool. + fs.xfs.inode_gc_delay + (Units: centiseconds Min: 0 Default: 1 Max: 360000) + The amount of time to delay cleanup work that happens after a file is + closed by all programs. This involves clearing speculative + preallocations from linked files and freeing unlinked files. A higher + value here increases batching at a risk of background work storms. 
+ fs.xfs.speculative_prealloc_lifetime (Units: seconds Min: 1 Default: 300 Max: 86400) The interval at which the background scanning for inodes diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f62fa652c2fd..8c359b0b8fd3 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -28,6 +28,9 @@ xfs_param_t xfs_params = { .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, + .inodegc_timer = { 0, 1, 3600*100}, + + /* Values below here are measured in seconds */ .blockgc_timer = { 1, 300, 3600*24}, }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 287a1500b9b2..803f1241f867 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -310,7 +310,8 @@ xfs_inodegc_queue( rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) - queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, 0); + queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, + msecs_to_jiffies(xfs_inodegc_centisecs * 10)); rcu_read_unlock(); } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 7688663b9773..3d6b0a407d52 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -99,6 +99,7 @@ typedef __u32 xfs_nlink_t; #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val #define xfs_blockgc_secs xfs_params.blockgc_timer.val +#define xfs_inodegc_centisecs xfs_params.inodegc_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_set_flags_nested(sp, f) \ diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 546a6cd96729..878f31d3a587 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -176,6 +176,15 @@ static struct ctl_table xfs_table[] = { .extra1 = &xfs_params.fstrm_timer.min, .extra2 = &xfs_params.fstrm_timer.max, }, + { + .procname = "inode_gc_delay", + .data = &xfs_params.inodegc_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inodegc_timer.min, + .extra2 = &xfs_params.inodegc_timer.max + }, { .procname = "speculative_prealloc_lifetime", .data = &xfs_params.blockgc_timer.val, diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 7692e76ead33..a045c33c3d30 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -36,6 +36,7 @@ typedef struct xfs_param { xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ xfs_sysctl_val_t blockgc_timer; /* Interval between blockgc scans */ + xfs_sysctl_val_t inodegc_timer; /* Inode inactivation scan interval */ } xfs_param_t; /* From patchwork Fri Mar 26 00:22:11 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 12165277 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 169E5C433E8 for ; Fri, 26 Mar 2021 00:22:46 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id D1BD8619F3 for ; Fri, 26 Mar 2021 00:22:45 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S229733AbhCZAWP (ORCPT ); Thu, 25 Mar 2021 20:22:15 -0400 Received: from mail.kernel.org ([198.145.29.99]:35342 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S230180AbhCZAWL (ORCPT ); Thu, 25 Mar 2021 20:22:11 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id 8526B619D3; Fri, 26 Mar 2021 00:22:11 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1616718131; bh=41FbphGw9grZzaFY3nyPpAWytXQLKbHPtdmBQ5/66Ug=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=KIEsQ1BKskO7iPOE2aheqB1F9iEohb8567ArL8N4AwrSvxQsd8yLbxc/iqxjdIB1/ k0WgfN9zXrUYmdCy5yM/KfrBNu8F8wAsFdq3siMNL+qVKNSa3vVtkFwpJDNq+nJtP0 WEE8yMFHGZck7wlCBGTuvDPPzVAH4Il46Dzc2ftxKMxAKSo0VzfEohFqkDGt9F9cly T++nBiNWXffBskv6NcFOQsAvr716vk4cIX+wblSOtrU+weehVMsu5wC32wBlMecN1L TnzHUIaWn+3bjXiFRU4pIOQEpzIT0nQOo10jwpmw3mZIzvOn8+IVB7PZWXGaQ+yahB oKi83mfkZMCTg== Subject: [PATCH 4/9] xfs: force inode inactivation and retry fs writes when there isn't space From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org Date: Thu, 25 Mar 2021 17:22:11 -0700 Message-ID: <161671813115.622901.6609359614387194771.stgit@magnolia> In-Reply-To: <161671810866.622901.16520335819131743716.stgit@magnolia> References: <161671810866.622901.16520335819131743716.stgit@magnolia> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Any time we try to modify a file's contents and it fails due to ENOSPC or EDQUOT, force inode inactivation work to try to free space. Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 803f1241f867..b0293ab55385 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1690,9 +1690,15 @@ xfs_blockgc_free_space( struct xfs_mount *mp, struct xfs_eofblocks *eofb) { + int error; + trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_); - return xfs_inode_walk(mp, XFS_ICI_BLOCKGC_TAG, eofb); + error = xfs_inode_walk(mp, XFS_ICI_BLOCKGC_TAG, eofb); + if (error) + return error; + + return xfs_inodegc_free_space(mp, eofb); } /* From patchwork Fri Mar 26 00:22:16 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
From patchwork Fri Mar 26 00:22:16 2021
X-Patchwork-Submitter: "Darrick J. Wong"
X-Patchwork-Id: 12165281
Subject: [PATCH 5/9] xfs: force inode garbage collection before fallocate when space is low
From: "Darrick J. Wong"
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org
Date: Thu, 25 Mar 2021 17:22:16 -0700
Message-ID: <161671813674.622901.3730021302919248675.stgit@magnolia>
In-Reply-To: <161671810866.622901.16520335819131743716.stgit@magnolia>
References: <161671810866.622901.16520335819131743716.stgit@magnolia>

From: Darrick J. Wong

Generally speaking, when a user calls fallocate, they're looking to
preallocate space in a file in the largest contiguous chunks possible.
If free space is low, it's possible that the free space will look
unnecessarily fragmented because there are unlinked inodes that are
holding on to space that we could allocate.  When this happens,
fallocate makes suboptimal allocation decisions for the sake of deleted
files, which doesn't make much sense, so scan the filesystem for dead
items to delete to try to avoid this.

Note that there are a handful of fstests that fill a filesystem, delete
just enough files to allow a single large allocation, and check that
fallocate actually gets the allocation.  These tests regress because
they run fallocate before the inode gc has had a chance to run, so this
scan preserves as much of the old behavior as possible.

Signed-off-by: Darrick J. Wong
---
 fs/xfs/xfs_bmap_util.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index e79e3d1ff38d..507d18418142 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -28,6 +28,7 @@
 #include "xfs_icache.h"
 #include "xfs_iomap.h"
 #include "xfs_reflink.h"
+#include "xfs_sb.h"

 /* Kernel only BMAP related definitions and functions */

@@ -727,6 +728,44 @@ xfs_free_eofblocks(
 	return error;
 }

+/*
+ * If we suspect that the target device is full enough that it isn't going to
+ * be able to satisfy the entire request, try a non-sync inode inactivation
+ * scan to free up space.  While it's perfectly fine to fill a preallocation
+ * request with a bunch of short extents, we'd prefer to do the inactivation
+ * work now to combat long term fragmentation in new file data.  This is
+ * purely an optimization, so we don't take any blocking locks and we only
+ * look for space that is already on the reclaim list (i.e. we don't zap
+ * speculative preallocations).
+ */
+static int
+xfs_alloc_reclaim_inactive_space(
+	struct xfs_mount	*mp,
+	bool			is_rt,
+	xfs_filblks_t		allocatesize_fsb)
+{
+	struct xfs_perag	*pag;
+	struct xfs_sb		*sbp = &mp->m_sb;
+	xfs_extlen_t		free;
+	xfs_agnumber_t		agno;
+
+	if (is_rt) {
+		if (sbp->sb_frextents * sbp->sb_rextsize >= allocatesize_fsb)
+			return 0;
+	} else {
+		for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+			pag = xfs_perag_get(mp, agno);
+			free = pag->pagf_freeblks;
+			xfs_perag_put(pag);
+
+			if (free >= allocatesize_fsb)
+				return 0;
+		}
+	}
+
+	return xfs_inodegc_free_space(mp, NULL);
+}
+
 int
 xfs_alloc_file_space(
 	struct xfs_inode	*ip,
@@ -811,6 +850,11 @@ xfs_alloc_file_space(
 			rblocks = 0;
 		}

+		error = xfs_alloc_reclaim_inactive_space(mp, rt,
+				allocatesize_fsb);
+		if (error)
+			break;
+
 		/*
 		 * Allocate and setup the transaction.
 		 */
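[Editor's example: nothing changes at the syscall boundary; the scan
runs inside the kernel before the allocation proceeds. For reference,
this is the userspace call whose allocations the patch tries to keep
contiguous. A minimal sketch; build with -D_GNU_SOURCE on Linux.]

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("testfile", O_CREAT | O_RDWR, 0644);

		/* preallocate 1 GiB; may now trigger an inactivation scan */
		if (fd < 0 || fallocate(fd, 0, 0, 1 << 30) < 0) {
			perror("fallocate");	/* ENOSPC if gc couldn't help */
			return 1;
		}
		close(fd);
		return 0;
	}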
From patchwork Fri Mar 26 00:22:22 2021
X-Patchwork-Submitter: "Darrick J. Wong"
X-Patchwork-Id: 12165285
Subject: [PATCH 6/9] xfs: parallelize inode inactivation
From: "Darrick J. Wong"
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org
Date: Thu, 25 Mar 2021 17:22:22 -0700
Message-ID: <161671814229.622901.8483806808388153781.stgit@magnolia>
In-Reply-To: <161671810866.622901.16520335819131743716.stgit@magnolia>
References: <161671810866.622901.16520335819131743716.stgit@magnolia>

From: Darrick J. Wong

Split the inode inactivation work into per-AG work items so that we can
take advantage of parallelization.

Signed-off-by: Darrick J. Wong
---
 fs/xfs/xfs_icache.c | 53 +++++++++++++++++++++++++++++++++++++--------------
 fs/xfs/xfs_mount.c  |  3 +++
 fs/xfs/xfs_mount.h  |  4 +++-
 fs/xfs/xfs_super.c  |  1 -
 4 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index b0293ab55385..00d614730b2c 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -32,7 +32,7 @@ static int xfs_blockgc_scan_inode(struct xfs_inode *ip,
 static inline void xfs_blockgc_queue(struct xfs_perag *pag);
 static bool xfs_reclaim_inode_grab(struct xfs_inode *ip);
 static void xfs_reclaim_inode(struct xfs_inode *ip, struct xfs_perag *pag);
-static inline void xfs_inodegc_queue(struct xfs_mount *mp);
+static inline void xfs_inodegc_queue(struct xfs_perag *pag);
 static int xfs_inodegc_inactivate(struct xfs_inode *ip, struct xfs_perag *pag,
 		struct xfs_eofblocks *eofb);

@@ -202,7 +202,7 @@ xfs_perag_set_ici_tag(
 		xfs_blockgc_queue(pag);
 		break;
 	case XFS_ICI_INODEGC_TAG:
-		xfs_inodegc_queue(mp);
+		xfs_inodegc_queue(pag);
 		break;
 	}

@@ -303,14 +303,16 @@ xfs_inodegc_running(struct xfs_mount *mp)

 /* Queue a new inode gc pass if there are inodes needing inactivation. */
 static void
 xfs_inodegc_queue(
-	struct xfs_mount	*mp)
+	struct xfs_perag	*pag)
 {
+	struct xfs_mount	*mp = pag->pag_mount;
+
 	if (!xfs_inodegc_running(mp))
 		return;

 	rcu_read_lock();
 	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG))
-		queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work,
+		queue_delayed_work(mp->m_gc_workqueue, &pag->pag_inodegc_work,
 				msecs_to_jiffies(xfs_inodegc_centisecs * 10));
 	rcu_read_unlock();
 }

@@ -1902,28 +1904,38 @@ void
 xfs_inodegc_worker(
 	struct work_struct	*work)
 {
-	struct xfs_mount	*mp = container_of(to_delayed_work(work),
-			struct xfs_mount, m_inodegc_work);
+	struct xfs_perag	*pag = container_of(to_delayed_work(work),
+			struct xfs_perag, pag_inodegc_work);
+	struct xfs_mount	*mp = pag->pag_mount;
 	int			error;

-	error = xfs_inodegc_free_space(mp, NULL);
+	error = xfs_inode_walk_ag(pag, XFS_ICI_INODEGC_TAG, NULL);
 	if (error && error != -EAGAIN)
 		xfs_err(mp, "inode inactivation failed, error %d", error);

-	xfs_inodegc_queue(mp);
+	xfs_inodegc_queue(pag);
 }

-/* Force all currently queued inode inactivation work to run immediately. */
+/* Force all queued inode inactivation work to run immediately. */
 void
 xfs_inodegc_flush(
 	struct xfs_mount	*mp)
 {
-	if (!xfs_inodegc_running(mp) ||
-	    !radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG))
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+	bool			queued = false;
+
+	if (!xfs_inodegc_running(mp))
+		return;
+
+	for_each_perag_tag(mp, agno, pag, XFS_ICI_INODEGC_TAG) {
+		mod_delayed_work(mp->m_gc_workqueue, &pag->pag_inodegc_work, 0);
+		queued = true;
+	}
+	if (!queued)
 		return;

-	mod_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, 0);
-	flush_delayed_work(&mp->m_inodegc_work);
+	flush_workqueue(mp->m_gc_workqueue);
 }

 /* Stop all queued inactivation work. */
@@ -1931,8 +1943,15 @@ void
 xfs_inodegc_stop(
 	struct xfs_mount	*mp)
 {
+	xfs_agnumber_t		agno;
+
 	clear_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags);
-	cancel_delayed_work_sync(&mp->m_inodegc_work);
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		struct xfs_perag	*pag = xfs_perag_get(mp, agno);
+
+		cancel_delayed_work_sync(&pag->pag_inodegc_work);
+		xfs_perag_put(pag);
+	}
 }

 /* Schedule deferred inode inactivation work. */
@@ -1940,6 +1959,10 @@ void
 xfs_inodegc_start(
 	struct xfs_mount	*mp)
 {
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+
 	set_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags);
-	xfs_inodegc_queue(mp);
+	for_each_perag_tag(mp, agno, pag, XFS_ICI_INODEGC_TAG)
+		xfs_inodegc_queue(pag);
 }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f95f913bff38..c6af1e848171 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -127,6 +127,7 @@ __xfs_free_perag(
 	struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);

 	ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
+	ASSERT(!delayed_work_pending(&pag->pag_inodegc_work));
 	ASSERT(atomic_read(&pag->pag_ref) == 0);
 	kmem_free(pag);
 }
@@ -148,6 +149,7 @@ xfs_free_perag(
 		ASSERT(pag);
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		cancel_delayed_work_sync(&pag->pag_blockgc_work);
+		cancel_delayed_work_sync(&pag->pag_inodegc_work);
 		xfs_iunlink_destroy(pag);
 		xfs_buf_hash_destroy(pag);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -204,6 +206,7 @@ xfs_initialize_perag(
 		pag->pag_mount = mp;
 		spin_lock_init(&pag->pag_ici_lock);
 		INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
+		INIT_DELAYED_WORK(&pag->pag_inodegc_work, xfs_inodegc_worker);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);

 		error = xfs_buf_hash_init(pag);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 83b8f86448f4..416f308aee52 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -179,7 +179,6 @@ typedef struct xfs_mount {
 	uint64_t		m_resblks_avail;/* available reserved blocks */
 	uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
-	struct delayed_work	m_inodegc_work; /* background inode inactive */
 	struct xfs_kobj		m_kobj;
 	struct xfs_kobj		m_error_kobj;
 	struct xfs_kobj		m_error_meta_kobj;
@@ -377,6 +376,9 @@ typedef struct xfs_perag {
 	/* background prealloc block trimming */
 	struct delayed_work	pag_blockgc_work;

+	/* background inode inactivation */
+	struct delayed_work	pag_inodegc_work;
+
 	/* reference count */
 	uint8_t			pagf_refcount_level;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b1b01091b673..605af79a3e88 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1923,7 +1923,6 @@ static int xfs_init_fs_context(
 	mutex_init(&mp->m_growlock);
 	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-	INIT_DELAYED_WORK(&mp->m_inodegc_work, xfs_inodegc_worker);
 	mp->m_kobj.kobject.kset = xfs_kset;
 	/*
 	 * We don't create the finobt per-ag space reservation until after log
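[Editor's example: the sharding idea, as a runnable userspace analogy.
This is not kernel code; the patch's unit of parallelism is one
delayed_work per xfs_perag on the shared gc workqueue, where each worker
walks only its own AG's radix tree. Here each "AG" gets its own thread;
build with -lpthread.]

	#include <pthread.h>
	#include <stdio.h>

	#define NR_AGS	4

	struct ag_work {
		int	agno;
		int	nr_tagged;	/* inodes tagged for inactivation */
	};

	/* One worker per AG; shards run in parallel like the per-AG works. */
	static void *ag_worker(void *arg)
	{
		struct ag_work	*w = arg;

		printf("ag %d: inactivating %d inodes\n", w->agno, w->nr_tagged);
		return NULL;
	}

	int main(void)
	{
		struct ag_work	work[NR_AGS];
		pthread_t	tid[NR_AGS];
		int		i;

		for (i = 0; i < NR_AGS; i++) {
			work[i].agno = i;
			work[i].nr_tagged = 10 * (i + 1);
			pthread_create(&tid[i], NULL, ag_worker, &work[i]);
		}
		/* joining every worker is the analogue of flush_workqueue() */
		for (i = 0; i < NR_AGS; i++)
			pthread_join(tid[i], NULL);
		return 0;
	}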
From patchwork Fri Mar 26 00:22:27 2021
X-Patchwork-Submitter: "Darrick J. Wong"
X-Patchwork-Id: 12165287
Subject: [PATCH 7/9] xfs: create a polled function to force inode inactivation
From: "Darrick J. Wong"
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org
Date: Thu, 25 Mar 2021 17:22:27 -0700
Message-ID: <161671814786.622901.3638495557270241904.stgit@magnolia>
In-Reply-To: <161671810866.622901.16520335819131743716.stgit@magnolia>
References: <161671810866.622901.16520335819131743716.stgit@magnolia>

From: Darrick J. Wong

Create a polled version of xfs_inodegc_flush so that we can force
inactivation while holding a lock (usually the umount lock) without
tripping over the softlockup timer.  This is for callers that hold VFS
locks while calling inactivation, which currently means unmount, iunlink
processing during mount, and the rw->ro remount path.

Signed-off-by: Darrick J. Wong
---
 fs/xfs/xfs_icache.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_icache.h |  1 +
 fs/xfs/xfs_linux.h  |  1 +
 fs/xfs/xfs_mount.c  |  2 +-
 fs/xfs/xfs_mount.h  |  5 +++++
 fs/xfs/xfs_super.c  |  3 ++-
 6 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 00d614730b2c..23b04cfa38f3 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1894,9 +1894,13 @@ xfs_inodegc_free_space(
 	struct xfs_mount	*mp,
 	struct xfs_eofblocks	*eofb)
 {
+	int			error;
+
 	trace_xfs_inodegc_free_space(mp, eofb, _RET_IP_);

-	return xfs_inode_walk(mp, XFS_ICI_INODEGC_TAG, eofb);
+	error = xfs_inode_walk(mp, XFS_ICI_INODEGC_TAG, eofb);
+	wake_up(&mp->m_inactive_wait);
+	return error;
 }

 /* Background inode inactivation worker. */
@@ -1938,6 +1942,46 @@ xfs_inodegc_flush(
 	flush_workqueue(mp->m_gc_workqueue);
 }

+/*
+ * Force all queued inode inactivation work to run immediately, and poll
+ * until the work is complete.  Callers should only use this function if they
+ * must inactivate inodes while holding VFS mutexes.
+ */
+void
+xfs_inodegc_flush_poll(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+	bool			queued = false;
+
+	/*
+	 * Force to the foreground any inode inactivations that may happen
+	 * while we're running our polling loop.
+	 */
+	set_bit(XFS_OPFLAG_INACTIVATE_NOW_BIT, &mp->m_opflags);
+
+	for_each_perag_tag(mp, agno, pag, XFS_ICI_INODEGC_TAG) {
+		mod_delayed_work(mp->m_gc_workqueue, &pag->pag_inodegc_work, 0);
+		queued = true;
+	}
+	if (!queued)
+		goto clear;
+
+	/*
+	 * Touch the softlockup watchdog every 1/10th of a second while there
+	 * are still inactivation-tagged inodes in the filesystem.
+	 */
+	while (!wait_event_timeout(mp->m_inactive_wait,
+			!radix_tree_tagged(&mp->m_perag_tree,
+				XFS_ICI_INODEGC_TAG),
+			HZ / 10)) {
+		touch_softlockup_watchdog();
+	}
+clear:
+	clear_bit(XFS_OPFLAG_INACTIVATE_NOW_BIT, &mp->m_opflags);
+}
+
 /* Stop all queued inactivation work. */
 void
 xfs_inodegc_stop(
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index f43a3dad5652..a1230ebcea3e 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -76,6 +76,7 @@ void xfs_inew_wait(struct xfs_inode *ip);

 void xfs_inodegc_worker(struct work_struct *work);
 void xfs_inodegc_flush(struct xfs_mount *mp);
+void xfs_inodegc_flush_poll(struct xfs_mount *mp);
 void xfs_inodegc_stop(struct xfs_mount *mp);
 void xfs_inodegc_start(struct xfs_mount *mp);
 int xfs_inodegc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb);
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 3d6b0a407d52..9b6f519a9a43 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -61,6 +61,7 @@ typedef __u32 xfs_nlink_t;
 #include
 #include
 #include
+#include
 #include
 #include
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c6af1e848171..5db2a65b3729 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1115,7 +1115,7 @@ xfs_unmountfs(
 	 * Flush all the queued inode inactivation work to disk before tearing
 	 * down rt metadata and quotas.
 	 */
-	xfs_inodegc_flush(mp);
+	xfs_inodegc_flush_poll(mp);

 	xfs_blockgc_stop(mp);
 	xfs_fs_unreserve_ag_blocks(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 416f308aee52..e910e5734026 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -216,6 +216,11 @@ typedef struct xfs_mount {
 	struct xfs_kobj		m_errortag_kobj;
 #endif
 	struct ratelimit_state	m_inodegc_ratelimit;
+	/*
+	 * Use this to wait for the inode inactivation workqueue to finish
+	 * inactivating all the inodes.
+	 */
+	struct wait_queue_head	m_inactive_wait;
 } xfs_mount_t;

 #define M_IGEO(mp)		(&(mp)->m_ino_geo)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 605af79a3e88..de44e9843558 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1798,7 +1798,7 @@ xfs_remount_ro(
 	 * Since this can involve finobt updates, do it now before we lose the
 	 * per-AG space reservations to guarantee that we won't fail there.
 	 */
-	xfs_inodegc_flush(mp);
+	xfs_inodegc_flush_poll(mp);

 	/* Free the per-AG metadata reservation pool. */
 	error = xfs_fs_unreserve_ag_blocks(mp);
@@ -1924,6 +1924,7 @@ static int xfs_init_fs_context(
 	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 	mp->m_kobj.kobject.kset = xfs_kset;
+	init_waitqueue_head(&mp->m_inactive_wait);
 	/*
 	 * We don't create the finobt per-ag space reservation until after log
 	 * recovery, so we must set this to true so that an ifree transaction
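[Editor's example: the polling construct is the interesting part. Rather
than sleeping in flush_workqueue() with a VFS mutex held, the caller
waits in bounded slices and pets the softlockup watchdog between them.
The same shape in a runnable userspace analogy, not the kernel code:]

	#include <stdatomic.h>
	#include <stdio.h>
	#include <time.h>

	static atomic_int work_pending = 5;	/* stand-in for tagged inodes */

	int main(void)
	{
		/* ~HZ/10, matching the kernel loop's wait granularity */
		struct timespec tick = { .tv_sec = 0, .tv_nsec = 100000000 };

		while (atomic_load(&work_pending) > 0) {
			nanosleep(&tick, NULL);
			/* stand-in for the gc workers draining the queue */
			atomic_fetch_sub(&work_pending, 1);
			printf("still waiting, petting the watchdog\n");
		}
		printf("all inactivation work complete\n");
		return 0;
	}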
From patchwork Fri Mar 26 00:22:33 2021
X-Patchwork-Submitter: "Darrick J. Wong"
X-Patchwork-Id: 12165283
Subject: [PATCH 8/9] xfs: add inode scan limits to the eofblocks ioctl
From: "Darrick J. Wong"
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org
Date: Thu, 25 Mar 2021 17:22:33 -0700
Message-ID: <161671815341.622901.4857956893537074437.stgit@magnolia>
In-Reply-To: <161671810866.622901.16520335819131743716.stgit@magnolia>
References: <161671810866.622901.16520335819131743716.stgit@magnolia>

From: Darrick J. Wong

Allow callers of the userspace eofblocks ioctl to set a limit on the
number of inodes to scan, and then plumb that through the interface.
This removes a minor wart from the internal inode walk interface.

Signed-off-by: Darrick J. Wong
---
 fs/xfs/libxfs/xfs_fs.h | 7 +++++--
 fs/xfs/xfs_icache.c    | 7 +++++--
 fs/xfs/xfs_icache.h    | 2 --
 fs/xfs/xfs_ioctl.c     | 9 +++++++--
 4 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 6fad140d4c8e..0b21d4b93072 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -523,7 +523,7 @@ struct xfs_fs_eofblocks {
 	uid_t		eof_uid;
 	gid_t		eof_gid;
 	prid_t		eof_prid;
-	__u32		pad32;
+	__u32		eof_limit;
 	__u64		eof_min_file_size;
 	__u64		pad64[12];
 };
@@ -537,12 +537,15 @@ struct xfs_fs_eofblocks {
 #define XFS_EOF_FLAGS_UNION		(1 << 5) /* union filter algorithm;
						  * kernel only, not included in
						  * valid mask */
+#define XFS_EOF_FLAGS_LIMIT		(1 << 6) /* scan this many inodes */
+
 #define XFS_EOF_FLAGS_VALID	\
	(XFS_EOF_FLAGS_SYNC |	\
	 XFS_EOF_FLAGS_UID |	\
	 XFS_EOF_FLAGS_GID |	\
	 XFS_EOF_FLAGS_PRID |	\
-	 XFS_EOF_FLAGS_MINFILESIZE)
+	 XFS_EOF_FLAGS_MINFILESIZE | \
+	 XFS_EOF_FLAGS_LIMIT)

 /*
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 23b04cfa38f3..e8a2e1cf7577 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1047,7 +1047,7 @@ xfs_inode_walk_ag(
 			break;
 		cond_resched();

-		if (tag == XFS_ICI_RECLAIM_TAG && eofb) {
+		if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_LIMIT)) {
 			eofb->nr_to_scan -= XFS_LOOKUP_BATCH;
 			if (eofb->nr_to_scan < 0)
 				break;
@@ -1249,7 +1249,10 @@ xfs_reclaim_inodes_nr(
 	struct xfs_mount	*mp,
 	int			nr_to_scan)
 {
-	struct xfs_eofblocks	eofb = { .nr_to_scan = nr_to_scan };
+	struct xfs_eofblocks	eofb = {
+		.eof_flags	= XFS_EOF_FLAGS_LIMIT,
+		.nr_to_scan	= nr_to_scan
+	};

 	/* kick background reclaimer and push the AIL */
 	xfs_reclaim_work_queue(mp);
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index a1230ebcea3e..0f832fa95fd4 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -15,8 +15,6 @@ struct xfs_eofblocks {
 	kgid_t		eof_gid;
 	prid_t		eof_prid;
 	__u64		eof_min_file_size;
-
-	/* Number of inodes to scan, currently limited to reclaim */
 	int		nr_to_scan;
 };

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 7f139ee442bf..b13869954846 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -2043,8 +2043,7 @@ xfs_fs_eofblocks_from_user(
 	if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
 		return -EINVAL;

-	if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
-	    memchr_inv(src->pad64, 0, sizeof(src->pad64)))
+	if (memchr_inv(src->pad64, 0, sizeof(src->pad64)))
 		return -EINVAL;

 	dst->eof_flags = src->eof_flags;
@@ -2064,6 +2063,12 @@ xfs_fs_eofblocks_from_user(
 		if (!gid_valid(dst->eof_gid))
 			return -EINVAL;
 	}
+
+	if (src->eof_flags & XFS_EOF_FLAGS_LIMIT)
+		dst->nr_to_scan = min_t(int, src->eof_limit, INT_MAX);
+	else if (src->eof_limit != 0)
+		return -EINVAL;
+
 	return 0;
 }
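[Editor's example: from userspace the new limit rides in what used to be
a padding word. A hedged sketch of a capped scan request, assuming a
kernel with this patch and headers that carry the patched struct layout;
XFS_IOC_FREE_EOFBLOCKS and XFS_EOFBLOCKS_VERSION are pre-existing, while
XFS_EOF_FLAGS_LIMIT and eof_limit are introduced here.]

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <xfs/xfs_fs.h>

	int main(void)
	{
		struct xfs_fs_eofblocks	eofb;
		int			fd = open("/mnt", O_RDONLY);

		memset(&eofb, 0, sizeof(eofb));	/* pad64 must stay zeroed */
		eofb.eof_version = XFS_EOFBLOCKS_VERSION;
		eofb.eof_flags = XFS_EOF_FLAGS_LIMIT;
		eofb.eof_limit = 1000;		/* stop after ~1000 inodes */

		if (fd < 0 || ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb) < 0) {
			perror("XFS_IOC_FREE_EOFBLOCKS");
			return 1;
		}
		close(fd);
		return 0;
	}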
From patchwork Fri Mar 26 00:22:39 2021
X-Patchwork-Submitter: "Darrick J. Wong"
X-Patchwork-Id: 12165289
Subject: [PATCH 9/9] xfs: don't run speculative preallocation gc when fs is frozen
From: "Darrick J. Wong"
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org
Date: Thu, 25 Mar 2021 17:22:39 -0700
Message-ID: <161671815908.622901.11201307813479002034.stgit@magnolia>
In-Reply-To: <161671810866.622901.16520335819131743716.stgit@magnolia>
References: <161671810866.622901.16520335819131743716.stgit@magnolia>

From: Darrick J. Wong

Now that we have the infrastructure to switch background workers on and
off at will, fix the block gc worker code so that we don't actually run
the worker when the filesystem is frozen, same as we do for deferred
inactivation.

Signed-off-by: Darrick J. Wong
---
 fs/xfs/xfs_icache.c | 28 +++++++++++++++++++++++-----
 fs/xfs/xfs_mount.c  |  1 +
 fs/xfs/xfs_mount.h  |  3 +++
 fs/xfs/xfs_super.c  |  5 +++--
 4 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index e8a2e1cf7577..75a4df687d5d 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1416,6 +1416,12 @@ xfs_inode_free_eofblocks(
 	return 0;
 }

+static inline bool
+xfs_blockgc_running(struct xfs_mount *mp)
+{
+	return test_bit(XFS_OPFLAG_BLOCKGC_RUNNING_BIT, &mp->m_opflags);
+}
+
 /*
  * Background scanning to trim preallocated space. This is queued based on the
  * 'speculative_prealloc_lifetime' tunable (5m by default).
  */
@@ -1424,6 +1430,9 @@ static inline void
 xfs_blockgc_queue(
 	struct xfs_perag	*pag)
 {
+	if (!xfs_blockgc_running(pag->pag_mount))
+		return;
+
 	rcu_read_lock();
 	if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
 		queue_delayed_work(pag->pag_mount->m_gc_workqueue,
@@ -1628,11 +1637,15 @@ void
 xfs_blockgc_stop(
 	struct xfs_mount	*mp)
 {
-	struct xfs_perag	*pag;
 	xfs_agnumber_t		agno;

-	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+	clear_bit(XFS_OPFLAG_BLOCKGC_RUNNING_BIT, &mp->m_opflags);
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		struct xfs_perag	*pag = xfs_perag_get(mp, agno);
+
 		cancel_delayed_work_sync(&pag->pag_blockgc_work);
+		xfs_perag_put(pag);
+	}
 }

 /* Enable post-EOF and CoW block auto-reclamation. */
@@ -1643,6 +1656,7 @@ xfs_blockgc_start(
 	struct xfs_perag	*pag;
 	xfs_agnumber_t		agno;

+	set_bit(XFS_OPFLAG_BLOCKGC_RUNNING_BIT, &mp->m_opflags);
 	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
 		xfs_blockgc_queue(pag);
 }
@@ -1656,6 +1670,13 @@ xfs_blockgc_scan_inode(
 	unsigned int		lockflags = 0;
 	int			error;

+	/*
+	 * Speculative preallocation gc isn't supposed to run when the fs is
+	 * frozen because we don't want kernel threads to block on transaction
+	 * allocation.
+	 */
+	ASSERT(ip->i_mount->m_super->s_writers.frozen < SB_FREEZE_FS);
+
 	error = xfs_inode_free_eofblocks(ip, eofb, &lockflags);
 	if (error)
 		goto unlock;
@@ -1677,13 +1698,10 @@ xfs_blockgc_worker(
 	struct xfs_mount	*mp = pag->pag_mount;
 	int			error;

-	if (!sb_start_write_trylock(mp->m_super))
-		return;
 	error = xfs_inode_walk_ag(pag, XFS_ICI_BLOCKGC_TAG, NULL);
 	if (error)
 		xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
 				pag->pag_agno, error);
-	sb_end_write(mp->m_super);
 	xfs_blockgc_queue(pag);
 }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5db2a65b3729..d454d2492f3b 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -908,6 +908,7 @@ xfs_mountfs(

 	/* Enable background workers. */
 	xfs_inodegc_start(mp);
+	xfs_blockgc_start(mp);

 	/*
 	 * Get and sanity-check the root inode.
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e910e5734026..629d971c8787 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -263,6 +263,9 @@ typedef struct xfs_mount {
 						   inode inactivation worker? */
 #define XFS_OPFLAG_INACTIVATE_NOW_BIT	(1)	/* force foreground inactivation
 						   of unlinked inodes */
+#define XFS_OPFLAG_BLOCKGC_RUNNING_BIT	(2)	/* are we allowed to start the
+						   speculative preallocation gc
+						   worker? */

 /*
  * Max and min values for mount-option defined I/O
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index de44e9843558..75f08d00815b 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -813,8 +813,10 @@ xfs_fs_sync_fs(
 	 * s_umount, which means we can't lock out a concurrent thaw request
 	 * without adding another layer of locks to the freeze process.
 	 */
-	if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT)
+	if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) {
 		xfs_inodegc_stop(mp);
+		xfs_blockgc_stop(mp);
+	}

 	return 0;
 }
@@ -931,7 +933,6 @@ xfs_fs_freeze(
 	 * set a GFP_NOFS context here to avoid recursion deadlocks.
 	 */
	flags = memalloc_nofs_save();
-	xfs_blockgc_stop(mp);
	xfs_save_resvblks(mp);
	ret = xfs_log_quiesce(mp);
	memalloc_nofs_restore(flags);
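[Editor's example: the freeze entry points that now park both gc workers
are the generic VFS ones, so userspace is unchanged. A minimal sketch
using the FIFREEZE/FITHAW ioctls against an assumed XFS mountpoint at
/mnt; requires root.]

	#include <fcntl.h>
	#include <linux/fs.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt", O_RDONLY);

		if (fd < 0 || ioctl(fd, FIFREEZE, 0) < 0) {
			perror("FIFREEZE");
			return 1;
		}
		/* fs quiesced; blockgc and inodegc workers are parked here */
		if (ioctl(fd, FITHAW, 0) < 0) {
			perror("FITHAW");
			return 1;
		}
		close(fd);
		return 0;
	}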