From patchwork Sun Jun 13 17:20:02 2021
Subject: [PATCH 01/16] xfs: refactor the inode recycling code
From: "Darrick J. Wong"
To: djwong@kernel.org
Cc: Dave Chinner, linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com
Date: Sun, 13 Jun 2021 10:20:02 -0700
Message-ID: <162360480240.1530792.10821283161255096063.stgit@locust>
In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust>

From: Darrick J. Wong

Hoist the code in xfs_iget_cache_hit that restores the VFS inode state
to an xfs_inode that was previously vfs-destroyed.  The next patch will
add a new set of state flags, so we need the helper to avoid
duplication.

Signed-off-by: Darrick J. Wong
Reviewed-by: Dave Chinner
Reviewed-by: Christoph Hellwig
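The new helper returns with both the i_flags_lock and the RCU read lock dropped, a contract the patch documents with a sparse annotation. As background only (not part of the patch), here is a minimal sketch of how `__releases()` expresses a lock-dropping contract; the `demo_*` names are hypothetical, while the annotation itself is the standard one from <linux/compiler_types.h>, checked by sparse (`make C=1`):

/*
 * Background sketch, not from this patch: __releases() documents that a
 * function exits with the named lock dropped; sparse warns when the lock
 * balance of the body disagrees with the annotation.
 */
#include <linux/spinlock.h>

struct demo_obj {
	spinlock_t	lock;
	int		busy;
};

static int demo_finish_locked(struct demo_obj *obj)
	__releases(&obj->lock)
{
	int error = obj->busy ? -EAGAIN : 0;

	/* The contract: every path out of this function unlocks. */
	spin_unlock(&obj->lock);
	return error;
}

static int demo_caller(struct demo_obj *obj)
{
	spin_lock(&obj->lock);
	/* After this call, obj->lock is no longer held. */
	return demo_finish_locked(obj);
}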
---
 fs/xfs/xfs_icache.c | 143 +++++++++++++++++++++++++++++----------------------
 fs/xfs/xfs_trace.h  |   4 +
 2 files changed, 83 insertions(+), 64 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 4e4682879bbd..37229517c8f7 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -355,14 +355,14 @@ xfs_reinit_inode(
 	struct xfs_mount	*mp,
 	struct inode		*inode)
 {
-	int		error;
-	uint32_t	nlink = inode->i_nlink;
-	uint32_t	generation = inode->i_generation;
-	uint64_t	version = inode_peek_iversion(inode);
-	umode_t		mode = inode->i_mode;
-	dev_t		dev = inode->i_rdev;
-	kuid_t		uid = inode->i_uid;
-	kgid_t		gid = inode->i_gid;
+	int			error;
+	uint32_t		nlink = inode->i_nlink;
+	uint32_t		generation = inode->i_generation;
+	uint64_t		version = inode_peek_iversion(inode);
+	umode_t			mode = inode->i_mode;
+	dev_t			dev = inode->i_rdev;
+	kuid_t			uid = inode->i_uid;
+	kgid_t			gid = inode->i_gid;
 
 	error = inode_init_always(mp->m_super, inode);
 
@@ -376,6 +376,74 @@ xfs_reinit_inode(
 	return error;
 }
 
+/*
+ * Carefully nudge an inode whose VFS state has been torn down back into a
+ * usable state.  Drops the i_flags_lock and the rcu read lock.
+ */
+static int
+xfs_iget_recycle(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip) __releases(&ip->i_flags_lock)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct inode		*inode = VFS_I(ip);
+	int			error;
+
+	trace_xfs_iget_recycle(ip);
+
+	/*
+	 * We need to make it look like the inode is being reclaimed to prevent
+	 * the actual reclaim workers from stomping over us while we recycle
+	 * the inode.  We can't clear the radix tree tag yet as it requires
+	 * pag_ici_lock to be held exclusive.
+	 */
+	ip->i_flags |= XFS_IRECLAIM;
+
+	spin_unlock(&ip->i_flags_lock);
+	rcu_read_unlock();
+
+	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
+	error = xfs_reinit_inode(mp, inode);
+	if (error) {
+		bool	wake;
+
+		/*
+		 * Re-initializing the inode failed, and we are in deep
+		 * trouble.  Try to re-add it to the reclaim list.
+		 */
+		rcu_read_lock();
+		spin_lock(&ip->i_flags_lock);
+		wake = !!__xfs_iflags_test(ip, XFS_INEW);
+		ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+		if (wake)
+			wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
+		ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+		spin_unlock(&ip->i_flags_lock);
+		rcu_read_unlock();
+
+		trace_xfs_iget_recycle_fail(ip);
+		return error;
+	}
+
+	spin_lock(&pag->pag_ici_lock);
+	spin_lock(&ip->i_flags_lock);
+
+	/*
+	 * Clear the per-lifetime state in the inode as we are now effectively
+	 * a new inode and need to return to the initial state before reuse
+	 * occurs.
+	 */
+	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+	ip->i_flags |= XFS_INEW;
+	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
+			XFS_ICI_RECLAIM_TAG);
+	inode->i_state = I_NEW;
+	spin_unlock(&ip->i_flags_lock);
+	spin_unlock(&pag->pag_ici_lock);
+
+	return 0;
+}
+
 /*
  * If we are allocating a new inode, then check what was returned is
  * actually a free, empty inode. If we are not allocating an inode,
@@ -450,7 +518,7 @@ xfs_iget_cache_hit(
 	/*
 	 * If we are racing with another cache hit that is currently
 	 * instantiating this inode or currently recycling it out of
-	 * reclaimabe state, wait for the initialisation to complete
+	 * reclaimable state, wait for the initialisation to complete
 	 * before continuing.
 	 *
 	 * XXX(hch): eventually we should do something equivalent to
@@ -472,64 +540,16 @@ xfs_iget_cache_hit(
 	if (error)
 		goto out_error;
 
-	/*
-	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
-	 * Need to carefully get it back into useable state.
-	 */
 	if (ip->i_flags & XFS_IRECLAIMABLE) {
-		trace_xfs_iget_reclaim(ip);
-
 		if (flags & XFS_IGET_INCORE) {
 			error = -EAGAIN;
 			goto out_error;
 		}
 
-		/*
-		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
-		 * from stomping over us while we recycle the inode. We can't
-		 * clear the radix tree reclaimable tag yet as it requires
-		 * pag_ici_lock to be held exclusive.
-		 */
-		ip->i_flags |= XFS_IRECLAIM;
-
-		spin_unlock(&ip->i_flags_lock);
-		rcu_read_unlock();
-
-		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
-		error = xfs_reinit_inode(mp, inode);
-		if (error) {
-			bool wake;
-
-			/*
-			 * Re-initializing the inode failed, and we are in deep
-			 * trouble.  Try to re-add it to the reclaim list.
-			 */
-			rcu_read_lock();
-			spin_lock(&ip->i_flags_lock);
-			wake = !!__xfs_iflags_test(ip, XFS_INEW);
-			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
-			if (wake)
-				wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
-			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
-			trace_xfs_iget_reclaim_fail(ip);
-			goto out_error;
-		}
-
-		spin_lock(&pag->pag_ici_lock);
-		spin_lock(&ip->i_flags_lock);
-
-		/*
-		 * Clear the per-lifetime state in the inode as we are now
-		 * effectively a new inode and need to return to the initial
-		 * state before reuse occurs.
-		 */
-		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
-		ip->i_flags |= XFS_INEW;
-		xfs_perag_clear_inode_tag(pag,
-				XFS_INO_TO_AGINO(pag->pag_mount, ino),
-				XFS_ICI_RECLAIM_TAG);
-		inode->i_state = I_NEW;
-		spin_unlock(&ip->i_flags_lock);
-		spin_unlock(&pag->pag_ici_lock);
+		/* Drops i_flags_lock and RCU read lock. */
+		error = xfs_iget_recycle(pag, ip);
+		if (error)
+			return error;
 	} else {
 		/* If the VFS inode is being torn down, pause and try again. */
 		if (!igrab(inode)) {
@@ -559,7 +579,6 @@ xfs_iget_cache_hit(
 	return error;
 }
 
-
 static int
 xfs_iget_cache_miss(
 	struct xfs_mount	*mp,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a10612155377..d0b4799ad1e6 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -630,8 +630,8 @@ DEFINE_EVENT(xfs_inode_class, name, \
 	TP_PROTO(struct xfs_inode *ip), \
 	TP_ARGS(ip))
 DEFINE_INODE_EVENT(xfs_iget_skip);
-DEFINE_INODE_EVENT(xfs_iget_reclaim);
-DEFINE_INODE_EVENT(xfs_iget_reclaim_fail);
+DEFINE_INODE_EVENT(xfs_iget_recycle);
+DEFINE_INODE_EVENT(xfs_iget_recycle_fail);
 DEFINE_INODE_EVENT(xfs_iget_hit);
 DEFINE_INODE_EVENT(xfs_iget_miss);
Wong" X-Patchwork-Id: 12317803 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 4E3A7C48BCF for ; Sun, 13 Jun 2021 17:20:09 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 23B2D61285 for ; Sun, 13 Jun 2021 17:20:09 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231976AbhFMRWJ (ORCPT ); Sun, 13 Jun 2021 13:22:09 -0400 Received: from mail.kernel.org ([198.145.29.99]:41232 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231915AbhFMRWJ (ORCPT ); Sun, 13 Jun 2021 13:22:09 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id 3274C61284; Sun, 13 Jun 2021 17:20:08 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604808; bh=QW8tbXwGzlW567uavBiUxGIgNrJH2VxU7aX8TJPbCCE=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=gAwLKqFOasZ1Ho0W4Vm5MQCHsEqmCFmx6mo0A3WbWRMvdcxRmGPZbRfpgAGFQ81Li I7840G1i+FCI5uPQ4as4HGQyqmAsLVUdjqvgk0yGP+f9WopUgEvQrjqQdArCPx7vWE k4yD7FZBLtBEGBVeOvy4k4JR0raZ3Mnd7DCPLAPli3Qn0jnYBUJsuwttJbdDGS3yfK JJInrieIhf+1bv0DqfJIjf32gdpUHwq/1DOe8sT67z/VToVY14wYTBO5u6F+sS+61Y 202vx/QVZlq1frMmodVjYlJXxxjPFQUQC8SvCvO84xqVr/TbaNAlnwZCkZskXk1FFM GPADwRGSKzkUw== Subject: [PATCH 02/16] xfs: move xfs_inactive call to xfs_inode_mark_reclaimable From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:20:07 -0700 Message-ID: <162360480791.1530792.12003297610956705274.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Move the xfs_inactive call and all the other debugging checks and stats updates into xfs_inode_mark_reclaimable because most of that are implementation details about the inode cache. This is preparation for deferred inactivation that is coming up. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_icache.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_super.c | 50 -------------------------------------------------- 2 files changed, 49 insertions(+), 50 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 37229517c8f7..a2d81331867b 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -301,6 +301,32 @@ xfs_perag_clear_inode_tag( trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); } +#ifdef DEBUG +static void +xfs_check_delalloc( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec got; + struct xfs_iext_cursor icur; + + if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got)) + return; + do { + if (isnullstartblock(got.br_startblock)) { + xfs_warn(ip->i_mount, + "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]", + ip->i_ino, + whichfork == XFS_DATA_FORK ? 
"data" : "cow", + got.br_startoff, got.br_blockcount); + } + } while (xfs_iext_next_extent(ifp, &icur, &got)); +} +#else +#define xfs_check_delalloc(ip, whichfork) do { } while (0) +#endif + /* * We set the inode flag atomically with the radix tree tag. * Once we get tag lookups on the radix tree, this inode flag @@ -313,6 +339,29 @@ xfs_inode_mark_reclaimable( struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; + xfs_inactive(ip); + + if (!XFS_FORCED_SHUTDOWN(mp) && ip->i_delayed_blks) { + xfs_check_delalloc(ip, XFS_DATA_FORK); + xfs_check_delalloc(ip, XFS_COW_FORK); + ASSERT(0); + } + + XFS_STATS_INC(mp, vn_reclaim); + + /* + * We should never get here with one of the reclaim flags already set. + */ + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); + + /* + * We always use background reclaim here because even if the inode is + * clean, it still may be under IO and hence we have wait for IO + * completion to occur before we can reclaim the inode. The background + * reclaim path handles this more efficiently than we can here, so + * simply let background reclaim tear down all inodes. + */ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3a7fd4f02aa7..dd1ee333dcb3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -603,32 +603,6 @@ xfs_fs_alloc_inode( return NULL; } -#ifdef DEBUG -static void -xfs_check_delalloc( - struct xfs_inode *ip, - int whichfork) -{ - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - struct xfs_bmbt_irec got; - struct xfs_iext_cursor icur; - - if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got)) - return; - do { - if (isnullstartblock(got.br_startblock)) { - xfs_warn(ip->i_mount, - "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]", - ip->i_ino, - whichfork == XFS_DATA_FORK ? "data" : "cow", - got.br_startoff, got.br_blockcount); - } - } while (xfs_iext_next_extent(ifp, &icur, &got)); -} -#else -#define xfs_check_delalloc(ip, whichfork) do { } while (0) -#endif - /* * Now that the generic code is guaranteed not to be accessing * the linux inode, we can inactivate and reclaim the inode. @@ -644,30 +618,6 @@ xfs_fs_destroy_inode( ASSERT(!rwsem_is_locked(&inode->i_rwsem)); XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); - - xfs_inactive(ip); - - if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) { - xfs_check_delalloc(ip, XFS_DATA_FORK); - xfs_check_delalloc(ip, XFS_COW_FORK); - ASSERT(0); - } - - XFS_STATS_INC(ip->i_mount, vn_reclaim); - - /* - * We should never get here with one of the reclaim flags already set. - */ - ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); - - /* - * We always use background reclaim here because even if the inode is - * clean, it still may be under IO and hence we have wait for IO - * completion to occur before we can reclaim the inode. The background - * reclaim path handles this more efficiently than we can here, so - * simply let background reclaim tear down all inodes. - */ xfs_inode_mark_reclaimable(ip); } From patchwork Sun Jun 13 17:20:13 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 12317805 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 1C788C48BDF for ; Sun, 13 Jun 2021 17:20:16 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id E62E061078 for ; Sun, 13 Jun 2021 17:20:15 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232007AbhFMRWQ (ORCPT ); Sun, 13 Jun 2021 13:22:16 -0400 Received: from mail.kernel.org ([198.145.29.99]:41258 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231915AbhFMRWP (ORCPT ); Sun, 13 Jun 2021 13:22:15 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id AD7B761107; Sun, 13 Jun 2021 17:20:13 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604813; bh=5ltstnOjOSXrVfpaxHSZ8nTRKPJFB1bmc7ZhGKceso0=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=dVDhCzhkUPA4wLR34Bb0zCdCCA311HUHXjiNBVg9763cfDHAbn6c6lF1C4yGkMA5e ShxaaOg5qIgLSrGWRdy2BRJr6FuefxQeKbaraH78vxh2OELmNJ6Ng4D318UBCsKXqd c3dleC0GCP1VrNya8MMcnFO4EgK5mf+4lnRvqBvWVHkxbos8k7YJ8XeD/Caj4KSEyH y0SuIGcYULg6V0cJhOzHtRc17yGdOl6PvbJ9643ojl1uOAoD9jqmunEUApt3W3a1Qz ss6/R3Klp1v2dzqqijy8dAazSOReN1ZZjmNCuxg1g9l1XYty8IcSzkJizznTLyMpD7 61Eyjo2KJX3xg== Subject: [PATCH 03/16] xfs: detach dquots from inode if we don't need to inactivate it From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:20:13 -0700 Message-ID: <162360481340.1530792.16718628800672012784.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong If we don't need to inactivate an inode, we can detach the dquots and move on to reclamation. This isn't strictly required here; it's a preparation patch for deferred inactivation. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_icache.c | 8 +++++++- fs/xfs/xfs_inode.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_inode.h | 2 ++ 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index a2d81331867b..7939eced3a47 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -338,8 +338,14 @@ xfs_inode_mark_reclaimable( { struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; + bool need_inactive = xfs_inode_needs_inactive(ip); - xfs_inactive(ip); + if (!need_inactive) { + /* Going straight to reclaim, so drop the dquots. 
---
 fs/xfs/xfs_icache.c | 8 +++++++-
 fs/xfs/xfs_inode.c  | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_inode.h  | 2 ++
 3 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index a2d81331867b..7939eced3a47 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -338,8 +338,14 @@ xfs_inode_mark_reclaimable(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_perag	*pag;
+	bool			need_inactive = xfs_inode_needs_inactive(ip);
 
-	xfs_inactive(ip);
+	if (!need_inactive) {
+		/* Going straight to reclaim, so drop the dquots. */
+		xfs_qm_dqdetach(ip);
+	} else {
+		xfs_inactive(ip);
+	}
 
 	if (!XFS_FORCED_SHUTDOWN(mp) && ip->i_delayed_blks) {
 		xfs_check_delalloc(ip, XFS_DATA_FORK);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3bee1cd20072..85b2b11b5217 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1654,6 +1654,59 @@ xfs_inactive_ifree(
 	return 0;
 }
 
+/*
+ * Returns true if we need to update the on-disk metadata before we can free
+ * the memory used by this inode.  Updates include freeing post-eof
+ * preallocations; freeing COW staging extents; and marking the inode free in
+ * the inobt if it is on the unlinked list.
+ */
+bool
+xfs_inode_needs_inactive(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+
+	/*
+	 * If the inode is already free, then there can be nothing
+	 * to clean up here.
+	 */
+	if (VFS_I(ip)->i_mode == 0)
+		return false;
+
+	/* If this is a read-only mount, don't do this (would generate I/O) */
+	if (mp->m_flags & XFS_MOUNT_RDONLY)
+		return false;
+
+	/* If the log isn't running, push inodes straight to reclaim. */
+	if (XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_NORECOVERY))
+		return false;
+
+	/* Metadata inodes require explicit resource cleanup. */
+	if (xfs_is_metadata_inode(ip))
+		return false;
+
+	/* Want to clean out the cow blocks if there are any. */
+	if (cow_ifp && cow_ifp->if_bytes > 0)
+		return true;
+
+	/* Unlinked files must be freed. */
+	if (VFS_I(ip)->i_nlink == 0)
+		return true;
+
+	/*
+	 * This file isn't being freed, so check if there are post-eof blocks
+	 * to free.  @force is true because we are evicting an inode from the
+	 * cache.  Post-eof blocks must be freed, lest we end up with broken
+	 * free space accounting.
+	 *
+	 * Note: don't bother with iolock here since lockdep complains about
+	 * acquiring it in reclaim context.  We have the only reference to the
+	 * inode at this point anyways.
+	 */
+	return xfs_can_free_eofblocks(ip, true);
+}
+
 /*
  * xfs_inactive
  *
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 4b6703dbffb8..e3137bbc7b14 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -493,6 +493,8 @@ extern struct kmem_zone *xfs_inode_zone;
 /* The default CoW extent size hint. */
 #define XFS_DEFAULT_COWEXTSZ_HINT 32
 
+bool xfs_inode_needs_inactive(struct xfs_inode *ip);
+
 int xfs_iunlink_init(struct xfs_perag *pag);
 void xfs_iunlink_destroy(struct xfs_perag *pag);
Wong" X-Patchwork-Id: 12317807 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 49AC5C48BCF for ; Sun, 13 Jun 2021 17:20:20 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 2C72661285 for ; Sun, 13 Jun 2021 17:20:20 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231997AbhFMRWU (ORCPT ); Sun, 13 Jun 2021 13:22:20 -0400 Received: from mail.kernel.org ([198.145.29.99]:41284 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231915AbhFMRWU (ORCPT ); Sun, 13 Jun 2021 13:22:20 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id 2F96761078; Sun, 13 Jun 2021 17:20:19 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604819; bh=7ZIoRfxfRm+VFi8nyVLZYOM8GZu3m/0OxVGTwbdu0/c=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=AE4u/Yk9q/Kfc6EYYmqox9IB6ZDG5z7K6hvLPXee5guwzgwIjw5tkN1u1P+iC86oP 4qtlfbM/eJ1CBRANj14yZJP2JFrVHqD96X9v/FuWH71Ti0eSfmS4I65cN/DR7DMMtI imf0oVHCCWEIftaQBxpYVw7H/fsGA0vvvQ2KeZfygE9mXN8oSpYB51SZNIkV3CZTAR esE2BjEfDQ6zGXAr/ZgS6iDKiD2Cj9FlbBJAq20Tmr+K9O3zBNipARQR7tbJ+WGLqK RKDMH4xKOrXkuh0/erPgmW4ymAxULG6HwYyB/2Nl+G/IDbLb4z7f2eu4eYNdGFQPTJ 175jaHkPt6w3w== Subject: [PATCH 04/16] xfs: clean up xfs_inactive a little bit From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:20:18 -0700 Message-ID: <162360481889.1530792.8153660904394768299.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Move the dqattach further up in xfs_inactive. In theory we should always have dquots attached if there are CoW blocks, but this makes the usage pattern more consistent with the rest of xfs (attach dquots, then start making changes). Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_inode.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 85b2b11b5217..67786814997c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1717,7 +1717,7 @@ xfs_inode_needs_inactive( */ void xfs_inactive( - xfs_inode_t *ip) + struct xfs_inode *ip) { struct xfs_mount *mp; int error; @@ -1743,6 +1743,11 @@ xfs_inactive( if (xfs_is_metadata_inode(ip)) goto out; + /* Ensure dquots are attached prior to making changes to this file. */ + error = xfs_qm_dqattach(ip); + if (error) + goto out; + /* Try to clean out the cow blocks if there are any. 
*/ if (xfs_inode_has_cow_data(ip)) xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); @@ -1768,10 +1773,6 @@ xfs_inactive( ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; - error = xfs_qm_dqattach(ip); - if (error) - goto out; - if (S_ISLNK(VFS_I(ip)->i_mode)) error = xfs_inactive_symlink(ip); else if (truncate) From patchwork Sun Jun 13 17:20:24 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 12317809 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id A4429C48BDF for ; Sun, 13 Jun 2021 17:20:25 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 8D7EA61284 for ; Sun, 13 Jun 2021 17:20:25 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232011AbhFMRW0 (ORCPT ); Sun, 13 Jun 2021 13:22:26 -0400 Received: from mail.kernel.org ([198.145.29.99]:41310 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231915AbhFMRW0 (ORCPT ); Sun, 13 Jun 2021 13:22:26 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id A770C61078; Sun, 13 Jun 2021 17:20:24 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604824; bh=eGqhITI7uxnulhLZj+DBJdw+G5jKdmTUT/agH5YTFu0=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=vD6iCBUjB6h4tvUrEHmusFL+UZKT10WwrizRffE8KK1LSplTJYMhTaxhoifeI5Ci6 DY0pDg+nEeK8RjE1wESYiYnRqJ89b6YNh+weoxh6wxYfAY7njVjbxB7I/9HtL0K9uV F7LdmrAHlIRVB+txAFQWcpH3fZdGxdYohukLRDoQH/n96usmvRW0btSNwbhbujdQ+F 88pFU38gUwl//dx+MSDmjwyc7QxMP+1o0BEtS8YXtJrFgIQwmbFl6w9z2i+RTvJCol mL9mVId+tK1Hq6dvzNz4XNV7T7T77JOLfgdH4uxI0zVig6OCVI5hBr48pm2orRm1Hk IYddPgoWRCnuw== Subject: [PATCH 05/16] xfs: separate primary inode selection criteria in xfs_iget_cache_hit From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:20:24 -0700 Message-ID: <162360482438.1530792.18197198406001465325.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong During review of the v6 deferred inode inactivation patchset[1], Dave commented that _cache_hit should have a clear separation between inode selection criteria and actions performed on a selected inode. Move a hunk to make this true, and compact the shrink cases in the function. [1] https://lore.kernel.org/linux-xfs/162310469340.3465262.504398465311182657.stgit@locust/T/#mca6d958521cb88bbc1bfe1a30767203328d410b5 Signed-off-by: Darrick J. 
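The compaction leans on the kernel's single-exit-label style: every rejected lookup now funnels through one out_skip label that does the tracing, the stats bump, and the -EAGAIN in one place. A generic kernel-style sketch of the pattern, with hypothetical demo_* names (demo_find stands in for any locked cache lookup):

#include <linux/spinlock.h>

#define DEMO_NEW	(1 << 0)
#define DEMO_TEARDOWN	(1 << 1)

struct demo_obj {
	unsigned long	flags;
	int		refcount;
};

struct demo_cache {
	spinlock_t	lock;
	unsigned long	skips;
};

/* Hypothetical lookup helper; returns NULL when the key is absent. */
static struct demo_obj *demo_find(struct demo_cache *cache, unsigned long key);

static int demo_cache_lookup(struct demo_cache *cache, unsigned long key)
{
	struct demo_obj *obj;

	spin_lock(&cache->lock);
	obj = demo_find(cache, key);
	if (!obj)
		goto out_skip;
	if (obj->flags & (DEMO_NEW | DEMO_TEARDOWN))
		goto out_skip;

	obj->refcount++;
	spin_unlock(&cache->lock);
	return 0;

out_skip:
	/* One spot for all the "try again later" bookkeeping. */
	cache->skips++;
	spin_unlock(&cache->lock);
	return -EAGAIN;
}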
---
 fs/xfs/xfs_icache.c | 39 ++++++++++++++++-----------------------
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 7939eced3a47..4002f0b84401 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -562,13 +562,8 @@ xfs_iget_cache_hit(
 	 * will not match, so check for that, too.
 	 */
 	spin_lock(&ip->i_flags_lock);
-	if (ip->i_ino != ino) {
-		trace_xfs_iget_skip(ip);
-		XFS_STATS_INC(mp, xs_ig_frecycle);
-		error = -EAGAIN;
-		goto out_error;
-	}
-
+	if (ip->i_ino != ino)
+		goto out_skip;
 
 	/*
 	 * If we are racing with another cache hit that is currently
@@ -580,12 +575,8 @@ xfs_iget_cache_hit(
 	 * wait_on_inode to wait for these flags to be cleared
 	 * instead of polling for it.
 	 */
-	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
-		trace_xfs_iget_skip(ip);
-		XFS_STATS_INC(mp, xs_ig_frecycle);
-		error = -EAGAIN;
-		goto out_error;
-	}
+	if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM))
+		goto out_skip;
 
 	/*
 	 * Check the inode free state is valid. This also detects lookup
@@ -595,23 +586,21 @@ xfs_iget_cache_hit(
 	if (error)
 		goto out_error;
 
+	/* Skip inodes that have no vfs state. */
+	if ((flags & XFS_IGET_INCORE) &&
+	    (ip->i_flags & XFS_IRECLAIMABLE))
+		goto out_skip;
+
+	/* The inode fits the selection criteria; process it. */
 	if (ip->i_flags & XFS_IRECLAIMABLE) {
-		if (flags & XFS_IGET_INCORE) {
-			error = -EAGAIN;
-			goto out_error;
-		}
-
 		/* Drops i_flags_lock and RCU read lock. */
 		error = xfs_iget_recycle(pag, ip);
 		if (error)
 			return error;
 	} else {
 		/* If the VFS inode is being torn down, pause and try again. */
-		if (!igrab(inode)) {
-			trace_xfs_iget_skip(ip);
-			error = -EAGAIN;
-			goto out_error;
-		}
+		if (!igrab(inode))
+			goto out_skip;
 
 		/* We've got a live one. */
 		spin_unlock(&ip->i_flags_lock);
@@ -628,6 +617,10 @@ xfs_iget_cache_hit(
 
 	return 0;
 
+out_skip:
+	trace_xfs_iget_skip(ip);
+	XFS_STATS_INC(mp, xs_ig_frecycle);
+	error = -EAGAIN;
 out_error:
 	spin_unlock(&ip->i_flags_lock);
 	rcu_read_unlock();
Wong" X-Patchwork-Id: 12317811 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id B7E37C48BCF for ; Sun, 13 Jun 2021 17:20:31 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id A0AD961107 for ; Sun, 13 Jun 2021 17:20:31 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232012AbhFMRWb (ORCPT ); Sun, 13 Jun 2021 13:22:31 -0400 Received: from mail.kernel.org ([198.145.29.99]:41340 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231915AbhFMRWb (ORCPT ); Sun, 13 Jun 2021 13:22:31 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id 2DD9E61078; Sun, 13 Jun 2021 17:20:30 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604830; bh=IfV9YFk5EcsUf63i2CfWUpEVDgywJOCxC8FYMcFlyQU=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=Wp6ESZXsEpqUQ2VrKs3bzv6hsRqPUOwrGTky3O4+ZOge3Lfk3WqSJvZzW0ddVx152 wc3r/T7tQsfIv2AeC3ZnNBjZc0rp0FXog6Lz96nCDEk4DgTXdUgBqOQFwLIFuI/pAG JxprZ1Y/yjnpKy48IeVRaboTDo7wWK4EF2kR/0iItD7NDCrvlTaIMo4Uz7JTLXKUG/ 3S6vti8JPL1tNoGN6NB0h3YptEhZpudKn5nn7Zmx/mq8Eco3aC+iOeFjD7Xpnx5KA7 eZDafLpA+i/22ed0zcjC15BkQy+NqVqrUL81bHVqFA4bHdMHIgbYuG2RgA3yOlmkkq oOvr5kvHKUPig== Subject: [PATCH 06/16] xfs: defer inode inactivation to a workqueue From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:20:29 -0700 Message-ID: <162360482987.1530792.9282768072804488207.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Instead of calling xfs_inactive directly from xfs_fs_destroy_inode, defer the inactivation phase to a separate workqueue. With this change, we can speed up directory tree deletions by reducing the duration of unlink() calls to the directory and unlinked list updates. By moving the inactivation work to the background, we can reduce the total cost of deleting a lot of files by performing the file deletions in disk order instead of directory entry order, which can be arbitrary. We introduce two new inode flags -- NEEDS_INACTIVE and INACTIVATING. The first flag helps our worker find inodes needing inactivation, and the second flag marks inodes that are in the process of being inactivated. A concurrent xfs_iget on the inode can still resurrect the inode by clearing NEEDS_INACTIVE (or bailing if INACTIVATING is set). Unfortunately, deferring the inactivation has one huge downside -- eventual consistency. Since all the freeing is deferred to a worker thread, one can rm a file but the space doesn't come back immediately. This can cause some odd side effects with quota accounting and statfs, so we flush inactivation work during syncfs in order to maintain the existing behaviors, at least for callers that unlink() and sync(). 
---
 Documentation/admin-guide/xfs.rst |   3 +-
 fs/xfs/scrub/common.c             |   7 +
 fs/xfs/xfs_icache.c               | 332 ++++++++++++++++++++++++++++++++++---
 fs/xfs/xfs_icache.h               |   5 +
 fs/xfs/xfs_inode.h                |  19 ++
 fs/xfs/xfs_log_recover.c          |   7 +
 fs/xfs/xfs_mount.c                |  26 +++
 fs/xfs/xfs_mount.h                |  21 ++
 fs/xfs/xfs_super.c                |  53 ++++++
 fs/xfs/xfs_trace.h                |  50 +++++-
 10 files changed, 490 insertions(+), 33 deletions(-)

diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst
index 8de008c0c5ad..f9b109bfc6a6 100644
--- a/Documentation/admin-guide/xfs.rst
+++ b/Documentation/admin-guide/xfs.rst
@@ -524,7 +524,8 @@ and the short name of the data device.  They all can be found in:
                   mount time quotacheck.
   xfs-gc          Background garbage collection of disk space that have been
                   speculatively allocated beyond EOF or for staging copy on
-                  write operations.
+                  write operations; and files that are no longer linked into
+                  the directory tree.
 ================  ===========

 For example, the knobs for the quotacheck workqueue for /dev/nvme0n1 would be
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index cadfd5799909..b674bc6ed78d 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -884,6 +884,7 @@ xchk_stop_reaping(
 	sc->flags |= XCHK_REAPING_DISABLED;
 	xfs_blockgc_stop(sc->mp);
+	xfs_inodegc_stop(sc->mp);
 }
 
 /* Restart background reaping of resources. */
@@ -891,6 +892,12 @@ void
 xchk_start_reaping(
 	struct xfs_scrub	*sc)
 {
+	/*
+	 * Readonly filesystems do not perform inactivation, so there's no
+	 * need to restart the worker.
+	 */
+	if (!(sc->mp->m_flags & XFS_MOUNT_RDONLY))
+		xfs_inodegc_start(sc->mp);
 	xfs_blockgc_start(sc->mp);
 	sc->flags &= ~XCHK_REAPING_DISABLED;
 }
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 4002f0b84401..e094c16aa8c5 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -32,6 +32,8 @@
 #define XFS_ICI_RECLAIM_TAG	0
 /* Inode has speculative preallocations (posteof or cow) to clean. */
 #define XFS_ICI_BLOCKGC_TAG	1
+/* Inode can be inactivated. */
+#define XFS_ICI_INODEGC_TAG	2
 
 /*
  * The goal for walking incore inodes.  These can correspond with incore inode
@@ -44,6 +46,7 @@ enum xfs_icwalk_goal {
 	/* Goals directly associated with tagged inodes. */
 	XFS_ICWALK_BLOCKGC	= XFS_ICI_BLOCKGC_TAG,
 	XFS_ICWALK_RECLAIM	= XFS_ICI_RECLAIM_TAG,
+	XFS_ICWALK_INODEGC	= XFS_ICI_INODEGC_TAG,
 };
 
 #define XFS_ICWALK_NULL_TAG	(-1U)
@@ -228,6 +231,26 @@ xfs_blockgc_queue(
 	rcu_read_unlock();
 }
 
+/*
+ * Queue a background inactivation worker if there are inodes that need to be
+ * inactivated and higher level xfs code hasn't disabled the background
+ * workers.
+ */
+static void
+xfs_inodegc_queue(
+	struct xfs_mount	*mp)
+{
+	if (!test_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags))
+		return;
+
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) {
+		trace_xfs_inodegc_queue(mp, 0, _RET_IP_);
+		queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, 0);
+	}
+	rcu_read_unlock();
+}
+
 /* Set a tag on both the AG incore inode tree and the AG radix tree. */
 static void
 xfs_perag_set_inode_tag(
@@ -262,6 +285,9 @@ xfs_perag_set_inode_tag(
 	case XFS_ICI_BLOCKGC_TAG:
 		xfs_blockgc_queue(pag);
 		break;
+	case XFS_ICI_INODEGC_TAG:
+		xfs_inodegc_queue(mp);
+		break;
 	}
 
 	trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
@@ -338,28 +364,26 @@ xfs_inode_mark_reclaimable(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_perag	*pag;
+	unsigned int		tag;
 	bool			need_inactive = xfs_inode_needs_inactive(ip);
 
 	if (!need_inactive) {
 		/* Going straight to reclaim, so drop the dquots. */
 		xfs_qm_dqdetach(ip);
-	} else {
-		xfs_inactive(ip);
-	}
 
-	if (!XFS_FORCED_SHUTDOWN(mp) && ip->i_delayed_blks) {
-		xfs_check_delalloc(ip, XFS_DATA_FORK);
-		xfs_check_delalloc(ip, XFS_COW_FORK);
-		ASSERT(0);
+		if (!XFS_FORCED_SHUTDOWN(mp) && ip->i_delayed_blks) {
+			xfs_check_delalloc(ip, XFS_DATA_FORK);
+			xfs_check_delalloc(ip, XFS_COW_FORK);
+			ASSERT(0);
+		}
 	}
 
 	XFS_STATS_INC(mp, vn_reclaim);
 
 	/*
-	 * We should never get here with one of the reclaim flags already set.
+	 * We should never get here with any of the reclaim flags already set.
 	 */
-	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
+	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));
 
 	/*
 	 * We always use background reclaim here because even if the inode is
@@ -372,9 +396,17 @@ xfs_inode_mark_reclaimable(
 	spin_lock(&pag->pag_ici_lock);
 	spin_lock(&ip->i_flags_lock);
 
-	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
-			XFS_ICI_RECLAIM_TAG);
-	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+	if (need_inactive) {
+		trace_xfs_inode_set_need_inactive(ip);
+		ip->i_flags |= XFS_NEED_INACTIVE;
+		tag = XFS_ICI_INODEGC_TAG;
+	} else {
+		trace_xfs_inode_set_reclaimable(ip);
+		ip->i_flags |= XFS_IRECLAIMABLE;
+		tag = XFS_ICI_RECLAIM_TAG;
+	}
+
+	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), tag);
 
 	spin_unlock(&ip->i_flags_lock);
 	spin_unlock(&pag->pag_ici_lock);
@@ -442,6 +474,7 @@ xfs_iget_recycle(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct inode		*inode = VFS_I(ip);
+	unsigned int		tag;
 	int			error;
 
 	trace_xfs_iget_recycle(ip);
@@ -452,7 +485,16 @@ xfs_iget_recycle(
 	 * the inode.  We can't clear the radix tree tag yet as it requires
 	 * pag_ici_lock to be held exclusive.
 	 */
-	ip->i_flags |= XFS_IRECLAIM;
+	if (ip->i_flags & XFS_IRECLAIMABLE) {
+		tag = XFS_ICI_RECLAIM_TAG;
+		ip->i_flags |= XFS_IRECLAIM;
+	} else if (ip->i_flags & XFS_NEED_INACTIVE) {
+		tag = XFS_ICI_INODEGC_TAG;
+		ip->i_flags |= XFS_INACTIVATING;
+	} else {
+		ASSERT(0);
+		return -EINVAL;
+	}
 
 	spin_unlock(&ip->i_flags_lock);
 	rcu_read_unlock();
@@ -469,10 +511,10 @@ xfs_iget_recycle(
 		rcu_read_lock();
 		spin_lock(&ip->i_flags_lock);
 		wake = !!__xfs_iflags_test(ip, XFS_INEW);
-		ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+		ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING);
 		if (wake)
 			wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
-		ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+		ASSERT(ip->i_flags & (XFS_IRECLAIMABLE | XFS_NEED_INACTIVE));
 		spin_unlock(&ip->i_flags_lock);
 		rcu_read_unlock();
 
@@ -490,8 +532,7 @@ xfs_iget_recycle(
 	 */
 	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
 	ip->i_flags |= XFS_INEW;
-	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
-			XFS_ICI_RECLAIM_TAG);
+	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), tag);
 	inode->i_state = I_NEW;
 	spin_unlock(&ip->i_flags_lock);
 	spin_unlock(&pag->pag_ici_lock);
@@ -575,9 +616,15 @@ xfs_iget_cache_hit(
 	 * wait_on_inode to wait for these flags to be cleared
 	 * instead of polling for it.
 	 */
-	if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM))
+	if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
 		goto out_skip;
 
+	/* Unlinked inodes cannot be re-grabbed. */
+	if (VFS_I(ip)->i_nlink == 0 && (ip->i_flags & XFS_NEED_INACTIVE)) {
+		error = -ENOENT;
+		goto out_error;
+	}
+
 	/*
 	 * Check the inode free state is valid. This also detects lookup
 	 * racing with unlinks.
@@ -588,11 +635,11 @@ xfs_iget_cache_hit(
 
 	/* Skip inodes that have no vfs state. */
 	if ((flags & XFS_IGET_INCORE) &&
-	    (ip->i_flags & XFS_IRECLAIMABLE))
+	    (ip->i_flags & (XFS_IRECLAIMABLE | XFS_NEED_INACTIVE)))
 		goto out_skip;
 
 	/* The inode fits the selection criteria; process it. */
-	if (ip->i_flags & XFS_IRECLAIMABLE) {
+	if (ip->i_flags & (XFS_IRECLAIMABLE | XFS_NEED_INACTIVE)) {
 		/* Drops i_flags_lock and RCU read lock. */
 		error = xfs_iget_recycle(pag, ip);
 		if (error)
@@ -889,22 +936,32 @@ xfs_dqrele_igrab(
 
 	/*
 	 * Skip inodes that are anywhere in the reclaim machinery because we
-	 * drop dquots before tagging an inode for reclamation.
+	 * drop dquots before tagging an inode for reclamation.  If the inode
+	 * is being inactivated, skip it because inactivation will drop the
+	 * dquots for us.
 	 */
-	if (ip->i_flags & (XFS_IRECLAIM | XFS_IRECLAIMABLE))
+	if (ip->i_flags & (XFS_IRECLAIM | XFS_IRECLAIMABLE | XFS_INACTIVATING))
 		goto out_unlock;
 
 	/*
-	 * The inode looks alive; try to grab a VFS reference so that it won't
-	 * get destroyed.  If we got the reference, return true to say that
-	 * we grabbed the inode.
+	 * If the inode is queued but not undergoing inactivation, set the
+	 * inactivating flag so everyone will leave it alone and return true
+	 * to say that we are taking ownership of it.
+	 *
+	 * Otherwise, the inode looks alive; try to grab a VFS reference so
+	 * that it won't get destroyed.  If we got the reference, return true
+	 * to say that we grabbed the inode.
 	 *
 	 * If we can't get the reference, then we know the inode had its VFS
 	 * state torn down and hasn't yet entered the reclaim machinery.  Since
 	 * we also know that dquots are detached from an inode before it enters
 	 * reclaim, we can skip the inode.
 	 */
-	ret = igrab(VFS_I(ip)) != NULL;
+	ret = true;
+	if (ip->i_flags & XFS_NEED_INACTIVE)
+		ip->i_flags |= XFS_INACTIVATING;
+	else if (!igrab(VFS_I(ip)))
+		ret = false;
 
 out_unlock:
 	spin_unlock(&ip->i_flags_lock);
@@ -917,6 +974,8 @@ xfs_dqrele_inode(
 	struct xfs_inode	*ip,
 	struct xfs_icwalk	*icw)
 {
+	bool			live_inode;
+
 	if (xfs_iflags_test(ip, XFS_INEW))
 		xfs_inew_wait(ip);
 
@@ -934,7 +993,19 @@ xfs_dqrele_inode(
 		ip->i_pdquot = NULL;
 	}
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	xfs_irele(ip);
+
+	/*
+	 * If we set INACTIVATING earlier to prevent this inode from being
+	 * touched, clear that state to let the inodegc claim it.  Otherwise,
+	 * it's a live inode and we need to release it.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	live_inode = !(ip->i_flags & XFS_INACTIVATING);
+	ip->i_flags &= ~XFS_INACTIVATING;
+	spin_unlock(&ip->i_flags_lock);
+
+	if (live_inode)
+		xfs_irele(ip);
 }
 
 /*
@@ -1043,6 +1114,7 @@ xfs_reclaim_inode(
 	xfs_iflags_clear(ip, XFS_IFLUSHING);
 reclaim:
+	trace_xfs_inode_reclaiming(ip);
 
 	/*
 	 * Because we use RCU freeing we need to ensure the inode always appears
@@ -1520,6 +1592,8 @@ xfs_blockgc_start(
 
 /* Don't try to run block gc on an inode that's in any of these states. */
 #define XFS_BLOCKGC_NOGRAB_IFLAGS	(XFS_INEW | \
+					 XFS_NEED_INACTIVE | \
+					 XFS_INACTIVATING | \
					 XFS_IRECLAIMABLE | \
					 XFS_IRECLAIM)
 /*
@@ -1680,6 +1754,203 @@ xfs_blockgc_free_quota(
 			xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
 }
 
+/*
+ * Inode Inactivation and Reclaimation
+ * ===================================
+ *
+ * Sometimes, inodes need to have work done on them once the last program has
+ * closed the file.  Typically this means cleaning out any leftover speculative
+ * preallocations after EOF or in the CoW fork.  For inodes that have been
+ * totally unlinked, this means unmapping data/attr/cow blocks, removing the
+ * inode from the unlinked buckets, and marking it free in the inobt and inode
+ * table.
+ *
+ * This process can generate many metadata updates, which shows up as close()
+ * and unlink() calls that take a long time.  We defer all that work to a
+ * workqueue which means that we can batch a lot of work and do it in inode
+ * order for better performance.  Furthermore, we can control the workqueue,
+ * which means that we can avoid doing inactivation work at a bad time, such as
+ * when the fs is frozen.
+ *
+ * Deferred inactivation introduces new inode flag states (NEED_INACTIVE and
+ * INACTIVATING) and adds a new INODEGC radix tree tag for fast access.  We
+ * maintain separate perag counters for both types, and move counts as inodes
+ * wander the state machine, which now works as follows:
+ *
+ * If the inode needs inactivation, we:
+ *   - Set the NEED_INACTIVE inode flag
+ *   - Schedule background inode inactivation
+ *
+ * If the inode does not need inactivation, we:
+ *   - Set the IRECLAIMABLE inode flag
+ *   - Schedule background inode reclamation
+ *
+ * When it is time to inactivate the inode, we:
+ *   - Set the INACTIVATING inode flag
+ *   - Make all the on-disk updates
+ *   - Clear the inactive state and set the IRECLAIMABLE inode flag
+ *   - Schedule background inode reclamation
+ *
+ * When it is time to reclaim the inode, we:
+ *   - Set the IRECLAIM inode flag
+ *   - Reclaim the inode and RCU free it
+ *
+ * When these state transitions occur, the caller must have taken the per-AG
+ * incore inode tree lock and then the inode i_flags lock, in that order.
+ */
+
+/*
+ * Decide if the given @ip is eligible for inactivation, and grab it if so.
+ * Returns true if it's ready to go or false if we should just ignore it.
+ *
+ * Skip inodes that don't need inactivation or are being inactivated (or
+ * recycled) by another thread.  Inodes should not be tagged for inactivation
+ * while also in INEW or any reclaim state.
+ *
+ * Otherwise, mark this inode as being inactivated even if the fs is shut down
+ * because we need xfs_inodegc_inactivate to push this inode into the reclaim
+ * state.
+ */
+static bool
+xfs_inodegc_igrab(
+	struct xfs_inode	*ip)
+{
+	bool			ret = false;
+
+	ASSERT(rcu_read_lock_held());
+
+	/* Check for stale RCU freed inode */
+	spin_lock(&ip->i_flags_lock);
+	if (!ip->i_ino)
+		goto out_unlock_noent;
+
+	if ((ip->i_flags & XFS_NEED_INACTIVE) &&
+	    !(ip->i_flags & XFS_INACTIVATING)) {
+		ret = true;
+		ip->i_flags |= XFS_INACTIVATING;
+	}
+
+out_unlock_noent:
+	spin_unlock(&ip->i_flags_lock);
+	return ret;
+}
+
+/*
+ * Free all speculative preallocations and possibly even the inode itself.
+ * This is the last chance to make changes to an otherwise unreferenced file
+ * before incore reclamation happens.
+ */
+static void
+xfs_inodegc_inactivate(
+	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
+	struct xfs_icwalk	*icw)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+
+	/*
+	 * Inactivation isn't supposed to run when the fs is frozen because
+	 * we don't want kernel threads to block on transaction allocation.
+	 */
+	ASSERT(mp->m_super->s_writers.frozen < SB_FREEZE_FS);
+
+	/*
+	 * Foreground threads that have hit ENOSPC or EDQUOT are allowed to
+	 * pass in a icw structure to look for inodes to inactivate
+	 * immediately to free some resources.  If this inode isn't a match,
+	 * put it back on the shelf and move on.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!xfs_icwalk_match(ip, icw)) {
+		ip->i_flags &= ~XFS_INACTIVATING;
+		spin_unlock(&ip->i_flags_lock);
+		return;
+	}
+	spin_unlock(&ip->i_flags_lock);
+
+	trace_xfs_inode_inactivating(ip);
+
+	xfs_inactive(ip);
+
+	if (!XFS_FORCED_SHUTDOWN(mp) && ip->i_delayed_blks) {
+		xfs_check_delalloc(ip, XFS_DATA_FORK);
+		xfs_check_delalloc(ip, XFS_COW_FORK);
+		ASSERT(0);
+	}
+
+	/* Schedule the inactivated inode for reclaim. */
+	spin_lock(&pag->pag_ici_lock);
+	spin_lock(&ip->i_flags_lock);
+
+	trace_xfs_inode_set_reclaimable(ip);
+	ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
+	ip->i_flags |= XFS_IRECLAIMABLE;
+
+	xfs_perag_clear_inode_tag(pag, agino, XFS_ICI_INODEGC_TAG);
+	xfs_perag_set_inode_tag(pag, agino, XFS_ICI_RECLAIM_TAG);
+
+	spin_unlock(&ip->i_flags_lock);
+	spin_unlock(&pag->pag_ici_lock);
+}
+
+/* Inactivate inodes until we run out. */
+void
+xfs_inodegc_worker(
+	struct work_struct	*work)
+{
+	struct xfs_mount	*mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_inodegc_work);
+
+	/*
+	 * Inactivation never returns error codes and never fails to push a
+	 * tagged inode to reclaim.  Loop until there there's nothing left.
+	 */
+	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) {
+		trace_xfs_inodegc_worker(mp, 0, _RET_IP_);
+		xfs_icwalk(mp, XFS_ICWALK_INODEGC, NULL);
+	}
+}
+
+/*
+ * Force all currently queued inode inactivation work to run immediately, and
+ * wait for the work to finish.
+ */
+void
+xfs_inodegc_flush(
+	struct xfs_mount	*mp)
+{
+	trace_xfs_inodegc_flush(mp, 0, _RET_IP_);
+	flush_delayed_work(&mp->m_inodegc_work);
+}
+
+/* Disable the inode inactivation background worker and wait for it to stop. */
+void
+xfs_inodegc_stop(
+	struct xfs_mount	*mp)
+{
+	if (!test_and_clear_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags))
+		return;
+
+	cancel_delayed_work_sync(&mp->m_inodegc_work);
+	trace_xfs_inodegc_stop(mp, 0, _RET_IP_);
+}
+
+/*
+ * Enable the inode inactivation background worker and schedule deferred inode
+ * inactivation work if there is any.
+ */
+void
+xfs_inodegc_start(
+	struct xfs_mount	*mp)
+{
+	if (test_and_set_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags))
+		return;
+
+	trace_xfs_inodegc_start(mp, 0, _RET_IP_);
+	xfs_inodegc_queue(mp);
+}
+
 /* XFS Inode Cache Walking Code */
 
 /*
@@ -1708,6 +1979,8 @@ xfs_icwalk_igrab(
 		return xfs_blockgc_igrab(ip);
 	case XFS_ICWALK_RECLAIM:
 		return xfs_reclaim_igrab(ip, icw);
+	case XFS_ICWALK_INODEGC:
+		return xfs_inodegc_igrab(ip);
 	default:
 		return false;
 	}
@@ -1736,6 +2009,9 @@ xfs_icwalk_process_inode(
 	case XFS_ICWALK_RECLAIM:
 		xfs_reclaim_inode(ip, pag);
 		break;
+	case XFS_ICWALK_INODEGC:
+		xfs_inodegc_inactivate(ip, pag, icw);
+		break;
 	}
 	return error;
 }
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 00dc98a92835..840eac06a71b 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -80,4 +80,9 @@ int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
 void xfs_blockgc_stop(struct xfs_mount *mp);
 void xfs_blockgc_start(struct xfs_mount *mp);
 
+void xfs_inodegc_worker(struct work_struct *work);
+void xfs_inodegc_flush(struct xfs_mount *mp);
+void xfs_inodegc_stop(struct xfs_mount *mp);
+void xfs_inodegc_start(struct xfs_mount *mp);
+
 #endif
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e3137bbc7b14..fa5be0d071ad 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -240,6 +240,7 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
 #define __XFS_IPINNED_BIT	8	 /* wakeup key for zero pin count */
 #define XFS_IPINNED		(1 << __XFS_IPINNED_BIT)
 #define XFS_IEOFBLOCKS		(1 << 9) /* has the preallocblocks tag set */
+#define XFS_NEED_INACTIVE	(1 << 10) /* see XFS_INACTIVATING below */
 /*
  * If this unlinked inode is in the middle of recovery, don't let drop_inode
  * truncate and free the inode.  This can happen if we iget the inode during
@@ -248,6 +249,21 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
 #define XFS_IRECOVERY		(1 << 11)
 #define XFS_ICOWBLOCKS		(1 << 12)/* has the cowblocks tag set */
 
+/*
+ * If we need to update on-disk metadata before this IRECLAIMABLE inode can be
+ * freed, then NEED_INACTIVE will be set.  Once we start the updates, the
+ * INACTIVATING bit will be set to keep iget away from this inode.  After the
+ * inactivation completes, both flags will be cleared and the inode is a
+ * plain old IRECLAIMABLE inode.
+ */
+#define XFS_INACTIVATING	(1 << 13)
+
+/* All inode state flags related to inode reclaim. */
+#define XFS_ALL_IRECLAIM_FLAGS	(XFS_IRECLAIMABLE | \
+				 XFS_IRECLAIM | \
+				 XFS_NEED_INACTIVE | \
+				 XFS_INACTIVATING)
+
 /*
  * Per-lifetime flags need to be reset when re-using a reclaimable inode during
  * inode lookup.  This prevents unintended behaviour on the new inode from
@@ -255,7 +271,8 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
 */
 #define XFS_IRECLAIM_RESET_FLAGS	\
 	(XFS_IRECLAIMABLE | XFS_IRECLAIM | \
-	 XFS_IDIRTY_RELEASE | XFS_ITRUNCATED)
+	 XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \
+	 XFS_INACTIVATING)
 
 /*
  * Flags for inode locking.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1227503d2246..9d8fc85bd28d 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2784,6 +2784,13 @@ xlog_recover_process_iunlinks(
 		}
 		xfs_buf_rele(agibp);
 	}
+
+	/*
+	 * Flush the pending unlinked inodes to ensure that the inactivations
+	 * are fully completed on disk and the incore inodes can be reclaimed
+	 * before we signal that recovery is complete.
+	 */
+	xfs_inodegc_flush(mp);
 }
 
 STATIC void
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c3a96fb3ad80..ab65a14e51e6 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -514,7 +514,8 @@ xfs_check_summary_counts(
  * Flush and reclaim dirty inodes in preparation for unmount. Inodes and
  * internal inode structures can be sitting in the CIL and AIL at this point,
  * so we need to unpin them, write them back and/or reclaim them before unmount
- * can proceed.
+ * can proceed.  In other words, callers are required to have inactivated all
+ * inodes.
  *
  * An inode cluster that has been freed can have its buffer still pinned in
  * memory because the transaction is still sitting in a iclog. The stale inodes
@@ -546,6 +547,7 @@ xfs_unmount_flush_inodes(
 	mp->m_flags |= XFS_MOUNT_UNMOUNTING;
 
 	xfs_ail_push_all_sync(mp->m_ail);
+	xfs_inodegc_stop(mp);
 	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	xfs_reclaim_inodes(mp);
 	xfs_health_unmount(mp);
@@ -782,6 +784,9 @@ xfs_mountfs(
 	if (error)
 		goto out_log_dealloc;
 
+	/* Enable background inode inactivation workers. */
+	xfs_inodegc_start(mp);
+
 	/*
 	 * Get and sanity-check the root inode.
 	 * Save the pointer to it in the mount structure.
@@ -936,6 +941,15 @@ xfs_mountfs(
 	xfs_irele(rip);
 	/* Clean out dquots that might be in memory after quotacheck. */
 	xfs_qm_unmount(mp);
+
+	/*
+	 * Inactivate all inodes that might still be in memory after a log
+	 * intent recovery failure so that reclaim can free them.  Metadata
+	 * inodes and the root directory shouldn't need inactivation, but the
+	 * mount failed for some reason, so pull down all the state and flee.
+	 */
+	xfs_inodegc_flush(mp);
+
 	/*
 	 * Flush all inode reclamation work and flush the log.
 	 * We have to do this /after/ rtunmount and qm_unmount because those
@@ -983,6 +997,16 @@ xfs_unmountfs(
 	uint64_t		resblks;
 	int			error;
 
+	/*
+	 * Perform all on-disk metadata updates required to inactivate inodes
+	 * that the VFS evicted earlier in the unmount process.  Freeing inodes
+	 * and discarding CoW fork preallocations can cause shape changes to
+	 * the free inode and refcount btrees, respectively, so we must finish
+	 * this before we discard the metadata space reservations.  Metadata
+	 * inodes and the root directory do not require inactivation.
+	 */
+	xfs_inodegc_flush(mp);
+
 	xfs_blockgc_stop(mp);
 	xfs_fs_unreserve_ag_blocks(mp);
 	xfs_qm_unmount_quotas(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index c78b63fe779a..dc906b78e24c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -154,6 +154,13 @@ typedef struct xfs_mount {
 	uint8_t			m_rt_checked;
 	uint8_t			m_rt_sick;
 
+	/*
+	 * This atomic bitset controls flags that alter the behavior of the
+	 * filesystem.  Use only the atomic bit helper functions here; see
+	 * XFS_OPFLAG_* for information about the actual flags.
+	 */
+	unsigned long		m_opflags;
+
 	/*
	 * End of read-mostly variables. Frequently written variables and locks
	 * should be placed below this comment from now on.  The first variable
@@ -184,6 +191,7 @@ typedef struct xfs_mount {
 	uint64_t		m_resblks_avail;/* available reserved blocks */
 	uint64_t		m_resblks_save; /* reserved blks @ remount,ro */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
+	struct delayed_work	m_inodegc_work; /* background inode inactive */
 	struct xfs_kobj		m_kobj;
 	struct xfs_kobj		m_error_kobj;
 	struct xfs_kobj		m_error_meta_kobj;
@@ -258,6 +266,19 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_DAX_ALWAYS	(1ULL << 26)
 #define XFS_MOUNT_DAX_NEVER	(1ULL << 27)
 
+/*
+ * Operation flags -- each entry here is a bit index into m_opflags and is
+ * not itself a flag value.  Use the atomic bit functions to access.
+ */
+enum xfs_opflag_bits {
+	/*
+	 * If set, background inactivation worker threads will be scheduled to
+	 * process queued inodegc work.  If not, queued inodes remain in memory
+	 * waiting to be processed.
+	 */
+	XFS_OPFLAG_INODEGC_RUNNING_BIT	= 0,
+};
+
 /*
  * Max and min values for mount-option defined I/O
  * preallocation sizes.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index dd1ee333dcb3..0b01d9499395 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -714,6 +714,8 @@ xfs_fs_sync_fs(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
+	trace_xfs_fs_sync_fs(mp, wait, _RET_IP_);
+
 	/*
 	 * Doing anything during the async pass would be counterproductive.
 	 */
@@ -730,6 +732,25 @@ xfs_fs_sync_fs(
 		flush_delayed_work(&mp->m_log->l_work);
 	}
 
+	/*
+	 * Flush all deferred inode inactivation work so that the free space
+	 * counters will reflect recent deletions.  Do not force the log again
+	 * because log recovery can restart the inactivation from the info that
+	 * we just wrote into the ondisk log.
+	 *
+	 * For regular operation this isn't strictly necessary since we aren't
+	 * required to guarantee that unlinking frees space immediately, but
+	 * that is how XFS historically behaved.
+	 *
+	 * If, however, the filesystem is at FREEZE_PAGEFAULTS, this is our
+	 * last chance to complete the inactivation work before the filesystem
+	 * freezes and the log is quiesced.  The background worker will not
+	 * activate again until the fs is thawed because the VFS won't evict
+	 * any more inodes until freeze_super drops s_umount and we disable the
+	 * worker in xfs_fs_freeze.
+	 */
+	xfs_inodegc_flush(mp);
+
 	return 0;
 }
 
@@ -844,6 +865,17 @@ xfs_fs_freeze(
 	 */
 	flags = memalloc_nofs_save();
 	xfs_blockgc_stop(mp);
+
+	/*
+	 * Stop the inodegc background worker.  freeze_super already flushed
+	 * all pending inodegc work when it sync'd the filesystem after setting
+	 * SB_FREEZE_PAGEFAULTS, and it holds s_umount, so we know that inodes
+	 * cannot enter xfs_fs_destroy_inode until the freeze is complete.
+	 * If the filesystem is read-write, inactivated inodes will queue but
+	 * the worker will not run until the filesystem thaws or unmounts.
+	 */
+	xfs_inodegc_stop(mp);
+
 	xfs_save_resvblks(mp);
 	ret = xfs_log_quiesce(mp);
 	memalloc_nofs_restore(flags);
@@ -859,6 +891,14 @@ xfs_fs_unfreeze(
 	xfs_restore_resvblks(mp);
 	xfs_log_work_queue(mp);
 	xfs_blockgc_start(mp);
+
+	/*
+	 * Don't reactivate the inodegc worker on a readonly filesystem because
+	 * inodes are sent directly to reclaim.
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_RDONLY))
+		xfs_inodegc_start(mp);
+
 	return 0;
 }
 
@@ -1665,6 +1705,9 @@ xfs_remount_rw(
 	if (error && error != -ENOSPC)
 		return error;
 
+	/* Re-enable the background inode inactivation worker. */
+	xfs_inodegc_start(mp);
+
 	return 0;
 }
 
@@ -1687,6 +1730,15 @@ xfs_remount_ro(
 		return error;
 	}
 
xfs_fs_reconfigure already + * flushed all pending inodegc work when it sync'd the filesystem. + * The VFS holds s_umount, so we know that inodes cannot enter + * xfs_fs_destroy_inode during a remount operation. In readonly mode + * we send inodes straight to reclaim, so no inodes will be queued. + */ + xfs_inodegc_stop(mp); + /* Free the per-AG metadata reservation pool. */ error = xfs_fs_unreserve_ag_blocks(mp); if (error) { @@ -1810,6 +1862,7 @@ static int xfs_init_fs_context( mutex_init(&mp->m_growlock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + INIT_DELAYED_WORK(&mp->m_inodegc_work, xfs_inodegc_worker); mp->m_kobj.kobject.kset = xfs_kset; /* * We don't create the finobt per-ag space reservation until after log diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d0b4799ad1e6..ca9bfbd28886 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -156,6 +156,45 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag); +DECLARE_EVENT_CLASS(xfs_fs_class, + TP_PROTO(struct xfs_mount *mp, int data, unsigned long caller_ip), + TP_ARGS(mp, data, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long long, mflags) + __field(unsigned long, opflags) + __field(unsigned long, sbflags) + __field(int, data) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->mflags = mp->m_flags; + __entry->opflags = mp->m_opflags; + __entry->sbflags = mp->m_super->s_flags; + __entry->data = data; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d flags 0x%llx opflags 0x%lx sflags 0x%lx data %d caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->mflags, + __entry->opflags, + __entry->sbflags, + __entry->data, + (char *)__entry->caller_ip) +); + +#define DEFINE_FS_EVENT(name) \ +DEFINE_EVENT(xfs_fs_class, name, \ + TP_PROTO(struct xfs_mount *mp, int data, unsigned long caller_ip), \ + TP_ARGS(mp, data, caller_ip)) +DEFINE_FS_EVENT(xfs_inodegc_flush); +DEFINE_FS_EVENT(xfs_inodegc_start); +DEFINE_FS_EVENT(xfs_inodegc_stop); +DEFINE_FS_EVENT(xfs_inodegc_queue); +DEFINE_FS_EVENT(xfs_inodegc_worker); +DEFINE_FS_EVENT(xfs_fs_sync_fs); + DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), TP_ARGS(mp, agno), @@ -615,14 +654,17 @@ DECLARE_EVENT_CLASS(xfs_inode_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) + __field(unsigned long, iflags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; + __entry->iflags = ip->i_flags; ), - TP_printk("dev %d:%d ino 0x%llx", + TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino) + __entry->ino, + __entry->iflags) ) #define DEFINE_INODE_EVENT(name) \ @@ -666,6 +708,10 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); +DEFINE_INODE_EVENT(xfs_inode_set_reclaimable); +DEFINE_INODE_EVENT(xfs_inode_reclaiming); +DEFINE_INODE_EVENT(xfs_inode_set_need_inactive); +DEFINE_INODE_EVENT(xfs_inode_inactivating); /* * ftrace's __print_symbolic requires that all enum values be wrapped in the From patchwork Sun Jun 13 17:20:35 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit 
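An aside on the on/off gating that the freeze, unfreeze, remount, and unmount paths above all share: xfs_inodegc_start() and xfs_inodegc_stop() are idempotent because they are built on atomic test-and-set and test-and-clear of XFS_OPFLAG_INODEGC_RUNNING_BIT in m_opflags, and the queueing path checks that bit before scheduling any work. Below is a minimal userspace model of that protocol, not kernel code: the struct and function names are hypothetical stand-ins, and C11 atomics approximate the kernel's test_and_set_bit()/test_and_clear_bit().

#include <stdatomic.h>
#include <stdio.h>

#define OPFLAG_INODEGC_RUNNING	(1u << 0)	/* models XFS_OPFLAG_INODEGC_RUNNING_BIT */

struct mount {
	atomic_uint	opflags;		/* models mp->m_opflags */
};

/* Idempotent enable: only the stopped-to-running transition queues work. */
static void inodegc_start(struct mount *mp)
{
	if (atomic_fetch_or(&mp->opflags, OPFLAG_INODEGC_RUNNING) &
	    OPFLAG_INODEGC_RUNNING)
		return;		/* already running */
	printf("queue background inactivation work\n");
}

/* Idempotent disable: only the running-to-stopped transition cancels work. */
static void inodegc_stop(struct mount *mp)
{
	if (!(atomic_fetch_and(&mp->opflags, ~OPFLAG_INODEGC_RUNNING) &
	      OPFLAG_INODEGC_RUNNING))
		return;		/* already stopped */
	printf("cancel work and wait for the worker to exit\n");
}

/* Queuers observe the flag, so a frozen or unmounting fs schedules nothing. */
static void inodegc_queue(struct mount *mp)
{
	if (!(atomic_load(&mp->opflags) & OPFLAG_INODEGC_RUNNING))
		return;
	printf("schedule delayed inactivation work\n");
}

int main(void)
{
	struct mount m = { .opflags = 0 };

	inodegc_queue(&m);	/* dropped: worker not enabled yet */
	inodegc_start(&m);	/* 0 -> 1: kicks the worker */
	inodegc_start(&m);	/* no-op */
	inodegc_queue(&m);	/* schedules */
	inodegc_stop(&m);	/* 1 -> 0: cancels and waits */
	inodegc_stop(&m);	/* no-op */
	return 0;
}

Because only the 0-to-1 transition queues work and only the 1-to-0 transition cancels it, redundant start or stop calls from different callers (freeze, remount, scrub) are harmless, and a producer that observes the bit clear simply leaves inodes queued in memory until the next start.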
X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 12317813 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id C9EA0C48BCF for ; Sun, 13 Jun 2021 17:20:38 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id A73BA61284 for ; Sun, 13 Jun 2021 17:20:38 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232013AbhFMRWj (ORCPT ); Sun, 13 Jun 2021 13:22:39 -0400 Received: from mail.kernel.org ([198.145.29.99]:41366 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231915AbhFMRWh (ORCPT ); Sun, 13 Jun 2021 13:22:37 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id AD7AB61107; Sun, 13 Jun 2021 17:20:35 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604835; bh=g3CBACY5w7E1y1TkTQ46shW8kvRHHuONN/3os9dIIpA=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=Q+q9SGLwYjK6Vr9vc8SlkeCSYDS3detuWXrfVUETExHDwu08Rr/Dq20Me5+12+xdV ywRA3288JQxmfDgFN45QDyuckWEhotsCCCwPPvjfYtUbdl3QNFCY96y2ObOfAO1nXA RR5yh3RGpb8ajvtQnzZqUXdgMWD94pxdAj8RruM3NEByUjbjqAV/jBNbT8xbv8RAr9 2TTMMNLA8d9eoHBQmxDZDPhVQHK89k56esxhrZAkoGUgQmG7AM4B9aq2BHhb9QkYYR V0vbUnQV+mA/1apUXriaWEPjeTBMG59GgxWwfgzMGgxnbzy39ZlKmBYV+cGBgGBzdv TarvyVEVw8X1w== Subject: [PATCH 07/16] xfs: drop dead dquots before scheduling inode for inactivation From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:20:35 -0700 Message-ID: <162360483541.1530792.3129912273735341414.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Since we now defer inode inactivation to a background workqueue, there can be a considerable delay between the point in time when an inode moves into NEED_INACTIVE state (and hence quotaoff cannot reach it) and when xfs_inactive() actually finishes the inode. To avoid delaying quotaoff any more than necessary, drop dead dquots as soon as we know that we're going to inactivate the inode. Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 6 ++++++ fs/xfs/xfs_qm.c | 28 ++++++++++++++++++++++++++++ fs/xfs/xfs_quota.h | 2 ++ 3 files changed, 36 insertions(+) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e094c16aa8c5..17c4cd91ea15 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -376,6 +376,12 @@ xfs_inode_mark_reclaimable( xfs_check_delalloc(ip, XFS_COW_FORK); ASSERT(0); } + } else { + /* + * Drop dquots for disabled quota types to avoid delaying + * quotaoff while we wait for inactivation to occur. 
+ */ + xfs_qm_prepare_inactive(ip); } XFS_STATS_INC(mp, vn_reclaim); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index fe341f3fd419..b193a84e47c2 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -408,6 +408,34 @@ xfs_qm_dqdetach( } } +/* + * If a quota type is turned off but we still have a dquot attached to the + * inode, detach it before tagging this inode for inactivation (or reclaim) to + * avoid delaying quotaoff for longer than is necessary. Because the inode has + * no VFS state and has not yet been tagged for reclaim or inactivation, it is + * safe to drop the dquots locklessly because iget, quotaoff, blockgc, and + * reclaim will not touch the inode. + */ +void +xfs_qm_prepare_inactive( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!XFS_IS_UQUOTA_ON(mp)) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if (!XFS_IS_GQUOTA_ON(mp)) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } + if (!XFS_IS_PQUOTA_ON(mp)) { + xfs_qm_dqrele(ip->i_pdquot); + ip->i_pdquot = NULL; + } +} + struct xfs_qm_isolate { struct list_head buffers; struct list_head dispose; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index d00d01302545..75d8b7bc0e25 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -101,6 +101,7 @@ extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, extern int xfs_qm_dqattach(struct xfs_inode *); extern int xfs_qm_dqattach_locked(struct xfs_inode *ip, bool doalloc); extern void xfs_qm_dqdetach(struct xfs_inode *); +void xfs_qm_prepare_inactive(struct xfs_inode *ip); extern void xfs_qm_dqrele(struct xfs_dquot *); extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); @@ -162,6 +163,7 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #define xfs_qm_dqattach(ip) (0) #define xfs_qm_dqattach_locked(ip, fl) (0) #define xfs_qm_dqdetach(ip) +#define xfs_qm_prepare_inactive(ip) ((void)0) #define xfs_qm_dqrele(d) do { (d) = (d); } while(0) #define xfs_qm_statvfs(ip, s) do { } while(0) #define xfs_qm_newmount(mp, a, b) (0) From patchwork Sun Jun 13 17:20:40 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 12317815 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id F1284C48BCF for ; Sun, 13 Jun 2021 17:20:43 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id CD04D61107 for ; Sun, 13 Jun 2021 17:20:43 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231915AbhFMRWo (ORCPT ); Sun, 13 Jun 2021 13:22:44 -0400 Received: from mail.kernel.org ([198.145.29.99]:41392 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232014AbhFMRWm (ORCPT ); Sun, 13 Jun 2021 13:22:42 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id 3443861078; Sun, 13 Jun 2021 17:20:41 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604841; bh=pR1ASSTOo7jCaXPb9aaB7T6MY/c5B8OQgR+xGD2NjYQ=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=REooMlE01jn3IFh7eVUx328Wm8Kwen0KtknQq9QUg3Uy6HqijS0PXw/k9cFcQWnbg 9JqXi9cFBGeFnnbqe0HZk0oInz6PCGzmGxnHV6dHXUqektka1/wuFredcSWudcewOZ idWGBmpJVvsYHJrmaJuwDHy1z8n6q6vNByWH7TJ1HkgpPxtsOqEqHw3SvDegli40Ac GswGNaqJCCooTx5isCGpu5XL8J+LB7qi5JNw6cMFCi61nwfopeg49R8h/jr0XHBpc5 bdqOTxDP/+1w2vOlkr+R1yaCMRylN5FKUh6QonPHcbN7DjjjQM/Nmzzan8SPFelAcI 4wlJsP7lZASng== Subject: [PATCH 08/16] xfs: expose sysfs knob to control inode inactivation delay From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:20:40 -0700 Message-ID: <162360484090.1530792.16682068626300537360.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Allow administrators to control the length that we defer inode inactivation. By default we'll set the delay to 100ms, as an arbitrary choice between allowing for some batching of a deltree operation, and not letting too many inodes pile up in memory. Signed-off-by: Darrick J. Wong --- Documentation/admin-guide/xfs.rst | 7 +++++++ fs/xfs/xfs_globals.c | 5 +++++ fs/xfs/xfs_icache.c | 5 +++-- fs/xfs/xfs_linux.h | 1 + fs/xfs/xfs_sysctl.c | 9 +++++++++ fs/xfs/xfs_sysctl.h | 1 + 6 files changed, 26 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index f9b109bfc6a6..f095cfe7137f 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -277,6 +277,13 @@ The following sysctls are available for the XFS filesystem: references and returns timed-out AGs back to the free stream pool. + fs.xfs.inode_gc_delay_ms + (Units: milliseconds Min: 0 Default: 100 Max: 3600000) + The amount of time to delay cleanup work that happens after a file is + closed by all programs. This involves clearing speculative + preallocations from linked files and freeing unlinked files. A higher + value here increases batching at a risk of background work storms. 
+ fs.xfs.speculative_prealloc_lifetime (Units: seconds Min: 1 Default: 300 Max: 86400) The interval at which the background scanning for inodes diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f62fa652c2fd..64674f424ff8 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -28,6 +28,11 @@ xfs_param_t xfs_params = { .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, + + /* Values below here are measured in milliseconds */ + .inodegc_ms = { 0, 100, 3600*1000}, + + /* Values below here are measured in seconds */ .blockgc_timer = { 1, 300, 3600*24}, }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 17c4cd91ea15..ddf43a60a55c 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -245,8 +245,9 @@ xfs_inodegc_queue( rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) { - trace_xfs_inodegc_queue(mp, 0, _RET_IP_); - queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, 0); + trace_xfs_inodegc_queue(mp, xfs_inodegc_ms, _RET_IP_); + queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, + msecs_to_jiffies(xfs_inodegc_ms)); } rcu_read_unlock(); } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 7688663b9773..e762762256e4 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -99,6 +99,7 @@ typedef __u32 xfs_nlink_t; #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val #define xfs_blockgc_secs xfs_params.blockgc_timer.val +#define xfs_inodegc_ms xfs_params.inodegc_ms.val #define current_cpu() (raw_smp_processor_id()) #define current_set_flags_nested(sp, f) \ diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 546a6cd96729..6495887f4f00 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -176,6 +176,15 @@ static struct ctl_table xfs_table[] = { .extra1 = &xfs_params.fstrm_timer.min, .extra2 = &xfs_params.fstrm_timer.max, }, + { + .procname = "inode_gc_delay_ms", + .data = &xfs_params.inodegc_ms.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inodegc_ms.min, + .extra2 = &xfs_params.inodegc_ms.max + }, { .procname = "speculative_prealloc_lifetime", .data = &xfs_params.blockgc_timer.val, diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 7692e76ead33..9a867b379a1f 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -36,6 +36,7 @@ typedef struct xfs_param { xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ xfs_sysctl_val_t blockgc_timer; /* Interval between blockgc scans */ + xfs_sysctl_val_t inodegc_ms; /* Inode inactivation scan interval */ } xfs_param_t; /* From patchwork Sun Jun 13 17:20:46 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 12317817 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id E3B6BC48BDF for ; Sun, 13 Jun 2021 17:20:47 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id CCCB861284 for ; Sun, 13 Jun 2021 17:20:47 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232016AbhFMRWs (ORCPT ); Sun, 13 Jun 2021 13:22:48 -0400 Received: from mail.kernel.org ([198.145.29.99]:41418 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232014AbhFMRWs (ORCPT ); Sun, 13 Jun 2021 13:22:48 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id AC54561107; Sun, 13 Jun 2021 17:20:46 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604846; bh=BJogehxRorbaOlZZFDwUdra0JwVpUn672r2IriDHHUM=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=gh++hvcXBGcxKqcv/OWwuuY817vJ6QZ1IUfzaJWosR6Y97FuKX6Ky/f0NSSeapzue Bs6fSitB2L464JnhP4VinfWNz76/IFYzFQFGiMjVWNtoXNQVSC24tie7j+c6d6Qb+m n/N7IUWPRPwbd+lGUKG+JiDRpLQSlBaXI6Lr/+ZitNDDk1MbmmB+o6J9avf/zV8hDR T54+Ul2wYPGUdrIVBzfIbjASJo0/a5Dj+sTP2UIHA9n9ZnGfCdaNhFi3E1nkGx7HkY 8GjK7bQesxStQ/PmmQBH4DxYg7p/hz0jfNTHDno78hC2IVcsg2InJQD1kwTsjmsE2k ZOqJX8SLau9PQ== Subject: [PATCH 09/16] xfs: reduce inactivation delay when things are tight From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:20:46 -0700 Message-ID: <162360484641.1530792.6759006798754085532.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Now that we have made the inactivation of unlinked inodes a background task to increase the throughput of file deletions, we need to be a little more careful about how long of a delay we can tolerate. On a mostly empty filesystem, the risk of the allocator making poor decisions due to fragmentation of the free space on account a lengthy delay in background updates is minimal because there's plenty of space. However, if free space is tight, we want to deallocate unlinked inodes as quickly as possible to avoid fallocate ENOSPC and to give the allocator the best shot at optimal allocations for new writes. Furthermore, if we're near the quota limits, we want to run the background work as soon as possible to avoid going over the limits even temporarily. Therefore, use the same free space and quota thresholds that we use to limit preallocation to scale down the delay between an AG being tagged for needing inodgc work and the inodegc worker being executed. This follows the same principle that XFS becomes less aggressive about allocations (and more precise about accounting) when nearing full. Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_icache.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 157 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index ddf43a60a55c..97c2901017e4 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -215,6 +215,111 @@ xfs_reclaim_work_queue( rcu_read_unlock(); } +/* + * Scale down the background work delay if we're close to a quota limit. + * Similar to the way that we throttle preallocations, we halve the delay time + * for every low free space threshold that isn't met, and we zero it if we're + * over the hard limit. Return value is in ms. + */ +static inline unsigned int +xfs_worker_delay_dquot( + struct xfs_inode *ip, + xfs_dqtype_t type, + unsigned int delay_ms) +{ + struct xfs_dquot *dqp; + int64_t freesp; + unsigned int shift = 0; + + if (!ip) + goto out; + + /* + * Leave the delay untouched if there are no quota limits to enforce. + * These comparisons are done locklessly because at worst we schedule + * background work sooner than necessary. + */ + dqp = xfs_inode_dquot(ip, type); + if (!dqp || !xfs_dquot_is_enforced(dqp)) + goto out; + + /* no hi watermark, no throttle */ + if (!dqp->q_prealloc_hi_wmark) + goto out; + + /* under the lo watermark, no throttle */ + if (dqp->q_blk.reserved < dqp->q_prealloc_lo_wmark) + goto out; + + /* If we're over the hard limit, run immediately. */ + if (dqp->q_blk.reserved >= dqp->q_prealloc_hi_wmark) + return 0; + + /* Scale down the delay if we're close to the soft limits. */ + freesp = dqp->q_prealloc_hi_wmark - dqp->q_blk.reserved; + if (freesp < dqp->q_low_space[XFS_QLOWSP_5_PCNT]) { + shift = 2; + if (freesp < dqp->q_low_space[XFS_QLOWSP_3_PCNT]) + shift += 2; + if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT]) + shift += 2; + } + + delay_ms >>= shift; +out: + return delay_ms; +} + +/* + * Scale down the background work delay if we're low on free space. Similar to + * the way that we throttle preallocations, we halve the delay time for every + * low free space threshold that isn't met. Return value is in ms. + */ +static inline unsigned int +xfs_worker_delay_freesp( + struct xfs_mount *mp, + unsigned int delay_ms) +{ + int64_t freesp; + unsigned int shift = 0; + + freesp = percpu_counter_read_positive(&mp->m_fdblocks); + if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { + shift = 2; + if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) + shift++; + } + + return delay_ms >> shift; +} + +/* + * Compute the lag between scheduling and executing background work based on + * free space in the filesystem. If an inode is passed in, its dquots will + * be considered in the lag computation. Return value is in ms. + */ +static inline unsigned int +xfs_worker_delay_ms( + struct xfs_mount *mp, + struct xfs_inode *ip, + unsigned int default_ms) +{ + unsigned int udelay, gdelay, pdelay, fdelay; + + udelay = xfs_worker_delay_dquot(ip, XFS_DQTYPE_USER, default_ms); + gdelay = xfs_worker_delay_dquot(ip, XFS_DQTYPE_GROUP, default_ms); + pdelay = xfs_worker_delay_dquot(ip, XFS_DQTYPE_PROJ, default_ms); + fdelay = xfs_worker_delay_freesp(mp, default_ms); + + return min(min(udelay, gdelay), min(pdelay, fdelay)); +} + /* * Background scanning to trim preallocated space. This is queued based on the * 'speculative_prealloc_lifetime' tunable (5m by default). 
@@ -238,28 +343,63 @@ xfs_blockgc_queue( */ static void xfs_inodegc_queue( - struct xfs_mount *mp) + struct xfs_mount *mp, + struct xfs_inode *ip) { if (!test_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags)) return; rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) { - trace_xfs_inodegc_queue(mp, xfs_inodegc_ms, _RET_IP_); + unsigned int delay; + + delay = xfs_worker_delay_ms(mp, ip, xfs_inodegc_ms); + trace_xfs_inodegc_queue(mp, delay, _RET_IP_); queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, - msecs_to_jiffies(xfs_inodegc_ms)); + msecs_to_jiffies(delay)); } rcu_read_unlock(); } -/* Set a tag on both the AG incore inode tree and the AG radix tree. */ +/* + * Reschedule the background inactivation worker immediately if space is + * getting tight and the worker hasn't started running yet. + */ static void +xfs_inodegc_queue_sooner( + struct xfs_mount *mp, + struct xfs_inode *ip) +{ + if (!XFS_IS_QUOTA_ON(mp) || + !delayed_work_pending(&mp->m_inodegc_work) || + !test_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags)) + return; + + rcu_read_lock(); + if (!radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) + goto unlock; + + if (xfs_worker_delay_ms(mp, ip, xfs_inodegc_ms) == xfs_inodegc_ms) + goto unlock; + + trace_xfs_inodegc_queue(mp, 0, _RET_IP_); + queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, 0); +unlock: + rcu_read_unlock(); +} + +/* + * Set a tag on both the AG incore inode tree and the AG radix tree. + * Returns true if the tag was previously set on any item in the incore tree. + */ +static bool xfs_perag_set_inode_tag( struct xfs_perag *pag, - xfs_agino_t agino, + struct xfs_inode *ip, unsigned int tag) { struct xfs_mount *mp = pag->pag_mount; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); bool was_tagged; lockdep_assert_held(&pag->pag_ici_lock); @@ -271,7 +411,7 @@ xfs_perag_set_inode_tag( pag->pag_ici_reclaimable++; if (was_tagged) - return; + return true; /* propagate the tag up into the perag radix tree */ spin_lock(&mp->m_perag_lock); @@ -287,11 +427,12 @@ xfs_perag_set_inode_tag( xfs_blockgc_queue(pag); break; case XFS_ICI_INODEGC_TAG: - xfs_inodegc_queue(mp); + xfs_inodegc_queue(mp, ip); break; } trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); + return false; } /* Clear a tag on both the AG incore inode tree and the AG radix tree. */ @@ -367,6 +508,7 @@ xfs_inode_mark_reclaimable( struct xfs_perag *pag; unsigned int tag; bool need_inactive = xfs_inode_needs_inactive(ip); + bool already_queued; if (!need_inactive) { /* Going straight to reclaim, so drop the dquots. 
*/ @@ -413,10 +555,14 @@ xfs_inode_mark_reclaimable( tag = XFS_ICI_RECLAIM_TAG; } - xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), tag); + already_queued = xfs_perag_set_inode_tag(pag, ip, tag); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); + + if (need_inactive && already_queued) + xfs_inodegc_queue_sooner(mp, ip); + xfs_perag_put(pag); } @@ -1413,8 +1559,7 @@ xfs_blockgc_set_iflag( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), - XFS_ICI_BLOCKGC_TAG); + xfs_perag_set_inode_tag(pag, ip, XFS_ICI_BLOCKGC_TAG); spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); @@ -1895,7 +2040,7 @@ xfs_inodegc_inactivate( ip->i_flags |= XFS_IRECLAIMABLE; xfs_perag_clear_inode_tag(pag, agino, XFS_ICI_INODEGC_TAG); - xfs_perag_set_inode_tag(pag, agino, XFS_ICI_RECLAIM_TAG); + xfs_perag_set_inode_tag(pag, ip, XFS_ICI_RECLAIM_TAG); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); @@ -1955,7 +2100,7 @@ xfs_inodegc_start( return; trace_xfs_inodegc_start(mp, 0, _RET_IP_); - xfs_inodegc_queue(mp); + xfs_inodegc_queue(mp, NULL); } /* XFS Inode Cache Walking Code */ From patchwork Sun Jun 13 17:20:51 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 12317819 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 59934C48BDF for ; Sun, 13 Jun 2021 17:20:53 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 43CDF61107 for ; Sun, 13 Jun 2021 17:20:53 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231948AbhFMRWx (ORCPT ); Sun, 13 Jun 2021 13:22:53 -0400 Received: from mail.kernel.org ([198.145.29.99]:41444 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232017AbhFMRWx (ORCPT ); Sun, 13 Jun 2021 13:22:53 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id 30DFA61078; Sun, 13 Jun 2021 17:20:52 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604852; bh=QUb17QJbj6TLcvBF1SN55fvqhUW2SVn5xsuQq7o6bf4=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=GmZctl92bODtBA1V2P7H3D2SzyuMtT82jf3fR43/dn4Fkr+ydPZOE7Ei9kA2clHfz f0LMKFAJ8HnwdFQgk2JJcGk6JeFVZsw3jiXlOfIXWCTIQ53LHpPevUDfKiuW+taE63 hdPs1cLfLT9C3BYNiQgOrvguCRs0muysPP7ykF+QNVqpBs/bGSfDz3NIyO6/9pOzQq 2Lv5ZznpyZm8kXz8lwErqBeWi9EhBFoDbXvasi9d04DpnbxDnkz7jx5A0R/Qp68usV 45ae62PsI/JHPf/tLo9vZD9dz5HePBDtAo3RdTtdhEsAAD8C2L17NYxtBx3gjxu+nM 51OnlA5mNYThA== Subject: [PATCH 10/16] xfs: inactivate inodes any time we try to free speculative preallocations From: "Darrick J. 
Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:20:51 -0700 Message-ID: <162360485190.1530792.729777050167640805.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Other parts of XFS have learned to call xfs_blockgc_free_{space,quota} to try to free speculative preallocations when space is tight. This means that file writes, transaction reservation failures, quota limit enforcement, and the EOFBLOCKS ioctl all call this function to free space when things are tight. Since inode inactivation is now a background task, this means that the filesystem can be hanging on to unlinked but not yet freed space. Add this to the list of things that xfs_blockgc_free_* makes writer threads scan for when they cannot reserve space. Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 97c2901017e4..210a9e3cd19e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1828,16 +1828,23 @@ xfs_blockgc_worker( } /* - * Try to free space in the filesystem by purging eofblocks and cowblocks. + * Try to free space in the filesystem by purging inactive inodes, eofblocks + * and cowblocks. */ int xfs_blockgc_free_space( struct xfs_mount *mp, struct xfs_icwalk *icw) { + int error; + trace_xfs_blockgc_free_space(mp, icw, _RET_IP_); - return xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw); + error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw); + if (error) + return error; + + return xfs_icwalk(mp, XFS_ICWALK_INODEGC, icw); } /* From patchwork Sun Jun 13 17:20:57 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 12317821 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 11C9AC48BCF for ; Sun, 13 Jun 2021 17:20:59 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id E1D5D61078 for ; Sun, 13 Jun 2021 17:20:58 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232018AbhFMRW7 (ORCPT ); Sun, 13 Jun 2021 13:22:59 -0400 Received: from mail.kernel.org ([198.145.29.99]:41472 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232014AbhFMRW7 (ORCPT ); Sun, 13 Jun 2021 13:22:59 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id A934761107; Sun, 13 Jun 2021 17:20:57 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604857; bh=zFgBoTO21uNmTCMlEEzvIbDbrRdHu1ZBnRuFhgH0hqs=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=nhKzpp8Stts8nq4AzLiKf1FCyIBt95t3lJZydR90wjcscFIKW/1HU8Fs2urIa3eVY 1u+bhkP9CB0vs3OKP4j8t6BH3uyQ1SaOn74FIe1BvgUYa1ZK7ervfaa0KAbiycqXN3 /6Xnrbn/ujwqRjJ21RseB4Faar0iqi7JHSkVQjcoR64gjgjYmPSi82O3dvi3I0tXPB swGA++xiUcCc9Kv4CKi6D9oIj8GQFgtWjDU2d1Qxsi6oxGoNWqmIAp6W2ShaS2A1HA 5y0/OeFgZB4i7AayJM2K+JJlgM7X2E2cKHkdXSaSZIABgDEMnXp4hSc2idBupSQqWM tGVJvRvWpub6A== Subject: [PATCH 11/16] xfs: flush inode inactivation work when compiling usage statistics From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:20:57 -0700 Message-ID: <162360485739.1530792.14898030232791253243.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Users have come to expect that the space accounting information in statfs and getquota reports are fairly accurate. Now that we inactivate inodes from a background queue, these numbers can be thrown off by whatever resources are singly-owned by the inodes in the queue. Flush the pending inactivations when userspace asks for a space usage report. Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_qm_syscalls.c | 8 ++++++++ fs/xfs/xfs_super.c | 3 +++ 2 files changed, 11 insertions(+) diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 13a56e1ea15c..e203489cd212 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -698,6 +698,10 @@ xfs_qm_scall_getquota( struct xfs_dquot *dqp; int error; + /* Flush inodegc work at the start of a quota reporting scan. */ + if (id == 0) + xfs_inodegc_flush(mp); + /* * Try to get the dquot. We don't want it allocated on disk, so don't * set doalloc. If it doesn't exist, we'll get ENOENT back. @@ -736,6 +740,10 @@ xfs_qm_scall_getquota_next( struct xfs_dquot *dqp; int error; + /* Flush inodegc work at the start of a quota reporting scan. 
*/ + if (*id == 0) + xfs_inodegc_flush(mp); + error = xfs_qm_dqget_next(mp, *id, type, &dqp); if (error) return error; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0b01d9499395..45ef63b5b2f0 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -769,6 +769,9 @@ xfs_fs_statfs( xfs_extlen_t lsize; int64_t ffree; + /* Wait for whatever inactivations are in progress. */ + xfs_inodegc_flush(mp); + statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; From patchwork Sun Jun 13 17:21:02 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 12317823 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 15E43C48BDF for ; Sun, 13 Jun 2021 17:21:05 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id EFE1E61284 for ; Sun, 13 Jun 2021 17:21:04 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232019AbhFMRXF (ORCPT ); Sun, 13 Jun 2021 13:23:05 -0400 Received: from mail.kernel.org ([198.145.29.99]:41502 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232014AbhFMRXE (ORCPT ); Sun, 13 Jun 2021 13:23:04 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id 2EBD861078; Sun, 13 Jun 2021 17:21:03 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604863; bh=BFcTqCUkWD1n32IJELtkpAAXqLSVBcbxtyyzsG9A/Y0=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=WdtjKn26970pqVlbHCzddlwXUm5gHr3SSGo+k2CoA6CZ+FbAoFE7rNv92TTeUqHcY Fqa2jFUGsfILMDgeqB6r3SSCqqF0v8UFR/M6BpR61JVNAPapcM4kws6gAomuWhNmaH 8wZ50JBK5OHfP1fwMsbk4PtFWA7Br7lyDkhuEZd3T5yO6dW0BZ0wu5iHbzgTsZekGC 4tTOrDjdyxE56O0Xls68ZB3w7gFMGs/DX4GKjosUDrhmCvCxSXBm5xgL2szZawIDrH dxU1q7X1BzxzlwyjScoqEzvQZjMP2rjjPjY2+XLK2LYLd8MN5MrrTpLbcs2t4A5Q6A 0Dm/r7kBYEwtg== Subject: [PATCH 12/16] xfs: parallelize inode inactivation From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:21:02 -0700 Message-ID: <162360486288.1530792.18351614470122965770.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Split the inode inactivation work into per-AG work items so that we can take advantage of parallelization. Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_ag.c | 3 + fs/xfs/libxfs/xfs_ag.h | 3 + fs/xfs/xfs_icache.c | 101 ++++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_mount.c | 11 +++-- fs/xfs/xfs_mount.h | 2 - fs/xfs/xfs_super.c | 1 fs/xfs/xfs_trace.h | 8 ++-- 7 files changed, 97 insertions(+), 32 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 0765a0ba30e1..7652d90d7d0d 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -173,6 +173,7 @@ __xfs_free_perag( struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); + ASSERT(!delayed_work_pending(&pag->pag_inodegc_work)); ASSERT(atomic_read(&pag->pag_ref) == 0); kmem_free(pag); } @@ -195,6 +196,7 @@ xfs_free_perag( ASSERT(atomic_read(&pag->pag_ref) == 0); cancel_delayed_work_sync(&pag->pag_blockgc_work); + cancel_delayed_work_sync(&pag->pag_inodegc_work); xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); @@ -253,6 +255,7 @@ xfs_initialize_perag( spin_lock_init(&pag->pagb_lock); spin_lock_init(&pag->pag_state_lock); INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); + INIT_DELAYED_WORK(&pag->pag_inodegc_work, xfs_inodegc_worker); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); init_waitqueue_head(&pag->pagb_wait); pag->pagb_count = 0; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 4c6f9045baca..3929ea35b0d4 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -96,6 +96,9 @@ struct xfs_perag { /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; + /* background inode inactivation */ + struct delayed_work pag_inodegc_work; + /* * Unlinked inode information. This incore information reflects * data stored in the AGI, so callers must hold the AGI buffer lock diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 210a9e3cd19e..f58d0455e38f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -299,6 +299,43 @@ xfs_worker_delay_freesp( return delay_ms >> shift; } +/* + * Scale down the background work delay if we're low on free space in this AG. + * Similar to the way that we throttle preallocations, we halve the delay time + * for every low free space threshold that isn't met. Return value is in ms. + */ +static inline unsigned int +xfs_work_delay_perag( + struct xfs_perag *pag, + unsigned int delay_ms) +{ + struct xfs_mount *mp = pag->pag_mount; + xfs_extlen_t freesp; + unsigned int shift = 0; + + if (!pag->pagf_init) + return delay_ms; + + /* Free space in this AG that can be allocated to file data */ + freesp = pag->pagf_freeblks + pag->pagf_flcount; + freesp -= (pag->pag_meta_resv.ar_reserved + + pag->pag_rmapbt_resv.ar_reserved); + + if (freesp < mp->m_ag_low_space[XFS_LOWSP_5_PCNT]) { + shift = 2; + if (freesp < mp->m_ag_low_space[XFS_LOWSP_4_PCNT]) + shift++; + if (freesp < mp->m_ag_low_space[XFS_LOWSP_3_PCNT]) + shift++; + if (freesp < mp->m_ag_low_space[XFS_LOWSP_2_PCNT]) + shift++; + if (freesp < mp->m_ag_low_space[XFS_LOWSP_1_PCNT]) + shift++; + } + + return delay_ms >> shift; +} + /* * Compute the lag between scheduling and executing background work based on * free space in the filesystem. 
If an inode is passed in, its dquots will @@ -306,18 +343,20 @@ xfs_worker_delay_freesp( */ static inline unsigned int xfs_worker_delay_ms( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_inode *ip, unsigned int default_ms) { - unsigned int udelay, gdelay, pdelay, fdelay; + struct xfs_mount *mp = pag->pag_mount; + unsigned int udelay, gdelay, pdelay, fdelay, adelay; udelay = xfs_worker_delay_dquot(ip, XFS_DQTYPE_USER, default_ms); gdelay = xfs_worker_delay_dquot(ip, XFS_DQTYPE_GROUP, default_ms); pdelay = xfs_worker_delay_dquot(ip, XFS_DQTYPE_PROJ, default_ms); fdelay = xfs_worker_delay_freesp(mp, default_ms); + adelay = xfs_work_delay_perag(pag, default_ms); - return min(min(udelay, gdelay), min(pdelay, fdelay)); + return min(adelay, min(min(udelay, gdelay), min(pdelay, fdelay))); } /* @@ -343,9 +382,11 @@ xfs_blockgc_queue( */ static void xfs_inodegc_queue( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_inode *ip) { + struct xfs_mount *mp = pag->pag_mount; + if (!test_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags)) return; @@ -353,9 +394,9 @@ xfs_inodegc_queue( if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) { unsigned int delay; - delay = xfs_worker_delay_ms(mp, ip, xfs_inodegc_ms); - trace_xfs_inodegc_queue(mp, delay, _RET_IP_); - queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, + delay = xfs_worker_delay_ms(pag, ip, xfs_inodegc_ms); + trace_xfs_inodegc_queue(mp, pag->pag_agno, delay, _RET_IP_); + queue_delayed_work(mp->m_gc_workqueue, &pag->pag_inodegc_work, msecs_to_jiffies(delay)); } rcu_read_unlock(); @@ -367,11 +408,13 @@ xfs_inodegc_queue( */ static void xfs_inodegc_queue_sooner( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_inode *ip) { + struct xfs_mount *mp = pag->pag_mount; + if (!XFS_IS_QUOTA_ON(mp) || - !delayed_work_pending(&mp->m_inodegc_work) || + !delayed_work_pending(&pag->pag_inodegc_work) || !test_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags)) return; @@ -379,11 +422,11 @@ xfs_inodegc_queue_sooner( if (!radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) goto unlock; - if (xfs_worker_delay_ms(mp, ip, xfs_inodegc_ms) == xfs_inodegc_ms) + if (xfs_worker_delay_ms(pag, ip, xfs_inodegc_ms) == xfs_inodegc_ms) goto unlock; - trace_xfs_inodegc_queue(mp, 0, _RET_IP_); - queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, 0); + trace_xfs_inodegc_queue(mp, pag->pag_agno, 0, _RET_IP_); + mod_delayed_work(mp->m_gc_workqueue, &pag->pag_inodegc_work, 0); unlock: rcu_read_unlock(); } @@ -427,7 +470,7 @@ xfs_perag_set_inode_tag( xfs_blockgc_queue(pag); break; case XFS_ICI_INODEGC_TAG: - xfs_inodegc_queue(mp, ip); + xfs_inodegc_queue(pag, ip); break; } @@ -561,7 +604,7 @@ xfs_inode_mark_reclaimable( spin_unlock(&pag->pag_ici_lock); if (need_inactive && already_queued) - xfs_inodegc_queue_sooner(mp, ip); + xfs_inodegc_queue_sooner(pag, ip); xfs_perag_put(pag); } @@ -2058,16 +2101,17 @@ void xfs_inodegc_worker( struct work_struct *work) { - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_inodegc_work); + struct xfs_perag *pag = container_of(to_delayed_work(work), + struct xfs_perag, pag_inodegc_work); + struct xfs_mount *mp = pag->pag_mount; /* * Inactivation never returns error codes and never fails to push a * tagged inode to reclaim. Loop until there there's nothing left. 
*/ - while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) { - trace_xfs_inodegc_worker(mp, 0, _RET_IP_); - xfs_icwalk(mp, XFS_ICWALK_INODEGC, NULL); + while (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_INODEGC_TAG)) { + trace_xfs_inodegc_worker(mp, pag->pag_agno, 0, _RET_IP_); + xfs_icwalk_ag(pag, XFS_ICWALK_INODEGC, NULL); } } @@ -2079,8 +2123,13 @@ void xfs_inodegc_flush( struct xfs_mount *mp) { + struct xfs_perag *pag; + xfs_agnumber_t agno; + trace_xfs_inodegc_flush(mp, 0, _RET_IP_); - flush_delayed_work(&mp->m_inodegc_work); + + for_each_perag_tag(mp, agno, pag, XFS_ICI_INODEGC_TAG) + flush_delayed_work(&pag->pag_inodegc_work); } /* Disable the inode inactivation background worker and wait for it to stop. */ @@ -2088,10 +2137,14 @@ void xfs_inodegc_stop( struct xfs_mount *mp) { + struct xfs_perag *pag; + xfs_agnumber_t agno; + if (!test_and_clear_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags)) return; - cancel_delayed_work_sync(&mp->m_inodegc_work); + for_each_perag(mp, agno, pag) + cancel_delayed_work_sync(&pag->pag_inodegc_work); trace_xfs_inodegc_stop(mp, 0, _RET_IP_); } @@ -2103,11 +2156,15 @@ void xfs_inodegc_start( struct xfs_mount *mp) { + struct xfs_perag *pag; + xfs_agnumber_t agno; + if (test_and_set_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags)) return; trace_xfs_inodegc_start(mp, 0, _RET_IP_); - xfs_inodegc_queue(mp, NULL); + for_each_perag_tag(mp, agno, pag, XFS_ICI_INODEGC_TAG) + xfs_inodegc_queue(pag, NULL); } /* XFS Inode Cache Walking Code */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index ab65a14e51e6..eff375f92005 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -365,13 +365,16 @@ void xfs_set_low_space_thresholds( struct xfs_mount *mp) { - int i; + uint64_t space = mp->m_sb.sb_dblocks; + uint32_t ag_space = mp->m_sb.sb_agblocks; + int i; + + do_div(space, 100); + do_div(ag_space, 100); for (i = 0; i < XFS_LOWSP_MAX; i++) { - uint64_t space = mp->m_sb.sb_dblocks; - - do_div(space, 100); mp->m_low_space[i] = space * (i + 1); + mp->m_ag_low_space[i] = ag_space * (i + 1); } } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index dc906b78e24c..154aa95d968c 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -131,6 +131,7 @@ typedef struct xfs_mount { uint m_rsumsize; /* size of rt summary, bytes */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_qflags; /* quota status flags */ + int32_t m_ag_low_space[XFS_LOWSP_MAX]; uint64_t m_flags; /* global mount flags */ int64_t m_low_space[XFS_LOWSP_MAX]; struct xfs_ino_geometry m_ino_geo; /* inode geometry */ @@ -191,7 +192,6 @@ typedef struct xfs_mount { uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ - struct delayed_work m_inodegc_work; /* background inode inactive */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; struct xfs_kobj m_error_meta_kobj; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 45ef63b5b2f0..66b61d38f401 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1865,7 +1865,6 @@ static int xfs_init_fs_context( mutex_init(&mp->m_growlock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - INIT_DELAYED_WORK(&mp->m_inodegc_work, xfs_inodegc_worker); mp->m_kobj.kobject.kset = xfs_kset; /* * We don't create the finobt per-ag space reservation until after log diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 
ca9bfbd28886..404f2f32002f 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -122,7 +122,7 @@ TRACE_EVENT(xlog_intent_recovery_failed, __entry->error, __entry->function) ); -DECLARE_EVENT_CLASS(xfs_perag_class, +DECLARE_EVENT_CLASS(xfs_perag_ref_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, unsigned long caller_ip), TP_ARGS(mp, agno, refcount, caller_ip), @@ -146,7 +146,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class, ); #define DEFINE_PERAG_REF_EVENT(name) \ -DEFINE_EVENT(xfs_perag_class, name, \ +DEFINE_EVENT(xfs_perag_ref_class, name, \ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \ unsigned long caller_ip), \ TP_ARGS(mp, agno, refcount, caller_ip)) @@ -155,6 +155,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag); +DEFINE_PERAG_REF_EVENT(xfs_inodegc_queue); +DEFINE_PERAG_REF_EVENT(xfs_inodegc_worker); DECLARE_EVENT_CLASS(xfs_fs_class, TP_PROTO(struct xfs_mount *mp, int data, unsigned long caller_ip), @@ -191,8 +193,6 @@ DEFINE_EVENT(xfs_fs_class, name, \ DEFINE_FS_EVENT(xfs_inodegc_flush); DEFINE_FS_EVENT(xfs_inodegc_start); DEFINE_FS_EVENT(xfs_inodegc_stop); -DEFINE_FS_EVENT(xfs_inodegc_queue); -DEFINE_FS_EVENT(xfs_inodegc_worker); DEFINE_FS_EVENT(xfs_fs_sync_fs); DECLARE_EVENT_CLASS(xfs_ag_class, From patchwork Sun Jun 13 17:21:08 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 12317825 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 6B59CC48BDF for ; Sun, 13 Jun 2021 17:21:13 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 4744861107 for ; Sun, 13 Jun 2021 17:21:13 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232020AbhFMRXN (ORCPT ); Sun, 13 Jun 2021 13:23:13 -0400 Received: from mail.kernel.org ([198.145.29.99]:41528 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232014AbhFMRXK (ORCPT ); Sun, 13 Jun 2021 13:23:10 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id ABF7061107; Sun, 13 Jun 2021 17:21:08 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604868; bh=j5uuBQ9l0QBe8KlRMgq54x5z325uRHbxMX0sXhEFwkE=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=G3eziSZtqF/mGoioG14Ls8KdDdCkSuzKxv/CMKARfzXAYLuBCByXW7r6z5Wby9MVA 4xlvAdiLxTg1KBx8Ik3NSLtq6gNNHuHWvS/SBvcX6X+gpzzHZq1UbEOePrjlR9I5eS dLr2PXO5ZfDujVy1YYtK0V5FEOXxPSyR1YY0ROfiAciu8AEdzLWiUUONTZglhjvki8 aKPmm4RapE4iiypHc7C5kp/9PVgpwngN7cQh+je4oKQW2cRdYLiVHp65N+AReDIHFg KzcfEqf7Rr/CySLmnRBsWeIJYGtJW6CknVFRUdukcuveY9G5X/i7b/XdQeA/10yB4a YnxtaIPvx2/fg== Subject: [PATCH 13/16] xfs: don't run speculative preallocation gc when fs is frozen From: "Darrick J. 
Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:21:08 -0700 Message-ID: <162360486839.1530792.12073922123665591653.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Now that we have the infrastructure to switch background workers on and off at will, fix the block gc worker code so that we don't actually run the worker when the filesystem is frozen, same as we do for deferred inactivation. Signed-off-by: Darrick J. Wong --- fs/xfs/scrub/common.c | 9 +++++---- fs/xfs/xfs_icache.c | 38 ++++++++++++++++++++++++++++++-------- fs/xfs/xfs_mount.c | 1 + fs/xfs/xfs_mount.h | 7 +++++++ fs/xfs/xfs_super.c | 9 ++++++--- fs/xfs/xfs_trace.h | 4 ++++ 6 files changed, 53 insertions(+), 15 deletions(-) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index b674bc6ed78d..2deda2c5189a 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -893,11 +893,12 @@ xchk_start_reaping( struct xfs_scrub *sc) { /* - * Readonly filesystems do not perform inactivation, so there's no - * need to restart the worker. + * Readonly filesystems do not perform inactivation or speculative + * preallocation, so there's no need to restart the workers. */ - if (!(sc->mp->m_flags & XFS_MOUNT_RDONLY)) + if (!(sc->mp->m_flags & XFS_MOUNT_RDONLY)) { xfs_inodegc_start(sc->mp); - xfs_blockgc_start(sc->mp); + xfs_blockgc_start(sc->mp); + } sc->flags &= ~XCHK_REAPING_DISABLED; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f58d0455e38f..780100756738 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -367,11 +367,19 @@ static inline void xfs_blockgc_queue( struct xfs_perag *pag) { + struct xfs_mount *mp = pag->pag_mount; + + if (!test_bit(XFS_OPFLAG_BLOCKGC_RUNNING_BIT, &mp->m_opflags)) + return; + rcu_read_lock(); - if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(pag->pag_mount->m_gc_workqueue, - &pag->pag_blockgc_work, - msecs_to_jiffies(xfs_blockgc_secs * 1000)); + if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) { + unsigned int delay = xfs_blockgc_secs * 1000; + + trace_xfs_blockgc_queue(mp, pag->pag_agno, delay, _RET_IP_); + queue_delayed_work(mp->m_gc_workqueue, &pag->pag_blockgc_work, + msecs_to_jiffies(delay)); + } rcu_read_unlock(); } @@ -1769,8 +1777,12 @@ xfs_blockgc_stop( struct xfs_perag *pag; xfs_agnumber_t agno; - for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + if (!test_and_clear_bit(XFS_OPFLAG_BLOCKGC_RUNNING_BIT, &mp->m_opflags)) + return; + + for_each_perag(mp, agno, pag) cancel_delayed_work_sync(&pag->pag_blockgc_work); + trace_xfs_blockgc_stop(mp, 0, _RET_IP_); } /* Enable post-EOF and CoW block auto-reclamation. */ @@ -1781,6 +1793,10 @@ xfs_blockgc_start( struct xfs_perag *pag; xfs_agnumber_t agno; + if (!test_and_set_bit(XFS_OPFLAG_BLOCKGC_RUNNING_BIT, &mp->m_opflags)) + return; + + trace_xfs_blockgc_start(mp, 0, _RET_IP_); for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) xfs_blockgc_queue(pag); } @@ -1838,6 +1854,13 @@ xfs_blockgc_scan_inode( unsigned int lockflags = 0; int error; + /* + * Speculative preallocation gc isn't supposed to run when the fs is + * frozen because we don't want kernel threads to block on transaction + * allocation. 
+ */ + ASSERT(ip->i_mount->m_super->s_writers.frozen < SB_FREEZE_FS); + error = xfs_inode_free_eofblocks(ip, icw, &lockflags); if (error) goto unlock; @@ -1860,13 +1883,12 @@ xfs_blockgc_worker( struct xfs_mount *mp = pag->pag_mount; int error; - if (!sb_start_write_trylock(mp->m_super)) - return; + trace_xfs_blockgc_worker(mp, pag->pag_agno, 0, _RET_IP_); + error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", pag->pag_agno, error); - sb_end_write(mp->m_super); xfs_blockgc_queue(pag); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index eff375f92005..558414e1460c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -789,6 +789,7 @@ xfs_mountfs( /* Enable background inode inactivation workers. */ xfs_inodegc_start(mp); + xfs_blockgc_start(mp); /* * Get and sanity-check the root inode. diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 154aa95d968c..8d6565fdf56a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -277,6 +277,13 @@ enum xfs_opflag_bits { * waiting to be processed. */ XFS_OPFLAG_INODEGC_RUNNING_BIT = 0, + + /* + * If set, background speculative prealloc gc worker threads will be + * scheduled to process queued blockgc work. If not, inodes retain + * their preallocations until explicitly deleted. + */ + XFS_OPFLAG_BLOCKGC_RUNNING_BIT = 1, }; /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 66b61d38f401..4aae20d2761f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -893,14 +893,17 @@ xfs_fs_unfreeze( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - xfs_blockgc_start(mp); /* * Don't reactivate the inodegc worker on a readonly filesystem because - * inodes are sent directly to reclaim. + * inodes are sent directly to reclaim. Don't reactivate the blockgc + * worker because there are no speculative preallocations on a readonly + * filesystem. */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_blockgc_start(mp); xfs_inodegc_start(mp); + } return 0; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 404f2f32002f..0795427e8f38 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -157,6 +157,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_inodegc_queue); DEFINE_PERAG_REF_EVENT(xfs_inodegc_worker); +DEFINE_PERAG_REF_EVENT(xfs_blockgc_worker); +DEFINE_PERAG_REF_EVENT(xfs_blockgc_queue); DECLARE_EVENT_CLASS(xfs_fs_class, TP_PROTO(struct xfs_mount *mp, int data, unsigned long caller_ip), @@ -194,6 +196,8 @@ DEFINE_FS_EVENT(xfs_inodegc_flush); DEFINE_FS_EVENT(xfs_inodegc_start); DEFINE_FS_EVENT(xfs_inodegc_stop); DEFINE_FS_EVENT(xfs_fs_sync_fs); +DEFINE_FS_EVENT(xfs_blockgc_start); +DEFINE_FS_EVENT(xfs_blockgc_stop); DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), From patchwork Sun Jun 13 17:21:13 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 12317827 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 4D3FAC48BE8 for ; Sun, 13 Jun 2021 17:21:15 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 1FAE561284 for ; Sun, 13 Jun 2021 17:21:15 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232017AbhFMRXP (ORCPT ); Sun, 13 Jun 2021 13:23:15 -0400 Received: from mail.kernel.org ([198.145.29.99]:41556 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232014AbhFMRXP (ORCPT ); Sun, 13 Jun 2021 13:23:15 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id 3031661107; Sun, 13 Jun 2021 17:21:14 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604874; bh=HsSEZQ0jQ4HEgeetc+Oia8BIIZwLyfhrj/q9hfygaTI=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=KBU9ubZSUSKZ+4B+8t5X83aS1ojOQNIgyq74gwUpTiPsJUWPEU/+gSjdYAf1xn5Kn S10JWxhKSHCE7xmPobqTnlI0XluIZeblnV8HG24IdW3CSc/M0TLAzJTsiz4JrrqjO6 xP1yplj2UiKNWq9om10onEQk7AcB3+Z0a92rWXag88NSx6NuQdevhw9m4BjoSxTtJR ku2PHySoNEdz6AOCW+Qi/NZzHbCVw8fiMaueWsLmUTRVwjOWBtyzIbyvesyGYcfyA2 hMwPPyBwGfOqZf1w/R6JFLeikhE9kwfCoG55wzpPxcvlnsOE3mwoWNvEYS7jA/vRpb uEmWGeuOZEnCA== Subject: [PATCH 14/16] xfs: scale speculative preallocation gc delay based on free space From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:21:13 -0700 Message-ID: <162360487389.1530792.11066249469501881271.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Now that we have the ability to scale down the lag between scheduling and executing background cleanup work for inode inactivations, apply the same logic to speculative preallocation gc. In other words, be more proactive about trimming unused speculative preallocations if space is low. Signed-off-by: Darrick J. 
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 780100756738..f530dc2803ed 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -365,7 +365,8 @@ xfs_worker_delay_ms(
  */
 static inline void
 xfs_blockgc_queue(
-	struct xfs_perag	*pag)
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip)
 {
 	struct xfs_mount	*mp = pag->pag_mount;
@@ -374,8 +375,9 @@ xfs_blockgc_queue(
 
 	rcu_read_lock();
 	if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) {
-		unsigned int	delay = xfs_blockgc_secs * 1000;
+		unsigned int	delay;
 
+		delay = xfs_worker_delay_ms(pag, ip, xfs_blockgc_secs * 1000);
 		trace_xfs_blockgc_queue(mp, pag->pag_agno, delay, _RET_IP_);
 		queue_delayed_work(mp->m_gc_workqueue, &pag->pag_blockgc_work,
 				msecs_to_jiffies(delay));
@@ -383,6 +385,36 @@ xfs_blockgc_queue(
 	rcu_read_unlock();
 }
 
+/*
+ * Reschedule the background speculative gc worker immediately if space is
+ * getting tight and the worker hasn't started running yet.
+ */
+static void
+xfs_blockgc_queue_sooner(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = pag->pag_mount;
+	unsigned int		blockgc_ms = xfs_blockgc_secs * 1000;
+
+	if (!XFS_IS_QUOTA_ON(mp) ||
+	    !delayed_work_pending(&pag->pag_blockgc_work) ||
+	    !test_bit(XFS_OPFLAG_BLOCKGC_RUNNING_BIT, &mp->m_opflags))
+		return;
+
+	rcu_read_lock();
+	if (!radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_BLOCKGC_TAG))
+		goto unlock;
+
+	if (xfs_worker_delay_ms(pag, ip, blockgc_ms) == blockgc_ms)
+		goto unlock;
+
+	trace_xfs_blockgc_queue(mp, pag->pag_agno, 0, _RET_IP_);
+	mod_delayed_work(mp->m_gc_workqueue, &pag->pag_blockgc_work, 0);
+unlock:
+	rcu_read_unlock();
+}
+
 /*
  * Queue a background inactivation worker if there are inodes that need to be
  * inactivated and higher level xfs code hasn't disabled the background
@@ -475,7 +507,7 @@ xfs_perag_set_inode_tag(
 		xfs_reclaim_work_queue(mp);
 		break;
 	case XFS_ICI_BLOCKGC_TAG:
-		xfs_blockgc_queue(pag);
+		xfs_blockgc_queue(pag, ip);
 		break;
 	case XFS_ICI_INODEGC_TAG:
 		xfs_inodegc_queue(pag, ip);
@@ -1594,6 +1626,7 @@ xfs_blockgc_set_iflag(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_perag	*pag;
+	bool			already_queued;
 
 	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
 
@@ -1610,9 +1643,13 @@ xfs_blockgc_set_iflag(
 
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 	spin_lock(&pag->pag_ici_lock);
-	xfs_perag_set_inode_tag(pag, ip, XFS_ICI_BLOCKGC_TAG);
+	already_queued = xfs_perag_set_inode_tag(pag, ip, XFS_ICI_BLOCKGC_TAG);
 	spin_unlock(&pag->pag_ici_lock);
 
+	if (already_queued)
+		xfs_blockgc_queue_sooner(pag, ip);
+
 	xfs_perag_put(pag);
 }
 
@@ -1798,7 +1835,7 @@ xfs_blockgc_start(
 
 	trace_xfs_blockgc_start(mp, 0, _RET_IP_);
 	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
-		xfs_blockgc_queue(pag);
+		xfs_blockgc_queue(pag, NULL);
 }
 
 /* Don't try to run block gc on an inode that's in any of these states. */
@@ -1889,7 +1926,7 @@ xfs_blockgc_worker(
 	if (error)
 		xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
 				pag->pag_agno, error);
-	xfs_blockgc_queue(pag);
+	xfs_blockgc_queue(pag, NULL);
 }
 
 /*
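[The expedite path above leans on mod_delayed_work(), which re-arms an
already-pending delayed work item to a new (here zero) delay without ever
queueing a second instance. A standalone model of that behaviour, with the
workqueue machinery reduced to a struct so it compiles on its own; none of
these names are the kernel API.]

#include <stdbool.h>
#include <stdio.h>

struct delayed_work_model {
        bool            pending;
        unsigned int    timer_ms;
};

/*
 * Re-arm a pending item to a (usually shorter) delay, or queue it fresh;
 * mirrors mod_delayed_work() semantics: at most one instance is queued.
 */
static void mod_delayed_work_model(struct delayed_work_model *dw,
                                   unsigned int delay_ms)
{
        dw->pending = true;
        dw->timer_ms = delay_ms;
}

int main(void)
{
        struct delayed_work_model gc = { .pending = true, .timer_ms = 300000 };

        /* Space just got tight: pull the 5-minute timer in to "now". */
        if (gc.pending)
                mod_delayed_work_model(&gc, 0);
        printf("pending=%d timer=%ums\n", gc.pending, gc.timer_ms);
        return 0;
}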
Wong" X-Patchwork-Id: 12317829 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 0755DC48BCF for ; Sun, 13 Jun 2021 17:21:21 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id E54B261284 for ; Sun, 13 Jun 2021 17:21:20 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232021AbhFMRXV (ORCPT ); Sun, 13 Jun 2021 13:23:21 -0400 Received: from mail.kernel.org ([198.145.29.99]:41582 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232014AbhFMRXV (ORCPT ); Sun, 13 Jun 2021 13:23:21 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id A7D1761078; Sun, 13 Jun 2021 17:21:19 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604879; bh=6yr9JbqpIh9c87TQcZ9DPfuqvYWxRyMQ/BuyvDwXB9I=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=vM48YorukCMroHlQz5eNo5uj6OaY6loO4K5wW7tPHhj0LB1TSymzFmEZvYZnUa9up WyIov/sRusZZbLDzRhekkLRNU/Hf3harxra9a1ywXGIoJedR4Ncq6jHaMAPiz+TUal a4w1wihZk4HDV0IWFMb6b8ZrVixOSKrmXKl0XyDlaORYCwTaaVWyYju+wsLOakBV4O fQp/cu1tmjWIrMcwfiG+wJsECoozVGRmL3VU3j3st9LOTPJ5YH9rCJTMNtNlaOozPw +PmeyR/wR7xkEUdP02hj2Zt2zqf/PuDycBOrQY40+vDxH833jmrCBQkrJqNILsh5Kr oJrT5jb5dAHUg== Subject: [PATCH 15/16] xfs: use background worker pool when transactions can't get free space From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:21:19 -0700 Message-ID: <162360487939.1530792.9794697674663330727.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong In xfs_trans_alloc, if the block reservation call returns ENOSPC, we call xfs_blockgc_free_space with a NULL icwalk structure to try to free space. Each frontend thread that encounters this situation starts its own walk of the inode cache to see if it can find anything, which is wasteful since we don't have any additional selection criteria. For this one common case, create a function that reschedules all pending background work immediately and flushes the workqueue so that the scan can run in parallel. Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 19 +++++++++++++++++++ fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_trace.h | 1 + fs/xfs/xfs_trans.c | 5 +---- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f530dc2803ed..656ae8d81ec5 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1949,6 +1949,25 @@ xfs_blockgc_free_space( return xfs_icwalk(mp, XFS_ICWALK_INODEGC, icw); } +/* + * Reclaim all the free space that we can by scheduling the background blockgc + * and inodegc workers immediately and waiting for them all to clear. 
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index f530dc2803ed..656ae8d81ec5 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1949,6 +1949,25 @@ xfs_blockgc_free_space(
 	return xfs_icwalk(mp, XFS_ICWALK_INODEGC, icw);
 }
 
+/*
+ * Reclaim all the free space that we can by scheduling the background blockgc
+ * and inodegc workers immediately and waiting for them all to clear.
+ */
+void
+xfs_blockgc_flush_all(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+
+	trace_xfs_blockgc_flush_all(mp, 0, _RET_IP_);
+
+	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+		flush_delayed_work(&pag->pag_blockgc_work);
+
+	xfs_inodegc_flush(mp);
+}
+
 /*
  * Run cow/eofblocks scans on the supplied dquots.  We don't know exactly which
  * quota caused an allocation failure, so we make a best effort by including
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 840eac06a71b..a9e9bb3ce4bb 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -59,6 +59,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp,
 		unsigned int iwalk_flags);
 int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags);
 int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm);
+void xfs_blockgc_flush_all(struct xfs_mount *mp);
 
 void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
 void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0795427e8f38..ecbfa8399776 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -198,6 +198,7 @@ DEFINE_FS_EVENT(xfs_inodegc_stop);
 DEFINE_FS_EVENT(xfs_fs_sync_fs);
 DEFINE_FS_EVENT(xfs_blockgc_start);
 DEFINE_FS_EVENT(xfs_blockgc_stop);
+DEFINE_FS_EVENT(xfs_blockgc_flush_all);
 
 DECLARE_EVENT_CLASS(xfs_ag_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 586f2992b789..9fa340cc018d 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -295,10 +295,7 @@ xfs_trans_alloc(
 		 * Do not perform a synchronous scan because callers can hold
 		 * other locks.
 		 */
-		error = xfs_blockgc_free_space(mp, NULL);
-		if (error)
-			return error;
-
+		xfs_blockgc_flush_all(mp);
 		want_retry = false;
 		goto retry;
 	}
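[What makes xfs_blockgc_flush_all() synchronous is flush_delayed_work():
for a delayed item it fires a still-pending timer immediately and then
blocks until the handler has finished. A toy stand-in for that semantic,
with the real workqueue and timer machinery omitted; nothing here is the
kernel API.]

#include <stdio.h>

struct dwork {
        int     pending;
        void    (*fn)(void);
};

static void run_blockgc(void)
{
        printf("blockgc pass\n");
}

static void flush_delayed_work_model(struct dwork *dw)
{
        if (dw->pending) {
                dw->pending = 0;
                dw->fn();       /* run now instead of at timer expiry */
        }
        /* ...and a real workqueue would also wait if it were mid-run. */
}

int main(void)
{
        struct dwork gc = { 1, run_blockgc };

        flush_delayed_work_model(&gc);
        return 0;
}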
Wong" X-Patchwork-Id: 12317831 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.2 required=3.0 tests=BAYES_00,DKIMWL_WL_HIGH, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 69829C48BDF for ; Sun, 13 Jun 2021 17:21:26 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 4EC9261284 for ; Sun, 13 Jun 2021 17:21:26 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232014AbhFMRX0 (ORCPT ); Sun, 13 Jun 2021 13:23:26 -0400 Received: from mail.kernel.org ([198.145.29.99]:41608 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232023AbhFMRX0 (ORCPT ); Sun, 13 Jun 2021 13:23:26 -0400 Received: by mail.kernel.org (Postfix) with ESMTPSA id 2DF9861107; Sun, 13 Jun 2021 17:21:25 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1623604885; bh=1dux238x4nCpTc/Ks1zNtPWbwcksk/ApfN3YPgywvcA=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=AxUvaM6/JpWXw1DhYU0Ok7EdmweRl5u2LbGP+HWxCo5ryp8YF3/w7OOoKJ5OeN4g/ BPyGpbZ6Cx29QjqOyM/oi8Zu1jcTR+gdd+onxDgP/I6E5AYkDEd43uU6KI/I+3LD7P 0rWobUowNY4/I8N6h3+iJjIwxQNGsUndhActzbewPfZMrcexjeAIE2dO8gDOYujoNb eoAt7E/aT6HLSHQbck+fuFnbTSZ4iaSYoR6/mDLdJOV2pMIGrMRvih3wHyHNUlp3T0 6rSmQXcZDwBK45hl38iPk3uNFyPNy0pYWiRjWCFuZHseFeHe5Zj6sFpPHxe3/GHbOk rtOuB4AZkesug== Subject: [PATCH 16/16] xfs: avoid buffer deadlocks when walking fs inodes From: "Darrick J. Wong" To: djwong@kernel.org Cc: Dave Chinner , Christoph Hellwig , linux-xfs@vger.kernel.org, david@fromorbit.com, hch@infradead.org, bfoster@redhat.com Date: Sun, 13 Jun 2021 10:21:24 -0700 Message-ID: <162360488488.1530792.1140215685661520911.stgit@locust> In-Reply-To: <162360479631.1530792.17147217854887531696.stgit@locust> References: <162360479631.1530792.17147217854887531696.stgit@locust> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong When we're servicing an INUMBERS or BULKSTAT request or running quotacheck, grab an empty transaction so that we can use its inherent recursive buffer locking abilities to detect inode btree cycles without hitting ABBA buffer deadlocks. This patch requires the deferred inode inactivation patchset because xfs_irele cannot directly call xfs_inactive when the iwalk itself has an (empty) transaction. Found by fuzzing an inode btree pointer to introduce a cycle into the tree (xfs/365). Signed-off-by: Darrick J. 
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f331975a16de..84c17a9f9869 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -19,6 +19,7 @@
 #include "xfs_error.h"
 #include "xfs_icache.h"
 #include "xfs_health.h"
+#include "xfs_trans.h"
 
 /*
  * Bulk Stat
@@ -163,6 +164,7 @@ xfs_bulkstat_one(
 		.formatter	= formatter,
 		.breq		= breq,
 	};
+	struct xfs_trans	*tp;
 	int			error;
 
 	if (breq->mnt_userns != &init_user_ns) {
@@ -178,9 +180,18 @@ xfs_bulkstat_one(
 	if (!bc.buf)
 		return -ENOMEM;
 
-	error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, NULL,
-			breq->startino, &bc);
+	/*
+	 * Grab an empty transaction so that we can use its recursive buffer
+	 * locking abilities to detect cycles in the inobt without deadlocking.
+	 */
+	error = xfs_trans_alloc_empty(breq->mp, &tp);
+	if (error)
+		goto out;
 
+	error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, tp,
+			breq->startino, &bc);
+	xfs_trans_cancel(tp);
+out:
 	kmem_free(bc.buf);
 
 	/*
@@ -244,6 +255,7 @@ xfs_bulkstat(
 		.formatter	= formatter,
 		.breq		= breq,
 	};
+	struct xfs_trans	*tp;
 	int			error;
 
 	if (breq->mnt_userns != &init_user_ns) {
@@ -259,9 +271,18 @@ xfs_bulkstat(
 	if (!bc.buf)
 		return -ENOMEM;
 
-	error = xfs_iwalk(breq->mp, NULL, breq->startino, breq->flags,
+	/*
+	 * Grab an empty transaction so that we can use its recursive buffer
+	 * locking abilities to detect cycles in the inobt without deadlocking.
+	 */
+	error = xfs_trans_alloc_empty(breq->mp, &tp);
+	if (error)
+		goto out;
+
+	error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags,
 			xfs_bulkstat_iwalk, breq->icount, &bc);
-
+	xfs_trans_cancel(tp);
+out:
 	kmem_free(bc.buf);
 
 	/*
@@ -374,13 +395,24 @@ xfs_inumbers(
 		.formatter	= formatter,
 		.breq		= breq,
 	};
+	struct xfs_trans	*tp;
 	int			error = 0;
 
 	if (xfs_bulkstat_already_done(breq->mp, breq->startino))
 		return 0;
 
-	error = xfs_inobt_walk(breq->mp, NULL, breq->startino, breq->flags,
+	/*
+	 * Grab an empty transaction so that we can use its recursive buffer
+	 * locking abilities to detect cycles in the inobt without deadlocking.
+	 */
+	error = xfs_trans_alloc_empty(breq->mp, &tp);
+	if (error)
+		goto out;
+
+	error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags,
 			xfs_inumbers_walk, breq->icount, &ic);
+	xfs_trans_cancel(tp);
+out:
 
 	/*
 	 * We found some inode groups, so clear the error status and return
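[The same allocate/walk/cancel bracket appears three times in the
xfs_itable.c hunks above. Shown once in isolation as a standalone sketch:
the struct and the trans_* helpers below are stand-ins for the kernel's
xfs_trans_alloc_empty()/xfs_trans_cancel(), which is why cancel (never
commit) is the right way to dispose of a transaction that reserved nothing
and dirtied nothing.]

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct trans {
        int dummy;
};

static int trans_alloc_empty(struct trans **tpp)
{
        /* No block reservation, no log space: just a locking context. */
        *tpp = calloc(1, sizeof(**tpp));
        return *tpp ? 0 : -ENOMEM;
}

static void trans_cancel(struct trans *tp)
{
        free(tp);
}

static int do_walk(struct trans *tp)
{
        printf("walk the inobt with recursive buffer locking via %p\n",
               (void *)tp);
        return 0;
}

static int walk_with_empty_trans(void)
{
        struct trans *tp;
        int error;

        error = trans_alloc_empty(&tp);
        if (error)
                return error;
        error = do_walk(tp);
        trans_cancel(tp);       /* empty transaction: cancel, never commit */
        return error;
}

int main(void)
{
        return walk_with_empty_trans() ? 1 : 0;
}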
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 917d51eefee3..7558486f4937 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -83,6 +83,9 @@ struct xfs_iwalk_ag {
 
 	/* Skip empty inobt records? */
 	unsigned int			skip_empty:1;
+
+	/* Drop the (hopefully empty) transaction when calling iwalk_fn. */
+	unsigned int			drop_trans:1;
 };
 
 /*
@@ -352,7 +355,6 @@ xfs_iwalk_run_callbacks(
 	int				*has_more)
 {
 	struct xfs_mount		*mp = iwag->mp;
-	struct xfs_trans		*tp = iwag->tp;
 	struct xfs_inobt_rec_incore	*irec;
 	xfs_agino_t			next_agino;
 	int				error;
@@ -362,10 +364,15 @@ xfs_iwalk_run_callbacks(
 	ASSERT(iwag->nr_recs > 0);
 
 	/* Delete cursor but remember the last record we cached... */
-	xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
+	xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0);
 	irec = &iwag->recs[iwag->nr_recs - 1];
 	ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK);
 
+	if (iwag->drop_trans) {
+		xfs_trans_cancel(iwag->tp);
+		iwag->tp = NULL;
+	}
+
 	error = xfs_iwalk_ag_recs(iwag);
 	if (error)
 		return error;
@@ -376,8 +383,15 @@ xfs_iwalk_run_callbacks(
 	if (!has_more)
 		return 0;
 
+	if (iwag->drop_trans) {
+		error = xfs_trans_alloc_empty(mp, &iwag->tp);
+		if (error)
+			return error;
+	}
+
 	/* ...and recreate the cursor just past where we left off. */
-	error = xfs_inobt_cur(mp, tp, iwag->pag, XFS_BTNUM_INO, curpp, agi_bpp);
+	error = xfs_inobt_cur(mp, iwag->tp, iwag->pag, XFS_BTNUM_INO, curpp,
+			agi_bpp);
 	if (error)
 		return error;
 
@@ -390,7 +404,6 @@ xfs_iwalk_ag(
 	struct xfs_iwalk_ag		*iwag)
 {
 	struct xfs_mount		*mp = iwag->mp;
-	struct xfs_trans		*tp = iwag->tp;
 	struct xfs_perag		*pag = iwag->pag;
 	struct xfs_buf			*agi_bp = NULL;
 	struct xfs_btree_cur		*cur = NULL;
@@ -469,7 +482,7 @@ xfs_iwalk_ag(
 		error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);
 
 out:
-	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
+	xfs_iwalk_del_inobt(iwag->tp, &cur, &agi_bp, error);
 	return error;
 }
 
@@ -599,8 +612,18 @@ xfs_iwalk_ag_work(
 	error = xfs_iwalk_alloc(iwag);
 	if (error)
 		goto out;
+	/*
+	 * Grab an empty transaction so that we can use its recursive buffer
+	 * locking abilities to detect cycles in the inobt without deadlocking.
+	 */
+	error = xfs_trans_alloc_empty(mp, &iwag->tp);
+	if (error)
+		goto out;
+	iwag->drop_trans = 1;
 
 	error = xfs_iwalk_ag(iwag);
+	if (iwag->tp)
+		xfs_trans_cancel(iwag->tp);
 	xfs_iwalk_free(iwag);
 out:
 	xfs_perag_put(iwag->pag);