From patchwork Tue Sep 26 23:31:22 2023
X-Patchwork-Submitter: "Darrick J. Wong"
X-Patchwork-Id: 13399747
Date: Tue, 26 Sep 2023 16:31:22 -0700
Subject: [PATCH 1/7] xfs: don't append work items to logged xfs_defer_pending objects
From: "Darrick J. Wong"
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <169577059164.3312911.8148982456892861553.stgit@frogsfrogsfrogs>
In-Reply-To: <169577059140.3312911.17578000557997208473.stgit@frogsfrogsfrogs>
References: <169577059140.3312911.17578000557997208473.stgit@frogsfrogsfrogs>

From: Darrick J. Wong

When someone tries to add a deferred work item via xfs_defer_add, it will try to attach the work item to the most recently added xfs_defer_pending object attached to the transaction. However, it doesn't check whether that pending object already has a log intent item attached to it. This is incorrect behavior because we cannot add more work to an object that has already been committed to the ondisk log. Therefore, change xfs_defer_add not to append to pending items with a non-null dfp_intent.

In practice this has not been an issue because the only way xfs_defer_add gets called after log intent items have been committed is from the defer ops ->finish_item functions themselves, and the @dop_pending isolation in xfs_defer_finish_noroll protects the pending items that have already been logged. However, the next patch will add the ability to pause a deferred extent free object during online btree rebuilding, and any new extfree work items need to have their own pending item.

While we're at it, hoist the predicate to its own static inline function for readability.

Signed-off-by: Darrick J.
Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_defer.c | 48 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index bcfb6a4203cdd..ad41c6d0113ce 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -617,6 +617,40 @@ xfs_defer_cancel( xfs_defer_cancel_list(mp, &tp->t_dfops); } +/* + * Decide if we can add a deferred work item to the last dfops item attached + * to the transaction. + */ +static inline struct xfs_defer_pending * +xfs_defer_try_append( + struct xfs_trans *tp, + enum xfs_defer_ops_type type, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp = NULL; + + /* No dfops at all? */ + if (list_empty(&tp->t_dfops)) + return NULL; + + dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, + dfp_list); + + /* Wrong type? */ + if (dfp->dfp_type != type) + return NULL; + + /* Already logged? */ + if (dfp->dfp_intent) + return NULL; + + /* Already full? */ + if (ops->max_items && dfp->dfp_count >= ops->max_items) + return NULL; + + return dfp; +} + /* Add an item for later deferred processing. */ void xfs_defer_add( @@ -630,19 +664,9 @@ xfs_defer_add( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); - /* - * Add the item to a pending item at the end of the intake list. - * If the last pending item has the same type, reuse it. Else, - * create a new pending item at the end of the intake list. - */ - if (!list_empty(&tp->t_dfops)) { - dfp = list_last_entry(&tp->t_dfops, - struct xfs_defer_pending, dfp_list); - if (dfp->dfp_type != type || - (ops->max_items && dfp->dfp_count >= ops->max_items)) - dfp = NULL; - } + dfp = xfs_defer_try_append(tp, type, ops); if (!dfp) { + /* Create a new pending item at the end of the intake list. */ dfp = kmem_cache_zalloc(xfs_defer_pending_cache, GFP_NOFS | __GFP_NOFAIL); dfp->dfp_type = type; From patchwork Tue Sep 26 23:31:38 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13399746 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 4DF0CE7F14A for ; Wed, 27 Sep 2023 00:15:04 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S229815AbjI0APD (ORCPT ); Tue, 26 Sep 2023 20:15:03 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:35056 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232159AbjI0ANC (ORCPT ); Tue, 26 Sep 2023 20:13:02 -0400 Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 859A81F9D9 for ; Tue, 26 Sep 2023 16:31:39 -0700 (PDT) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 2203CC433C7; Tue, 26 Sep 2023 23:31:39 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1695771099; bh=M+GTiHi7z0MzrioKFxJ0rC1Xn6RxRgWcnLXNbo84Hv4=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=dKziU1wfiF+n4z+9gaLPOwhxp6s3gC7Z2rzJG56PUO+qnvPp45zuTb+/lruXTfOCe fF3k/kiWrV66sej6CFmEUGdmPKYKxiJk1WD99oFCIa0x33N90jYQjj8zfqgR/JEvuC hI9MZ+BNQv820GuB2fGEyZaAHQKWXlbmCJnN3sm2yz8J182o8mt7UWLyn0c84eOF7v BuANwK5YJaWtgAAwhaYP211MYX55zBusqHek5RH/cdAQe9A4qtk6PA+nk4N3yNS5Fp dQnT328ixd7xn5xsK3zwLgm/Gxbw5ES8SwODq0uBzSpjYQkMqy+eA8LRfSBC+vVyf2 WN1EGM2JYSLRw== Date: Tue, 26 Sep 2023 16:31:38 -0700 Subject: [PATCH 2/7] xfs: allow pausing of pending deferred work items From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <169577059178.3312911.7770487562460001097.stgit@frogsfrogsfrogs> In-Reply-To: <169577059140.3312911.17578000557997208473.stgit@frogsfrogsfrogs> References: <169577059140.3312911.17578000557997208473.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Traditionally, all pending deferred work attached to a transaction is finished when one of the xfs_defer_finish* functions is called. However, online repair wants to be able to allocate space for a new data structure, format a new metadata structure into the allocated space, and commit that into the filesystem. As a hedge against system crashes during repairs, we also want to log some EFI items for the allocated space speculatively, and cancel them if we elect to commit the new data structure. Therefore, introduce the idea of pausing a pending deferred work item. Log intent items are still created for paused items and relogged as necessary. However, paused items are pushed onto a side list before we start calling ->finish_item, and the whole list is reattach to the transaction afterwards. New work items are never attached to paused pending items. Modify xfs_defer_cancel to clean up pending deferred work items holding a log intent item but not a log intent done item, since that is now possible. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_defer.c | 98 +++++++++++++++++++++++++++++++++++++++------ fs/xfs/libxfs/xfs_defer.h | 17 +++++++- fs/xfs/xfs_trace.h | 13 +++++- 3 files changed, 112 insertions(+), 16 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index ad41c6d0113ce..6ed1ab8a7e522 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -410,7 +410,7 @@ xfs_defer_cancel_list( * done item to release the intent item; and then log a new intent item. * The caller should provide a fresh transaction and roll it after we're done. */ -static int +static void xfs_defer_relog( struct xfs_trans **tpp, struct list_head *dfops) @@ -451,10 +451,6 @@ xfs_defer_relog( XFS_STATS_INC((*tpp)->t_mountp, defer_relog); dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); } - - if ((*tpp)->t_flags & XFS_TRANS_DIRTY) - return xfs_defer_trans_roll(tpp); - return 0; } /* @@ -510,6 +506,24 @@ xfs_defer_finish_one( return error; } +/* Move all paused deferred work from @tp to @paused_list. */ +static void +xfs_defer_isolate_paused( + struct xfs_trans *tp, + struct list_head *paused_list) +{ + struct xfs_defer_pending *dfp; + struct xfs_defer_pending *pli; + + list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) { + if (!(dfp->dfp_flags & XFS_DEFER_PAUSED)) + continue; + + list_move_tail(&dfp->dfp_list, paused_list); + trace_xfs_defer_isolate_paused(tp->t_mountp, dfp); + } +} + /* * Finish all the pending work. This involves logging intent items for * any work items that wandered in since the last transaction roll (if @@ -525,6 +539,7 @@ xfs_defer_finish_noroll( struct xfs_defer_pending *dfp = NULL; int error = 0; LIST_HEAD(dop_pending); + LIST_HEAD(dop_paused); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -543,6 +558,8 @@ xfs_defer_finish_noroll( */ int has_intents = xfs_defer_create_intents(*tp); + xfs_defer_isolate_paused(*tp, &dop_paused); + list_splice_init(&(*tp)->t_dfops, &dop_pending); if (has_intents < 0) { @@ -555,22 +572,33 @@ xfs_defer_finish_noroll( goto out_shutdown; /* Relog intent items to keep the log moving. */ - error = xfs_defer_relog(tp, &dop_pending); - if (error) - goto out_shutdown; + xfs_defer_relog(tp, &dop_pending); + xfs_defer_relog(tp, &dop_paused); + + if ((*tp)->t_flags & XFS_TRANS_DIRTY) { + error = xfs_defer_trans_roll(tp); + if (error) + goto out_shutdown; + } } - dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, - dfp_list); + dfp = list_first_entry_or_null(&dop_pending, + struct xfs_defer_pending, dfp_list); + if (!dfp) + break; error = xfs_defer_finish_one(*tp, dfp); if (error && error != -EAGAIN) goto out_shutdown; } + /* Requeue the paused items in the outgoing transaction. */ + list_splice_tail_init(&dop_paused, &(*tp)->t_dfops); + trace_xfs_defer_finish_done(*tp, _RET_IP_); return 0; out_shutdown: + list_splice_tail_init(&dop_paused, &dop_pending); xfs_defer_trans_abort(*tp, &dop_pending); xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); trace_xfs_defer_finish_error(*tp, error); @@ -583,6 +611,9 @@ int xfs_defer_finish( struct xfs_trans **tp) { +#ifdef DEBUG + struct xfs_defer_pending *dfp; +#endif int error; /* @@ -602,7 +633,10 @@ xfs_defer_finish( } /* Reset LOWMODE now that we've finished all the dfops. 
*/ - ASSERT(list_empty(&(*tp)->t_dfops)); +#ifdef DEBUG + list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list) + ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); +#endif (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; return 0; } @@ -614,6 +648,7 @@ xfs_defer_cancel( struct xfs_mount *mp = tp->t_mountp; trace_xfs_defer_cancel(tp, _RET_IP_); + xfs_defer_trans_abort(tp, &tp->t_dfops); xfs_defer_cancel_list(mp, &tp->t_dfops); } @@ -644,6 +679,10 @@ xfs_defer_try_append( if (dfp->dfp_intent) return NULL; + /* Paused items cannot absorb more work */ + if (dfp->dfp_flags & XFS_DEFER_PAUSED) + return NULL; + /* Already full? */ if (ops->max_items && dfp->dfp_count >= ops->max_items) return NULL; @@ -652,7 +691,7 @@ xfs_defer_try_append( } /* Add an item for later deferred processing. */ -void +struct xfs_defer_pending * xfs_defer_add( struct xfs_trans *tp, enum xfs_defer_ops_type type, @@ -680,6 +719,8 @@ xfs_defer_add( list_add_tail(li, &dfp->dfp_work); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); dfp->dfp_count++; + + return dfp; } /* @@ -954,3 +995,36 @@ xfs_defer_destroy_item_caches(void) xfs_rmap_intent_destroy_cache(); xfs_defer_destroy_cache(); } + +/* + * Mark a deferred work item so that it will be requeued indefinitely without + * being finished. Caller must ensure there are no data dependencies on this + * work item in the meantime. + */ +void +xfs_defer_item_pause( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED)); + + dfp->dfp_flags |= XFS_DEFER_PAUSED; + + trace_xfs_defer_item_pause(tp->t_mountp, dfp); +} + +/* + * Release a paused deferred work item so that it will be finished during the + * next transaction roll. + */ +void +xfs_defer_item_unpause( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); + + dfp->dfp_flags &= ~XFS_DEFER_PAUSED; + + trace_xfs_defer_item_unpause(tp->t_mountp, dfp); +} diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 114a3a4930a3c..7fb4f60e5e4c5 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -34,11 +34,24 @@ struct xfs_defer_pending { struct xfs_log_item *dfp_intent; /* log intent item */ struct xfs_log_item *dfp_done; /* log done item */ unsigned int dfp_count; /* # extent items */ + unsigned int dfp_flags; enum xfs_defer_ops_type dfp_type; }; -void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type, - struct list_head *h); +/* + * Create a log intent item for this deferred item, but don't actually finish + * the work. Caller must clear this before the final transaction commit. 
+ */ +#define XFS_DEFER_PAUSED (1U << 0) + +#define XFS_DEFER_PENDING_STRINGS \ + { XFS_DEFER_PAUSED, "paused" } + +void xfs_defer_item_pause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); +void xfs_defer_item_unpause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); + +struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, + enum xfs_defer_ops_type type, struct list_head *h); int xfs_defer_finish_noroll(struct xfs_trans **tp); int xfs_defer_finish(struct xfs_trans **tp); void xfs_defer_cancel(struct xfs_trans *); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 3926cf7f2a6ed..514095b6ba2bd 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2551,6 +2551,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, __field(dev_t, dev) __field(int, type) __field(void *, intent) + __field(unsigned int, flags) __field(char, committed) __field(int, nr) ), @@ -2558,13 +2559,15 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, __entry->dev = mp ? mp->m_super->s_dev : 0; __entry->type = dfp->dfp_type; __entry->intent = dfp->dfp_intent; + __entry->flags = dfp->dfp_flags; __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p committed %d nr %d", + TP_printk("dev %d:%d optype %d intent %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->intent, + __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) @@ -2675,6 +2678,9 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause); #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); @@ -2692,6 +2698,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, __field(void *, intent) __field(void *, item) __field(char, committed) + __field(unsigned int, flags) __field(int, nr) ), TP_fast_assign( @@ -2700,13 +2707,15 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, __entry->intent = dfp->dfp_intent; __entry->item = item; __entry->committed = dfp->dfp_done != NULL; + __entry->flags = dfp->dfp_flags; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d", + TP_printk("dev %d:%d optype %d intent %p item %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->intent, __entry->item, + __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) From patchwork Tue Sep 26 23:31:54 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13399801 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id DB330E7F14B for ; Wed, 27 Sep 2023 01:52:17 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S233728AbjI0BwR (ORCPT ); Tue, 26 Sep 2023 21:52:17 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:59950 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234462AbjI0BuQ (ORCPT ); Tue, 26 Sep 2023 21:50:16 -0400 Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 297114491 for ; Tue, 26 Sep 2023 16:31:55 -0700 (PDT) Received: by smtp.kernel.org (Postfix) with ESMTPSA id C207BC433C7; Tue, 26 Sep 2023 23:31:54 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1695771114; bh=veTqd52MA3RtLqlC2r+xTdVkHpm23M5ZunbQZ+bCI8Y=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=PC8Dt8EPXJalXhmY+QygwEt29I+9sEdK2KdwSrixJuW/M0UQh91sL6N4HoXr68Qf8 Wwo7CKsjLRgZYgyQU2GzUtViQEd0wOnhD4710ww6RxKi5wFYD2edLWnNrNMgb9ho0I gXOlKN4pqDJEHiERQyhml1zGJaT238vAxNBVLDAl2X89icZWo7g6ywy+M1IxBUOj2F /r0bjjTwIiZztk2X7bntRcMYEuF4UIO3L6qMBrEmBpWpzwWckrSJGi4RLZxuTOA9A9 Jd/pJnn8I4Y3YhpSUqybZBFe2+wJbaDXt+agWTP4cBqVaPzARHdFo1yQAN+hd73bpM dBQagjy69F4qg== Date: Tue, 26 Sep 2023 16:31:54 -0700 Subject: [PATCH 3/7] xfs: remove __xfs_free_extent_later From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <169577059193.3312911.17799392857205480363.stgit@frogsfrogsfrogs> In-Reply-To: <169577059140.3312911.17578000557997208473.stgit@frogsfrogsfrogs> References: <169577059140.3312911.17578000557997208473.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong xfs_free_extent_later is a trivial helper, so remove it to reduce the amount of thinking required to understand the deferred freeing interface. This will make it easier to introduce automatic reaping of speculative allocations in the next patch. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_ag.c | 2 +- fs/xfs/libxfs/xfs_alloc.c | 2 +- fs/xfs/libxfs/xfs_alloc.h | 14 +------------- fs/xfs/libxfs/xfs_bmap.c | 4 ++-- fs/xfs/libxfs/xfs_bmap_btree.c | 2 +- fs/xfs/libxfs/xfs_ialloc.c | 5 +++-- fs/xfs/libxfs/xfs_ialloc_btree.c | 2 +- fs/xfs/libxfs/xfs_refcount.c | 6 +++--- fs/xfs/libxfs/xfs_refcount_btree.c | 2 +- fs/xfs/scrub/reap.c | 2 +- fs/xfs/xfs_extfree_item.c | 2 +- fs/xfs/xfs_reflink.c | 2 +- 12 files changed, 17 insertions(+), 28 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index e9cc481b4ddff..ab429956bdbfc 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -984,7 +984,7 @@ xfs_ag_shrink_space( if (err2 != -ENOSPC) goto resv_err; - err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, + err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, XFS_AG_RESV_NONE, true); if (err2) goto resv_err; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3069194527dd0..295d11a27f632 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2502,7 +2502,7 @@ xfs_defer_agfl_block( * The list is maintained sorted (by block number). 
*/ int -__xfs_free_extent_later( +xfs_free_extent_later( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6bb8d295c321d..6b95d1d8a8537 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -231,7 +231,7 @@ xfs_buf_to_agfl_bno( return bp->b_addr; } -int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, +int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, bool skip_discard); @@ -256,18 +256,6 @@ void xfs_extent_free_get_group(struct xfs_mount *mp, #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ -static inline int -xfs_free_extent_later( - struct xfs_trans *tp, - xfs_fsblock_t bno, - xfs_filblks_t len, - const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type) -{ - return __xfs_free_extent_later(tp, bno, len, oinfo, type, false); -} - - extern struct kmem_cache *xfs_extfree_item_cache; int __init xfs_extfree_intent_init_cache(void); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 30c931b38853c..b688f2801a361 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -575,7 +575,7 @@ xfs_bmap_btree_to_extents( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) return error; @@ -5235,7 +5235,7 @@ xfs_bmap_del_extent_real( if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); } else { - error = __xfs_free_extent_later(tp, del->br_startblock, + error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, XFS_AG_RESV_NONE, ((bflags & XFS_BMAPI_NODISCARD) || diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index bf3f1b36fdd23..8360256cff168 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -272,7 +272,7 @@ xfs_bmbt_free_block( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b83e54c709069..d61d03e5b853b 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1854,7 +1854,7 @@ xfs_difree_inode_chunk( return xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1900,7 +1900,8 @@ xfs_difree_inode_chunk( ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); error = xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE); + &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, + false); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9258f01c0015e..42a5e1f227a05 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -161,7 +161,7 @@ __xfs_inobt_free_block( xfs_inobt_mod_blockcount(cur, -1); fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_INOBT, resv); + &XFS_RMAP_OINFO_INOBT, resv, false); } 
STATIC int diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 646b3fa362ad0..3702b4a071100 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1153,7 +1153,7 @@ xfs_refcount_adjust_extents( tmp.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, tmp.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_error; } @@ -1215,7 +1215,7 @@ xfs_refcount_adjust_extents( ext.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, ext.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_error; } @@ -1985,7 +1985,7 @@ xfs_refcount_recover_cow_leftovers( /* Free the block. */ error = xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_trans; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 5c3987d8dc242..3fa795e2488dd 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -112,7 +112,7 @@ xfs_refcountbt_free_block( be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false); } STATIC int diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 86a62420e02c6..78c9f2085db46 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -410,7 +410,7 @@ xreap_agextent_iter( * Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. */ - error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, + error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, rs->resv, true); if (error) return error; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3fa8789820ad9..9e7b58f3566c0 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -717,7 +717,7 @@ xfs_efi_item_recover( error = xfs_free_extent_later(tp, fake.xefi_startblock, fake.xefi_blockcount, &XFS_RMAP_OINFO_ANY_OWNER, - fake.xefi_agresv); + fake.xefi_agresv, false); if (!error) { requeue_only = true; continue; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index eb9102453affb..7c98ed075ee89 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -618,7 +618,7 @@ xfs_reflink_cancel_cow_blocks( error = xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) break; From patchwork Tue Sep 26 23:32:09 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13399748 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 3FD56E7F14C for ; Wed, 27 Sep 2023 00:15:05 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S229704AbjI0APE (ORCPT ); Tue, 26 Sep 2023 20:15:04 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:35068 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232733AbjI0ANC (ORCPT ); Tue, 26 Sep 2023 20:13:02 -0400 Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id DB973A5CB for ; Tue, 26 Sep 2023 16:32:10 -0700 (PDT) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 8020BC433C7; Tue, 26 Sep 2023 23:32:10 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1695771130; bh=zQc41z6cGFYy7VjekKVwFr4tiTd9jTJLnc+K2zh1cVI=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=rCFuCZgifjExg7EOEO+d6eZv0BFkBwlMFH3O6H9PhO6YXyUpek39musrLVCyHjqyS 3ZxNMdRUdWeL7C1RAOZ2a4oRjeqDnpJL69GytZpuDvNKUiy3INvwNNCyv0pOsjau2p JmdR2G2/t05LHZAddhnv/MmI4xp052MEFPXvvOSoJHiE075KXkRWSiZCEdgYjCMeTp xlQ5Eht5cX7Hk7jX4+e2wSJiLX/XsJk2/Wip5H046VYu8dVGIZoN9XHaYzZB6PQgSV d7XUYcWxcs5KzAJXfzscAXxUDvjMxOnDDF+bQtMv7mB25Z2grnpHcx5FSyg0vyelyH nb6bY2yqqNgSQ== Date: Tue, 26 Sep 2023 16:32:09 -0700 Subject: [PATCH 4/7] xfs: automatic freeing of freshly allocated unwritten space From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <169577059209.3312911.11197509089553101214.stgit@frogsfrogsfrogs> In-Reply-To: <169577059140.3312911.17578000557997208473.stgit@frogsfrogsfrogs> References: <169577059140.3312911.17578000557997208473.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong As mentioned in the previous commit, online repair wants to allocate space to write out a new metadata structure, and it also wants to hedge against system crashes during repairs by logging (and later cancelling) EFIs to free the space if we crash before committing the new data structure. Therefore, create a trio of functions to schedule automatic reaping of freshly allocated unwritten space. xfs_alloc_schedule_autoreap creates a paused EFI representing the space we just allocated. Once the allocations are made and the autoreaps scheduled, we can start writing to disk. If the writes succeed, xfs_alloc_cancel_autoreap marks the EFI work items as stale and unpauses the pending deferred work item. Assuming that's done in the same transaction that commits the new structure into the filesystem, we guarantee that either the new object is fully visible, or that all the space gets reclaimed. If the writes succeed but only part of an extent was used, repair must call the same _cancel_autoreap function to kill the first EFI and then log a new EFI to free the unused space. The first EFI is already committed, so it cannot be changed. For full extents that aren't used, xfs_alloc_commit_autoreap will unpause the EFI, which results in the space being freed during the next _defer_finish cycle. Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_alloc.c | 104 +++++++++++++++++++++++++++++++++++++++++++-- fs/xfs/libxfs/xfs_alloc.h | 12 +++++ fs/xfs/xfs_extfree_item.c | 11 +++-- 3 files changed, 120 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 295d11a27f632..c1ee1862cc1af 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2501,14 +2501,15 @@ xfs_defer_agfl_block( * Add the extent to the list of extents to be free at transaction end. * The list is maintained sorted (by block number). */ -int -xfs_free_extent_later( +static int +__xfs_free_extent_later( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, - bool skip_discard) + bool skip_discard, + struct xfs_defer_pending **dfpp) { struct xfs_extent_free_item *xefi; struct xfs_mount *mp = tp->t_mountp; @@ -2556,10 +2557,105 @@ xfs_free_extent_later( XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); + *dfpp = xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); return 0; } +int +xfs_free_extent_later( + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_filblks_t len, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type, + bool skip_discard) +{ + struct xfs_defer_pending *dontcare = NULL; + + return __xfs_free_extent_later(tp, bno, len, oinfo, type, skip_discard, + &dontcare); +} + +/* + * Set up automatic freeing of unwritten space in the filesystem. + * + * This function attached a paused deferred extent free item to the + * transaction. Pausing means that the EFI will be logged in the next + * transaction commit, but the pending EFI will not be finished until the + * pending item is unpaused. + * + * If the system goes down after the EFI has been persisted to the log but + * before the pending item is unpaused, log recovery will find the EFI, fail to + * find the EFD, and free the space. + * + * If the pending item is unpaused, the next transaction commit will log an EFD + * without freeing the space. + * + * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the + * @args structure are set to the relevant values. + */ +int +xfs_alloc_schedule_autoreap( + const struct xfs_alloc_arg *args, + bool skip_discard, + struct xfs_alloc_autoreap *aarp) +{ + int error; + + error = __xfs_free_extent_later(args->tp, args->fsbno, args->len, + &args->oinfo, args->resv, skip_discard, &aarp->dfp); + if (error) + return error; + + xfs_defer_item_pause(args->tp, aarp->dfp); + return 0; +} + +/* + * Cancel automatic freeing of unwritten space in the filesystem. + * + * Earlier, we created a paused deferred extent free item and attached it to + * this transaction so that we could automatically roll back a new space + * allocation if the system went down. Now we want to cancel the paused work + * item by marking the EFI stale so we don't actually free the space, unpausing + * the pending item and logging an EFD. + * + * The caller generally should have already mapped the space into the ondisk + * filesystem. If the reserved space was partially used, the caller must call + * xfs_free_extent_later to create a new EFI to free the unused space. 
+ */ +void +xfs_alloc_cancel_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + struct xfs_defer_pending *dfp = aarp->dfp; + struct xfs_extent_free_item *xefi; + + if (!dfp) + return; + + list_for_each_entry(xefi, &dfp->dfp_work, xefi_list) + xefi->xefi_flags |= XFS_EFI_STALE; + + xfs_defer_item_unpause(tp, dfp); +} + +/* + * Commit automatic freeing of unwritten space in the filesystem. + * + * This unpauses an earlier _schedule_autoreap and commits to freeing the + * allocated space. Call this if none of the reserved space was used. + */ +void +xfs_alloc_commit_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + if (aarp->dfp) + xfs_defer_item_unpause(tp, aarp->dfp); +} + #ifdef DEBUG /* * Check if an AGF has a free extent record whose length is equal to diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6b95d1d8a8537..60d04dc13cc76 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -255,6 +255,18 @@ void xfs_extent_free_get_group(struct xfs_mount *mp, #define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ +#define XFS_EFI_STALE (1U << 3) /* dont actually free the space */ + +struct xfs_alloc_autoreap { + struct xfs_defer_pending *dfp; +}; + +int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args, + bool skip_discard, struct xfs_alloc_autoreap *aarp); +void xfs_alloc_cancel_autoreap(struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp); +void xfs_alloc_commit_autoreap(struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp); extern struct kmem_cache *xfs_extfree_item_cache; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 9e7b58f3566c0..98c2667d369e8 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -392,9 +392,14 @@ xfs_trans_free_extent( trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, agbno, xefi->xefi_blockcount); - error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, - xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, - xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + if (xefi->xefi_flags & XFS_EFI_STALE) { + error = 0; + } else { + error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, + xefi->xefi_blockcount, &oinfo, + xefi->xefi_agresv, + xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + } /* * Mark the transaction dirty, even on error. This ensures the From patchwork Tue Sep 26 23:32:25 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13399804 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 05C81E7F151 for ; Wed, 27 Sep 2023 01:52:46 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234469AbjI0Bwp (ORCPT ); Tue, 26 Sep 2023 21:52:45 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:46902 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235378AbjI0Buo (ORCPT ); Tue, 26 Sep 2023 21:50:44 -0400 Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 7710D16634 for ; Tue, 26 Sep 2023 16:32:26 -0700 (PDT) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 137C7C433C8; Tue, 26 Sep 2023 23:32:26 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1695771146; bh=7CB0EXTniJDEfkz/BKdJ1MKdn5drfm5h5IEWwpSGJsQ=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=hjqvG87Jf8Cfz1RFWvlmnF4gPpXOj++4eNUPY1CcYq7KC9P25FRqXYVuZ6PgwutoM siywJMs2b9TQQ1JzSGvARnUa86TNjNdAXbfQMIX8GRKAXvwU4rMfhb0uspXe/SAZvQ 1YjxrkYqHiPtmkp9aBVRmsTl57yAWkES7CkoQb7ntE99aUSL8ek24yIURSr8QWAkRT D9hXLDJtZBP8k6zhHVi8IbwitI9G6E9f1Sc5KQckiTW13PgL7gCLtU+MWkIcdpbB+W a4JTfV2PqsMubslGDQlXswqu/qz/T2SPYg63L3Pfsx5alLvWac0kTyJlnJToI5fuJG hx1nM29cvOv4w== Date: Tue, 26 Sep 2023 16:32:25 -0700 Subject: [PATCH 5/7] xfs: implement block reservation accounting for btrees we're staging From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <169577059224.3312911.3596538645136769266.stgit@frogsfrogsfrogs> In-Reply-To: <169577059140.3312911.17578000557997208473.stgit@frogsfrogsfrogs> References: <169577059140.3312911.17578000557997208473.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Create a new xrep_newbt structure to encapsulate a fake root for creating a staged btree cursor as well as to track all the blocks that we need to reserve in order to build that btree. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/Makefile | 1 fs/xfs/libxfs/xfs_btree_staging.h | 7 - fs/xfs/scrub/agheader_repair.c | 1 fs/xfs/scrub/common.c | 1 fs/xfs/scrub/newbt.c | 492 +++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/newbt.h | 62 +++++ fs/xfs/scrub/scrub.c | 2 fs/xfs/scrub/trace.h | 37 +++ 8 files changed, 598 insertions(+), 5 deletions(-) create mode 100644 fs/xfs/scrub/newbt.c create mode 100644 fs/xfs/scrub/newbt.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7762c01a85cfb..1537d66e5ab01 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -181,6 +181,7 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ + newbt.o \ reap.o \ repair.o \ ) diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index f0d2976050aea..d6dea3f0088c6 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -38,11 +38,8 @@ struct xbtree_ifakeroot { /* Number of bytes available for this fork in the inode. */ unsigned int if_fork_size; - /* Fork format. */ - unsigned int if_format; - - /* Number of records. */ - unsigned int if_extents; + /* Which fork is this btree being built for? 
*/ + int if_whichfork; }; /* Cursor interactions with fake roots for inode-rooted btrees. */ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 876a2f41b0637..36c511f96b004 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -10,6 +10,7 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_sb.h" diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 23944fcc1a6ca..4bba3c49f8c59 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -10,6 +10,7 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_inode.h" diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c new file mode 100644 index 0000000000000..4e8d6637426e4 --- /dev/null +++ b/fs/xfs/scrub/newbt.c @@ -0,0 +1,492 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" +#include "xfs_defer.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/newbt.h" + +/* + * Estimate proper slack values for a btree that's being reloaded. + * + * Under most circumstances, we'll take whatever default loading value the + * btree bulk loading code calculates for us. However, there are some + * exceptions to this rule: + * + * (1) If someone turned one of the debug knobs. + * (2) If this is a per-AG btree and the AG has less than ~9% space free. + * (3) If this is an inode btree and the FS has less than ~9% space free. + * + * Note that we actually use 3/32 for the comparison to avoid division. + */ +static void +xrep_newbt_estimate_slack( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_btree_bload *bload = &xnr->bload; + uint64_t free; + uint64_t sz; + + /* Let the btree code compute the default slack values. */ + bload->leaf_slack = -1; + bload->node_slack = -1; + + if (sc->ops->type == ST_PERAG) { + free = sc->sa.pag->pagf_freeblks; + sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno); + } else { + free = percpu_counter_sum(&sc->mp->m_fdblocks); + sz = sc->mp->m_sb.sb_dblocks; + } + + /* No further changes if there's more than 3/32ths space left. */ + if (free >= ((sz * 3) >> 5)) + return; + + /* We're low on space; load the btrees as tightly as possible. */ + if (bload->leaf_slack < 0) + bload->leaf_slack = 0; + if (bload->node_slack < 0) + bload->node_slack = 0; +} + +/* Initialize accounting resources for staging a new AG btree. 
*/ +void +xrep_newbt_init_ag( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, + xfs_fsblock_t alloc_hint, + enum xfs_ag_resv_type resv) +{ + memset(xnr, 0, sizeof(struct xrep_newbt)); + xnr->sc = sc; + xnr->oinfo = *oinfo; /* structure copy */ + xnr->alloc_hint = alloc_hint; + xnr->resv = resv; + INIT_LIST_HEAD(&xnr->resv_list); + xrep_newbt_estimate_slack(xnr); +} + +/* Initialize accounting resources for staging a new inode fork btree. */ +int +xrep_newbt_init_inode( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + int whichfork, + const struct xfs_owner_info *oinfo) +{ + struct xfs_ifork *ifp; + + ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); + if (!ifp) + return -ENOMEM; + + xrep_newbt_init_ag(xnr, sc, oinfo, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), + XFS_AG_RESV_NONE); + xnr->ifake.if_fork = ifp; + xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork); + xnr->ifake.if_whichfork = whichfork; + return 0; +} + +/* + * Initialize accounting resources for staging a new btree. Callers are + * expected to add their own reservations (and clean them up) manually. + */ +void +xrep_newbt_init_bare( + struct xrep_newbt *xnr, + struct xfs_scrub *sc) +{ + xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK, + XFS_AG_RESV_NONE); +} + +/* + * Designate specific blocks to be used to build our new btree. @pag must be + * a passive reference. + */ +STATIC int +xrep_newbt_add_blocks( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + const struct xfs_alloc_arg *args) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xrep_newbt_resv *resv; + + resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS); + if (!resv) + return -ENOMEM; + + INIT_LIST_HEAD(&resv->list); + resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + resv->len = args->len; + resv->used = 0; + resv->pag = xfs_perag_hold(pag); + + list_add_tail(&resv->list, &xnr->resv_list); + return 0; +} + +/* Don't let our allocation hint take us beyond this AG */ +static inline void +xrep_newbt_validate_ag_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); + + if (agno == sc->sa.pag->pag_agno && + xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for a new per-AG btree. 
*/ +STATIC int +xrep_newbt_alloc_ag_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + ASSERT(sc->sa.pag != NULL); + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + xfs_agnumber_t agno; + + xrep_newbt_validate_ag_alloc_hint(xnr); + + error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_ag_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + if (agno != sc->sa.pag->pag_agno) { + ASSERT(agno == sc->sa.pag->pag_agno); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Don't let our allocation hint take us beyond EOFS */ +static inline void +xrep_newbt_validate_file_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + + if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for our new file-based btree. */ +STATIC int +xrep_newbt_alloc_file_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + struct xfs_perag *pag; + xfs_agnumber_t agno; + + xrep_newbt_validate_file_alloc_hint(xnr); + + error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_file_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + pag = xfs_perag_get(mp, agno); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, pag, &args); + xfs_perag_put(pag); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Allocate disk space for our new btree. */ +int +xrep_newbt_alloc_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + if (xnr->sc->ip) + return xrep_newbt_alloc_file_blocks(xnr, nr_blocks); + return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks); +} + +/* + * Free the unused part of a space extent that was reserved for a new ondisk + * structure. Returns the number of EFIs logged or a negative errno. + */ +STATIC int +xrep_newbt_free_extent( + struct xrep_newbt *xnr, + struct xrep_newbt_resv *resv, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agblock_t free_agbno = resv->agbno; + xfs_extlen_t free_aglen = resv->len; + xfs_fsblock_t fsbno; + int error; + + if (!btree_committed || resv->used == 0) { + /* + * If we're not committing a new btree or we didn't use the + * space reservation, free the entire space extent. + */ + goto free; + } + + /* + * We used space and committed the btree. 
Remove the written blocks + * from the reservation and possibly log a new EFI to free any unused + * reservation space. + */ + free_agbno += resv->used; + free_aglen -= resv->used; + + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, xnr->oinfo.oi_owner); + + ASSERT(xnr->resv != XFS_AG_RESV_AGFL); + +free: + /* + * Use EFIs to free the reservations. This reduces the chance + * that we leak blocks if the system goes down. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); + error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, + xnr->resv, true); + if (error) + return error; + + return 1; +} + +/* Free all the accounting info and disk space we reserved for a new btree. */ +STATIC int +xrep_newbt_free( + struct xrep_newbt *xnr, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + struct xrep_newbt_resv *resv, *n; + unsigned int freed = 0; + int error = 0; + + /* + * If the filesystem already went down, we can't free the blocks. Skip + * ahead to freeing the incore metadata because we can't fix anything. + */ + if (xfs_is_shutdown(sc->mp)) + goto junkit; + + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + int ret; + + ret = xrep_newbt_free_extent(xnr, resv, btree_committed); + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + if (ret < 0) { + error = ret; + goto junkit; + } + + freed += ret; + if (freed >= XREP_MAX_ITRUNCATE_EFIS) { + error = xrep_defer_finish(sc); + if (error) + goto junkit; + freed = 0; + } + } + + if (freed) + error = xrep_defer_finish(sc); + +junkit: + /* + * If we still have reservations attached to @newbt, cleanup must have + * failed and the filesystem is about to go down. Clean up the incore + * reservations. + */ + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + } + + if (sc->ip) { + kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork); + xnr->ifake.if_fork = NULL; + } + + return error; +} + +/* + * Free all the accounting info and unused disk space allocations after + * committing a new btree. + */ +int +xrep_newbt_commit( + struct xrep_newbt *xnr) +{ + return xrep_newbt_free(xnr, true); +} + +/* + * Free all the accounting info and all of the disk space we reserved for a new + * btree that we're not going to commit. We want to try to roll things back + * cleanly for things like ENOSPC midway through allocation. + */ +void +xrep_newbt_cancel( + struct xrep_newbt *xnr) +{ + xrep_newbt_free(xnr, false); +} + +/* Feed one of the reserved btree blocks to the bulk loader. */ +int +xrep_newbt_claim_block( + struct xfs_btree_cur *cur, + struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr) +{ + struct xrep_newbt_resv *resv; + struct xfs_mount *mp = cur->bc_mp; + xfs_agblock_t agbno; + + /* + * The first item in the list should always have a free block unless + * we're completely out. + */ + resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list); + if (resv->used == resv->len) + return -ENOSPC; + + /* + * Peel off a block from the start of the reservation. We allocate + * blocks in order to place blocks on disk in increasing record or key + * order. The block reservations tend to end up on the list in + * decreasing order, which hopefully results in leaf blocks ending up + * together. + */ + agbno = resv->agbno + resv->used; + resv->used++; + + /* If we used all the blocks in this reservation, move it to the end. 
*/ + if (resv->used == resv->len) + list_move_tail(&resv->list, &xnr->resv_list); + + trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, + xnr->oinfo.oi_owner); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, + agbno)); + else + ptr->s = cpu_to_be32(agbno); + return 0; +} diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h new file mode 100644 index 0000000000000..ca53271f3a4c6 --- /dev/null +++ b/fs/xfs/scrub/newbt.h @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_NEWBT_H__ +#define __XFS_SCRUB_NEWBT_H__ + +struct xrep_newbt_resv { + /* Link to list of extents that we've reserved. */ + struct list_head list; + + struct xfs_perag *pag; + + /* AG block of the extent we reserved. */ + xfs_agblock_t agbno; + + /* Length of the reservation. */ + xfs_extlen_t len; + + /* How much of this reservation has been used. */ + xfs_extlen_t used; +}; + +struct xrep_newbt { + struct xfs_scrub *sc; + + /* List of extents that we've reserved. */ + struct list_head resv_list; + + /* Fake root for new btree. */ + union { + struct xbtree_afakeroot afake; + struct xbtree_ifakeroot ifake; + }; + + /* rmap owner of these blocks */ + struct xfs_owner_info oinfo; + + /* btree geometry for the bulk loader */ + struct xfs_btree_bload bload; + + /* Allocation hint */ + xfs_fsblock_t alloc_hint; + + /* per-ag reservation type */ + enum xfs_ag_resv_type resv; +}; + +void xrep_newbt_init_bare(struct xrep_newbt *xnr, struct xfs_scrub *sc); +void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint, + enum xfs_ag_resv_type resv); +int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc, + int whichfork, const struct xfs_owner_info *oinfo); +int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks); +void xrep_newbt_cancel(struct xrep_newbt *xnr); +int xrep_newbt_commit(struct xrep_newbt *xnr); +int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr); + +#endif /* __XFS_SCRUB_NEWBT_H__ */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 4849efcaa33ae..474f4c4a9cd3b 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -17,6 +17,8 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_scrub.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index cbd4d01e253c0..fb27a4cf8f535 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1331,6 +1331,43 @@ TRACE_EVENT(xrep_ialloc_insert, __entry->freemask) ) +DECLARE_EVENT_CLASS(xrep_newbt_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, + int64_t owner), + TP_ARGS(mp, agno, agbno, len, owner), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int64_t, owner) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = owner; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + 
__entry->owner) ); +#define DEFINE_NEWBT_EXTENT_EVENT(name) \ +DEFINE_EVENT(xrep_newbt_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, \ + int64_t owner), \ + TP_ARGS(mp, agno, agbno, len, owner)) +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */
From patchwork Tue Sep 26 23:32:41 2023
X-Patchwork-Submitter: "Darrick J. Wong"
X-Patchwork-Id: 13399749
Date: Tue, 26 Sep 2023 16:32:41 -0700
Subject: [PATCH 6/7] xfs: log EFIs for all btree blocks being used to stage a btree
From: "Darrick J. Wong"
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <169577059238.3312911.11027644382774083646.stgit@frogsfrogsfrogs>
In-Reply-To: <169577059140.3312911.17578000557997208473.stgit@frogsfrogsfrogs>
References: <169577059140.3312911.17578000557997208473.stgit@frogsfrogsfrogs>

From: Darrick J. Wong

We need to log EFIs for every extent that we allocate for the purpose of staging a new btree, so that the blocks will be freed during log recovery if we fail. Use the autoreaping mechanism provided by the previous patch to attach paused freeing work to the scrub transaction. We can then mark the EFIs stale if we decide to commit the new btree, or we can unpause the EFIs if we decide to abort the repair.

Signed-off-by: Darrick J.
 fs/xfs/scrub/newbt.c |   34 ++++++++++++++++++++++++++--------
 fs/xfs/scrub/newbt.h |    3 +++
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index 4e8d6637426e4..2932fd317ab23 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -136,6 +136,7 @@ xrep_newbt_add_blocks(
 {
 	struct xfs_mount	*mp = xnr->sc->mp;
 	struct xrep_newbt_resv	*resv;
+	int			error;
 
 	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
 	if (!resv)
@@ -147,8 +148,18 @@ xrep_newbt_add_blocks(
 	resv->used = 0;
 	resv->pag = xfs_perag_hold(pag);
 
+	ASSERT(xnr->oinfo.oi_offset == 0);
+
+	error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap);
+	if (error)
+		goto out_pag;
+
 	list_add_tail(&resv->list, &xnr->resv_list);
 	return 0;
+out_pag:
+	xfs_perag_put(resv->pag);
+	kfree(resv);
+	return error;
 }
 
 /* Don't let our allocation hint take us beyond this AG */
@@ -327,16 +338,21 @@ xrep_newbt_free_extent(
 	if (!btree_committed || resv->used == 0) {
 		/*
 		 * If we're not committing a new btree or we didn't use the
-		 * space reservation, free the entire space extent.
+		 * space reservation, let the existing EFI free the entire
+		 * space extent.
 		 */
-		goto free;
+		trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
+				free_agbno, free_aglen, xnr->oinfo.oi_owner);
+		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
+		return 1;
 	}
 
 	/*
-	 * We used space and committed the btree.  Remove the written blocks
-	 * from the reservation and possibly log a new EFI to free any unused
-	 * reservation space.
+	 * We used space and committed the btree.  Cancel the autoreap, remove
+	 * the written blocks from the reservation, and possibly log a new EFI
+	 * to free any unused reservation space.
 	 */
+	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
 	free_agbno += resv->used;
 	free_aglen -= resv->used;
 
@@ -348,7 +364,6 @@ xrep_newbt_free_extent(
 
 	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
 
-free:
 	/*
 	 * Use EFIs to free the reservations.  This reduces the chance
 	 * that we leak blocks if the system goes down.
@@ -408,9 +423,10 @@ xrep_newbt_free(
 	/*
 	 * If we still have reservations attached to @newbt, cleanup must have
 	 * failed and the filesystem is about to go down.  Clean up the incore
-	 * reservations.
+	 * reservations and try to commit to freeing the space we used.
 	 */
 	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
 		list_del(&resv->list);
 		xfs_perag_put(resv->pag);
 		kfree(resv);
@@ -488,5 +504,7 @@ xrep_newbt_claim_block(
 				agbno));
 	else
 		ptr->s = cpu_to_be32(agbno);
-	return 0;
+
+	/* Relog all the EFIs. */
+	return xrep_defer_finish(xnr->sc);
 }
diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h
index ca53271f3a4c6..d2baffa17b1ae 100644
--- a/fs/xfs/scrub/newbt.h
+++ b/fs/xfs/scrub/newbt.h
@@ -12,6 +12,9 @@ struct xrep_newbt_resv {
 
 	struct xfs_perag	*pag;
 
+	/* Auto-freeing this reservation if we don't commit. */
+	struct xfs_alloc_autoreap autoreap;
+
 	/* AG block of the extent we reserved. */
 	xfs_agblock_t		agbno;
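Note the new tail of xrep_newbt_claim_block in the hunk above. A paused EFI
that sat in the log for an entire bulk load could pin the log tail, so each
block claim now ends by rolling the transaction and relogging the intents;
xrep_defer_finish is assumed here to be the scrub helper that performs that
roll-and-relog:

	/* Record the claimed block in the btree pointer... */
	ptr->s = cpu_to_be32(agbno);

	/* ...then relog all the EFIs so a long bulk load can't pin the log. */
	return xrep_defer_finish(xnr->sc);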
From patchwork Tue Sep 26 23:32:56 2023
X-Patchwork-Submitter: "Darrick J. Wong"
X-Patchwork-Id: 13399750
Date: Tue, 26 Sep 2023 16:32:56 -0700
Subject: [PATCH 7/7] xfs: force small EFIs for reaping btree extents
From: "Darrick J. Wong"
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <169577059253.3312911.14232325060465598331.stgit@frogsfrogsfrogs>
In-Reply-To: <169577059140.3312911.17578000557997208473.stgit@frogsfrogsfrogs>
References: <169577059140.3312911.17578000557997208473.stgit@frogsfrogsfrogs>

From: Darrick J. Wong

Introduce the concept of a defer ops barrier to separate consecutively
queued pending work items of the same type.  With a barrier in place,
the two work items are tracked separately and receive separate log
intent items.  The goal is to prevent the reaping of old metadata
blocks from creating unnecessarily huge EFIs that risk overflowing the
scrub transaction.

Signed-off-by: Darrick J. Wong
Reviewed-by: Dave Chinner
---
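Why a barrier splits work items falls out of xfs_defer_try_append from the
first patch in this series: after xfs_defer_add_barrier, the last pending item
on the transaction has the BARRIER type, so the next deferred free of the same
class cannot append to an existing pending item and gets a fresh
xfs_defer_pending, and therefore a fresh EFI. A sketch of the effect; fsbno1,
fsbno2, the lengths, oinfo, and resv are placeholders:

	/*
	 * Illustrative fragment only: two deferred frees separated by a
	 * barrier land in two xfs_defer_pending objects and therefore
	 * in two EFIs.
	 */
	error = xfs_free_extent_later(tp, fsbno1, len1, oinfo, resv, true);
	if (error)
		return error;

	/* The last pending item is now a barrier; nothing can append to it. */
	xfs_defer_add_barrier(tp);
	xfs_defer_add_barrier(tp);	/* no-op: last item is already a barrier */

	error = xfs_free_extent_later(tp, fsbno2, len2, oinfo, resv, true);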
 fs/xfs/libxfs/xfs_defer.c |   83 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_defer.h |    3 ++
 fs/xfs/scrub/reap.c       |    5 +++
 3 files changed, 91 insertions(+)

diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 6ed1ab8a7e522..072e6458ba30b 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -181,6 +181,58 @@ static struct kmem_cache	*xfs_defer_pending_cache;
  * Note that the continuation requested between t2 and t3 is likely to
  * reoccur.
  */
+STATIC struct xfs_log_item *
+xfs_defer_barrier_create_intent(
+	struct xfs_trans		*tp,
+	struct list_head		*items,
+	unsigned int			count,
+	bool				sort)
+{
+	return NULL;
+}
+
+STATIC void
+xfs_defer_barrier_abort_intent(
+	struct xfs_log_item		*intent)
+{
+	/* empty */
+}
+
+STATIC struct xfs_log_item *
+xfs_defer_barrier_create_done(
+	struct xfs_trans		*tp,
+	struct xfs_log_item		*intent,
+	unsigned int			count)
+{
+	return NULL;
+}
+
+STATIC int
+xfs_defer_barrier_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_log_item		*done,
+	struct list_head		*item,
+	struct xfs_btree_cur		**state)
+{
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
+STATIC void
+xfs_defer_barrier_cancel_item(
+	struct list_head		*item)
+{
+	ASSERT(0);
+}
+
+static const struct xfs_defer_op_type xfs_barrier_defer_type = {
+	.max_items	= 1,
+	.create_intent	= xfs_defer_barrier_create_intent,
+	.abort_intent	= xfs_defer_barrier_abort_intent,
+	.create_done	= xfs_defer_barrier_create_done,
+	.finish_item	= xfs_defer_barrier_finish_item,
+	.cancel_item	= xfs_defer_barrier_cancel_item,
+};
 
 static const struct xfs_defer_op_type *defer_op_types[] = {
 	[XFS_DEFER_OPS_TYPE_BMAP]	= &xfs_bmap_update_defer_type,
@@ -189,6 +241,7 @@ static const struct xfs_defer_op_type *defer_op_types[] = {
 	[XFS_DEFER_OPS_TYPE_FREE]	= &xfs_extent_free_defer_type,
 	[XFS_DEFER_OPS_TYPE_AGFL_FREE]	= &xfs_agfl_free_defer_type,
 	[XFS_DEFER_OPS_TYPE_ATTR]	= &xfs_attr_defer_type,
+	[XFS_DEFER_OPS_TYPE_BARRIER]	= &xfs_barrier_defer_type,
 };
 
 /*
@@ -1028,3 +1081,33 @@ xfs_defer_item_unpause(
 
 	trace_xfs_defer_item_unpause(tp->t_mountp, dfp);
 }
+
+/*
+ * Add a defer ops barrier to force two otherwise adjacent deferred work items
+ * to be tracked separately and have separate log items.
+ */
+void
+xfs_defer_add_barrier(
+	struct xfs_trans		*tp)
+{
+	struct xfs_defer_pending	*dfp;
+
+	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+	/* If the last defer op added was a barrier, we're done. */
+	if (!list_empty(&tp->t_dfops)) {
+		dfp = list_last_entry(&tp->t_dfops,
+				struct xfs_defer_pending, dfp_list);
+		if (dfp->dfp_type == XFS_DEFER_OPS_TYPE_BARRIER)
+			return;
+	}
+
+	dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
+			GFP_NOFS | __GFP_NOFAIL);
+	dfp->dfp_type = XFS_DEFER_OPS_TYPE_BARRIER;
+	INIT_LIST_HEAD(&dfp->dfp_work);
+	list_add_tail(&dfp->dfp_list, &tp->t_dfops);
+
+	trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL);
+	dfp->dfp_count++;
+}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 7fb4f60e5e4c5..c8889ea5ab8bf 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -20,6 +20,7 @@ enum xfs_defer_ops_type {
 	XFS_DEFER_OPS_TYPE_FREE,
 	XFS_DEFER_OPS_TYPE_AGFL_FREE,
 	XFS_DEFER_OPS_TYPE_ATTR,
+	XFS_DEFER_OPS_TYPE_BARRIER,
 	XFS_DEFER_OPS_TYPE_MAX,
 };
 
@@ -141,4 +142,6 @@ void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
 int __init xfs_defer_init_item_caches(void);
 void xfs_defer_destroy_item_caches(void);
 
+void xfs_defer_add_barrier(struct xfs_trans *tp);
+
 #endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 78c9f2085db46..ee26fcb500b78 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -31,6 +31,7 @@
 #include "xfs_da_btree.h"
 #include "xfs_attr.h"
 #include "xfs_attr_remote.h"
+#include "xfs_defer.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -409,6 +410,8 @@ xreap_agextent_iter(
 	/*
 	 * Use deferred frees to get rid of the old btree blocks to try to
 	 * minimize the window in which we could crash and lose the old blocks.
+	 * Add a defer ops barrier every other extent to avoid stressing the
+	 * system with large EFIs.
 	 */
 	error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
 			rs->resv, true);
 	if (error)
 		return error;
 
 	rs->deferred++;
+	if (rs->deferred % 2 == 0)
+		xfs_defer_add_barrier(sc->tp);
 	return 0;
 }
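Taken together, the two hunks above cap EFI size during reaping: if the loop
defers N extent frees, a barrier lands after every second one, so no EFI ever
covers more than two extents. For example, reaping five extents should yield
three EFIs rather than one; this is my reading of the code, not a claim made
by the patch itself:

	free(1) free(2) BARRIER free(3) free(4) BARRIER free(5)
	\____ EFI 1 ___/        \____ EFI 2 ___/        \ EFI 3 /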