From patchwork Fri Jun 16 01:37:01 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13281923 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 33355EB64D9 for ; Fri, 16 Jun 2023 01:37:06 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S229762AbjFPBhF (ORCPT ); Thu, 15 Jun 2023 21:37:05 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56280 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229453AbjFPBhE (ORCPT ); Thu, 15 Jun 2023 21:37:04 -0400 Received: from dfw.source.kernel.org (dfw.source.kernel.org [IPv6:2604:1380:4641:c500::1]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 72F87294A for ; Thu, 15 Jun 2023 18:37:03 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by dfw.source.kernel.org (Postfix) with ESMTPS id 0302061B41 for ; Fri, 16 Jun 2023 01:37:03 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 366B9C433C0; Fri, 16 Jun 2023 01:37:02 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1686879422; bh=lhvaQ8CsmoaVlMCWNV84JWkreaqq6CqZvwZWZ0JlIVc=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=pH2N0rGL4SmxKJ9AS3YTlg7IecYB1fPoSYg8oJekNlhAPrYe3L6AZgT54yE6S4NlS OcIq62FIaMqfDzJvvsppxVVGkH8Sut78/L9er7GeLeJE9tWQwPgDelMvtuQzle5ctP Ik+cKbjMOJUP0sC1GOi8frWGpKipfjYcWzs8CcIGgn4x58P5OwVJowDt3hSc6VMiSK XyCxG3fg6V5cw2Kp5dvW+VRhizV7/NQ8IjuqrjP0MLDFJjINyb3jAyHYjSto9beCZ8 hD1SoPKIUvikbL7bvyjkhe0EsY1n/LPlDc99idnyRh/hWwJopMb7JtE6nQLggTnqVN icGCPl2+pOolg== Subject: [PATCH 1/8] libxfs: deferred items should call xfs_perag_intent_{get,put} From: "Darrick J. Wong" To: cem@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Date: Thu, 15 Jun 2023 18:37:01 -0700 Message-ID: <168687942165.831530.11916953464551186899.stgit@frogsfrogsfrogs> In-Reply-To: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs> References: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Make the intent item _get_group and _put_group functions call xfs_perag_intent_{get,put} to match the kernel. In userspace they're the same thing so this makes no difference. However, let's not leave unnecessary discrepancies. Fixes: 7cb26322f74 ("xfs: allow queued AG intents to drain before scrubbing") Signed-off-by: Darrick J. Wong --- libxfs/defer_item.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c index dc00d6d6..6c5c7dd5 100644 --- a/libxfs/defer_item.c +++ b/libxfs/defer_item.c @@ -70,7 +70,7 @@ xfs_extent_free_create_done( return NULL; } -/* Take a passive ref to the AG containing the space we're freeing. */ +/* Take an active ref to the AG containing the space we're freeing. 
*/ void xfs_extent_free_get_group( struct xfs_mount *mp, @@ -79,15 +79,15 @@ xfs_extent_free_get_group( xfs_agnumber_t agno; agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); - xefi->xefi_pag = xfs_perag_get(mp, agno); + xefi->xefi_pag = xfs_perag_intent_get(mp, agno); } -/* Release a passive AG ref after some freeing work. */ +/* Release an active AG ref after some freeing work. */ static inline void xfs_extent_free_put_group( struct xfs_extent_free_item *xefi) { - xfs_perag_put(xefi->xefi_pag); + xfs_perag_intent_put(xefi->xefi_pag); } /* Process a free extent. */ @@ -104,6 +104,7 @@ xfs_extent_free_finish_item( int error; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + oinfo.oi_owner = xefi->xefi_owner; if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; @@ -166,6 +167,7 @@ xfs_agfl_free_finish_item( xfs_agblock_t agbno; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + ASSERT(xefi->xefi_blockcount == 1); agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); oinfo.oi_owner = xefi->xefi_owner; @@ -232,7 +234,7 @@ xfs_rmap_update_create_done( return NULL; } -/* Take a passive ref to the AG containing the space we're rmapping. */ +/* Take an active ref to the AG containing the space we're rmapping. */ void xfs_rmap_update_get_group( struct xfs_mount *mp, @@ -241,15 +243,15 @@ xfs_rmap_update_get_group( xfs_agnumber_t agno; agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock); - ri->ri_pag = xfs_perag_get(mp, agno); + ri->ri_pag = xfs_perag_intent_get(mp, agno); } -/* Release a passive AG ref after finishing rmapping work. */ +/* Release an active AG ref after finishing rmapping work. */ static inline void xfs_rmap_update_put_group( struct xfs_rmap_intent *ri) { - xfs_perag_put(ri->ri_pag); + xfs_perag_intent_put(ri->ri_pag); } /* Process a deferred rmap update. */ @@ -344,7 +346,7 @@ xfs_refcount_update_create_done( return NULL; } -/* Take a passive ref to the AG containing the space we're refcounting. */ +/* Take an active ref to the AG containing the space we're refcounting. */ void xfs_refcount_update_get_group( struct xfs_mount *mp, @@ -353,15 +355,15 @@ xfs_refcount_update_get_group( xfs_agnumber_t agno; agno = XFS_FSB_TO_AGNO(mp, ri->ri_startblock); - ri->ri_pag = xfs_perag_get(mp, agno); + ri->ri_pag = xfs_perag_intent_get(mp, agno); } -/* Release a passive AG ref after finishing refcounting work. */ +/* Release an active AG ref after finishing refcounting work. */ static inline void xfs_refcount_update_put_group( struct xfs_refcount_intent *ri) { - xfs_perag_put(ri->ri_pag); + xfs_perag_intent_put(ri->ri_pag); } /* Process a deferred refcount update. */ @@ -461,7 +463,7 @@ xfs_bmap_update_create_done( return NULL; } -/* Take a passive ref to the AG containing the space we're mapping. */ +/* Take an active ref to the AG containing the space we're mapping. */ void xfs_bmap_update_get_group( struct xfs_mount *mp, @@ -470,15 +472,23 @@ xfs_bmap_update_get_group( xfs_agnumber_t agno; agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock); - bi->bi_pag = xfs_perag_get(mp, agno); + + /* + * Bump the intent count on behalf of the deferred rmap and refcount + * intent items that that we can queue when we finish this bmap work. + * This new intent item will bump the intent count before the bmap + * intent drops the intent count, ensuring that the intent count + * remains nonzero across the transaction roll. + */ + bi->bi_pag = xfs_perag_intent_get(mp, agno); } -/* Release a passive AG ref after finishing mapping work. 
*/ +/* Release an active AG ref after finishing mapping work. */ static inline void xfs_bmap_update_put_group( struct xfs_bmap_intent *bi) { - xfs_perag_put(bi->bi_pag); + xfs_perag_intent_put(bi->bi_pag); } /* Process a deferred rmap update. */ From patchwork Fri Jun 16 01:37:07 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13281924 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id D1452EB64DA for ; Fri, 16 Jun 2023 01:37:12 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S233287AbjFPBhM (ORCPT ); Thu, 15 Jun 2023 21:37:12 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56300 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229453AbjFPBhK (ORCPT ); Thu, 15 Jun 2023 21:37:10 -0400 Received: from dfw.source.kernel.org (dfw.source.kernel.org [IPv6:2604:1380:4641:c500::1]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 1FFC22948 for ; Thu, 15 Jun 2023 18:37:09 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by dfw.source.kernel.org (Postfix) with ESMTPS id 9292C61B53 for ; Fri, 16 Jun 2023 01:37:08 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id C5EF5C433C8; Fri, 16 Jun 2023 01:37:07 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1686879427; bh=xSXUgYM9N/9LryKOMJLovntgtERKRwRyndINF0sQOC8=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=s3YLDb3N/KQCYxe6s4IjdC4QEuj4G1s2dE9NXvgLU1LchhR23enU+Mo+/QP4YGaI7 RzmEazC4mUjpwkIjZGcR0Dk386JAcaVXI+ySAsOT+cI/wrJ6vgfUrT5MuqhJWTyvdQ 1GO/nTg951a2TZA40abPkhHW59pR5cf6idJlaYOMVOT0W8KZcK5OR6MjPlSEa2XLp7 giOcPka43VaPb2MUq4h6vDIkb4mjgdtHbLfYlohNpdcL29TbK7sYALMx5u6jQq0kzO ADWJViV2Adjy+1tH9kQimS8pu84OsZz8Hu7X6siOfknDcaVfi1E97fww1Yyip+Jw6X 2MUheSn36ZaQQ== Subject: [PATCH 2/8] libxfs: port list_cmp_func_t to userspace From: "Darrick J. Wong" To: cem@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Date: Thu, 15 Jun 2023 18:37:07 -0700 Message-ID: <168687942737.831530.10539175194696645464.stgit@frogsfrogsfrogs> In-Reply-To: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs> References: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Synchronize our list_sort ABI to match the kernel's. This will make it easier to port the log item precommit sorting code to userspace as-is in the next patch. Signed-off-by: Darrick J. 
Wong --- include/list.h | 7 ++++--- libfrog/list_sort.c | 10 +++------- libxfs/defer_item.c | 32 ++++++++++++++++---------------- scrub/repair.c | 12 ++++++------ 4 files changed, 29 insertions(+), 32 deletions(-) diff --git a/include/list.h b/include/list.h index dab4e23b..e59cbd53 100644 --- a/include/list.h +++ b/include/list.h @@ -156,9 +156,10 @@ static inline void list_splice_init(struct list_head *list, const typeof( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type,member) );}) -void list_sort(void *priv, struct list_head *head, - int (*cmp)(void *priv, struct list_head *a, - struct list_head *b)); +typedef int (*list_cmp_func_t)(void *priv, const struct list_head *a, + const struct list_head *b); + +void list_sort(void *priv, struct list_head *head, list_cmp_func_t cmp); #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) diff --git a/libfrog/list_sort.c b/libfrog/list_sort.c index b77eece5..994a51fe 100644 --- a/libfrog/list_sort.c +++ b/libfrog/list_sort.c @@ -12,8 +12,7 @@ * sentinel head node, "prev" links not maintained. */ static struct list_head *merge(void *priv, - int (*cmp)(void *priv, struct list_head *a, - struct list_head *b), + list_cmp_func_t cmp, struct list_head *a, struct list_head *b) { struct list_head head, *tail = &head; @@ -41,8 +40,7 @@ static struct list_head *merge(void *priv, * throughout. */ static void merge_and_restore_back_links(void *priv, - int (*cmp)(void *priv, struct list_head *a, - struct list_head *b), + list_cmp_func_t cmp, struct list_head *head, struct list_head *a, struct list_head *b) { @@ -96,9 +94,7 @@ static void merge_and_restore_back_links(void *priv, * @b. If @a and @b are equivalent, and their original relative * ordering is to be preserved, @cmp must return 0. 
*/ -void list_sort(void *priv, struct list_head *head, - int (*cmp)(void *priv, struct list_head *a, - struct list_head *b)) +void list_sort(void *priv, struct list_head *head, list_cmp_func_t cmp) { struct list_head *part[MAX_LIST_LENGTH_BITS+1]; /* sorted partial lists -- last slot is a sentinel */ diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c index 6c5c7dd5..3f519252 100644 --- a/libxfs/defer_item.c +++ b/libxfs/defer_item.c @@ -33,11 +33,11 @@ static int xfs_extent_free_diff_items( void *priv, - struct list_head *a, - struct list_head *b) + const struct list_head *a, + const struct list_head *b) { - struct xfs_extent_free_item *ra; - struct xfs_extent_free_item *rb; + const struct xfs_extent_free_item *ra; + const struct xfs_extent_free_item *rb; ra = container_of(a, struct xfs_extent_free_item, xefi_list); rb = container_of(b, struct xfs_extent_free_item, xefi_list); @@ -197,11 +197,11 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { static int xfs_rmap_update_diff_items( void *priv, - struct list_head *a, - struct list_head *b) + const struct list_head *a, + const struct list_head *b) { - struct xfs_rmap_intent *ra; - struct xfs_rmap_intent *rb; + const struct xfs_rmap_intent *ra; + const struct xfs_rmap_intent *rb; ra = container_of(a, struct xfs_rmap_intent, ri_list); rb = container_of(b, struct xfs_rmap_intent, ri_list); @@ -309,11 +309,11 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { static int xfs_refcount_update_diff_items( void *priv, - struct list_head *a, - struct list_head *b) + const struct list_head *a, + const struct list_head *b) { - struct xfs_refcount_intent *ra; - struct xfs_refcount_intent *rb; + const struct xfs_refcount_intent *ra; + const struct xfs_refcount_intent *rb; ra = container_of(a, struct xfs_refcount_intent, ri_list); rb = container_of(b, struct xfs_refcount_intent, ri_list); @@ -427,11 +427,11 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { static int xfs_bmap_update_diff_items( void *priv, - struct list_head *a, - struct list_head *b) + const struct list_head *a, + const struct list_head *b) { - struct xfs_bmap_intent *ba; - struct xfs_bmap_intent *bb; + const struct xfs_bmap_intent *ba; + const struct xfs_bmap_intent *bb; ba = container_of(a, struct xfs_bmap_intent, bi_list); bb = container_of(b, struct xfs_bmap_intent, bi_list); diff --git a/scrub/repair.c b/scrub/repair.c index 67900ea4..5fc5ab83 100644 --- a/scrub/repair.c +++ b/scrub/repair.c @@ -37,7 +37,7 @@ /* Sort action items in severity order. */ static int PRIO( - struct action_item *aitem, + const struct action_item *aitem, int order) { if (aitem->flags & XFS_SCRUB_OFLAG_CORRUPT) @@ -54,7 +54,7 @@ PRIO( /* Sort the repair items in dependency order. */ static int xfs_action_item_priority( - struct action_item *aitem) + const struct action_item *aitem) { switch (aitem->type) { case XFS_SCRUB_TYPE_SB: @@ -95,11 +95,11 @@ xfs_action_item_priority( static int xfs_action_item_compare( void *priv, - struct list_head *a, - struct list_head *b) + const struct list_head *a, + const struct list_head *b) { - struct action_item *ra; - struct action_item *rb; + const struct action_item *ra; + const struct action_item *rb; ra = container_of(a, struct action_item, list); rb = container_of(b, struct action_item, list); From patchwork Fri Jun 16 01:37:12 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13281925 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 6A073EB64DA for ; Fri, 16 Jun 2023 01:37:17 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S233316AbjFPBhQ (ORCPT ); Thu, 15 Jun 2023 21:37:16 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56316 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229453AbjFPBhP (ORCPT ); Thu, 15 Jun 2023 21:37:15 -0400 Received: from dfw.source.kernel.org (dfw.source.kernel.org [IPv6:2604:1380:4641:c500::1]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id B6DC82948 for ; Thu, 15 Jun 2023 18:37:14 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by dfw.source.kernel.org (Postfix) with ESMTPS id 4556261B74 for ; Fri, 16 Jun 2023 01:37:14 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 896D3C433C8; Fri, 16 Jun 2023 01:37:13 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1686879433; bh=cigPxtkjXbxI6b11RkxZOc8h5s8DAPW1wmNl6/q3aEg=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=BjqveVuHXKR318R4gegh78HoPYysoNuEy0oDlrz+6IDY+obKqfaMdcW8zrwYcoxMS kF1I3rF+KQiuUTiMe4CnC8ohKFBIxOS+wADWXDOCGBP5fkXZUpykg+uwcnKCfldZzo FP//8CXdK9xjONWCqnyOofvGUwH3ZV2Z6ek4vHuM35l2FnajuqXD3OoOAliV4kiSdM cGq6yMxUUb1THNNq8E5iU3HQlw0LD5mKInmRyydM13lSOF6r75Cqe1G04kXvG3Ir8L 3PvmBeDaBgzRBX2CZJr7SQasI1Fk6bZpFAjoFUBHbzhBPYawtZ9B9KhVCjamh3TJUA 1ZC7ffFURFqMA== Subject: [PATCH 3/8] libxfs: port transaction precommit hooks to userspace From: "Darrick J. Wong" To: cem@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Date: Thu, 15 Jun 2023 18:37:12 -0700 Message-ID: <168687943296.831530.837601482403467309.stgit@frogsfrogsfrogs> In-Reply-To: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs> References: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- include/xfs_trans.h | 6 +++ libxfs/libxfs_priv.h | 4 ++ libxfs/logitem.c | 11 +++++- libxfs/trans.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++ libxfs/util.c | 4 ++ 5 files changed, 117 insertions(+), 4 deletions(-) diff --git a/include/xfs_trans.h b/include/xfs_trans.h index 6627a83f..64aade94 100644 --- a/include/xfs_trans.h +++ b/include/xfs_trans.h @@ -16,6 +16,11 @@ struct xfs_buf_map; * Userspace Transaction interface */ +struct xfs_item_ops { + uint64_t (*iop_sort)(struct xfs_log_item *lip); + int (*iop_precommit)(struct xfs_trans *tp, struct xfs_log_item *lip); +}; + typedef struct xfs_log_item { struct list_head li_trans; /* transaction list */ xfs_lsn_t li_lsn; /* last on-disk lsn */ @@ -24,6 +29,7 @@ typedef struct xfs_log_item { unsigned long li_flags; /* misc flags */ struct xfs_buf *li_buf; /* real buffer pointer */ struct list_head li_bio_list; /* buffer item list */ + const struct xfs_item_ops *li_ops; /* function list */ } xfs_log_item_t; #define XFS_LI_DIRTY 3 /* log item dirty in transaction */ diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h index dce5abf2..a23341ae 100644 --- a/libxfs/libxfs_priv.h +++ b/libxfs/libxfs_priv.h @@ -589,8 +589,10 @@ int libxfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb, xfs_off_t count_fsb); /* xfs_log.c */ +struct xfs_item_ops; bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); -void xfs_log_item_init(struct xfs_mount *, struct xfs_log_item *, int); +void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *lip, int type, + const struct xfs_item_ops *ops); #define xfs_attr_use_log_assist(mp) (0) #define xlog_drop_incompat_feat(log) do { } while (0) #define xfs_log_in_recovery(mp) (false) diff --git a/libxfs/logitem.c b/libxfs/logitem.c index 98ccdaef..6b3315c3 100644 --- a/libxfs/logitem.c +++ b/libxfs/logitem.c @@ -59,6 +59,9 @@ xfs_trans_buf_item_match( * The following are from fs/xfs/xfs_buf_item.c */ +static const struct xfs_item_ops xfs_buf_item_ops = { +}; + /* * Allocate a new buf log item to go with the given buffer. * Set the buffer's b_log_item field to point to the new @@ -101,7 +104,7 @@ xfs_buf_item_init( fprintf(stderr, "adding buf item %p for not-logged buffer %p\n", bip, bp); #endif - xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF); + xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; bip->__bli_format.blf_type = XFS_LI_BUF; bip->__bli_format.blf_blkno = (int64_t)xfs_buf_daddr(bp); @@ -127,6 +130,9 @@ xfs_buf_item_log( bip->bli_flags |= XFS_BLI_DIRTY; } +static const struct xfs_item_ops xfs_inode_item_ops = { +}; + /* * Initialize the inode log item for a newly allocated (in-core) inode. */ @@ -146,6 +152,7 @@ xfs_inode_item_init( spin_lock_init(&iip->ili_lock); - xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE); + xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, + &xfs_inode_item_ops); iip->ili_inode = ip; } diff --git a/libxfs/trans.c b/libxfs/trans.c index 553f9471..a05111bf 100644 --- a/libxfs/trans.c +++ b/libxfs/trans.c @@ -951,6 +951,90 @@ xfs_trans_free_items( } } +/* + * Sort transaction items prior to running precommit operations. This will + * attempt to order the items such that they will always be locked in the same + * order. Items that have no sort function are moved to the end of the list + * and so are locked last. + * + * This may need refinement as different types of objects add sort functions. 
+ * + * Function is more complex than it needs to be because we are comparing 64 bit + * values and the function only returns 32 bit values. + */ +static int +xfs_trans_precommit_sort( + void *unused_arg, + const struct list_head *a, + const struct list_head *b) +{ + struct xfs_log_item *lia = container_of(a, + struct xfs_log_item, li_trans); + struct xfs_log_item *lib = container_of(b, + struct xfs_log_item, li_trans); + int64_t diff; + + /* + * If both items are non-sortable, leave them alone. If only one is + * sortable, move the non-sortable item towards the end of the list. + */ + if (!lia->li_ops->iop_sort && !lib->li_ops->iop_sort) + return 0; + if (!lia->li_ops->iop_sort) + return 1; + if (!lib->li_ops->iop_sort) + return -1; + + diff = lia->li_ops->iop_sort(lia) - lib->li_ops->iop_sort(lib); + if (diff < 0) + return -1; + if (diff > 0) + return 1; + return 0; +} + +/* + * Run transaction precommit functions. + * + * If there is an error in any of the callouts, then stop immediately and + * trigger a shutdown to abort the transaction. There is no recovery possible + * from errors at this point as the transaction is dirty.... + */ +static int +xfs_trans_run_precommits( + struct xfs_trans *tp) +{ + //struct xfs_mount *mp = tp->t_mountp; + struct xfs_log_item *lip, *n; + int error = 0; + + /* + * Sort the item list to avoid ABBA deadlocks with other transactions + * running precommit operations that lock multiple shared items such as + * inode cluster buffers. + */ + list_sort(NULL, &tp->t_items, xfs_trans_precommit_sort); + + /* + * Precommit operations can remove the log item from the transaction + * if the log item exists purely to delay modifications until they + * can be ordered against other operations. Hence we have to use + * list_for_each_entry_safe() here. + */ + list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) { + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) + continue; + if (lip->li_ops->iop_precommit) { + error = lip->li_ops->iop_precommit(tp, lip); + if (error) + break; + } + } + if (error) + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; +} + /* * Commit the changes represented by this transaction */ @@ -967,6 +1051,13 @@ __xfs_trans_commit( if (tp == NULL) return 0; + error = xfs_trans_run_precommits(tp); + if (error) { + if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) + xfs_defer_cancel(tp); + goto out_unreserve; + } + /* * Finish deferred items on final commit. Only permanent transactions * should ever have deferred ops. @@ -977,6 +1068,11 @@ __xfs_trans_commit( error = xfs_defer_finish_noroll(&tp); if (error) goto out_unreserve; + + /* Run precommits from final tx in defer chain. */ + error = xfs_trans_run_precommits(tp); + if (error) + goto out_unreserve; } if (!(tp->t_flags & XFS_TRANS_DIRTY)) diff --git a/libxfs/util.c b/libxfs/util.c index 6525f63d..e7d3497e 100644 --- a/libxfs/util.c +++ b/libxfs/util.c @@ -643,10 +643,12 @@ void xfs_log_item_init( struct xfs_mount *mp, struct xfs_log_item *item, - int type) + int type, + const struct xfs_item_ops *ops) { item->li_mountp = mp; item->li_type = type; + item->li_ops = ops; INIT_LIST_HEAD(&item->li_trans); INIT_LIST_HEAD(&item->li_bio_list); From patchwork Fri Jun 16 01:37:18 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13281926 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 219B3EB64D9 for ; Fri, 16 Jun 2023 01:37:23 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S233489AbjFPBhW (ORCPT ); Thu, 15 Jun 2023 21:37:22 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56322 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229453AbjFPBhV (ORCPT ); Thu, 15 Jun 2023 21:37:21 -0400 Received: from dfw.source.kernel.org (dfw.source.kernel.org [139.178.84.217]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 787382948 for ; Thu, 15 Jun 2023 18:37:20 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by dfw.source.kernel.org (Postfix) with ESMTPS id 089B5618E1 for ; Fri, 16 Jun 2023 01:37:20 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 4B9E6C433C0; Fri, 16 Jun 2023 01:37:19 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1686879439; bh=UU7W5EGYsuThFqnkk30hpbT6XoWOo0FotRQrx9BcgKM=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=EiR3Nj0BwMXMeNPnPDtDbFFOc6kFg1LUt2wSXyEVfkEjOPxQiq5QHPwjQqa/81o+g IkmCxOfYDuqBzTxb20iyvwf2bDdmnxWrx/Jc7Vyaf2PXPvW2qbILy7BxOq7CgUwDZI sCiRX/gOY1xavvabl/Lz/HBs4bmoRw6VU7l+gqsWc2WxPu1a6YwU+b9doq30TGrdVE pVHBupLY0Dz3Rph/m3N50WANRWu8MIRRpAPn2xGRGU8uDYuQYrB/CHEyeNyE1wZ1jh 4Gnd+MlHdqDO9mQMcZvUdwz/zSo4DiLA5B3PM5XkGX8CWr5lRPjvHedMxELaeoo6gB A7Tew7ty+f+Tg== Subject: [PATCH 4/8] xfs: restore allocation trylock iteration From: "Darrick J. Wong" To: cem@kernel.org, djwong@kernel.org Cc: kernel test robot , Dave Chinner , Christoph Hellwig , Dave Chinner , linux-xfs@vger.kernel.org Date: Thu, 15 Jun 2023 18:37:18 -0700 Message-ID: <168687943874.831530.8009396415791436397.stgit@frogsfrogsfrogs> In-Reply-To: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs> References: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Dave Chinner Source kernel commit: 00dcd17cfa7f103f7d640ffd34645a2ddab96330 It was accidentally dropped when refactoring the allocation code, resulting in the AG iteration always doing blocking AG iteration. This results in a small performance regression for a specific fsmark test that runs more user data writer threads than there are AGs. Reported-by: kernel test robot Fixes: 2edf06a50f5b ("xfs: factor xfs_alloc_vextent_this_ag() for _iterate_ags()") Signed-off-by: Dave Chinner Reviewed-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- libxfs/xfs_alloc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/libxfs/xfs_alloc.c b/libxfs/xfs_alloc.c index 87920191..8d305c3f 100644 --- a/libxfs/xfs_alloc.c +++ b/libxfs/xfs_alloc.c @@ -3183,7 +3183,8 @@ xfs_alloc_vextent_check_args( */ static int xfs_alloc_vextent_prepare_ag( - struct xfs_alloc_arg *args) + struct xfs_alloc_arg *args, + uint32_t flags) { bool need_pag = !args->pag; int error; @@ -3192,7 +3193,7 @@ xfs_alloc_vextent_prepare_ag( args->pag = xfs_perag_get(args->mp, args->agno); args->agbp = NULL; - error = xfs_alloc_fix_freelist(args, 0); + error = xfs_alloc_fix_freelist(args, flags); if (error) { trace_xfs_alloc_vextent_nofix(args); if (need_pag) @@ -3332,7 +3333,7 @@ xfs_alloc_vextent_this_ag( return error; } - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, 0); if (!error && args->agbp) error = xfs_alloc_ag_vextent_size(args); @@ -3376,7 +3377,7 @@ xfs_alloc_vextent_iterate_ags( for_each_perag_wrap_range(mp, start_agno, restart_agno, mp->m_sb.sb_agcount, agno, args->pag) { args->agno = agno; - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, flags); if (error) break; if (!args->agbp) { @@ -3542,7 +3543,7 @@ xfs_alloc_vextent_exact_bno( return error; } - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, 0); if (!error && args->agbp) error = xfs_alloc_ag_vextent_exact(args); @@ -3583,7 +3584,7 @@ xfs_alloc_vextent_near_bno( if (needs_perag) args->pag = xfs_perag_grab(mp, args->agno); - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, 0); if (!error && args->agbp) error = xfs_alloc_ag_vextent_near(args); From patchwork Fri Jun 16 01:37:24 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13281927 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 87896EB64DA for ; Fri, 16 Jun 2023 01:37:29 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S229575AbjFPBh2 (ORCPT ); Thu, 15 Jun 2023 21:37:28 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56328 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229453AbjFPBh2 (ORCPT ); Thu, 15 Jun 2023 21:37:28 -0400 Received: from dfw.source.kernel.org (dfw.source.kernel.org [139.178.84.217]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 4E8582948 for ; Thu, 15 Jun 2023 18:37:26 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by dfw.source.kernel.org (Postfix) with ESMTPS id C462761BCB for ; Fri, 16 Jun 2023 01:37:25 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id F21F4C433C0; Fri, 16 Jun 2023 01:37:24 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1686879445; bh=XRHy5yhuy3KQtvIjHc0VxWkbMeX9tswBP+Wcs3yAQAw=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=o8u8bnH7tFb3GdszlnuQ3SUG9ZPMrGEkogt6TeP0SafDnqNOitXB2zsjgcOUBtB0j TGHvVpFxSC/rpK48sPWMGChnTRPEakFRiAufrhESeeal06HteJIxdL49f/+Q2VHv2+ rJCzufegajs3d7Mv4oX7upEdliqwwm3UYVvYuvkf58MYEb31TI2/mP5PJv+9fP3feD HRMsojeKMSTw1Dl9r5GXIVONflscmPZdf3QS57VqifA3B0hmUqR7y/GQgfJAEqRT2q vMYidAAGOYAaK7UTdNkWb7GzJcvurlvfr6pCYADH2UCD6yjB7Poe1O5kTlWp8WLDgB zUlX8yiJEJVoA== Subject: [PATCH 5/8] xfs: fix AGF vs inode cluster buffer deadlock From: "Darrick J. Wong" To: cem@kernel.org, djwong@kernel.org Cc: Dave Chinner , Christoph Hellwig , Dave Chinner , linux-xfs@vger.kernel.org Date: Thu, 15 Jun 2023 18:37:24 -0700 Message-ID: <168687944448.831530.3926573287687741867.stgit@frogsfrogsfrogs> In-Reply-To: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs> References: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Dave Chinner Source kernel commit: 82842fee6e5979ca7e2bf4d839ef890c22ffb7aa Lock order in XFS is AGI -> AGF, hence for operations involving inode unlinked list operations we always lock the AGI first. Inode unlinked list operations operate on the inode cluster buffer, so the lock order there is AGI -> inode cluster buffer. For O_TMPFILE operations, this now means the lock order set down in xfs_rename and xfs_link is AGI -> inode cluster buffer -> AGF as the unlinked ops are done before the directory modifications that may allocate space and lock the AGF. Unfortunately, we also now lock the inode cluster buffer when logging an inode so that we can attach the inode to the cluster buffer and pin it in memory. This creates a lock order of AGF -> inode cluster buffer in directory operations as we have to log the inode after we've allocated new space for it. This creates a lock inversion between the AGF and the inode cluster buffer. 
Because the inode cluster buffer is shared across multiple inodes, the inversion is not specific to individual inodes but can occur when inodes in the same cluster buffer are accessed in different orders. To fix this we need to move all the inode log item cluster buffer interactions to the end of the current transaction. Unfortunately, xfs_trans_log_inode() calls are littered throughout the transactions with no thought to ordering against other items or locking. This makes it difficult to do anything that involves changing the call sites of xfs_trans_log_inode() to change locking orders. However, we do now have a mechanism that allows us to postpone dirty item processing to just before we commit the transaction: the ->iop_precommit method. This will be called after all the modifications are done and high level objects like AGI and AGF buffers have been locked and modified, thereby providing a mechanism that guarantees we don't lock the inode cluster buffer before those high level objects are locked. This change is largely moving the guts of xfs_trans_log_inode() to xfs_inode_item_precommit() and providing an extra flag context in the inode log item to track the dirty state of the inode in the current transaction. This also means we do a lot less repeated work in xfs_trans_log_inode() by only doing it once per transaction when all the work is done. Fixes: 298f7bec503f ("xfs: pin inode backing buffer to the inode log item") Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- include/xfs_inode.h | 3 + include/xfs_trans.h | 1 libxfs/logitem.c | 154 ++++++++++++++++++++++++++++++++++++++++++++++ libxfs/xfs_log_format.h | 9 ++- libxfs/xfs_trans_inode.c | 113 ++-------------------------------- 5 files changed, 173 insertions(+), 107 deletions(-) diff --git a/include/xfs_inode.h b/include/xfs_inode.h index b0bba109..069fcf36 100644 --- a/include/xfs_inode.h +++ b/include/xfs_inode.h @@ -14,6 +14,7 @@ struct xfs_trans; struct xfs_mount; struct xfs_inode_log_item; +struct inode; /* * These are not actually used, they are only for userspace build @@ -22,7 +23,7 @@ struct xfs_inode_log_item; #define I_DIRTY_TIME 0 #define I_DIRTY_TIME_EXPIRED 0 -#define IS_I_VERSION(inode) (0) +static inline bool IS_I_VERSION(const struct inode *inode) { return false; } #define inode_maybe_inc_iversion(inode,flags) (0) /* diff --git a/include/xfs_trans.h b/include/xfs_trans.h index 64aade94..8371bc7e 100644 --- a/include/xfs_trans.h +++ b/include/xfs_trans.h @@ -38,6 +38,7 @@ struct xfs_inode_log_item { xfs_log_item_t ili_item; /* common portion */ struct xfs_inode *ili_inode; /* inode pointer */ unsigned short ili_lock_flags; /* lock flags */ + unsigned int ili_dirty_flags; /* dirty in current tx */ unsigned int ili_last_fields; /* fields when flushed*/ unsigned int ili_fields; /* fields to be logged */ unsigned int ili_fsync_fields; /* ignored by userspace */ diff --git a/libxfs/logitem.c b/libxfs/logitem.c index 6b3315c3..48928f32 100644 --- a/libxfs/logitem.c +++ b/libxfs/logitem.c @@ -130,7 +130,161 @@ xfs_buf_item_log( bip->bli_flags |= XFS_BLI_DIRTY; } +static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_inode_log_item, ili_item); +} + +static uint64_t +xfs_inode_item_sort( + struct xfs_log_item *lip) +{ + return INODE_ITEM(lip)->ili_inode->i_ino; +} + +/* + * Prior to finally logging the inode, we have to ensure that all the + * per-modification inode state changes are applied. 
This includes VFS inode + * state updates, format conversions, verifier state synchronisation and + * ensuring the inode buffer remains in memory whilst the inode is dirty. + * + * We have to be careful when we grab the inode cluster buffer due to lock + * ordering constraints. The unlinked inode modifications (xfs_iunlink_item) + * require AGI -> inode cluster buffer lock order. The inode cluster buffer is + * not locked until ->precommit, so it happens after everything else has been + * modified. + * + * Further, we have AGI -> AGF lock ordering, and with O_TMPFILE handling we + * have AGI -> AGF -> iunlink item -> inode cluster buffer lock order. Hence we + * cannot safely lock the inode cluster buffer in xfs_trans_log_inode() because + * it can be called on a inode (e.g. via bumplink/droplink) before we take the + * AGF lock modifying directory blocks. + * + * Rather than force a complete rework of all the transactions to call + * xfs_trans_log_inode() once and once only at the end of every transaction, we + * move the pinning of the inode cluster buffer to a ->precommit operation. This + * matches how the xfs_iunlink_item locks the inode cluster buffer, and it + * ensures that the inode cluster buffer locking is always done last in a + * transaction. i.e. we ensure the lock order is always AGI -> AGF -> inode + * cluster buffer. + * + * If we return the inode number as the precommit sort key then we'll also + * guarantee that the order all inode cluster buffer locking is the same all the + * inodes and unlink items in the transaction. + */ +static int +xfs_inode_item_precommit( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct inode *inode = VFS_I(ip); + unsigned int flags = iip->ili_dirty_flags; + + /* + * Don't bother with i_lock for the I_DIRTY_TIME check here, as races + * don't matter - we either will need an extra transaction in 24 hours + * to log the timestamps, or will clear already cleared fields in the + * worst case. + */ + if (inode->i_state & I_DIRTY_TIME) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + } + + /* + * If we're updating the inode core or the timestamps and it's possible + * to upgrade this inode to bigtime format, do so now. + */ + if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && + xfs_has_bigtime(ip->i_mount) && + !xfs_inode_has_bigtime(ip)) { + ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME; + flags |= XFS_ILOG_CORE; + } + + /* + * Inode verifiers do not check that the extent size hint is an integer + * multiple of the rt extent size on a directory with both rtinherit + * and extszinherit flags set. If we're logging a directory that is + * misconfigured in this way, clear the hint. + */ + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + + /* + * Record the specific change for fdatasync optimisation. This allows + * fdatasync to skip log forces for inodes that are only timestamp + * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it + * to XFS_ILOG_CORE so that the actual on-disk dirty tracking + * (ili_fields) correctly tracks that the version has changed. 
+ */ + spin_lock(&iip->ili_lock); + iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); + if (flags & XFS_ILOG_IVERSION) + flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + + if (!iip->ili_item.li_buf) { + struct xfs_buf *bp; + int error; + + /* + * We hold the ILOCK here, so this inode is not going to be + * flushed while we are here. Further, because there is no + * buffer attached to the item, we know that there is no IO in + * progress, so nothing will clear the ili_fields while we read + * in the buffer. Hence we can safely drop the spin lock and + * read the buffer knowing that the state will not change from + * here. + */ + spin_unlock(&iip->ili_lock); + error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp); + if (error) + return error; + + /* + * We need an explicit buffer reference for the log item but + * don't want the buffer to remain attached to the transaction. + * Hold the buffer but release the transaction reference once + * we've attached the inode log item to the buffer log item + * list. + */ + xfs_buf_hold(bp); + spin_lock(&iip->ili_lock); + iip->ili_item.li_buf = bp; + bp->b_flags |= _XBF_INODES; + list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); + xfs_trans_brelse(tp, bp); + } + + /* + * Always OR in the bits from the ili_last_fields field. This is to + * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines + * in the eventual clearing of the ili_fields bits. See the big comment + * in xfs_iflush() for an explanation of this coordination mechanism. + */ + iip->ili_fields |= (flags | iip->ili_last_fields); + spin_unlock(&iip->ili_lock); + + /* + * We are done with the log item transaction dirty state, so clear it so + * that it doesn't pollute future transactions. + */ + iip->ili_dirty_flags = 0; + return 0; +} + static const struct xfs_item_ops xfs_inode_item_ops = { + .iop_sort = xfs_inode_item_sort, + .iop_precommit = xfs_inode_item_precommit, }; /* diff --git a/libxfs/xfs_log_format.h b/libxfs/xfs_log_format.h index f13e0809..269573c8 100644 --- a/libxfs/xfs_log_format.h +++ b/libxfs/xfs_log_format.h @@ -324,7 +324,6 @@ struct xfs_inode_log_format_32 { #define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */ #define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */ - /* * The timestamps are dirty, but not necessarily anything else in the inode * core. Unlike the other fields above this one must never make it to disk @@ -333,6 +332,14 @@ struct xfs_inode_log_format_32 { */ #define XFS_ILOG_TIMESTAMP 0x4000 +/* + * The version field has been changed, but not necessarily anything else of + * interest. This must never make it to disk - it is used purely to ensure that + * the inode item ->precommit operation can update the fsync flag triggers + * in the inode item correctly. + */ +#define XFS_ILOG_IVERSION 0x8000 + #define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ diff --git a/libxfs/xfs_trans_inode.c b/libxfs/xfs_trans_inode.c index 276d57cf..c4f81e5d 100644 --- a/libxfs/xfs_trans_inode.c +++ b/libxfs/xfs_trans_inode.c @@ -37,9 +37,8 @@ xfs_trans_ijoin( iip->ili_lock_flags = lock_flags; ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); - /* - * Get a log_item_desc to point at the new item. - */ + /* Reset the per-tx dirty context and add the item to the tx. 
*/ + iip->ili_dirty_flags = 0; xfs_trans_add_item(tp, &iip->ili_item); } @@ -73,17 +72,10 @@ xfs_trans_ichgtime( /* * This is called to mark the fields indicated in fieldmask as needing to be * logged when the transaction is committed. The inode must already be - * associated with the given transaction. - * - * The values for fieldmask are defined in xfs_inode_item.h. We always log all - * of the core inode if any of it has changed, and we always log all of the - * inline data/extents/b-tree root if any of them has changed. - * - * Grab and pin the cluster buffer associated with this inode to avoid RMW - * cycles at inode writeback time. Avoid the need to add error handling to every - * xfs_trans_log_inode() call by shutting down on read error. This will cause - * transactions to fail and everything to error out, just like if we return a - * read error in a dirty transaction and cancel it. + * associated with the given transaction. All we do here is record where the + * inode was dirtied and mark the transaction and inode log item dirty; + * everything else is done in the ->precommit log item operation after the + * changes in the transaction have been completed. */ void xfs_trans_log_inode( @@ -93,7 +85,6 @@ xfs_trans_log_inode( { struct xfs_inode_log_item *iip = ip->i_itemp; struct inode *inode = VFS_I(ip); - uint iversion_flags = 0; ASSERT(iip); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -101,18 +92,6 @@ xfs_trans_log_inode( tp->t_flags |= XFS_TRANS_DIRTY; - /* - * Don't bother with i_lock for the I_DIRTY_TIME check here, as races - * don't matter - we either will need an extra transaction in 24 hours - * to log the timestamps, or will clear already cleared fields in the - * worst case. - */ - if (inode->i_state & I_DIRTY_TIME) { - spin_lock(&inode->i_lock); - inode->i_state &= ~I_DIRTY_TIME; - spin_unlock(&inode->i_lock); - } - /* * First time we log the inode in a transaction, bump the inode change * counter if it is configured for this to occur. While we have the @@ -125,86 +104,10 @@ xfs_trans_log_inode( if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) { if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE)) - iversion_flags = XFS_ILOG_CORE; + flags |= XFS_ILOG_IVERSION; } - /* - * If we're updating the inode core or the timestamps and it's possible - * to upgrade this inode to bigtime format, do so now. - */ - if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && - xfs_has_bigtime(ip->i_mount) && - !xfs_inode_has_bigtime(ip)) { - ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME; - flags |= XFS_ILOG_CORE; - } - - /* - * Inode verifiers do not check that the extent size hint is an integer - * multiple of the rt extent size on a directory with both rtinherit - * and extszinherit flags set. If we're logging a directory that is - * misconfigured in this way, clear the hint. - */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { - ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | - XFS_DIFLAG_EXTSZINHERIT); - ip->i_extsize = 0; - flags |= XFS_ILOG_CORE; - } - - /* - * Record the specific change for fdatasync optimisation. This allows - * fdatasync to skip log forces for inodes that are only timestamp - * dirty. - */ - spin_lock(&iip->ili_lock); - iip->ili_fsync_fields |= flags; - - if (!iip->ili_item.li_buf) { - struct xfs_buf *bp; - int error; - - /* - * We hold the ILOCK here, so this inode is not going to be - * flushed while we are here. 
Further, because there is no - * buffer attached to the item, we know that there is no IO in - * progress, so nothing will clear the ili_fields while we read - * in the buffer. Hence we can safely drop the spin lock and - * read the buffer knowing that the state will not change from - * here. - */ - spin_unlock(&iip->ili_lock); - error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp); - if (error) { - xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR); - return; - } - - /* - * We need an explicit buffer reference for the log item but - * don't want the buffer to remain attached to the transaction. - * Hold the buffer but release the transaction reference once - * we've attached the inode log item to the buffer log item - * list. - */ - xfs_buf_hold(bp); - spin_lock(&iip->ili_lock); - iip->ili_item.li_buf = bp; - bp->b_flags |= _XBF_INODES; - list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); - xfs_trans_brelse(tp, bp); - } - - /* - * Always OR in the bits from the ili_last_fields field. This is to - * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines - * in the eventual clearing of the ili_fields bits. See the big comment - * in xfs_iflush() for an explanation of this coordination mechanism. - */ - iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags); - spin_unlock(&iip->ili_lock); + iip->ili_dirty_flags |= flags; } int From patchwork Fri Jun 16 01:37:30 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13281928 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id AF74DEB64DA for ; Fri, 16 Jun 2023 01:37:34 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231452AbjFPBhe (ORCPT ); Thu, 15 Jun 2023 21:37:34 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56338 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229818AbjFPBhc (ORCPT ); Thu, 15 Jun 2023 21:37:32 -0400 Received: from dfw.source.kernel.org (dfw.source.kernel.org [139.178.84.217]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id BFF782948 for ; Thu, 15 Jun 2023 18:37:31 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by dfw.source.kernel.org (Postfix) with ESMTPS id 569A461B74 for ; Fri, 16 Jun 2023 01:37:31 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 8C5EEC433C0; Fri, 16 Jun 2023 01:37:30 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1686879450; bh=3EmRSBlXwI5QjdToEYiAI+OLUGDPjKGRkB2Oqls8igY=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=sBuPiKcS6bCnTbCdT0ICoAgpDpSSC4qpUj0lXKeB8+TEBbQPG8oiLHwELDFN8oW6G lEqfqlvWEfYnogDYJE/uva6Iik1GnV1HRDxY1UuSYWbg3fAvPZ6bXkeavuGh1LGnfe gVVQtM0RR3FQHSccFhQo4v4Q+GD8ydaZUsUxjG484LHed1T4w173nOhNfQ4TNUXh9k Qut3djZtCc5WkDe23js0D3wRz46YkZm1h3a/7BM2XveLz0ULWoBk10CuOIfRxgCrrP iEvC/ngNvInpSp1UQGSUxV3GLlcvwzITUPuVZDLy1YlvHyDJSFfAowEGXQiJ6ZuHJg 4tzZUW4j2ioNQ== Subject: [PATCH 6/8] xfs: fix agf/agfl verification on v4 filesystems From: "Darrick J. 
Wong" To: cem@kernel.org, djwong@kernel.org Cc: Dave Chinner , Christoph Hellwig , Dave Chinner , linux-xfs@vger.kernel.org Date: Thu, 15 Jun 2023 18:37:30 -0700 Message-ID: <168687945015.831530.9252823506836536626.stgit@frogsfrogsfrogs> In-Reply-To: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs> References: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Dave Chinner Source kernel commit: e0a8de7da35e5b22b44fa1013ccc0716e17b0c14 When a v4 filesystem has fl_last - fl_first != fl_count, we do not detect the corruption and allow the AGF to be used as if it was fully valid. On V5 filesystems, we reset the AGFL to empty in these cases and avoid the corruption at a small cost of leaked blocks. If we don't catch the corruption on V4 filesystems, bad things happen later when an allocation attempts to trim the free list and either double-frees stale entries in the AGFL or tries to free NULLAGBNO entries. Either way, this is bad. Prevent this from happening by using the AGFL_NEED_RESET logic for v4 filesystems, too. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- libxfs/xfs_alloc.c | 59 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/libxfs/xfs_alloc.c b/libxfs/xfs_alloc.c index 8d305c3f..229b22e6 100644 --- a/libxfs/xfs_alloc.c +++ b/libxfs/xfs_alloc.c @@ -624,6 +624,25 @@ xfs_alloc_fixup_trees( return 0; } +/* + * We do not verify the AGFL contents against AGF-based index counters here, + * even though we may have access to the perag that contains shadow copies. We + * don't know if the AGF based counters have been checked, and if they have they + * still may be inconsistent because they haven't yet been reset on the first + * allocation after the AGF has been read in. + * + * This means we can only check that all agfl entries contain valid or null + * values because we can't reliably determine the active range to exclude + * NULLAGBNO as a valid value. + * + * However, we can't even do that for v4 format filesystems because there are + * old versions of mkfs out there that does not initialise the AGFL to known, + * verifiable values. HEnce we can't tell the difference between a AGFL block + * allocated by mkfs and a corrupted AGFL block here on v4 filesystems. + * + * As a result, we can only fully validate AGFL block numbers when we pull them + * from the freelist in xfs_alloc_get_freelist(). + */ static xfs_failaddr_t xfs_agfl_verify( struct xfs_buf *bp) @@ -633,12 +652,6 @@ xfs_agfl_verify( __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp); int i; - /* - * There is no verification of non-crc AGFLs because mkfs does not - * initialise the AGFL to zero or NULL. Hence the only valid part of the - * AGFL is what the AGF says is active. We can't get to the AGF, so we - * can't verify just those entries are valid. - */ if (!xfs_has_crc(mp)) return NULL; @@ -2317,12 +2330,16 @@ xfs_free_agfl_block( } /* - * Check the agfl fields of the agf for inconsistency or corruption. The purpose - * is to detect an agfl header padding mismatch between current and early v5 - * kernels. This problem manifests as a 1-slot size difference between the - * on-disk flcount and the active [first, last] range of a wrapped agfl. This - * may also catch variants of agfl count corruption unrelated to padding. 
Either - * way, we'll reset the agfl and warn the user. + * Check the agfl fields of the agf for inconsistency or corruption. + * + * The original purpose was to detect an agfl header padding mismatch between + * current and early v5 kernels. This problem manifests as a 1-slot size + * difference between the on-disk flcount and the active [first, last] range of + * a wrapped agfl. + * + * However, we need to use these same checks to catch agfl count corruptions + * unrelated to padding. This could occur on any v4 or v5 filesystem, so either + * way, we need to reset the agfl and warn the user. * * Return true if a reset is required before the agfl can be used, false * otherwise. @@ -2338,10 +2355,6 @@ xfs_agfl_needs_reset( int agfl_size = xfs_agfl_size(mp); int active; - /* no agfl header on v4 supers */ - if (!xfs_has_crc(mp)) - return false; - /* * The agf read verifier catches severe corruption of these fields. * Repeat some sanity checks to cover a packed -> unpacked mismatch if @@ -2885,6 +2898,19 @@ xfs_alloc_put_freelist( return 0; } +/* + * Verify the AGF is consistent. + * + * We do not verify the AGFL indexes in the AGF are fully consistent here + * because of issues with variable on-disk structure sizes. Instead, we check + * the agfl indexes for consistency when we initialise the perag from the AGF + * information after a read completes. + * + * If the index is inconsistent, then we mark the perag as needing an AGFL + * reset. The first AGFL update performed then resets the AGFL indexes and + * refills the AGFL with known good free blocks, allowing the filesystem to + * continue operating normally at the cost of a few leaked free space blocks. + */ static xfs_failaddr_t xfs_agf_verify( struct xfs_buf *bp) @@ -2958,7 +2984,6 @@ xfs_agf_verify( return __this_address; return NULL; - } static void From patchwork Fri Jun 16 01:37:35 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13281929 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 14D8DEB64DA for ; Fri, 16 Jun 2023 01:37:41 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231630AbjFPBhk (ORCPT ); Thu, 15 Jun 2023 21:37:40 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56350 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229453AbjFPBhi (ORCPT ); Thu, 15 Jun 2023 21:37:38 -0400 Received: from dfw.source.kernel.org (dfw.source.kernel.org [139.178.84.217]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 6E496294A for ; Thu, 15 Jun 2023 18:37:37 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by dfw.source.kernel.org (Postfix) with ESMTPS id 07EBB61B41 for ; Fri, 16 Jun 2023 01:37:37 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 52ACDC433C8; Fri, 16 Jun 2023 01:37:36 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1686879456; bh=CVOhAlTPIXQOBDzV2MpCLskBQLd4LoZojxSnZ3uCys8=; h=Subject:From:To:Cc:Date:In-Reply-To:References:From; b=B7C7tF7h/zi8D0B81tXojVK3IQ8CvbXQ8fKOfWcjuxxMuTkg3Ngrx/55iOCkqWdsj Y1Ww2oo6KlkW+Avdoop0M9VJp4x8vPW4r6KzM7TRcofj335Xo0UCjma96lbZQ8tgNA P8aG3cZ6dIkykvrvHgELvMfz62fhsqD4AJDFuv8Krb8mAwG4Gk1kNfD4iBjrukDkS2 +50y6ix9N2Yc/eFhYEXW1UYl0MJTtvC456LJDj6v8F+vZNKzC4qPmLuFfeo/I4+2I3 1GmAHclZ/BWLyQpQVXGe7/HFQ5S8irMQ2pAK7tG08g0bIvqlj7F/mPI7J6SsYQqBez YFuycApA307nw== Subject: [PATCH 7/8] xfs: validity check agbnos on the AGFL From: "Darrick J. Wong" To: cem@kernel.org, djwong@kernel.org Cc: Dave Chinner , Christoph Hellwig , Dave Chinner , linux-xfs@vger.kernel.org Date: Thu, 15 Jun 2023 18:37:35 -0700 Message-ID: <168687945573.831530.7630900598292491670.stgit@frogsfrogsfrogs> In-Reply-To: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs> References: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Dave Chinner Source kernel commit: 3148ebf2c0782340946732bfaf3073d23ac833fa If the agfl or the indexing in the AGF has been corrupted, getting a block form the AGFL could return an invalid block number. If this happens, bad things happen. Check the agbno we pull off the AGFL and return -EFSCORRUPTED if we find somethign bad. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
---
 libxfs/xfs_alloc.c |    3 +++
 1 file changed, 3 insertions(+)

diff --git a/libxfs/xfs_alloc.c b/libxfs/xfs_alloc.c
index 229b22e6..40a36efa 100644
--- a/libxfs/xfs_alloc.c
+++ b/libxfs/xfs_alloc.c
@@ -2776,6 +2776,9 @@ xfs_alloc_get_freelist(
 	 */
 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
 	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+	if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno)))
+		return -EFSCORRUPTED;
+
 	be32_add_cpu(&agf->agf_flfirst, 1);
 	xfs_trans_brelse(tp, agflbp);
 	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
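With the check above in place, a garbage AGFL entry now surfaces as -EFSCORRUPTED at the moment the block is pulled off the free list, instead of a bogus block number wandering into later allocation code. The fragment below sketches the caller-side pattern this enables; it is a hypothetical example that assumes the usual libxfs headers, and the argument list of xfs_alloc_get_freelist() is inferred from the hunk's context, so check it against the tree you are actually building.

/*
 * Hypothetical caller, for illustration only: pull one block off the AGFL
 * and propagate any corruption error instead of consuming the block.
 */
static int
example_take_agfl_block(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp)
{
	xfs_agblock_t		bno;
	int			error;

	error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
	if (error)
		return error;	/* -EFSCORRUPTED on a bad AGFL entry */

	/* ... hand the now-verified agbno to the caller's allocation code ... */
	return 0;
}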
From patchwork Fri Jun 16 01:37:41 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong"
X-Patchwork-Id: 13281930
Subject: [PATCH 8/8] xfs: validate block number being freed before adding to xefi
From: "Darrick J. Wong"
To: cem@kernel.org, djwong@kernel.org
Cc: Dave Chinner , Christoph Hellwig , Dave Chinner , linux-xfs@vger.kernel.org
Date: Thu, 15 Jun 2023 18:37:41 -0700
Message-ID: <168687946148.831530.5118642712658328389.stgit@frogsfrogsfrogs>
In-Reply-To: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs>
References: <168687941600.831530.8013975214397479444.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
MIME-Version: 1.0
Precedence: bulk
List-ID:
X-Mailing-List: linux-xfs@vger.kernel.org

From: Dave Chinner

Source kernel commit: 7dfee17b13e5024c5c0ab1911859ded4182de3e5

Bad things happen in deferred extent freeing operations if they are
passed a bad block number in the xefi. This can come from a bogus
agno/agbno pair from deferred agfl freeing, or just a bad fsbno being
passed to __xfs_free_extent_later(). Either way, it's very difficult
to diagnose where a null perag oops in EFI creation is coming from
when the operation that queued the xefi has already been completed
and there's no longer any trace of it around....

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
Reviewed-by: Darrick J. Wong
Signed-off-by: Dave Chinner
---
 libxfs/xfs_ag.c         |    5 ++++-
 libxfs/xfs_alloc.c      |   16 +++++++++++++---
 libxfs/xfs_alloc.h      |    6 +++---
 libxfs/xfs_bmap.c       |   10 ++++++++--
 libxfs/xfs_bmap_btree.c |    7 +++++--
 libxfs/xfs_ialloc.c     |   24 ++++++++++++++++--------
 libxfs/xfs_refcount.c   |   13 ++++++++++---
 7 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/libxfs/xfs_ag.c b/libxfs/xfs_ag.c
index 5d269312..c82afbb8 100644
--- a/libxfs/xfs_ag.c
+++ b/libxfs/xfs_ag.c
@@ -982,7 +982,10 @@ xfs_ag_shrink_space(
 		if (err2 != -ENOSPC)
 			goto resv_err;
 
-		__xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true);
+		err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL,
+				true);
+		if (err2)
+			goto resv_err;
 
 		/*
 		 * Roll the transaction before trying to re-init the per-ag
diff --git a/libxfs/xfs_alloc.c b/libxfs/xfs_alloc.c
index 40a36efa..107a8492 100644
--- a/libxfs/xfs_alloc.c
+++ b/libxfs/xfs_alloc.c
@@ -2427,7 +2427,7 @@ xfs_agfl_reset(
  * the real allocation can proceed. Deferring the free disconnects freeing up
  * the AGFL slot from freeing the block.
  */
-STATIC void
+static int
 xfs_defer_agfl_block(
 	struct xfs_trans		*tp,
 	xfs_agnumber_t			agno,
@@ -2446,17 +2446,21 @@ xfs_defer_agfl_block(
 	xefi->xefi_blockcount = 1;
 	xefi->xefi_owner = oinfo->oi_owner;
 
+	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, xefi->xefi_startblock)))
+		return -EFSCORRUPTED;
+
 	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
 
 	xfs_extent_free_get_group(mp, xefi);
 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
+	return 0;
 }
 
 /*
  * Add the extent to the list of extents to be free at transaction end.
  * The list is maintained sorted (by block number).
  */
-void
+int
 __xfs_free_extent_later(
 	struct xfs_trans		*tp,
 	xfs_fsblock_t			bno,
@@ -2483,6 +2487,9 @@ __xfs_free_extent_later(
 #endif
 	ASSERT(xfs_extfree_item_cache != NULL);
 
+	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
+		return -EFSCORRUPTED;
+
 	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
 			GFP_KERNEL | __GFP_NOFAIL);
 	xefi->xefi_startblock = bno;
@@ -2506,6 +2513,7 @@ __xfs_free_extent_later(
 	xfs_extent_free_get_group(mp, xefi);
 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
+	return 0;
 }
 
 #ifdef DEBUG
@@ -2666,7 +2674,9 @@ xfs_alloc_fix_freelist(
 			goto out_agbp_relse;
 
 		/* defer agfl frees */
-		xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
+		error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
+		if (error)
+			goto out_agbp_relse;
 	}
 
 	targs.tp = tp;
diff --git a/libxfs/xfs_alloc.h b/libxfs/xfs_alloc.h
index 5dbb2554..85ac470b 100644
--- a/libxfs/xfs_alloc.h
+++ b/libxfs/xfs_alloc.h
@@ -230,7 +230,7 @@ xfs_buf_to_agfl_bno(
 	return bp->b_addr;
 }
 
-void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
+int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
 		xfs_filblks_t len, const struct xfs_owner_info *oinfo,
 		bool skip_discard);
@@ -254,14 +254,14 @@ void xfs_extent_free_get_group(struct xfs_mount *mp,
 #define XFS_EFI_ATTR_FORK	(1U << 1) /* freeing attr fork block */
 #define XFS_EFI_BMBT_BLOCK	(1U << 2) /* freeing bmap btree block */
 
-static inline void
+static inline int
 xfs_free_extent_later(
 	struct xfs_trans		*tp,
 	xfs_fsblock_t			bno,
 	xfs_filblks_t			len,
 	const struct xfs_owner_info	*oinfo)
 {
-	__xfs_free_extent_later(tp, bno, len, oinfo, false);
+	return __xfs_free_extent_later(tp, bno, len, oinfo, false);
 }
 
diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c
index 18e0006f..5deeb472 100644
--- a/libxfs/xfs_bmap.c
+++ b/libxfs/xfs_bmap.c
@@ -565,8 +565,12 @@ xfs_bmap_btree_to_extents(
 	cblock = XFS_BUF_TO_BLOCK(cbp);
 	if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
 		return error;
+
 	xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
-	xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
+	error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
+	if (error)
+		return error;
+
 	ip->i_nblocks--;
 	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
 	xfs_trans_binval(tp, cbp);
@@ -5223,10 +5227,12 @@ xfs_bmap_del_extent_real(
 		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
 			xfs_refcount_decrease_extent(tp, del);
 		} else {
-			__xfs_free_extent_later(tp, del->br_startblock,
+			error = __xfs_free_extent_later(tp, del->br_startblock,
 					del->br_blockcount, NULL,
 					(bflags & XFS_BMAPI_NODISCARD) ||
 					del->br_state == XFS_EXT_UNWRITTEN);
+			if (error)
+				goto done;
 		}
 	}
 
diff --git a/libxfs/xfs_bmap_btree.c b/libxfs/xfs_bmap_btree.c
index c87cb0f6..751e8165 100644
--- a/libxfs/xfs_bmap_btree.c
+++ b/libxfs/xfs_bmap_btree.c
@@ -266,11 +266,14 @@ xfs_bmbt_free_block(
 	struct xfs_trans	*tp = cur->bc_tp;
 	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
 	struct xfs_owner_info	oinfo;
+	int			error;
 
 	xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
-	xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
+	error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
+	if (error)
+		return error;
+
 	ip->i_nblocks--;
-
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
 	return 0;
diff --git a/libxfs/xfs_ialloc.c b/libxfs/xfs_ialloc.c
index ee292356..42d8863e 100644
--- a/libxfs/xfs_ialloc.c
+++ b/libxfs/xfs_ialloc.c
@@ -1829,7 +1829,7 @@ xfs_dialloc(
  * might be sparse and only free the regions that are allocated as part of the
  * chunk.
  */
-STATIC void
+static int
 xfs_difree_inode_chunk(
 	struct xfs_trans		*tp,
 	xfs_agnumber_t			agno,
@@ -1846,10 +1846,10 @@ xfs_difree_inode_chunk(
 
 	if (!xfs_inobt_issparse(rec->ir_holemask)) {
 		/* not sparse, calculate extent info directly */
-		xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
-				M_IGEO(mp)->ialloc_blks,
-				&XFS_RMAP_OINFO_INODES);
-		return;
+		return xfs_free_extent_later(tp,
+				XFS_AGB_TO_FSB(mp, agno, sagbno),
+				M_IGEO(mp)->ialloc_blks,
+				&XFS_RMAP_OINFO_INODES);
 	}
 
 	/* holemask is only 16-bits (fits in an unsigned long) */
@@ -1866,6 +1866,8 @@ xfs_difree_inode_chunk(
 					     XFS_INOBT_HOLEMASK_BITS);
 	nextbit = startidx + 1;
 	while (startidx < XFS_INOBT_HOLEMASK_BITS) {
+		int error;
+
 		nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
 					     nextbit);
 		/*
@@ -1891,8 +1893,11 @@ xfs_difree_inode_chunk(
 		ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
 		ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
 
-		xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
-				contigblk, &XFS_RMAP_OINFO_INODES);
+		error = xfs_free_extent_later(tp,
+				XFS_AGB_TO_FSB(mp, agno, agbno),
+				contigblk, &XFS_RMAP_OINFO_INODES);
+		if (error)
+			return error;
 
 		/* reset range to current bit and carry on... */
 		startidx = endidx = nextbit;
@@ -1900,6 +1905,7 @@
 next:
 		nextbit++;
 	}
+	return 0;
 }
 
 STATIC int
@@ -1998,7 +2004,9 @@ xfs_difree_inobt(
 			goto error0;
 		}
 
-		xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
+		error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
+		if (error)
+			goto error0;
 	} else {
 		xic->deleted = false;
diff --git a/libxfs/xfs_refcount.c b/libxfs/xfs_refcount.c
index a406d6b2..0006ad7c 100644
--- a/libxfs/xfs_refcount.c
+++ b/libxfs/xfs_refcount.c
@@ -1150,8 +1150,10 @@ xfs_refcount_adjust_extents(
 			fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
 					cur->bc_ag.pag->pag_agno,
 					tmp.rc_startblock);
-			xfs_free_extent_later(cur->bc_tp, fsbno,
+			error = xfs_free_extent_later(cur->bc_tp, fsbno,
 					tmp.rc_blockcount, NULL);
+			if (error)
+				goto out_error;
 		}
 
 		(*agbno) += tmp.rc_blockcount;
@@ -1209,8 +1211,10 @@ xfs_refcount_adjust_extents(
 			fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
 					cur->bc_ag.pag->pag_agno,
 					ext.rc_startblock);
-			xfs_free_extent_later(cur->bc_tp, fsbno,
+			error = xfs_free_extent_later(cur->bc_tp, fsbno,
 					ext.rc_blockcount, NULL);
+			if (error)
+				goto out_error;
 		}
 
 skip:
@@ -1975,7 +1979,10 @@ xfs_refcount_recover_cow_leftovers(
 				rr->rr_rrec.rc_blockcount);
 
 		/* Free the block. */
-		xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
+		error = xfs_free_extent_later(tp, fsb,
+				rr->rr_rrec.rc_blockcount, NULL);
+		if (error)
+			goto out_trans;
 
 		error = xfs_trans_commit(tp);
 		if (error)