[10/10] xfs_scrub: try spot repairs of metadata items to make scrub progress

Message ID	172229852487.1353240.15543838281635454310.stgit@frogsfrogsfrogs (mailing list archive)
State	Accepted, archived
Headers	show
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1B1FBC2C6 for <linux-xfs@vger.kernel.org>; Tue, 30 Jul 2024 01:32:41 +0000 (UTC)
Date: Mon, 29 Jul 2024 18:32:40 -0700
Subject: [PATCH 10/10] xfs_scrub: try spot repairs of metadata items to make scrub progress
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: Christoph Hellwig <hch@lst.de>, linux-xfs@vger.kernel.org
Message-ID: <172229852487.1353240.15543838281635454310.stgit@frogsfrogsfrogs>
In-Reply-To: <172229852355.1353240.6151017907178495656.stgit@frogsfrogsfrogs>
References: <172229852355.1353240.6151017907178495656.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 7bit
Series	[01/10] man: document vectored scrub mode | expand
[01/10] man: document vectored scrub mode
[02/10] libfrog: support vectored scrub
[03/10] xfs_io: support vectored scrub
[04/10] xfs_scrub: split the scrub epilogue code into a separate function
[05/10] xfs_scrub: split the repair epilogue code into a separate function
[06/10] xfs_scrub: convert scrub and repair epilogues to use xfs_scrub_vec
[07/10] xfs_scrub: vectorize scrub calls
[08/10] xfs_scrub: vectorize repair calls
[09/10] xfs_scrub: use scrub barriers to reduce kernel calls
[10/10] xfs_scrub: try spot repairs of metadata items to make scrub progress

diff --git a/scrub/phase2.c b/scrub/phase2.c
index d435da071..c24d13735 100644
--- a/scrub/phase2.c
+++ b/scrub/phase2.c
@@ -69,6 +69,53 @@ defer_fs_repair(
 	return 0;
 }
 
+/*
+ * If we couldn't check all the scheduled metadata items, try performing spot
+ * repairs until we check everything or stop making forward progress.
+ */
+static int
+repair_and_scrub_loop(
+	struct scrub_ctx	*ctx,
+	struct scrub_item	*sri,
+	const char		*descr,
+	bool			*defer)
+{
+	unsigned int		to_check;
+	int			ret;
+
+	*defer = false;
+	if (ctx->mode != SCRUB_MODE_REPAIR)
+		return 0;
+
+	to_check = scrub_item_count_needscheck(sri);
+	while (to_check > 0) {
+		unsigned int	nr;
+
+		ret = repair_item_corruption(ctx, sri);
+		if (ret)
+			return ret;
+
+		ret = scrub_item_check(ctx, sri);
+		if (ret)
+			return ret;
+
+		nr = scrub_item_count_needscheck(sri);
+		if (nr == to_check) {
+			/*
+			 * We cannot make forward scanning progress with this
+			 * metadata, so defer the rest until phase 4.
+			 */
+			str_info(ctx, descr,
+ _("Unable to make forward checking progress; will try again in phase 4."));
+			*defer = true;
+			return 0;
+		}
+		to_check = nr;
+	}
+
+	return 0;
+}
+
 /* Scrub each AG's metadata btrees. */
 static void
 scan_ag_metadata(
@@ -82,6 +129,7 @@ scan_ag_metadata(
 	struct scan_ctl		*sctl = arg;
 	char			descr[DESCR_BUFSZ];
 	unsigned int		difficulty;
+	bool			defer_repairs;
 	int			ret;
 
 	if (sctl->aborted)
@@ -97,10 +145,22 @@ scan_ag_metadata(
 	scrub_item_schedule_group(&sri, XFROG_SCRUB_GROUP_AGHEADER);
 	scrub_item_schedule_group(&sri, XFROG_SCRUB_GROUP_PERAG);
 
+	/*
+	 * Try to check all of the AG metadata items that we just scheduled.
+	 * If we return with some types still needing a check, try repairing
+	 * any damaged metadata that we've found so far, and try again. Abort
+	 * if we stop making forward progress.
+	 */
 	ret = scrub_item_check(ctx, &sri);
 	if (ret)
 		goto err;
 
+	ret = repair_and_scrub_loop(ctx, &sri, descr, &defer_repairs);
+	if (ret)
+		goto err;
+	if (defer_repairs)
+		goto defer;
+
 	/*
 	 * Figure out if we need to perform early fixing. The only
 	 * reason we need to do this is if the inobt is broken, which
@@ -117,6 +177,7 @@ scan_ag_metadata(
 	if (ret)
 		goto err;
 
+defer:
 	/* Everything else gets fixed during phase 4. */
 	ret = defer_fs_repair(ctx, &sri);
 	if (ret)
@@ -137,11 +198,18 @@ scan_fs_metadata(
 	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
 	struct scan_ctl		*sctl = arg;
 	unsigned int		difficulty;
+	bool			defer_repairs;
 	int			ret;
 
 	if (sctl->aborted)
 		goto out;
 
+	/*
+	 * Try to check all of the metadata files that we just scheduled. If
+	 * we return with some types still needing a check, try repairing any
+	 * damaged metadata that we've found so far, and try again. Abort if
+	 * we stop making forward progress.
+	 */
 	scrub_item_init_fs(&sri);
 	scrub_item_schedule(&sri, type);
 	ret = scrub_item_check(ctx, &sri);
@@ -150,10 +218,20 @@ scan_fs_metadata(
 		goto out;
 	}
 
+	ret = repair_and_scrub_loop(ctx, &sri, xfrog_scrubbers[type].descr,
+			&defer_repairs);
+	if (ret) {
+		sctl->aborted = true;
+		goto out;
+	}
+	if (defer_repairs)
+		goto defer;
+
 	/* Complain about metadata corruptions that might not be fixable. */
 	difficulty = repair_item_difficulty(&sri);
 	warn_repair_difficulties(ctx, difficulty, xfrog_scrubbers[type].descr);
 
+defer:
 	ret = defer_fs_repair(ctx, &sri);
 	if (ret) {
 		sctl->aborted = true;
diff --git a/scrub/phase3.c b/scrub/phase3.c
index 09a1ea452..046a42c1d 100644
--- a/scrub/phase3.c
+++ b/scrub/phase3.c
@@ -99,6 +99,58 @@ try_inode_repair(
 	return repair_file_corruption(ictx->ctx, sri, fd);
 }
 
+/*
+ * If we couldn't check all the scheduled file metadata items, try performing
+ * spot repairs until we check everything or stop making forward progress.
+ */
+static int
+repair_and_scrub_inode_loop(
+	struct scrub_ctx	*ctx,
+	struct xfs_bulkstat	*bstat,
+	int			fd,
+	struct scrub_item	*sri,
+	bool			*defer)
+{
+	unsigned int		to_check;
+	int			error;
+
+	*defer = false;
+	if (ctx->mode != SCRUB_MODE_REPAIR)
+		return 0;
+
+	to_check = scrub_item_count_needscheck(sri);
+	while (to_check > 0) {
+		unsigned int	nr;
+
+		error = repair_file_corruption(ctx, sri, fd);
+		if (error)
+			return error;
+
+		error = scrub_item_check_file(ctx, sri, fd);
+		if (error)
+			return error;
+
+		nr = scrub_item_count_needscheck(sri);
+		if (nr == to_check) {
+			char	descr[DESCR_BUFSZ];
+
+			/*
+			 * We cannot make forward scanning progress with this
+			 * inode, so defer the rest until phase 4.
+			 */
+			scrub_render_ino_descr(ctx, descr, DESCR_BUFSZ,
+					bstat->bs_ino, bstat->bs_gen, NULL);
+			str_info(ctx, descr,
+ _("Unable to make forward checking progress; will try again in phase 4."));
+			*defer = true;
+			return 0;
+		}
+		to_check = nr;
+	}
+
+	return 0;
+}
+
 /* Verify the contents, xattrs, and extent maps of an inode. */
 static int
 scrub_inode(
@@ -169,11 +221,28 @@ scrub_inode(
 	scrub_item_schedule(&sri, XFS_SCRUB_TYPE_XATTR);
 	scrub_item_schedule(&sri, XFS_SCRUB_TYPE_PARENT);
 
-	/* Try to check and repair the file while it's open. */
+	/*
+	 * Try to check all of the metadata items that we just scheduled. If
+	 * we return with some types still needing a check and the space
+	 * metadata isn't also in need of repairs, try repairing any damaged
+	 * file metadata that we've found so far, and try checking the file
+	 * again. Worst case, defer the repairs and the checks to phase 4 if
+	 * we can't make any progress on anything.
+	 */
 	error = scrub_item_check_file(ctx, &sri, fd);
 	if (error)
 		goto out;
 
+	if (!ictx->always_defer_repairs) {
+		bool	defer_repairs;
+
+		error = repair_and_scrub_inode_loop(ctx, bstat, fd, &sri,
+				&defer_repairs);
+		if (error || defer_repairs)
+			goto out;
+	}
+
+	/* Try to repair the file while it's open. */
 	error = try_inode_repair(ictx, &sri, fd);
 	if (error)
 		goto out;
diff --git a/scrub/repair.c b/scrub/repair.c
index 8a28f6b13..e594e704f 100644
--- a/scrub/repair.c
+++ b/scrub/repair.c
@@ -860,6 +860,7 @@ repair_item_to_action_item(
 	struct action_item	**aitemp)
 {
 	struct action_item	*aitem;
+	unsigned int		scrub_type;
 
 	if (repair_item_count_needsrepair(sri) == 0)
 		return 0;
@@ -875,6 +876,20 @@ repair_item_to_action_item(
 	INIT_LIST_HEAD(&aitem->list);
 	memcpy(&aitem->sri, sri, sizeof(struct scrub_item));
 
+	/*
+	 * If the scrub item indicates that there is unchecked metadata, assume
+	 * that the scrub type checker depends on something that couldn't be
+	 * fixed. Mark that type as corrupt so that phase 4 will try it again.
+	 */
+	foreach_scrub_type(scrub_type) {
+		__u8	*state = aitem->sri.sri_state;
+
+		if (state[scrub_type] & SCRUB_ITEM_NEEDSCHECK) {
+			state[scrub_type] &= ~SCRUB_ITEM_NEEDSCHECK;
+			state[scrub_type] |= SCRUB_ITEM_CORRUPT;
+		}
+	}
+
 	*aitemp = aitem;
 	return 0;
 }

[10/10] xfs_scrub: try spot repairs of metadata items to make scrub progress

Commit Message

Patch