From patchwork Fri May 26 01:47:46 2023
X-Patchwork-Submitter: "Darrick J. Wong"
X-Patchwork-Id: 13256147
Date: Thu, 25 May 2023 18:47:46 -0700
Subject: [PATCH 1/4] libfrog: enhance ptvar to support initializer functions
From: "Darrick J. Wong"
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <168506072765.3744428.13292568255598716550.stgit@frogsfrogsfrogs>
In-Reply-To: <168506072752.3744428.6237393655315422413.stgit@frogsfrogsfrogs>
References: <168506072752.3744428.6237393655315422413.stgit@frogsfrogsfrogs>

From: Darrick J. Wong

Modify the per-thread variable code to support passing in an initializer
function that will set up each thread's variable space when it is claimed.

Signed-off-by: Darrick J.
Wong --- libfrog/ptvar.c | 9 ++++++++- libfrog/ptvar.h | 4 +++- scrub/counter.c | 2 +- scrub/descr.c | 2 +- scrub/phase7.c | 2 +- scrub/read_verify.c | 2 +- 6 files changed, 15 insertions(+), 6 deletions(-) diff --git a/libfrog/ptvar.c b/libfrog/ptvar.c index 7ac8c541862..9d5ae6bc8e3 100644 --- a/libfrog/ptvar.c +++ b/libfrog/ptvar.c @@ -26,6 +26,7 @@ struct ptvar { pthread_key_t key; pthread_mutex_t lock; + ptvar_init_fn init_fn; size_t nr_used; size_t nr_counters; size_t data_size; @@ -38,6 +39,7 @@ int ptvar_alloc( size_t nr, size_t size, + ptvar_init_fn init_fn, struct ptvar **pptv) { struct ptvar *ptv; @@ -58,6 +60,7 @@ ptvar_alloc( ptv->data_size = size; ptv->nr_counters = nr; ptv->nr_used = 0; + ptv->init_fn = init_fn; memset(ptv->data, 0, nr * size); ret = -pthread_mutex_init(&ptv->lock, NULL); if (ret) @@ -98,11 +101,15 @@ ptvar_get( if (!p) { pthread_mutex_lock(&ptv->lock); assert(ptv->nr_used < ptv->nr_counters); - p = &ptv->data[(ptv->nr_used++) * ptv->data_size]; + p = &ptv->data[ptv->nr_used * ptv->data_size]; ret = -pthread_setspecific(ptv->key, p); if (ret) goto out_unlock; + ptv->nr_used++; pthread_mutex_unlock(&ptv->lock); + + if (ptv->init_fn) + ptv->init_fn(p); } *retp = 0; return p; diff --git a/libfrog/ptvar.h b/libfrog/ptvar.h index b7d02d6269e..e4a181ffe76 100644 --- a/libfrog/ptvar.h +++ b/libfrog/ptvar.h @@ -8,7 +8,9 @@ struct ptvar; -int ptvar_alloc(size_t nr, size_t size, struct ptvar **pptv); +typedef void (*ptvar_init_fn)(void *data); +int ptvar_alloc(size_t nr, size_t size, ptvar_init_fn init_fn, + struct ptvar **pptv); void ptvar_free(struct ptvar *ptv); void *ptvar_get(struct ptvar *ptv, int *ret); diff --git a/scrub/counter.c b/scrub/counter.c index 20d15365514..ed12cc42856 100644 --- a/scrub/counter.c +++ b/scrub/counter.c @@ -38,7 +38,7 @@ ptcounter_alloc( p = malloc(sizeof(struct ptcounter)); if (!p) return errno; - ret = -ptvar_alloc(nr, sizeof(uint64_t), &p->var); + ret = -ptvar_alloc(nr, sizeof(uint64_t), NULL, &p->var); if (ret) { free(p); return ret; diff --git a/scrub/descr.c b/scrub/descr.c index 77d5378ec3f..88ca5d95a78 100644 --- a/scrub/descr.c +++ b/scrub/descr.c @@ -89,7 +89,7 @@ descr_init_phase( int ret; assert(descr_ptvar == NULL); - ret = -ptvar_alloc(nr_threads, DESCR_BUFSZ, &descr_ptvar); + ret = -ptvar_alloc(nr_threads, DESCR_BUFSZ, NULL, &descr_ptvar); if (ret) str_liberror(ctx, ret, _("creating description buffer")); diff --git a/scrub/phase7.c b/scrub/phase7.c index 6476ba0cc92..ad9cba92bfd 100644 --- a/scrub/phase7.c +++ b/scrub/phase7.c @@ -136,7 +136,7 @@ phase7_func( } error = -ptvar_alloc(scrub_nproc(ctx), sizeof(struct summary_counts), - &ptvar); + NULL, &ptvar); if (error) { str_liberror(ctx, error, _("setting up block counter")); return error; diff --git a/scrub/read_verify.c b/scrub/read_verify.c index e950efdc4e2..131b9d9f7d4 100644 --- a/scrub/read_verify.c +++ b/scrub/read_verify.c @@ -120,7 +120,7 @@ read_verify_pool_alloc( rvp->disk = disk; rvp->ioerr_fn = ioerr_fn; ret = -ptvar_alloc(submitter_threads, sizeof(struct read_verify), - &rvp->rvstate); + NULL, &rvp->rvstate); if (ret) goto out_counter; ret = -workqueue_create(&rvp->wq, (struct xfs_mount *)rvp, From patchwork Fri May 26 01:48:01 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13256148 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 38576C77B7E for ; Fri, 26 May 2023 01:48:07 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231712AbjEZBsG (ORCPT ); Thu, 25 May 2023 21:48:06 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:39572 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S230297AbjEZBsG (ORCPT ); Thu, 25 May 2023 21:48:06 -0400 Received: from dfw.source.kernel.org (dfw.source.kernel.org [IPv6:2604:1380:4641:c500::1]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 6E7FB189 for ; Thu, 25 May 2023 18:48:03 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dfw.source.kernel.org (Postfix) with ESMTPS id E8FBB64C19 for ; Fri, 26 May 2023 01:48:02 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 4F452C433EF; Fri, 26 May 2023 01:48:02 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1685065682; bh=skz6LcFdfSbWykM1h6MHG4FeFF/YBJTSXgMt96HDT1o=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=MZOf3mnFrD109jRmE6MNutxqFuCmr26jXCREf1gb/zlMUX8XNbVqJ0jDqpUEZb9Wj mPiZfbShHbgirtCqsFeQz/2HuJHqBy41tmIhJ/2Ggmxs/nFuASZ3UPKBY/JE9Later /aUY2jayOgwPAb37j5OrUq0IS1NUIsuF7nfTMIA88jPWl+S5rVwRJQ867+ZlO3dT8b FO1a3zVsg34xp+CIZZmzKS40sXMTRjtjr1F0Hgyrv2tvkBBMEFkYAcfGC4YNo1uy8B /Y2r/e97LJKmtDHv9K3TdBOBtS4QZbPshTiTw0QUCz2iAtwpddNXPJNV/yoWqIcTUC CjyqrqxkECi7A== Date: Thu, 25 May 2023 18:48:01 -0700 Subject: [PATCH 2/4] xfs_scrub: improve thread scheduling repair items during phase 4 From: "Darrick J. Wong" To: djwong@kernel.org, cem@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <168506072779.3744428.14221328041885064782.stgit@frogsfrogsfrogs> In-Reply-To: <168506072752.3744428.6237393655315422413.stgit@frogsfrogsfrogs> References: <168506072752.3744428.6237393655315422413.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong As it stands, xfs_scrub doesn't do a good job of scheduling repair items during phase 4. The repair lists are sharded by AG, and one repair worker is started for each per-AG repair list. Consequently, if one AG requires considerably more work than the others (e.g. inodes are not spread evenly among the AGs) then phase 4 can stall waiting for that one worker thread when there's still plenty of CPU power available. While our initial assumptions were that repairs would be vanishingly scarce, the reality is that "repairs" can be triggered for optimizations like gaps in the xattr structures, or clearing the inode reflink flag on inodes that no longer share data. In real world testing scenarios, the lack of balance leads to complaints about excessive runtime of xfs_scrub. To fix these balance problems, we replace the per-AG repair item lists in the scrub context with a single repair item list. Phase 4 will be redesigned as follows: The repair worker will grab a repair item from the main list, try to repair it, record whether the repair attempt made any progress, and requeue the item if it was not fully fixed. 
A separate repair scheduler function starts the repair workers, and waits for them all to complete. Requeued repairs are merged back into the main repair list. If we made any forward progress, we'll start another round of repairs with the repair workers. Phase 4 retains the behavior that if the pool stops making forward progress, it will try all the repairs one last time, serially. To facilitate this new design, phase 2 will queue repairs of space metadata items directly to the main list. Phase 3's worker threads will queue repair items to per-thread lists and splice those lists into the main list at the end. On a filesystem crafted to put all the inodes in a single AG, this restores xfs_scrub's ability to parallelize repairs. There seems to be a slight performance hit for the evenly-spread case, but avoiding a performance cliff due to an unbalanced fs is more important here. Signed-off-by: Darrick J. Wong --- include/list.h | 14 +++ scrub/phase1.c | 8 +- scrub/phase2.c | 23 +++++ scrub/phase3.c | 106 ++++++++++++++++--------- scrub/phase4.c | 226 ++++++++++++++++++++++++++++++++++++++--------------- scrub/repair.c | 135 +++++++++++++++++--------------- scrub/repair.h | 37 +++++++-- scrub/xfs_scrub.h | 2 8 files changed, 377 insertions(+), 174 deletions(-) diff --git a/include/list.h b/include/list.h index dab4e23bd10..b1e2d49c8f3 100644 --- a/include/list.h +++ b/include/list.h @@ -152,6 +152,20 @@ static inline void list_splice_init(struct list_head *list, #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) +/** + * list_first_entry_or_null - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * Note that if the list is empty, it returns NULL. + */ +#define list_first_entry_or_null(ptr, type, member) ({ \ + struct list_head *head__ = (ptr); \ + struct list_head *pos__ = head__->next; \ + pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ +}) + #define container_of(ptr, type, member) ({ \ const typeof( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type,member) );}) diff --git a/scrub/phase1.c b/scrub/phase1.c index 09edd05625a..97b4386096e 100644 --- a/scrub/phase1.c +++ b/scrub/phase1.c @@ -89,7 +89,8 @@ scrub_cleanup( if (error) return error; - action_lists_free(&ctx->action_lists); + action_list_free(&ctx->action_list); + if (ctx->fshandle) free_handle(ctx->fshandle, ctx->fshandle_len); if (ctx->rtdev) @@ -185,10 +186,9 @@ _("Not an XFS filesystem.")); return error; } - error = action_lists_alloc(ctx->mnt.fsgeom.agcount, - &ctx->action_lists); + error = action_list_alloc(&ctx->action_list); if (error) { - str_liberror(ctx, error, _("allocating action lists")); + str_liberror(ctx, error, _("allocating repair list")); return error; } diff --git a/scrub/phase2.c b/scrub/phase2.c index 8fe9cd4c831..a40545f6daf 100644 --- a/scrub/phase2.c +++ b/scrub/phase2.c @@ -50,6 +50,25 @@ warn_repair_difficulties( str_info(ctx, descr, _("Filesystem might not be repairable.")); } +/* Add a scrub item that needs more work to fs metadata repair list. 
*/ +static int +defer_fs_repair( + struct scrub_ctx *ctx, + const struct scrub_item *sri) +{ + struct action_item *aitem = NULL; + int error; + + error = repair_item_to_action_item(ctx, sri, &aitem); + if (error || !aitem) + return error; + + pthread_mutex_lock(&ctx->lock); + action_list_add(ctx->action_list, aitem); + pthread_mutex_unlock(&ctx->lock); + return 0; +} + /* Scrub each AG's metadata btrees. */ static void scan_ag_metadata( @@ -108,7 +127,7 @@ scan_ag_metadata( goto err; /* Everything else gets fixed during phase 4. */ - ret = repair_item_defer(ctx, &sri); + ret = defer_fs_repair(ctx, &sri); if (ret) goto err; return; @@ -144,7 +163,7 @@ scan_metafile( difficulty = repair_item_difficulty(&sri); warn_repair_difficulties(ctx, difficulty, xfrog_scrubbers[type].descr); - ret = repair_item_defer(ctx, &sri); + ret = defer_fs_repair(ctx, &sri); if (ret) { sctl->aborted = true; goto out; diff --git a/scrub/phase3.c b/scrub/phase3.c index 93f503818f5..ad2723f9ae3 100644 --- a/scrub/phase3.c +++ b/scrub/phase3.c @@ -10,6 +10,7 @@ #include "list.h" #include "libfrog/paths.h" #include "libfrog/workqueue.h" +#include "libfrog/ptvar.h" #include "xfs_scrub.h" #include "common.h" #include "counter.h" @@ -26,8 +27,8 @@ struct scrub_inode_ctx { /* Number of inodes scanned. */ struct ptcounter *icount; - /* per-AG locks to protect the repair lists */ - pthread_mutex_t *locks; + /* Per-thread lists of file repair items. */ + struct ptvar *repair_ptlists; /* Set to true to abort all threads. */ bool aborted; @@ -51,28 +52,28 @@ report_close_error( str_errno(ctx, descr); } -/* - * Defer all the repairs until phase 4, being careful about locking since the - * inode scrub threads are not per-AG. - */ +/* Defer all the repairs until phase 4. */ static int defer_inode_repair( struct scrub_inode_ctx *ictx, - const struct xfs_bulkstat *bstat, - struct scrub_item *sri) + const struct scrub_item *sri) { + struct action_list *alist; struct action_item *aitem = NULL; - xfs_agnumber_t agno; int ret; ret = repair_item_to_action_item(ictx->ctx, sri, &aitem); if (ret || !aitem) return ret; - agno = cvt_ino_to_agno(&ictx->ctx->mnt, bstat->bs_ino); - pthread_mutex_lock(&ictx->locks[agno]); - action_list_add(&ictx->ctx->action_lists[agno], aitem); - pthread_mutex_unlock(&ictx->locks[agno]); + alist = ptvar_get(ictx->repair_ptlists, &ret); + if (ret) { + str_liberror(ictx->ctx, ret, + _("getting per-thread inode repair list")); + return ret; + } + + action_list_add(alist, aitem); return 0; } @@ -81,8 +82,7 @@ static int try_inode_repair( struct scrub_inode_ctx *ictx, struct scrub_item *sri, - int fd, - const struct xfs_bulkstat *bstat) + int fd) { /* * If at the start of phase 3 we already had ag/rt metadata repairs @@ -149,7 +149,7 @@ scrub_inode( if (error) goto out; - error = try_inode_repair(ictx, &sri, fd, bstat); + error = try_inode_repair(ictx, &sri, fd); if (error) goto out; @@ -161,7 +161,7 @@ scrub_inode( if (error) goto out; - error = try_inode_repair(ictx, &sri, fd, bstat); + error = try_inode_repair(ictx, &sri, fd); if (error) goto out; @@ -178,7 +178,7 @@ scrub_inode( goto out; /* Try to repair the file while it's open. 
*/ - error = try_inode_repair(ictx, &sri, fd, bstat); + error = try_inode_repair(ictx, &sri, fd); if (error) goto out; @@ -195,7 +195,7 @@ scrub_inode( progress_add(1); if (!error && !ictx->aborted) - error = defer_inode_repair(ictx, bstat, &sri); + error = defer_inode_repair(ictx, &sri); if (fd >= 0) { int err2; @@ -212,6 +212,33 @@ scrub_inode( return error; } +/* + * Collect all the inode repairs in the file repair list. No need for locks + * here, since we're single-threaded. + */ +static int +collect_repairs( + struct ptvar *ptv, + void *data, + void *foreach_arg) +{ + struct scrub_ctx *ctx = foreach_arg; + struct action_list *alist = data; + + action_list_merge(ctx->action_list, alist); + return 0; +} + +/* Initialize this per-thread file repair item list. */ +static void +action_ptlist_init( + void *priv) +{ + struct action_list *alist = priv; + + action_list_init(alist); +} + /* Verify all the inodes in a filesystem. */ int phase3_func( @@ -222,17 +249,18 @@ phase3_func( xfs_agnumber_t agno; int err; + err = -ptvar_alloc(scrub_nproc(ctx), sizeof(struct action_list), + action_ptlist_init, &ictx.repair_ptlists); + if (err) { + str_liberror(ctx, err, + _("creating per-thread file repair item lists")); + return err; + } + err = ptcounter_alloc(scrub_nproc(ctx), &ictx.icount); if (err) { str_liberror(ctx, err, _("creating scanned inode counter")); - return err; - } - - ictx.locks = calloc(ctx->mnt.fsgeom.agcount, sizeof(pthread_mutex_t)); - if (!ictx.locks) { - str_errno(ctx, _("creating per-AG repair list locks")); - err = ENOMEM; - goto out_ptcounter; + goto out_ptvar; } /* @@ -241,9 +269,7 @@ phase3_func( * to repair the space metadata. */ for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) { - pthread_mutex_init(&ictx.locks[agno], NULL); - - if (!action_list_empty(&ctx->action_lists[agno])) + if (!action_list_empty(ctx->action_list)) ictx.always_defer_repairs = true; } @@ -251,22 +277,30 @@ phase3_func( if (!err && ictx.aborted) err = ECANCELED; if (err) - goto out_locks; + goto out_ptcounter; + + /* + * Combine all of the file repair items into the main repair list. + * We don't need locks here since we're the only thread running now. + */ + err = -ptvar_foreach(ictx.repair_ptlists, collect_repairs, ctx); + if (err) { + str_liberror(ctx, err, _("collecting inode repair lists")); + goto out_ptcounter; + } scrub_report_preen_triggers(ctx); err = ptcounter_value(ictx.icount, &val); if (err) { str_liberror(ctx, err, _("summing scanned inode counter")); - goto out_locks; + goto out_ptcounter; } ctx->inodes_checked = val; -out_locks: - for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) - pthread_mutex_destroy(&ictx.locks[agno]); - free(ictx.locks); out_ptcounter: ptcounter_free(ictx.icount); +out_ptvar: + ptvar_free(ictx.repair_ptlists); return err; } diff --git a/scrub/phase4.c b/scrub/phase4.c index b3491f27312..61405dacec4 100644 --- a/scrub/phase4.c +++ b/scrub/phase4.c @@ -17,57 +17,166 @@ #include "scrub.h" #include "repair.h" #include "vfs.h" +#include "atomic.h" /* Phase 4: Repair filesystem. */ -/* Fix all the problems in our per-AG list. */ +struct repair_list_schedule { + struct action_list *repair_list; + + /* Action items that we could not resolve and want to try again. */ + struct action_list requeue_list; + + pthread_mutex_t lock; + + /* Workers use this to signal the scheduler when all work is done. */ + pthread_cond_t done; + + /* Number of workers that are still running. */ + unsigned int workers; + + /* Or should we all abort? 
*/ + bool aborted; + + /* Did we make any progress this round? */ + bool made_progress; +}; + +/* Try to repair as many things on our list as we can. */ static void -repair_ag( +repair_list_worker( struct workqueue *wq, xfs_agnumber_t agno, void *priv) { + struct repair_list_schedule *rls = priv; struct scrub_ctx *ctx = (struct scrub_ctx *)wq->wq_ctx; - bool *aborted = priv; - struct action_list *alist; - unsigned long long unfixed; - unsigned long long new_unfixed; - unsigned int flags = 0; - int ret; - - alist = &ctx->action_lists[agno]; - unfixed = action_list_length(alist); - - /* Repair anything broken until we fail to make progress. */ - do { - ret = action_list_process(ctx, alist, flags); + + pthread_mutex_lock(&rls->lock); + while (!rls->aborted) { + struct action_item *aitem; + enum tryrepair_outcome outcome; + int ret; + + aitem = action_list_pop(rls->repair_list); + if (!aitem) + break; + + pthread_mutex_unlock(&rls->lock); + ret = action_item_try_repair(ctx, aitem, &outcome); + pthread_mutex_lock(&rls->lock); + if (ret) { - *aborted = true; - return; + rls->aborted = true; + free(aitem); + break; } - new_unfixed = action_list_length(alist); - if (new_unfixed == unfixed) + + switch (outcome) { + case TR_REQUEUE: + /* + * Partial progress. Make a note of that and requeue + * this item for the next round. + */ + rls->made_progress = true; + action_list_add(&rls->requeue_list, aitem); + break; + case TR_NOPROGRESS: + /* + * No progress. Requeue this item for a later round, + * which could happen if something else makes progress. + */ + action_list_add(&rls->requeue_list, aitem); break; - unfixed = new_unfixed; - if (*aborted) - return; - } while (unfixed > 0); - - /* Try once more, but this time complain if we can't fix things. */ - flags |= XRM_FINAL_WARNING; - ret = action_list_process(ctx, alist, flags); - if (ret) - *aborted = true; + case TR_REPAIRED: + /* Item is clean. Free it. */ + free(aitem); + break; + } + } + + rls->workers--; + if (rls->workers == 0) + pthread_cond_broadcast(&rls->done); + pthread_mutex_unlock(&rls->lock); +} + +/* + * Schedule repair list workers. Returns 1 if we made progress, 0 if we + * did not, or -1 if we need to abort everything. + */ +static int +repair_list_schedule( + struct scrub_ctx *ctx, + struct workqueue *wq, + struct action_list *repair_list) +{ + struct repair_list_schedule rls = { + .lock = PTHREAD_MUTEX_INITIALIZER, + .done = PTHREAD_COND_INITIALIZER, + .repair_list = repair_list, + }; + unsigned int i; + unsigned int nr_workers = scrub_nproc(ctx); + bool made_any_progress = false; + int ret = 0; + + if (action_list_empty(repair_list)) + return 0; + + action_list_init(&rls.requeue_list); + + /* + * Use the workers to run through the entire repair list once. Requeue + * anything that did not make progress, and keep trying as long as the + * workers made any kind of progress. + */ + do { + rls.made_progress = false; + + /* Start all the worker threads. */ + for (i = 0; i < nr_workers; i++) { + pthread_mutex_lock(&rls.lock); + rls.workers++; + pthread_mutex_unlock(&rls.lock); + + ret = -workqueue_add(wq, repair_list_worker, 0, &rls); + if (ret) { + str_liberror(ctx, ret, + _("queueing repair list worker")); + pthread_mutex_lock(&rls.lock); + rls.workers--; + pthread_mutex_unlock(&rls.lock); + break; + } + } + + /* Wait for all worker functions to return. 
*/ + pthread_mutex_lock(&rls.lock); + if (rls.workers > 0) + pthread_cond_wait(&rls.done, &rls.lock); + pthread_mutex_unlock(&rls.lock); + + action_list_merge(repair_list, &rls.requeue_list); + + if (ret || rls.aborted) + return -1; + if (rls.made_progress) + made_any_progress = true; + } while (rls.made_progress && !action_list_empty(repair_list)); + + if (made_any_progress) + return 1; + return 0; } -/* Process all the action items. */ +/* Process both repair lists. */ static int repair_everything( struct scrub_ctx *ctx) { struct workqueue wq; - xfs_agnumber_t agno; - bool aborted = false; + int fixed_anything; int ret; ret = -workqueue_create(&wq, (struct xfs_mount *)ctx, @@ -76,41 +185,32 @@ repair_everything( str_liberror(ctx, ret, _("creating repair workqueue")); return ret; } - for (agno = 0; !aborted && agno < ctx->mnt.fsgeom.agcount; agno++) { - if (action_list_length(&ctx->action_lists[agno]) == 0) - continue; - ret = -workqueue_add(&wq, repair_ag, agno, &aborted); - if (ret) { - str_liberror(ctx, ret, _("queueing repair work")); + /* + * Try to fix everything on the space metadata repair list and then the + * file repair list until we stop making progress. These repairs can + * be threaded, if the user desires. + */ + do { + fixed_anything = 0; + + ret = repair_list_schedule(ctx, &wq, ctx->action_list); + if (ret < 0) break; - } - } + if (ret == 1) + fixed_anything++; + } while (fixed_anything > 0); ret = -workqueue_terminate(&wq); if (ret) str_liberror(ctx, ret, _("finishing repair work")); workqueue_destroy(&wq); - if (aborted) - return ECANCELED; + if (ret < 0) + return ret; - return 0; -} - -/* Decide if we have any repair work to do. */ -static inline bool -have_action_items( - struct scrub_ctx *ctx) -{ - xfs_agnumber_t agno; - - for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) { - if (action_list_length(&ctx->action_lists[agno]) > 0) - return true; - } - - return false; + /* Repair everything serially. Last chance to fix things. */ + return action_list_process(ctx, ctx->action_list, XRM_FINAL_WARNING); } /* Trim the unused areas of the filesystem if the caller asked us to. */ @@ -132,7 +232,7 @@ phase4_func( struct scrub_item sri; int ret; - if (!have_action_items(ctx)) + if (action_list_empty(ctx->action_list)) goto maybe_trim; /* @@ -190,12 +290,12 @@ phase4_estimate( unsigned int *nr_threads, int *rshift) { - xfs_agnumber_t agno; - unsigned long long need_fixing = 0; + unsigned long long need_fixing; - for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) - need_fixing += action_list_length(&ctx->action_lists[agno]); + /* Everything on the repair list plus FSTRIM. */ + need_fixing = action_list_length(ctx->action_list); need_fixing++; + *items = need_fixing; *nr_threads = scrub_nproc(ctx) + 1; *rshift = 0; diff --git a/scrub/repair.c b/scrub/repair.c index b49f2c7a8e1..b1395275160 100644 --- a/scrub/repair.c +++ b/scrub/repair.c @@ -396,58 +396,41 @@ repair_item_difficulty( return ret; } -/* - * Allocate a certain number of repair lists for the scrub context. Returns - * zero or a positive error number. - */ +/* Create a new repair action list. 
*/ int -action_lists_alloc( - size_t nr, - struct action_list **listsp) +action_list_alloc( + struct action_list **listp) { - struct action_list *lists; - xfs_agnumber_t agno; + struct action_list *alist; - lists = calloc(nr, sizeof(struct action_list)); - if (!lists) + alist = malloc(sizeof(struct action_list)); + if (!alist) return errno; - for (agno = 0; agno < nr; agno++) - action_list_init(&lists[agno]); - *listsp = lists; - + action_list_init(alist); + *listp = alist; return 0; } -/* Discard repair list contents. */ +/* Free the repair lists. */ void -action_list_discard( - struct action_list *alist) +action_list_free( + struct action_list **listp) { + struct action_list *alist = *listp; struct action_item *aitem; struct action_item *n; + if (!(*listp)) + return; + list_for_each_entry_safe(aitem, n, &alist->list, list) { list_del(&aitem->list); free(aitem); } -} -/* Free the repair lists. */ -void -action_lists_free( - struct action_list **listsp) -{ - free(*listsp); - *listsp = NULL; -} - -/* Initialize repair list */ -void -action_list_init( - struct action_list *alist) -{ - INIT_LIST_HEAD(&alist->list); + free(alist); + *listp = NULL; } /* Number of pending repairs in this list. */ @@ -464,7 +447,23 @@ action_list_length( return ret; } -/* Add to the list of repairs. */ +/* Remove the first action item from the action list. */ +struct action_item * +action_list_pop( + struct action_list *alist) +{ + struct action_item *aitem; + + aitem = list_first_entry_or_null(&alist->list, struct action_item, + list); + if (!aitem) + return NULL; + + list_del_init(&aitem->list); + return aitem; +} + +/* Add an action item to the end of a list. */ void action_list_add( struct action_list *alist, @@ -473,6 +472,46 @@ action_list_add( list_add_tail(&aitem->list, &alist->list); } +/* + * Try to repair a filesystem object and let the caller know what it should do + * with the action item. The caller must be able to requeue action items, so + * we don't complain if repairs are not totally successful. + */ +int +action_item_try_repair( + struct scrub_ctx *ctx, + struct action_item *aitem, + enum tryrepair_outcome *outcome) +{ + struct scrub_item *sri = &aitem->sri; + unsigned int before, after; + int ret; + + before = repair_item_count_needsrepair(sri); + + ret = repair_item(ctx, sri, 0); + if (ret) + return ret; + + after = repair_item_count_needsrepair(sri); + if (after > 0) { + /* + * The kernel did not complete all of the repairs requested. + * If it made some progress we'll requeue; otherwise, let the + * caller know that nothing got fixed. + */ + if (before != after) + *outcome = TR_REQUEUE; + else + *outcome = TR_NOPROGRESS; + return 0; + } + + /* Repairs complete. */ + *outcome = TR_REPAIRED; + return 0; +} + /* Repair everything on this list. */ int action_list_process( @@ -676,29 +715,3 @@ repair_item_to_action_item( *aitemp = aitem; return 0; } - -/* Defer all the repairs until phase 4. 
*/ -int -repair_item_defer( - struct scrub_ctx *ctx, - const struct scrub_item *sri) -{ - struct action_item *aitem = NULL; - unsigned int agno; - int error; - - error = repair_item_to_action_item(ctx, sri, &aitem); - if (error || !aitem) - return error; - - if (sri->sri_agno != -1U) - agno = sri->sri_agno; - else if (sri->sri_ino != -1ULL && sri->sri_gen != -1U) - agno = cvt_ino_to_agno(&ctx->mnt, sri->sri_ino); - else - agno = 0; - ASSERT(agno < ctx->mnt.fsgeom.agcount); - - action_list_add(&ctx->action_lists[agno], aitem); - return 0; -} diff --git a/scrub/repair.h b/scrub/repair.h index 185e539d53c..2ca00bef64a 100644 --- a/scrub/repair.h +++ b/scrub/repair.h @@ -12,19 +12,43 @@ struct action_list { struct action_item; -int action_lists_alloc(size_t nr, struct action_list **listsp); -void action_lists_free(struct action_list **listsp); +int action_list_alloc(struct action_list **listp); +void action_list_free(struct action_list **listp); +static inline void action_list_init(struct action_list *alist) +{ + INIT_LIST_HEAD(&alist->list); +} -void action_list_init(struct action_list *alist); +unsigned long long action_list_length(struct action_list *alist); + +/* Move all the items of @src to the tail of @dst, and reinitialize @src. */ +static inline void +action_list_merge( + struct action_list *dst, + struct action_list *src) +{ + list_splice_tail_init(&src->list, &dst->list); +} + +struct action_item *action_list_pop(struct action_list *alist); +void action_list_add(struct action_list *alist, struct action_item *aitem); static inline bool action_list_empty(const struct action_list *alist) { return list_empty(&alist->list); } -unsigned long long action_list_length(struct action_list *alist); -void action_list_add(struct action_list *dest, struct action_item *item); -void action_list_discard(struct action_list *alist); +enum tryrepair_outcome { + /* No progress was made on repairs at all. */ + TR_NOPROGRESS = 0, + /* Some progress was made on repairs; try again soon. */ + TR_REQUEUE, + /* Repairs completely successful. */ + TR_REPAIRED, +}; + +int action_item_try_repair(struct scrub_ctx *ctx, struct action_item *aitem, + enum tryrepair_outcome *outcome); void repair_item_mustfix(struct scrub_item *sri, struct scrub_item *fix_now); @@ -56,7 +80,6 @@ int repair_item(struct scrub_ctx *ctx, struct scrub_item *sri, unsigned int repair_flags); int repair_item_to_action_item(struct scrub_ctx *ctx, const struct scrub_item *sri, struct action_item **aitemp); -int repair_item_defer(struct scrub_ctx *ctx, const struct scrub_item *sri); static inline unsigned int repair_item_count_needsrepair( diff --git a/scrub/xfs_scrub.h b/scrub/xfs_scrub.h index 1df3191e85c..96a6aa06eac 100644 --- a/scrub/xfs_scrub.h +++ b/scrub/xfs_scrub.h @@ -71,7 +71,7 @@ struct scrub_ctx { /* Mutable scrub state; use lock. */ pthread_mutex_t lock; - struct action_list *action_lists; + struct action_list *action_list; unsigned long long max_errors; unsigned long long runtime_errors; unsigned long long corruptions_found; From patchwork Fri May 26 01:48:17 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13256149 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 81896C7EE29 for ; Fri, 26 May 2023 01:48:25 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234031AbjEZBsX (ORCPT ); Thu, 25 May 2023 21:48:23 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:39618 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S230297AbjEZBsU (ORCPT ); Thu, 25 May 2023 21:48:20 -0400 Received: from dfw.source.kernel.org (dfw.source.kernel.org [139.178.84.217]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 07FBC18D for ; Thu, 25 May 2023 18:48:19 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dfw.source.kernel.org (Postfix) with ESMTPS id 8256760B6C for ; Fri, 26 May 2023 01:48:18 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id E7EA8C433EF; Fri, 26 May 2023 01:48:17 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1685065698; bh=SE+I4fqkekP63F+dYA/ltK8wmLGm7Ge3c8Ranpf/U98=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=CReKMIQo4sAPGaRsoAw8r8q2huEjI5BMA8mZ4Cgdg0Qq1XJOPwu2rz1W2KMyLbH5i v3zCv2iOj8RIKroExlbG649fvEwhR2HKAEROLI3nfcjL//OFBH/VrHLMQ4QwPQtquh OAeYx/8hiQhjjHKnSTwtYCkuJNkpBkdHGSEEbFPSCb67L0GNleaJI5ngq/vhbeK4HR si168cIvH7Y+x8e+xG81hvJCBJ3WVKgMsDwmmhGGa1VDGD/awSpZ3i5yacd+lhDyVu /AU7bHlmUC+7V0qrSFFe/BZWxUUj/CJaEBTG2RRfcA48Oblr6b5/SIv6sz8ZrRP2aA tJ+F77Kx9SeCQ== Date: Thu, 25 May 2023 18:48:17 -0700 Subject: [PATCH 3/4] xfs_scrub: recheck entire metadata objects after corruption repairs From: "Darrick J. Wong" To: djwong@kernel.org, cem@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <168506072792.3744428.13295410202659476156.stgit@frogsfrogsfrogs> In-Reply-To: <168506072752.3744428.6237393655315422413.stgit@frogsfrogsfrogs> References: <168506072752.3744428.6237393655315422413.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong When we've finished making repairs to some domain of filesystem metadata (file, AG, etc.) to correct an inconsistency, we should recheck all the other metadata types within that domain to make sure that we neither made things worse nor introduced more cross-referencing problems. If we did, requeue the item to make the repairs. If the only changes we made were optimizations, don't bother. Signed-off-by: Darrick J. Wong --- scrub/repair.c | 37 +++++++++++++++++++++++++++++++++++++ scrub/scrub.c | 5 +++-- scrub/scrub.h | 10 ++++++++++ scrub/scrub_private.h | 2 ++ 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/scrub/repair.c b/scrub/repair.c index b1395275160..079a9edf77e 100644 --- a/scrub/repair.c +++ b/scrub/repair.c @@ -485,8 +485,10 @@ action_item_try_repair( { struct scrub_item *sri = &aitem->sri; unsigned int before, after; + unsigned int scrub_type; int ret; + BUILD_BUG_ON(sizeof(sri->sri_selected) * NBBY < XFS_SCRUB_TYPE_NR); before = repair_item_count_needsrepair(sri); ret = repair_item(ctx, sri, 0); @@ -507,6 +509,41 @@ action_item_try_repair( return 0; } + /* + * Nothing in this fs object was marked inconsistent. 
This means we + * were merely optimizing metadata and there is no revalidation work to + * be done. + */ + if (!sri->sri_inconsistent) { + *outcome = TR_REPAIRED; + return 0; + } + + /* + * We fixed inconsistent metadata, so reschedule the entire object for + * immediate revalidation to see if anything else went wrong. + */ + foreach_scrub_type(scrub_type) + if (sri->sri_selected & (1U << scrub_type)) + sri->sri_state[scrub_type] = SCRUB_ITEM_NEEDSCHECK; + sri->sri_inconsistent = false; + sri->sri_revalidate = true; + + ret = scrub_item_check(ctx, sri); + if (ret) + return ret; + + after = repair_item_count_needsrepair(sri); + if (after > 0) { + /* + * Uhoh, we found something else broken. Tell the caller that + * this item needs to be queued for more repairs. + */ + sri->sri_revalidate = false; + *outcome = TR_REQUEUE; + return 0; + } + /* Repairs complete. */ *outcome = TR_REPAIRED; return 0; diff --git a/scrub/scrub.c b/scrub/scrub.c index 8ceafc36af7..0227da5a926 100644 --- a/scrub/scrub.c +++ b/scrub/scrub.c @@ -117,11 +117,12 @@ xfs_check_metadata( dbg_printf("check %s flags %xh\n", descr_render(&dsc), meta.sm_flags); error = -xfrog_scrub_metadata(xfdp, &meta); - if (debug_tweak_on("XFS_SCRUB_FORCE_REPAIR") && !error) - meta.sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; switch (error) { case 0: /* No operational errors encountered. */ + if (!sri->sri_revalidate && + debug_tweak_on("XFS_SCRUB_FORCE_REPAIR")) + meta.sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; break; case ENOENT: /* Metadata not present, just skip it. */ diff --git a/scrub/scrub.h b/scrub/scrub.h index b3d2f824bd1..0da45fb5dc6 100644 --- a/scrub/scrub.h +++ b/scrub/scrub.h @@ -59,11 +59,20 @@ struct scrub_item { __u32 sri_gen; __u32 sri_agno; + /* Bitmask of scrub types that were scheduled here. */ + __u32 sri_selected; + /* Scrub item state flags, one for each XFS_SCRUB_TYPE. */ __u8 sri_state[XFS_SCRUB_TYPE_NR]; /* Track scrub and repair call retries for each scrub type. */ __u8 sri_tries[XFS_SCRUB_TYPE_NR]; + + /* Were there any corruption repairs needed? */ + bool sri_inconsistent:1; + + /* Are we revalidating after repairs? */ + bool sri_revalidate:1; }; #define foreach_scrub_type(loopvar) \ @@ -103,6 +112,7 @@ static inline void scrub_item_schedule(struct scrub_item *sri, unsigned int scrub_type) { sri->sri_state[scrub_type] = SCRUB_ITEM_NEEDSCHECK; + sri->sri_selected |= (1U << scrub_type); } void scrub_item_schedule_group(struct scrub_item *sri, diff --git a/scrub/scrub_private.h b/scrub/scrub_private.h index 1baded653f7..92bc0afc4f1 100644 --- a/scrub/scrub_private.h +++ b/scrub/scrub_private.h @@ -71,6 +71,8 @@ scrub_item_save_state( unsigned int scrub_flags) { sri->sri_state[scrub_type] = scrub_flags & SCRUB_ITEM_REPAIR_ANY; + if (scrub_flags & SCRUB_ITEM_NEEDSREPAIR) + sri->sri_inconsistent = true; } static inline void From patchwork Fri May 26 01:48:33 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13256150 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id D36E4C77B7E for ; Fri, 26 May 2023 01:48:36 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S233159AbjEZBsg (ORCPT ); Thu, 25 May 2023 21:48:36 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:39662 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S230297AbjEZBsf (ORCPT ); Thu, 25 May 2023 21:48:35 -0400 Received: from dfw.source.kernel.org (dfw.source.kernel.org [139.178.84.217]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 804C6189 for ; Thu, 25 May 2023 18:48:34 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dfw.source.kernel.org (Postfix) with ESMTPS id 1259060B6C for ; Fri, 26 May 2023 01:48:34 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 7A82AC433EF; Fri, 26 May 2023 01:48:33 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1685065713; bh=guE29lCGy3oNNUyTL0MEL5OsrT1AyVc9CkE2WQ80my0=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=Ze5gvoXhM4bqMmpHAKk0LhK3DcAKiEuFIPJqELQJpBnTER9KW0dgTS0DBQMpsBs01 HimiPBW2wLUZrTF/96BVw8ywEE9ggJgEwZTDaTFC6sN4duWuOeeAhV4et4A4aae+d1 rn9BUP3IkJzvhGaGU9qLSQzzr9cugWRrRhyxFEFM0BSFdNf7LndXxCeMwuWxx31C+C wZfa7va3VKpmbhFLUAezuSldTs/d+PuWxKtNmIKU+DjwVzxUv34cqIvi705j0UfFAk tLO3qDupIlGFU+7j0r/3bvDmLseGKLKU8dgDhRl7GFgwVjte01Na1IlBEp6liH1+6y CVC6eWMJMFYZQ== Date: Thu, 25 May 2023 18:48:33 -0700 Subject: [PATCH 4/4] xfs_scrub: try to repair space metadata before file metadata From: "Darrick J. Wong" To: djwong@kernel.org, cem@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <168506072806.3744428.17596786190019798618.stgit@frogsfrogsfrogs> In-Reply-To: <168506072752.3744428.6237393655315422413.stgit@frogsfrogsfrogs> References: <168506072752.3744428.6237393655315422413.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-xfs@vger.kernel.org From: Darrick J. Wong Phase 4 (metadata repairs) of xfs_scrub has suffered a mild race condition since the beginning of its existence. Repair functions for higher level metadata such as directories build the new directory blocks in an unlinked temporary file and use atomic extent swapping to commit the corrected directory contents into the existing directory. Atomic extent swapping requires consistent filesystem space metadata, but phase 4 has never enforced correctness dependencies between space and file metadata repairs. Before the previous patch eliminated the per-AG repair lists, this error was not often hit in testing scenarios because the allocator generally succeeds in placing file data blocks in the same AG as the inode. With pool threads now able to pop file repairs from the repair list before space repairs complete, this error became much more obvious. Fortunately, the new phase 4 design makes it easy to try to enforce the consistency requirements of higher level file metadata repairs. Split the repair list into one for space metadata and another for file metadata. 
Phase 4 will now try to fix the space metadata until it stops making progress on that, and only then will it try to fix file metadata. Signed-off-by: Darrick J. Wong --- scrub/phase1.c | 13 ++++++++++--- scrub/phase2.c | 2 +- scrub/phase3.c | 4 ++-- scrub/phase4.c | 22 +++++++++++++++++----- scrub/xfs_scrub.h | 3 ++- 5 files changed, 32 insertions(+), 12 deletions(-) diff --git a/scrub/phase1.c b/scrub/phase1.c index 97b4386096e..99c7a7a5d28 100644 --- a/scrub/phase1.c +++ b/scrub/phase1.c @@ -89,7 +89,8 @@ scrub_cleanup( if (error) return error; - action_list_free(&ctx->action_list); + action_list_free(&ctx->file_repair_list); + action_list_free(&ctx->fs_repair_list); if (ctx->fshandle) free_handle(ctx->fshandle, ctx->fshandle_len); @@ -186,9 +187,15 @@ _("Not an XFS filesystem.")); return error; } - error = action_list_alloc(&ctx->action_list); + error = action_list_alloc(&ctx->fs_repair_list); if (error) { - str_liberror(ctx, error, _("allocating repair list")); + str_liberror(ctx, error, _("allocating fs repair list")); + return error; + } + + error = action_list_alloc(&ctx->file_repair_list); + if (error) { + str_liberror(ctx, error, _("allocating file repair list")); return error; } diff --git a/scrub/phase2.c b/scrub/phase2.c index a40545f6daf..5e4e03dc688 100644 --- a/scrub/phase2.c +++ b/scrub/phase2.c @@ -64,7 +64,7 @@ defer_fs_repair( return error; pthread_mutex_lock(&ctx->lock); - action_list_add(ctx->action_list, aitem); + action_list_add(ctx->fs_repair_list, aitem); pthread_mutex_unlock(&ctx->lock); return 0; } diff --git a/scrub/phase3.c b/scrub/phase3.c index ad2723f9ae3..6e6a68b9b68 100644 --- a/scrub/phase3.c +++ b/scrub/phase3.c @@ -225,7 +225,7 @@ collect_repairs( struct scrub_ctx *ctx = foreach_arg; struct action_list *alist = data; - action_list_merge(ctx->action_list, alist); + action_list_merge(ctx->file_repair_list, alist); return 0; } @@ -269,7 +269,7 @@ phase3_func( * to repair the space metadata. */ for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) { - if (!action_list_empty(ctx->action_list)) + if (!action_list_empty(ctx->fs_repair_list)) ictx.always_defer_repairs = true; } diff --git a/scrub/phase4.c b/scrub/phase4.c index 61405dacec4..cb0354b44cb 100644 --- a/scrub/phase4.c +++ b/scrub/phase4.c @@ -194,7 +194,13 @@ repair_everything( do { fixed_anything = 0; - ret = repair_list_schedule(ctx, &wq, ctx->action_list); + ret = repair_list_schedule(ctx, &wq, ctx->fs_repair_list); + if (ret < 0) + break; + if (ret == 1) + fixed_anything++; + + ret = repair_list_schedule(ctx, &wq, ctx->file_repair_list); if (ret < 0) break; if (ret == 1) @@ -209,8 +215,12 @@ repair_everything( if (ret < 0) return ret; - /* Repair everything serially. Last chance to fix things. */ - return action_list_process(ctx, ctx->action_list, XRM_FINAL_WARNING); + /* + * Combine both repair lists and repair everything serially. This is + * the last chance to fix things. + */ + action_list_merge(ctx->fs_repair_list, ctx->file_repair_list); + return action_list_process(ctx, ctx->fs_repair_list, XRM_FINAL_WARNING); } /* Trim the unused areas of the filesystem if the caller asked us to. */ @@ -232,7 +242,8 @@ phase4_func( struct scrub_item sri; int ret; - if (action_list_empty(ctx->action_list)) + if (action_list_empty(ctx->fs_repair_list) && + action_list_empty(ctx->file_repair_list)) goto maybe_trim; /* @@ -293,7 +304,8 @@ phase4_estimate( unsigned long long need_fixing; /* Everything on the repair list plus FSTRIM. 
*/ - need_fixing = action_list_length(ctx->action_list); + need_fixing = action_list_length(ctx->fs_repair_list) + + action_list_length(ctx->file_repair_list); need_fixing++; *items = need_fixing; diff --git a/scrub/xfs_scrub.h b/scrub/xfs_scrub.h index 96a6aa06eac..acfe6175bdb 100644 --- a/scrub/xfs_scrub.h +++ b/scrub/xfs_scrub.h @@ -71,7 +71,8 @@ struct scrub_ctx { /* Mutable scrub state; use lock. */ pthread_mutex_t lock; - struct action_list *action_list; + struct action_list *fs_repair_list; + struct action_list *file_repair_list; unsigned long long max_errors; unsigned long long runtime_errors; unsigned long long corruptions_found;
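
As a usage illustration of the libfrog interface extended in patch 1, the
sketch below follows the call pattern visible in the diffs above:
ptvar_alloc() with an initializer callback, ptvar_get() from worker
threads, ptvar_foreach() to fold the per-thread results, and ptvar_free().
The struct thread_stats type, its seed value, and the record_item(),
sum_items(), and example() helpers are hypothetical and only sketch one
possible caller; they are not part of these patches.

#include <stdio.h>

#include "libfrog/ptvar.h"

/* Hypothetical per-thread scratch state that needs a nonzero start value. */
struct thread_stats {
	unsigned long long	min_seen;
	unsigned long long	items;
};

/* Runs once per thread, the first time that thread calls ptvar_get(). */
static void
thread_stats_init(
	void			*data)
{
	struct thread_stats	*ts = data;

	/* The zeroing done by ptvar_alloc() is not enough for this field. */
	ts->min_seen = ~0ULL;
}

/* Called concurrently from worker threads. */
static void
record_item(
	struct ptvar		*ptv,
	unsigned long long	value)
{
	struct thread_stats	*ts;
	int			ret;

	ts = ptvar_get(ptv, &ret);
	if (ret)
		return;

	if (value < ts->min_seen)
		ts->min_seen = value;
	ts->items++;
}

/* Fold one thread's results into the total; runs single-threaded. */
static int
sum_items(
	struct ptvar		*ptv,
	void			*data,
	void			*foreach_arg)
{
	struct thread_stats	*ts = data;
	unsigned long long	*total = foreach_arg;

	*total += ts->items;
	return 0;
}

static int
example(
	unsigned int		nr_threads)
{
	struct ptvar		*ptv;
	unsigned long long	total = 0;
	int			ret;

	/* The third argument is the new initializer hook; NULL keeps the old behavior. */
	ret = -ptvar_alloc(nr_threads, sizeof(struct thread_stats),
			thread_stats_init, &ptv);
	if (ret)
		return ret;

	/* ...start nr_threads workers that call record_item(ptv, ...)... */

	ret = -ptvar_foreach(ptv, sum_items, &total);
	if (!ret)
		printf("items scanned: %llu\n", total);

	ptvar_free(ptv);
	return ret;
}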