From patchwork Thu Jul 20 10:48:11 2023
X-Patchwork-Submitter: Qu Wenruo
X-Patchwork-Id: 13320308
From: Qu Wenruo
To: linux-btrfs@vger.kernel.org
Subject: [PATCH 1/5] btrfs: scrub: avoid unnecessary extent tree search preparing stripes
Date: Thu, 20 Jul 2023 18:48:11 +0800

Since commit e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror()
to scrub_stripe infrastructure"), scrub no longer re-uses the same path
for extent tree searches.

This can lead to unnecessary extent tree searches, especially for the
new stripe-based scrub, as we have far more stripes to prepare.

This patch re-introduces a shared path for the extent tree search, and
properly releases it when the block group is scrubbed.

With this change alone, scrub performance on NVMe already improves to
around 2.2 GiB/s, as it reduces the time spent preparing stripes:

Device            r/s       rkB/s    rrqm/s  %rrqm  r_await  rareq-sz  aqu-sz  %util
nvme0n1p3     5066.00  2364144.00  31983.00  86.33     0.50    466.67    2.53 100.00

Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure")
Signed-off-by: Qu Wenruo
Reviewed-by: Johannes Thumshirn
---
 fs/btrfs/scrub.c | 41 +++++++++++++++++++++++++++++------------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3b09d359c914..2b6888ff4a50 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -189,6 +189,7 @@ struct scrub_ctx {
 	struct scrub_stripe	stripes[SCRUB_STRIPES_PER_SCTX];
 	struct scrub_stripe	*raid56_data_stripes;
 	struct btrfs_fs_info	*fs_info;
+	struct btrfs_path	extent_path;
 	int			first_free;
 	int			cur_stripe;
 	atomic_t		cancel_req;
@@ -353,6 +354,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
 	refcount_set(&sctx->refs, 1);
 	sctx->is_dev_replace = is_dev_replace;
 	sctx->fs_info = fs_info;
+	sctx->extent_path.search_commit_root = 1;
+	sctx->extent_path.skip_locking = 1;
 	for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) {
 		int ret;
@@ -1480,6 +1483,7 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
  * Return <0 for error.
  */
 static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
+					struct btrfs_path *extent_path,
 					struct btrfs_device *dev, u64 physical,
 					int mirror_num, u64 logical_start,
 					u32 logical_len,
@@ -1489,7 +1493,6 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
 	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
 	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
 	const u64 logical_end = logical_start + logical_len;
-	struct btrfs_path path = { 0 };
 	u64 cur_logical = logical_start;
 	u64 stripe_end;
 	u64 extent_start;
@@ -1505,14 +1508,13 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
 	/* The range must be inside the bg. */
 	ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
-	path.search_commit_root = 1;
-	path.skip_locking = 1;
-
-	ret = find_first_extent_item(extent_root, &path, logical_start, logical_len);
+	ret = find_first_extent_item(extent_root, extent_path, logical_start,
+				     logical_len);
 	/* Either error or not found. */
 	if (ret)
 		goto out;
-	get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen);
+	get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags,
+			&extent_gen);
 	if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		stripe->nr_meta_extents++;
 	if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
@@ -1540,7 +1542,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
 	/* Fill the extent info for the remaining sectors. */
 	while (cur_logical <= stripe_end) {
-		ret = find_first_extent_item(extent_root, &path, cur_logical,
+		ret = find_first_extent_item(extent_root, extent_path, cur_logical,
 					     stripe_end - cur_logical + 1);
 		if (ret < 0)
 			goto out;
@@ -1548,7 +1550,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
 			ret = 0;
 			break;
 		}
-		get_extent_info(&path, &extent_start, &extent_len,
+		get_extent_info(extent_path, &extent_start, &extent_len,
 				&extent_flags, &extent_gen);
 		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 			stripe->nr_meta_extents++;
@@ -1588,7 +1590,6 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
 	}
 	set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
 out:
-	btrfs_release_path(&path);
 	return ret;
 }
@@ -1784,8 +1785,9 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
 	ASSERT(sctx->cur_stripe < SCRUB_STRIPES_PER_SCTX);
 	stripe = &sctx->stripes[sctx->cur_stripe];
 	scrub_reset_stripe(stripe);
-	ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num,
-					   logical, length, stripe);
+	ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, dev,
+					   physical, mirror_num, logical,
+					   length, stripe);
 	/* Either >0 as no more extents or <0 for error. */
 	if (ret)
 		return ret;
@@ -1825,6 +1827,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	struct btrfs_raid_bio *rbio;
 	struct btrfs_io_context *bioc = NULL;
+	struct btrfs_path extent_path = { 0 };
 	struct bio *bio;
 	struct scrub_stripe *stripe;
 	bool all_empty = true;
@@ -1835,6 +1838,14 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
 	ASSERT(sctx->raid56_data_stripes);
+	/*
+	 * For data stripe search, we can not re-use the same extent path, as
+	 * the data stripe bytenr may be smaller than previous extent.
+	 * Thus we have to use our own extent path.
+	 */
+	extent_path.search_commit_root = 1;
+	extent_path.skip_locking = 1;
+
 	for (int i = 0; i < data_stripes; i++) {
 		int stripe_index;
 		int rot;
@@ -1849,7 +1860,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
 		scrub_reset_stripe(stripe);
 		set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
-		ret = scrub_find_fill_first_stripe(bg,
+		ret = scrub_find_fill_first_stripe(bg, &extent_path,
 				map->stripes[stripe_index].dev, physical, 1,
 				full_stripe_start + btrfs_stripe_nr_to_offset(i),
 				BTRFS_STRIPE_LEN, stripe);
@@ -1977,6 +1988,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
 	bio_put(bio);
 	btrfs_bio_counter_dec(fs_info);
+	btrfs_release_path(&extent_path);
 out:
 	return ret;
 }
@@ -2143,6 +2155,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	u64 stripe_logical;
 	int stop_loop = 0;
+	/* The extent path should have been released by now. */
+	ASSERT(sctx->extent_path.nodes[0] == NULL);
+
 	scrub_blocked_if_needed(fs_info);
 	if (sctx->is_dev_replace &&
@@ -2261,6 +2276,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	ret2 = flush_scrub_stripes(sctx);
 	if (!ret)
 		ret = ret2;
+	btrfs_release_path(&sctx->extent_path);
+
 	if (sctx->raid56_data_stripes) {
 		for (int i = 0; i < nr_data_stripes(map); i++)
 			release_scrub_stripe(&sctx->raid56_data_stripes[i]);
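
Why path reuse helps at all: a block group is scrubbed in ascending
logical order, so a retained btrfs_path usually already sits at or just
before the next extent item, and the next search degenerates into a
short walk forward instead of a full B-tree descent. The same idea in a
minimal userspace sketch (illustrative only, not btrfs code; the cursor
type and function names are invented):

#include <stddef.h>

/* A cached position in a sorted array, reused across ascending queries. */
struct cursor {
	size_t pos;
};

/*
 * Return the index of the first element >= target, resuming from the
 * cached position when it does not overshoot the new target, instead of
 * restarting from index 0 (a full "descent") on every call.
 */
static size_t find_from(struct cursor *cur, const int *sorted, size_t len,
			int target)
{
	size_t i = 0;

	if (cur->pos < len && sorted[cur->pos] <= target)
		i = cur->pos;	/* resume: cached spot is at/before target */
	while (i < len && sorted[i] < target)
		i++;
	cur->pos = i;
	return i;		/* i == len means no such element */
}

With queries issued in ascending order, N lookups cost O(N + len) in
total rather than O(N * len); a reused btrfs_path gives the analogous
saving over repeated tree descents.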

From patchwork Thu Jul 20 10:48:12 2023
X-Patchwork-Submitter: Qu Wenruo
X-Patchwork-Id: 13320310
From: Qu Wenruo
To: linux-btrfs@vger.kernel.org
Subject: [PATCH 2/5] btrfs: scrub: avoid unnecessary csum tree search preparing stripes
Date: Thu, 20 Jul 2023 18:48:12 +0800

One of the bottlenecks of the new scrub code is the extra csum tree
search.

The old code only did a csum tree search for each scrub bio, which
could be as large as 512KiB, so it could afford to allocate a new path
each time.

But the new scrub code does a csum tree search for each stripe, which
is only 64KiB, so we had better re-use the same csum path during each
search.

This patch introduces a per-sctx path for the csum tree search, so we
no longer need to allocate and re-position a path for every csum tree
search.

With this update, scrub performance recovers to 2.5 GiB/s on a PCIe 3.0
NVMe device with large files (the worst case for the new scrub code).

The improvement shows up mostly in the queue depth, which is now almost
4 times what it was when stripe-based scrub was introduced:

Device            r/s       rkB/s    rrqm/s  %rrqm  r_await  rareq-sz  aqu-sz  %util
nvme0n1p3     5346.00  2642288.00  36075.00  87.09     0.75    494.26    4.00 100.00

Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure")
Signed-off-by: Qu Wenruo
---
 fs/btrfs/file-item.c | 33 +++++++++++++++++++++------------
 fs/btrfs/file-item.h |  6 +++---
 fs/btrfs/raid56.c    |  4 ++--
 fs/btrfs/scrub.c     | 29 +++++++++++++++++++----------
 4 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 696bf695d8eb..21ee77d26c23 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -597,29 +597,36 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
  * Each bit represents a sector. Thus caller should ensure @csum_buf passed
  * in is large enough to contain all csums.
  */
-int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
-			      u8 *csum_buf, unsigned long *csum_bitmap,
-			      bool search_commit)
+int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path,
+			      u64 start, u64 end, u8 *csum_buf,
+			      unsigned long *csum_bitmap)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_key key;
-	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_csum_item *item;
 	const u64 orig_start = start;
+	bool free_path = false;
 	int ret;

 	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
 	       IS_ALIGNED(end + 1, fs_info->sectorsize));

-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	if (!path) {
+		path = btrfs_alloc_path();
+		if (!path)
+			return -ENOMEM;
+		free_path = true;
+	}

-	if (search_commit) {
-		path->skip_locking = 1;
-		path->reada = READA_FORWARD;
-		path->search_commit_root = 1;
+	/* Check if we can reuse the previous path. */
+	if (path->nodes[0]) {
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+		if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
+		    key.type == BTRFS_EXTENT_CSUM_KEY && key.offset <= start)
+			goto search_forward;
+		btrfs_release_path(path);
 	}

 	key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -656,6 +663,7 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
 		}
 	}

+search_forward:
 	while (start <= end) {
 		u64 csum_end;

@@ -712,7 +720,8 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
 	}
 	ret = 0;
 fail:
-	btrfs_free_path(path);
+	if (free_path)
+		btrfs_free_path(path);
 	return ret;
 }

diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index 4ec669b69008..04bd2d34efb1 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -57,9 +57,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
 			    struct list_head *list, int search_commit,
 			    bool nowait);
-int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
-			      u8 *csum_buf, unsigned long *csum_bitmap,
-			      bool search_commit);
+int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path,
+			      u64 start, u64 end, u8 *csum_buf,
+			      unsigned long *csum_bitmap);
 void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
 				     const struct btrfs_path *path,
 				     struct btrfs_file_extent_item *fi,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index f97fc62fbab2..3e014b9370a3 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -2105,8 +2105,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio)
 		goto error;
 	}

-	ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1,
-					rbio->csum_buf, rbio->csum_bitmap, false);
+	ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
+					rbio->csum_buf, rbio->csum_bitmap);
 	if (ret < 0)
 		goto error;
 	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2b6888ff4a50..5f8c95290e60 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -190,6 +190,7 @@ struct scrub_ctx {
 	struct scrub_stripe	*raid56_data_stripes;
 	struct btrfs_fs_info	*fs_info;
 	struct btrfs_path	extent_path;
+	struct btrfs_path	csum_path;
 	int			first_free;
 	int			cur_stripe;
 	atomic_t		cancel_req;
@@ -356,6 +357,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
 	sctx->fs_info = fs_info;
 	sctx->extent_path.search_commit_root = 1;
 	sctx->extent_path.skip_locking = 1;
+	sctx->csum_path.search_commit_root = 1;
+	sctx->csum_path.skip_locking = 1;
 	for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) {
 		int ret;
@@ -1484,6 +1487,7 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
  */
 static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
 					struct btrfs_path *extent_path,
+					struct btrfs_path *csum_path,
 					struct btrfs_device *dev, u64 physical,
 					int mirror_num, u64 logical_start,
 					u32 logical_len,
@@ -1575,9 +1579,9 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
 		 */
 		ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);

-		ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical,
-						stripe_end, stripe->csums,
-						&csum_bitmap, true);
+		ret = btrfs_lookup_csums_bitmap(csum_root, csum_path,
+						stripe->logical, stripe_end,
+						stripe->csums, &csum_bitmap);
 		if (ret < 0)
 			goto out;
 		if (ret > 0)
@@ -1785,9 +1789,9 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
 	ASSERT(sctx->cur_stripe < SCRUB_STRIPES_PER_SCTX);
 	stripe = &sctx->stripes[sctx->cur_stripe];
 	scrub_reset_stripe(stripe);
-	ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, dev,
-					   physical, mirror_num, logical,
-					   length, stripe);
+	ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
+					   &sctx->csum_path, dev, physical, mirror_num, logical,
+					   length, stripe);
 	/* Either >0 as no more extents or <0 for error. */
 	if (ret)
 		return ret;
@@ -1828,6 +1832,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
 	struct btrfs_raid_bio *rbio;
 	struct btrfs_io_context *bioc = NULL;
 	struct btrfs_path extent_path = { 0 };
+	struct btrfs_path csum_path = { 0 };
 	struct bio *bio;
 	struct scrub_stripe *stripe;
 	bool all_empty = true;
@@ -1839,12 +1844,14 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
 	ASSERT(sctx->raid56_data_stripes);

 	/*
-	 * For data stripe search, we can not re-use the same extent path, as
-	 * the data stripe bytenr may be smaller than previous extent.
-	 * Thus we have to use our own extent path.
+	 * For data stripe search, we can not re-use the same extent/csum paths,
+	 * as the data stripe bytenr may be smaller than previous extent.
+	 * Thus we have to use our own extent/csum paths.
 	 */
 	extent_path.search_commit_root = 1;
 	extent_path.skip_locking = 1;
+	csum_path.search_commit_root = 1;
+	csum_path.skip_locking = 1;

 	for (int i = 0; i < data_stripes; i++) {
 		int stripe_index;
@@ -1860,7 +1867,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
 		scrub_reset_stripe(stripe);
 		set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);

-		ret = scrub_find_fill_first_stripe(bg, &extent_path,
+		ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path,
 				map->stripes[stripe_index].dev, physical, 1,
 				full_stripe_start + btrfs_stripe_nr_to_offset(i),
 				BTRFS_STRIPE_LEN, stripe);
@@ -1989,6 +1996,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
 	btrfs_bio_counter_dec(fs_info);

 	btrfs_release_path(&extent_path);
+	btrfs_release_path(&csum_path);
 out:
 	return ret;
 }
@@ -2277,6 +2285,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	if (!ret)
 		ret = ret2;
 	btrfs_release_path(&sctx->extent_path);
+	btrfs_release_path(&sctx->csum_path);

 	if (sctx->raid56_data_stripes) {
 		for (int i = 0; i < nr_data_stripes(map); i++)
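
Two details in btrfs_lookup_csums_bitmap() above are worth spelling
out. First, the path argument is optional: a NULL path makes the
function allocate and free its own, so one-off callers like raid56 stay
simple. Second, a cached path may only be reused when its current key
sits at or before the new start (key.offset <= start); otherwise
walking forward could skip csums. A hedged userspace sketch of the
optional-scratch-argument pattern (names invented for illustration; -1
stands in for -ENOMEM):

#include <stdlib.h>

struct scratch {
	long cached_offset;	/* stand-in for the cached btrfs_key */
};

/*
 * If the caller passes NULL, allocate a temporary scratch object and
 * free it on exit; if the caller supplies one, leave its lifetime (and
 * its warm cached state) to the caller. Mirrors the free_path logic.
 */
static int lookup(struct scratch *s, long start)
{
	int free_scratch = 0;
	int ret = 0;

	if (!s) {
		s = calloc(1, sizeof(*s));
		if (!s)
			return -1;
		free_scratch = 1;
	}

	/* Reuse guard: only resume if the cached spot is at/before start. */
	if (s->cached_offset > start)
		s->cached_offset = 0;	/* reset, like btrfs_release_path() */

	/* ... search forward from s->cached_offset up to 'start' ... */
	s->cached_offset = start;

	if (free_scratch)
		free(s);
	return ret;
}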

From patchwork Thu Jul 20 10:48:13 2023
X-Patchwork-Submitter: Qu Wenruo
X-Patchwork-Id: 13320309
From: Qu Wenruo
To: linux-btrfs@vger.kernel.org
Subject: [PATCH 3/5] btrfs: scrub: move write back of repaired sectors into scrub_stripe_read_repair_worker()
Date: Thu, 20 Jul 2023 18:48:13 +0800

Currently scrub_stripe_read_repair_worker() only does the reads needed
to rebuild the corrupted sectors; it doesn't do any writeback.

That design mostly keeps writeback ordered, to co-operate with
dev-replace on zoned devices, which requires every write to be
submitted in bytenr order.

However the writeback of repaired sectors into the original mirror
doesn't need such a strong requirement, as it can only happen on
non-zoned devices.

This patch moves the writeback of repaired sectors into
scrub_stripe_read_repair_worker().

Signed-off-by: Qu Wenruo
---
 fs/btrfs/scrub.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 5f8c95290e60..516703d20a13 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -989,6 +989,8 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
 	spin_unlock(&sctx->stat_lock);
 }

+static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
+				unsigned long write_bitmap, bool dev_replace);
 /*
  * The main entrance for all read related scrub work, including:
  *
@@ -1003,6 +1005,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
 static void scrub_stripe_read_repair_worker(struct work_struct *work)
 {
 	struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
+	struct scrub_ctx *sctx = stripe->sctx;
 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
 	int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
 					  stripe->bg->length);
@@ -1068,7 +1071,25 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
 		goto out;
 	}
 out:
-	scrub_stripe_report_errors(stripe->sctx, stripe);
+	/*
+	 * Submit the repaired sectors. For zoned case, we cannot do repair
+	 * in-place, but queue the bg to be relocated.
+	 */
+	if (btrfs_is_zoned(fs_info)) {
+		if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) {
+			btrfs_repair_one_zone(fs_info,
+					      sctx->stripes[0].bg->start);
+		}
+	} else if (!sctx->readonly) {
+		unsigned long repaired;
+
+		bitmap_andnot(&repaired, &stripe->init_error_bitmap,
+			      &stripe->error_bitmap, stripe->nr_sectors);
+		scrub_write_sectors(sctx, stripe, repaired, false);
+		wait_scrub_stripe_io(stripe);
+	}
+
+	scrub_stripe_report_errors(sctx, stripe);
 	set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
 	wake_up(&stripe->repair_wait);
 }
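
The repaired set above is computed as init_error_bitmap & ~error_bitmap:
sectors that were bad when the stripe was first read but are no longer
bad after repair. A tiny standalone check of that identity (plain
unsigned longs standing in for the kernel's bitmap_andnot(); the values
are invented for illustration):

#include <stdio.h>

int main(void)
{
	/* Sectors 1-3 failed the initial read... */
	unsigned long init_error = 0xe;
	/* ...and sector 3 still fails after all repair attempts. */
	unsigned long error = 0x8;
	/* Repaired = was bad AND is no longer bad: sectors 1 and 2. */
	unsigned long repaired = init_error & ~error;

	printf("repaired = 0x%lx\n", repaired);	/* prints 0x6 */
	return 0;
}

Only those sectors are handed to scrub_write_sectors(); still-bad
sectors stay in error_bitmap and get reported.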

From patchwork Thu Jul 20 10:48:14 2023
X-Patchwork-Submitter: Qu Wenruo
X-Patchwork-Id: 13320311
From: Qu Wenruo
To: linux-btrfs@vger.kernel.org
Subject: [PATCH 4/5] btrfs: scrub: don't go ordered workqueue for dev-replace
Date: Thu, 20 Jul 2023 18:48:14 +0800
Message-ID: <8a4a32adb779cb1c947f352c421de1b57506dbaf.1689849582.git.wqu@suse.com>

The workqueue fs_info->scrub_workers is allocated as an ordered
workqueue if the scrub is for a device-replace operation.

However scrub relies on multiple workers to do data csum verification,
and we always submit several read requests in a row.

Thus there is no need to use an ordered workqueue just for dev-replace:
we already have extra synchronization (the main thread always
submits-and-waits for dev-replace writes) to handle zoned devices.

Signed-off-by: Qu Wenruo
---
 fs/btrfs/scrub.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 516703d20a13..6d08836c0056 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2792,8 +2792,7 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info)
 /*
  * get a reference count on fs_info->scrub_workers. start worker if necessary
  */
-static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
-						int is_dev_replace)
+static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
 {
 	struct workqueue_struct *scrub_workers = NULL;
 	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
@@ -2803,10 +2802,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
 		return 0;

-	if (is_dev_replace)
-		scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags);
-	else
-		scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
+	scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
 	if (!scrub_workers)
 		return -ENOMEM;

@@ -2858,7 +2854,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	if (IS_ERR(sctx))
 		return PTR_ERR(sctx);

-	ret = scrub_workers_get(fs_info, is_dev_replace);
+	ret = scrub_workers_get(fs_info);
 	if (ret)
 		goto out_free_ctx;
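
For context on why this is purely a concurrency change: an ordered
workqueue executes at most one work item at a time. In kernel headers
of this era, alloc_ordered_workqueue() is essentially a wrapper that
forces max_active to 1 (paraphrased from include/linux/workqueue.h;
check your tree for the exact flags):

/* Approximate definition; some versions also OR in __WQ_ORDERED_EXPLICIT. */
#define alloc_ordered_workqueue(fmt, flags, args...)			\
	alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args)

So the patch simply lifts the max_active = 1 cap for dev-replace; write
ordering on zoned devices is still guaranteed by the submit-and-wait in
the main scrub thread, not by the workqueue.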

From patchwork Thu Jul 20 10:48:15 2023
X-Patchwork-Submitter: Qu Wenruo
X-Patchwork-Id: 13320312
From: Qu Wenruo
To: linux-btrfs@vger.kernel.org
Subject: [PATCH 5/5] btrfs: scrub: extract the stripe group submission into a helper
Date: Thu, 20 Jul 2023 18:48:15 +0800

Introduce a new helper, submit_initial_stripes_read(), to submit
multiple stripes in one go.

The helper can be used to submit one full group, or the remaining
stripes which don't fill a full group (during flush_scrub_stripes()).

It has extra ASSERT()s to make sure the stripes are properly
initialized and not yet submitted.

Signed-off-by: Qu Wenruo
---
 fs/btrfs/scrub.c | 67 +++++++++++++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6d08836c0056..d6dfffb4efc9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1695,12 +1695,36 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
 	return false;
 }

+static void submit_initial_stripes_read(struct scrub_ctx *sctx, int first_slot,
+					int nr_stripes)
+{
+	struct blk_plug plug;
+
+	ASSERT(first_slot >= 0);
+	ASSERT(first_slot + nr_stripes <= SCRUB_STRIPES_PER_SCTX);
+
+	scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
+			      btrfs_stripe_nr_to_offset(nr_stripes));
+
+	blk_start_plug(&plug);
+	for (int i = 0; i < nr_stripes; i++) {
+		struct scrub_stripe *stripe = &sctx->stripes[first_slot + i];
+
+		/* Those stripes should be initialized and not yet submitted. */
+		ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
+		ASSERT(!test_bit(SCRUB_STRIPE_FLAG_READ_SUBMITTED, &stripe->state));
+
+		scrub_submit_initial_read(sctx, stripe);
+	}
+	blk_finish_plug(&plug);
+}
+
 static int flush_scrub_stripes(struct scrub_ctx *sctx)
 {
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	struct scrub_stripe *stripe;
 	const int nr_stripes = sctx->cur_stripe;
-	struct blk_plug plug;
 	int ret = 0;

 	if (!nr_stripes)
@@ -1708,17 +1732,14 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)

 	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));

-	/* We should only have at most one group to submit. */
-	scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
-			      btrfs_stripe_nr_to_offset(
-			      nr_stripes % SCRUB_STRIPES_PER_GROUP ?:
-			      SCRUB_STRIPES_PER_GROUP));
-	blk_start_plug(&plug);
-	for (int i = 0; i < nr_stripes; i++) {
-		stripe = &sctx->stripes[i];
-		scrub_submit_initial_read(sctx, stripe);
+	/* Submit the stripes which are populated but not submitted. */
+	if (nr_stripes % SCRUB_STRIPES_PER_GROUP) {
+		const int first_slot = round_down(nr_stripes,
+						  SCRUB_STRIPES_PER_GROUP);
+
+		submit_initial_stripes_read(sctx, first_slot,
+					    nr_stripes - first_slot);
 	}
-	blk_finish_plug(&plug);

 	for (int i = 0; i < nr_stripes; i++) {
 		stripe = &sctx->stripes[i];
@@ -1820,25 +1841,19 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *

 	sctx->cur_stripe++;

+	/* We have filled one group, submit them now. */
+	if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) {
+		const int first_slot = sctx->cur_stripe -
+				       SCRUB_STRIPES_PER_GROUP;
+
+		submit_initial_stripes_read(sctx, first_slot,
+					    SCRUB_STRIPES_PER_GROUP);
+	}
+
 	/* Last slot used, flush them all. */
 	if (sctx->cur_stripe == SCRUB_STRIPES_PER_SCTX)
 		return flush_scrub_stripes(sctx);

-	/* We have filled one group, submit them now. */
-	if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) {
-		struct blk_plug plug;
-
-		scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
-			btrfs_stripe_nr_to_offset(SCRUB_STRIPES_PER_GROUP));
-
-		blk_start_plug(&plug);
-		for (int i = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP;
-		     i < sctx->cur_stripe; i++) {
-			stripe = &sctx->stripes[i];
-			scrub_submit_initial_read(sctx, stripe);
-		}
-		blk_finish_plug(&plug);
-	}
 	return 0;
 }
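
The shape of this logic (submit a full group as soon as it fills, and
let the flush path mop up only the trailing partial group) is a generic
batching pattern. A minimal userspace sketch, with invented names;
GROUP_SIZE and TOTAL_SLOTS stand in for SCRUB_STRIPES_PER_GROUP and
SCRUB_STRIPES_PER_SCTX:

#include <assert.h>
#include <stdio.h>

#define GROUP_SIZE   4
#define TOTAL_SLOTS 12		/* a multiple of GROUP_SIZE */

static int slots[TOTAL_SLOTS];
static int cur;			/* next free slot, like sctx->cur_stripe */

/* Stand-in for throttling + plugged submission of a slot range. */
static void submit_range(int first, int nr)
{
	assert(first >= 0 && first + nr <= TOTAL_SLOTS);
	printf("submit slots [%d, %d)\n", first, first + nr);
}

/* Mirrors queue_scrub_stripe(): submit whenever a group fills up. */
static void queue_one(int value)
{
	slots[cur++] = value;
	if (cur % GROUP_SIZE == 0)
		submit_range(cur - GROUP_SIZE, GROUP_SIZE);
}

/* Mirrors flush_scrub_stripes(): only a trailing partial group is left. */
static void flush_all(void)
{
	int rem = cur % GROUP_SIZE;

	if (rem)
		submit_range(cur - rem, rem);	/* round_down(cur, GROUP_SIZE) */
	/* ... wait for all submitted slots, then reset cur ... */
	cur = 0;
}

int main(void)
{
	for (int i = 0; i < 10; i++)	/* two full groups + two leftovers */
		queue_one(i);
	flush_all();			/* submits the final two */
	return 0;
}

Keeping the ASSERT()s inside the helper means every submission site,
current and future, gets the same "initialized and not yet submitted"
sanity check for free.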