From patchwork Sat Jun 17 05:07:22 2023
X-Patchwork-Submitter: Qu Wenruo
X-Patchwork-Id: 13283411
From: Qu Wenruo
To: linux-btrfs@vger.kernel.org
Subject: [PATCH RFC 1/5] btrfs: scrub: make scrub_ctx::stripes dynamically allocated
Date: Sat, 17 Jun 2023 13:07:22 +0800
Message-ID: <93e32d86ead00d7596072aa2f1fbc59022ebb3ad.1686977659.git.wqu@suse.com>
X-Mailing-List: linux-btrfs@vger.kernel.org

Currently scrub_ctx::stripes is a fixed-size array. This is fine for most
use cases, but later we may want to allocate a larger array for logical
bytenr based scrub.

So here we change the member to a dynamically allocated array.

This also affects the lifespan of the member: now it only needs to be
allocated and initialized at the beginning of the scrub_stripe() function.
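For reference, the conversion boils down to the pattern below: a fixed-size
embedded array becomes a pointer plus an element count, sized per call and
released when the work is done. This is only a simplified, self-contained
userspace sketch with made-up names (struct ctx, alloc_stripes(),
free_stripes()), not the kernel code; the real helpers are
alloc_scrub_stripes() and free_scrub_stripes() in the diff below.

#include <stdio.h>
#include <stdlib.h>

struct stripe {
	unsigned long long logical;
};

struct ctx {
	struct stripe *stripes;		/* was: struct stripe stripes[FIXED]; */
	unsigned int nr_stripes;	/* number of entries in @stripes */
};

static int alloc_stripes(struct ctx *ctx, unsigned int nr)
{
	/* One contiguous array, sized by the caller at run time. */
	ctx->stripes = calloc(nr, sizeof(*ctx->stripes));
	if (!ctx->stripes)
		return -1;
	ctx->nr_stripes = nr;
	return 0;
}

static void free_stripes(struct ctx *ctx)
{
	free(ctx->stripes);
	ctx->stripes = NULL;
	ctx->nr_stripes = 0;
}

int main(void)
{
	struct ctx ctx = { 0 };

	/* The array size is now a per-run choice, not a compile-time constant. */
	if (alloc_stripes(&ctx, 128))
		return 1;
	printf("allocated %u stripe slots\n", ctx.nr_stripes);
	free_stripes(&ctx);
	return 0;
}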
Signed-off-by: Qu Wenruo Reviewed-by: Johannes Thumshirn --- fs/btrfs/scrub.c | 65 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 093823aa8d2c..d83bcc396bb0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -172,7 +172,7 @@ struct scrub_stripe { }; struct scrub_ctx { - struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX]; + struct scrub_stripe *stripes; struct scrub_stripe *raid56_data_stripes; struct btrfs_fs_info *fs_info; int first_free; @@ -181,6 +181,9 @@ struct scrub_ctx { int readonly; int sectors_per_bio; + /* Number of stripes we have in @stripes. */ + unsigned int nr_stripes; + /* State of IO submission throttling affecting the associated device */ ktime_t throttle_deadline; u64 throttle_sent; @@ -315,9 +318,10 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (!sctx) return; - for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) - release_scrub_stripe(&sctx->stripes[i]); - + if (sctx->stripes) + for (i = 0; i < sctx->nr_stripes; i++) + release_scrub_stripe(&sctx->stripes[i]); + kfree(sctx->stripes); kfree(sctx); } @@ -331,7 +335,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( struct btrfs_fs_info *fs_info, int is_dev_replace) { struct scrub_ctx *sctx; - int i; sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) @@ -339,14 +342,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->fs_info = fs_info; - for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { - int ret; - - ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); - if (ret < 0) - goto nomem; - sctx->stripes[i].sctx = sctx; - } sctx->first_free = 0; atomic_set(&sctx->cancel_req, 0); @@ -1660,6 +1655,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) const int nr_stripes = sctx->cur_stripe; int ret = 0; + ASSERT(nr_stripes <= sctx->nr_stripes); if (!nr_stripes) return 0; @@ -1754,8 +1750,11 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * struct scrub_stripe *stripe; int ret; + ASSERT(sctx->stripes); + ASSERT(sctx->nr_stripes); + /* No available slot, submit all stripes and wait for them. 
*/ - if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) { + if (sctx->cur_stripe >= sctx->nr_stripes) { ret = flush_scrub_stripes(sctx); if (ret < 0) return ret; @@ -2080,6 +2079,39 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, return ret; } +static void free_scrub_stripes(struct scrub_ctx *sctx) +{ + for (int i = 0; i < sctx->nr_stripes; i++) + release_scrub_stripe(&sctx->stripes[i]); + kfree(sctx->stripes); + sctx->nr_stripes = 0; + sctx->stripes = NULL; +} + +static int alloc_scrub_stripes(struct scrub_ctx *sctx, int nr_stripes) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + int ret; + + ASSERT(!sctx->stripes); + ASSERT(!sctx->nr_stripes); + sctx->stripes = kcalloc(nr_stripes, sizeof(struct scrub_stripe), + GFP_KERNEL); + if (!sctx->stripes) + return -ENOMEM; + sctx->nr_stripes = nr_stripes; + for (int i = 0; i < sctx->nr_stripes; i++) { + ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); + if (ret < 0) + goto cleanup; + sctx->stripes[i].sctx = sctx; + } + return 0; +cleanup: + free_scrub_stripes(sctx); + return -ENOMEM; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, struct extent_map *em, @@ -2106,6 +2138,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, scrub_blocked_if_needed(fs_info); + ret = alloc_scrub_stripes(sctx, SCRUB_STRIPES_PER_SCTX); + if (ret < 0) + return ret; + if (sctx->is_dev_replace && btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { mutex_lock(&sctx->wr_lock); @@ -2228,6 +2264,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, kfree(sctx->raid56_data_stripes); sctx->raid56_data_stripes = NULL; } + free_scrub_stripes(sctx); if (sctx->is_dev_replace && ret >= 0) { int ret2; From patchwork Sat Jun 17 05:07:23 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Qu Wenruo X-Patchwork-Id: 13283412 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id D6156EB64DA for ; Sat, 17 Jun 2023 05:07:56 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232609AbjFQFHy (ORCPT ); Sat, 17 Jun 2023 01:07:54 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:45802 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232513AbjFQFHu (ORCPT ); Sat, 17 Jun 2023 01:07:50 -0400 Received: from smtp-out2.suse.de (smtp-out2.suse.de [IPv6:2001:67c:2178:6::1d]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 6D9F119BB for ; Fri, 16 Jun 2023 22:07:49 -0700 (PDT) Received: from imap2.suse-dmz.suse.de (imap2.suse-dmz.suse.de [192.168.254.74]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature ECDSA (P-521) server-digest SHA512) (No client certificate requested) by smtp-out2.suse.de (Postfix) with ESMTPS id 530731FDCB for ; Sat, 17 Jun 2023 05:07:47 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=suse.com; s=susede1; t=1686978467; h=from:from:reply-to:date:date:message-id:message-id:to:to:cc: mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=piK7ZmlvUcdml3+57ZCoS5LHAj/bqawtotYEQ8aptMo=; b=BgnZjeE/gaR0mffM0asZDvLjPn1KOtslooGqtSILDYvLgThuORQ3UDZ6F6r6wHGz4RD0P6 
From: Qu Wenruo
To: linux-btrfs@vger.kernel.org
Subject: [PATCH RFC 2/5] btrfs: scrub: introduce the skeleton for logical-scrub
Date: Sat, 17 Jun 2023 13:07:23 +0800
Message-ID: <16be0b6c52962fd3657c7bc09c00020ae72cd771.1686977659.git.wqu@suse.com>
X-Mailing-List: linux-btrfs@vger.kernel.org

Currently btrfs scrub is a per-device operation. This is fine for most
non-RAID56 profiles, but not a good fit for RAID56 profiles.

The main challenge for RAID56 is that we read out the data stripes more
than once (once for the data stripe scrub itself, and again for the
parities).

To address this, and maybe even improve the non-RAID56 scrub, here we
introduce a new scrub flag, SCRUB_LOGICAL. This is a per-fs operation and
conflicts with any dev-scrub/dev-replace.

This patch only implements the basic exclusion checks.

Signed-off-by: Qu Wenruo
Reviewed-by: Johannes Thumshirn
---
 fs/btrfs/disk-io.c         |  1 +
 fs/btrfs/fs.h              | 12 +++++++
 fs/btrfs/ioctl.c           |  6 +++-
 fs/btrfs/scrub.c           | 72 +++++++++++++++++++++++++++++++++++++-
 fs/btrfs/scrub.h           |  2 ++
 include/uapi/linux/btrfs.h | 11 ++++--
 6 files changed, 100 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1168e5df8bae..5a970040e5db 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1845,6 +1845,7 @@ static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
 {
 	mutex_init(&fs_info->scrub_lock);
 	atomic_set(&fs_info->scrubs_running, 0);
+	atomic_set(&fs_info->scrubs_logical_running, 0);
 	atomic_set(&fs_info->scrub_pause_req, 0);
 	atomic_set(&fs_info->scrubs_paused, 0);
 	atomic_set(&fs_info->scrub_cancel_req, 0);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 396e2a463480..2067722ea6c8 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -631,7 +631,19 @@ struct btrfs_fs_info {
 	/* Private scrub information */
 	struct mutex scrub_lock;
+
+	/*
+	 * Number of running scrubbing, including both dev-scrub (at most
+	 * one dev-scrub on each device) and logical-scrub (at most
+	 * one logical-scrub for each fs).
+	 */
 	atomic_t scrubs_running;
+
+	/*
+	 * Number of running logical scrubbing, there is at most one running
+	 * logical scrub for each fs.
+ */ + atomic_t scrubs_logical_running; atomic_t scrub_pause_req; atomic_t scrubs_paused; atomic_t scrub_cancel_req; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index edbbd5cf23fc..1a1afcce73e0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3179,7 +3179,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) goto out; } - ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end, + if (sa->flags & BTRFS_SCRUB_LOGICAL) + ret = btrfs_scrub_logical(fs_info, sa->start, sa->end, + &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); + else + ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end, &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 0); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d83bcc396bb0..598754ad7ce6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -178,7 +178,8 @@ struct scrub_ctx { int first_free; int cur_stripe; atomic_t cancel_req; - int readonly; + bool readonly; + bool scrub_logical; int sectors_per_bio; /* Number of stripes we have in @stripes. */ @@ -2841,6 +2842,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, goto out; } + /* Dev-scrub conflicts with logical-scrub. */ + if (atomic_read(&fs_info->scrubs_logical_running)) { + mutex_unlock(&fs_info->scrub_lock); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + ret = -EINPROGRESS; + goto out; + } + down_read(&fs_info->dev_replace.rwsem); if (dev->scrub_ctx || (!is_dev_replace && @@ -2951,6 +2960,67 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return ret; } +int btrfs_scrub_logical(struct btrfs_fs_info *fs_info, u64 start, u64 end, + struct btrfs_scrub_progress *progress, bool readonly) +{ + struct scrub_ctx *sctx; + int ret; + + if (btrfs_fs_closing(fs_info)) + return -EAGAIN; + + /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */ + ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN); + + sctx = scrub_setup_ctx(fs_info, false); + if (IS_ERR(sctx)) + return PTR_ERR(sctx); + sctx->scrub_logical = true; + sctx->readonly = readonly; + + ret = scrub_workers_get(fs_info, false); + if (ret) + goto out_free_ctx; + + /* Make sure we're the only running scrub. */ + mutex_lock(&fs_info->scrub_lock); + if (atomic_read(&fs_info->scrubs_running)) { + mutex_unlock(&fs_info->scrub_lock); + ret = -EINPROGRESS; + goto out; + } + down_read(&fs_info->dev_replace.rwsem); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { + up_read(&fs_info->dev_replace.rwsem); + mutex_unlock(&fs_info->scrub_lock); + ret = -EINPROGRESS; + goto out; + } + up_read(&fs_info->dev_replace.rwsem); + /* + * Checking @scrub_pause_req here, we can avoid + * race between committing transaction and scrubbing. + */ + __scrub_blocked_if_needed(fs_info); + atomic_inc(&fs_info->scrubs_running); + atomic_inc(&fs_info->scrubs_logical_running); + mutex_unlock(&fs_info->scrub_lock); + + /* The main work would be implemented. 
*/ + ret = -EOPNOTSUPP; + + atomic_dec(&fs_info->scrubs_running); + atomic_dec(&fs_info->scrubs_logical_running); + wake_up(&fs_info->scrub_pause_wait); + if (progress) + memcpy(progress, &sctx->stat, sizeof(*progress)); +out: + scrub_workers_put(fs_info); +out_free_ctx: + scrub_free_ctx(sctx); + return ret; +} + void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) { mutex_lock(&fs_info->scrub_lock); diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h index 7639103ebf9d..22db0f71083e 100644 --- a/fs/btrfs/scrub.h +++ b/fs/btrfs/scrub.h @@ -6,6 +6,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace); +int btrfs_scrub_logical(struct btrfs_fs_info *fs_info, u64 start, u64 end, + struct btrfs_scrub_progress *progress, bool readonly); void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); int btrfs_scrub_cancel(struct btrfs_fs_info *info); diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index dbb8b96da50d..fa5cb3b3611b 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -186,8 +186,15 @@ struct btrfs_scrub_progress { * Intermittent error. */ }; -#define BTRFS_SCRUB_READONLY 1 -#define BTRFS_SCRUB_SUPPORTED_FLAGS (BTRFS_SCRUB_READONLY) +#define BTRFS_SCRUB_READONLY (1ULL << 0) + +/* + * Regular scrub is based on device, while with this flag, scrub is + * based on logical, and @devid would be ignored. + */ +#define BTRFS_SCRUB_LOGICAL (1ULL << 1) +#define BTRFS_SCRUB_SUPPORTED_FLAGS (BTRFS_SCRUB_READONLY |\ + BTRFS_SCRUB_LOGICAL) struct btrfs_ioctl_scrub_args { __u64 devid; /* in */ __u64 start; /* in */ From patchwork Sat Jun 17 05:07:24 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Qu Wenruo X-Patchwork-Id: 13283413 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id E6716EB64D9 for ; Sat, 17 Jun 2023 05:07:56 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232555AbjFQFHx (ORCPT ); Sat, 17 Jun 2023 01:07:53 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:45808 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232516AbjFQFHv (ORCPT ); Sat, 17 Jun 2023 01:07:51 -0400 Received: from smtp-out2.suse.de (smtp-out2.suse.de [195.135.220.29]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 7F5EC1BDF for ; Fri, 16 Jun 2023 22:07:49 -0700 (PDT) Received: from imap2.suse-dmz.suse.de (imap2.suse-dmz.suse.de [192.168.254.74]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature ECDSA (P-521) server-digest SHA512) (No client certificate requested) by smtp-out2.suse.de (Postfix) with ESMTPS id 3FCCB1FD63 for ; Sat, 17 Jun 2023 05:07:48 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=suse.com; s=susede1; t=1686978468; h=from:from:reply-to:date:date:message-id:message-id:to:to:cc: mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=PH0Gis6TXuJxKrYn0R9rJzc6GckiFG+FkOImQ69gFM8=; b=gBk3nJyXXoCOSYd6mMJk94RgoSpTyBvHC7NEXLzJlELB6sx0NNgLqurCMdQpF0w9+DrCBg 
3BtFJty6eVMiobVicdeKc58hKvPxkt3eQnTzAMG0i2xZPuMDnCinQrMNJJ7qT7ovHCmLrE RVYwWwtsH4fu5PUqo3KpidFeCHjjloE= Received: from imap2.suse-dmz.suse.de (imap2.suse-dmz.suse.de [192.168.254.74]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature ECDSA (P-521) server-digest SHA512) (No client certificate requested) by imap2.suse-dmz.suse.de (Postfix) with ESMTPS id A72C113915 for ; Sat, 17 Jun 2023 05:07:47 +0000 (UTC) Received: from dovecot-director2.suse.de ([192.168.254.65]) by imap2.suse-dmz.suse.de with ESMTPSA id wPsCHqM/jWTFEgAAMHmgww (envelope-from ) for ; Sat, 17 Jun 2023 05:07:47 +0000 From: Qu Wenruo To: linux-btrfs@vger.kernel.org Subject: [PATCH RFC 3/5] btrfs: scrub: extract the common preparation before scrubbing a block group Date: Sat, 17 Jun 2023 13:07:24 +0800 Message-ID: <8fc39e8af0ce2c335f0e41bdd8b616b30ac2d7b9.1686977659.git.wqu@suse.com> X-Mailer: git-send-email 2.41.0 In-Reply-To: References: MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-btrfs@vger.kernel.org Introduce a new helper, prepare_scrub_block_group(), to handle the checks before scrubbing a block group. The helper would be called by both scrub_enumerate_chunks() and scrub_logical_chunks(). Signed-off-by: Qu Wenruo --- fs/btrfs/scrub.c | 328 ++++++++++++++++++++++++++++++----------------- 1 file changed, 209 insertions(+), 119 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 598754ad7ce6..aa68cda5226f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2349,6 +2349,141 @@ static int finish_extent_writes_for_zoned(struct btrfs_root *root, return btrfs_commit_transaction(trans); } +/* + * Do the preparation before we scrub the block group. + * + * Return >0 if we don't need to scrub the block group. + * Return 0 if we have done the preparation and set @ro_set_ret. + * Return <0 for error and can not scrub the bg. + */ +static int prepare_scrub_block_group(struct scrub_ctx *sctx, + struct btrfs_block_group *bg, + bool *ro_set_ret) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_root *root = fs_info->dev_root; + bool ro_set = false; + int ret; + + if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { + if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &bg->runtime_flags)) + return 1; + } + + /* + * Make sure that while we are scrubbing the corresponding block + * group doesn't get its logical address and its device extents + * reused for another block group, which can possibly be of a + * different type and different profile. We do this to prevent + * false error detections and crashes due to bogus attempts to + * repair extents. + */ + spin_lock(&bg->lock); + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { + spin_unlock(&bg->lock); + return 1; + } + btrfs_freeze_block_group(bg); + spin_unlock(&bg->lock); + + /* + * we need call btrfs_inc_block_group_ro() with scrubs_paused, + * to avoid deadlock caused by: + * btrfs_inc_block_group_ro() + * -> btrfs_wait_for_commit() + * -> btrfs_commit_transaction() + * -> btrfs_scrub_pause() + */ + scrub_pause_on(fs_info); + + /* + * Don't do chunk preallocation for scrub. + * + * This is especially important for SYSTEM bgs, or we can hit + * -EFBIG from btrfs_finish_chunk_alloc() like: + * 1. The only SYSTEM bg is marked RO. + * Since SYSTEM bg is small, that's pretty common. + * 2. New SYSTEM bg will be allocated + * Due to regular version will allocate new chunk. + * 3. 
New SYSTEM bg is empty and will get cleaned up + * Before cleanup really happens, it's marked RO again. + * 4. Empty SYSTEM bg get scrubbed + * We go back to 2. + * + * This can easily boost the amount of SYSTEM chunks if cleaner + * thread can't be triggered fast enough, and use up all space + * of btrfs_super_block::sys_chunk_array + * + * While for dev replace, we need to try our best to mark block + * group RO, to prevent race between: + * - Write duplication + * Contains latest data + * - Scrub copy + * Contains data from commit tree + * + * If target block group is not marked RO, nocow writes can + * be overwritten by scrub copy, causing data corruption. + * So for dev-replace, it's not allowed to continue if a block + * group is not RO. + */ + ret = btrfs_inc_block_group_ro(bg, sctx->is_dev_replace); + if (!ret && sctx->is_dev_replace) { + ret = finish_extent_writes_for_zoned(root, bg); + if (ret) { + btrfs_dec_block_group_ro(bg); + scrub_pause_off(fs_info); + return ret; + } + } + + if (ret == 0) { + ro_set = 1; + } else if (ret == -ENOSPC && !sctx->is_dev_replace && + !(bg->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) { + /* + * btrfs_inc_block_group_ro return -ENOSPC when it + * failed in creating new chunk for metadata. + * It is not a problem for scrub, because + * metadata are always cowed, and our scrub paused + * commit_transactions. + * + * For RAID56 chunks, we have to mark them read-only + * for scrub, as later we would use our own cache + * out of RAID56 realm. + * Thus we want the RAID56 bg to be marked RO to + * prevent RMW from screwing up out cache. + */ + ro_set = 0; + } else if (ret == -ETXTBSY) { + btrfs_warn(fs_info, + "skipping scrub of block group %llu due to active swapfile", + bg->start); + btrfs_unfreeze_block_group(bg); + scrub_pause_off(fs_info); + return ret; + } else { + btrfs_warn(fs_info, + "failed setting block group ro: %d", ret); + btrfs_unfreeze_block_group(bg); + scrub_pause_off(fs_info); + return ret; + } + + /* + * Now the target block is marked RO, wait for nocow writes to + * finish before dev-replace. + * COW is fine, as COW never overwrites extents in commit tree. + */ + if (sctx->is_dev_replace) { + btrfs_wait_nocow_writers(bg); + btrfs_wait_ordered_roots(fs_info, U64_MAX, bg->start, + bg->length); + } + scrub_pause_off(fs_info); + *ro_set_ret = ro_set; + return 0; +} + static noinline_for_stack int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 start, u64 end) @@ -2359,7 +2494,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_root *root = fs_info->dev_root; u64 chunk_offset; int ret = 0; - int ro_set; int slot; struct extent_buffer *l; struct btrfs_key key; @@ -2380,6 +2514,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, key.type = BTRFS_DEV_EXTENT_KEY; while (1) { + bool ro_set = false; u64 dev_extent_len; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -2461,127 +2596,20 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, goto skip; } - if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { - if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) { - btrfs_put_block_group(cache); - goto skip; - } + ret = prepare_scrub_block_group(sctx, cache, &ro_set); + if (ret == -ETXTBSY) { + ret = 0; + goto skip_unfreeze; } - - /* - * Make sure that while we are scrubbing the corresponding block - * group doesn't get its logical address and its device extents - * reused for another block group, which can possibly be of a - * different type and different profile. 
We do this to prevent - * false error detections and crashes due to bogus attempts to - * repair extents. - */ - spin_lock(&cache->lock); - if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { - spin_unlock(&cache->lock); + if (ret < 0) { + btrfs_put_block_group(cache); + break; + } + if (ret > 0) { btrfs_put_block_group(cache); goto skip; } - btrfs_freeze_block_group(cache); - spin_unlock(&cache->lock); - /* - * we need call btrfs_inc_block_group_ro() with scrubs_paused, - * to avoid deadlock caused by: - * btrfs_inc_block_group_ro() - * -> btrfs_wait_for_commit() - * -> btrfs_commit_transaction() - * -> btrfs_scrub_pause() - */ - scrub_pause_on(fs_info); - - /* - * Don't do chunk preallocation for scrub. - * - * This is especially important for SYSTEM bgs, or we can hit - * -EFBIG from btrfs_finish_chunk_alloc() like: - * 1. The only SYSTEM bg is marked RO. - * Since SYSTEM bg is small, that's pretty common. - * 2. New SYSTEM bg will be allocated - * Due to regular version will allocate new chunk. - * 3. New SYSTEM bg is empty and will get cleaned up - * Before cleanup really happens, it's marked RO again. - * 4. Empty SYSTEM bg get scrubbed - * We go back to 2. - * - * This can easily boost the amount of SYSTEM chunks if cleaner - * thread can't be triggered fast enough, and use up all space - * of btrfs_super_block::sys_chunk_array - * - * While for dev replace, we need to try our best to mark block - * group RO, to prevent race between: - * - Write duplication - * Contains latest data - * - Scrub copy - * Contains data from commit tree - * - * If target block group is not marked RO, nocow writes can - * be overwritten by scrub copy, causing data corruption. - * So for dev-replace, it's not allowed to continue if a block - * group is not RO. - */ - ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); - if (!ret && sctx->is_dev_replace) { - ret = finish_extent_writes_for_zoned(root, cache); - if (ret) { - btrfs_dec_block_group_ro(cache); - scrub_pause_off(fs_info); - btrfs_put_block_group(cache); - break; - } - } - - if (ret == 0) { - ro_set = 1; - } else if (ret == -ENOSPC && !sctx->is_dev_replace && - !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) { - /* - * btrfs_inc_block_group_ro return -ENOSPC when it - * failed in creating new chunk for metadata. - * It is not a problem for scrub, because - * metadata are always cowed, and our scrub paused - * commit_transactions. - * - * For RAID56 chunks, we have to mark them read-only - * for scrub, as later we would use our own cache - * out of RAID56 realm. - * Thus we want the RAID56 bg to be marked RO to - * prevent RMW from screwing up out cache. - */ - ro_set = 0; - } else if (ret == -ETXTBSY) { - btrfs_warn(fs_info, - "skipping scrub of block group %llu due to active swapfile", - cache->start); - scrub_pause_off(fs_info); - ret = 0; - goto skip_unfreeze; - } else { - btrfs_warn(fs_info, - "failed setting block group ro: %d", ret); - btrfs_unfreeze_block_group(cache); - btrfs_put_block_group(cache); - scrub_pause_off(fs_info); - break; - } - - /* - * Now the target block is marked RO, wait for nocow writes to - * finish before dev-replace. - * COW is fine, as COW never overwrites extents in commit tree. 
- */ - if (sctx->is_dev_replace) { - btrfs_wait_nocow_writers(cache); - btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, - cache->length); - } - - scrub_pause_off(fs_info); down_write(&dev_replace->rwsem); dev_replace->cursor_right = found_key.offset + dev_extent_len; dev_replace->cursor_left = found_key.offset; @@ -2960,6 +2988,69 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return ret; } +static int scrub_logical_chunks(struct scrub_ctx *sctx, u64 start, u64 end) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + u64 cur = start; + int ret = 0; + + while (cur < end) { + struct btrfs_block_group *bg; + bool ro_set = false; + + bg = btrfs_lookup_first_block_group(fs_info, cur); + + /* No more block groups. */ + if (!bg) + break; + if (bg->start > end) + break; + + ret = prepare_scrub_block_group(sctx, bg, &ro_set); + if (ret == -ETXTBSY) { + ret = 0; + goto next; + } + if (ret > 0) + goto next; + if (ret < 0) { + btrfs_put_block_group(bg); + break; + } + + /* The real work starts here. */ + ret = -EOPNOTSUPP; + + if (ro_set) + btrfs_dec_block_group_ro(bg); + /* + * We might have prevented the cleaner kthread from deleting + * this block group if it was already unused because we raced + * and set it to RO mode first. So add it back to the unused + * list, otherwise it might not ever be deleted unless a manual + * balance is triggered or it becomes used and unused again. + */ + spin_lock(&bg->lock); + if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags) && + !bg->ro && bg->reserved == 0 && bg->used == 0) { + spin_unlock(&bg->lock); + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_queue_work(&fs_info->discard_ctl, bg); + else + btrfs_mark_bg_unused(bg); + } else { + spin_unlock(&bg->lock); + } + btrfs_unfreeze_block_group(bg); +next: + cur = bg->start + bg->length; + btrfs_put_block_group(bg); + if (ret < 0) + break; + } + return ret; +} + int btrfs_scrub_logical(struct btrfs_fs_info *fs_info, u64 start, u64 end, struct btrfs_scrub_progress *progress, bool readonly) { @@ -3006,8 +3097,7 @@ int btrfs_scrub_logical(struct btrfs_fs_info *fs_info, u64 start, u64 end, atomic_inc(&fs_info->scrubs_logical_running); mutex_unlock(&fs_info->scrub_lock); - /* The main work would be implemented. 
*/
-	ret = -EOPNOTSUPP;
+	ret = scrub_logical_chunks(sctx, start, end);

 	atomic_dec(&fs_info->scrubs_running);
 	atomic_dec(&fs_info->scrubs_logical_running);

From patchwork Sat Jun 17 05:07:25 2023
X-Patchwork-Submitter: Qu Wenruo
X-Patchwork-Id: 13283414
From: Qu Wenruo
To: linux-btrfs@vger.kernel.org
Subject: [PATCH RFC 4/5] btrfs: scrub: implement the basic extent iteration code
Date: Sat, 17 Jun 2023 13:07:25 +0800
Message-ID: <19112293cb4c76cb407e70a29e05701cbba7e0a8.1686977659.git.wqu@suse.com>
X-Mailing-List: linux-btrfs@vger.kernel.org

Unlike the per-device scrub code, the logical bytenr based scrub code has
a much easier time iterating the extents. The difference is mainly in the
stripe queueing part, as we need to queue @nr_copies stripes at once.

So here we introduce a helper, scrub_copy_stripe(): fill the first stripe,
then use that helper to fill the remaining copies.

For now we still use the same flush_scrub_stripes(), but the repair part
is less efficient.
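To make the queueing rule concrete, here is a reduced, self-contained
userspace sketch of the idea; the names (struct stripe,
queue_logical_stripes()) are made up for illustration and this is not the
kernel code. The real logic lives in queue_scrub_logical_stripes() and
scrub_copy_stripe() in the diff below: fill the first mirror, then clone it
into the remaining nr_copies - 1 slots with only mirror_num changed.

#include <stdio.h>
#include <string.h>

struct stripe {
	unsigned long long logical;
	int mirror_num;
	char csums[32];		/* stands in for the checksum buffer */
};

static void copy_stripe(const struct stripe *src, struct stripe *dst)
{
	/* Same logical range and csums, caller adjusts mirror_num. */
	memcpy(dst, src, sizeof(*dst));
}

static int queue_logical_stripes(struct stripe *slots, int *cur, int total,
				 unsigned long long logical, int nr_copies)
{
	if (*cur + nr_copies > total)
		return -1;	/* caller must flush the queued stripes first */

	slots[*cur] = (struct stripe){ .logical = logical, .mirror_num = 1 };
	for (int i = 1; i < nr_copies; i++) {
		copy_stripe(&slots[*cur], &slots[*cur + i]);
		slots[*cur + i].mirror_num = i + 1;
	}
	*cur += nr_copies;
	return 0;
}

int main(void)
{
	struct stripe slots[8];
	int cur = 0;

	/* A two-copy profile queues two stripes for the same logical range. */
	queue_logical_stripes(slots, &cur, 8, 1 << 30, 2);
	for (int i = 0; i < cur; i++)
		printf("logical=%llu mirror=%d\n", slots[i].logical,
		       slots[i].mirror_num);
	return 0;
}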
Signed-off-by: Qu Wenruo --- fs/btrfs/scrub.c | 163 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 160 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index aa68cda5226f..2500737adff1 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1465,7 +1465,7 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, struct btrfs_device *dev, u64 physical, int mirror_num, u64 logical_start, - u32 logical_len, + u64 logical_len, struct scrub_stripe *stripe) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -2988,6 +2988,164 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return ret; } +static void scrub_copy_stripe(const struct scrub_stripe *src, + struct scrub_stripe *dst) +{ + struct scrub_ctx *sctx = src->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; + + /* This function should only be called for logical scrub. */ + ASSERT(sctx->scrub_logical); + scrub_reset_stripe(dst); + + dst->bg = src->bg; + dst->dev = NULL; + dst->physical = 0; + dst->logical = src->logical; + dst->mirror_num = src->mirror_num; + + bitmap_copy(&dst->extent_sector_bitmap, &src->extent_sector_bitmap, + src->nr_sectors); + memcpy(dst->csums, src->csums, + (BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits) * + fs_info->csum_size); + + for (int i = 0; i < src->nr_sectors; i++) { + dst->sectors[i].is_metadata = src->sectors[i].is_metadata; + if (src->sectors[i].is_metadata) + dst->sectors[i].generation = src->sectors[i].generation; + else if (src->sectors[i].csum) + dst->sectors[i].csum = dst->csums + i * fs_info->csum_size; + } + dst->nr_data_extents = src->nr_data_extents; + dst->nr_meta_extents = src->nr_meta_extents; + set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &dst->state); +} + +/* + * Unlike queue_scrub_stripe() which only queue one stripe, this queue all + * mirrors for non-RAID56 profiles, or the full stripe for RAID56. + */ +static int queue_scrub_logical_stripes(struct scrub_ctx *sctx, + struct btrfs_block_group *bg, u64 logical) +{ + const int raid_index = btrfs_bg_flags_to_raid_index(bg->flags); + const int nr_copies = btrfs_raid_array[raid_index].ncopies; + const int old_cur = sctx->cur_stripe; + struct scrub_stripe *stripe; + int ret; + + ASSERT(sctx->stripes); + ASSERT(sctx->nr_stripes); + + if (sctx->cur_stripe + nr_copies > sctx->nr_stripes) { + ret = flush_scrub_stripes(sctx); + if (ret < 0) + return ret; + } + + stripe = &sctx->stripes[old_cur]; + + scrub_reset_stripe(stripe); + ret = scrub_find_fill_first_stripe(bg, NULL, 0, 1, + logical, bg->start + bg->length - logical, stripe); + /* Either >0 as no more extents or <0 for error. */ + if (ret) + return ret; + + /* For the remaining slots, just copy the above mirror. 
*/ + for (int i = 1; i < nr_copies; i++) { + struct scrub_stripe *dst = &sctx->stripes[old_cur + i]; + + scrub_copy_stripe(stripe, dst); + dst->mirror_num = i + 1; + } + sctx->cur_stripe += nr_copies; + return 0; +} + +static int scrub_logical_one_chunk(struct scrub_ctx *sctx, + struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; + struct extent_map *em; + u64 cur = bg->start; + const int raid_index = btrfs_bg_flags_to_raid_index(bg->flags); + const int nr_copies = btrfs_raid_array[raid_index].ncopies; + int nr_stripes; + int ret = 0; + int flush_ret; + + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, bg->start, bg->length); + read_unlock(&map_tree->lock); + if (!em) { + /* + * Might have been an unused block group deleted by the cleaner + * kthread or relocation. + */ + spin_lock(&bg->lock); + if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) + ret = -EINVAL; + spin_unlock(&bg->lock); + return ret; + } + + scrub_blocked_if_needed(fs_info); + + /* RAID56 not yet supported. */ + if (bg->flags & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EOPNOTSUPP; + goto out; + } + + nr_stripes = SCRUB_STRIPES_PER_SCTX * nr_copies; + ret = alloc_scrub_stripes(sctx, nr_stripes); + if (ret < 0) + goto out; + + while (cur < bg->start + bg->length) { + /* Canceled? */ + if (atomic_read(&fs_info->scrub_cancel_req) || + atomic_read(&sctx->cancel_req)) { + ret = -ECANCELED; + break; + } + /* Paused? */ + if (atomic_read(&fs_info->scrub_pause_req)) { + /* Push queued extents */ + scrub_blocked_if_needed(fs_info); + } + /* Block group removed? */ + spin_lock(&bg->lock); + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { + spin_unlock(&bg->lock); + ret = 0; + break; + } + spin_unlock(&bg->lock); + + ret = queue_scrub_logical_stripes(sctx, bg, cur); + if (ret > 0) { + ret = 0; + break; + } + if (ret < 0) + break; + ASSERT(sctx->cur_stripe >= 1); + cur = sctx->stripes[sctx->cur_stripe - 1].logical + BTRFS_STRIPE_LEN; + } +out: + flush_ret = flush_scrub_stripes(sctx); + if (!ret) + ret = flush_ret; + free_scrub_stripes(sctx); + free_extent_map(em); + return ret; + +} + static int scrub_logical_chunks(struct scrub_ctx *sctx, u64 start, u64 end) { struct btrfs_fs_info *fs_info = sctx->fs_info; @@ -3018,8 +3176,7 @@ static int scrub_logical_chunks(struct scrub_ctx *sctx, u64 start, u64 end) break; } - /* The real work starts here. 
*/
-	ret = -EOPNOTSUPP;
+	ret = scrub_logical_one_chunk(sctx, bg);

 	if (ro_set)
 		btrfs_dec_block_group_ro(bg);

From patchwork Sat Jun 17 05:07:26 2023
X-Patchwork-Submitter: Qu Wenruo
X-Patchwork-Id: 13283415
From: Qu Wenruo
To: linux-btrfs@vger.kernel.org
Subject: [PATCH RFC 5/5] btrfs: scrub: implement the repair part of scrub logical
Date: Sat, 17 Jun 2023 13:07:26 +0800
Message-ID: <7142c08a9a4fe142abe5f77641cdaf26a4da55c1.1686977659.git.wqu@suse.com>
X-Mailing-List: linux-btrfs@vger.kernel.org

The repair part of scrub logical is done differently from the per-device
counterpart (a condensed sketch of the per-sector loop follows below):

- We read out all mirrors in one go.
  Since it's no longer per-device, we just read out all mirrors.

- Find a good mirror for the same sectornr across all mirrors.

- Copy the good content to any corrupted sector.

This has several advantages:

- Less IO wait.
  Since all IOs are submitted at the very beginning, we avoid the
  read-then-wait of per-device scrub. This applies to both the read and
  write parts.
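As a condensed illustration of that per-sector repair loop, here is a
self-contained userspace sketch; the types and names (struct mirror,
repair_group()) are invented for the example and this is not the kernel
implementation, which is repair_one_mirror_group() in the diff below.

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

#define NR_SECTORS	4
#define SECTOR_SIZE	8

struct mirror {
	unsigned char data[NR_SECTORS][SECTOR_SIZE];
	bool bad[NR_SECTORS];	/* stands in for the per-stripe error bitmaps */
};

static void repair_group(struct mirror *mirrors, int nr_mirrors)
{
	for (int sector = 0; sector < NR_SECTORS; sector++) {
		int good = -1;

		/* Find one good copy of this sector among all mirrors. */
		for (int m = 0; m < nr_mirrors; m++) {
			if (!mirrors[m].bad[sector]) {
				good = m;
				break;
			}
		}
		/* No good copy: this vertical stripe cannot be repaired. */
		if (good < 0)
			continue;
		/* Copy the good content over every corrupted copy. */
		for (int m = 0; m < nr_mirrors; m++) {
			if (!mirrors[m].bad[sector])
				continue;
			memcpy(mirrors[m].data[sector],
			       mirrors[good].data[sector], SECTOR_SIZE);
			mirrors[m].bad[sector] = false;
		}
	}
}

int main(void)
{
	struct mirror m[2] = { 0 };

	memcpy(m[0].data[1], "GOODDATA", SECTOR_SIZE);
	m[1].bad[1] = true;			/* corrupted copy on mirror 2 */
	repair_group(m, 2);
	printf("repaired: %.8s\n", m[1].data[1]);
	return 0;
}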
This needs some changes to the per-device scrub code though: - Call the scrub_verify_one_stripe() inside scrub_read_endio() This is to improve the performance, as we can have csum verification per-mirror. - Do not queue scrub_stripe_read_repair_worker() workload for scrub_logical Since we don't need to go per-device repair path. Signed-off-by: Qu Wenruo --- fs/btrfs/scrub.c | 141 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 133 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2500737adff1..cdde01d55e1a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -988,7 +988,6 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) ASSERT(stripe->mirror_num > 0); wait_scrub_stripe_io(stripe); - scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap); /* Save the initial failed bitmap for later repair and report usage. */ stripe->init_error_bitmap = stripe->error_bitmap; stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap, @@ -1061,9 +1060,12 @@ static void scrub_read_endio(struct btrfs_bio *bbio) } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) { + scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap); wake_up(&stripe->io_wait); - INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); - queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); + if (!stripe->sctx->scrub_logical) { + INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); + queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); + } } } @@ -1649,7 +1651,127 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe) return false; } -static int flush_scrub_stripes(struct scrub_ctx *sctx) +/* + * Unlike the per-device repair, we have all mirrors read out already. + * + * Thus we only need to find out the good mirror, and copy the content to + * any bad sectors. + */ +static void repair_one_mirror_group(struct scrub_ctx *sctx, int start_stripe, + int ncopies) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct scrub_stripe *first_stripe = &sctx->stripes[start_stripe]; + struct scrub_stripe *cur_stripe; + const u32 sectorsize = fs_info->sectorsize; + int sectornr; + + ASSERT(start_stripe + ncopies <= sctx->cur_stripe); + + for_each_set_bit(sectornr, &first_stripe->extent_sector_bitmap, + first_stripe->nr_sectors) { + struct scrub_stripe *good_stripe; + int good_mirror = -1; + + for (int cur_mirror = start_stripe; + cur_mirror < start_stripe + ncopies; cur_mirror++) { + cur_stripe = &sctx->stripes[cur_mirror]; + + if (!test_bit(sectornr, &cur_stripe->error_bitmap)) { + good_mirror = cur_mirror; + break; + } + } + /* No good mirror found, this vertical stripe can not be repaired. */ + if (good_mirror < 0) + continue; + + good_stripe = &sctx->stripes[good_mirror]; + + for (int cur_mirror = start_stripe; + cur_mirror < start_stripe + ncopies; cur_mirror++) { + cur_stripe = &sctx->stripes[cur_mirror]; + + if (!test_bit(sectornr, &cur_stripe->error_bitmap)) + continue; + /* Repair from the good mirror. 
*/ + memcpy_page(scrub_stripe_get_page(cur_stripe, sectornr), + scrub_stripe_get_page_offset(cur_stripe, sectornr), + scrub_stripe_get_page(good_stripe, sectornr), + scrub_stripe_get_page_offset(good_stripe, sectornr), + sectorsize); + clear_bit(sectornr, &cur_stripe->error_bitmap); + clear_bit(sectornr, &cur_stripe->io_error_bitmap); + if (cur_stripe->sectors[sectornr].is_metadata) + clear_bit(sectornr, &cur_stripe->meta_error_bitmap); + else + clear_bit(sectornr, &cur_stripe->csum_error_bitmap); + } + } + for (int cur_mirror = start_stripe; cur_mirror < start_stripe + ncopies; + cur_mirror++) { + cur_stripe = &sctx->stripes[cur_mirror]; + set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &cur_stripe->state); + scrub_stripe_report_errors(sctx, cur_stripe); + wake_up(&cur_stripe->repair_wait); + + if (btrfs_is_zoned(fs_info)) { + if (!bitmap_empty(&cur_stripe->init_error_bitmap, + cur_stripe->nr_sectors)) { + btrfs_repair_one_zone(fs_info, cur_stripe->logical); + break; + } + } + if (!sctx->readonly) { + unsigned long repaired; + + bitmap_andnot(&repaired, &cur_stripe->init_error_bitmap, + &cur_stripe->error_bitmap, + cur_stripe->nr_sectors); + scrub_write_sectors(sctx, cur_stripe, repaired, false); + } + } + /* Wait for above writeback to finish. */ + for (int cur_mirror = start_stripe; cur_mirror < start_stripe + ncopies; + cur_mirror++) { + cur_stripe = &sctx->stripes[cur_mirror]; + + wait_scrub_stripe_io(cur_stripe); + } +} + +static int handle_logical_stripes(struct scrub_ctx *sctx, + struct btrfs_block_group *bg) +{ + const int nr_stripes = sctx->cur_stripe; + const int raid_index = btrfs_bg_flags_to_raid_index(bg->flags); + const int ncopies = btrfs_raid_array[raid_index].ncopies; + struct scrub_stripe *stripe; + + for (int i = 0 ; i < nr_stripes; i++) { + stripe = &sctx->stripes[i]; + + wait_scrub_stripe_io(stripe); + + /* Save the initial failed bitmap for later repair and report usage. */ + stripe->init_error_bitmap = stripe->error_bitmap; + stripe->init_nr_io_errors = + bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors); + stripe->init_nr_csum_errors = + bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors); + stripe->init_nr_meta_errors = + bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors); + } + + for (int i = 0 ; i < nr_stripes; i += ncopies) + repair_one_mirror_group(sctx, i, ncopies); + sctx->cur_stripe = 0; + + return 0; +} + +static int flush_scrub_stripes(struct scrub_ctx *sctx, + struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = sctx->fs_info; struct scrub_stripe *stripe; @@ -1660,6 +1782,9 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) if (!nr_stripes) return 0; + if (sctx->scrub_logical) + return handle_logical_stripes(sctx, bg); + ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, @@ -1756,7 +1881,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * /* No available slot, submit all stripes and wait for them. 
*/ if (sctx->cur_stripe >= sctx->nr_stripes) { - ret = flush_scrub_stripes(sctx); + ret = flush_scrub_stripes(sctx, bg); if (ret < 0) return ret; } @@ -2256,7 +2381,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, break; } out: - ret2 = flush_scrub_stripes(sctx); + ret2 = flush_scrub_stripes(sctx, bg); if (!ret) ret = ret2; if (sctx->raid56_data_stripes) { @@ -3039,7 +3164,7 @@ static int queue_scrub_logical_stripes(struct scrub_ctx *sctx, ASSERT(sctx->nr_stripes); if (sctx->cur_stripe + nr_copies > sctx->nr_stripes) { - ret = flush_scrub_stripes(sctx); + ret = flush_scrub_stripes(sctx, bg); if (ret < 0) return ret; } @@ -3137,7 +3262,7 @@ static int scrub_logical_one_chunk(struct scrub_ctx *sctx, cur = sctx->stripes[sctx->cur_stripe - 1].logical + BTRFS_STRIPE_LEN; } out: - flush_ret = flush_scrub_stripes(sctx); + flush_ret = flush_scrub_stripes(sctx, bg); if (!ret) ret = flush_ret; free_scrub_stripes(sctx);