diff mbox series

btrfs: fix dev-replace after the scrub rework

Message ID 0113e9e82b06106940e8ef7323fd4a9c01aa5afc.1685610531.git.wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series btrfs: fix dev-replace after the scrub rework | expand

Commit Message

Qu Wenruo June 1, 2023, 9:10 a.m. UTC
[BUG]
After commit e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror()
to scrub_stripe infrastructure"), scrub no longer works for zoned device
at all.

Even an empty zoned btrfs can not be replaced:

 # mkfs.btrfs -f /dev/nvme0n1
 # mount /dev/nvme0n1 /mnt/btrfs
 # btrfs replace start -Bf 1 /dev/nvme0n2 /mnt/btrfs
 Resetting device zones /dev/nvme1n1 (160 zones) ...
 ERROR: ioctl(DEV_REPLACE_START) failed on "/mnt/btrfs/": Input/output error

And we can hit kernel crash related to that:

 BTRFS info (device nvme1n1): host-managed zoned block device /dev/nvme3n1, 160 zones of 134217728 bytes
 BTRFS info (device nvme1n1): dev_replace from /dev/nvme2n1 (devid 2) to /dev/nvme3n1 started
 nvme3n1: Zone Management Append(0x7d) @ LBA 65536, 4 blocks, Zone Is Full (sct 0x1 / sc 0xb9) DNR
 I/O error, dev nvme3n1, sector 786432 op 0xd:(ZONE_APPEND) flags 0x4000 phys_seg 3 prio class 2
 BTRFS error (device nvme1n1): bdev /dev/nvme3n1 errs: wr 1, rd 0, flush 0, corrupt 0, gen 0
 BUG: kernel NULL pointer dereference, address: 00000000000000a8
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
 RIP: 0010:_raw_spin_lock_irqsave+0x1e/0x40
 Call Trace:
  <IRQ>
  btrfs_lookup_ordered_extent+0x31/0x190
  btrfs_record_physical_zoned+0x18/0x40
  btrfs_simple_end_io+0xaf/0xc0
  blk_update_request+0x153/0x4c0
  blk_mq_end_request+0x15/0xd0
  nvme_poll_cq+0x1d3/0x360
  nvme_irq+0x39/0x80
  __handle_irq_event_percpu+0x3b/0x190
  handle_irq_event+0x2f/0x70
  handle_edge_irq+0x7c/0x210
  __common_interrupt+0x34/0xa0
  common_interrupt+0x7d/0xa0
  </IRQ>
  <TASK>
  asm_common_interrupt+0x22/0x40

[CAUSE]
Dev-replace reuses scrub code to iterate all extents and write the
existing content back to the new device.

And for zoned devices, we call fill_writer_pointer_gap() to make sure
all the writes into the zoned device is sequential, even if there may be
some gaps between the writes.

However we have several different bugs all related to zoned dev-replace:

- We are using ZONE_APPEND operation for metadata style write back
  For zoned devices, btrfs has two ways to write data:

  * ZONE_APPEND for data
    This allows higher queue depth, but will not be able to know where
    the write would land.
    Thus needs to grab the real on-disk physical location in it's endio.

  * WRITE for metadata
    This requires single queue depth (new writes can only be submitted
    after previous one finished), and all writes must be sequential.

  For scrub, we go single queue depth, but still goes with ZONE_APPEND,
  which requires btrfs_bio::inode being populated.
  This is the cause of that crash.

- No correct tracing of write_pointer
  After a write finished, we should forward sctx->write_pointer, or
  fill_writer_pointer_gap() would not work properly and cause more
  than necessary zero out, and fill the whole zone prematurely.

- Incorrect physical bytenr passed to fill_writer_pointer_gap()
  In scrub_write_sectors(), one call site passes logical address, which
  is completely wrong.

  The other call site passes physical address of current sector, but
  we should pass the physical address of the btrfs_bio we're submitting.

  This is the cause of the -EIO errors.

[FIX]
- Do not use ZONE_APPEND for btrfs_submit_repair_write().

- Manually forward sctx->write_pointer after success writeback

- Use the physical address of the to-be-submitted btrfs_bio for
  fill_writer_pointer_gap()

Now zoned device replace would work as expected.

Reported-by: Christoph Hellwig <hch@lst.de>
Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure")
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/bio.c   |  4 ----
 fs/btrfs/scrub.c | 36 ++++++++++++++++++++++++++++++------
 2 files changed, 30 insertions(+), 10 deletions(-)

Comments

Christoph Hellwig June 1, 2023, 9:37 a.m. UTC | #1
This fixes the tests case for me.  I'd much prefer to not
duplicate all this logic though, so something like this should
be folded in:

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4e74105bc97191..6fbf30bea27ab0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1102,6 +1102,33 @@ static void scrub_write_endio(struct btrfs_bio *bbio)
 		wake_up(&stripe->io_wait);
 }
 
+static void btrfs_submit_scrub_bio(struct scrub_ctx *sctx,
+				   struct scrub_stripe *stripe,
+				   struct btrfs_bio *bbio,
+				   bool dev_replace)
+{
+	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+	u32 bio_len = bbio->bio.bi_iter.bi_size;
+	u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
+		stripe->logical;
+
+	fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
+	atomic_inc(&stripe->pending_io);
+	btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
+
+	/* For zoned writeback, queue depth must be 1. */
+	if (!btrfs_is_zoned(fs_info))
+		return;
+
+	/*
+	 * If the write finished without error, forward the write pointer.
+	 */
+	wait_scrub_stripe_io(stripe);
+	if (!test_bit(bio_off >> fs_info->sectorsize_bits,
+		      &stripe->write_error_bitmap))
+		sctx->write_pointer += bio_len;
+}
+
 /*
  * Submit the write bio(s) for the sectors specified by @write_bitmap.
  *
@@ -1120,7 +1147,6 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
 {
 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
 	struct btrfs_bio *bbio = NULL;
-	const bool zoned = btrfs_is_zoned(fs_info);
 	int sector_nr;
 
 	for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
@@ -1133,25 +1159,7 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
 
 		/* Cannot merge with previous sector, submit the current one. */
 		if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
-			u32 bio_len = bbio->bio.bi_iter.bi_size;
-			u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
-				      stripe->logical;
-
-			fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
-			atomic_inc(&stripe->pending_io);
-			btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
-			/* For zoned writeback, queue depth must be 1. */
-			if (zoned) {
-				wait_scrub_stripe_io(stripe);
-
-				/*
-				 * Write finished without error, forward the
-				 * write pointer.
-				 */
-				if (!test_bit(bio_off >> fs_info->sectorsize_bits,
-					     &stripe->write_error_bitmap))
-					sctx->write_pointer += bio_len;
-			}
+			btrfs_submit_scrub_bio(sctx, stripe, bbio, dev_replace);
 			bbio = NULL;
 		}
 		if (!bbio) {
@@ -1164,26 +1172,9 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
 		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
 		ASSERT(ret == fs_info->sectorsize);
 	}
-	if (bbio) {
-		u32 bio_len = bbio->bio.bi_iter.bi_size;
-		u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
-			      stripe->logical;
 
-		fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
-		atomic_inc(&stripe->pending_io);
-		btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
-		if (zoned) {
-			wait_scrub_stripe_io(stripe);
-
-			/*
-			 * Write finished without error, forward the
-			 * write pointer.
-			 */
-			if (!test_bit(bio_off >> fs_info->sectorsize_bits,
-				     &stripe->write_error_bitmap))
-				sctx->write_pointer += bio_len;
-		}
-	}
+	if (bbio)
+		btrfs_submit_scrub_bio(sctx, stripe, bbio, dev_replace);
 }
 
 /*
Qu Wenruo June 1, 2023, 10:12 a.m. UTC | #2
On 2023/6/1 17:37, Christoph Hellwig wrote:
> This fixes the tests case for me.  I'd much prefer to not
> duplicate all this logic though, so something like this should
> be folded in:

Yes, the cleanup is also in my mind, but currently I prefer to do the
fix first in-place, then you can do the tide up after the more critical fix.

Thanks,
Qu
>
> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> index 4e74105bc97191..6fbf30bea27ab0 100644
> --- a/fs/btrfs/scrub.c
> +++ b/fs/btrfs/scrub.c
> @@ -1102,6 +1102,33 @@ static void scrub_write_endio(struct btrfs_bio *bbio)
>   		wake_up(&stripe->io_wait);
>   }
>
> +static void btrfs_submit_scrub_bio(struct scrub_ctx *sctx,
> +				   struct scrub_stripe *stripe,
> +				   struct btrfs_bio *bbio,
> +				   bool dev_replace)
> +{
> +	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
> +	u32 bio_len = bbio->bio.bi_iter.bi_size;
> +	u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
> +		stripe->logical;
> +
> +	fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
> +	atomic_inc(&stripe->pending_io);
> +	btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
> +
> +	/* For zoned writeback, queue depth must be 1. */
> +	if (!btrfs_is_zoned(fs_info))
> +		return;
> +
> +	/*
> +	 * If the write finished without error, forward the write pointer.
> +	 */
> +	wait_scrub_stripe_io(stripe);
> +	if (!test_bit(bio_off >> fs_info->sectorsize_bits,
> +		      &stripe->write_error_bitmap))
> +		sctx->write_pointer += bio_len;
> +}
> +
>   /*
>    * Submit the write bio(s) for the sectors specified by @write_bitmap.
>    *
> @@ -1120,7 +1147,6 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
>   {
>   	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
>   	struct btrfs_bio *bbio = NULL;
> -	const bool zoned = btrfs_is_zoned(fs_info);
>   	int sector_nr;
>
>   	for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
> @@ -1133,25 +1159,7 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
>
>   		/* Cannot merge with previous sector, submit the current one. */
>   		if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
> -			u32 bio_len = bbio->bio.bi_iter.bi_size;
> -			u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
> -				      stripe->logical;
> -
> -			fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
> -			atomic_inc(&stripe->pending_io);
> -			btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
> -			/* For zoned writeback, queue depth must be 1. */
> -			if (zoned) {
> -				wait_scrub_stripe_io(stripe);
> -
> -				/*
> -				 * Write finished without error, forward the
> -				 * write pointer.
> -				 */
> -				if (!test_bit(bio_off >> fs_info->sectorsize_bits,
> -					     &stripe->write_error_bitmap))
> -					sctx->write_pointer += bio_len;
> -			}
> +			btrfs_submit_scrub_bio(sctx, stripe, bbio, dev_replace);
>   			bbio = NULL;
>   		}
>   		if (!bbio) {
> @@ -1164,26 +1172,9 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
>   		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
>   		ASSERT(ret == fs_info->sectorsize);
>   	}
> -	if (bbio) {
> -		u32 bio_len = bbio->bio.bi_iter.bi_size;
> -		u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
> -			      stripe->logical;
>
> -		fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
> -		atomic_inc(&stripe->pending_io);
> -		btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
> -		if (zoned) {
> -			wait_scrub_stripe_io(stripe);
> -
> -			/*
> -			 * Write finished without error, forward the
> -			 * write pointer.
> -			 */
> -			if (!test_bit(bio_off >> fs_info->sectorsize_bits,
> -				     &stripe->write_error_bitmap))
> -				sctx->write_pointer += bio_len;
> -		}
> -	}
> +	if (bbio)
> +		btrfs_submit_scrub_bio(sctx, stripe, bbio, dev_replace);
>   }
>
>   /*
Christoph Hellwig June 1, 2023, 10:22 a.m. UTC | #3
On Thu, Jun 01, 2023 at 06:12:12PM +0800, Qu Wenruo wrote:
> Yes, the cleanup is also in my mind, but currently I prefer to do the
> fix first in-place, then you can do the tide up after the more critical fix.

There isn't really much of a difference in the diffstat between
doing the consolidation and duplicating everything.  So I don't
think that's much of an argument.
Qu Wenruo June 1, 2023, 10:34 a.m. UTC | #4
On 2023/6/1 18:22, Christoph Hellwig wrote:
> On Thu, Jun 01, 2023 at 06:12:12PM +0800, Qu Wenruo wrote:
>> Yes, the cleanup is also in my mind, but currently I prefer to do the
>> fix first in-place, then you can do the tide up after the more critical fix.
>
> There isn't really much of a difference in the diffstat between
> doing the consolidation and duplicating everything.  So I don't
> think that's much of an argument.

Indeed, I'll update the patch to cleanup the call sites.

Thanks,
Qu
diff mbox series

Patch

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 85511a8a4801..c9b4aa339b4b 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -785,10 +785,6 @@  void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_
 		goto fail;
 
 	if (dev_replace) {
-		if (btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE && btrfs_is_zoned(fs_info)) {
-			bbio->bio.bi_opf &= ~REQ_OP_WRITE;
-			bbio->bio.bi_opf |= REQ_OP_ZONE_APPEND;
-		}
 		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
 		smap.dev = fs_info->dev_replace.tgtdev;
 	}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 8fce48d9e07a..4e74105bc971 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1133,13 +1133,25 @@  static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
 
 		/* Cannot merge with previous sector, submit the current one. */
 		if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
-			fill_writer_pointer_gap(sctx, stripe->physical +
-					(sector_nr << fs_info->sectorsize_bits));
+			u32 bio_len = bbio->bio.bi_iter.bi_size;
+			u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
+				      stripe->logical;
+
+			fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
 			atomic_inc(&stripe->pending_io);
 			btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
 			/* For zoned writeback, queue depth must be 1. */
-			if (zoned)
+			if (zoned) {
 				wait_scrub_stripe_io(stripe);
+
+				/*
+				 * Write finished without error, forward the
+				 * write pointer.
+				 */
+				if (!test_bit(bio_off >> fs_info->sectorsize_bits,
+					     &stripe->write_error_bitmap))
+					sctx->write_pointer += bio_len;
+			}
 			bbio = NULL;
 		}
 		if (!bbio) {
@@ -1153,12 +1165,24 @@  static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
 		ASSERT(ret == fs_info->sectorsize);
 	}
 	if (bbio) {
-		fill_writer_pointer_gap(sctx, bbio->bio.bi_iter.bi_sector <<
-					SECTOR_SHIFT);
+		u32 bio_len = bbio->bio.bi_iter.bi_size;
+		u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
+			      stripe->logical;
+
+		fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
 		atomic_inc(&stripe->pending_io);
 		btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
-		if (zoned)
+		if (zoned) {
 			wait_scrub_stripe_io(stripe);
+
+			/*
+			 * Write finished without error, forward the
+			 * write pointer.
+			 */
+			if (!test_bit(bio_off >> fs_info->sectorsize_bits,
+				     &stripe->write_error_bitmap))
+				sctx->write_pointer += bio_len;
+		}
 	}
 }