diff mbox series

[3/3] btrfs: only write the sectors in the vertical stripe which has data stripes

Message ID 9ad6484a6896b6f15b41daa85ff5f10337acaef5.1653636443.git.wqu@suse.com (mailing list archive)
State New
Headers show
Series btrfs: raid56: reduce unnecessary parity writes | expand

Commit Message

Qu Wenruo May 27, 2022, 7:28 a.m. UTC
If we have only 8K partial write at the beginning of a full RAID56
stripe, we will write the following contents:

                    0  8K           32K             64K
Disk 1	(data):     |XX|            |               |
Disk 2  (data):     |               |               |
Disk 3  (parity):   |XXXXXXXXXXXXXXX|XXXXXXXXXXXXXXX|

|X| means the sector will be written back to disk.

Note that, although we won't write any sectors from disk 2, but we will
write the full 64KiB of parity to disk.

This behavior is fine for now, but not for the future (especially for
RAID56J, as we waste quite some space to journal the unused parity
stripes).

So here we will also utilize the btrfs_raid_bio::dbitmap, anytime we
queue a higher level bio into an rbio, we will update rbio::dbitmap to
indicate which vertical stripes we need to writeback.

And at finish_rmw(), we also check dbitmap to see if we need to write
any sector in the veritical stripe.

So after the patch, above example will only lead to the following
writeback patter:

                    0  8K           32K             64K
Disk 1	(data):     |XX|            |               |
Disk 2  (data):     |               |               |
Disk 3  (parity):   |XX|            |               |

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/raid56.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 4 deletions(-)

Comments

Qu Wenruo May 31, 2022, 10:42 a.m. UTC | #1
On 2022/5/27 15:28, Qu Wenruo wrote:
> If we have only 8K partial write at the beginning of a full RAID56
> stripe, we will write the following contents:
> 
>                      0  8K           32K             64K
> Disk 1	(data):     |XX|            |               |
> Disk 2  (data):     |               |               |
> Disk 3  (parity):   |XXXXXXXXXXXXXXX|XXXXXXXXXXXXXXX|
> 
> |X| means the sector will be written back to disk.
> 
> Note that, although we won't write any sectors from disk 2, but we will
> write the full 64KiB of parity to disk.
> 
> This behavior is fine for now, but not for the future (especially for
> RAID56J, as we waste quite some space to journal the unused parity
> stripes).
> 
> So here we will also utilize the btrfs_raid_bio::dbitmap, anytime we
> queue a higher level bio into an rbio, we will update rbio::dbitmap to
> indicate which vertical stripes we need to writeback.
> 
> And at finish_rmw(), we also check dbitmap to see if we need to write
> any sector in the veritical stripe.
> 
> So after the patch, above example will only lead to the following
> writeback patter:
> 
>                      0  8K           32K             64K
> Disk 1	(data):     |XX|            |               |
> Disk 2  (data):     |               |               |
> Disk 3  (parity):   |XX|            |               |

With the latest debug patch, it turns out that, this patch is only 
working for the initial several writes into a full stripe.

Since we didn't reset dbitmap after the RMW finished, it will slowly 
degrade into the old behavior.

Will update the patchset to address this soon.

Thanks,
Qu
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>   fs/btrfs/raid56.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
>   1 file changed, 43 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
> index bc1e510b0c12..88640f7e1622 100644
> --- a/fs/btrfs/raid56.c
> +++ b/fs/btrfs/raid56.c
> @@ -392,6 +392,9 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
>   {
>   	bio_list_merge(&dest->bio_list, &victim->bio_list);
>   	dest->bio_list_bytes += victim->bio_list_bytes;
> +	/* Also inherit the bitmaps from @victim. */
> +	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
> +		  dest->stripe_nsectors);
>   	dest->generic_bio_cnt += victim->generic_bio_cnt;
>   	bio_list_init(&victim->bio_list);
>   }
> @@ -1284,6 +1287,9 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
>   	else
>   		BUG();
>   
> +	/* We should have at least one data sector. */
> +	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
> +
>   	/* at this point we either have a full stripe,
>   	 * or we've read the full stripe from the drive.
>   	 * recalculate the parity and write the new results.
> @@ -1358,6 +1364,10 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
>   		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
>   			struct sector_ptr *sector;
>   
> +			/* This vertical stripe has no data, skip it. */
> +			if (!test_bit(sectornr, &rbio->dbitmap))
> +				continue;
> +
>   			if (stripe < rbio->nr_data) {
>   				sector = sector_in_rbio(rbio, stripe, sectornr, 1);
>   				if (!sector)
> @@ -1384,6 +1394,10 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
>   		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
>   			struct sector_ptr *sector;
>   
> +			/* This vertical stripe has no data, skip it. */
> +			if (!test_bit(sectornr, &rbio->dbitmap))
> +				continue;
> +
>   			if (stripe < rbio->nr_data) {
>   				sector = sector_in_rbio(rbio, stripe, sectornr, 1);
>   				if (!sector)
> @@ -1835,6 +1849,33 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
>   	run_plug(plug);
>   }
>   
> +/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
> +static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
> +{
> +	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
> +	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
> +	const u64 full_stripe_start = rbio->bioc->raid_map[0];
> +	const u32 orig_len = orig_bio->bi_iter.bi_size;
> +	const u32 sectorsize = fs_info->sectorsize;
> +	u64 cur_logical;
> +
> +	ASSERT(orig_logical >= full_stripe_start &&
> +	       orig_logical + orig_len <= full_stripe_start +
> +	       rbio->nr_data * rbio->stripe_len);
> +
> +	bio_list_add(&rbio->bio_list, orig_bio);
> +	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
> +
> +	/* Update the dbitmap. */
> +	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
> +	     cur_logical += sectorsize) {
> +		int bit = ((u32)(cur_logical - full_stripe_start) >>
> +			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;
> +
> +		set_bit(bit, &rbio->dbitmap);
> +	}
> +}
> +
>   /*
>    * our main entry point for writes from the rest of the FS.
>    */
> @@ -1851,9 +1892,8 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stri
>   		btrfs_put_bioc(bioc);
>   		return PTR_ERR(rbio);
>   	}
> -	bio_list_add(&rbio->bio_list, bio);
> -	rbio->bio_list_bytes = bio->bi_iter.bi_size;
>   	rbio->operation = BTRFS_RBIO_WRITE;
> +	rbio_add_bio(rbio, bio);
>   
>   	btrfs_bio_counter_inc_noblocked(fs_info);
>   	rbio->generic_bio_cnt = 1;
> @@ -2258,8 +2298,7 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
>   	}
>   
>   	rbio->operation = BTRFS_RBIO_READ_REBUILD;
> -	bio_list_add(&rbio->bio_list, bio);
> -	rbio->bio_list_bytes = bio->bi_iter.bi_size;
> +	rbio_add_bio(rbio, bio);
>   
>   	rbio->faila = find_logical_bio_stripe(rbio, bio);
>   	if (rbio->faila == -1) {
diff mbox series

Patch

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index bc1e510b0c12..88640f7e1622 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -392,6 +392,9 @@  static void merge_rbio(struct btrfs_raid_bio *dest,
 {
 	bio_list_merge(&dest->bio_list, &victim->bio_list);
 	dest->bio_list_bytes += victim->bio_list_bytes;
+	/* Also inherit the bitmaps from @victim. */
+	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
+		  dest->stripe_nsectors);
 	dest->generic_bio_cnt += victim->generic_bio_cnt;
 	bio_list_init(&victim->bio_list);
 }
@@ -1284,6 +1287,9 @@  static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 	else
 		BUG();
 
+	/* We should have at least one data sector. */
+	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
+
 	/* at this point we either have a full stripe,
 	 * or we've read the full stripe from the drive.
 	 * recalculate the parity and write the new results.
@@ -1358,6 +1364,10 @@  static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
 			struct sector_ptr *sector;
 
+			/* This vertical stripe has no data, skip it. */
+			if (!test_bit(sectornr, &rbio->dbitmap))
+				continue;
+
 			if (stripe < rbio->nr_data) {
 				sector = sector_in_rbio(rbio, stripe, sectornr, 1);
 				if (!sector)
@@ -1384,6 +1394,10 @@  static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
 			struct sector_ptr *sector;
 
+			/* This vertical stripe has no data, skip it. */
+			if (!test_bit(sectornr, &rbio->dbitmap))
+				continue;
+
 			if (stripe < rbio->nr_data) {
 				sector = sector_in_rbio(rbio, stripe, sectornr, 1);
 				if (!sector)
@@ -1835,6 +1849,33 @@  static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	run_plug(plug);
 }
 
+/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
+static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
+{
+	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
+	const u64 full_stripe_start = rbio->bioc->raid_map[0];
+	const u32 orig_len = orig_bio->bi_iter.bi_size;
+	const u32 sectorsize = fs_info->sectorsize;
+	u64 cur_logical;
+
+	ASSERT(orig_logical >= full_stripe_start &&
+	       orig_logical + orig_len <= full_stripe_start +
+	       rbio->nr_data * rbio->stripe_len);
+
+	bio_list_add(&rbio->bio_list, orig_bio);
+	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
+
+	/* Update the dbitmap. */
+	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
+	     cur_logical += sectorsize) {
+		int bit = ((u32)(cur_logical - full_stripe_start) >>
+			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;
+
+		set_bit(bit, &rbio->dbitmap);
+	}
+}
+
 /*
  * our main entry point for writes from the rest of the FS.
  */
@@ -1851,9 +1892,8 @@  int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stri
 		btrfs_put_bioc(bioc);
 		return PTR_ERR(rbio);
 	}
-	bio_list_add(&rbio->bio_list, bio);
-	rbio->bio_list_bytes = bio->bi_iter.bi_size;
 	rbio->operation = BTRFS_RBIO_WRITE;
+	rbio_add_bio(rbio, bio);
 
 	btrfs_bio_counter_inc_noblocked(fs_info);
 	rbio->generic_bio_cnt = 1;
@@ -2258,8 +2298,7 @@  int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
 	}
 
 	rbio->operation = BTRFS_RBIO_READ_REBUILD;
-	bio_list_add(&rbio->bio_list, bio);
-	rbio->bio_list_bytes = bio->bi_iter.bi_size;
+	rbio_add_bio(rbio, bio);
 
 	rbio->faila = find_logical_bio_stripe(rbio, bio);
 	if (rbio->faila == -1) {