diff mbox series

[06/13] btrfs: add a helper to queue a corrupted sector for read repair

Message ID 2cfd9d2766824ddce727b06068de24d7a4be9637.1651559986.git.wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series btrfs: make read repair work in synchronous mode | expand

Commit Message

Qu Wenruo May 3, 2022, 6:49 a.m. UTC
The new helper, read_repair_bio_add_sector(), will either:

- Add the sector range into btrfs_read_repair_ctrl::cur_bio
  If we have the bio allocated and the sector range is adjacent to the
  bio.

- Allocate a new btrfs_read_repair_bio and add the page sector range to
  it
  If we have btrfs_read_repair_ctrl::cur_bio, we will first submit it,
  then allocate a new one.

This patch will also introduce a new type of bioset specifically for
read repair, the reasons are:

- A lot of extra members in btrfs_bio makes no sense for read repair

- Possible mempool deadlock under heavy memory pressure.
  We may have exhausted the last btrfs_bio in the mempool, and we will
  fail to allocate a new btrfs_bio in our endio function.

  This problem is already there for a long time in the existing read
  repair code, in the endio function we're allocating new btrfs_bio for
  each corrupted sector, which is way worse.

Thus in this patch we introduce a new bioset specifically for read repair
usage.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/read-repair.c | 117 +++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/read-repair.h |  33 ++++++++++++
 fs/btrfs/super.c       |   9 +++-
 3 files changed, 157 insertions(+), 2 deletions(-)

Comments

Christoph Hellwig May 3, 2022, 3:07 p.m. UTC | #1
This adds an ununused static function and thus doesn't even compile.
Qu Wenruo May 4, 2022, 1:13 a.m. UTC | #2
On 2022/5/3 23:07, Christoph Hellwig wrote:
> This adds an ununused static function and thus doesn't even compile.

It's just a warning and can pass the compile.

Or we have to have a patch with over 400 lines.

Thanks,
Qu
Christoph Hellwig May 4, 2022, 2:06 p.m. UTC | #3
On Wed, May 04, 2022 at 09:13:43AM +0800, Qu Wenruo wrote:
> 
> 
> On 2022/5/3 23:07, Christoph Hellwig wrote:
> > This adds an ununused static function and thus doesn't even compile.
> 
> It's just a warning and can pass the compile.
> 
> Or we have to have a patch with over 400 lines.

The latter is the only thing that makes sense.  Patches that are not
self contained also can't be reviewed self contained.  A larger patch
is much better than a random split.
David Sterba May 12, 2022, 5:20 p.m. UTC | #4
On Wed, May 04, 2022 at 07:06:20AM -0700, Christoph Hellwig wrote:
> On Wed, May 04, 2022 at 09:13:43AM +0800, Qu Wenruo wrote:
> > On 2022/5/3 23:07, Christoph Hellwig wrote:
> > > This adds an ununused static function and thus doesn't even compile.
> > 
> > It's just a warning and can pass the compile.
> > 
> > Or we have to have a patch with over 400 lines.
> 
> The latter is the only thing that makes sense.  Patches that are not
> self contained also can't be reviewed self contained.  A larger patch
> is much better than a random split.

We've been doing it the way where a complex function is in a separate
patch and its usage in another one. Yes there are different preferences
and opinions on that. It also depends how one does the review, either in
the mails or in the code. For me it's fine with the split as once I'm at
second patch the function exists in the file. In the mails it's "how
does it work" and "how it is used", so this can be seen as two different
things, thus in two patches. On exception a short function in the same
patch as it's use makes sense, for bigger ones I'm for patch split.
diff mbox series

Patch

diff --git a/fs/btrfs/read-repair.c b/fs/btrfs/read-repair.c
index 048bb7c16c56..f75842dcdd02 100644
--- a/fs/btrfs/read-repair.c
+++ b/fs/btrfs/read-repair.c
@@ -5,6 +5,8 @@ 
 #include "volumes.h"
 #include "read-repair.h"
 
+static struct bio_set read_repair_bioset;
+
 int btrfs_read_repair_alloc_bitmaps(struct btrfs_fs_info *fs_info,
 				    struct btrfs_bio *bbio)
 {
@@ -52,6 +54,8 @@  void btrfs_read_repair_add_sector(struct inode *inode,
 		ASSERT(ctrl->init_mirror);
 		ctrl->num_copies = btrfs_num_copies(fs_info, ctrl->logical,
 						    sectorsize);
+		init_waitqueue_head(&ctrl->io_wait);
+		atomic_set(&ctrl->io_bytes, 0);
 		/*
 		 * We use btrfs_bio::read_repair_bitmaps, which should be
 		 * preallocated before submission.
@@ -74,6 +78,104 @@  void btrfs_read_repair_add_sector(struct inode *inode,
 		btrfs_bio(failed_bio)->read_repair_cur_bitmap);
 }
 
+static struct btrfs_read_repair_bio *repair_bio(struct bio *bio)
+{
+	return container_of(bio, struct btrfs_read_repair_bio, bio);
+}
+
+static void read_repair_end_bio(struct bio *bio)
+{
+	struct btrfs_read_repair_ctrl *ctrl = bio->bi_private;
+	const struct btrfs_fs_info *fs_info = btrfs_sb(ctrl->inode->i_sb);
+	struct bvec_iter_all iter_all;
+	struct bio_vec *bvec;
+	const u64 logical = repair_bio(bio)->logical;
+	u32 offset = 0;
+	bool uptodate = (bio->bi_status == BLK_STS_OK);
+
+	bio_for_each_segment_all(bvec, bio, iter_all) {
+		unsigned long *bitmap =
+			btrfs_bio(ctrl->failed_bio)->read_repair_cur_bitmap;
+		/*
+		 * If we have a successful read, clear the error bit.
+		 * In read_repair_finish(), we will re-check the csum
+		 * (if exists) later.
+		 */
+		if (uptodate)
+			clear_bit((logical + offset - ctrl->logical) >>
+				fs_info->sectorsize_bits, bitmap);
+		offset += bvec->bv_len;
+	}
+	atomic_sub(offset, &ctrl->io_bytes);
+	wake_up(&ctrl->io_wait);
+	bio_put(bio);
+}
+
+/* Add a sector into the read repair bios list for later submission */
+static void read_repair_bio_add_sector(struct btrfs_read_repair_ctrl *ctrl,
+				       struct page *page, unsigned int pgoff,
+				       int sector_nr, int mirror)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(ctrl->inode->i_sb);
+	struct btrfs_read_repair_bio *rbio;
+	struct bio *bio;
+	int ret;
+
+	/* Check if the sector can be added to the last bio */
+	if (ctrl->cur_bio) {
+		bio = ctrl->cur_bio;
+		rbio = repair_bio(bio);
+
+		ASSERT(rbio->logical);
+		if (rbio->logical + bio->bi_iter.bi_size ==
+		    ctrl->logical + (sector_nr << fs_info->sectorsize_bits))
+			goto add;
+
+		ASSERT(bio_op(bio) == REQ_OP_READ);
+		ASSERT(bio->bi_private == ctrl);
+		ASSERT(bio->bi_end_io == read_repair_end_bio);
+		ASSERT(repair_bio(bio)->logical >= ctrl->logical &&
+		       repair_bio(bio)->logical + bio->bi_iter.bi_size <=
+		       ctrl->logical + ctrl->bio_size);
+		/*
+		 * The current bio is not adjacent to the current range,
+		 * just submit it.
+		 *
+		 * Our endio is super atomic, and we don't want to waste time on
+		 * lookup data csum. So here we just call btrfs_map_bio()
+		 * directly.
+		 */
+		ret = btrfs_map_bio(fs_info, bio, mirror);
+		if (ret) {
+			bio->bi_status = ret;
+			bio_endio(bio);
+		}
+		ctrl->cur_bio = NULL;
+	}
+	ASSERT(ctrl->cur_bio == NULL);
+	bio = bio_alloc_bioset(NULL, BIO_MAX_VECS, 0, GFP_NOFS,
+			       &read_repair_bioset);
+	/* It's backed by mempool, thus should not fail */
+	ASSERT(bio);
+
+	rbio = repair_bio(bio);
+	rbio->logical = ctrl->logical + (sector_nr << fs_info->sectorsize_bits);
+	bio->bi_opf = REQ_OP_READ;
+	bio->bi_iter.bi_sector = rbio->logical >> SECTOR_SHIFT;
+	bio->bi_private = ctrl;
+	bio->bi_end_io = read_repair_end_bio;
+	ctrl->cur_bio = bio;
+
+add:
+	ret = bio_add_page(bio, page, fs_info->sectorsize, pgoff);
+	/*
+	 * We allocated the read bio with enough bvecs to contain
+	 * the original bio, thus it should not fail to add a sector.
+	 */
+	ASSERT(ret == fs_info->sectorsize);
+	atomic_add(fs_info->sectorsize, &ctrl->io_bytes);
+}
+
 void btrfs_read_repair_finish(struct btrfs_read_repair_ctrl *ctrl)
 {
 	if (!ctrl->failed_bio)
@@ -86,3 +188,18 @@  void btrfs_read_repair_finish(struct btrfs_read_repair_ctrl *ctrl)
 	ctrl->error = false;
 	ctrl->failed_bio = NULL;
 }
+
+void __cold btrfs_read_repair_exit(void)
+{
+	bioset_exit(&read_repair_bioset);
+}
+
+int __init btrfs_read_repair_init(void)
+{
+	int ret;
+
+	ret = bioset_init(&read_repair_bioset, BIO_POOL_SIZE,
+			  offsetof(struct btrfs_read_repair_bio, bio),
+			  BIOSET_NEED_BVECS);
+	return ret;
+}
diff --git a/fs/btrfs/read-repair.h b/fs/btrfs/read-repair.h
index 42a251f1300b..3e1430489f89 100644
--- a/fs/btrfs/read-repair.h
+++ b/fs/btrfs/read-repair.h
@@ -14,6 +14,13 @@  struct btrfs_read_repair_ctrl {
 	/* The initial failed bio, we will grab page/pgoff from it */
 	struct bio *failed_bio;
 
+	/* Currently assembled bio for read/write */
+	struct bio *cur_bio;
+
+	wait_queue_head_t io_wait;
+
+	atomic_t io_bytes;
+
 	/* The logical bytenr of the original bio. */
 	u64 logical;
 
@@ -39,9 +46,35 @@  struct btrfs_read_repair_ctrl {
 
 int btrfs_read_repair_alloc_bitmaps(struct btrfs_fs_info *fs_info,
 				    struct btrfs_bio *bbio);
+
+/*
+ * Extra info for read repair bios.
+ *
+ * Read repair bios requires less info compared to btrfs_bio:
+ * - No need for csum
+ * - No need for mirror_num
+ * - No need for file_offset
+ *   They can all be fetched from the btrfs_read_repair_ctrl stored in
+ *   bi_private.
+ *
+ * - No need for iter
+ *   We use logical bytenr from btrfs_read_repair_bio::logical.
+ *   For bio iteration, our read repair bio will never cross stripe boundary,
+ *   thus we can use regular bio_for_each_segment_all().
+ *
+ * - No need for device
+ *   We just don't care at all.
+ */
+struct btrfs_read_repair_bio {
+	u64 logical;
+	struct bio bio;
+};
+
 void btrfs_read_repair_add_sector(struct inode *inode,
 				  struct btrfs_read_repair_ctrl *ctrl,
 				  struct bio *failed_bio, u32 bio_offset,
 				  u64 file_offset);
 void btrfs_read_repair_finish(struct btrfs_read_repair_ctrl *ctrl);
+int __init btrfs_read_repair_init(void);
+void __cold btrfs_read_repair_exit(void);
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 206f44005c52..db476e675962 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -48,6 +48,7 @@ 
 #include "block-group.h"
 #include "discard.h"
 #include "qgroup.h"
+#include "read-repair.h"
 #define CREATE_TRACE_POINTS
 #include <trace/events/btrfs.h>
 
@@ -2642,10 +2643,12 @@  static int __init init_btrfs_fs(void)
 	err = extent_io_init();
 	if (err)
 		goto free_cachep;
-
-	err = extent_state_cache_init();
+	err = btrfs_read_repair_init();
 	if (err)
 		goto free_extent_io;
+	err = extent_state_cache_init();
+	if (err)
+		goto free_read_repair;
 
 	err = extent_map_init();
 	if (err)
@@ -2709,6 +2712,8 @@  static int __init init_btrfs_fs(void)
 	extent_map_exit();
 free_extent_state_cache:
 	extent_state_cache_exit();
+free_read_repair:
+	btrfs_read_repair_exit();
 free_extent_io:
 	extent_io_exit();
 free_cachep: