@@ -5,6 +5,8 @@
#include "volumes.h"
#include "read-repair.h"
+static struct bio_set read_repair_bioset;
+
int btrfs_read_repair_alloc_bitmaps(struct btrfs_fs_info *fs_info,
struct btrfs_bio *bbio)
{
@@ -52,6 +54,8 @@ void btrfs_read_repair_add_sector(struct inode *inode,
ASSERT(ctrl->init_mirror);
ctrl->num_copies = btrfs_num_copies(fs_info, ctrl->logical,
sectorsize);
+ init_waitqueue_head(&ctrl->io_wait);
+ atomic_set(&ctrl->io_bytes, 0);
/*
* We use btrfs_bio::read_repair_bitmaps, which should be
* preallocated before submission.
@@ -74,6 +78,104 @@ void btrfs_read_repair_add_sector(struct inode *inode,
btrfs_bio(failed_bio)->read_repair_cur_bitmap);
}
+static struct btrfs_read_repair_bio *repair_bio(struct bio *bio)
+{
+ return container_of(bio, struct btrfs_read_repair_bio, bio);
+}
+
+static void read_repair_end_bio(struct bio *bio)
+{
+ struct btrfs_read_repair_ctrl *ctrl = bio->bi_private;
+ const struct btrfs_fs_info *fs_info = btrfs_sb(ctrl->inode->i_sb);
+ struct bvec_iter_all iter_all;
+ struct bio_vec *bvec;
+ const u64 logical = repair_bio(bio)->logical;
+ u32 offset = 0;
+ bool uptodate = (bio->bi_status == BLK_STS_OK);
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ unsigned long *bitmap =
+ btrfs_bio(ctrl->failed_bio)->read_repair_cur_bitmap;
+ /*
+ * If we have a successful read, clear the error bit.
+ * In read_repair_finish(), we will re-check the csum
+ * (if exists) later.
+ */
+ if (uptodate)
+ clear_bit((logical + offset - ctrl->logical) >>
+ fs_info->sectorsize_bits, bitmap);
+ offset += bvec->bv_len;
+ }
+ atomic_sub(offset, &ctrl->io_bytes);
+ wake_up(&ctrl->io_wait);
+ bio_put(bio);
+}
+
+/* Add a sector into the read repair bios list for later submission */
+static void read_repair_bio_add_sector(struct btrfs_read_repair_ctrl *ctrl,
+ struct page *page, unsigned int pgoff,
+ int sector_nr, int mirror)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(ctrl->inode->i_sb);
+ struct btrfs_read_repair_bio *rbio;
+ struct bio *bio;
+ int ret;
+
+ /* Check if the sector can be added to the last bio */
+ if (ctrl->cur_bio) {
+ bio = ctrl->cur_bio;
+ rbio = repair_bio(bio);
+
+ ASSERT(rbio->logical);
+ if (rbio->logical + bio->bi_iter.bi_size ==
+ ctrl->logical + (sector_nr << fs_info->sectorsize_bits))
+ goto add;
+
+ ASSERT(bio_op(bio) == REQ_OP_READ);
+ ASSERT(bio->bi_private == ctrl);
+ ASSERT(bio->bi_end_io == read_repair_end_bio);
+ ASSERT(repair_bio(bio)->logical >= ctrl->logical &&
+ repair_bio(bio)->logical + bio->bi_iter.bi_size <=
+ ctrl->logical + ctrl->bio_size);
+ /*
+ * The current bio is not adjacent to the current range,
+ * just submit it.
+ *
+ * Our endio is super atomic, and we don't want to waste time on
+ * lookup data csum. So here we just call btrfs_map_bio()
+ * directly.
+ */
+ ret = btrfs_map_bio(fs_info, bio, mirror);
+ if (ret) {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ }
+ ctrl->cur_bio = NULL;
+ }
+ ASSERT(ctrl->cur_bio == NULL);
+ bio = bio_alloc_bioset(NULL, BIO_MAX_VECS, 0, GFP_NOFS,
+ &read_repair_bioset);
+ /* It's backed by mempool, thus should not fail */
+ ASSERT(bio);
+
+ rbio = repair_bio(bio);
+ rbio->logical = ctrl->logical + (sector_nr << fs_info->sectorsize_bits);
+ bio->bi_opf = REQ_OP_READ;
+ bio->bi_iter.bi_sector = rbio->logical >> SECTOR_SHIFT;
+ bio->bi_private = ctrl;
+ bio->bi_end_io = read_repair_end_bio;
+ ctrl->cur_bio = bio;
+
+add:
+ ret = bio_add_page(bio, page, fs_info->sectorsize, pgoff);
+ /*
+ * We allocated the read bio with enough bvecs to contain
+ * the original bio, thus it should not fail to add a sector.
+ */
+ ASSERT(ret == fs_info->sectorsize);
+ atomic_add(fs_info->sectorsize, &ctrl->io_bytes);
+}
+
void btrfs_read_repair_finish(struct btrfs_read_repair_ctrl *ctrl)
{
if (!ctrl->failed_bio)
@@ -86,3 +188,18 @@ void btrfs_read_repair_finish(struct btrfs_read_repair_ctrl *ctrl)
ctrl->error = false;
ctrl->failed_bio = NULL;
}
+
+void __cold btrfs_read_repair_exit(void)
+{
+ bioset_exit(&read_repair_bioset);
+}
+
+int __init btrfs_read_repair_init(void)
+{
+ int ret;
+
+ ret = bioset_init(&read_repair_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_read_repair_bio, bio),
+ BIOSET_NEED_BVECS);
+ return ret;
+}
@@ -14,6 +14,13 @@ struct btrfs_read_repair_ctrl {
/* The initial failed bio, we will grab page/pgoff from it */
struct bio *failed_bio;
+ /* Currently assembled bio for read/write */
+ struct bio *cur_bio;
+
+ wait_queue_head_t io_wait;
+
+ atomic_t io_bytes;
+
/* The logical bytenr of the original bio. */
u64 logical;
@@ -39,9 +46,35 @@ struct btrfs_read_repair_ctrl {
int btrfs_read_repair_alloc_bitmaps(struct btrfs_fs_info *fs_info,
struct btrfs_bio *bbio);
+
+/*
+ * Extra info for read repair bios.
+ *
+ * Read repair bios requires less info compared to btrfs_bio:
+ * - No need for csum
+ * - No need for mirror_num
+ * - No need for file_offset
+ * They can all be fetched from the btrfs_read_repair_ctrl stored in
+ * bi_private.
+ *
+ * - No need for iter
+ * We use logical bytenr from btrfs_read_repair_bio::logical.
+ * For bio iteration, our read repair bio will never cross stripe boundary,
+ * thus we can use regular bio_for_each_segment_all().
+ *
+ * - No need for device
+ * We just don't care at all.
+ */
+struct btrfs_read_repair_bio {
+ u64 logical;
+ struct bio bio;
+};
+
void btrfs_read_repair_add_sector(struct inode *inode,
struct btrfs_read_repair_ctrl *ctrl,
struct bio *failed_bio, u32 bio_offset,
u64 file_offset);
void btrfs_read_repair_finish(struct btrfs_read_repair_ctrl *ctrl);
+int __init btrfs_read_repair_init(void);
+void __cold btrfs_read_repair_exit(void);
#endif
@@ -48,6 +48,7 @@
#include "block-group.h"
#include "discard.h"
#include "qgroup.h"
+#include "read-repair.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -2642,10 +2643,12 @@ static int __init init_btrfs_fs(void)
err = extent_io_init();
if (err)
goto free_cachep;
-
- err = extent_state_cache_init();
+ err = btrfs_read_repair_init();
if (err)
goto free_extent_io;
+ err = extent_state_cache_init();
+ if (err)
+ goto free_read_repair;
err = extent_map_init();
if (err)
@@ -2709,6 +2712,8 @@ static int __init init_btrfs_fs(void)
extent_map_exit();
free_extent_state_cache:
extent_state_cache_exit();
+free_read_repair:
+ btrfs_read_repair_exit();
free_extent_io:
extent_io_exit();
free_cachep:
The new helper, read_repair_bio_add_sector(), will either: - Add the sector range into btrfs_read_repair_ctrl::cur_bio If we have the bio allocated and the sector range is adjacent to the bio. - Allocate a new btrfs_read_repair_bio and add the page sector range to it If we have btrfs_read_repair_ctrl::cur_bio, we will first submit it, then allocate a new one. This patch will also introduce a new type of bioset specifically for read repair, the reasons are: - A lot of extra members in btrfs_bio makes no sense for read repair - Possible mempool deadlock under heavy memory pressure. We may have exhausted the last btrfs_bio in the mempool, and we will fail to allocate a new btrfs_bio in our endio function. This problem is already there for a long time in the existing read repair code, in the endio function we're allocating new btrfs_bio for each corrupted sector, which is way worse. Thus in this patch we introduce a new bioset specifically for read repair usage. Signed-off-by: Qu Wenruo <wqu@suse.com> --- fs/btrfs/read-repair.c | 117 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/read-repair.h | 33 ++++++++++++ fs/btrfs/super.c | 9 +++- 3 files changed, 157 insertions(+), 2 deletions(-)