diff mbox series

[v2,4/7] btrfs: introduce new read-repair infrastructure

Message ID b014412ee0713e01f52269e553c0cff3487ca495.1653476251.git.wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series btrfs: read-repair rework based on bitmap | expand

Commit Message

Qu Wenruo May 25, 2022, 10:59 a.m. UTC
The new read repair infrastructure consists of the following 3 parts:

- btrfs_read_repair_ctrl
  Record a contiguous corrupted range.
  Will mostly be on-stack structure for the top-level endio function.

- btrfs_read_repair_add_sector()
  This function is called each time we hit a bad sector.

  This function itself will check if the bad sector can be merged with
  the existing bad range.

  If not, call btrfs_read_repair_finish() to finish the current range
  first, and then add the new sector into the now empty
  btrfs_read_repair_ctrl.

  Will return -EIO if any range failed to be repaired.

- btrfs_read_repair_finish()
  This function should be called before the endio function exits.

  This function will iterate through all the mirrors, trying to grab
  the correct data.

  If we grabbed a correct sector, we will queue it for later writeback
  into the bad mirror.

To hold the original bad sectors, we have two bios, one named
@bad_sectors. Although it's a bio, we only utilize the bio_vec
infrastructure to hold all the initial bad sectors.

It's the @io_bio we really utilize to submit new read and write.

For io_bio, the usage is pretty much the same as
btrfs_read_repair_add_sector().

If we can merge the target sector, then that's the best case.
If not, then we submit the current @io_bio, wait for it, and allocate a
new bio for the next usage.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/Makefile      |   2 +-
 fs/btrfs/extent_io.c   |   2 +-
 fs/btrfs/extent_io.h   |   1 +
 fs/btrfs/read-repair.c | 328 +++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/read-repair.h |  48 ++++++
 5 files changed, 379 insertions(+), 2 deletions(-)
 create mode 100644 fs/btrfs/read-repair.c
 create mode 100644 fs/btrfs/read-repair.h

Comments

Qu Wenruo May 26, 2022, 3:06 a.m. UTC | #1
On 2022/5/25 18:59, Qu Wenruo wrote:
> The new read repair infrastructure is consist of the following 3 parts:
> 
[...]
> +static void io_add_or_submit(struct btrfs_read_repair_ctrl *ctrl, int mirror,
> +			   u64 logical, struct page *page, unsigned int pgoff,
> +			   int opf)
> +{
> +	struct btrfs_fs_info *fs_info = btrfs_sb(ctrl->inode->i_sb);
> +	struct bio *io_bio = ctrl->io_bio;
> +
> +	/* Uninitialized. */
> +	if (io_bio->bi_iter.bi_sector == 0) {
> +		ASSERT(io_bio->bi_iter.bi_size == 0);
> +		io_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
> +		io_bio->bi_opf = opf;
> +		bio_add_page(io_bio, page, fs_info->sectorsize, pgoff);
> +		return;
> +	}
> +
> +	/* Continuous, add the page */
> +	if ((io_bio->bi_iter.bi_sector << SECTOR_SHIFT) +
> +	     io_bio->bi_iter.bi_size == logical) {
> +		bio_add_page(io_bio, page, fs_info->sectorsize, pgoff);
> +		return;
> +	}
> +
> +	/* Not continuous, submit first. */

Hi Christoph, I'm pretty sure the non-continuous bio problem is here for 
all of our attempts to rework read-repair.

I'm wondering if there is some "dummy" page provided from block layer 
that we can utilize?

E.g. We have the following checker pattern:

mirror 1	|X|X|X|X|
mirror 2	|X| |X| |
mirror 3	| |X| |X|

After reading all the 4 sectors from mirror 2, we know the 2nd and 4th 
are good and should not need to be re-read.

Then reading mirror 3 needs us to submit two bios.

But if we have some "dummy" pages, and added into the bio for sector 2 
and 4, we only need one bio submission.

Is there such convenient page for us to utilize? Or we have to assign it 
globally?

Thanks,
Qu

> +	io_bio_submit(ctrl, mirror, opf);
> +	io_bio = ctrl->io_bio;
> +	io_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
> +	bio_add_page(io_bio, page, fs_info->sectorsize, pgoff);
> +}
> +
> +static void writeback_good_mirror(struct btrfs_read_repair_ctrl *ctrl,
> +				  int mirror, u64 logical,
> +				  struct page *page, unsigned int pgoff)
> +{
> +	struct btrfs_fs_info *fs_info = btrfs_sb(ctrl->inode->i_sb);
> +	struct bio *io_bio = ctrl->io_bio;
> +
> +
> +	if (btrfs_repair_one_zone(fs_info, ctrl->logical))
> +		return;
> +
> +	/*
> +	 * For RAID56, we can not just write the bad data back, as
> +	 * any write will trigger RMW and read back the corrrupted
> +	 * on-disk stripe, causing further damage.
> +	 * So here we do special repair for raid56.
> +	 *
> +	 * And unfortunately, this repair is very low level and not
> +	 * compatible with the rest of the mirror based repair.
> +	 * So it's still done in synchronous mode using
> +	 * btrfs_repair_io_failure().
> +	 */
> +	if (ctrl->is_raid56) {
> +		const u64 file_offset = logical - ctrl->logical +
> +					ctrl->file_offset;
> +		btrfs_repair_io_failure(fs_info,
> +				btrfs_ino(BTRFS_I(ctrl->inode)), file_offset,
> +				fs_info->sectorsize, logical, page, pgoff,
> +				mirror);
> +		return;
> +	}
> +
> +	ASSERT(io_bio);
> +	io_add_or_submit(ctrl, mirror, logical, page, pgoff, REQ_OP_WRITE);
> +}
> +
> +static void repair_from_mirror(struct btrfs_read_repair_ctrl *ctrl, int mirror)
> +{
> +	struct btrfs_fs_info *fs_info = btrfs_sb(ctrl->inode->i_sb);
> +	struct bvec_iter iter;
> +	struct bio_vec bv;
> +	unsigned long old_bitmap = ctrl->bad_bitmap;
> +	const int prev_mirror = get_prev_mirror(mirror, ctrl->num_copies);
> +	int nr_sector;
> +	u32 offset;
> +	int ret;
> +
> +	/*
> +	 * Reset the io_bio logial bytenr so later io_add_or_submit() can do
> +	 * correct check on the logical bytenr.
> +	 */
> +	ctrl->io_bio->bi_iter.bi_sector = 0;
> +
> +	/* Add all bad sectors into io_bio. */
> +	bio_for_each_sector(fs_info, bv, ctrl->bad_sectors, iter, offset) {
> +		u64 logical = ctrl->logical + offset;
> +
> +		nr_sector = offset >> fs_info->sectorsize_bits;
> +
> +		/* Good sectors, no need to handle. */
> +		if (!test_bit(nr_sector, &ctrl->bad_bitmap))
> +			continue;
> +
> +		io_add_or_submit(ctrl, mirror, logical, bv.bv_page,
> +				 bv.bv_offset, REQ_OP_READ | REQ_SYNC);
> +	}
> +	io_bio_submit(ctrl, mirror, REQ_OP_READ | REQ_SYNC);
> +
> +	/* Check the newly read data. */
> +	bio_for_each_sector(fs_info, bv, ctrl->bad_sectors, iter, offset) {
> +		u8 *csum_expected;
> +		u8 csum[BTRFS_CSUM_SIZE];
> +
> +		nr_sector = offset >> fs_info->sectorsize_bits;
> +
> +		/* Originally good sector or read failed, skip. */
> +		if (!test_bit(nr_sector, &old_bitmap) ||
> +		    test_bit(nr_sector, &ctrl->bad_bitmap))
> +			continue;
> +
> +		/* No data csum, only need to repair. */
> +		if (!ctrl->csum)
> +			goto repair;
> +
> +		/*
> +		 * The remaining case is successful read with csum, need
> +		 * recheck the csum.
> +		 */
> +		csum_expected = btrfs_csum_ptr(fs_info, ctrl->csum, offset);
> +		ret = btrfs_check_sector_csum(fs_info, bv.bv_page,
> +				bv.bv_offset, csum, csum_expected);
> +		if (ret) {
> +			set_bit(nr_sector, &ctrl->bad_bitmap);
> +			continue;
> +		}
> +repair:
> +		/*
> +		 * This sector is properly fixed, write it back to previous
> +		 * bad mirror.
> +		 */
> +		writeback_good_mirror(ctrl, prev_mirror, ctrl->logical + offset,
> +				bv.bv_page, bv.bv_offset);
> +	}
> +	/* Submit the last write bio. */
> +	io_bio_submit(ctrl, mirror, REQ_OP_WRITE);
> +}
> +
> +int btrfs_read_repair_finish(struct btrfs_read_repair_ctrl *ctrl)
> +{
> +	struct btrfs_fs_info *fs_info;
> +	struct bvec_iter iter;
> +	struct bio_vec bv;
> +	u32 offset;
> +	int nr_sectors;
> +	int mirror;
> +	int ret = -EIO;
> +
> +	if (!ctrl->inode)
> +		return 0;
> +
> +	fs_info = btrfs_sb(ctrl->inode->i_sb);
> +	nr_sectors = ctrl->len >> fs_info->sectorsize_bits;
> +	ASSERT(ctrl->len);
> +	/* All sectors should be bad initially. */
> +	ASSERT(find_first_zero_bit(&ctrl->bad_bitmap, nr_sectors) == nr_sectors);
> +
> +	for (mirror = get_next_mirror(ctrl->failed_mirror, ctrl->num_copies);
> +	     mirror != ctrl->failed_mirror;
> +	     mirror = get_next_mirror(mirror, ctrl->num_copies)) {
> +		repair_from_mirror(ctrl, mirror);
> +
> +		/* All repaired*/
> +		if (find_first_bit(&ctrl->bad_bitmap, nr_sectors) == nr_sectors) {
> +			ret = 0;
> +			break;
> +		}
> +	}
> +
> +	/* DIO doesn't need any page status/extent update.*/
> +	if (!ctrl->is_dio) {
> +		/* Unlock all the pages and unlock the extent range. */
> +		bio_for_each_sector(fs_info, bv, ctrl->bad_sectors, iter,
> +				    offset) {
> +			bool uptodate = !test_bit(offset >>
> +						  fs_info->sectorsize_bits,
> +						  &ctrl->bad_bitmap);
> +
> +			end_sector_io(bv.bv_page, ctrl->file_offset + offset,
> +				      uptodate);
> +		}
> +	}
> +	bio_put(ctrl->bad_sectors);
> +	if (ctrl->io_bio)
> +		bio_put(ctrl->io_bio);
> +	memset(ctrl, 0, sizeof(*ctrl));
> +	return ret;
> +}
> diff --git a/fs/btrfs/read-repair.h b/fs/btrfs/read-repair.h
> new file mode 100644
> index 000000000000..87219c786109
> --- /dev/null
> +++ b/fs/btrfs/read-repair.h
> @@ -0,0 +1,48 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef BTRFS_READ_REPAIR_H
> +#define BTRFS_READ_REPAIR_H
> +
> +#include <linux/blk_types.h>
> +#include <linux/fs.h>
> +
> +struct btrfs_read_repair_ctrl {
> +	struct inode *inode;
> +
> +	/* The logical bytenr of the firts corrupted sector. */
> +	u64 logical;
> +
> +	/* The file offset of the first corrupted sector. */
> +	u64 file_offset;
> +
> +	/* The checksum for the corrupted sectors. */
> +	u8 *csum;
> +
> +	/* Current length of the corrupted range. */
> +	u32 len;
> +
> +	int failed_mirror;
> +	int num_copies;
> +	unsigned long bad_bitmap;
> +	bool is_raid56;
> +	bool is_dio;
> +
> +	/* This is only to hold all the initial bad continuous sectors. */
> +	struct bio *bad_sectors;
> +
> +	/*
> +	 * The bio we use to do the real IO.
> +	 * This bio has to be btrfs_bio, as btrfs_map_bio() will utilize
> +	 * btrfs_bio()->device.
> +	 */
> +	struct bio *io_bio;
> +};
> +
> +int btrfs_read_repair_add_sector(struct inode *inode,
> +				 struct btrfs_read_repair_ctrl *ctrl,
> +				 struct page *page, unsigned int pgoff,
> +				 u64 logical, u64 file_offset, u8 *csum,
> +				 int failed_mirror, bool is_dio);
> +int btrfs_read_repair_finish(struct btrfs_read_repair_ctrl *ctrl);
> +
> +#endif
Christoph Hellwig May 26, 2022, 7:30 a.m. UTC | #2
On Thu, May 26, 2022 at 11:06:31AM +0800, Qu Wenruo wrote:
> Hi Christoph, I'm pretty sure the non-continuous bio problem is here for 
> all of our attempts to rework read-repair.

Why is it a problem?  Multiple discontiguous errors in the same bio
are a very unusual error pattern.  We need to handle it obviously, but
it doesn't need to be optimized as it is so rare.  The most common error
pattern is that the entire read will return an error, followed by a single
corrupted sector.

> I'm wondering if there is some "dummy" page provided from block layer that 
> we can utilize?

For reads nvme (and a few SCSI HBAs) support a bit bucket SGL for reads
that discard parts of the data.  Right now upstream none of this is
supported, altough Keith has been looking into it (for a rather different
use case) in nvme.  This does not help with writes, never mind the fact
that I would not want to use exotic and barely tested code and hardware
features for a non time critical and rarely used error handling path..
Qu Wenruo May 26, 2022, 7:37 a.m. UTC | #3
On 2022/5/26 15:30, Christoph Hellwig wrote:
> On Thu, May 26, 2022 at 11:06:31AM +0800, Qu Wenruo wrote:
>> Hi Christoph, I'm pretty sure the non-continuous bio problem is here for
>> all of our attempts to rework read-repair.
>
> Why is it a problem?  Multiple discontiguous errors in the same bio
> are a very unusual error pattern.  We need to handle it obviously, but
> it doesn't need to be optimized as it is so rare.  The most common error
> pattern is that the entire read will return an error, followed by a single
> corrupted sector.

Rare case doesn't mean it won't happen.

We still need to address it anyway.

Furthermore, if we can submit one bio to read the whole mirror range,
without putting the corrupted data into our repaired data, it also means
we will have read at most (num_copies - 1) times, without resetting the
initial mirror.

>
>> I'm wondering if there is some "dummy" page provided from block layer that
>> we can utilize?
>
> For reads nvme (and a few SCSI HBAs) support a bit bucket SGL for reads
> that discard parts of the data.  Right now upstream none of this is
> supported, altough Keith has been looking into it (for a rather different
> use case) in nvme.  This does not help with writes, never mind the fact
> that I would not want to use exotic and barely tested code and hardware
> features for a non time critical and rarely used error handling path..

I'm not proposing the SGL method, but still do a full range read, the
only difference is, the page range we don't care will be written to some
dust bin page, and only the range we care will be put into the real pages.

E.g. we allocate a dedicated page per-fs (or even for the whole btrfs
module) as a dustbin page.

When we don't want to read some range, we just add that page into the
bio (this means we may put the same page into the bio several times, and
the page may be utilized by several different bios at the same time).
And submit the bio.

I'm not sure the current code base can handle the case though.


For write, it's pretty simple, we only writeback the whole correct range.
If we didn't recover the full corrupted range, we just don't do writeback.

Thanks,
Qu
Christoph Hellwig May 26, 2022, 7:45 a.m. UTC | #4
On Thu, May 26, 2022 at 03:37:47PM +0800, Qu Wenruo wrote:
> Rare case doesn't mean it won't happen.
>
> We still need to address it anyway.

address != build overly complicated code to optimize for it
Qu Wenruo May 26, 2022, 7:52 a.m. UTC | #5
On 2022/5/26 15:45, Christoph Hellwig wrote:
> On Thu, May 26, 2022 at 03:37:47PM +0800, Qu Wenruo wrote:
>> Rare case doesn't mean it won't happen.
>>
>> We still need to address it anyway.
> 
> address != build overly complicated code to optimize for it
> 
Well, using seemly simple code, but can lead to read way more loops and 
way more data to read, is neither a good way.
Christoph Hellwig May 26, 2022, 8 a.m. UTC | #6
On Thu, May 26, 2022 at 03:52:03PM +0800, Qu Wenruo wrote:
>
>
> On 2022/5/26 15:45, Christoph Hellwig wrote:
>> On Thu, May 26, 2022 at 03:37:47PM +0800, Qu Wenruo wrote:
>>> Rare case doesn't mean it won't happen.
>>>
>>> We still need to address it anyway.
>>
>> address != build overly complicated code to optimize for it
>>
> Well, using seemly simple code, but can lead to read way more loops and way 
> more data to read, is neither a good way.

Again, having checkered corruption is an extremely unlikely event.
I'd rather deal with it by doing more reads than code complexity.
Qu Wenruo May 26, 2022, 8:07 a.m. UTC | #7
On 2022/5/26 16:00, Christoph Hellwig wrote:
> On Thu, May 26, 2022 at 03:52:03PM +0800, Qu Wenruo wrote:
>>
>>
>> On 2022/5/26 15:45, Christoph Hellwig wrote:
>>> On Thu, May 26, 2022 at 03:37:47PM +0800, Qu Wenruo wrote:
>>>> Rare case doesn't mean it won't happen.
>>>>
>>>> We still need to address it anyway.
>>>
>>> address != build overly complicated code to optimize for it
>>>
>> Well, using seemly simple code, but can lead to read way more loops and way
>> more data to read, is neither a good way.
> 
> Again, having checkered corruption is an extremely unlikely event.
> I'd rather deal with it by doing more reads than code complexity.
> 

Then it can be said to almost all ENOSPC error handling code.
It's less than 1% chance, but we spend over 10% code for it.

And if you really want to go that path, I see no reason why we didn't go 
sector-by-sector repair.


Furthermore if "more reads" means over 10 times the amount we need, I 
strongly doubt if it's sane.

Just the same RAID1C3, mirror 1 all corrupted, mirror 2 and 3 checker 
pattern, fill it into a 4MiB range, and try run your version of code 
starting with mirror 1, and see how many loops we need to go, especially 
how many times we need to read mirror 1 unnecessarily.

Thanks,
Qu
Christoph Hellwig May 26, 2022, 8:17 a.m. UTC | #8
On Thu, May 26, 2022 at 04:07:49PM +0800, Qu Wenruo wrote:
> Then it can be said to almost all ENOSPC error handling code.

ENOSPC is a lot more common.

> It's less than 1% chance, but we spend over 10% code for it.
>
> And if you really want to go that path, I see no reason why we didn't go 
> sector-by-sector repair.

Because that really sucks for the case where the whole I/O fails.
Which is the common failure scenario.
Qu Wenruo May 26, 2022, 8:26 a.m. UTC | #9
On 2022/5/26 16:17, Christoph Hellwig wrote:
> On Thu, May 26, 2022 at 04:07:49PM +0800, Qu Wenruo wrote:
>> Then it can be said to almost all ENOSPC error handling code.
>
> ENOSPC is a lot more common.

Sorry, I mean ENOMEM.

>
>> It's less than 1% chance, but we spend over 10% code for it.
>>
>> And if you really want to go that path, I see no reason why we didn't go
>> sector-by-sector repair.
>
> Because that really sucks for the case where the whole I/O fails.
> Which is the common failure scenario.

But it's just a performance problem, which is not that critical.
Christoph Hellwig May 26, 2022, 8:28 a.m. UTC | #10
On Thu, May 26, 2022 at 04:26:30PM +0800, Qu Wenruo wrote:
>
>
> On 2022/5/26 16:17, Christoph Hellwig wrote:
>> On Thu, May 26, 2022 at 04:07:49PM +0800, Qu Wenruo wrote:
>>> Then it can be said to almost all ENOSPC error handling code.
>>
>> ENOSPC is a lot more common.
>
> Sorry, I mean ENOMEM.
>
>>
>>> It's less than 1% chance, but we spend over 10% code for it.
>>>
>>> And if you really want to go that path, I see no reason why we didn't go
>>> sector-by-sector repair.
>>
>> Because that really sucks for the case where the whole I/O fails.
>> Which is the common failure scenario.
>
> But it's just a performance problem, which is not that critical.

I'm officially lost now.
Qu Wenruo May 26, 2022, 8:49 a.m. UTC | #11
On 2022/5/26 16:28, Christoph Hellwig wrote:
> On Thu, May 26, 2022 at 04:26:30PM +0800, Qu Wenruo wrote:
>>
>>
>> On 2022/5/26 16:17, Christoph Hellwig wrote:
>>> On Thu, May 26, 2022 at 04:07:49PM +0800, Qu Wenruo wrote:
>>>> Then it can be said to almost all ENOSPC error handling code.
>>>
>>> ENOSPC is a lot more common.
>>
>> Sorry, I mean ENOMEM.
>>
>>>
>>>> It's less than 1% chance, but we spend over 10% code for it.
>>>>
>>>> And if you really want to go that path, I see no reason why we didn't go
>>>> sector-by-sector repair.
>>>
>>> Because that really sucks for the case where the whole I/O fails.
>>> Which is the common failure scenario.
>>
>> But it's just a performance problem, which is not that critical.
>
> I'm officially lost now.

Why? If you care so much about the code simplicity, sector-by-sector is
the best.
If you care so much about the performance, the latest bitmap is the
best, no matter if it's the worst checker patter or not.
Christoph Hellwig May 26, 2022, 8:54 a.m. UTC | #12
On Thu, May 26, 2022 at 04:49:15PM +0800, Qu Wenruo wrote:
>>>> Because that really sucks for the case where the whole I/O fails.
>>>> Which is the common failure scenario.
>>>
>>> But it's just a performance problem, which is not that critical.
>>
>> I'm officially lost now.
>
> Why? If you care so much about the code simplicity, sector-by-sector is
> the best.
> If you care so much about the performance, the latest bitmap is the
> best, no matter if it's the worst checker patter or not.

Because you tell me that handling the most common and important case
in read repair is just a performance issue, which you keep arguing for
micro-optimizing a corner case.  And not, for the case of failing a
large bio (which arguably can only happen for buffered I/O at the
moment, but that is another thing to look into) the bitmaps will only
help you for up to 64 sectors.  Way better than just doing a single
sector synchronous I/O but not exactly nice while still being a fair
amount of code compared to just doing variable sized synchronous
I/O.
Qu Wenruo May 26, 2022, 9:13 a.m. UTC | #13
On 2022/5/26 16:54, Christoph Hellwig wrote:
> On Thu, May 26, 2022 at 04:49:15PM +0800, Qu Wenruo wrote:
>>>>> Because that really sucks for the case where the whole I/O fails.
>>>>> Which is the common failure scenario.
>>>>
>>>> But it's just a performance problem, which is not that critical.
>>>
>>> I'm officially lost now.
>>
>> Why? If you care so much about the code simplicity, sector-by-sector is
>> the best.
>> If you care so much about the performance, the latest bitmap is the
>> best, no matter if it's the worst checker patter or not.
>
> Because you tell me that handling the most common and important case
> in read repair is just a performance issue, which you keep arguing for
> micro-optimizing a corner case.

Because I'm fine either way, but not fine with the middle ground.

I proposed both versions, to fulfill the different requirements.
The tradeoff is not avoidable, we have to choose the poison.

If we want to go code simplicity, then my argument to support
sector-by-sector is, the repair is already a cold path (the same
argument you go on the checker pattern), thus overall the performance
drop is not that critical.

If we want to go the best perf (even corruption is already a corner
case), then there is the bitmap version, handling all cases, at the cost
of complex code.

If you can find a simpler version, and still handle the checker pattern
sanely (aka, without reading the same bad mirror again and again), sure,
I'm always fine to go that version.

Otherwise sector-by-sector or bitmap seems a more sane choice to do between.

Thanks,
Qu

>  And not, for the case of failing a
> large bio (which arguably can only happen for buffered I/O at the
> moment, but that is another thing to look into) the bitmaps will only
> help you for up to 64 sectors.  Way better than just doing a single
> sector synchronous I/O but not exactly nice while still being a fair
> amount of code compared to just doing variable sized synchronous
> I/O.
Christoph Hellwig May 27, 2022, 8:10 a.m. UTC | #14
On Thu, May 26, 2022 at 05:13:30PM +0800, Qu Wenruo wrote:
>> Because you tell me that handling the most common and important case
>> in read repair is just a performance issue, which you keep arguing for
>> micro-optimizing a corner case.
>
> Because I'm fine either way, but not fine with the middle ground.

Where the middle ground is optimizig for the common case while handling
the non-common cases in a non-optimized way?
diff mbox series

Patch

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 99f9995670ea..0b2605c750ca 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -31,7 +31,7 @@  btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
 	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
 	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
-	   subpage.o tree-mod-log.o
+	   subpage.o tree-mod-log.o read-repair.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1083d6cfa858..160dedb078fd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2735,7 +2735,7 @@  static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
 		btrfs_subpage_end_reader(fs_info, page, start, len);
 }
 
-static void end_sector_io(struct page *page, u64 offset, bool uptodate)
+void end_sector_io(struct page *page, u64 offset, bool uptodate)
 {
 	struct inode *inode = page->mapping->host;
 	u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6cdcea1551a6..e3f9db50983d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -250,6 +250,7 @@  struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
 struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio);
 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
 
+void end_sector_io(struct page *page, u64 offset, bool uptodate);
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
 
diff --git a/fs/btrfs/read-repair.c b/fs/btrfs/read-repair.c
new file mode 100644
index 000000000000..26f59439fc5c
--- /dev/null
+++ b/fs/btrfs/read-repair.c
@@ -0,0 +1,328 @@ 
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bio.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "read-repair.h"
+#include "btrfs_inode.h"
+
+/*
+ * Return the mirror number following @mirror.  A result that would exceed
+ * @num_copies is reduced by @num_copies, so the numbers wrap around.
+ */
+static int get_next_mirror(int mirror, int num_copies)
+{
+	const int next = mirror + 1;
+
+	return (next > num_copies) ? next - num_copies : next;
+}
+
+/*
+ * Return the mirror number preceding @mirror.  Stepping back from mirror 1
+ * wraps around to @num_copies.
+ */
+static int get_prev_mirror(int mirror, int num_copies)
+{
+	const int prev = mirror - 1;
+
+	return prev ? prev : num_copies;
+}
+/*
+ * Prepare an unused @ctrl for a new bad range starting at @logical.
+ *
+ * The ctrl is expected to be unused (see the ASSERT()s below).  Besides
+ * recording the arguments, this looks up the number of copies and the parity
+ * status for @logical and allocates both bios the ctrl needs.
+ */
+static void init_new_ctrl(struct inode *inode,
+			  struct btrfs_read_repair_ctrl *ctrl,
+			  u64 logical, u64 file_offset, u8 *csum,
+			  int failed_mirror, bool is_dio)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	const u32 sectorsize = fs_info->sectorsize;
+	struct bio *holder;
+
+	/* Catch a ctrl that still carries an unfinished range. */
+	ASSERT(!ctrl->inode);
+	ASSERT(!ctrl->logical);
+	ASSERT(!ctrl->bad_sectors);
+	ASSERT(!ctrl->io_bio);
+	ASSERT(!ctrl->len);
+
+	/* Plain bookkeeping copied from the caller. */
+	ctrl->inode = inode;
+	ctrl->logical = logical;
+	ctrl->file_offset = file_offset;
+	ctrl->csum = csum;
+	ctrl->failed_mirror = failed_mirror;
+	ctrl->is_dio = is_dio;
+
+	/* Redundancy information looked up for @logical. */
+	ctrl->num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
+	ctrl->is_raid56 = btrfs_is_parity_mirror(fs_info, logical, sectorsize);
+
+	/* This bio only holds the bad sectors in its bio_vecs. */
+	holder = bio_alloc(NULL, BITS_PER_LONG, REQ_OP_READ, GFP_NOFS);
+	holder->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
+	ctrl->bad_sectors = holder;
+
+	/* And this one does the real repair IO. */
+	ctrl->io_bio = btrfs_bio_alloc(BITS_PER_LONG);
+}
+
+/*
+ * Record one bad sector (@page at @pgoff) located at @logical into @ctrl.
+ *
+ * A sector that does not directly follow the range already recorded, or that
+ * failed on a different mirror, first finishes the recorded range and then
+ * starts a new one.  A range that reaches BITS_PER_LONG sectors is finished
+ * right away.
+ *
+ * Return 0, or the result of btrfs_read_repair_finish() when a range got
+ * finished.  Return -EIO if there is only one copy, in which case the sector
+ * is ended as not uptodate for the non-DIO case.
+ */
+int btrfs_read_repair_add_sector(struct inode *inode,
+				 struct btrfs_read_repair_ctrl *ctrl,
+				 struct page *page, unsigned int pgoff,
+				 u64 logical, u64 file_offset, u8 *csum,
+				 int failed_mirror, bool is_dio)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	const u32 sectorsize = fs_info->sectorsize;
+	const u32 size_limit = BITS_PER_LONG << fs_info->sectorsize_bits;
+	int ret = 0;
+
+	/* Mirror numbers start at 1. */
+	ASSERT(failed_mirror);
+
+	/* An unused ctrl needs to be set up before anything can be added. */
+	if (!ctrl->inode) {
+		/* Without a second copy there is nothing to repair from. */
+		if (btrfs_num_copies(fs_info, logical, sectorsize) <= 1) {
+			if (!is_dio)
+				end_sector_io(page, file_offset, false);
+			return -EIO;
+		}
+
+		init_new_ctrl(inode, ctrl, logical, file_offset, csum,
+			      failed_mirror, is_dio);
+	}
+
+	/* Can not be merged into the existing range, flush that one first. */
+	if (ctrl->failed_mirror != failed_mirror ||
+	    ctrl->logical + ctrl->len != logical) {
+		ret = btrfs_read_repair_finish(ctrl);
+		init_new_ctrl(inode, ctrl, logical, file_offset, csum,
+			      failed_mirror, is_dio);
+	}
+
+	/* From here on the sector extends the current range. */
+	ASSERT(ctrl->bad_sectors);
+	ASSERT(ctrl->len < size_limit);
+	bio_add_page(ctrl->bad_sectors, page, sectorsize, pgoff);
+
+	ctrl->len += sectorsize;
+	set_bit((logical - ctrl->logical) >> fs_info->sectorsize_bits,
+		&ctrl->bad_bitmap);
+
+	/* Still room left in the bitmap, wait for more sectors. */
+	if (ctrl->len < size_limit)
+		return ret;
+
+	return btrfs_read_repair_finish(ctrl);
+}
+
+/*
+ * Iterate through a bio on a per-sector basis.
+ *
+ * @fs_info:    provides the sector size used as the iteration step
+ * @bvl:        struct bio_vec updated at the start of every iteration
+ * @bio:        the bio to walk
+ * @iter:       struct bvec_iter cursor, seeded from (bio)->bi_iter
+ * @bio_offset: byte offset from the start of the bio, 0 on the first pass
+ *
+ * All arguments are fully parenthesized in the expansion, but they are
+ * evaluated more than once and thus must be free of side effects.
+ */
+#define bio_for_each_sector(fs_info, bvl, bio, iter, bio_offset)	\
+	for ((iter) = (bio)->bi_iter, (bio_offset) = 0;			\
+	     (iter).bi_size &&						\
+	     (((bvl) = bio_iter_iovec((bio), (iter))), 1);		\
+	     (bio_offset) += (fs_info)->sectorsize,			\
+	     bio_advance_iter_single((bio), &(iter),			\
+				     (fs_info)->sectorsize))
+
+/*
+ * Submit ctrl->io_bio to @mirror as an @opf request and wait for it.
+ *
+ * An io_bio without any data is kept for later use, only its bi_sector is
+ * reset.  Otherwise, once the IO returned, a fresh io_bio is allocated for
+ * the next user.  A read that reported no error clears the bits of the
+ * sectors it covered from ctrl->bad_bitmap.
+ */
+static void io_bio_submit(struct btrfs_read_repair_ctrl *ctrl, int mirror,
+			  int opf)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(ctrl->inode->i_sb);
+	struct bio *bio = ctrl->io_bio;
+	const bool is_read = (opf & REQ_OP_MASK) == REQ_OP_READ;
+	u64 start;
+	u32 bytes;
+	int ret;
+
+	ASSERT(bio);
+
+	/* Sample the range up front, before the bio is handed off. */
+	start = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+	bytes = bio->bi_iter.bi_size;
+
+	/* Nothing queued: keep the bio around for later usage. */
+	if (!bytes) {
+		bio->bi_iter.bi_sector = 0;
+		return;
+	}
+
+	bio->bi_opf = opf;
+	ret = btrfs_map_bio_wait(fs_info, bio, mirror);
+	if (is_read && !ret) {
+		/* The read returned no error, drop its sectors from the bitmap. */
+		bitmap_clear(&ctrl->bad_bitmap,
+			     (start - ctrl->logical) >> fs_info->sectorsize_bits,
+			     bytes >> fs_info->sectorsize_bits);
+	}
+
+	/* Hand out a new bio, with bi_sector 0 marking it uninitialized. */
+	ctrl->io_bio = btrfs_bio_alloc(BITS_PER_LONG);
+	ctrl->io_bio->bi_iter.bi_sector = 0;
+}
+
+/*
+ * Queue one sector (@page at @pgoff) located at @logical into ctrl->io_bio.
+ *
+ * The sector is merged into the pending io_bio when that bio is still
+ * uninitialized (bi_sector == 0) or ends exactly at @logical.  Otherwise the
+ * pending bio is submitted to @mirror with @opf first, and the sector becomes
+ * the first one of the replacement bio.
+ */
+static void io_add_or_submit(struct btrfs_read_repair_ctrl *ctrl, int mirror,
+			   u64 logical, struct page *page, unsigned int pgoff,
+			   int opf)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(ctrl->inode->i_sb);
+	struct bio *bio = ctrl->io_bio;
+
+	if (bio->bi_iter.bi_sector == 0) {
+		/* Fresh bio, this sector defines where it starts. */
+		ASSERT(bio->bi_iter.bi_size == 0);
+		bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
+		bio->bi_opf = opf;
+	} else if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) +
+		   bio->bi_iter.bi_size != logical) {
+		/* A gap before @logical, flush what has been queued so far. */
+		io_bio_submit(ctrl, mirror, opf);
+		bio = ctrl->io_bio;
+		bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
+	}
+
+	/* In all cases the sector lands at the tail of the current bio. */
+	bio_add_page(bio, page, fs_info->sectorsize, pgoff);
+}
+
+/*
+ * Write one repaired sector (@page at @pgoff, located at @logical) back to
+ * @mirror.
+ *
+ * Nothing is queued when btrfs_repair_one_zone() returns true.  RAID56 is
+ * repaired synchronously through btrfs_repair_io_failure(), every other
+ * profile has the sector queued into ctrl->io_bio as a write.
+ */
+static void writeback_good_mirror(struct btrfs_read_repair_ctrl *ctrl,
+				  int mirror, u64 logical,
+				  struct page *page, unsigned int pgoff)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(ctrl->inode->i_sb);
+	u64 file_offset;
+
+	/* NOTE(review): presumably the zoned repair path -- confirm. */
+	if (btrfs_repair_one_zone(fs_info, ctrl->logical))
+		return;
+
+	/* The mirror based profiles just batch the write into io_bio. */
+	if (!ctrl->is_raid56) {
+		ASSERT(ctrl->io_bio);
+		io_add_or_submit(ctrl, mirror, logical, page, pgoff,
+				 REQ_OP_WRITE);
+		return;
+	}
+
+	/*
+	 * RAID56 can not take a plain write of the good data: any write
+	 * triggers RMW, which reads back the corrupted on-disk stripe and
+	 * causes further damage.
+	 *
+	 * The special raid56 repair is very low level and does not fit the
+	 * mirror based repair, so it stays synchronous and goes through
+	 * btrfs_repair_io_failure().
+	 */
+	file_offset = logical - ctrl->logical + ctrl->file_offset;
+	btrfs_repair_io_failure(fs_info, btrfs_ino(BTRFS_I(ctrl->inode)),
+				file_offset, fs_info->sectorsize, logical,
+				page, pgoff, mirror);
+}
+
+/*
+ * Try to repair the still-bad sectors of @ctrl using the copy on @mirror.
+ *
+ * Every sector still set in ctrl->bad_bitmap is re-read from @mirror.  A
+ * sector whose read returned no error, and whose checksum matches when
+ * ctrl->csum is present, stays cleared in the bitmap and is written back to
+ * the mirror preceding @mirror (the one tried right before it).  A sector
+ * failing the checksum is marked bad again so another mirror can be tried.
+ */
+static void repair_from_mirror(struct btrfs_read_repair_ctrl *ctrl, int mirror)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(ctrl->inode->i_sb);
+	struct bvec_iter iter;
+	struct bio_vec bv;
+	/* Snapshot of what was bad before this mirror was tried. */
+	unsigned long old_bitmap = ctrl->bad_bitmap;
+	const int prev_mirror = get_prev_mirror(mirror, ctrl->num_copies);
+	int nr_sector;
+	u32 offset;
+	int ret;
+
+	/*
+	 * Reset the io_bio logical bytenr so later io_add_or_submit() can do
+	 * correct check on the logical bytenr.
+	 */
+	ctrl->io_bio->bi_iter.bi_sector = 0;
+
+	/* Add all bad sectors into io_bio. */
+	bio_for_each_sector(fs_info, bv, ctrl->bad_sectors, iter, offset) {
+		u64 logical = ctrl->logical + offset;
+
+		nr_sector = offset >> fs_info->sectorsize_bits;
+
+		/* Good sectors, no need to handle. */
+		if (!test_bit(nr_sector, &ctrl->bad_bitmap))
+			continue;
+
+		io_add_or_submit(ctrl, mirror, logical, bv.bv_page,
+				 bv.bv_offset, REQ_OP_READ | REQ_SYNC);
+	}
+	/* Submit the last read bio; successful reads clear their bad bits. */
+	io_bio_submit(ctrl, mirror, REQ_OP_READ | REQ_SYNC);
+
+	/* Check the newly read data. */
+	bio_for_each_sector(fs_info, bv, ctrl->bad_sectors, iter, offset) {
+		u8 *csum_expected;
+		u8 csum[BTRFS_CSUM_SIZE];
+
+		nr_sector = offset >> fs_info->sectorsize_bits;
+
+		/* Originally good sector or read failed, skip. */
+		if (!test_bit(nr_sector, &old_bitmap) ||
+		    test_bit(nr_sector, &ctrl->bad_bitmap))
+			continue;
+
+		/* No data csum, only need to repair. */
+		if (!ctrl->csum)
+			goto repair;
+
+		/*
+		 * The remaining case is successful read with csum, need
+		 * recheck the csum.
+		 */
+		csum_expected = btrfs_csum_ptr(fs_info, ctrl->csum, offset);
+		ret = btrfs_check_sector_csum(fs_info, bv.bv_page,
+				bv.bv_offset, csum, csum_expected);
+		if (ret) {
+			/* Data from this mirror is bad too, keep trying. */
+			set_bit(nr_sector, &ctrl->bad_bitmap);
+			continue;
+		}
+repair:
+		/*
+		 * This sector is properly fixed, write it back to previous
+		 * bad mirror.
+		 */
+		writeback_good_mirror(ctrl, prev_mirror, ctrl->logical + offset,
+				bv.bv_page, bv.bv_offset);
+	}
+	/*
+	 * Submit the last write bio.  It has to go to @prev_mirror, the mirror
+	 * the sectors were queued for by writeback_good_mirror().  Submitting
+	 * to @mirror would rewrite the copy we just read from and leave the
+	 * bad mirror unrepaired.
+	 */
+	io_bio_submit(ctrl, prev_mirror, REQ_OP_WRITE);
+}
+
+/*
+ * Finish the range recorded in @ctrl: try the other mirrors one by one until
+ * no bad sector is left, report the per-sector result for the non-DIO case,
+ * then release the bios and reset @ctrl to its unused (zeroed) state.
+ *
+ * Return 0 if @ctrl is unused or every sector got repaired, -EIO otherwise.
+ */
+int btrfs_read_repair_finish(struct btrfs_read_repair_ctrl *ctrl)
+{
+	struct btrfs_fs_info *fs_info;
+	struct bvec_iter iter;
+	struct bio_vec bv;
+	u32 offset;
+	int total;
+	int mirror;
+	int ret = -EIO;
+
+	/* Nothing recorded, nothing to do. */
+	if (!ctrl->inode)
+		return 0;
+
+	fs_info = btrfs_sb(ctrl->inode->i_sb);
+	total = ctrl->len >> fs_info->sectorsize_bits;
+	ASSERT(ctrl->len);
+	/* Every recorded sector starts out as bad. */
+	ASSERT(find_first_zero_bit(&ctrl->bad_bitmap, total) == total);
+
+	/* Walk the mirrors until we are back at the one that failed. */
+	mirror = get_next_mirror(ctrl->failed_mirror, ctrl->num_copies);
+	while (mirror != ctrl->failed_mirror) {
+		repair_from_mirror(ctrl, mirror);
+
+		/* No bad bit left, the whole range is repaired. */
+		if (find_first_bit(&ctrl->bad_bitmap, total) == total) {
+			ret = 0;
+			break;
+		}
+		mirror = get_next_mirror(mirror, ctrl->num_copies);
+	}
+
+	/* Only the non-DIO case needs the page status/extent update. */
+	if (!ctrl->is_dio) {
+		bio_for_each_sector(fs_info, bv, ctrl->bad_sectors, iter,
+				    offset) {
+			const u32 nr = offset >> fs_info->sectorsize_bits;
+
+			/* A sector is uptodate once its bad bit is gone. */
+			end_sector_io(bv.bv_page, ctrl->file_offset + offset,
+				      !test_bit(nr, &ctrl->bad_bitmap));
+		}
+	}
+
+	/* Drop the bios and return the ctrl to its unused state. */
+	bio_put(ctrl->bad_sectors);
+	if (ctrl->io_bio)
+		bio_put(ctrl->io_bio);
+	memset(ctrl, 0, sizeof(*ctrl));
+	return ret;
+}
diff --git a/fs/btrfs/read-repair.h b/fs/btrfs/read-repair.h
new file mode 100644
index 000000000000..87219c786109
--- /dev/null
+++ b/fs/btrfs/read-repair.h
@@ -0,0 +1,48 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_READ_REPAIR_H
+#define BTRFS_READ_REPAIR_H
+
+#include <linux/blk_types.h>
+#include <linux/fs.h>
+
+struct btrfs_read_repair_ctrl {	/* State of one contiguous bad range. */
+	struct inode *inode;	/* NULL while the ctrl is unused. */
+
+	/* The logical bytenr of the first corrupted sector. */
+	u64 logical;
+
+	/* The file offset of the first corrupted sector. */
+	u64 file_offset;
+
+	/* The checksum for the corrupted sectors, NULL if there is no csum. */
+	u8 *csum;
+
+	/* Current length of the corrupted range, in bytes. */
+	u32 len;
+
+	int failed_mirror;	/* Mirror the bad sectors came from, never 0. */
+	int num_copies;		/* Result of btrfs_num_copies() for @logical. */
+	unsigned long bad_bitmap; /* Bit N set: sector N of the range is bad. */
+	bool is_raid56;		/* Result of btrfs_is_parity_mirror(). */
+	bool is_dio;		/* Direct IO: no end_sector_io() on finish. */
+
+	/* This is only to hold all the initial bad contiguous sectors. */
+	struct bio *bad_sectors;
+
+	/*
+	 * The bio we use to do the real IO.
+	 * This bio has to be btrfs_bio, as btrfs_map_bio() will utilize
+	 * btrfs_bio()->device.
+	 */
+	struct bio *io_bio;
+};
+
+int btrfs_read_repair_add_sector(struct inode *inode,
+				 struct btrfs_read_repair_ctrl *ctrl,
+				 struct page *page, unsigned int pgoff,
+				 u64 logical, u64 file_offset, u8 *csum,
+				 int failed_mirror, bool is_dio);
+int btrfs_read_repair_finish(struct btrfs_read_repair_ctrl *ctrl);
+
+#endif