Message ID | 8531c41848973ac60ca4e23e2c7a2a47c4b94881.1705313879.git.wqu@suse.com
---|---
State | New, archived
Series | btrfs: scrub: avoid use-after-free when chunk end is not 64K aligned
On 15.01.24 11:19, Qu Wenruo wrote:
> This means btrfs_submit_bio() would split the bio, and trigger endio
> function for both of the two halves.
>
> However scrub_submit_initial_read() would only expect the endio function
> to be called once, not any more.
> This means the first endio function would already free the bbio::bio,
> leaving the bvec freed, thus the 2nd endio call would lead to
> use-after-free.

This is a problem I also encountered when doing the RST case for scrub.

>
> [FIX]
> - Make sure scrub_read_endio() only updates bits in its range
>   This is not only for the fix, but also for the existing code
>   of RST scrub.
>   Since RST scrub can only submit part of the stripe, we can no longer
>   assume the initial read bbio is always the full 64K.

Ouch, indeed that fix is needed for RST. Thanks!

> - Make sure scrub_submit_initial_read() only to read the chunk range
>   This is done by calculating the real number of sectors we need to
>   read, and add sector-by-sector to the bio.

Why can't you do it the same way the RST version does it by checking the
extent_sector_bitmap and then add sector-by-sector from it?

> - Make sure scrub_submit_extent_sector_read() won't read beyond chunk
>   Normally this function won't read beyond chunk as it would only
>   read ranges covered by an extent.
>   But in the case of corrupted extent tree, where an extent can exist
>   beyond chunk boundary, we can still read beyond chunk.
>   In this case, add extra checks to prevent such read beyond chunk.
>
> Thankfully the other scrub read path won't need extra fixes:
>
> - scrub_stripe_submit_repair_read()
>   With above fixes, we won't update error bit for range beyond chunk,
>   thus scrub_stripe_submit_repair_read() should never submit any read
>   beyond the chunk.
>
> Reported-by: Rongrong <i@rong.moe>
> Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure")
> Tested-by: Rongrong <i@rong.moe>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/scrub.c | 35 ++++++++++++++++++++++++++++-------
>  1 file changed, 28 insertions(+), 7 deletions(-)
>
> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> index a01807cbd4d4..01dd146f050d 100644
> --- a/fs/btrfs/scrub.c
> +++ b/fs/btrfs/scrub.c
> @@ -1098,12 +1098,22 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
>  static void scrub_read_endio(struct btrfs_bio *bbio)
>  {
>          struct scrub_stripe *stripe = bbio->private;
> +        struct bio_vec *bvec;
> +        int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
> +        int num_sectors;
> +        u32 bio_size = 0;
> +        int i;
> +
> +        ASSERT(sector_nr < stripe->nr_sectors);
> +        bio_for_each_bvec_all(bvec, &bbio->bio, i)
> +                bio_size += bvec->bv_len;
> +        num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
>
>          if (bbio->bio.bi_status) {
> -                bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
> -                bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
> +                bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
> +                bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
>          } else {
> -                bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
> +                bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
>          }
>          bio_put(&bbio->bio);
>          if (atomic_dec_and_test(&stripe->pending_io)) {
> @@ -1636,6 +1646,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
>  {
>          struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
>          struct btrfs_bio *bbio = NULL;
> +        unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
> +                                      stripe->bg->length - stripe->logical) >>
> +                                  fs_info->sectorsize_bits;
>          u64 stripe_len = BTRFS_STRIPE_LEN;
>          int mirror = stripe->mirror_num;
>          int i;
> @@ -1646,6 +1659,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
>                  struct page *page = scrub_stripe_get_page(stripe, i);
>                  unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
>
> +                if (i >= nr_sectors)
> +                        break;
> +
>                  /* The current sector cannot be merged, submit the bio. */
>                  if (bbio &&
>                      ((i > 0 &&
> @@ -1701,6 +1717,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
>  {
>          struct btrfs_fs_info *fs_info = sctx->fs_info;
>          struct btrfs_bio *bbio;
> +        unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
> +                                      stripe->bg->length - stripe->logical) >>
> +                                  fs_info->sectorsize_bits;
>          int mirror = stripe->mirror_num;
>
>          ASSERT(stripe->bg);
> @@ -1715,14 +1734,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
>          bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
>                                 scrub_read_endio, stripe);
>
> -        /* Read the whole stripe. */
>          bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
> -        for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
> +        /* Read the whole range inside the chunk boundary. */
> +        for (unsigned int cur = 0; cur < nr_sectors; cur++) {
> +                struct page *page = scrub_stripe_get_page(stripe, cur);
> +                unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
>                  int ret;
>
> -                ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
> +                ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
>                  /* We should have allocated enough bio vectors. */
> -                ASSERT(ret == PAGE_SIZE);
> +                ASSERT(ret == fs_info->sectorsize);
>          }
>          atomic_inc(&stripe->pending_io);
>
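The boundary arithmetic the patch introduces can be illustrated with a small standalone sketch. This is not the kernel code: it is a simplified userspace model, and the chunk geometry below is made up purely to show why the last stripe of a chunk whose end is not 64K aligned has to be clamped.

```c
#include <stdio.h>
#include <stdint.h>

#define BTRFS_STRIPE_LEN  (64 * 1024)   /* scrub works in 64K stripes */
#define SECTORSIZE_BITS   12            /* assume 4K sectors */

int main(void)
{
	/* Hypothetical chunk: starts at 1 GiB, length NOT 64K aligned. */
	uint64_t bg_start  = 1ULL << 30;
	uint64_t bg_length = (10 * 64 + 48) * 1024;   /* 10 full stripes + 48K */

	/* Logical start of the last scrub stripe inside the chunk. */
	uint64_t logical = bg_start + 10 * (uint64_t)BTRFS_STRIPE_LEN;

	/*
	 * Before the fix the read always covered BTRFS_STRIPE_LEN, so the
	 * last 16K crossed the chunk boundary and btrfs_submit_bio() had to
	 * split the bio.  Clamping to the chunk end avoids that.
	 */
	uint64_t to_end  = bg_start + bg_length - logical;
	uint64_t clamped = to_end < BTRFS_STRIPE_LEN ? to_end : BTRFS_STRIPE_LEN;
	unsigned int nr_sectors = clamped >> SECTORSIZE_BITS;

	printf("bytes left in chunk: %llu, sectors to read: %u\n",
	       (unsigned long long)to_end, nr_sectors);   /* 49152, 12 */
	return 0;
}
```

With the read clamped this way, the initial bio never crosses the chunk boundary, so btrfs_submit_bio() has no reason to split it and the endio runs exactly once.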
On 2024/1/15 22:39, Johannes Thumshirn wrote:
[...]
>
>> - Make sure scrub_submit_initial_read() only to read the chunk range
>>   This is done by calculating the real number of sectors we need to
>>   read, and add sector-by-sector to the bio.
>
> Why can't you do it the same way the RST version does it by checking the
> extent_sector_bitmap and then add sector-by-sector from it?

Sure, we can, although the whole new scrub code predates RST, and at
that time, the whole 64K read behavior was considered a better option,
as it reduces the IOPS for a fragmented stripe.

And initially it was done to make the code much simpler, but it turns
out that the "simpler" code is also less robust, requiring a very
strict never-be-split condition, which finally led to the bug.

Thanks,
Qu
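Qu's IOPS argument can be made concrete with another standalone sketch. Nothing here is kernel code; the 16-bit "extent sector bitmap" is invented for illustration: submitting sector-by-sector from such a bitmap costs one read per contiguous run of set bits, while the pre-RST full-stripe read is always a single I/O.

```c
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/*
	 * Made-up extent sector bitmap for one 64K stripe with 4K sectors:
	 * extents cover sectors 0-1, 4, 7-9 and 13-15.
	 */
	uint16_t extent_sector_bitmap = 0xE393;
	int reads = 0;

	/* Sector-by-sector submission: one read per contiguous run of bits. */
	for (int i = 0; i < 16; i++) {
		int cur  = (extent_sector_bitmap >> i) & 1;
		int prev = i ? (extent_sector_bitmap >> (i - 1)) & 1 : 0;

		if (cur && !prev)
			reads++;
	}
	printf("per-extent reads:  %d\n", reads);   /* 4 */
	printf("full-stripe reads: %d\n", 1);       /* always a single bio */
	return 0;
}
```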
On Tue, Jan 16, 2024 at 09:20:58AM +1030, Qu Wenruo wrote:
>
>
> On 2024/1/15 22:39, Johannes Thumshirn wrote:
> [...]
> >
> >> - Make sure scrub_submit_initial_read() only to read the chunk range
> >>   This is done by calculating the real number of sectors we need to
> >>   read, and add sector-by-sector to the bio.
> >
> > Why can't you do it the same way the RST version does it by checking the
> > extent_sector_bitmap and then add sector-by-sector from it?
>
> Sure, we can, although the whole new scrub code predates RST, and at
> that time, the whole 64K read behavior was considered a better option,
> as it reduces the IOPS for a fragmented stripe.

I'd like to keep the scrub fix separate from the RST code, even if
there's a chance for some code sharing or reuse. The scrub fix needs to
be backported so it's better to keep it independent.

The change itself looks like an enhancement of the existing code so it's
"obvious" and does not need any preparatory patches.
On 2024/1/17 04:58, David Sterba wrote:
> On Tue, Jan 16, 2024 at 09:20:58AM +1030, Qu Wenruo wrote:
>>
>>
>> On 2024/1/15 22:39, Johannes Thumshirn wrote:
>> [...]
>>>
>>>> - Make sure scrub_submit_initial_read() only to read the chunk range
>>>>   This is done by calculating the real number of sectors we need to
>>>>   read, and add sector-by-sector to the bio.
>>>
>>> Why can't you do it the same way the RST version does it by checking the
>>> extent_sector_bitmap and then add sector-by-sector from it?
>>
>> Sure, we can, although the whole new scrub code predates RST, and at
>> that time, the whole 64K read behavior was considered a better option,
>> as it reduces the IOPS for a fragmented stripe.
>
> I'd like to keep the scrub fix separate from the RST code, even if
> there's a chance for some code sharing or reuse. The scrub fix needs to
> be backported so it's better to keep it independent.

So do I need to split the fix, so that the first part would be purely
for the non-RST scrub part, and then a small fix to the RST part?

I can try to do that, but since we need to touch the read endio function
anyway, it may mean if we don't do it properly, it may break bisection.

Thus I'd prefer to do a manual backport for the older branches without
the RST code.

Thanks,
Qu

>
> The change itself looks like an enhancement of the existing code so it's
> "obvious" and does not need any preparatory patches.
>
On Wed, Jan 17, 2024 at 06:36:00AM +1030, Qu Wenruo wrote:
> On 2024/1/17 04:58, David Sterba wrote:
> > On Tue, Jan 16, 2024 at 09:20:58AM +1030, Qu Wenruo wrote:
> >> On 2024/1/15 22:39, Johannes Thumshirn wrote:
> >> [...]
> >>>
> >>>> - Make sure scrub_submit_initial_read() only to read the chunk range
> >>>>   This is done by calculating the real number of sectors we need to
> >>>>   read, and add sector-by-sector to the bio.
> >>>
> >>> Why can't you do it the same way the RST version does it by checking the
> >>> extent_sector_bitmap and then add sector-by-sector from it?
> >>
> >> Sure, we can, although the whole new scrub code predates RST, and at
> >> that time, the whole 64K read behavior was considered a better option,
> >> as it reduces the IOPS for a fragmented stripe.
> >
> > I'd like to keep the scrub fix separate from the RST code, even if
> > there's a chance for some code sharing or reuse. The scrub fix needs to
> > be backported so it's better to keep it independent.
>
> So do I need to split the fix, so that the first part would be purely
> for the non-RST scrub part, and then a small fix to the RST part?
>
> I can try to do that, but since we need to touch the read endio function
> anyway, it may mean if we don't do it properly, it may break bisection.
>
> Thus I'd prefer to do a manual backport for the older branches without
> the RST code.

I was not sure how much the scrub and RST are entangled so it was a
suggestion to make the backport workable. It is preferred by stable to
take patches 1:1 regarding the code changes (context adjustments are
ok). In this case the manual backport would be needed, let's say one
patch is taken without change and another one (regarding the RST
changes) would be manually tweaked.
On 2024/1/17 11:25, David Sterba wrote:
> On Wed, Jan 17, 2024 at 06:36:00AM +1030, Qu Wenruo wrote:
>> On 2024/1/17 04:58, David Sterba wrote:
>>> On Tue, Jan 16, 2024 at 09:20:58AM +1030, Qu Wenruo wrote:
>>>> On 2024/1/15 22:39, Johannes Thumshirn wrote:
>>>> [...]
>>>>>
>>>>>> - Make sure scrub_submit_initial_read() only to read the chunk range
>>>>>>   This is done by calculating the real number of sectors we need to
>>>>>>   read, and add sector-by-sector to the bio.
>>>>>
>>>>> Why can't you do it the same way the RST version does it by checking the
>>>>> extent_sector_bitmap and then add sector-by-sector from it?
>>>>
>>>> Sure, we can, although the whole new scrub code predates RST, and at
>>>> that time, the whole 64K read behavior was considered a better option,
>>>> as it reduces the IOPS for a fragmented stripe.
>>>
>>> I'd like to keep the scrub fix separate from the RST code, even if
>>> there's a chance for some code sharing or reuse. The scrub fix needs to
>>> be backported so it's better to keep it independent.
>>
>> So do I need to split the fix, so that the first part would be purely
>> for the non-RST scrub part, and then a small fix to the RST part?
>>
>> I can try to do that, but since we need to touch the read endio function
>> anyway, it may mean if we don't do it properly, it may break bisection.
>>
>> Thus I'd prefer to do a manual backport for the older branches without
>> the RST code.
>
> I was not sure how much the scrub and RST are entangled so it was a
> suggestion to make the backport workable. It is preferred by stable to
> take patches 1:1 regarding the code changes (context adjustments are
> ok). In this case the manual backport would be needed, let's say one
> patch is taken without change and another one (regarding the RST
> changes) would be manually tweaked.
>

No big deal.

It turns out that I can indeed split the patch into two. The change on
scrub_read_endio() is already shared by both the regular and RST scrub
paths. The only change I did to the RST-specific path can be split out
into another patch, which doesn't even need to be backported.

So yes, the split is possible, and it should make later backport much
easier.

Thanks,
Qu
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a01807cbd4d4..01dd146f050d 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1098,12 +1098,22 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
 static void scrub_read_endio(struct btrfs_bio *bbio)
 {
         struct scrub_stripe *stripe = bbio->private;
+        struct bio_vec *bvec;
+        int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+        int num_sectors;
+        u32 bio_size = 0;
+        int i;
+
+        ASSERT(sector_nr < stripe->nr_sectors);
+        bio_for_each_bvec_all(bvec, &bbio->bio, i)
+                bio_size += bvec->bv_len;
+        num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;

         if (bbio->bio.bi_status) {
-                bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
-                bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
+                bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
+                bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
         } else {
-                bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+                bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
         }
         bio_put(&bbio->bio);
         if (atomic_dec_and_test(&stripe->pending_io)) {
@@ -1636,6 +1646,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
 {
         struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
         struct btrfs_bio *bbio = NULL;
+        unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+                                      stripe->bg->length - stripe->logical) >>
+                                  fs_info->sectorsize_bits;
         u64 stripe_len = BTRFS_STRIPE_LEN;
         int mirror = stripe->mirror_num;
         int i;
@@ -1646,6 +1659,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
                 struct page *page = scrub_stripe_get_page(stripe, i);
                 unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);

+                if (i >= nr_sectors)
+                        break;
+
                 /* The current sector cannot be merged, submit the bio. */
                 if (bbio &&
                     ((i > 0 &&
@@ -1701,6 +1717,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
 {
         struct btrfs_fs_info *fs_info = sctx->fs_info;
         struct btrfs_bio *bbio;
+        unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+                                      stripe->bg->length - stripe->logical) >>
+                                  fs_info->sectorsize_bits;
         int mirror = stripe->mirror_num;

         ASSERT(stripe->bg);
@@ -1715,14 +1734,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
         bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
                                scrub_read_endio, stripe);

-        /* Read the whole stripe. */
         bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
-        for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
+        /* Read the whole range inside the chunk boundary. */
+        for (unsigned int cur = 0; cur < nr_sectors; cur++) {
+                struct page *page = scrub_stripe_get_page(stripe, cur);
+                unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
                 int ret;

-                ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
+                ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
                 /* We should have allocated enough bio vectors. */
-                ASSERT(ret == PAGE_SIZE);
+                ASSERT(ret == fs_info->sectorsize);
         }
         atomic_inc(&stripe->pending_io);
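The range-limited bookkeeping the patch adds to scrub_read_endio() can also be modeled outside the kernel. The sketch below is only an illustration: struct toy_bvec, struct toy_bio, struct toy_stripe and toy_read_endio() are made-up stand-ins for the real structures and for calc_sector_number()/bio_for_each_bvec_all(), meant to show how the completed range is derived from the bio itself instead of assuming the whole 64K stripe.

```c
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define SECTORSIZE      4096u
#define STRIPE_SECTORS  16u             /* 64K stripe, 4K sectors */

struct toy_bvec   { uint32_t stripe_off; uint32_t len; };  /* offset into stripe */
struct toy_bio    { struct toy_bvec vecs[8]; int nr_vecs; bool failed; };
struct toy_stripe { bool io_error[STRIPE_SECTORS]; };

static void toy_read_endio(struct toy_stripe *stripe, const struct toy_bio *bio)
{
	/* Like calc_sector_number(): the first bvec tells us where we start. */
	unsigned int sector_nr = bio->vecs[0].stripe_off / SECTORSIZE;
	unsigned int bio_size = 0;

	/* Like the bio_for_each_bvec_all() loop: sum up the bio's length. */
	for (int i = 0; i < bio->nr_vecs; i++)
		bio_size += bio->vecs[i].len;

	unsigned int num_sectors = bio_size / SECTORSIZE;

	/* Only touch the window this bio actually covered. */
	for (unsigned int i = sector_nr; i < sector_nr + num_sectors; i++)
		stripe->io_error[i] = bio->failed;
}

int main(void)
{
	struct toy_stripe stripe = { 0 };
	/* Two partial reads, e.g. what the RST scrub path may submit. */
	struct toy_bio a = { .vecs = { { 0, 8 * SECTORSIZE } }, .nr_vecs = 1 };
	struct toy_bio b = { .vecs = { { 8 * SECTORSIZE, 4 * SECTORSIZE } },
			     .nr_vecs = 1, .failed = true };

	toy_read_endio(&stripe, &a);
	toy_read_endio(&stripe, &b);

	for (unsigned int i = 0; i < STRIPE_SECTORS; i++)
		printf("sector %2u: %s\n", i, stripe.io_error[i] ? "error" : "ok");
	return 0;
}
```

Because each completion only touches the window its own bio covered, partial reads such as the ones the RST scrub path submits cannot overwrite each other's results.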