[v3,06/11] Btrfs, raid56: support parity scrub on raid56

Message ID	1417007091-11885-7-git-send-email-miaox@cn.fujitsu.com (mailing list archive)
State	Superseded
Headers	show Return-Path: <linux-btrfs-owner@kernel.org> From: Miao Xie <miaox@cn.fujitsu.com> To: <linux-btrfs@vger.kernel.org> Subject: [PATCH v3 06/11] Btrfs, raid56: support parity scrub on raid56 Date: Wed, 26 Nov 2014 21:04:46 +0800 Message-ID: <1417007091-11885-7-git-send-email-miaox@cn.fujitsu.com> In-Reply-To: <1417007091-11885-1-git-send-email-miaox@cn.fujitsu.com> References: <1416928481.3019.13@mail.thefacebook.com> <1417007091-11885-1-git-send-email-miaox@cn.fujitsu.com> MIME-Version: 1.0 Content-Type: text/plain Sender: linux-btrfs-owner@vger.kernel.org Precedence: bulk

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index bfc406d..3b99cbc 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -65,6 +65,7 @@ enum btrfs_rbio_ops { BTRFS_RBIO_WRITE = 0, BTRFS_RBIO_READ_REBUILD = 1, + BTRFS_RBIO_PARITY_SCRUB = 2, }; struct btrfs_raid_bio { @@ -123,6 +124,7 @@ struct btrfs_raid_bio { /* number of data stripes (no p/q) */ int nr_data; + int stripe_npages; /* * set if we're doing a parity rebuild * for a read from higher up, which is handled @@ -137,6 +139,7 @@ struct btrfs_raid_bio { /* second bad stripe (for raid6 use) */ int failb; + int scrubp; /* * number of pages needed to represent the full * stripe @@ -171,6 +174,11 @@ struct btrfs_raid_bio { * here for faster lookup */ struct page **bio_pages; + + /* + * bitmap to record which horizontal stripe has data + */ + unsigned long *dbitmap; }; static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); @@ -185,6 +193,10 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); +static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, + int need_check); +static void async_scrub_parity(struct btrfs_raid_bio *rbio); + /* * the stripe hash table is used for locking, and to collect * bios in hopes of making a full stripe @@ -586,10 +598,20 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, cur->raid_map[0]) return 0; - /* reads can't merge with writes */ - if (last->operation != cur->operation) { + /* we can't merge with different operations */ + if (last->operation != cur->operation) + return 0; + /* + * We've need read the full stripe from the drive. + * check and repair the parity and write the new results. + * + * We're not allowed to add any new bios to the + * bio list here, anyone else that wants to + * change this stripe needs to do their own rmw. + */ + if (last->operation == BTRFS_RBIO_PARITY_SCRUB || + cur->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - } return 1; } @@ -782,9 +804,12 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) if (next->operation == BTRFS_RBIO_READ_REBUILD) async_read_rebuild(next); - else if (next->operation == BTRFS_RBIO_WRITE){ + else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); async_rmw_stripe(next); + } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { + steal_rbio(rbio, next); + async_scrub_parity(next); } goto done_nolock; @@ -950,9 +975,11 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, struct btrfs_raid_bio *rbio; int nr_data = 0; int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); + int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); void *p; - rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, + rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + + DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -967,6 +994,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, rbio->fs_info = root->fs_info; rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; + rbio->stripe_npages = stripe_npages; rbio->faila = -1; rbio->failb = -1; atomic_set(&rbio->refs, 1); @@ -980,6 +1008,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, p = rbio + 1; rbio->stripe_pages = p; rbio->bio_pages = p + sizeof(struct page *) * num_pages; + rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) nr_data = bbio->num_stripes - 2; @@ -1774,6 +1803,14 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); for (pagenr = 0; pagenr < nr_pages; pagenr++) { + /* + * Now we just use bitmap to mark the horizontal stripes in + * which we have data when doing parity scrub. + */ + if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && + !test_bit(pagenr, rbio->dbitmap)) + continue; + /* setup our array of pointers with pages * from each stripe */ @@ -1918,7 +1955,13 @@ cleanup_io: } else if (err == 0) { rbio->faila = -1; rbio->failb = -1; - finish_rmw(rbio); + + if (rbio->operation == BTRFS_RBIO_WRITE) + finish_rmw(rbio); + else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) + finish_parity_scrub(rbio, 0); + else + BUG(); } else { rbio_orig_end_io(rbio, err, 0); } @@ -2126,3 +2169,462 @@ static void read_rebuild_work(struct btrfs_work *work) rbio = container_of(work, struct btrfs_raid_bio, work); __raid56_parity_recover(rbio); } + +/* + * The following code is used to scrub/replace the parity stripe + * + * Note: We need make sure all the pages that add into the scrub/replace + * raid bio are correct and not be changed during the scrub/replace. That + * is those pages just hold metadata or file data with checksum. + */ + +struct btrfs_raid_bio * +raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len, struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors) +{ + struct btrfs_raid_bio *rbio; + int i; + + rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + if (IS_ERR(rbio)) + return NULL; + bio_list_add(&rbio->bio_list, bio); + /* + * This is a special bio which is used to hold the completion handler + * and make the scrub rbio is similar to the other types + */ + ASSERT(!bio->bi_iter.bi_size); + rbio->operation = BTRFS_RBIO_PARITY_SCRUB; + + for (i = 0; i < bbio->num_stripes; i++) { + if (bbio->stripes[i].dev == scrub_dev) { + rbio->scrubp = i; + break; + } + } + + /* Now we just support the sectorsize equals to page size */ + ASSERT(root->sectorsize == PAGE_SIZE); + ASSERT(rbio->stripe_npages == stripe_nsectors); + bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); + + return rbio; +} + +void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, + struct page *page, u64 logical) +{ + int stripe_offset; + int index; + + ASSERT(logical >= rbio->raid_map[0]); + ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] + + rbio->stripe_len * rbio->nr_data); + stripe_offset = (int)(logical - rbio->raid_map[0]); + index = stripe_offset >> PAGE_CACHE_SHIFT; + rbio->bio_pages[index] = page; +} + +/* + * We just scrub the parity that we have correct data on the same horizontal, + * so we needn't allocate all pages for all the stripes. + */ +static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) +{ + int i; + int bit; + int index; + struct page *page; + + for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { + for (i = 0; i < rbio->bbio->num_stripes; i++) { + index = i * rbio->stripe_npages + bit; + if (rbio->stripe_pages[index]) + continue; + + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[index] = page; + ClearPageUptodate(page); + } + } + return 0; +} + +/* + * end io function used by finish_rmw. When we finally + * get here, we've written a full stripe + */ +static void raid_write_parity_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->stripes_pending)) + return; + + err = 0; + + if (atomic_read(&rbio->error)) + err = -EIO; + + rbio_orig_end_io(rbio, err, 0); +} + +static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, + int need_check) +{ + struct btrfs_bio *bbio = rbio->bbio; + void *pointers[bbio->num_stripes]; + int nr_data = rbio->nr_data; + int stripe; + int pagenr; + int p_stripe = -1; + int q_stripe = -1; + struct page *p_page = NULL; + struct page *q_page = NULL; + struct bio_list bio_list; + struct bio *bio; + int ret; + + bio_list_init(&bio_list); + + if (bbio->num_stripes - rbio->nr_data == 1) { + p_stripe = bbio->num_stripes - 1; + } else if (bbio->num_stripes - rbio->nr_data == 2) { + p_stripe = bbio->num_stripes - 2; + q_stripe = bbio->num_stripes - 1; + } else { + BUG(); + } + + /* + * Because the higher layers(scrubber) are unlikely to + * use this area of the disk again soon, so don't cache + * it. + */ + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + if (!need_check) + goto writeback; + + p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!p_page) + goto cleanup; + SetPageUptodate(p_page); + + if (q_stripe != -1) { + q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!q_page) { + __free_page(p_page); + goto cleanup; + } + SetPageUptodate(q_page); + } + + atomic_set(&rbio->error, 0); + + for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { + struct page *p; + void *parity; + /* first collect one page from each data stripe */ + for (stripe = 0; stripe < nr_data; stripe++) { + p = page_in_rbio(rbio, stripe, pagenr, 0); + pointers[stripe] = kmap(p); + } + + /* then add the parity stripe */ + pointers[stripe++] = kmap(p_page); + + if (q_stripe != -1) { + + /* + * raid6, add the qstripe and call the + * library function to fill in our p/q + */ + pointers[stripe++] = kmap(q_page); + + raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, + pointers); + } else { + /* raid5 */ + memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); + run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); + } + + /* Check scrubbing pairty and repair it */ + p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); + parity = kmap(p); + if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE)) + memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE); + else + /* Parity is right, needn't writeback */ + bitmap_clear(rbio->dbitmap, pagenr, 1); + kunmap(p); + + for (stripe = 0; stripe < bbio->num_stripes; stripe++) + kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + } + + __free_page(p_page); + if (q_page) + __free_page(q_page); + +writeback: + /* + * time to start writing. Make bios for everything from the + * higher layers (the bio_list in our rbio) and our p/q. Ignore + * everything else. + */ + for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { + struct page *page; + + page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); + ret = rbio_add_io_page(rbio, &bio_list, + page, rbio->scrubp, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + + nr_data = bio_list_size(&bio_list); + if (!nr_data) { + /* Every parity is right */ + rbio_orig_end_io(rbio, 0, 0); + return; + } + + atomic_set(&rbio->stripes_pending, nr_data); + + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_write_parity_end_io; + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(WRITE, bio); + } + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); +} + +static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) +{ + if (stripe >= 0 && stripe < rbio->nr_data) + return 1; + return 0; +} + +/* + * While we're doing the parity check and repair, we could have errors + * in reading pages off the disk. This checks for errors and if we're + * not able to read the page it'll trigger parity reconstruction. The + * parity scrub will be finished after we've reconstructed the failed + * stripes + */ +static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) +{ + if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + goto cleanup; + + if (rbio->faila >= 0 || rbio->failb >= 0) { + int dfail = 0, failp = -1; + + if (is_data_stripe(rbio, rbio->faila)) + dfail++; + else if (is_parity_stripe(rbio->faila)) + failp = rbio->faila; + + if (is_data_stripe(rbio, rbio->failb)) + dfail++; + else if (is_parity_stripe(rbio->failb)) + failp = rbio->failb; + + /* + * Because we can not use a scrubbing parity to repair + * the data, so the capability of the repair is declined. + * (In the case of RAID5, we can not repair anything) + */ + if (dfail > rbio->bbio->max_errors - 1) + goto cleanup; + + /* + * If all data is good, only parity is correctly, just + * repair the parity. + */ + if (dfail == 0) { + finish_parity_scrub(rbio, 0); + return; + } + + /* + * Here means we got one corrupted data stripe and one + * corrupted parity on RAID6, if the corrupted parity + * is scrubbing parity, luckly, use the other one to repair + * the data, or we can not repair the data stripe. + */ + if (failp != rbio->scrubp) + goto cleanup; + + __raid_recover_end_io(rbio); + } else { + finish_parity_scrub(rbio, 1); + } + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); +} + +/* + * end io for the read phase of the rmw cycle. All the bios here are physical + * stripe bios we've read from the disk so we can recalculate the parity of the + * stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid56_parity_scrub_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->stripes_pending)) + return; + + /* + * this will normally call finish_rmw to start our write + * but if there are any failed stripes we'll reconstruct + * from parity first + */ + validate_rbio_for_parity_scrub(rbio); +} + +static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct btrfs_bio *bbio = rbio->bbio; + struct bio_list bio_list; + int ret; + int pagenr; + int stripe; + struct bio *bio; + + ret = alloc_rbio_essential_pages(rbio); + if (ret) + goto cleanup; + + bio_list_init(&bio_list); + + atomic_set(&rbio->error, 0); + /* + * build a list of bios to read all the missing parts of this + * stripe + */ + for (stripe = 0; stripe < bbio->num_stripes; stripe++) { + for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { + struct page *page; + /* + * we want to find all the pages missing from + * the rbio and read them from the disk. If + * page_in_rbio finds a page in the bio list + * we don't need to read it off the stripe. + */ + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (page) + continue; + + page = rbio_stripe_page(rbio, stripe, pagenr); + /* + * the bio cache may have handed us an uptodate + * page. If so, be happy and use it + */ + if (PageUptodate(page)) + continue; + + ret = rbio_add_io_page(rbio, &bio_list, page, + stripe, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + + bios_to_read = bio_list_size(&bio_list); + if (!bios_to_read) { + /* + * this can happen if others have merged with + * us, it means there is nothing left to read. + * But if there are missing devices it may not be + * safe to do the full stripe write yet. + */ + goto finish; + } + + /* + * the bbio may be freed once we submit the last bio. Make sure + * not to touch it after that + */ + atomic_set(&rbio->stripes_pending, bios_to_read); + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid56_parity_scrub_end_io; + + btrfs_bio_wq_end_io(rbio->fs_info, bio, + BTRFS_WQ_ENDIO_RAID56); + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(READ, bio); + } + /* the actual write will happen once the reads are done */ + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); + return; + +finish: + validate_rbio_for_parity_scrub(rbio); +} + +static void scrub_parity_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + raid56_parity_scrub_stripe(rbio); +} + +static void async_scrub_parity(struct btrfs_raid_bio *rbio) +{ + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + scrub_parity_work, NULL, NULL); + + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); +} + +void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) +{ + if (!lock_stripe_add(rbio)) + async_scrub_parity(rbio); +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index b310e8c..3d4ddb3 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -39,6 +39,9 @@ static inline int nr_data_stripes(struct map_lookup *map) #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ ((x) == RAID6_Q_STRIPE)) +struct btrfs_raid_bio; +struct btrfs_device; + int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, struct btrfs_bio *bbio, u64 *raid_map, u64 stripe_len, int mirror_num, int hold_bbio); @@ -46,6 +49,15 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, struct btrfs_bio *bbio, u64 *raid_map, u64 stripe_len); +struct btrfs_raid_bio * +raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len, struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors); +void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, + struct page *page, u64 logical); +void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); + int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ca4b9eb..7f95afc 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -74,6 +74,7 @@ struct scrub_page { struct scrub_block *sblock; struct page *page; struct btrfs_device *dev; + struct list_head list; u64 flags; /* extent flags */ u64 generation; u64 logical; @@ -114,14 +115,52 @@ struct scrub_block { atomic_t outstanding_pages; atomic_t ref_count; /* free mem on transition to zero */ struct scrub_ctx *sctx; + struct scrub_parity *sparity; struct { unsigned int header_error:1; unsigned int checksum_error:1; unsigned int no_io_error_seen:1; unsigned int generation_error:1; /* also sets header_error */ + + /* The following is for the data used to check parity */ + /* It is for the data with checksum */ + unsigned int data_corrected:1; }; }; +/* Used for the chunks with parity stripe such RAID5/6 */ +struct scrub_parity { + struct scrub_ctx *sctx; + + struct btrfs_device *scrub_dev; + + u64 logic_start; + + u64 logic_end; + + int nsectors; + + int stripe_len; + + atomic_t ref_count; + + struct list_head spages; + + /* Work of parity check and repair */ + struct btrfs_work work; + + /* Mark the parity blocks which have data */ + unsigned long *dbitmap; + + /* + * Mark the parity blocks which have data, but errors happen when + * read data or check data + */ + unsigned long *ebitmap; + + unsigned long bitmap[0]; +}; + struct scrub_wr_ctx { struct scrub_bio *wr_curr_bio; struct btrfs_device *tgtdev; @@ -227,6 +266,8 @@ static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); static void scrub_page_get(struct scrub_page *spage); static void scrub_page_put(struct scrub_page *spage); +static void scrub_parity_get(struct scrub_parity *sparity); +static void scrub_parity_put(struct scrub_parity *sparity); static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, @@ -943,6 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) */ spin_lock(&sctx->stat_lock); sctx->stat.unverified_errors++; + sblock_to_check->data_corrected = 1; spin_unlock(&sctx->stat_lock); if (sctx->is_dev_replace) @@ -1203,6 +1245,7 @@ nodatasum_case: corrected_error: spin_lock(&sctx->stat_lock); sctx->stat.corrected_errors++; + sblock_to_check->data_corrected = 1; spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "BTRFS: fixed up error at logical %llu on dev %s\n", @@ -1644,6 +1687,13 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) { int page_num; + /* + * This block is used for the check of the parity on the source device, + * so the data needn't be written into the destination device. + */ + if (sblock->sparity) + return; + for (page_num = 0; page_num < sblock->page_count; page_num++) { int ret; @@ -2025,6 +2075,9 @@ static void scrub_block_put(struct scrub_block *sblock) if (atomic_dec_and_test(&sblock->ref_count)) { int i; + if (sblock->sparity) + scrub_parity_put(sblock->sparity); + for (i = 0; i < sblock->page_count; i++) scrub_page_put(sblock->pagev[i]); kfree(sblock); @@ -2282,9 +2335,51 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) scrub_pending_bio_dec(sctx); } +static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, + unsigned long *bitmap, + u64 start, u64 len) +{ + int offset; + int nsectors; + int sectorsize = sparity->sctx->dev_root->sectorsize; + + if (len >= sparity->stripe_len) { + bitmap_set(bitmap, 0, sparity->nsectors); + return; + } + + start -= sparity->logic_start; + offset = (int)do_div(start, sparity->stripe_len); + offset /= sectorsize; + nsectors = (int)len / sectorsize; + + if (offset + nsectors <= sparity->nsectors) { + bitmap_set(bitmap, offset, nsectors); + return; + } + + bitmap_set(bitmap, offset, sparity->nsectors - offset); + bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset)); +} + +static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, + u64 start, u64 len) +{ + __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len); +} + +static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, + u64 start, u64 len) +{ + __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len); +} + static void scrub_block_complete(struct scrub_block *sblock) { + int corrupted = 0; + if (!sblock->no_io_error_seen) { + corrupted = 1; scrub_handle_errored_block(sblock); } else { /* @@ -2292,9 +2387,19 @@ static void scrub_block_complete(struct scrub_block *sblock) * dev replace case, otherwise write here in dev replace * case. */ - if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) + corrupted = scrub_checksum(sblock); + if (!corrupted && sblock->sctx->is_dev_replace) scrub_write_block_to_dev_replace(sblock); } + + if (sblock->sparity && corrupted && !sblock->data_corrected) { + u64 start = sblock->pagev[0]->logical; + u64 end = sblock->pagev[sblock->page_count - 1]->logical + + PAGE_SIZE; + + scrub_parity_mark_sectors_error(sblock->sparity, + start, end - start); + } } static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, @@ -2386,6 +2491,132 @@ behind_scrub_pages: return 0; } +static int scrub_pages_for_parity(struct scrub_parity *sparity, + u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, + u64 flags, u64 gen, int mirror_num, u8 *csum) +{ + struct scrub_ctx *sctx = sparity->sctx; + struct scrub_block *sblock; + int index; + + sblock = kzalloc(sizeof(*sblock), GFP_NOFS); + if (!sblock) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + /* one ref inside this function, plus one for each page added to + * a bio later on */ + atomic_set(&sblock->ref_count, 1); + sblock->sctx = sctx; + sblock->no_io_error_seen = 1; + sblock->sparity = sparity; + scrub_parity_get(sparity); + + for (index = 0; len > 0; index++) { + struct scrub_page *spage; + u64 l = min_t(u64, len, PAGE_SIZE); + + spage = kzalloc(sizeof(*spage), GFP_NOFS); + if (!spage) { +leave_nomem: + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + scrub_block_put(sblock); + return -ENOMEM; + } + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + /* For scrub block */ + scrub_page_get(spage); + sblock->pagev[index] = spage; + /* For scrub parity */ + scrub_page_get(spage); + list_add_tail(&spage->list, &sparity->spages); + spage->sblock = sblock; + spage->dev = dev; + spage->flags = flags; + spage->generation = gen; + spage->logical = logical; + spage->physical = physical; + spage->mirror_num = mirror_num; + if (csum) { + spage->have_csum = 1; + memcpy(spage->csum, csum, sctx->csum_size); + } else { + spage->have_csum = 0; + } + sblock->page_count++; + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) + goto leave_nomem; + len -= l; + logical += l; + physical += l; + } + + WARN_ON(sblock->page_count == 0); + for (index = 0; index < sblock->page_count; index++) { + struct scrub_page *spage = sblock->pagev[index]; + int ret; + + ret = scrub_add_page_to_rd_bio(sctx, spage); + if (ret) { + scrub_block_put(sblock); + return ret; + } + } + + /* last one frees, either here or in bio completion for last page */ + scrub_block_put(sblock); + return 0; +} + +static int scrub_extent_for_parity(struct scrub_parity *sparity, + u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, + u64 flags, u64 gen, int mirror_num) +{ + struct scrub_ctx *sctx = sparity->sctx; + int ret; + u8 csum[BTRFS_CSUM_SIZE]; + u32 blocksize; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + blocksize = sctx->sectorsize; + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + blocksize = sctx->nodesize; + } else { + blocksize = sctx->sectorsize; + WARN_ON(1); + } + + while (len) { + u64 l = min_t(u64, len, blocksize); + int have_csum = 0; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + /* push csums to sbio */ + have_csum = scrub_find_csum(sctx, logical, l, csum); + if (have_csum == 0) + goto skip; + } + ret = scrub_pages_for_parity(sparity, logical, l, physical, dev, + flags, gen, mirror_num, + have_csum ? csum : NULL); +skip: + if (ret) + return ret; + len -= l; + logical += l; + physical += l; + } + return 0; +} + /* * Given a physical address, this will calculate it's * logical offset. if this is a parity stripe, it will return @@ -2394,7 +2625,8 @@ behind_scrub_pages: * return 0 if it is a data stripe, 1 means parity stripe. */ static int get_raid56_logic_offset(u64 physical, int num, - struct map_lookup *map, u64 *offset) + struct map_lookup *map, u64 *offset, + u64 *stripe_start) { int i; int j = 0; @@ -2405,6 +2637,9 @@ static int get_raid56_logic_offset(u64 physical, int num, last_offset = (physical - map->stripes[num].physical) * nr_data_stripes(map); + if (stripe_start) + *stripe_start = last_offset; + *offset = last_offset; for (i = 0; i < nr_data_stripes(map); i++) { *offset = last_offset + i * map->stripe_len; @@ -2427,13 +2662,330 @@ static int get_raid56_logic_offset(u64 physical, int num, return 1; } +static void scrub_free_parity(struct scrub_parity *sparity) +{ + struct scrub_ctx *sctx = sparity->sctx; + struct scrub_page *curr, *next; + int nbits; + + nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); + if (nbits) { + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors += nbits; + sctx->stat.uncorrectable_errors += nbits; + spin_unlock(&sctx->stat_lock); + } + + list_for_each_entry_safe(curr, next, &sparity->spages, list) { + list_del_init(&curr->list); + scrub_page_put(curr); + } + + kfree(sparity); +} + +static void scrub_parity_bio_endio(struct bio *bio, int error) +{ + struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; + struct scrub_ctx *sctx = sparity->sctx; + + if (error) + bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, + sparity->nsectors); + + scrub_free_parity(sparity); + scrub_pending_bio_dec(sctx); + bio_put(bio); +} + +static void scrub_parity_check_and_repair(struct scrub_parity *sparity) +{ + struct scrub_ctx *sctx = sparity->sctx; + struct bio *bio; + struct btrfs_raid_bio *rbio; + struct scrub_page *spage; + struct btrfs_bio *bbio = NULL; + u64 *raid_map = NULL; + u64 length; + int ret; + + if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap, + sparity->nsectors)) + goto out; + + length = sparity->logic_end - sparity->logic_start + 1; + ret = btrfs_map_sblock(sctx->dev_root->fs_info, REQ_GET_READ_MIRRORS, + sparity->logic_start, + &length, &bbio, 0, &raid_map); + if (ret || !bbio || !raid_map) + goto bbio_out; + + bio = btrfs_io_bio_alloc(GFP_NOFS, 0); + if (!bio) + goto bbio_out; + + bio->bi_iter.bi_sector = sparity->logic_start >> 9; + bio->bi_private = sparity; + bio->bi_end_io = scrub_parity_bio_endio; + + rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio, + raid_map, length, + sparity->scrub_dev, + sparity->dbitmap, + sparity->nsectors); + if (!rbio) + goto rbio_out; + + list_for_each_entry(spage, &sparity->spages, list) + raid56_parity_add_scrub_pages(rbio, spage->page, + spage->logical); + + scrub_pending_bio_inc(sctx); + raid56_parity_submit_scrub_rbio(rbio); + return; + +rbio_out: + bio_put(bio); +bbio_out: + kfree(bbio); + kfree(raid_map); + bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, + sparity->nsectors); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); +out: + scrub_free_parity(sparity); +} + +static inline int scrub_calc_parity_bitmap_len(int nsectors) +{ + return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8); +} + +static void scrub_parity_get(struct scrub_parity *sparity) +{ + atomic_inc(&sparity->ref_count); +} + +static void scrub_parity_put(struct scrub_parity *sparity) +{ + if (!atomic_dec_and_test(&sparity->ref_count)) + return; + + scrub_parity_check_and_repair(sparity); +} + +static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, + struct map_lookup *map, + struct btrfs_device *sdev, + struct btrfs_path *path, + u64 logic_start, + u64 logic_end) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *csum_root = fs_info->csum_root; + struct btrfs_extent_item *extent; + u64 flags; + int ret; + int slot; + struct extent_buffer *l; + struct btrfs_key key; + u64 generation; + u64 extent_logical; + u64 extent_physical; + u64 extent_len; + struct btrfs_device *extent_dev; + struct scrub_parity *sparity; + int nsectors; + int bitmap_len; + int extent_mirror_num; + int stop_loop = 0; + + nsectors = map->stripe_len / root->sectorsize; + bitmap_len = scrub_calc_parity_bitmap_len(nsectors); + sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, + GFP_NOFS); + if (!sparity) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + sparity->stripe_len = map->stripe_len; + sparity->nsectors = nsectors; + sparity->sctx = sctx; + sparity->scrub_dev = sdev; + sparity->logic_start = logic_start; + sparity->logic_end = logic_end; + atomic_set(&sparity->ref_count, 1); + INIT_LIST_HEAD(&sparity->spages); + sparity->dbitmap = sparity->bitmap; + sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; + + ret = 0; + while (logic_start < logic_end) { + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = logic_start; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = btrfs_previous_extent_item(root, path, 0); + if (ret < 0) + goto out; + if (ret > 0) { + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, + path, 0, 0); + if (ret < 0) + goto out; + } + } + + stop_loop = 0; + while (1) { + u64 bytes; + + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + stop_loop = 1; + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.type == BTRFS_METADATA_ITEM_KEY) + bytes = root->nodesize; + else + bytes = key.offset; + + if (key.objectid + bytes <= logic_start) + goto next; + + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) + goto next; + + if (key.objectid > logic_end) { + stop_loop = 1; + break; + } + + while (key.objectid >= logic_start + map->stripe_len) + logic_start += map->stripe_len; + + extent = btrfs_item_ptr(l, slot, + struct btrfs_extent_item); + flags = btrfs_extent_flags(l, extent); + generation = btrfs_extent_generation(l, extent); + + if (key.objectid < logic_start && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { + btrfs_err(fs_info, + "scrub: tree block %llu spanning stripes, ignored. logical=%llu", + key.objectid, logic_start); + goto next; + } +again: + extent_logical = key.objectid; + extent_len = bytes; + + if (extent_logical < logic_start) { + extent_len -= logic_start - extent_logical; + extent_logical = logic_start; + } + + if (extent_logical + extent_len > + logic_start + map->stripe_len) + extent_len = logic_start + map->stripe_len - + extent_logical; + + scrub_parity_mark_sectors_data(sparity, extent_logical, + extent_len); + + scrub_remap_extent(fs_info, extent_logical, + extent_len, &extent_physical, + &extent_dev, + &extent_mirror_num); + + ret = btrfs_lookup_csums_range(csum_root, + extent_logical, + extent_logical + extent_len - 1, + &sctx->csum_list, 1); + if (ret) + goto out; + + ret = scrub_extent_for_parity(sparity, extent_logical, + extent_len, + extent_physical, + extent_dev, flags, + generation, + extent_mirror_num); + if (ret) + goto out; + + scrub_free_csums(sctx); + if (extent_logical + extent_len < + key.objectid + bytes) { + logic_start += map->stripe_len; + + if (logic_start >= logic_end) { + stop_loop = 1; + break; + } + + if (logic_start < key.objectid + bytes) { + cond_resched(); + goto again; + } + } +next: + path->slots[0]++; + } + + btrfs_release_path(path); + + if (stop_loop) + break; + + logic_start += map->stripe_len; + } +out: + if (ret < 0) + scrub_parity_mark_sectors_error(sparity, logic_start, + logic_end - logic_start + 1); + scrub_parity_put(sparity); + scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + + btrfs_release_path(path); + return ret < 0 ? ret : 0; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, int num, u64 base, u64 length, int is_dev_replace) { - struct btrfs_path *path; + struct btrfs_path *path, *ppath; struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; @@ -2460,6 +3012,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 extent_logical; u64 extent_physical; u64 extent_len; + u64 stripe_logical; + u64 stripe_end; struct btrfs_device *extent_dev; int extent_mirror_num; int stop_loop = 0; @@ -2485,7 +3039,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, mirror_num = num % map->num_stripes + 1; } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { - get_raid56_logic_offset(physical, num, map, &offset); + get_raid56_logic_offset(physical, num, map, &offset, NULL); increment = map->stripe_len * nr_data_stripes(map); mirror_num = 1; } else { @@ -2497,6 +3051,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (!path) return -ENOMEM; + ppath = btrfs_alloc_path(); + if (!ppath) { + btrfs_free_path(ppath); + return -ENOMEM; + } + /* * work on commit root. The related disk blocks are static as * long as COW is applied. This means, it is save to rewrite @@ -2515,7 +3075,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { get_raid56_logic_offset(physical_end, num, - map, &logic_end); + map, &logic_end, NULL); logic_end += base; } else { logic_end = logical + increment * nstripes; @@ -2562,10 +3122,18 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { ret = get_raid56_logic_offset(physical, num, - map, &logical); + map, &logical, &stripe_logical); logical += base; - if (ret) + if (ret) { + stripe_logical += base; + stripe_end = stripe_logical + increment - 1; + ret = scrub_raid56_parity(sctx, map, scrub_dev, + ppath, stripe_logical, + stripe_end); + if (ret) + goto out; goto skip; + } } /* * canceled? @@ -2716,13 +3284,25 @@ again: * loop until we find next data stripe * or we have finished all stripes. */ - do { - physical += map->stripe_len; - ret = get_raid56_logic_offset( - physical, num, - map, &logical); - logical += base; - } while (physical < physical_end && ret); +loop: + physical += map->stripe_len; + ret = get_raid56_logic_offset(physical, + num, map, &logical, + &stripe_logical); + logical += base; + + if (ret && physical < physical_end) { + stripe_logical += base; + stripe_end = stripe_logical + + increment - 1; + ret = scrub_raid56_parity(sctx, + map, scrub_dev, ppath, + stripe_logical, + stripe_end); + if (ret) + goto out; + goto loop; + } } else { physical += map->stripe_len; logical += increment; @@ -2763,6 +3343,7 @@ out: blk_finish_plug(&plug); btrfs_free_path(path); + btrfs_free_path(ppath); return ret < 0 ? ret : 0; }

[v3,06/11] Btrfs, raid56: support parity scrub on raid56

Commit Message

Patch