From patchwork Thu Apr 29 18:06:06 2010
X-Patchwork-Submitter: David Woodhouse
X-Patchwork-Id: 95964
Subject: Updating RAID[56] support
From: David Woodhouse
To: Chris Mason
Cc: linux-btrfs@vger.kernel.org
Date: Thu, 29 Apr 2010 19:06:06 +0100
Message-ID: <1272564366.3367.4143.camel@macbook.infradead.org>

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 32eabf1..70dc314 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2226,7 +2226,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int looped = 0;
 	int ret;
 	int index;
-	int stripe_len = 64 * 1024;
+	int stripe_len = 4 * 1024;

 	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
 	    (type & BTRFS_BLOCK_GROUP_DUP)) {
@@ -2735,6 +2735,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	u64 offset;
 	u64 stripe_offset;
 	u64 stripe_nr;
+	u64 stripe_len;
 	u64 *raid_map = NULL;
 	int stripes_allocated = 8;
 	int stripes_required = 1;
@@ -2816,13 +2817,24 @@ again:
 		goto again;
 	}
 	stripe_nr = offset;
+
+	stripe_len = map->stripe_len;
+	if (!multi_ret && !unplug_page && (rw & (1 << BIO_RW)) &&
+	    map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+		/*
+		 * For the merge_bio_hook() we allow _writes_ (but not reads)
+		 * to cover a full stripe-set.
+		 */
+		stripe_len *= nr_data_stripes(map);
+		printk("Stripe_len becomes %llx\n", stripe_len);
+	}
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	do_div(stripe_nr, map->stripe_len);
+	do_div(stripe_nr, stripe_len);

-	stripe_offset = stripe_nr * map->stripe_len;
+	stripe_offset = stripe_nr * stripe_len;
 	BUG_ON(offset < stripe_offset);

 	/* stripe_offset is the offset of this block in its stripe*/
@@ -2833,8 +2845,21 @@ again:
 		     BTRFS_BLOCK_GROUP_RAID10 |
 		     BTRFS_BLOCK_GROUP_DUP)) {
 		/* we limit the length of each bio to what fits in a stripe */
-		*length = min_t(u64, em->len - offset,
-				map->stripe_len - stripe_offset);
+		/* For writes to RAID[56], allow a full stripe, not just a single
+		   disk's worth */
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+		    !stripe_offset && multi_ret && raid_map_ret && (rw & (1 << BIO_RW))) {
+			*length = min_t(u64, em->len - offset,
+					stripe_len * nr_data_stripes(map));
+			printk("len becomes %Lx for RAID[56] write (min(%Lx,%Lx))\n", *length,
+			       em->len - offset, stripe_len * nr_data_stripes(map));
+		} else {
+			*length = min_t(u64, em->len - offset,
+					stripe_len - stripe_offset);
+			printk("len becomes %Lx (min(%Lx,%Lx))\n", *length,
+			       em->len - offset, stripe_len - stripe_offset);
+		}
+
 	} else {
 		*length = em->len - offset;
 	}
@@ -3173,6 +3198,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	int ret;
 	int dev_nr = 0;
 	int total_devs = 1;
+	printk("%s %d %d %llx %x\n", __func__, rw, mirror_num, logical, bio->bi_size);

 	length = bio->bi_size;
 	map_tree = &root->fs_info->mapping_tree;
@@ -3187,6 +3213,13 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	multi->orig_bio = first_bio;
 	atomic_set(&multi->stripes_pending, multi->num_stripes);

+	if (map_length < length) {
+		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
+		       "len %llu\n", (unsigned long long)logical,
+		       (unsigned long long)length,
+		       (unsigned long long)map_length);
+	}
+
 	if (raid_map) {
 		if (rw == READ)
 			return raid56_parity_recover(root, bio, multi,
@@ -3629,6 +3662,7 @@ struct btrfs_raid_multi_bio {
 	struct btrfs_root *root;
 	struct btrfs_multi_bio *multi;
 	u64 *raid_map;
+	int partial;
 	struct bio *bio[0];
 };

@@ -3671,7 +3705,7 @@ static void raid_write_end_io(struct bio *bio, int err)

 	if (!atomic_dec_and_test(&rmult->multi->stripes_pending))
 		return;
-
+	printk("Ended final write IO\n");
 	/* OK, we have read all the stripes we need to. */
 	if (atomic_read(&rmult->multi->error)) {
 		bio_endio(rmult->multi->orig_bio, -EIO);
@@ -3683,7 +3717,8 @@ static void raid_write_end_io(struct bio *bio, int err)
 	bio_endio(rmult->multi->orig_bio, 0);

 cleanup:
-	mutex_unlock(&raid_hack_mutex);
+	if (rmult->partial)
+		mutex_unlock(&raid_hack_mutex);
 	free_raid_multi(rmult);
 }

@@ -3843,28 +3878,14 @@ static struct bio *alloc_raid_stripe_bio(struct btrfs_root *root,
 	return bio;
 }

-static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
-			       struct btrfs_multi_bio *multi, u64 *raid_map,
-			       u64 stripe_len)
+static int raid56_parity_write_partial(struct btrfs_raid_multi_bio *rmult,
+				       struct bio *bio, u64 stripe_len)
 {
 	int i;
 	int start_ofs, end_ofs;
 	int stripes_to_read = 0;
 	u64 logical = (u64)bio->bi_sector << 9;

-	struct btrfs_raid_multi_bio *rmult;
-
-	rmult = kzalloc(sizeof(*rmult) + multi->num_stripes * sizeof(void *),
-			GFP_NOFS);
-	if (!rmult) {
-		kfree(raid_map);
-		kfree(multi);
-		return -ENOMEM;
-	}
-	rmult->multi = multi;
-	rmult->raid_map = raid_map;
-	rmult->root = root;
-
 	/*
 	 * FIXME: the merge_bio_hook logic currently ensures that writes only
 	 * cover one stripe, meaning we _always_ have to read the other data
@@ -3880,6 +3901,7 @@ static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
 	 * And we can ditch this mutex too:
 	 */
 	mutex_lock(&raid_hack_mutex);
+	rmult->partial = 1;

 	/* What subrange of the stripe are we writing? */
 	start_ofs = do_div(logical, stripe_len);
@@ -3888,19 +3910,19 @@ static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,

 	/* Allocate bios for reading and for the parity and q-stripe writes */
 	logical = (u64)bio->bi_sector << 9;
-	for (i = 0; i < multi->num_stripes; i++) {
+	for (i = 0; i < rmult->multi->num_stripes; i++) {
 		if (start_ofs) {
-			if (!is_parity_stripe(raid_map[i]))
-				raid_map[i] += start_ofs;
-			multi->stripes[i].physical += start_ofs;
+			if (!is_parity_stripe(rmult->raid_map[i]))
+				rmult->raid_map[i] += start_ofs;
+			rmult->multi->stripes[i].physical += start_ofs;
 		}
-		if (raid_map[i] == logical) {
+		if (rmult->raid_map[i] == logical) {
 			/* Set the correct bdev for the original write bio */
-			bio->bi_bdev = multi->stripes[i].dev->bdev;
+			bio->bi_bdev = rmult->multi->stripes[i].dev->bdev;
 		} else {
-			rmult->bio[i] =
-				alloc_raid_stripe_bio(root, &multi->stripes[i],
-						      bio->bi_size);
+			rmult->bio[i] = alloc_raid_stripe_bio(rmult->root,
+							      &rmult->multi->stripes[i],
+							      bio->bi_size);
 			if (!rmult->bio[i]) {
 				free_raid_multi(rmult);
 				bio_endio(bio, -EIO);
@@ -3909,23 +3931,23 @@ static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
 		}
 		rmult->bio[i]->bi_private = rmult;

-		if (!is_parity_stripe(raid_map[i]))
+		if (!is_parity_stripe(rmult->raid_map[i]))
 			stripes_to_read++;
 		}
 	}
 	if (!stripes_to_read) {
 		/* Nothing to read -- just calculate parity and write it all */
-		atomic_set(&multi->stripes_pending, 1);
+		atomic_set(&rmult->multi->stripes_pending, 1);
 		bio->bi_private = rmult;
 		raid_read_end_io(bio, 0);
 		return 0;
 	}

-	atomic_set(&multi->stripes_pending, stripes_to_read);
-	for (i = 0; stripes_to_read && i < multi->num_stripes; i++) {
-		if (rmult->bio[i] && !is_parity_stripe(raid_map[i])) {
+	atomic_set(&rmult->multi->stripes_pending, stripes_to_read);
+	for (i = 0; stripes_to_read && i < rmult->multi->num_stripes; i++) {
+		if (rmult->bio[i] && !is_parity_stripe(rmult->raid_map[i])) {
 			rmult->bio[i]->bi_end_io = raid_read_end_io;
-			btrfs_bio_wq_end_io(root->fs_info, rmult->bio[i], 1);
+			btrfs_bio_wq_end_io(rmult->root->fs_info, rmult->bio[i], 1);
 			submit_bio(READ, rmult->bio[i]);
 			stripes_to_read--;
 		}
@@ -3933,6 +3955,139 @@ static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
 	return 0;
 }

+static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+			       struct btrfs_multi_bio *multi, u64 *raid_map,
+			       u64 stripe_len)
+{
+	struct btrfs_raid_multi_bio *rmult;
+	int i, j, k;
+	int nr_data = 0;
+	int p_stripe = -1, q_stripe = -1, orig_stripe = -1;
+	void *pointers[multi->num_stripes];
+	u64 logical = (u64)bio->bi_sector << 9;
+
+	rmult = kzalloc(sizeof(*rmult) + multi->num_stripes * sizeof(void *),
+			GFP_NOFS);
+	if (!rmult) {
+		kfree(raid_map);
+		kfree(multi);
+		return -ENOMEM;
+	}
+	rmult->multi = multi;
+	rmult->raid_map = raid_map;
+	rmult->root = root;
+
+	for (i = 0; i < multi->num_stripes; i++) {
+		if (raid_map[i] == RAID5_P_STRIPE)
+			p_stripe = i;
+		else if (raid_map[i] == RAID6_Q_STRIPE)
+			q_stripe = i;
+		else
+			nr_data++;
+	}
+
+	if (bio->bi_size != stripe_len * nr_data) {
+		printk("partial\n");
+		return raid56_parity_write_partial(rmult, bio, stripe_len);
+	}
+
+	/* Yay, a full-stripe write! */
+
+	for (i = 0; i < multi->num_stripes; i++) {
+		if (raid_map[i] == logical) {
+			orig_stripe = i;
+			continue;
+		}
+		rmult->bio[i] = alloc_raid_stripe_bio(root, &multi->stripes[i],
+				is_parity_stripe(raid_map[i])?stripe_len:0);
+		BUG_ON(!rmult->bio[i]); /* FIXME */
+	}
+
+	for (i = 0; i < stripe_len >> PAGE_SHIFT; i++) {
+		for (j = 0; j < nr_data; j++) {
+			int pagenr = j * (stripe_len >> PAGE_SHIFT) + i;
+			pointers[j] = kmap(bio->bi_io_vec[pagenr].bv_page);
+		}
+		pointers[j++] = kmap(rmult->bio[p_stripe]->bi_io_vec[i].bv_page);
+		if (q_stripe != -1) {
+			pointers[j++] = kmap(rmult->bio[q_stripe]->bi_io_vec[i].bv_page);
+
+			raid6_call.gen_syndrome(multi->num_stripes, PAGE_SIZE,
+						pointers);
+			printk("D %lx P(%d) %lx Q(%d) %lx\n",
+			       *(unsigned long *)pointers[0],
+			       p_stripe,
+			       *(unsigned long *)pointers[p_stripe],
+			       q_stripe,
+			       *(unsigned long *)pointers[q_stripe]);
+			kunmap(rmult->bio[q_stripe]->bi_io_vec[i].bv_page);
+		} else {
+			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+			for (k = 1; k < nr_data; k++) {
+				for (j = 0; j < PAGE_SIZE; j += sizeof(unsigned long)) {
+					*(unsigned long *)(pointers[nr_data] + j) ^=
+						*(unsigned long *)(pointers[k] + j);
+				}
+			}
+		}
+		for (j = 0; j < nr_data; j++) {
+			int pagenr = j * (stripe_len >> PAGE_SHIFT) + i;
+			kunmap(bio->bi_io_vec[pagenr].bv_page);
+		}
+		kunmap(rmult->bio[p_stripe]->bi_io_vec[i].bv_page);
+	}
+
+	atomic_set(&multi->stripes_pending, multi->num_stripes);
+	multi->max_errors = 0;
+	printk("RAID full write, multi %p\n", multi);
+
+	/* Split original bio into chunks for different disks */
+	for (i = 0; i < multi->num_stripes; i++) {
+		struct bio *this_bio = rmult->bio[i];
+		if (!this_bio) {
+			/* Leave the original bio till last */
+			continue;
+		}
+
+		if (!is_parity_stripe(raid_map[i]) && raid_map[i] != logical) {
+			for (j = 0; j < stripe_len >> PAGE_SHIFT; j++) {
+				int pagenr = ((raid_map[i] - logical) >> PAGE_SHIFT) + j;
+				struct page *pg;
+				printk("Steal page %d for bio %d (%p), vec %p\n", pagenr, i, this_bio, bio->bi_io_vec);
+				pg = bio->bi_io_vec[pagenr].bv_page;
+				printk("Stolen page is %p\n", pg);
+				get_page(pg);
+				bio_add_page(this_bio, pg, PAGE_SIZE, 0);
+			}
+		}
+
+		this_bio->bi_private = rmult;
+		this_bio->bi_end_io = raid_write_end_io;
+		this_bio->bi_sector = multi->stripes[i].physical >> 9;
+		this_bio->bi_bdev = multi->stripes[i].dev->bdev;
+		printk("Submit %s bio #%d %p to %x:%llx\n",
+		       (i == p_stripe)?"P":((i==q_stripe)?"Q":"D"),
+		       i, this_bio,
+		       this_bio->bi_bdev->bd_dev,
+		       (u64)this_bio->bi_sector << 9);
+		schedule_bio(root, multi->stripes[i].dev, WRITE, this_bio);
+	}
+
+	/* Write the original bio last to prevent various races */
+	BUG_ON(orig_stripe == -1);
+	bio->bi_private = rmult;
+	bio->bi_end_io = raid_write_end_io;
+	bio->bi_sector = multi->stripes[orig_stripe].physical >> 9;
+	bio->bi_bdev = multi->stripes[orig_stripe].dev->bdev;
+	printk("Submit original D bio #%d %p to %x:%llx\n",
+	       orig_stripe, bio,
+	       bio->bi_bdev->bd_dev,
+	       (u64)bio->bi_sector << 9);
+	schedule_bio(root, multi->stripes[orig_stripe].dev, WRITE, bio);
+
+	return 0;
+}
+
 static void raid_recover_end_io(struct bio *bio, int err)
 {
 	struct btrfs_raid_multi_bio *rmult = bio->bi_private;
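
The full-stripe path in raid56_parity_write() above open-codes the RAID5 P
calculation: P is the word-at-a-time XOR of every data stripe, seeded with a
memcpy() of the first one (the RAID6 P/Q pair is handed to
raid6_call.gen_syndrome() instead). Here is a minimal standalone userspace
sketch of that XOR loop, not part of the patch; BLOCK_SIZE and NR_DATA are
made-up stand-ins for the kernel's PAGE_SIZE and nr_data_stripes(map):

#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 4096		/* stands in for PAGE_SIZE */
#define NR_DATA    3		/* stands in for nr_data_stripes(map) */

/* P = D0 ^ D1 ^ ... ^ Dn-1, XORed a word at a time as in the patch */
static void gen_p(unsigned char *parity,
		  unsigned char data[NR_DATA][BLOCK_SIZE])
{
	unsigned int j;
	int k;

	memcpy(parity, data[0], BLOCK_SIZE);
	for (k = 1; k < NR_DATA; k++)
		for (j = 0; j < BLOCK_SIZE; j += sizeof(unsigned long))
			*(unsigned long *)(parity + j) ^=
				*(unsigned long *)(data[k] + j);
}

int main(void)
{
	static unsigned char data[NR_DATA][BLOCK_SIZE], parity[BLOCK_SIZE];

	memset(data[0], 0xaa, BLOCK_SIZE);
	memset(data[1], 0x55, BLOCK_SIZE);
	memset(data[2], 0x0f, BLOCK_SIZE);

	gen_p(parity, data);
	printf("parity byte: %02x\n", parity[0]);	/* 0xaa^0x55^0x0f = 0xf0 */
	return 0;
}

Because XOR is its own inverse, XORing P with the surviving data blocks
regenerates a missing one; that identity is what the raid56_parity_recover()
read path relies on.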