Updating RAID[56] support
diff mbox

Message ID 1272564366.3367.4143.camel@macbook.infradead.org
State New, archived
Headers show

Commit Message

David Woodhouse April 29, 2010, 6:06 p.m. UTC
None

Patch
diff mbox

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 32eabf1..70dc314 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2226,7 +2226,7 @@  static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int looped = 0;
 	int ret;
 	int index;
-	int stripe_len = 64 * 1024;
+	int stripe_len = 4 * 1024;
 
 	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
 	    (type & BTRFS_BLOCK_GROUP_DUP)) {
@@ -2735,6 +2735,7 @@  static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	u64 offset;
 	u64 stripe_offset;
 	u64 stripe_nr;
+	u64 stripe_len;
 	u64 *raid_map = NULL;
 	int stripes_allocated = 8;
 	int stripes_required = 1;
@@ -2816,13 +2817,24 @@  again:
 		goto again;
 	}
 	stripe_nr = offset;
+
+	stripe_len = map->stripe_len;
+	if (!multi_ret && !unplug_page && (rw & (1 << BIO_RW)) &&
+	    map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+		/*
+		 * For the merge_bio_hook() we allow _writes_ (but not reads)
+		 * to cover a full stripe-set. 
+		 */
+		stripe_len *= nr_data_stripes(map);
+		printk("Stripe_len becomes %llx\n", stripe_len);
+	}
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	do_div(stripe_nr, map->stripe_len);
+	do_div(stripe_nr, stripe_len);
 
-	stripe_offset = stripe_nr * map->stripe_len;
+	stripe_offset = stripe_nr * stripe_len;
 	BUG_ON(offset < stripe_offset);
 
 	/* stripe_offset is the offset of this block in its stripe*/
@@ -2833,8 +2845,21 @@  again:
 			 BTRFS_BLOCK_GROUP_RAID10 |
 			 BTRFS_BLOCK_GROUP_DUP)) {
 		/* we limit the length of each bio to what fits in a stripe */
-		*length = min_t(u64, em->len - offset,
-			      map->stripe_len - stripe_offset);
+		/* For writes to RAID[56], allow a full stripe, not just a single
+		   disk's worth */
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+		    !stripe_offset && multi_ret && raid_map_ret && (rw & (1 << BIO_RW))) {
+			*length = min_t(u64, em->len - offset,
+					stripe_len * nr_data_stripes(map));
+			printk("len becomes %Lx for RAID[56] write (min(%Lx,%Lx))\n", *length, 
+			       em->len - offset, stripe_len * nr_data_stripes(map));
+		} else {
+			*length = min_t(u64, em->len - offset,
+					stripe_len - stripe_offset);
+			printk("len becomes %Lx (min(%Lx,%Lx))\n", *length, 
+			       em->len - offset, stripe_len - stripe_offset);
+		}
+					
 	} else {
 		*length = em->len - offset;
 	}
@@ -3173,6 +3198,7 @@  int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	int ret;
 	int dev_nr = 0;
 	int total_devs = 1;
+	printk("%s %d %d %llx %x\n", __func__, rw, mirror_num, logical, bio->bi_size);
 
 	length = bio->bi_size;
 	map_tree = &root->fs_info->mapping_tree;
@@ -3187,6 +3213,13 @@  int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	multi->orig_bio = first_bio;
 	atomic_set(&multi->stripes_pending, multi->num_stripes);
 
+	if (map_length < length) {
+		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
+		       "len %llu\n", (unsigned long long)logical,
+		       (unsigned long long)length,
+		       (unsigned long long)map_length);
+	}
+
 	if (raid_map) {
 		if (rw == READ)
 			return raid56_parity_recover(root, bio, multi,
@@ -3629,6 +3662,7 @@  struct btrfs_raid_multi_bio {
 	struct btrfs_root *root;
 	struct btrfs_multi_bio *multi;
 	u64 *raid_map;
+	int partial;
 	struct bio *bio[0];
 };
 
@@ -3671,7 +3705,7 @@  static void raid_write_end_io(struct bio *bio, int err)
 
 	if (!atomic_dec_and_test(&rmult->multi->stripes_pending))
 		return;
-
+	printk("Ended final write IO\n");
 	/* OK, we have read all the stripes we need to. */
 	if (atomic_read(&rmult->multi->error)) {
 		bio_endio(rmult->multi->orig_bio, -EIO);
@@ -3683,7 +3717,8 @@  static void raid_write_end_io(struct bio *bio, int err)
 	bio_endio(rmult->multi->orig_bio, 0);
 
  cleanup:
-	mutex_unlock(&raid_hack_mutex);
+	if (rmult->partial)
+		mutex_unlock(&raid_hack_mutex);
 	free_raid_multi(rmult);
 }
 
@@ -3843,28 +3878,14 @@  static struct bio *alloc_raid_stripe_bio(struct btrfs_root *root,
 	return bio;
 }
 
-static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
-			       struct btrfs_multi_bio *multi, u64 *raid_map,
-			       u64 stripe_len)
+static int raid56_parity_write_partial(struct btrfs_raid_multi_bio *rmult,
+				       struct bio *bio, u64 stripe_len)
 {
 	int i;
 	int start_ofs, end_ofs;
 	int stripes_to_read = 0;
 	u64 logical = (u64)bio->bi_sector << 9;
 
-	struct btrfs_raid_multi_bio *rmult;
-
-	rmult = kzalloc(sizeof(*rmult) + multi->num_stripes * sizeof(void *),
-			GFP_NOFS);
-	if (!rmult) {
-		kfree(raid_map);
-		kfree(multi);
-		return -ENOMEM;
-	}
-	rmult->multi = multi;
-	rmult->raid_map = raid_map;
-	rmult->root = root;
-
 	/*
 	 * FIXME: the merge_bio_hook logic currently ensures that writes only
 	 * cover one stripe, meaning we _always_ have to read the other data
@@ -3880,6 +3901,7 @@  static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
 	 * And we can ditch this mutex too:
 	 */
 	mutex_lock(&raid_hack_mutex);
+	rmult->partial = 1;
 
 	/* What subrange of the stripe are we writing? */
 	start_ofs = do_div(logical, stripe_len);
@@ -3888,19 +3910,19 @@  static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
 
 	/* Allocate bios for reading and for the parity and q-stripe writes */
 	logical = (u64)bio->bi_sector << 9;
-	for (i = 0; i < multi->num_stripes; i++) {
+	for (i = 0; i < rmult->multi->num_stripes; i++) {
 		if (start_ofs) {
-			if (!is_parity_stripe(raid_map[i]))
-				raid_map[i] += start_ofs;
-			multi->stripes[i].physical += start_ofs;
+			if (!is_parity_stripe(rmult->raid_map[i]))
+				rmult->raid_map[i] += start_ofs;
+			rmult->multi->stripes[i].physical += start_ofs;
 		}
-		if (raid_map[i] == logical) {
+		if (rmult->raid_map[i] == logical) {
 			/* Set the correct bdev for the original write bio */
-			bio->bi_bdev = multi->stripes[i].dev->bdev;
+			bio->bi_bdev = rmult->multi->stripes[i].dev->bdev;
 		} else {
-			rmult->bio[i] =
-				alloc_raid_stripe_bio(root, &multi->stripes[i],
-						      bio->bi_size);
+			rmult->bio[i] = alloc_raid_stripe_bio(rmult->root,
+							      &rmult->multi->stripes[i],
+							      bio->bi_size);
 			if (!rmult->bio[i]) {
 				free_raid_multi(rmult);
 				bio_endio(bio, -EIO);
@@ -3909,23 +3931,23 @@  static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
 			}
 			rmult->bio[i]->bi_private = rmult;
 
-			if (!is_parity_stripe(raid_map[i]))
+			if (!is_parity_stripe(rmult->raid_map[i]))
 				stripes_to_read++;
 		}
 	}
 	if (!stripes_to_read) {
 		/* Nothing to read -- just calculate parity and write it all */
-		atomic_set(&multi->stripes_pending, 1);
+		atomic_set(&rmult->multi->stripes_pending, 1);
 		bio->bi_private = rmult;
 		raid_read_end_io(bio, 0);
 		return 0;
 	}
 
-	atomic_set(&multi->stripes_pending, stripes_to_read);
-	for (i = 0; stripes_to_read && i < multi->num_stripes; i++) {
-		if (rmult->bio[i] && !is_parity_stripe(raid_map[i])) {
+	atomic_set(&rmult->multi->stripes_pending, stripes_to_read);
+	for (i = 0; stripes_to_read && i < rmult->multi->num_stripes; i++) {
+		if (rmult->bio[i] && !is_parity_stripe(rmult->raid_map[i])) {
 			rmult->bio[i]->bi_end_io = raid_read_end_io;
-			btrfs_bio_wq_end_io(root->fs_info, rmult->bio[i], 1);
+			btrfs_bio_wq_end_io(rmult->root->fs_info, rmult->bio[i], 1);
 			submit_bio(READ, rmult->bio[i]);
 			stripes_to_read--;
 		}
@@ -3933,6 +3955,139 @@  static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
 	return 0;
 }
 
+static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+			       struct btrfs_multi_bio *multi, u64 *raid_map,
+			       u64 stripe_len)
+{
+	struct btrfs_raid_multi_bio *rmult;
+	int i, j, k;
+	int nr_data = 0;
+	int p_stripe = -1, q_stripe = -1, orig_stripe = -1;
+	void *pointers[multi->num_stripes];
+	u64 logical = (u64)bio->bi_sector << 9;
+
+	rmult = kzalloc(sizeof(*rmult) + multi->num_stripes * sizeof(void *),
+			GFP_NOFS);
+	if (!rmult) {
+		kfree(raid_map);
+		kfree(multi);
+		return -ENOMEM;
+	}
+	rmult->multi = multi;
+	rmult->raid_map = raid_map;
+	rmult->root = root;
+
+	for (i = 0; i < multi->num_stripes; i++) {
+		if (raid_map[i] == RAID5_P_STRIPE)
+			p_stripe = i;
+		else if (raid_map[i] == RAID6_Q_STRIPE)
+			q_stripe = i;
+		else
+			nr_data++;
+	}
+
+	if (bio->bi_size != stripe_len * nr_data) {
+		printk("partial\n");
+		return raid56_parity_write_partial(rmult, bio, stripe_len);
+	}
+
+	/* Yay, a full-stripe write! */
+
+	for (i = 0; i < multi->num_stripes; i++) {
+		if (raid_map[i] == logical) {
+			orig_stripe = i;
+			continue;
+		}
+		rmult->bio[i] = alloc_raid_stripe_bio(root, &multi->stripes[i],
+						      is_parity_stripe(raid_map[i])?stripe_len:0);
+		BUG_ON(!rmult->bio[i]); /* FIXME */
+	}
+
+	for (i = 0; i < stripe_len >> PAGE_SHIFT; i++) {
+		for (j = 0; j < nr_data; j++) {
+			int pagenr = j * (stripe_len >> PAGE_SHIFT) + i;
+			pointers[j] = kmap(bio->bi_io_vec[pagenr].bv_page);
+		}
+		pointers[j++] = kmap(rmult->bio[p_stripe]->bi_io_vec[i].bv_page);
+		if (q_stripe != -1) {
+			pointers[j++] = kmap(rmult->bio[q_stripe]->bi_io_vec[i].bv_page);
+
+			raid6_call.gen_syndrome(multi->num_stripes, PAGE_SIZE,
+						pointers);
+			printk("D %lx P(%d) %lx Q(%d) %lx\n",
+			       *(unsigned long *)pointers[0],
+			       p_stripe,
+			       *(unsigned long *)pointers[p_stripe],
+			       q_stripe,
+			       *(unsigned long *)pointers[q_stripe]);
+			kunmap(rmult->bio[q_stripe]->bi_io_vec[i].bv_page);
+		} else {
+			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+			for (k = 1; k < nr_data; k++) {
+				for (j = 0; j < PAGE_SIZE; j += sizeof(unsigned long)) {
+					*(unsigned long *)(pointers[nr_data] + j) ^=
+						*(unsigned long *)(pointers[k] + j);
+				}
+			}
+		}
+		for (j = 0; j < nr_data; j++) {
+			int pagenr = j * (stripe_len >> PAGE_SHIFT) + i;
+			kunmap(bio->bi_io_vec[pagenr].bv_page);
+		}
+		kunmap(rmult->bio[p_stripe]->bi_io_vec[i].bv_page);
+	}
+
+	atomic_set(&multi->stripes_pending, multi->num_stripes);
+	multi->max_errors = 0;
+	printk("RAID full write, multi %p\n", multi);
+
+	/* Split original bio into chunks for different disks */
+	for (i = 0; i < multi->num_stripes; i++) {
+		struct bio *this_bio = rmult->bio[i];
+		if (!this_bio) {
+			/* Leave the original bio till last */
+			continue;
+		}
+
+		if (!is_parity_stripe(raid_map[i]) && raid_map[i] != logical) {
+			for (j = 0; j < stripe_len >> PAGE_SHIFT; j++) {
+				int pagenr = ((raid_map[i] - logical) >> PAGE_SHIFT) + j;
+				struct page *pg;
+				printk("Steal page %d for bio %d (%p), vec %p\n", pagenr, i, this_bio, bio->bi_io_vec);
+				pg = bio->bi_io_vec[pagenr].bv_page;
+				printk("Stolen page is %p\n", pg);
+				get_page(pg);
+				bio_add_page(this_bio, pg, PAGE_SIZE, 0);
+			}
+		}
+
+		this_bio->bi_private = rmult;
+		this_bio->bi_end_io = raid_write_end_io;
+		this_bio->bi_sector = multi->stripes[i].physical >> 9;
+		this_bio->bi_bdev = multi->stripes[i].dev->bdev;
+		printk("Submit %s bio #%d %p to %x:%llx\n", 
+		       (i == p_stripe)?"P":((i==q_stripe)?"Q":"D"),
+		       i, this_bio,
+		       this_bio->bi_bdev->bd_dev,
+		       (u64)this_bio->bi_sector << 9);
+		schedule_bio(root, multi->stripes[i].dev, WRITE, this_bio);
+	}
+
+	/* Write the original bio last to prevent various races */
+	BUG_ON(orig_stripe == -1);
+	bio->bi_private = rmult;
+	bio->bi_end_io = raid_write_end_io;
+	bio->bi_sector = multi->stripes[orig_stripe].physical >> 9;
+	bio->bi_bdev = multi->stripes[orig_stripe].dev->bdev;
+	printk("Submit original D bio #%d %p to %x:%llx\n", 
+	       orig_stripe, bio,
+	       bio->bi_bdev->bd_dev,
+	       (u64)bio->bi_sector << 9);
+	schedule_bio(root, multi->stripes[orig_stripe].dev, WRITE, bio);
+
+	return 0;
+}
+
 static void raid_recover_end_io(struct bio *bio, int err)
 {
 	struct btrfs_raid_multi_bio *rmult = bio->bi_private;