
DM RAID: Add support for MD RAID10 personality

Message ID 1340712231.19015.42.camel@f16 (mailing list archive)
State Deferred, archived

Commit Message

Jonathan Brassow June 26, 2012, 12:03 p.m. UTC
dm raid: add md raid10 support

Support the MD RAID10 personality through dm-raid.c

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>




Comments

NeilBrown July 4, 2012, 1:21 a.m. UTC | #1
On Tue, 26 Jun 2012 07:03:51 -0500 Jonathan Brassow <jbrassow@redhat.com>
wrote:

> dm raid: add md raid10 support
> 
> Support the MD RAID10 personality through dm-raid.c
> 
> Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
> 
> Index: linux-upstream/drivers/md/dm-raid.c
> ===================================================================
> --- linux-upstream.orig/drivers/md/dm-raid.c
> +++ linux-upstream/drivers/md/dm-raid.c
> @@ -11,6 +11,7 @@
>  #include "md.h"
>  #include "raid1.h"
>  #include "raid5.h"
> +#include "raid10.h"
>  #include "bitmap.h"
>  
>  #include <linux/device-mapper.h>
> @@ -52,7 +53,11 @@ struct raid_dev {
>  #define DMPF_MAX_RECOVERY_RATE 0x20
>  #define DMPF_MAX_WRITE_BEHIND  0x40
>  #define DMPF_STRIPE_CACHE      0x80
> -#define DMPF_REGION_SIZE       0X100
> +#define DMPF_REGION_SIZE       0x100
> +#define DMPF_RAID10_NEAR_COPIES 0x200
> +#define DMPF_RAID10_FAR_COPIES  0x400
> +#define DMPF_RAID10_FAR_OFFSET  0x800
> +
>  struct raid_set {
>  	struct dm_target *ti;
>  
> @@ -66,6 +71,15 @@ struct raid_set {
>  	struct raid_dev dev[0];
>  };
>  
> +/* near_copies in first byte */
> +/* far_copies in second byte */
> +/* far_offset in 17th bit */
> +#define ALGORITHM_RAID10(near_copies, far_copies, far_offset) \
> +	((near_copies & 0xFF) | ((far_copies & 0xFF) << 8) | ((!!far_offset) << 16))
> +#define RAID10_NC(layout) (layout & 0xFF)
> +#define RAID10_FC(layout) ((layout >> 8) & 0xFF)
> +#define RAID10_FO(layout) (layout & 0x10000)
> +
>  /* Supported raid types and properties. */
>  static struct raid_type {
>  	const char *name;		/* RAID algorithm. */
> @@ -76,6 +90,8 @@ static struct raid_type {
>  	const unsigned algorithm;	/* RAID algorithm. */
>  } raid_types[] = {
>  	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
> +	{"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, -1 /* Varies */},
> +	{"raid1e",   "RAID1E (Enhanced RAID1)",         0, 2, 10, -1 /* Varies */},
>  	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
>  	{"raid5_la", "RAID5 (left asymmetric)",		1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
>  	{"raid5_ra", "RAID5 (right asymmetric)",	1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
> @@ -339,10 +355,17 @@ static int validate_region_size(struct r
>   *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
>   *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
>   *    [region_size <sectors>]           Defines granularity of bitmap
> + *
> + * RAID10-only options:
> + *    [raid10_near_copies   <# copies>] Near copies. (Default: 2)
> + *    [raid10_far_copies    <# copies>] Far copies.  (Default: 1)
> + *    [raid10_far_offset    <0/1>]      Offset is device size(0) or stripe(1).

Can I suggest that you don't do it like this?  i.e. don't copy the
mistakes I made :-)

I don't think there is any value in supporting multiple near and far copies.
There are two dimensions of the layout:
 - number of copies.  Defaults to 2
 - location of the copies:  near, far, offset

Some day I could implement an alternate version of 'far' or 'offset' which
improves redundancy slightly.
Instead of 
   A B C D
   ...
   D A B C

it would be

   A B C D 
   ....
   B A D C

i.e. treat the devices as pairs and swap the device for the second copy.
This doesn't generalise to an odd number of devices, but increases the number
of pairs of devices that can concurrently fail without losing data.
(For 4 devices, there are 6 pairs.  With the current 'far' mode there are only
2 pairs of devices that can concurrently fail (0,2 and 1,3).  With the proposed
far mode there are 4 (0,2 0,3 1,2 1,3).)

Adding this with your current proposal would be messy.
Adding it with the two dimensions I suggest would simply involve adding
another 'location' - 'farswap' or 'far2' or something.
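
As a rough C sketch of the two mappings being compared (illustrative
only, not md code; an even number of devices is assumed for the
swapped case):

	/*
	 * Device holding the second copy of the chunk whose first copy
	 * lives on 'dev', in an array of 'ndevs' devices.
	 */
	static int second_copy_dev(int dev, int ndevs, int swapped)
	{
		if (!swapped)
			return (dev + 1) % ndevs; /* current 'far': rotate by one */
		return dev ^ 1;                   /* proposed: swap within each pair */
	}

With ndevs = 4, the rotating form loses data whenever two cyclically
adjacent devices fail, leaving only 2 survivable pairs; the swapped form
loses data only when both halves of a pair (0,1 or 2,3) fail, leaving 4.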

I note you didn't make 'dm-raid1e' a module alias.  Was that deliberate?

Thanks,
NeilBrown



>   */
>  static int parse_raid_params(struct raid_set *rs, char **argv,
>  			     unsigned num_raid_params)
>  {
> +	unsigned raid10_default = ALGORITHM_RAID10(2, 1, 0);
> +	unsigned raid10_nc = 1, raid10_fc = 1, raid10_fo = 0;
>  	unsigned i, rebuild_cnt = 0;
>  	unsigned long value, region_size = 0;
>  	sector_t sectors_per_dev = rs->ti->len;
> @@ -435,6 +458,7 @@ static int parse_raid_params(struct raid
>  				if (rebuild_cnt > rs->raid_type->parity_devs)
>  					rs->ti->error = "Too many rebuild devices specified for given RAID type";
>  				break;
> +			case 10:
>  			default:
>  				DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
>  				rs->ti->error = "Rebuild not supported for this RAID type";
> @@ -492,7 +516,7 @@ static int parse_raid_params(struct raid
>  			 */
>  			value /= 2;
>  
> -			if (rs->raid_type->level < 5) {
> +			if (rs->raid_type->level != 5) {
>  				rs->ti->error = "Inappropriate argument: stripe_cache";
>  				return -EINVAL;
>  			}
> @@ -517,6 +541,33 @@ static int parse_raid_params(struct raid
>  		} else if (!strcasecmp(key, "region_size")) {
>  			rs->print_flags |= DMPF_REGION_SIZE;
>  			region_size = value;
> +		} else if (!strcasecmp(key, "raid10_near_copies") &&
> +			   (rs->raid_type->level == 10)) {
> +			if ((value < 1) || (value > 0xFF)) {
> +				rs->ti->error = "Bad value for 'raid10_near_copies'";
> +				return -EINVAL;
> +			}
> +			rs->print_flags |= DMPF_RAID10_NEAR_COPIES;
> +			raid10_nc = value;
> +			raid10_default = 0;
> +		} else if (!strcasecmp(key, "raid10_far_copies") &&
> +			   (rs->raid_type->level == 10)) {
> +			if ((value < 1) || (value > 0xFF)) {
> +				rs->ti->error = "Bad value for 'raid10_far_copies'";
> +				return -EINVAL;
> +			}
> +			rs->print_flags |= DMPF_RAID10_FAR_COPIES;
> +			raid10_fc = value;
> +			raid10_default = 0;
> +		} else if (!strcasecmp(key, "raid10_far_offset") &&
> +			   (rs->raid_type->level == 10)) {
> +			if (value > 1) {
> +				rs->ti->error = "Bad value for 'raid10_far_offset'";
> +				return -EINVAL;
> +			}
> +			rs->print_flags |= DMPF_RAID10_FAR_OFFSET;
> +			raid10_fo = value;
> +			raid10_default = 0;
>  		} else {
>  			DMERR("Unable to parse RAID parameter: %s", key);
>  			rs->ti->error = "Unable to parse RAID parameters";
> @@ -532,9 +583,33 @@ static int parse_raid_params(struct raid
>  	else
>  		rs->ti->split_io = region_size;
>  
> -	if ((rs->raid_type->level > 1) &&
> -	    sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) {
> +	if (rs->raid_type->level == 10) {
> +		/* (Len * Stripes) / Mirrors */
> +		sectors_per_dev *= rs->md.raid_disks;
> +		if (sector_div(sectors_per_dev, (raid10_nc * raid10_fc))) {
> +			rs->ti->error = "Target length not divisible by number of data devices";
> +			return -EINVAL;
> +		}
> +		if ((raid10_nc * raid10_fc) > rs->md.raid_disks) {
> +			rs->ti->error = "Not enough devices to satisfy specification";
> +			return -EINVAL;
> +		}
> +		if (raid10_fo && (raid10_fc < 2)) {
> +			DMWARN("RAID10 parameter 'far_offset' ignored");
> +			raid10_fo = 0;
> +		}
> +
> +		if (raid10_default)
> +			rs->md.layout = raid10_default;
> +		else
> +			rs->md.layout = ALGORITHM_RAID10(raid10_nc,
> +							 raid10_fc, raid10_fo);
> +		rs->md.new_layout = rs->md.layout;
> +	} else if ((rs->raid_type->level > 1) &&
> +		   sector_div(sectors_per_dev,
> +			      (rs->md.raid_disks - rs->raid_type->parity_devs))) {
>  		rs->ti->error = "Target length not divisible by number of data devices";
> +
>  		return -EINVAL;
>  	}
>  	rs->md.dev_sectors = sectors_per_dev;
> @@ -560,6 +635,9 @@ static int raid_is_congested(struct dm_t
>  	if (rs->raid_type->level == 1)
>  		return md_raid1_congested(&rs->md, bits);
>  
> +	if (rs->raid_type->level == 10)
> +		return md_raid10_congested(&rs->md, bits);
> +
>  	return md_raid5_congested(&rs->md, bits);
>  }
>  
> @@ -878,6 +956,9 @@ static int analyse_superblocks(struct dm
>  	case 6:
>  		redundancy = rs->raid_type->parity_devs;
>  		break;
> +	case 10:
> +		redundancy = RAID10_NC(mddev->layout) *	RAID10_FC(mddev->layout);
> +		break;
>  	default:
>  		ti->error = "Unknown RAID type";
>  		return -EINVAL;
> @@ -1197,6 +1278,18 @@ static int raid_status(struct dm_target
>  			DMEMIT(" region_size %lu",
>  			       rs->md.bitmap_info.chunksize >> 9);
>  
> +		if (rs->print_flags & DMPF_RAID10_NEAR_COPIES)
> +			DMEMIT(" raid10_near_copies %u",
> +			       RAID10_NC(rs->md.layout));
> +
> +		if (rs->print_flags & DMPF_RAID10_FAR_COPIES)
> +			DMEMIT(" raid10_far_copies %u",
> +			       RAID10_FC(rs->md.layout));
> +
> +		if (rs->print_flags & DMPF_RAID10_FAR_OFFSET)
> +			DMEMIT(" raid10_far_offset %u",
> +			       RAID10_FO(rs->md.layout));
> +
>  		DMEMIT(" %d", rs->md.raid_disks);
>  		for (i = 0; i < rs->md.raid_disks; i++) {
>  			if (rs->dev[i].meta_dev)
> @@ -1271,7 +1364,7 @@ static void raid_resume(struct dm_target
>  
>  static struct target_type raid_target = {
>  	.name = "raid",
> -	.version = {1, 2, 0},
> +	.version = {1, 3, 0},
>  	.module = THIS_MODULE,
>  	.ctr = raid_ctr,
>  	.dtr = raid_dtr,
> @@ -1298,6 +1391,8 @@ module_init(dm_raid_init);
>  module_exit(dm_raid_exit);
>  
>  MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
> +MODULE_ALIAS("dm-raid1");
> +MODULE_ALIAS("dm-raid10");
>  MODULE_ALIAS("dm-raid4");
>  MODULE_ALIAS("dm-raid5");
>  MODULE_ALIAS("dm-raid6");
> Index: linux-upstream/Documentation/device-mapper/dm-raid.txt
> ===================================================================
> --- linux-upstream.orig/Documentation/device-mapper/dm-raid.txt
> +++ linux-upstream/Documentation/device-mapper/dm-raid.txt
> @@ -27,6 +27,11 @@ The target is named "raid" and it accept
>  		- rotating parity N (right-to-left) with data restart
>    raid6_nc	RAID6 N continue
>  		- rotating parity N (right-to-left) with data continuation
> +  raid10/raid1e Various RAID10 inspired algorithms chosen by additional params
> +  		- RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
> +		- RAID1E: Integrated Adjacent Stripe Mirroring
> +		- RAID1E: Integrated Offset Stripe Mirroring
> +		-  and other similar RAID10 variants
>  
>    Reference: Chapter 4 of
>    http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf
> @@ -59,6 +64,80 @@ The target is named "raid" and it accept
>  		logical size of the array.  The bitmap records the device
>  		synchronisation state for each region.
>  
> +        [raid10_near_copies   <# copies>]
> +        [raid10_far_copies    <# copies>]
> +        [raid10_far_offset    <0/1>]
> +		These three options are used to alter the default layout of
> +		a RAID10/RAID1E configuration.  The total number of copies is
> +		given by the number of "near" (aka "adjacent") copies times
> +		the number of "far" (aka "offset") copies.  Near copies
> +		are what most people think of with respect to mirroring.
> +		If 'raid10_near_copies 2', 'raid10_far_copies 1' and
> +		'raid10_far_offset 0', then the layouts for 2, 3 and 4 devices
> +		are:
> +		2 drives         3 drives          4 drives
> +		--------         ----------        --------------
> +		A1  A1           A1  A1  A2        A1  A1  A2  A2
> +		A2  A2           A2  A3  A3        A3  A3  A4  A4
> +		A3  A3           A4  A4  A5        A5  A5  A6  A6
> +		A4  A4           A5  A6  A6        A7  A7  A8  A8
> +		..  ..           ..  ..  ..        ..  ..  ..  ..
> +		The 2-device layout is equivalent to 2-way RAID1.  The 4-device
> +		layout is what a traditional RAID10 would look like.  The
> +		3-device layout is what might be called a 'RAID1E - Integrated
> +		Adjacent Stripe Mirroring'.
> +
> +		The 'raid10_far_[copies|offset]' arguments work together to
> +		determine where any "far"/"offset" copies will be placed.
> +		If 'raid10_near_copies 1', 'raid10_far_copies 2' and
> +		'raid10_far_offset 0', then the layouts for 2, 3 and 4 devices
> +		are:
> +		2 drives             3 drives             4 drives
> +		--------             --------------       --------------------
> +		A1  A2               A1   A2   A3         A1   A2   A3   A4
> +		A3  A4               A4   A5   A6         A5   A6   A7   A8
> +		A5  A6               A7   A8   A9         A9   A10  A11  A12
> +		..  ..               ..   ..   ..         ..   ..   ..   ..
> +		A2  A1               A3   A1   A2         A4   A1   A2   A3
> +		A4  A3               A6   A4   A5         A8   A5   A6   A7
> +		A6  A5               A9   A7   A8         A12  A9   A10  A11
> +		..  ..               ..   ..   ..         ..   ..   ..   ..
> +
> +		If 'raid10_near_copies 1', 'raid10_far_copies 2' and
> +		'raid10_far_offset 1', then the layouts for 2, 3 and 4 devices
> +		are:
> +		2 drives       3 drives           4 drives
> +		--------       ------------       -----------------
> +		A1  A2         A1  A2  A3         A1  A2  A3  A4
> +		A2  A1         A3  A1  A2         A4  A1  A2  A3
> +		A3  A4         A4  A5  A6         A5  A6  A7  A8
> +		A4  A3         A6  A4  A5         A8  A5  A6  A7
> +		A5  A6         A7  A8  A9         A9  A10 A11 A12
> +		A6  A5         A9  A7  A8         A12 A9  A10 A11
> +		..  ..         ..  ..  ..         ..  ..  ..  ..
> +		Here we see layouts closely akin to 'RAID1E - Integrated
> +		Offset Stripe Mirroring'.
> +
> +		Near and far copies can both be specified giving more
> +		complex arrangements.  If 'raid10_near_copies 2',
> +		'raid10_far_copies 2' and 'raid10_far_offset 0', then the
> +		layouts for 4 and 5 devices are:
> +		4 drives              5 drives
> +		--------              --------
> +		A1  A1  A2  A2        A1  A1  A2  A2  A3
> +		A3  A3  A4  A4        A3  A4  A4  A5  A5
> +		A5  A5  A6  A6        A6  A6  A7  A7  A8
> +		A7  A7  A8  A8        A8  A9  A9  A10 A10
> +		..  ..  ..  ..        ..  ..  ..  ..  ..
> +		A2  A2  A1  A1        A2  A3  A1  A1  A2
> +		A4  A4  A3  A3        A5  A5  A3  A4  A4
> +		A6  A6  A5  A5        A7  A8  A6  A6  A7
> +		A8  A8  A7  A7        A10 A10 A8  A9  A9
> +		..  ..  ..  ..        ..  ..  ..  ..  ..
> +		Thanks to Wikipedia's 'Non-standard RAID levels' article for
> +		the layout figures:
> +		http://en.wikipedia.org/wiki/Non-standard_RAID_levels
> +
>  <#raid_devs>: The number of devices composing the array.
>  	Each device consists of two entries.  The first is the device
>  	containing the metadata (if any); the second is the one containing the
>
Jonathan Brassow July 4, 2012, 3:20 a.m. UTC | #2
On Jul 3, 2012, at 8:21 PM, NeilBrown wrote:

> On Tue, 26 Jun 2012 07:03:51 -0500 Jonathan Brassow <jbrassow@redhat.com>
> wrote:
> 
>> dm raid: add md raid10 support
>> 
>> Support the MD RAID10 personality through dm-raid.c
>> 
>> Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
>> 
>> Index: linux-upstream/drivers/md/dm-raid.c
>> ===================================================================
>> --- linux-upstream.orig/drivers/md/dm-raid.c
>> +++ linux-upstream/drivers/md/dm-raid.c
>> @@ -11,6 +11,7 @@
>> #include "md.h"
>> #include "raid1.h"
>> #include "raid5.h"
>> +#include "raid10.h"
>> #include "bitmap.h"
>> 
>> #include <linux/device-mapper.h>
>> @@ -52,7 +53,11 @@ struct raid_dev {
>> #define DMPF_MAX_RECOVERY_RATE 0x20
>> #define DMPF_MAX_WRITE_BEHIND  0x40
>> #define DMPF_STRIPE_CACHE      0x80
>> -#define DMPF_REGION_SIZE       0X100
>> +#define DMPF_REGION_SIZE       0x100
>> +#define DMPF_RAID10_NEAR_COPIES 0x200
>> +#define DMPF_RAID10_FAR_COPIES  0x400
>> +#define DMPF_RAID10_FAR_OFFSET  0x800
>> +
>> struct raid_set {
>> 	struct dm_target *ti;
>> 
>> @@ -66,6 +71,15 @@ struct raid_set {
>> 	struct raid_dev dev[0];
>> };
>> 
>> +/* near_copies in first byte */
>> +/* far_copies in second byte */
>> +/* far_offset in 17th bit */
>> +#define ALGORITHM_RAID10(near_copies, far_copies, far_offset) \
>> +	((near_copies & 0xFF) | ((far_copies & 0xFF) << 8) | ((!!far_offset) << 16))
>> +#define RAID10_NC(layout) (layout & 0xFF)
>> +#define RAID10_FC(layout) ((layout >> 8) & 0xFF)
>> +#define RAID10_FO(layout) (layout & 0x10000)
>> +
>> /* Supported raid types and properties. */
>> static struct raid_type {
>> 	const char *name;		/* RAID algorithm. */
>> @@ -76,6 +90,8 @@ static struct raid_type {
>> 	const unsigned algorithm;	/* RAID algorithm. */
>> } raid_types[] = {
>> 	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
>> +	{"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, -1 /* Varies */},
>> +	{"raid1e",   "RAID1E (Enhanced RAID1)",         0, 2, 10, -1 /* Varies */},
>> 	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
>> 	{"raid5_la", "RAID5 (left asymmetric)",		1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
>> 	{"raid5_ra", "RAID5 (right asymmetric)",	1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
>> @@ -339,10 +355,17 @@ static int validate_region_size(struct r
>>  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
>>  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
>>  *    [region_size <sectors>]           Defines granularity of bitmap
>> + *
>> + * RAID10-only options:
>> + *    [raid10_near_copies   <# copies>] Near copies. (Default: 2)
>> + *    [raid10_far_copies    <# copies>] Far copies.  (Default: 1)
>> + *    [raid10_far_offset    <0/1>]      Offset is device size(0) or stripe(1).
> 
> Can I suggest that you don't do it like this?  i.e. don't copy the
> mistakes I made :-)
> 
> I don't think there is any value in supporting multiple near and far copies.
> There are two dimensions of the layout:
> - number of copies.  Defaults to 2
> - location of the copies:  near, far, offset
> 
> Some day I could implement an alternate version of 'far' or 'offset' which
> improves redundancy slightly.
> Instead of 
>   A B C D
>   ...
>   D A B C
> 
> it would be
> 
>   A B C D 
>   ....
>   B A D C
> 
> i.e. treat the devices as pairs and swap the device for the second copy.
> This doesn't generalise to an odd number of devices, but increases the number
> of pairs of devices that can concurrently fail without losing data.
> (For 4 devices, there are 6 pairs.  With the current 'far' mode there are only
> 2 pairs of devices that can concurrently fail (0,2 and 1,3).  With the proposed
> far mode there are 4 (0,2 0,3 1,2 1,3).)
> 
> Adding this with your current proposal would be messy.
> Adding it with the two dimensions I suggest would simply involve adding
> another 'location' - 'farswap' or 'far2' or something.
> 
> I note you didn't make 'dm-raid1e' a module alias.  Was that deliberate?

The missed module alias was an oversight, thanks for catching that.  (Similar - as you can see from this patch - to the oversight I had when introducing raid1. :)  I have gone back and forth on whether to include the alternate "raid1e" or not.  There are different RAID1E algorithms, as described in the kernel doc.  Perhaps there should also be aliases similar to raid5 and raid6 - like raid1e_as (i.e. RAID1E - Adjacent Stripe), etc.  However, there are several combinations that don't have a real name.  Any thoughts?  I would be equally fine leaving "raid1e" out or leaving it in.  These days, RAID10 is really a subset of RAID1E - perhaps I should be considering taking "raid10" out.  ;)

I like your suggestion of changing the parameter names.  I've found the original names somewhat confusing.  ('far_offset' seems to imply to me that the copy would not be the very next stripe, but _offset_ somehow - it seems to have the reverse meaning to me.  I think this comes from the fact that it acts as a modifier to 'far_copy'.)  I toyed with a couple different ways of doing this but figured it was best to just go along.  Anyway, what you are suggesting seems to be:
	raid10_copies <number> (Default: 2)
	raid10_layout <string> (Default: "near"/"adjacent")
Where <string> could be "near", "far", "offset" and "some-future-thing".  That seems nice to me and seems to clear up some of the confusion caused by "far_offset" seeming to be a modifier to "far_copies".
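
For concreteness, a minimal sketch of how those two parameters could be
folded into the layout word the patch already defines (helper name
hypothetical; "offset" maps onto far copies with the far_offset bit set):

	/* Translate the proposed (copies, layout) pair into the MD RAID10
	 * layout value, reusing ALGORITHM_RAID10 from the patch above. */
	static unsigned raid10_layout_word(unsigned copies, const char *layout)
	{
		if (!strcasecmp(layout, "near"))
			return ALGORITHM_RAID10(copies, 1, 0);
		if (!strcasecmp(layout, "far"))
			return ALGORITHM_RAID10(1, copies, 0);
		if (!strcasecmp(layout, "offset"))
			return ALGORITHM_RAID10(1, copies, 1);
		return 0;	/* unknown layout string; caller rejects */
	}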

Let me know what you think about the above, and I'll get v2 ready.

 brassow

P.S.  While it doesn't affect this singular patch, I see people posting their set of patches as a reply to the summary '0 of' patch.  This keeps things together better, and I'll assume this is the way I should post from now on.
NeilBrown July 4, 2012, 5:15 a.m. UTC | #3
On Tue, 3 Jul 2012 22:20:29 -0500 Brassow Jonathan <jbrassow@redhat.com>
wrote:

> 
> On Jul 3, 2012, at 8:21 PM, NeilBrown wrote:
> 
> > On Tue, 26 Jun 2012 07:03:51 -0500 Jonathan Brassow <jbrassow@redhat.com>
> > wrote:
> > 
> >> dm raid: add md raid10 support
> >> 
> >> Support the MD RAID10 personality through dm-raid.c
> >> 
> >> Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
> >> 
> >> Index: linux-upstream/drivers/md/dm-raid.c
> >> ===================================================================
> >> --- linux-upstream.orig/drivers/md/dm-raid.c
> >> +++ linux-upstream/drivers/md/dm-raid.c
> >> @@ -11,6 +11,7 @@
> >> #include "md.h"
> >> #include "raid1.h"
> >> #include "raid5.h"
> >> +#include "raid10.h"
> >> #include "bitmap.h"
> >> 
> >> #include <linux/device-mapper.h>
> >> @@ -52,7 +53,11 @@ struct raid_dev {
> >> #define DMPF_MAX_RECOVERY_RATE 0x20
> >> #define DMPF_MAX_WRITE_BEHIND  0x40
> >> #define DMPF_STRIPE_CACHE      0x80
> >> -#define DMPF_REGION_SIZE       0X100
> >> +#define DMPF_REGION_SIZE       0x100
> >> +#define DMPF_RAID10_NEAR_COPIES 0x200
> >> +#define DMPF_RAID10_FAR_COPIES  0x400
> >> +#define DMPF_RAID10_FAR_OFFSET  0x800
> >> +
> >> struct raid_set {
> >> 	struct dm_target *ti;
> >> 
> >> @@ -66,6 +71,15 @@ struct raid_set {
> >> 	struct raid_dev dev[0];
> >> };
> >> 
> >> +/* near_copies in first byte */
> >> +/* far_copies in second byte */
> >> +/* far_offset in 17th bit */
> >> +#define ALGORITHM_RAID10(near_copies, far_copies, far_offset) \
> >> +	((near_copies & 0xFF) | ((far_copies & 0xFF) << 8) | ((!!far_offset) << 16))
> >> +#define RAID10_NC(layout) (layout & 0xFF)
> >> +#define RAID10_FC(layout) ((layout >> 8) & 0xFF)
> >> +#define RAID10_FO(layout) (layout & 0x10000)
> >> +
> >> /* Supported raid types and properties. */
> >> static struct raid_type {
> >> 	const char *name;		/* RAID algorithm. */
> >> @@ -76,6 +90,8 @@ static struct raid_type {
> >> 	const unsigned algorithm;	/* RAID algorithm. */
> >> } raid_types[] = {
> >> 	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
> >> +	{"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, -1 /* Varies */},
> >> +	{"raid1e",   "RAID1E (Enhanced RAID1)",         0, 2, 10, -1 /* Varies */},
> >> 	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
> >> 	{"raid5_la", "RAID5 (left asymmetric)",		1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
> >> 	{"raid5_ra", "RAID5 (right asymmetric)",	1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
> >> @@ -339,10 +355,17 @@ static int validate_region_size(struct r
> >>  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
> >>  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
> >>  *    [region_size <sectors>]           Defines granularity of bitmap
> >> + *
> >> + * RAID10-only options:
> >> + *    [raid10_near_copies   <# copies>] Near copies. (Default: 2)
> >> + *    [raid10_far_copies    <# copies>] Far copies.  (Default: 1)
> >> + *    [raid10_far_offset    <0/1>]      Offset is device size(0) or stripe(1).
> > 
> > Can I suggest that you don't do it like this?  i.e. don't copy the
> > mistakes I made :-)
> > 
> > I don't think there is any value in supporting multiple near and far copies.
> > There are two dimensions of the layout:
> > - number of copies.  Defaults to 2
> > - location of the copies:  near, far, offset
> > 
> > Some day I could implement an alternate version of 'far' or 'offset' which
> > improves redundancy slightly.
> > Instead of 
> >   A B C D
> >   ...
> >   D A B C
> > 
> > it would be
> > 
> >   A B C D 
> >   ....
> >   B A D C
> > 
> > i.e. treat the devices as pairs and swap the device for the second copy.
> > This doesn't generalise to an odd number of devices, but increases the number
> > of pairs of devices that can concurrently fail without losing data.
> > (For 4 devices, there are 6 pairs.  With the current 'far' mode there are only
> > 2 pairs of devices that can concurrently fail (0,2 and 1,3).  With the proposed
> > far mode there are 4 (0,2 0,3 1,2 1,3).)
> > 
> > Adding this with your current proposal would be messy.
> > Adding it with the two dimensions I suggest would simply involve adding
> > another 'location' - 'farswap' or 'far2' or something.
> > 
> > I note you didn't make 'dm-raid1e' a module alias.  Was that deliberate?
> 
> The missed module alias was an oversight, thanks for catching that.  (Similar - as you can see from this patch - to the oversight I had when introducing raid1. :)  I have gone back and forth on whether to include the alternate "raid1e" or not.  There are different RAID1E algorithms, as described in the kernel doc.  Perhaps there should also be aliases similar to raid5 and raid6 - like raid1e_as (i.e. RAID1E - Adjacent Stripe), etc.  However, there are several combinations that don't have a real name.  Any thoughts?  I would be equally fine leaving "raid1e" out or leaving it in.  These days, RAID10 is really a subset of RAID1E - perhaps I should be considering taking "raid10" out.  ;)

I don't really have much of an opinion here - as long as the scheme chosen is
coherent and extensible I'm happy.

I probably wouldn't have chosen to attach the layout to the "raid5" (e.g.
raid5_la), but I don't object.  The info has to go somewhere and it will
always be a smallish set of choices.
I probably would suggest not supporting both "raid10" and "raid1e" as that
could lead to confusion.  I can see good reasons for choosing either though.

> 
> I like your suggestion of changing the parameter names.  I've found the original names somewhat confusing.  ('far_offset' seems to imply to me that the copy would not be the very next stripe, but _offset_ somehow - it seems to have the reverse meaning to me.  I think this comes from the fact that it acts as a modifier to 'far_copy'.)  I toyed with a couple different ways of doing this but figured it was best to just go along.  Anyway, what you are suggesting seems to be:
> 	raid10_copies <number> (Default: 2)
> 	raid10_layout <string> (Default: "near"/"adjacent")
> Where <string> could be "near", "far", "offset" and "some-future-thing".  That seems nice to me and seems to clear up some of the confusion caused by "far_offset" seeming to be a modifier to "far_copies".

Yes, that is what I'm suggesting.

> 
> Let me know what you think about the above, and I'll get v2 ready.
> 
>  brassow
> 
> P.S.  While it doesn't affect this singular patch, I see people posting their set of patches as a reply to the summary '0 of' patch.  This keeps things together better, and I'll assume this is the way I should post from now on.

There is probably some tool that does that.  It might help a little bit so if
you find it easy, then do it.  But if it is at all a burden, don't bother.

Thanks,
NeilBrown
Jan Ceuleers July 4, 2012, 3:27 p.m. UTC | #4
On 07/04/2012 07:15 AM, NeilBrown wrote:
>> P.S.  While it doesn't affect this singular patch, I see people posting their set of patches as a reply to the summary '0 of' patch.  This keeps things together better, and I'll assume this is the way I should post from now on.
> 
> There is probably some tool that does that.  It might help a little bit so if
> you find it easy, then do it.  But if it is at all a burden, don't bother.

The tool in question is git.  Specifically: the --thread option to git
send-email.

(Other VCSs are available)
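
For example, with the cover letter serving as the '0 of N' summary
(output directory and revision range here are placeholders):

	git format-patch --cover-letter -o outgoing/ origin/master..HEAD
	git send-email --thread outgoing/*.patch

With --thread, In-Reply-To/References headers are added so the patches
stay grouped under the first mail.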

Jan

Jonathan Brassow July 10, 2012, 7:27 p.m. UTC | #5
On Jul 4, 2012, at 12:15 AM, NeilBrown wrote:

>> 
>> I like your suggestion of changing the parameter names.  I've found the original names somewhat confusing.  ('far_offset' seems to imply to me that the copy would not be the very next stripe, but _offset_ somehow - it seems to have the reverse meaning to me.  I think this comes from the fact that it acts as a modifier to 'far_copy'.)  I toyed with a couple different ways of doing this but figured it was best to just go along.  Anyway, what you are suggesting seems to be:
>> 	raid10_copies <number> (Default: 2)
>> 	raid10_layout <string> (Default: "near"/"adjacent")
>> Where <string> could be "near", "far", "offset" and "some-future-thing".  That seems nice to me and seems to clear up some of the confusion caused by "far_offset" seeming to be a modifier to "far_copies".
> 
> Yes, that is what I'm suggesting.

One problem with this approach is that users can no longer mix and match.  They can't have 2 near copies and 2 far copies, for example.  Perhaps someone might choose this layout for read balancing performance...

The original method didn't allow for two different simultaneous "far" algorithms (because it would add no redundancy unless they were shifted from each other as well as the original), but this new way of specifying makes it worse.

Do you see this as a problem?  If so, we need to find a way to specify the number of copies for each layout /and/ include the potential for "double-shift" vs "single-shift" or some further variant.  One idea I had before was:
	raid10_near_copies <#>
	raid10_far_copies <#>
	raid10_stripe_copies <#>  (similar to far+offset, but still allows for simultaneous "far" w/o "offset")
To allow for the different variants of "shifting", we could have different raid10 variants, like "raid10_2s" or "raid1e_2s" - similar to the extensions on RAID5 ("raid5_ls") that you don't like.  :)
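
Purely as a sketch of the encoding side (hypothetical; never part of any
posted patch), the three counts could each take a byte of the layout
word, replacing the single far_offset bit, with the total number of
copies being the product of the three:

	/* Hypothetical three-way layout encoding: near, far and stripe
	 * copies in the low three bytes. */
	#define ALGORITHM_RAID10_3(near_copies, far_copies, stripe_copies) \
		(((near_copies) & 0xFF) | (((far_copies) & 0xFF) << 8) | \
		 (((stripe_copies) & 0xFF) << 16))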

 brassow

NeilBrown July 11, 2012, 12:08 a.m. UTC | #6
On Tue, 10 Jul 2012 14:27:30 -0500 Brassow Jonathan <jbrassow@redhat.com>
wrote:

> 
> On Jul 4, 2012, at 12:15 AM, NeilBrown wrote:
> 
> >> 
> >> I like your suggestion of changing the parameter names.  I've found the original names somewhat confusing.  ('far_offset' seems to imply to me that the copy would not be the very next stripe, but _offset_ somehow - it seems to have the reverse meaning to me.  I think this comes from the fact that it acts as a modifier to 'far_copy'.)  I toyed with a couple different ways of doing this but figured it was best to just go along.  Anyway, what you are suggesting seems to be:
> >> 	raid10_copies <number> (Default: 2)
> >> 	raid10_layout <string> (Default: "near"/"adjacent")
> >> Where <string> could be "near", "far", "offset" and "some-future-thing".  That seems nice to me and seems to clear up some of the confusion caused by "far_offset" seeming to be a modifier to "far_copies".
> > 
> > Yes, that is what I'm suggesting.
> 
> One problem with this approach is that users can no longer mix and match.  They can't have 2 near copies and 2 far copies, for example.  Perhaps someone might choose this layout for read balancing performance...

I came across a bit of code in raid10 recently which would not work properly
in at least some combinations of near>1 and far>1.  It might have been only
when 'raid_disks' was not divisible by 'near' - I'd have to check.

Obviously I don't *know* that no-one ever uses a layout like this, but I've
never heard of one.  Only rarely do people have 3 copies.  Having 4 seems
excessive.
So while I cannot safely change mdadm to reject the combination, I have no
concern about never allowing the combination with dm-raid.  (Are there enough
double-negatives there?)

I think we should keep it simple.  Copies + layout.  Anything more complex
doesn't - in my opinion - bring any real value at all.

NeilBrown


> 
> The original method didn't allow for two different simultaneous "far" algorithms (because it would add no redundancy unless they were shifted from each other as well as the original), but this new way of specifying makes it worse.
> 
> Do you see this as a problem?  If so, we need to find a way to specify the number of copies for each layout /and/ include the potential for "double-shift" vs "single-shift" or some further variant.  One idea I had before was:
> 	raid10_near_copies <#>
> 	raid10_far_copies <#>
> 	raid10_stripe_copies <#>  (similar to far+offset, but still allows for simultaneous "far" w/o "offset")
> To allow for the different variants of "shifting", we could have different raid10 variants, like "raid10_2s" or "raid1e_2s" - similar to the extensions on RAID5 ("raid5_ls") that you don't like.  :)
> 
>  brassow

Patch

Index: linux-upstream/drivers/md/dm-raid.c
===================================================================
--- linux-upstream.orig/drivers/md/dm-raid.c
+++ linux-upstream/drivers/md/dm-raid.c
@@ -11,6 +11,7 @@ 
 #include "md.h"
 #include "raid1.h"
 #include "raid5.h"
+#include "raid10.h"
 #include "bitmap.h"
 
 #include <linux/device-mapper.h>
@@ -52,7 +53,11 @@  struct raid_dev {
 #define DMPF_MAX_RECOVERY_RATE 0x20
 #define DMPF_MAX_WRITE_BEHIND  0x40
 #define DMPF_STRIPE_CACHE      0x80
-#define DMPF_REGION_SIZE       0X100
+#define DMPF_REGION_SIZE       0x100
+#define DMPF_RAID10_NEAR_COPIES 0x200
+#define DMPF_RAID10_FAR_COPIES  0x400
+#define DMPF_RAID10_FAR_OFFSET  0x800
+
 struct raid_set {
 	struct dm_target *ti;
 
@@ -66,6 +71,15 @@  struct raid_set {
 	struct raid_dev dev[0];
 };
 
+/* near_copies in first byte */
+/* far_copies in second byte */
+/* far_offset in 17th bit */
+#define ALGORITHM_RAID10(near_copies, far_copies, far_offset) \
+	((near_copies & 0xFF) | ((far_copies & 0xFF) << 8) | ((!!far_offset) << 16))
+#define RAID10_NC(layout) (layout & 0xFF)
+#define RAID10_FC(layout) ((layout >> 8) & 0xFF)
+#define RAID10_FO(layout) (layout & 0x10000)
+
 /* Supported raid types and properties. */
 static struct raid_type {
 	const char *name;		/* RAID algorithm. */
@@ -76,6 +90,8 @@  static struct raid_type {
 	const unsigned algorithm;	/* RAID algorithm. */
 } raid_types[] = {
 	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
+	{"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, -1 /* Varies */},
+	{"raid1e",   "RAID1E (Enhanced RAID1)",         0, 2, 10, -1 /* Varies */},
 	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
 	{"raid5_la", "RAID5 (left asymmetric)",		1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
 	{"raid5_ra", "RAID5 (right asymmetric)",	1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -339,10 +355,17 @@  static int validate_region_size(struct r
  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
  *    [region_size <sectors>]           Defines granularity of bitmap
+ *
+ * RAID10-only options:
+ *    [raid10_near_copies   <# copies>] Near copies. (Default: 2)
+ *    [raid10_far_copies    <# copies>] Far copies.  (Default: 1)
+ *    [raid10_far_offset    <0/1>]      Offset is device size(0) or stripe(1).
  */
 static int parse_raid_params(struct raid_set *rs, char **argv,
 			     unsigned num_raid_params)
 {
+	unsigned raid10_default = ALGORITHM_RAID10(2, 1, 0);
+	unsigned raid10_nc = 1, raid10_fc = 1, raid10_fo = 0;
 	unsigned i, rebuild_cnt = 0;
 	unsigned long value, region_size = 0;
 	sector_t sectors_per_dev = rs->ti->len;
@@ -435,6 +458,7 @@  static int parse_raid_params(struct raid
 				if (rebuild_cnt > rs->raid_type->parity_devs)
 					rs->ti->error = "Too many rebuild devices specified for given RAID type";
 				break;
+			case 10:
 			default:
 				DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
 				rs->ti->error = "Rebuild not supported for this RAID type";
@@ -492,7 +516,7 @@  static int parse_raid_params(struct raid
 			 */
 			value /= 2;
 
-			if (rs->raid_type->level < 5) {
+			if (rs->raid_type->level != 5) {
 				rs->ti->error = "Inappropriate argument: stripe_cache";
 				return -EINVAL;
 			}
@@ -517,6 +541,33 @@  static int parse_raid_params(struct raid
 		} else if (!strcasecmp(key, "region_size")) {
 			rs->print_flags |= DMPF_REGION_SIZE;
 			region_size = value;
+		} else if (!strcasecmp(key, "raid10_near_copies") &&
+			   (rs->raid_type->level == 10)) {
+			if ((value < 1) || (value > 0xFF)) {
+				rs->ti->error = "Bad value for 'raid10_near_copies'";
+				return -EINVAL;
+			}
+			rs->print_flags |= DMPF_RAID10_NEAR_COPIES;
+			raid10_nc = value;
+			raid10_default = 0;
+		} else if (!strcasecmp(key, "raid10_far_copies") &&
+			   (rs->raid_type->level == 10)) {
+			if ((value < 1) || (value > 0xFF)) {
+				rs->ti->error = "Bad value for 'raid10_far_copies'";
+				return -EINVAL;
+			}
+			rs->print_flags |= DMPF_RAID10_FAR_COPIES;
+			raid10_fc = value;
+			raid10_default = 0;
+		} else if (!strcasecmp(key, "raid10_far_offset") &&
+			   (rs->raid_type->level == 10)) {
+			if (value > 1) {
+				rs->ti->error = "Bad value for 'raid10_far_offset'";
+				return -EINVAL;
+			}
+			rs->print_flags |= DMPF_RAID10_FAR_OFFSET;
+			raid10_fo = value;
+			raid10_default = 0;
 		} else {
 			DMERR("Unable to parse RAID parameter: %s", key);
 			rs->ti->error = "Unable to parse RAID parameters";
@@ -532,9 +583,33 @@  static int parse_raid_params(struct raid
 	else
 		rs->ti->split_io = region_size;
 
-	if ((rs->raid_type->level > 1) &&
-	    sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) {
+	if (rs->raid_type->level == 10) {
+		/* (Len * Stripes) / Mirrors */
+		sectors_per_dev *= rs->md.raid_disks;
+		if (sector_div(sectors_per_dev, (raid10_nc * raid10_fc))) {
+			rs->ti->error = "Target length not divisible by number of data devices";
+			return -EINVAL;
+		}
+		if ((raid10_nc * raid10_fc) > rs->md.raid_disks) {
+			rs->ti->error = "Not enough devices to satisfy specification";
+			return -EINVAL;
+		}
+		if (raid10_fo && (raid10_fc < 2)) {
+			DMWARN("RAID10 parameter 'far_offset' ignored");
+			raid10_fo = 0;
+		}
+
+		if (raid10_default)
+			rs->md.layout = raid10_default;
+		else
+			rs->md.layout = ALGORITHM_RAID10(raid10_nc,
+							 raid10_fc, raid10_fo);
+		rs->md.new_layout = rs->md.layout;
+	} else if ((rs->raid_type->level > 1) &&
+		   sector_div(sectors_per_dev,
+			      (rs->md.raid_disks - rs->raid_type->parity_devs))) {
 		rs->ti->error = "Target length not divisible by number of data devices";
+
 		return -EINVAL;
 	}
 	rs->md.dev_sectors = sectors_per_dev;
@@ -560,6 +635,9 @@  static int raid_is_congested(struct dm_t
 	if (rs->raid_type->level == 1)
 		return md_raid1_congested(&rs->md, bits);
 
+	if (rs->raid_type->level == 10)
+		return md_raid10_congested(&rs->md, bits);
+
 	return md_raid5_congested(&rs->md, bits);
 }
 
@@ -878,6 +956,9 @@  static int analyse_superblocks(struct dm
 	case 6:
 		redundancy = rs->raid_type->parity_devs;
 		break;
+	case 10:
+		redundancy = RAID10_NC(mddev->layout) *	RAID10_FC(mddev->layout);
+		break;
 	default:
 		ti->error = "Unknown RAID type";
 		return -EINVAL;
@@ -1197,6 +1278,18 @@  static int raid_status(struct dm_target
 			DMEMIT(" region_size %lu",
 			       rs->md.bitmap_info.chunksize >> 9);
 
+		if (rs->print_flags & DMPF_RAID10_NEAR_COPIES)
+			DMEMIT(" raid10_near_copies %u",
+			       RAID10_NC(rs->md.layout));
+
+		if (rs->print_flags & DMPF_RAID10_FAR_COPIES)
+			DMEMIT(" raid10_far_copies %u",
+			       RAID10_FC(rs->md.layout));
+
+		if (rs->print_flags & DMPF_RAID10_FAR_OFFSET)
+			DMEMIT(" raid10_far_offset %u",
+			       RAID10_FO(rs->md.layout));
+
 		DMEMIT(" %d", rs->md.raid_disks);
 		for (i = 0; i < rs->md.raid_disks; i++) {
 			if (rs->dev[i].meta_dev)
@@ -1271,7 +1364,7 @@  static void raid_resume(struct dm_target
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 2, 0},
+	.version = {1, 3, 0},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
@@ -1298,6 +1391,8 @@  module_init(dm_raid_init);
 module_exit(dm_raid_exit);
 
 MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_ALIAS("dm-raid1");
+MODULE_ALIAS("dm-raid10");
 MODULE_ALIAS("dm-raid4");
 MODULE_ALIAS("dm-raid5");
 MODULE_ALIAS("dm-raid6");
Index: linux-upstream/Documentation/device-mapper/dm-raid.txt
===================================================================
--- linux-upstream.orig/Documentation/device-mapper/dm-raid.txt
+++ linux-upstream/Documentation/device-mapper/dm-raid.txt
@@ -27,6 +27,11 @@  The target is named "raid" and it accept
 		- rotating parity N (right-to-left) with data restart
   raid6_nc	RAID6 N continue
 		- rotating parity N (right-to-left) with data continuation
+  raid10/raid1e Various RAID10 inspired algorithms chosen by additional params
+  		- RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
+		- RAID1E: Integrated Adjacent Stripe Mirroring
+		- RAID1E: Integrated Offset Stripe Mirroring
+		-  and other similar RAID10 variants
 
   Reference: Chapter 4 of
   http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf
@@ -59,6 +64,80 @@  The target is named "raid" and it accept
 		logical size of the array.  The bitmap records the device
 		synchronisation state for each region.
 
+        [raid10_near_copies   <# copies>]
+        [raid10_far_copies    <# copies>]
+        [raid10_far_offset    <0/1>]
+		These three options are used to alter the default layout of
+		a RAID10/RAID1E configuration.  The total number of copies is
+		given by the number of "near" (aka "adjacent") copies times
+		the number of "far" (aka "offset") copies.  Near copies
+		are what most people think of with respect to mirroring.
+		If 'raid10_near_copies 2', 'raid10_far_copies 1' and
+		'raid10_far_offset 0', then the layouts for 2, 3 and 4 devices
+		are:
+		2 drives         3 drives          4 drives
+		--------         ----------        --------------
+		A1  A1           A1  A1  A2        A1  A1  A2  A2
+		A2  A2           A2  A3  A3        A3  A3  A4  A4
+		A3  A3           A4  A4  A5        A5  A5  A6  A6
+		A4  A4           A5  A6  A6        A7  A7  A8  A8
+		..  ..           ..  ..  ..        ..  ..  ..  ..
+		The 2-device layout is equivalent to 2-way RAID1.  The 4-device
+		layout is what a traditional RAID10 would look like.  The
+		3-device layout is what might be called a 'RAID1E - Integrated
+		Adjacent Stripe Mirroring'.
+
+		The 'raid10_far_[copies|offset]' arguments work together to
+		determine where any "far"/"offset" copies will be placed.
+		If 'raid10_near_copies 1', 'raid10_far_copies 2' and
+		'raid10_far_offset 0', then the layouts for 2, 3 and 4 devices
+		are:
+		2 drives             3 drives             4 drives
+		--------             --------------       --------------------
+		A1  A2               A1   A2   A3         A1   A2   A3   A4
+		A3  A4               A4   A5   A6         A5   A6   A7   A8
+		A5  A6               A7   A8   A9         A9   A10  A11  A12
+		..  ..               ..   ..   ..         ..   ..   ..   ..
+		A2  A1               A3   A1   A2         A4   A1   A2   A3
+		A4  A3               A6   A4   A5         A8   A5   A6   A7
+		A6  A5               A9   A7   A8         A12  A9   A10  A11
+		..  ..               ..   ..   ..         ..   ..   ..   ..
+
+		If 'raid10_near_copies 1', 'raid10_far_copies 2' and
+		'raid10_far_offset 1', then the layouts for 2, 3 and 4 devices
+		are:
+		2 drives       3 drives           4 drives
+		--------       ------------       -----------------
+		A1  A2         A1  A2  A3         A1  A2  A3  A4
+		A2  A1         A3  A1  A2         A4  A1  A2  A3
+		A3  A4         A4  A5  A6         A5  A6  A7  A8
+		A4  A3         A6  A4  A5         A8  A5  A6  A7
+		A5  A6         A7  A8  A9         A9  A10 A11 A12
+		A6  A5         A9  A7  A8         A12 A9  A10 A11
+		..  ..         ..  ..  ..         ..  ..  ..  ..
+		Here we see layouts closely akin to 'RAID1E - Integrated
+		Offset Stripe Mirroring'.
+
+		Near and far copies can both be specified giving more
+		complex arrangements.  If 'raid10_near_copies 2',
+		'raid10_far_copies 2' and 'raid10_far_offset 0', then the
+		layouts for 4 and 5 devices are:
+		4 drives              5 drives
+		--------              --------
+		A1  A1  A2  A2        A1  A1  A2  A2  A3
+		A3  A3  A4  A4        A3  A4  A4  A5  A5
+		A5  A5  A6  A6        A6  A6  A7  A7  A8
+		A7  A7  A8  A8        A8  A9  A9  A10 A10
+		..  ..  ..  ..        ..  ..  ..  ..  ..
+		A2  A2  A1  A1        A2  A3  A1  A1  A2
+		A4  A4  A3  A3        A5  A5  A3  A4  A4
+		A6  A6  A5  A5        A7  A8  A6  A6  A7
+		A8  A8  A7  A7        A10 A10 A8  A9  A9
+		..  ..  ..  ..        ..  ..  ..  ..  ..
+		Thanks to Wikipedia's 'Non-standard RAID levels' article for
+		the layout figures:
+		http://en.wikipedia.org/wiki/Non-standard_RAID_levels
+
 <#raid_devs>: The number of devices composing the array.
 	Each device consists of two entries.  The first is the device
 	containing the metadata (if any); the second is the one containing the