[RFC] Btrfs: Add linear chunk allocation support.
diff mbox

Message ID 1674620.BNzKaL6CNE@localhost.localdomain
State New, archived
Headers show

Commit Message

Chandan Rajendra Dec. 2, 2013, 5:53 a.m. UTC
This patch implements the *core* of the idea suggested at
https://btrfs.wiki.kernel.org/index.php/Project_ideas#Linear_chunk_allocation_mode.
Other required changes (e.g. balance/restripe) will be made based on
the reviews obtained for this patch.

On a multi-disk filesystem instance using single mode storage, we could
minimize the data loss (due to a hard drive failure) by filling up a disk
completely before starting to use another disk.

Chunk allocation is performed based on the increasing order of devid i.e.
the device with the lowest devid will be used for allocating the new chunk.

Signed-off-by: chandan <chandan@linux.vnet.ibm.com>
---
 fs/btrfs/ctree.h   |  4 +++-
 fs/btrfs/volumes.c | 36 ++++++++++++++++++++++++++++++------
 2 files changed, 33 insertions(+), 7 deletions(-)

Comments

David Sterba Dec. 4, 2013, 12:06 a.m. UTC | #1
On Mon, Dec 02, 2013 at 12:23:02PM +0630, chandan wrote:
> This patch implements the *core* of the idea suggested at
> https://btrfs.wiki.kernel.org/index.php/Project_ideas#Linear_chunk_allocation_mode.
> Other required changes (e.g. balance/restripe) will be made based on
> the reviews obtained for this patch.
> 
> On a multi-disk filesystem instance using single mode storage, we could
> minimize the data loss (due to a hard drive failure) by filling up a disk
> completely before starting to use another disk.

The use case sounds OK, but I don't agree that the incompat bit is the way
to implement it: this is not a format change that would make it
impossible to read the data on older kernels.

I assume you want to make the allocation policy permanent and nothing
else was available. We're in the process of getting the per-object
properties ready and I think this will fit your needs.

You can use a temporary mount option to set the allocator policy for
now. There's a related project idea that would build upon the same
infrastructure:

https://btrfs.wiki.kernel.org/index.php/Project_ideas#Better_data_balancing_over_multiple_devices_for_raid1.2F10_.28allocation.29

> Chunk allocation is performed based on the increasing order of devid i.e.
> the device with the lowest devid will be used for allocating the new chunk.

This does not look very flexible and highly depends on the order in
which the devices are added. I think a device property like 'allocation
priority' would let the user choose the device order.


david
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chandan Rajendra Dec. 4, 2013, 9:25 a.m. UTC | #2
Thanks for the review comments David. I will come up with another patch
that implements your suggestion.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 82e2b74..fb13d85 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -522,6 +522,7 @@  struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)
 #define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA	(1ULL << 8)
 #define BTRFS_FEATURE_INCOMPAT_NO_HOLES		(1ULL << 9)
+#define BTRFS_FEATURE_INCOMPAT_LINEAR_CHUNK_ALLOC (1ULL << 10)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_SAFE_SET		0ULL
@@ -539,7 +540,8 @@  struct btrfs_super_block {
 	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
 	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\
 	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |	\
-	 BTRFS_FEATURE_INCOMPAT_NO_HOLES)
+	 BTRFS_FEATURE_INCOMPAT_NO_HOLES |		\
+	 BTRFS_FEATURE_INCOMPAT_LINEAR_CHUNK_ALLOC)
 
 #define BTRFS_FEATURE_INCOMPAT_SAFE_SET			\
 	(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0db6370..8288fb5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3884,7 +3884,7 @@  static int btrfs_add_system_chunk(struct btrfs_root *root,
 /*
  * sort the devices in descending order by max_avail, total_avail
  */
-static int btrfs_cmp_device_info(const void *a, const void *b)
+static int btrfs_cmp_device_info_space(const void *a, const void *b)
 {
 	const struct btrfs_device_info *di_a = a;
 	const struct btrfs_device_info *di_b = b;
@@ -3900,6 +3900,18 @@  static int btrfs_cmp_device_info(const void *a, const void *b)
 	return 0;
 }
 
+static int btrfs_cmp_device_info_devid(const void *a, const void *b)
+{
+	const struct btrfs_device_info *di_a = a;
+	const struct btrfs_device_info *di_b = b;
+
+	if (di_a->dev->devid > di_b->dev->devid)
+		return 1;
+	if (di_a->dev->devid < di_b->dev->devid)
+		return -1;
+	return 0;
+}
+
 static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 	[BTRFS_RAID_RAID10] = {
 		.sub_stripes	= 2,
@@ -3985,6 +3997,7 @@  static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct extent_map *em;
 	struct btrfs_device_info *devices_info = NULL;
 	u64 total_avail;
+	u64 incompat_flags;
 	int num_stripes;	/* total number of stripes to allocate */
 	int data_stripes;	/* number of stripes that count for
 				   block group size */
@@ -4106,11 +4119,22 @@  static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		++ndevs;
 	}
 
-	/*
-	 * now sort the devices by hole size / available space
-	 */
-	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
-	     btrfs_cmp_device_info, NULL);
+	incompat_flags = btrfs_super_incompat_flags(info->super_copy);
+
+	if ((chunk_to_extended(type) & BTRFS_AVAIL_ALLOC_BIT_SINGLE)
+		&& (incompat_flags & BTRFS_FEATURE_INCOMPAT_LINEAR_CHUNK_ALLOC)) {
+		/*
+		 * Sort the devices obtained based on devid.
+		 */
+		sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+			btrfs_cmp_device_info_devid, NULL);
+	} else {
+		/*
+		 * now sort the devices by hole size / available space
+		 */
+		sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+			btrfs_cmp_device_info_space, NULL);
+	}
 
 	/* round down to number of usable stripes */
 	ndevs -= ndevs % devs_increment;