
[v9,12/41] btrfs: implement zoned chunk allocator

Message ID 5b9798f6e4c317e6a2c433ef88ffeabe00b93bb3.1604065695.git.naohiro.aota@wdc.com (mailing list archive)
State New, archived
Series btrfs: zoned block device support

Commit Message

Naohiro Aota Oct. 30, 2020, 1:51 p.m. UTC
This commit implements a zoned chunk/dev_extent allocator. The zoned
allocator aligns device extents to zone boundaries, so that a zone
reset affects only the device extent it belongs to and does not change
the state of blocks in neighboring device extents.

It also checks that the region being allocated does not overlap any of
the superblock zones, and ensures that the region is empty.
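
For illustration, a minimal user-space sketch of the alignment rule
described above. The 256 MiB zone size is only an example; the real code
takes it from device->zone_info->zone_size and clamps the search start to
at least 1 MiB (see dev_extent_search_start_zoned() below):

#include <stdint.h>
#include <stdio.h>

#define SZ_1M		(1024ULL * 1024ULL)
#define ALIGN_UP(x, a)	(((x) + (a) - 1) / (a) * (a))

/* Round an allocation start up to the next zone boundary. */
static uint64_t zone_align_start(uint64_t start, uint64_t zone_size)
{
	uint64_t min_start = zone_size > SZ_1M ? zone_size : SZ_1M;

	if (start < min_start)
		start = min_start;
	return ALIGN_UP(start, zone_size);
}

int main(void)
{
	uint64_t zone_size = 256 * SZ_1M;	/* example zone size */

	/* 300 MiB is not zone aligned, so the extent starts at 512 MiB */
	printf("%llu\n",
	       (unsigned long long)zone_align_start(300 * SZ_1M, zone_size));
	return 0;
}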

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/volumes.c | 131 +++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h |   1 +
 fs/btrfs/zoned.c   | 126 +++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/zoned.h   |  30 +++++++++++
 4 files changed, 288 insertions(+)

Comments

Josef Bacik Nov. 2, 2020, 8:09 p.m. UTC | #1
On 10/30/20 9:51 AM, Naohiro Aota wrote:
> This commit implements a zoned chunk/dev_extent allocator. The zoned
> allocator aligns device extents to zone boundaries, so that a zone
> reset affects only the device extent it belongs to and does not change
> the state of blocks in neighboring device extents.
> 
> It also checks that the region being allocated does not overlap any of
> the superblock zones, and ensures that the region is empty.
> 
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> ---
>   fs/btrfs/volumes.c | 131 +++++++++++++++++++++++++++++++++++++++++++++
>   fs/btrfs/volumes.h |   1 +
>   fs/btrfs/zoned.c   | 126 +++++++++++++++++++++++++++++++++++++++++++
>   fs/btrfs/zoned.h   |  30 +++++++++++
>   4 files changed, 288 insertions(+)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index db884b96a5ea..78c62ef02e6f 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1416,6 +1416,14 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
>   	return false;
>   }
>   
> +static inline u64 dev_extent_search_start_zoned(struct btrfs_device *device,
> +						u64 start)
> +{
> +	start = max_t(u64, start,
> +		      max_t(u64, device->zone_info->zone_size, SZ_1M));
> +	return btrfs_zone_align(device, start);
> +}
> +
>   static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
>   {
>   	switch (device->fs_devices->chunk_alloc_policy) {
> @@ -1426,11 +1434,57 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
>   		 * make sure to start at an offset of at least 1MB.
>   		 */
>   		return max_t(u64, start, SZ_1M);
> +	case BTRFS_CHUNK_ALLOC_ZONED:
> +		return dev_extent_search_start_zoned(device, start);
>   	default:
>   		BUG();
>   	}
>   }
>   
> +static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
> +					u64 *hole_start, u64 *hole_size,
> +					u64 num_bytes)
> +{
> +	u64 zone_size = device->zone_info->zone_size;
> +	u64 pos;
> +	int ret;
> +	int changed = 0;
> +
> +	ASSERT(IS_ALIGNED(*hole_start, zone_size));
> +
> +	while (*hole_size > 0) {
> +		pos = btrfs_find_allocatable_zones(device, *hole_start,
> +						   *hole_start + *hole_size,
> +						   num_bytes);
> +		if (pos != *hole_start) {
> +			*hole_size = *hole_start + *hole_size - pos;
> +			*hole_start = pos;
> +			changed = 1;
> +			if (*hole_size < num_bytes)
> +				break;
> +		}
> +
> +		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
> +
> +		/* range is ensured to be empty */
> +		if (!ret)
> +			return changed;
> +
> +		/* given hole range was invalid (outside of device) */
> +		if (ret == -ERANGE) {
> +			*hole_start += *hole_size;
> +			*hole_size = 0;
> +			return 1;
> +		}
> +
> +		*hole_start += zone_size;
> +		*hole_size -= zone_size;
> +		changed = 1;
> +	}
> +
> +	return changed;
> +}
> +
>   /**
>    * dev_extent_hole_check - check if specified hole is suitable for allocation
>    * @device:	the device which we have the hole
> @@ -1463,6 +1517,10 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
>   	case BTRFS_CHUNK_ALLOC_REGULAR:
>   		/* No extra check */
>   		break;
> +	case BTRFS_CHUNK_ALLOC_ZONED:
> +		changed |= dev_extent_hole_check_zoned(device, hole_start,
> +						       hole_size, num_bytes);
I'm confused here, we check to make sure the pending stuff doesn't overlap with 
non-empty zones.  However we don't ever actually mark zones as non-empty except 
on mount.  I realize that if we allocate this zone then it appears pending and 
thus we won't allocate with this zone again while the fs is mounted, but it took 
me a while to realize this.  Is there a reason to not mark a zone as non-empty 
when we allocate from it?


> +		break;
>   	default:
>   		BUG();
>   	}
> @@ -1517,6 +1575,9 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
>   
>   	search_start = dev_extent_search_start(device, search_start);
>   
> +	WARN_ON(device->zone_info &&
> +		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));
> +
>   	path = btrfs_alloc_path();
>   	if (!path)
>   		return -ENOMEM;
> @@ -4907,6 +4968,37 @@ static void init_alloc_chunk_ctl_policy_regular(
>   	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
>   }
>   
> +static void
> +init_alloc_chunk_ctl_policy_zoned(struct btrfs_fs_devices *fs_devices,
> +				  struct alloc_chunk_ctl *ctl)
> +{
> +	u64 zone_size = fs_devices->fs_info->zone_size;
> +	u64 limit;
> +	int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
> +	int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
> +	u64 min_chunk_size = min_data_stripes * zone_size;
> +	u64 type = ctl->type;
> +
> +	ctl->max_stripe_size = zone_size;
> +	if (type & BTRFS_BLOCK_GROUP_DATA) {
> +		ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
> +						 zone_size);
> +	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
> +		ctl->max_chunk_size = ctl->max_stripe_size;
> +	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
> +		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
> +		ctl->devs_max = min_t(int, ctl->devs_max,
> +				      BTRFS_MAX_DEVS_SYS_CHUNK);
> +	}
> +
> +	/* We don't want a chunk larger than 10% of writable space */
> +	limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
> +			       zone_size),
> +		    min_chunk_size);
> +	ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
> +	ctl->dev_extent_min = zone_size * ctl->dev_stripes;
> +}
> +
>   static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
>   				 struct alloc_chunk_ctl *ctl)
>   {
> @@ -4927,6 +5019,9 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
>   	case BTRFS_CHUNK_ALLOC_REGULAR:
>   		init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
>   		break;
> +	case BTRFS_CHUNK_ALLOC_ZONED:
> +		init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
> +		break;
>   	default:
>   		BUG();
>   	}
> @@ -5053,6 +5148,40 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
>   	return 0;
>   }
>   
> +static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
> +				    struct btrfs_device_info *devices_info)
> +{
> +	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
> +	/* number of stripes that count for block group size */
> +	int data_stripes;
> +
> +	/*
> +	 * It should hold because:
> +	 *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
> +	 */
> +	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
> +
> +	ctl->stripe_size = zone_size;
> +	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
> +	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
> +
> +	/*
> +	 * stripe_size is fixed in ZONED. Reduce ndevs instead.
> +	 */
> +	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
> +		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
> +					     ctl->stripe_size) + ctl->nparity,
> +				     ctl->dev_stripes);
> +		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
> +		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
> +		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
> +	}
> +
> +	ctl->chunk_size = ctl->stripe_size * data_stripes;
> +
> +	return 0;
> +}
> +
>   static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
>   			      struct alloc_chunk_ctl *ctl,
>   			      struct btrfs_device_info *devices_info)
> @@ -5080,6 +5209,8 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
>   	switch (fs_devices->chunk_alloc_policy) {
>   	case BTRFS_CHUNK_ALLOC_REGULAR:
>   		return decide_stripe_size_regular(ctl, devices_info);
> +	case BTRFS_CHUNK_ALLOC_ZONED:
> +		return decide_stripe_size_zoned(ctl, devices_info);
>   	default:
>   		BUG();
>   	}
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 9c07b97a2260..0249aca668fb 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -213,6 +213,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
>   
>   enum btrfs_chunk_allocation_policy {
>   	BTRFS_CHUNK_ALLOC_REGULAR,
> +	BTRFS_CHUNK_ALLOC_ZONED,
>   };
>   
>   struct btrfs_fs_devices {
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index d5487cba203b..4411d786597a 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -1,11 +1,13 @@
>   // SPDX-License-Identifier: GPL-2.0
>   
> +#include <linux/bitops.h>
>   #include <linux/slab.h>
>   #include <linux/blkdev.h>
>   #include "ctree.h"
>   #include "volumes.h"
>   #include "zoned.h"
>   #include "rcu-string.h"
> +#include "disk-io.h"
>   
>   /* Maximum number of zones to report per blkdev_report_zones() call */
>   #define BTRFS_REPORT_NR_ZONES   4096
> @@ -328,6 +330,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
>   
>   	fs_info->zone_size = zone_size;
>   	fs_info->max_zone_append_size = max_zone_append_size;
> +	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
>   
>   	btrfs_info(fs_info, "ZONED mode enabled, zone size %llu B",
>   		   fs_info->zone_size);
> @@ -607,3 +610,126 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
>   				sb_zone << zone_sectors_shift, zone_sectors * 2,
>   				GFP_NOFS);
>   }
> +
> +/*
> + * btrfs_check_allocatable_zones - find allocatable zones within give region
> + * @device:	the device to allocate a region
> + * @hole_start: the position of the hole to allocate the region
> + * @num_bytes:	the size of wanted region
> + * @hole_size:	the size of hole
> + *
> + * Allocatable region should not contain any superblock locations.
> + */
> +u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
> +				 u64 hole_end, u64 num_bytes)
> +{
> +	struct btrfs_zoned_device_info *zinfo = device->zone_info;
> +	u8 shift = zinfo->zone_size_shift;
> +	u64 nzones = num_bytes >> shift;
> +	u64 pos = hole_start;
> +	u64 begin, end;
> +	u64 sb_pos;
> +	bool have_sb;
> +	int i;
> +
> +	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
> +	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
> +
> +	while (pos < hole_end) {
> +		begin = pos >> shift;
> +		end = begin + nzones;
> +
> +		if (end > zinfo->nr_zones)
> +			return hole_end;
> +
> +		/* check if zones in the region are all empty */
> +		if (btrfs_dev_is_sequential(device, pos) &&
> +		    find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
> +			pos += zinfo->zone_size;
> +			continue;
> +		}
> +
> +		have_sb = false;
> +		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
> +			sb_pos = sb_zone_number(zinfo->zone_size, i);
> +			if (!(end < sb_pos || sb_pos + 1 < begin)) {
> +				have_sb = true;
> +				pos = (sb_pos + 2) << shift;
> +				break;
> +			}
> +		}
> +		if (!have_sb)
> +			break;
> +	}
> +
> +	return pos;
> +}
> +
> +int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
> +			    u64 length, u64 *bytes)
> +{
> +	int ret;
> +
> +	*bytes = 0;
> +	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
> +			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
> +			       GFP_NOFS);
> +	if (ret)
> +		return ret;
> +
> +	*bytes = length;
> +	while (length) {
> +		btrfs_dev_set_zone_empty(device, physical);
> +		physical += device->zone_info->zone_size;
> +		length -= device->zone_info->zone_size;
> +	}
> +
> +	return 0;
> +}
> +
> +int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
> +{
> +	struct btrfs_zoned_device_info *zinfo = device->zone_info;
> +	u8 shift = zinfo->zone_size_shift;
> +	unsigned long begin = start >> shift;
> +	unsigned long end = (start + size) >> shift;
> +	u64 pos;
> +	int ret;
> +
> +	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
> +	ASSERT(IS_ALIGNED(size, zinfo->zone_size));
> +
> +	if (end > zinfo->nr_zones)
> +		return -ERANGE;
> +
> +	/* all the zones are conventional */
> +	if (find_next_bit(zinfo->seq_zones, begin, end) == end)
> +		return 0;
> +

This check is duplicated below.

> +	/* all the zones are sequential and empty */
> +	if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end &&
> +	    find_next_zero_bit(zinfo->empty_zones, begin, end) == end)
> +		return 0;
> +
> +	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
> +		u64 reset_bytes;
> +
> +		if (!btrfs_dev_is_sequential(device, pos) ||
> +		    btrfs_dev_is_empty_zone(device, pos))
> +			continue;
> +
> +		/* free regions should be empty */
> +		btrfs_warn_in_rcu(
> +			device->fs_info,
> +			"resetting device %s zone %llu for allocation",
> +			rcu_str_deref(device->name), pos >> shift);
> +		WARN_ON_ONCE(1);
> +
> +		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
> +					      &reset_bytes);
> +		if (ret)
> +			return ret;

This seems bad, as we could just have corruption right?  So we're resetting the 
zone which could lose us data right?  Shouldn't we just bail here?  Thanks,

Josef
Naohiro Aota Nov. 2, 2020, 10:21 p.m. UTC | #2
On Mon, Nov 02, 2020 at 03:09:58PM -0500, Josef Bacik wrote:
>On 10/30/20 9:51 AM, Naohiro Aota wrote:
>>This commit implements a zoned chunk/dev_extent allocator. The zoned
>>allocator aligns device extents to zone boundaries, so that a zone
>>reset affects only the device extent it belongs to and does not change
>>the state of blocks in neighboring device extents.
>>
>>It also checks that the region being allocated does not overlap any of
>>the superblock zones, and ensures that the region is empty.
>>
>>Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
>>---
>>  fs/btrfs/volumes.c | 131 +++++++++++++++++++++++++++++++++++++++++++++
>>  fs/btrfs/volumes.h |   1 +
>>  fs/btrfs/zoned.c   | 126 +++++++++++++++++++++++++++++++++++++++++++
>>  fs/btrfs/zoned.h   |  30 +++++++++++
>>  4 files changed, 288 insertions(+)
>>
>>diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>index db884b96a5ea..78c62ef02e6f 100644
>>--- a/fs/btrfs/volumes.c
>>+++ b/fs/btrfs/volumes.c
>>@@ -1416,6 +1416,14 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
>>  	return false;
>>  }
>>+static inline u64 dev_extent_search_start_zoned(struct btrfs_device *device,
>>+						u64 start)
>>+{
>>+	start = max_t(u64, start,
>>+		      max_t(u64, device->zone_info->zone_size, SZ_1M));
>>+	return btrfs_zone_align(device, start);
>>+}
>>+
>>  static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
>>  {
>>  	switch (device->fs_devices->chunk_alloc_policy) {
>>@@ -1426,11 +1434,57 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
>>  		 * make sure to start at an offset of at least 1MB.
>>  		 */
>>  		return max_t(u64, start, SZ_1M);
>>+	case BTRFS_CHUNK_ALLOC_ZONED:
>>+		return dev_extent_search_start_zoned(device, start);
>>  	default:
>>  		BUG();
>>  	}
>>  }
>>+static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
>>+					u64 *hole_start, u64 *hole_size,
>>+					u64 num_bytes)
>>+{
>>+	u64 zone_size = device->zone_info->zone_size;
>>+	u64 pos;
>>+	int ret;
>>+	int changed = 0;
>>+
>>+	ASSERT(IS_ALIGNED(*hole_start, zone_size));
>>+
>>+	while (*hole_size > 0) {
>>+		pos = btrfs_find_allocatable_zones(device, *hole_start,
>>+						   *hole_start + *hole_size,
>>+						   num_bytes);
>>+		if (pos != *hole_start) {
>>+			*hole_size = *hole_start + *hole_size - pos;
>>+			*hole_start = pos;
>>+			changed = 1;
>>+			if (*hole_size < num_bytes)
>>+				break;
>>+		}
>>+
>>+		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
>>+
>>+		/* range is ensured to be empty */
>>+		if (!ret)
>>+			return changed;
>>+
>>+		/* given hole range was invalid (outside of device) */
>>+		if (ret == -ERANGE) {
>>+			*hole_start += *hole_size;
>>+			*hole_size = 0;
>>+			return 1;
>>+		}
>>+
>>+		*hole_start += zone_size;
>>+		*hole_size -= zone_size;
>>+		changed = 1;
>>+	}
>>+
>>+	return changed;
>>+}
>>+
>>  /**
>>   * dev_extent_hole_check - check if specified hole is suitable for allocation
>>   * @device:	the device which we have the hole
>>@@ -1463,6 +1517,10 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
>>  	case BTRFS_CHUNK_ALLOC_REGULAR:
>>  		/* No extra check */
>>  		break;
>>+	case BTRFS_CHUNK_ALLOC_ZONED:
>>+		changed |= dev_extent_hole_check_zoned(device, hole_start,
>>+						       hole_size, num_bytes);
>I'm confused here, we check to make sure the pending stuff doesn't 
>overlap with non-empty zones.  However we don't ever actually mark 
>zones as non-empty except on mount.  I realize that if we allocate 
>this zone then it appears pending and thus we won't allocate with this 
>zone again while the fs is mounted, but it took me a while to realize 
>this.  Is there a reason to not mark a zone as non-empty when we 
>allocate from it?

The zones do get marked as non-empty. Allocated zones eventually back a
block group created in btrfs_make_block_group(). That function calls
btrfs_load_block_group_zone_info(), which clears the empty flag.
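
For illustration, a minimal user-space sketch of that bookkeeping,
assuming one bit per zone in an empty_zones bitmap (the kernel side keeps
the bits in device->zone_info->empty_zones; the clearing on the allocation
path is assumed to happen in btrfs_load_block_group_zone_info(), as
described above):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t empty_zones = ~0ULL;	/* all zones start out empty */

static bool zone_is_empty(unsigned int zno)
{
	return empty_zones & (1ULL << zno);
}

/* What creating a block group does to the zones it now owns. */
static void claim_zones(unsigned int first, unsigned int count)
{
	for (unsigned int i = first; i < first + count; i++)
		empty_zones &= ~(1ULL << i);
}

int main(void)
{
	claim_zones(4, 2);	/* a new block group uses zones 4 and 5 */
	printf("zone 4 empty: %d, zone 6 empty: %d\n",
	       zone_is_empty(4), zone_is_empty(6));	/* prints 0 and 1 */
	return 0;
}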

>
>
>>+		break;
>>  	default:
>>  		BUG();
>>  	}
>>@@ -1517,6 +1575,9 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
>>  	search_start = dev_extent_search_start(device, search_start);
>>+	WARN_ON(device->zone_info &&
>>+		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));
>>+
>>  	path = btrfs_alloc_path();
>>  	if (!path)
>>  		return -ENOMEM;
>>@@ -4907,6 +4968,37 @@ static void init_alloc_chunk_ctl_policy_regular(
>>  	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
>>  }
>>+static void
>>+init_alloc_chunk_ctl_policy_zoned(struct btrfs_fs_devices *fs_devices,
>>+				  struct alloc_chunk_ctl *ctl)
>>+{
>>+	u64 zone_size = fs_devices->fs_info->zone_size;
>>+	u64 limit;
>>+	int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
>>+	int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
>>+	u64 min_chunk_size = min_data_stripes * zone_size;
>>+	u64 type = ctl->type;
>>+
>>+	ctl->max_stripe_size = zone_size;
>>+	if (type & BTRFS_BLOCK_GROUP_DATA) {
>>+		ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
>>+						 zone_size);
>>+	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
>>+		ctl->max_chunk_size = ctl->max_stripe_size;
>>+	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
>>+		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
>>+		ctl->devs_max = min_t(int, ctl->devs_max,
>>+				      BTRFS_MAX_DEVS_SYS_CHUNK);
>>+	}
>>+
>>+	/* We don't want a chunk larger than 10% of writable space */
>>+	limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
>>+			       zone_size),
>>+		    min_chunk_size);
>>+	ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
>>+	ctl->dev_extent_min = zone_size * ctl->dev_stripes;
>>+}
>>+
>>  static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
>>  				 struct alloc_chunk_ctl *ctl)
>>  {
>>@@ -4927,6 +5019,9 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
>>  	case BTRFS_CHUNK_ALLOC_REGULAR:
>>  		init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
>>  		break;
>>+	case BTRFS_CHUNK_ALLOC_ZONED:
>>+		init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
>>+		break;
>>  	default:
>>  		BUG();
>>  	}
>>@@ -5053,6 +5148,40 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
>>  	return 0;
>>  }
>>+static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
>>+				    struct btrfs_device_info *devices_info)
>>+{
>>+	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
>>+	/* number of stripes that count for block group size */
>>+	int data_stripes;
>>+
>>+	/*
>>+	 * It should hold because:
>>+	 *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
>>+	 */
>>+	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
>>+
>>+	ctl->stripe_size = zone_size;
>>+	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
>>+	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
>>+
>>+	/*
>>+	 * stripe_size is fixed in ZONED. Reduce ndevs instead.
>>+	 */
>>+	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
>>+		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
>>+					     ctl->stripe_size) + ctl->nparity,
>>+				     ctl->dev_stripes);
>>+		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
>>+		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
>>+		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
>>+	}
>>+
>>+	ctl->chunk_size = ctl->stripe_size * data_stripes;
>>+
>>+	return 0;
>>+}
>>+
>>  static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
>>  			      struct alloc_chunk_ctl *ctl,
>>  			      struct btrfs_device_info *devices_info)
>>@@ -5080,6 +5209,8 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
>>  	switch (fs_devices->chunk_alloc_policy) {
>>  	case BTRFS_CHUNK_ALLOC_REGULAR:
>>  		return decide_stripe_size_regular(ctl, devices_info);
>>+	case BTRFS_CHUNK_ALLOC_ZONED:
>>+		return decide_stripe_size_zoned(ctl, devices_info);
>>  	default:
>>  		BUG();
>>  	}
>>diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>>index 9c07b97a2260..0249aca668fb 100644
>>--- a/fs/btrfs/volumes.h
>>+++ b/fs/btrfs/volumes.h
>>@@ -213,6 +213,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
>>  enum btrfs_chunk_allocation_policy {
>>  	BTRFS_CHUNK_ALLOC_REGULAR,
>>+	BTRFS_CHUNK_ALLOC_ZONED,
>>  };
>>  struct btrfs_fs_devices {
>>diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
>>index d5487cba203b..4411d786597a 100644
>>--- a/fs/btrfs/zoned.c
>>+++ b/fs/btrfs/zoned.c
>>@@ -1,11 +1,13 @@
>>  // SPDX-License-Identifier: GPL-2.0
>>+#include <linux/bitops.h>
>>  #include <linux/slab.h>
>>  #include <linux/blkdev.h>
>>  #include "ctree.h"
>>  #include "volumes.h"
>>  #include "zoned.h"
>>  #include "rcu-string.h"
>>+#include "disk-io.h"
>>  /* Maximum number of zones to report per blkdev_report_zones() call */
>>  #define BTRFS_REPORT_NR_ZONES   4096
>>@@ -328,6 +330,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
>>  	fs_info->zone_size = zone_size;
>>  	fs_info->max_zone_append_size = max_zone_append_size;
>>+	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
>>  	btrfs_info(fs_info, "ZONED mode enabled, zone size %llu B",
>>  		   fs_info->zone_size);
>>@@ -607,3 +610,126 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
>>  				sb_zone << zone_sectors_shift, zone_sectors * 2,
>>  				GFP_NOFS);
>>  }
>>+
>>+/*
>>+ * btrfs_check_allocatable_zones - find allocatable zones within give region
>>+ * @device:	the device to allocate a region
>>+ * @hole_start: the position of the hole to allocate the region
>>+ * @num_bytes:	the size of wanted region
>>+ * @hole_size:	the size of hole
>>+ *
>>+ * Allocatable region should not contain any superblock locations.
>>+ */
>>+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
>>+				 u64 hole_end, u64 num_bytes)
>>+{
>>+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
>>+	u8 shift = zinfo->zone_size_shift;
>>+	u64 nzones = num_bytes >> shift;
>>+	u64 pos = hole_start;
>>+	u64 begin, end;
>>+	u64 sb_pos;
>>+	bool have_sb;
>>+	int i;
>>+
>>+	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
>>+	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
>>+
>>+	while (pos < hole_end) {
>>+		begin = pos >> shift;
>>+		end = begin + nzones;
>>+
>>+		if (end > zinfo->nr_zones)
>>+			return hole_end;
>>+
>>+		/* check if zones in the region are all empty */
>>+		if (btrfs_dev_is_sequential(device, pos) &&
>>+		    find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
>>+			pos += zinfo->zone_size;
>>+			continue;
>>+		}
>>+
>>+		have_sb = false;
>>+		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
>>+			sb_pos = sb_zone_number(zinfo->zone_size, i);
>>+			if (!(end < sb_pos || sb_pos + 1 < begin)) {
>>+				have_sb = true;
>>+				pos = (sb_pos + 2) << shift;
>>+				break;
>>+			}
>>+		}
>>+		if (!have_sb)
>>+			break;
>>+	}
>>+
>>+	return pos;
>>+}
>>+
>>+int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
>>+			    u64 length, u64 *bytes)
>>+{
>>+	int ret;
>>+
>>+	*bytes = 0;
>>+	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
>>+			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
>>+			       GFP_NOFS);
>>+	if (ret)
>>+		return ret;
>>+
>>+	*bytes = length;
>>+	while (length) {
>>+		btrfs_dev_set_zone_empty(device, physical);
>>+		physical += device->zone_info->zone_size;
>>+		length -= device->zone_info->zone_size;
>>+	}
>>+
>>+	return 0;
>>+}
>>+
>>+int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
>>+{
>>+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
>>+	u8 shift = zinfo->zone_size_shift;
>>+	unsigned long begin = start >> shift;
>>+	unsigned long end = (start + size) >> shift;
>>+	u64 pos;
>>+	int ret;
>>+
>>+	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
>>+	ASSERT(IS_ALIGNED(size, zinfo->zone_size));
>>+
>>+	if (end > zinfo->nr_zones)
>>+		return -ERANGE;
>>+
>>+	/* all the zones are conventional */
>>+	if (find_next_bit(zinfo->seq_zones, begin, end) == end)
>>+		return 0;
>>+
>
>This check is duplicated below.
>

This one checks whether the bits from begin to end are all cleared (= all
zones are conventional). OTOH, the one below checks whether all the bits
are set (= all zones are sequential).
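
For illustration, the two range predicates modelled in user space with one
bit per zone; seq_zones has a bit set for every sequential-write-required
zone and empty_zones for every empty zone, as in the patch (the helper
functions below are just a sketch, not the kernel bitmap API):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* No bit set in [begin, end) -> all zones in the range are conventional. */
static bool all_bits_clear(uint64_t map, unsigned int begin, unsigned int end)
{
	for (unsigned int i = begin; i < end; i++)
		if (map & (1ULL << i))
			return false;
	return true;
}

/* Every bit set in [begin, end). */
static bool all_bits_set(uint64_t map, unsigned int begin, unsigned int end)
{
	for (unsigned int i = begin; i < end; i++)
		if (!(map & (1ULL << i)))
			return false;
	return true;
}

int main(void)
{
	uint64_t seq_zones   = 0xF0;	/* zones 4-7 are sequential */
	uint64_t empty_zones = 0xF0;	/* ... and currently empty */

	/* Zones 0-3 are all conventional: nothing to check or reset. */
	printf("all conventional: %d\n", all_bits_clear(seq_zones, 0, 4));
	/* Zones 4-7 are all sequential and all empty: nothing to reset. */
	printf("all sequential and empty: %d\n",
	       all_bits_set(seq_zones, 4, 8) && all_bits_set(empty_zones, 4, 8));
	return 0;
}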

>>+	/* all the zones are sequential and empty */
>>+	if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end &&
>>+	    find_next_zero_bit(zinfo->empty_zones, begin, end) == end)
>>+		return 0;
>>+
>>+	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
>>+		u64 reset_bytes;
>>+
>>+		if (!btrfs_dev_is_sequential(device, pos) ||
>>+		    btrfs_dev_is_empty_zone(device, pos))
>>+			continue;
>>+
>>+		/* free regions should be empty */
>>+		btrfs_warn_in_rcu(
>>+			device->fs_info,
>>+			"resetting device %s zone %llu for allocation",
>>+			rcu_str_deref(device->name), pos >> shift);
>>+		WARN_ON_ONCE(1);
>>+
>>+		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
>>+					      &reset_bytes);
>>+		if (ret)
>>+			return ret;
>
>This seems bad, as we could just have corruption right?  So we're 
>resetting the zone which could lose us data right?  Shouldn't we just 
>bail here?  Thanks,
>
>Josef

Yes.. This happens 1) when we freed up the region but forgot to reset the
zones, or 2) when we lost the allocation information. In the first case,
it's OK to reset the zones here. In the second case, it's no more
dangerous than on a regular device, since regular btrfs would overwrite
the data in that case anyway. I admit it's much safer to bail out here,
so I can change this to return an error instead.
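
For illustration, a minimal user-space sketch of the "bail out" variant
agreed to above: if a zone inside a supposedly free region is sequential
but not empty, return an error instead of resetting it (the -EIO value
and the message are assumptions, not necessarily what a later revision
would use):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct zone {
	bool sequential;
	bool empty;
};

static int ensure_empty_zones(const struct zone *zones, int begin, int end)
{
	for (int i = begin; i < end; i++) {
		if (!zones[i].sequential || zones[i].empty)
			continue;
		/* Bail out rather than reset and risk losing data. */
		fprintf(stderr, "zone %d in a free region is not empty\n", i);
		return -EIO;
	}
	return 0;
}

int main(void)
{
	const struct zone zones[4] = {
		{ true, true }, { true, true }, { true, false }, { false, false },
	};

	/* Returns -EIO (-5) because zone 2 is sequential but not empty. */
	printf("ret = %d\n", ensure_empty_zones(zones, 0, 4));
	return 0;
}
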
David Sterba Nov. 3, 2020, 1:23 p.m. UTC | #3
On Fri, Oct 30, 2020 at 10:51:19PM +0900, Naohiro Aota wrote:
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1416,6 +1416,14 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
>  	return false;
>  }
>  
> +static inline u64 dev_extent_search_start_zoned(struct btrfs_device *device,
> +						u64 start)
> +{
> +	start = max_t(u64, start,
> +		      max_t(u64, device->zone_info->zone_size, SZ_1M));

Can you rewrite that as ifs?

> +	return btrfs_zone_align(device, start);
> +}
> +
>  static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
>  {
>  	switch (device->fs_devices->chunk_alloc_policy) {
> @@ -1426,11 +1434,57 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
>  		 * make sure to start at an offset of at least 1MB.
>  		 */
>  		return max_t(u64, start, SZ_1M);
> +	case BTRFS_CHUNK_ALLOC_ZONED:
> +		return dev_extent_search_start_zoned(device, start);
>  	default:
>  		BUG();
>  	}
>  }
>  
> +static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
> +					u64 *hole_start, u64 *hole_size,
> +					u64 num_bytes)
> +{
> +	u64 zone_size = device->zone_info->zone_size;
> +	u64 pos;
> +	int ret;
> +	int changed = 0;
> +
> +	ASSERT(IS_ALIGNED(*hole_start, zone_size));
> +
> +	while (*hole_size > 0) {
> +		pos = btrfs_find_allocatable_zones(device, *hole_start,
> +						   *hole_start + *hole_size,
> +						   num_bytes);
> +		if (pos != *hole_start) {
> +			*hole_size = *hole_start + *hole_size - pos;
> +			*hole_start = pos;
> +			changed = 1;
> +			if (*hole_size < num_bytes)
> +				break;
> +		}
> +
> +		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
> +
> +		/* range is ensured to be empty */

		/* Range ... */

> +		if (!ret)
> +			return changed;
> +
> +		/* given hole range was invalid (outside of device) */

		/* Given ... */

> +		if (ret == -ERANGE) {
> +			*hole_start += *hole_size;
> +			*hole_size = 0;
> +			return 1;
> +		}
> +
> +		*hole_start += zone_size;
> +		*hole_size -= zone_size;
> +		changed = 1;
> +	}
> +
> +	return changed;
> +}
> +
>  /**
>   * dev_extent_hole_check - check if specified hole is suitable for allocation
>   * @device:	the device which we have the hole
> @@ -1463,6 +1517,10 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
>  	case BTRFS_CHUNK_ALLOC_REGULAR:
>  		/* No extra check */
>  		break;
> +	case BTRFS_CHUNK_ALLOC_ZONED:
> +		changed |= dev_extent_hole_check_zoned(device, hole_start,
> +						       hole_size, num_bytes);
> +		break;
>  	default:
>  		BUG();
>  	}
> @@ -1517,6 +1575,9 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
>  
>  	search_start = dev_extent_search_start(device, search_start);
>  
> +	WARN_ON(device->zone_info &&
> +		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));
> +
>  	path = btrfs_alloc_path();
>  	if (!path)
>  		return -ENOMEM;
> @@ -4907,6 +4968,37 @@ static void init_alloc_chunk_ctl_policy_regular(
>  	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
>  }
>  
> +static void
> +init_alloc_chunk_ctl_policy_zoned(struct btrfs_fs_devices *fs_devices,
> +				  struct alloc_chunk_ctl *ctl)

static void init_alloc_chunk_ctl_policy_zoned(

Ie. type and name on one line

> +{
> +	u64 zone_size = fs_devices->fs_info->zone_size;
> +	u64 limit;
> +	int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
> +	int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
> +	u64 min_chunk_size = min_data_stripes * zone_size;
> +	u64 type = ctl->type;
> +
> +	ctl->max_stripe_size = zone_size;
> +	if (type & BTRFS_BLOCK_GROUP_DATA) {
> +		ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
> +						 zone_size);
> +	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
> +		ctl->max_chunk_size = ctl->max_stripe_size;
> +	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
> +		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
> +		ctl->devs_max = min_t(int, ctl->devs_max,
> +				      BTRFS_MAX_DEVS_SYS_CHUNK);
> +	}
> +
> +	/* We don't want a chunk larger than 10% of writable space */
> +	limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
> +			       zone_size),
> +		    min_chunk_size);
> +	ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
> +	ctl->dev_extent_min = zone_size * ctl->dev_stripes;
> +}
> +
>  static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
>  				 struct alloc_chunk_ctl *ctl)
>  {
> @@ -4927,6 +5019,9 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
>  	case BTRFS_CHUNK_ALLOC_REGULAR:
>  		init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
>  		break;
> +	case BTRFS_CHUNK_ALLOC_ZONED:
> +		init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
> +		break;
>  	default:
>  		BUG();
>  	}
> @@ -5053,6 +5148,40 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
>  	return 0;
>  }
>  
> +static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
> +				    struct btrfs_device_info *devices_info)
> +{
> +	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
> +	/* number of stripes that count for block group size */

	/* Number ... */

> +	int data_stripes;
> +
> +	/*
> +	 * It should hold because:
> +	 *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
> +	 */
> +	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
> +
> +	ctl->stripe_size = zone_size;
> +	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
> +	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
> +
> +	/*
> +	 * stripe_size is fixed in ZONED. Reduce ndevs instead.

/* One line comment if it fits to 80 cols */

> +	 */
> +	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
> +		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
> +					     ctl->stripe_size) + ctl->nparity,
> +				     ctl->dev_stripes);
> +		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
> +		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
> +		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
> +	}
> +
> +	ctl->chunk_size = ctl->stripe_size * data_stripes;
> +
> +	return 0;
> +}
> +
>  static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
>  			      struct alloc_chunk_ctl *ctl,
>  			      struct btrfs_device_info *devices_info)
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -607,3 +610,126 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
>  				sb_zone << zone_sectors_shift, zone_sectors * 2,
>  				GFP_NOFS);
>  }
> +
> +/*
> + * btrfs_check_allocatable_zones - find allocatable zones within give region
> + * @device:	the device to allocate a region
> + * @hole_start: the position of the hole to allocate the region
> + * @num_bytes:	the size of wanted region
> + * @hole_size:	the size of hole
> + *
> + * Allocatable region should not contain any superblock locations.
> + */
> +u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
> +				 u64 hole_end, u64 num_bytes)
> +{
> +	struct btrfs_zoned_device_info *zinfo = device->zone_info;
> +	u8 shift = zinfo->zone_size_shift;
> +	u64 nzones = num_bytes >> shift;
> +	u64 pos = hole_start;
> +	u64 begin, end;
> +	u64 sb_pos;
> +	bool have_sb;
> +	int i;
> +
> +	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
> +	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
> +
> +	while (pos < hole_end) {
> +		begin = pos >> shift;
> +		end = begin + nzones;
> +
> +		if (end > zinfo->nr_zones)
> +			return hole_end;
> +
> +		/* check if zones in the region are all empty */

		/* Check ... */

> +		if (btrfs_dev_is_sequential(device, pos) &&
> +		    find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
> +			pos += zinfo->zone_size;
> +			continue;
> +		}
> +
> +		have_sb = false;
> +		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
> +			sb_pos = sb_zone_number(zinfo->zone_size, i);
> +			if (!(end < sb_pos || sb_pos + 1 < begin)) {
> +				have_sb = true;
> +				pos = (sb_pos + 2) << shift;
> +				break;
> +			}
> +		}
> +		if (!have_sb)
> +			break;
> +	}
> +
> +	return pos;
> +}
> +
> +int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
> +			    u64 length, u64 *bytes)
> +{
> +	int ret;
> +
> +	*bytes = 0;
> +	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
> +			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
> +			       GFP_NOFS);
> +	if (ret)
> +		return ret;
> +
> +	*bytes = length;
> +	while (length) {
> +		btrfs_dev_set_zone_empty(device, physical);
> +		physical += device->zone_info->zone_size;
> +		length -= device->zone_info->zone_size;
> +	}
> +
> +	return 0;
> +}
> +
> +int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
> +{
> +	struct btrfs_zoned_device_info *zinfo = device->zone_info;
> +	u8 shift = zinfo->zone_size_shift;
> +	unsigned long begin = start >> shift;
> +	unsigned long end = (start + size) >> shift;
> +	u64 pos;
> +	int ret;
> +
> +	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
> +	ASSERT(IS_ALIGNED(size, zinfo->zone_size));
> +
> +	if (end > zinfo->nr_zones)
> +		return -ERANGE;
> +
> +	/* all the zones are conventional */

	/* All ... */

> +	if (find_next_bit(zinfo->seq_zones, begin, end) == end)
> +		return 0;
> +
> +	/* all the zones are sequential and empty */

	/* All ... */

> +	if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end &&
> +	    find_next_zero_bit(zinfo->empty_zones, begin, end) == end)
> +		return 0;
> +
> +	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
> +		u64 reset_bytes;
> +
> +		if (!btrfs_dev_is_sequential(device, pos) ||
> +		    btrfs_dev_is_empty_zone(device, pos))
> +			continue;
> +
> +		/* free regions should be empty */

		/* Free ... */

> +		btrfs_warn_in_rcu(
> +			device->fs_info,
> +			"resetting device %s zone %llu for allocation",

		"zoned: resetting device %s (devid %llu) zone %llu for allocation

> +			rcu_str_deref(device->name), pos >> shift);
> +		WARN_ON_ONCE(1);
> +
> +		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
> +					      &reset_bytes);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
> index 447c4e5ffcbb..24dd0c9561f9 100644
> --- a/fs/btrfs/zoned.h
> +++ b/fs/btrfs/zoned.h
> @@ -34,6 +34,11 @@ int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
>  			  u64 *bytenr_ret);
>  void btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
>  int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
> +u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
> +				 u64 hole_end, u64 num_bytes);
> +int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
> +			    u64 length, u64 *bytes);
> +int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
>  #else /* CONFIG_BLK_DEV_ZONED */
>  static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
>  				     struct blk_zone *zone)
> @@ -77,6 +82,23 @@ static inline int btrfs_reset_sb_log_zones(struct block_device *bdev,
>  {
>  	return 0;
>  }

newline

> +static inline u64 btrfs_find_allocatable_zones(struct btrfs_device *device,
> +					       u64 hole_start, u64 hole_end,
> +					       u64 num_bytes)
> +{
> +	return hole_start;
> +}

newline

> +static inline int btrfs_reset_device_zone(struct btrfs_device *device,
> +					  u64 physical, u64 length, u64 *bytes)
> +{
> +	*bytes = 0;
> +	return 0;
> +}

newline

> +static inline int btrfs_ensure_empty_zones(struct btrfs_device *device,
> +					   u64 start, u64 size)
> +{
> +	return 0;
> +}

newline

>  #endif
>  
>  static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
> @@ -155,4 +177,12 @@ static inline bool btrfs_check_super_location(struct btrfs_device *device,
>  	       !btrfs_dev_is_sequential(device, pos);
>  }
>  
> +static inline u64 btrfs_zone_align(struct btrfs_device *device, u64 pos)

I can't tell from the name what it does, something like
btrfs_align_offset_to_zone would be more clear.

> +{
> +	if (!device->zone_info)
> +		return pos;
> +
> +	return ALIGN(pos, device->zone_info->zone_size);
> +}
> +
>  #endif
> -- 
> 2.27.0

Patch

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index db884b96a5ea..78c62ef02e6f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1416,6 +1416,14 @@  static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
 	return false;
 }
 
+static inline u64 dev_extent_search_start_zoned(struct btrfs_device *device,
+						u64 start)
+{
+	start = max_t(u64, start,
+		      max_t(u64, device->zone_info->zone_size, SZ_1M));
+	return btrfs_zone_align(device, start);
+}
+
 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
 {
 	switch (device->fs_devices->chunk_alloc_policy) {
@@ -1426,11 +1434,57 @@  static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
 		 * make sure to start at an offset of at least 1MB.
 		 */
 		return max_t(u64, start, SZ_1M);
+	case BTRFS_CHUNK_ALLOC_ZONED:
+		return dev_extent_search_start_zoned(device, start);
 	default:
 		BUG();
 	}
 }
 
+static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
+					u64 *hole_start, u64 *hole_size,
+					u64 num_bytes)
+{
+	u64 zone_size = device->zone_info->zone_size;
+	u64 pos;
+	int ret;
+	int changed = 0;
+
+	ASSERT(IS_ALIGNED(*hole_start, zone_size));
+
+	while (*hole_size > 0) {
+		pos = btrfs_find_allocatable_zones(device, *hole_start,
+						   *hole_start + *hole_size,
+						   num_bytes);
+		if (pos != *hole_start) {
+			*hole_size = *hole_start + *hole_size - pos;
+			*hole_start = pos;
+			changed = 1;
+			if (*hole_size < num_bytes)
+				break;
+		}
+
+		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
+
+		/* range is ensured to be empty */
+		if (!ret)
+			return changed;
+
+		/* given hole range was invalid (outside of device) */
+		if (ret == -ERANGE) {
+			*hole_start += *hole_size;
+			*hole_size = 0;
+			return 1;
+		}
+
+		*hole_start += zone_size;
+		*hole_size -= zone_size;
+		changed = 1;
+	}
+
+	return changed;
+}
+
 /**
  * dev_extent_hole_check - check if specified hole is suitable for allocation
  * @device:	the device which we have the hole
@@ -1463,6 +1517,10 @@  static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
 	case BTRFS_CHUNK_ALLOC_REGULAR:
 		/* No extra check */
 		break;
+	case BTRFS_CHUNK_ALLOC_ZONED:
+		changed |= dev_extent_hole_check_zoned(device, hole_start,
+						       hole_size, num_bytes);
+		break;
 	default:
 		BUG();
 	}
@@ -1517,6 +1575,9 @@  static int find_free_dev_extent_start(struct btrfs_device *device,
 
 	search_start = dev_extent_search_start(device, search_start);
 
+	WARN_ON(device->zone_info &&
+		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -4907,6 +4968,37 @@  static void init_alloc_chunk_ctl_policy_regular(
 	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
 }
 
+static void
+init_alloc_chunk_ctl_policy_zoned(struct btrfs_fs_devices *fs_devices,
+				  struct alloc_chunk_ctl *ctl)
+{
+	u64 zone_size = fs_devices->fs_info->zone_size;
+	u64 limit;
+	int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
+	int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
+	u64 min_chunk_size = min_data_stripes * zone_size;
+	u64 type = ctl->type;
+
+	ctl->max_stripe_size = zone_size;
+	if (type & BTRFS_BLOCK_GROUP_DATA) {
+		ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
+						 zone_size);
+	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+		ctl->max_chunk_size = ctl->max_stripe_size;
+	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
+		ctl->devs_max = min_t(int, ctl->devs_max,
+				      BTRFS_MAX_DEVS_SYS_CHUNK);
+	}
+
+	/* We don't want a chunk larger than 10% of writable space */
+	limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
+			       zone_size),
+		    min_chunk_size);
+	ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
+	ctl->dev_extent_min = zone_size * ctl->dev_stripes;
+}
+
 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
 				 struct alloc_chunk_ctl *ctl)
 {
@@ -4927,6 +5019,9 @@  static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
 	case BTRFS_CHUNK_ALLOC_REGULAR:
 		init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
 		break;
+	case BTRFS_CHUNK_ALLOC_ZONED:
+		init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
+		break;
 	default:
 		BUG();
 	}
@@ -5053,6 +5148,40 @@  static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
 	return 0;
 }
 
+static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
+				    struct btrfs_device_info *devices_info)
+{
+	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
+	/* number of stripes that count for block group size */
+	int data_stripes;
+
+	/*
+	 * It should hold because:
+	 *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
+	 */
+	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
+
+	ctl->stripe_size = zone_size;
+	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+
+	/*
+	 * stripe_size is fixed in ZONED. Reduce ndevs instead.
+	 */
+	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
+		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
+					     ctl->stripe_size) + ctl->nparity,
+				     ctl->dev_stripes);
+		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
+	}
+
+	ctl->chunk_size = ctl->stripe_size * data_stripes;
+
+	return 0;
+}
+
 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
 			      struct alloc_chunk_ctl *ctl,
 			      struct btrfs_device_info *devices_info)
@@ -5080,6 +5209,8 @@  static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
 	switch (fs_devices->chunk_alloc_policy) {
 	case BTRFS_CHUNK_ALLOC_REGULAR:
 		return decide_stripe_size_regular(ctl, devices_info);
+	case BTRFS_CHUNK_ALLOC_ZONED:
+		return decide_stripe_size_zoned(ctl, devices_info);
 	default:
 		BUG();
 	}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 9c07b97a2260..0249aca668fb 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -213,6 +213,7 @@  BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
 
 enum btrfs_chunk_allocation_policy {
 	BTRFS_CHUNK_ALLOC_REGULAR,
+	BTRFS_CHUNK_ALLOC_ZONED,
 };
 
 struct btrfs_fs_devices {
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index d5487cba203b..4411d786597a 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1,11 +1,13 @@ 
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/bitops.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include "ctree.h"
 #include "volumes.h"
 #include "zoned.h"
 #include "rcu-string.h"
+#include "disk-io.h"
 
 /* Maximum number of zones to report per blkdev_report_zones() call */
 #define BTRFS_REPORT_NR_ZONES   4096
@@ -328,6 +330,7 @@  int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 
 	fs_info->zone_size = zone_size;
 	fs_info->max_zone_append_size = max_zone_append_size;
+	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
 
 	btrfs_info(fs_info, "ZONED mode enabled, zone size %llu B",
 		   fs_info->zone_size);
@@ -607,3 +610,126 @@  int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
 				sb_zone << zone_sectors_shift, zone_sectors * 2,
 				GFP_NOFS);
 }
+
+/*
+ * btrfs_check_allocatable_zones - find allocatable zones within give region
+ * @device:	the device to allocate a region
+ * @hole_start: the position of the hole to allocate the region
+ * @num_bytes:	the size of wanted region
+ * @hole_size:	the size of hole
+ *
+ * Allocatable region should not contain any superblock locations.
+ */
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+				 u64 hole_end, u64 num_bytes)
+{
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	u8 shift = zinfo->zone_size_shift;
+	u64 nzones = num_bytes >> shift;
+	u64 pos = hole_start;
+	u64 begin, end;
+	u64 sb_pos;
+	bool have_sb;
+	int i;
+
+	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
+	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+
+	while (pos < hole_end) {
+		begin = pos >> shift;
+		end = begin + nzones;
+
+		if (end > zinfo->nr_zones)
+			return hole_end;
+
+		/* check if zones in the region are all empty */
+		if (btrfs_dev_is_sequential(device, pos) &&
+		    find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
+			pos += zinfo->zone_size;
+			continue;
+		}
+
+		have_sb = false;
+		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+			sb_pos = sb_zone_number(zinfo->zone_size, i);
+			if (!(end < sb_pos || sb_pos + 1 < begin)) {
+				have_sb = true;
+				pos = (sb_pos + 2) << shift;
+				break;
+			}
+		}
+		if (!have_sb)
+			break;
+	}
+
+	return pos;
+}
+
+int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
+			    u64 length, u64 *bytes)
+{
+	int ret;
+
+	*bytes = 0;
+	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
+			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
+			       GFP_NOFS);
+	if (ret)
+		return ret;
+
+	*bytes = length;
+	while (length) {
+		btrfs_dev_set_zone_empty(device, physical);
+		physical += device->zone_info->zone_size;
+		length -= device->zone_info->zone_size;
+	}
+
+	return 0;
+}
+
+int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
+{
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	u8 shift = zinfo->zone_size_shift;
+	unsigned long begin = start >> shift;
+	unsigned long end = (start + size) >> shift;
+	u64 pos;
+	int ret;
+
+	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
+	ASSERT(IS_ALIGNED(size, zinfo->zone_size));
+
+	if (end > zinfo->nr_zones)
+		return -ERANGE;
+
+	/* all the zones are conventional */
+	if (find_next_bit(zinfo->seq_zones, begin, end) == end)
+		return 0;
+
+	/* all the zones are sequential and empty */
+	if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end &&
+	    find_next_zero_bit(zinfo->empty_zones, begin, end) == end)
+		return 0;
+
+	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
+		u64 reset_bytes;
+
+		if (!btrfs_dev_is_sequential(device, pos) ||
+		    btrfs_dev_is_empty_zone(device, pos))
+			continue;
+
+		/* free regions should be empty */
+		btrfs_warn_in_rcu(
+			device->fs_info,
+			"resetting device %s zone %llu for allocation",
+			rcu_str_deref(device->name), pos >> shift);
+		WARN_ON_ONCE(1);
+
+		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
+					      &reset_bytes);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 447c4e5ffcbb..24dd0c9561f9 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -34,6 +34,11 @@  int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
 			  u64 *bytenr_ret);
 void btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
 int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+				 u64 hole_end, u64 num_bytes);
+int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
+			    u64 length, u64 *bytes);
+int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
 				     struct blk_zone *zone)
@@ -77,6 +82,23 @@  static inline int btrfs_reset_sb_log_zones(struct block_device *bdev,
 {
 	return 0;
 }
+static inline u64 btrfs_find_allocatable_zones(struct btrfs_device *device,
+					       u64 hole_start, u64 hole_end,
+					       u64 num_bytes)
+{
+	return hole_start;
+}
+static inline int btrfs_reset_device_zone(struct btrfs_device *device,
+					  u64 physical, u64 length, u64 *bytes)
+{
+	*bytes = 0;
+	return 0;
+}
+static inline int btrfs_ensure_empty_zones(struct btrfs_device *device,
+					   u64 start, u64 size)
+{
+	return 0;
+}
 #endif
 
 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
@@ -155,4 +177,12 @@  static inline bool btrfs_check_super_location(struct btrfs_device *device,
 	       !btrfs_dev_is_sequential(device, pos);
 }
 
+static inline u64 btrfs_zone_align(struct btrfs_device *device, u64 pos)
+{
+	if (!device->zone_info)
+		return pos;
+
+	return ALIGN(pos, device->zone_info->zone_size);
+}
+
 #endif