[07/19] btrfs: do sequential extent allocation in HMZONED mode
diff mbox series

Message ID 20190607131025.31996-8-naohiro.aota@wdc.com
State New
Headers show
Series
  • btrfs zoned block device support
Related show

Commit Message

Naohiro Aota June 7, 2019, 1:10 p.m. UTC
On HMZONED drives, writes must always be sequential and directed at a block
group zone write pointer position. Thus, block allocation in a block group
must also be done sequentially using an allocation pointer equal to the
block group zone write pointer plus the number of blocks allocated but not
yet written.

Sequential allocation function find_free_extent_seq() bypass the checks in
find_free_extent() and increase the reserved byte counter by itself. It is
impossible to revert once allocated region in the sequential allocation,
since it might race with other allocations and leave an allocation hole,
which breaks the sequential write rule.

Furthermore, this commit introduce two new variable to struct
btrfs_block_group_cache. "wp_broken" indicate that write pointer is broken
(e.g. not synced on a RAID1 block group) and mark that block group read
only. "unusable" keeps track of the size of once allocated then freed
region. Such region is never usable until resetting underlying zones.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/ctree.h            |  24 +++
 fs/btrfs/extent-tree.c      | 378 ++++++++++++++++++++++++++++++++++--
 fs/btrfs/free-space-cache.c |  33 ++++
 fs/btrfs/free-space-cache.h |   5 +
 4 files changed, 426 insertions(+), 14 deletions(-)

Comments

Josef Bacik June 13, 2019, 2:07 p.m. UTC | #1
On Fri, Jun 07, 2019 at 10:10:13PM +0900, Naohiro Aota wrote:
> On HMZONED drives, writes must always be sequential and directed at a block
> group zone write pointer position. Thus, block allocation in a block group
> must also be done sequentially using an allocation pointer equal to the
> block group zone write pointer plus the number of blocks allocated but not
> yet written.
> 
> Sequential allocation function find_free_extent_seq() bypass the checks in
> find_free_extent() and increase the reserved byte counter by itself. It is
> impossible to revert once allocated region in the sequential allocation,
> since it might race with other allocations and leave an allocation hole,
> which breaks the sequential write rule.
> 
> Furthermore, this commit introduce two new variable to struct
> btrfs_block_group_cache. "wp_broken" indicate that write pointer is broken
> (e.g. not synced on a RAID1 block group) and mark that block group read
> only. "unusable" keeps track of the size of once allocated then freed
> region. Such region is never usable until resetting underlying zones.
> 
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> ---
>  fs/btrfs/ctree.h            |  24 +++
>  fs/btrfs/extent-tree.c      | 378 ++++++++++++++++++++++++++++++++++--
>  fs/btrfs/free-space-cache.c |  33 ++++
>  fs/btrfs/free-space-cache.h |   5 +
>  4 files changed, 426 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 6c00101407e4..f4bcd2a6ec12 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -582,6 +582,20 @@ struct btrfs_full_stripe_locks_tree {
>  	struct mutex lock;
>  };
>  
> +/* Block group allocation types */
> +enum btrfs_alloc_type {
> +
> +	/* Regular first fit allocation */
> +	BTRFS_ALLOC_FIT		= 0,
> +
> +	/*
> +	 * Sequential allocation: this is for HMZONED mode and
> +	 * will result in ignoring free space before a block
> +	 * group allocation offset.
> +	 */
> +	BTRFS_ALLOC_SEQ		= 1,
> +};
> +
>  struct btrfs_block_group_cache {
>  	struct btrfs_key key;
>  	struct btrfs_block_group_item item;
> @@ -592,6 +606,7 @@ struct btrfs_block_group_cache {
>  	u64 reserved;
>  	u64 delalloc_bytes;
>  	u64 bytes_super;
> +	u64 unusable;
>  	u64 flags;
>  	u64 cache_generation;
>  
> @@ -621,6 +636,7 @@ struct btrfs_block_group_cache {
>  	unsigned int iref:1;
>  	unsigned int has_caching_ctl:1;
>  	unsigned int removed:1;
> +	unsigned int wp_broken:1;
>  
>  	int disk_cache_state;
>  
> @@ -694,6 +710,14 @@ struct btrfs_block_group_cache {
>  
>  	/* Record locked full stripes for RAID5/6 block group */
>  	struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
> +
> +	/*
> +	 * Allocation offset for the block group to implement sequential
> +	 * allocation. This is used only with HMZONED mode enabled and if
> +	 * the block group resides on a sequential zone.
> +	 */
> +	enum btrfs_alloc_type alloc_type;
> +	u64 alloc_offset;
>  };
>  
>  /* delayed seq elem */
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 363db58f56b8..ebd0d6eae038 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -28,6 +28,7 @@
>  #include "sysfs.h"
>  #include "qgroup.h"
>  #include "ref-verify.h"
> +#include "rcu-string.h"
>  
>  #undef SCRAMBLE_DELAYED_REFS
>  
> @@ -590,6 +591,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
>  	struct btrfs_caching_control *caching_ctl;
>  	int ret = 0;
>  
> +	WARN_ON(cache->alloc_type == BTRFS_ALLOC_SEQ);
> +
>  	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
>  	if (!caching_ctl)
>  		return -ENOMEM;
> @@ -6555,6 +6558,19 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
>  	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
>  }
>  
> +static void __btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
> +				       u64 ram_bytes, u64 num_bytes,
> +				       int delalloc)
> +{
> +	struct btrfs_space_info *space_info = cache->space_info;
> +
> +	cache->reserved += num_bytes;
> +	space_info->bytes_reserved += num_bytes;
> +	update_bytes_may_use(space_info, -ram_bytes);
> +	if (delalloc)
> +		cache->delalloc_bytes += num_bytes;
> +}
> +
>  /**
>   * btrfs_add_reserved_bytes - update the block_group and space info counters
>   * @cache:	The cache we are manipulating
> @@ -6573,17 +6589,16 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
>  	struct btrfs_space_info *space_info = cache->space_info;
>  	int ret = 0;
>  
> +	/* should handled by find_free_extent_seq */
> +	WARN_ON(cache->alloc_type == BTRFS_ALLOC_SEQ);
> +
>  	spin_lock(&space_info->lock);
>  	spin_lock(&cache->lock);
> -	if (cache->ro) {
> +	if (cache->ro)
>  		ret = -EAGAIN;
> -	} else {
> -		cache->reserved += num_bytes;
> -		space_info->bytes_reserved += num_bytes;
> -		update_bytes_may_use(space_info, -ram_bytes);
> -		if (delalloc)
> -			cache->delalloc_bytes += num_bytes;
> -	}
> +	else
> +		__btrfs_add_reserved_bytes(cache, ram_bytes, num_bytes,
> +					   delalloc);
>  	spin_unlock(&cache->lock);
>  	spin_unlock(&space_info->lock);
>  	return ret;
> @@ -6701,9 +6716,13 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
>  			cache = btrfs_lookup_block_group(fs_info, start);
>  			BUG_ON(!cache); /* Logic error */
>  
> -			cluster = fetch_cluster_info(fs_info,
> -						     cache->space_info,
> -						     &empty_cluster);
> +			if (cache->alloc_type == BTRFS_ALLOC_FIT)
> +				cluster = fetch_cluster_info(fs_info,
> +							     cache->space_info,
> +							     &empty_cluster);
> +			else
> +				cluster = NULL;
> +
>  			empty_cluster <<= 1;
>  		}
>  
> @@ -6743,7 +6762,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
>  		space_info->max_extent_size = 0;
>  		percpu_counter_add_batch(&space_info->total_bytes_pinned,
>  			    -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
> -		if (cache->ro) {
> +		if (cache->ro || cache->alloc_type == BTRFS_ALLOC_SEQ) {
> +			/* need reset before reusing in ALLOC_SEQ BG */
>  			space_info->bytes_readonly += len;
>  			readonly = true;
>  		}
> @@ -7588,6 +7608,60 @@ static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
>  	return 0;
>  }
>  
> +/*
> + * Simple allocator for sequential only block group. It only allows
> + * sequential allocation. No need to play with trees. This function
> + * also reserve the bytes as in btrfs_add_reserved_bytes.
> + */
> +
> +static int find_free_extent_seq(struct btrfs_block_group_cache *cache,
> +				struct find_free_extent_ctl *ffe_ctl)
> +{
> +	struct btrfs_space_info *space_info = cache->space_info;
> +	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
> +	u64 start = cache->key.objectid;
> +	u64 num_bytes = ffe_ctl->num_bytes;
> +	u64 avail;
> +	int ret = 0;
> +
> +	/* Sanity check */
> +	if (cache->alloc_type != BTRFS_ALLOC_SEQ)
> +		return 1;
> +
> +	spin_lock(&space_info->lock);
> +	spin_lock(&cache->lock);
> +
> +	if (cache->ro) {
> +		ret = -EAGAIN;
> +		goto out;
> +	}
> +
> +	spin_lock(&ctl->tree_lock);
> +	avail = cache->key.offset - cache->alloc_offset;
> +	if (avail < num_bytes) {
> +		ffe_ctl->max_extent_size = avail;
> +		spin_unlock(&ctl->tree_lock);
> +		ret = 1;
> +		goto out;
> +	}
> +
> +	ffe_ctl->found_offset = start + cache->alloc_offset;
> +	cache->alloc_offset += num_bytes;
> +	ctl->free_space -= num_bytes;
> +	spin_unlock(&ctl->tree_lock);
> +
> +	BUG_ON(!IS_ALIGNED(ffe_ctl->found_offset,
> +			   cache->fs_info->stripesize));
> +	ffe_ctl->search_start = ffe_ctl->found_offset;
> +	__btrfs_add_reserved_bytes(cache, ffe_ctl->ram_bytes, num_bytes,
> +				   ffe_ctl->delalloc);
> +
> +out:
> +	spin_unlock(&cache->lock);
> +	spin_unlock(&space_info->lock);
> +	return ret;
> +}
> +
>  /*
>   * Return >0 means caller needs to re-search for free extent
>   * Return 0 means we have the needed free extent.
> @@ -7889,6 +7963,16 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
>  		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
>  			goto loop;
>  
> +		if (block_group->alloc_type == BTRFS_ALLOC_SEQ) {
> +			ret = find_free_extent_seq(block_group, &ffe_ctl);
> +			if (ret)
> +				goto loop;
> +			/* btrfs_find_space_for_alloc_seq should ensure
> +			 * that everything is OK and reserve the extent.
> +			 */
> +			goto nocheck;
> +		}
> +
>  		/*
>  		 * Ok we want to try and use the cluster allocator, so
>  		 * lets look there
> @@ -7944,6 +8028,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
>  					     num_bytes);
>  			goto loop;
>  		}
> +nocheck:
>  		btrfs_inc_block_group_reservations(block_group);
>  
>  		/* we are all good, lets return */
> @@ -9616,7 +9701,8 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
>  	}
>  
>  	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
> -		    cache->bytes_super - btrfs_block_group_used(&cache->item);
> +		    cache->bytes_super - cache->unusable -
> +		    btrfs_block_group_used(&cache->item);
>  	sinfo_used = btrfs_space_info_used(sinfo, true);
>  
>  	if (sinfo_used + num_bytes + min_allocable_bytes <=
> @@ -9766,6 +9852,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
>  	if (!--cache->ro) {
>  		num_bytes = cache->key.offset - cache->reserved -
>  			    cache->pinned - cache->bytes_super -
> +			    cache->unusable -
>  			    btrfs_block_group_used(&cache->item);

You've done this in a few places, but not all the places, most notably
btrfs_space_info_used() which is used in the space reservation code a lot.

>  		sinfo->bytes_readonly -= num_bytes;
>  		list_del_init(&cache->ro_list);
> @@ -10200,11 +10287,240 @@ static void link_block_group(struct btrfs_block_group_cache *cache)
>  	}
>  }
>  
> +static int
> +btrfs_get_block_group_alloc_offset(struct btrfs_block_group_cache *cache)
> +{
> +	struct btrfs_fs_info *fs_info = cache->fs_info;
> +	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
> +	struct extent_map *em;
> +	struct map_lookup *map;
> +	struct btrfs_device *device;
> +	u64 logical = cache->key.objectid;
> +	u64 length = cache->key.offset;
> +	u64 physical = 0;
> +	int ret, alloc_type;
> +	int i, j;
> +	u64 *alloc_offsets = NULL;
> +
> +#define WP_MISSING_DEV ((u64)-1)
> +
> +	/* Sanity check */
> +	if (!IS_ALIGNED(length, fs_info->zone_size)) {
> +		btrfs_err(fs_info, "unaligned block group at %llu + %llu",
> +			  logical, length);
> +		return -EIO;
> +	}
> +
> +	/* Get the chunk mapping */
> +	em_tree = &fs_info->mapping_tree.map_tree;
> +	read_lock(&em_tree->lock);
> +	em = lookup_extent_mapping(em_tree, logical, length);
> +	read_unlock(&em_tree->lock);
> +
> +	if (!em)
> +		return -EINVAL;
> +
> +	map = em->map_lookup;
> +
> +	/*
> +	 * Get the zone type: if the group is mapped to a non-sequential zone,
> +	 * there is no need for the allocation offset (fit allocation is OK).
> +	 */
> +	alloc_type = -1;
> +	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets),
> +				GFP_NOFS);
> +	if (!alloc_offsets) {
> +		free_extent_map(em);
> +		return -ENOMEM;
> +	}
> +
> +	for (i = 0; i < map->num_stripes; i++) {
> +		int is_sequential;
> +		struct blk_zone zone;
> +
> +		device = map->stripes[i].dev;
> +		physical = map->stripes[i].physical;
> +
> +		if (device->bdev == NULL) {
> +			alloc_offsets[i] = WP_MISSING_DEV;
> +			continue;
> +		}
> +
> +		is_sequential = btrfs_dev_is_sequential(device, physical);
> +		if (alloc_type == -1)
> +			alloc_type = is_sequential ?
> +					BTRFS_ALLOC_SEQ : BTRFS_ALLOC_FIT;
> +
> +		if ((is_sequential && alloc_type != BTRFS_ALLOC_SEQ) ||
> +		    (!is_sequential && alloc_type == BTRFS_ALLOC_SEQ)) {
> +			btrfs_err(fs_info, "found block group of mixed zone types");
> +			ret = -EIO;
> +			goto out;
> +		}
> +
> +		if (!is_sequential)
> +			continue;
> +
> +		/* this zone will be used for allocation, so mark this
> +		 * zone non-empty
> +		 */
> +		clear_bit(physical >> device->zone_size_shift,
> +			  device->empty_zones);
> +
> +		/*
> +		 * The group is mapped to a sequential zone. Get the zone write
> +		 * pointer to determine the allocation offset within the zone.
> +		 */
> +		WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
> +		ret = btrfs_get_dev_zone(device, physical, &zone, GFP_NOFS);
> +		if (ret == -EIO || ret == -EOPNOTSUPP) {
> +			ret = 0;
> +			alloc_offsets[i] = WP_MISSING_DEV;
> +			continue;
> +		} else if (ret) {
> +			goto out;
> +		}
> +
> +
> +		switch (zone.cond) {
> +		case BLK_ZONE_COND_OFFLINE:
> +		case BLK_ZONE_COND_READONLY:
> +			btrfs_err(fs_info, "Offline/readonly zone %llu",
> +				  physical >> device->zone_size_shift);
> +			alloc_offsets[i] = WP_MISSING_DEV;
> +			break;
> +		case BLK_ZONE_COND_EMPTY:
> +			alloc_offsets[i] = 0;
> +			break;
> +		case BLK_ZONE_COND_FULL:
> +			alloc_offsets[i] = fs_info->zone_size;
> +			break;
> +		default:
> +			/* Partially used zone */
> +			alloc_offsets[i] =
> +				((zone.wp - zone.start) << SECTOR_SHIFT);
> +			break;
> +		}
> +	}
> +
> +	if (alloc_type == BTRFS_ALLOC_FIT)
> +		goto out;
> +
> +	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
> +	case 0: /* single */
> +	case BTRFS_BLOCK_GROUP_DUP:
> +	case BTRFS_BLOCK_GROUP_RAID1:
> +		cache->alloc_offset = WP_MISSING_DEV;
> +		for (i = 0; i < map->num_stripes; i++) {
> +			if (alloc_offsets[i] == WP_MISSING_DEV)
> +				continue;
> +			if (cache->alloc_offset == WP_MISSING_DEV)
> +				cache->alloc_offset = alloc_offsets[i];
> +			if (alloc_offsets[i] == cache->alloc_offset)
> +				continue;
> +
> +			btrfs_err(fs_info,
> +				  "write pointer mismatch: block group %llu",
> +				  logical);
> +			cache->wp_broken = 1;
> +		}
> +		break;
> +	case BTRFS_BLOCK_GROUP_RAID0:
> +		cache->alloc_offset = 0;
> +		for (i = 0; i < map->num_stripes; i++) {
> +			if (alloc_offsets[i] == WP_MISSING_DEV) {
> +				btrfs_err(fs_info,
> +					  "cannot recover write pointer: block group %llu",
> +					  logical);
> +				cache->wp_broken = 1;
> +				continue;
> +			}
> +
> +			if (alloc_offsets[0] < alloc_offsets[i]) {
> +				btrfs_err(fs_info,
> +					  "write pointer mismatch: block group %llu",
> +					  logical);
> +				cache->wp_broken = 1;
> +				continue;
> +			}
> +
> +			cache->alloc_offset += alloc_offsets[i];
> +		}
> +		break;
> +	case BTRFS_BLOCK_GROUP_RAID10:
> +		/*
> +		 * Pass1: check write pointer of RAID1 level: each pointer
> +		 * should be equal.
> +		 */
> +		for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
> +			int base = i*map->sub_stripes;
> +			u64 offset = WP_MISSING_DEV;
> +
> +			for (j = 0; j < map->sub_stripes; j++) {
> +				if (alloc_offsets[base+j] == WP_MISSING_DEV)
> +					continue;
> +				if (offset == WP_MISSING_DEV)
> +					offset = alloc_offsets[base+j];
> +				if (alloc_offsets[base+j] == offset)
> +					continue;
> +
> +				btrfs_err(fs_info,
> +					  "write pointer mismatch: block group %llu",
> +					  logical);
> +				cache->wp_broken = 1;
> +			}
> +			for (j = 0; j < map->sub_stripes; j++)
> +				alloc_offsets[base+j] = offset;
> +		}
> +
> +		/* Pass2: check write pointer of RAID1 level */
> +		cache->alloc_offset = 0;
> +		for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
> +			int base = i*map->sub_stripes;
> +
> +			if (alloc_offsets[base] == WP_MISSING_DEV) {
> +				btrfs_err(fs_info,
> +					  "cannot recover write pointer: block group %llu",
> +					  logical);
> +				cache->wp_broken = 1;
> +				continue;
> +			}
> +
> +			if (alloc_offsets[0] < alloc_offsets[base]) {
> +				btrfs_err(fs_info,
> +					  "write pointer mismatch: block group %llu",
> +					  logical);
> +				cache->wp_broken = 1;
> +				continue;
> +			}
> +
> +			cache->alloc_offset += alloc_offsets[base];
> +		}
> +		break;
> +	case BTRFS_BLOCK_GROUP_RAID5:
> +	case BTRFS_BLOCK_GROUP_RAID6:
> +		/* RAID5/6 is not supported yet */
> +	default:
> +		btrfs_err(fs_info, "Unsupported profile on HMZONED %llu",
> +			map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +out:
> +	cache->alloc_type = alloc_type;
> +	kfree(alloc_offsets);
> +	free_extent_map(em);
> +
> +	return ret;
> +}
> +

Move this to the zoned device file that you create.

>  static struct btrfs_block_group_cache *
>  btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
>  			       u64 start, u64 size)
>  {
>  	struct btrfs_block_group_cache *cache;
> +	int ret;
>  
>  	cache = kzalloc(sizeof(*cache), GFP_NOFS);
>  	if (!cache)
> @@ -10238,6 +10554,16 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
>  	atomic_set(&cache->trimming, 0);
>  	mutex_init(&cache->free_space_lock);
>  	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
> +	cache->alloc_type = BTRFS_ALLOC_FIT;
> +	cache->alloc_offset = 0;
> +
> +	if (btrfs_fs_incompat(fs_info, HMZONED)) {
> +		ret = btrfs_get_block_group_alloc_offset(cache);
> +		if (ret) {
> +			kfree(cache);
> +			return NULL;
> +		}
> +	}
>  
>  	return cache;
>  }
> @@ -10310,6 +10636,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  	int need_clear = 0;
>  	u64 cache_gen;
>  	u64 feature;
> +	u64 unusable;
>  	int mixed;
>  
>  	feature = btrfs_super_incompat_flags(info->super_copy);
> @@ -10415,6 +10742,26 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  			free_excluded_extents(cache);
>  		}
>  
> +		switch (cache->alloc_type) {
> +		case BTRFS_ALLOC_FIT:
> +			unusable = cache->bytes_super;
> +			break;
> +		case BTRFS_ALLOC_SEQ:
> +			WARN_ON(cache->bytes_super != 0);
> +			unusable = cache->alloc_offset -
> +				btrfs_block_group_used(&cache->item);
> +			/* we only need ->free_space in ALLOC_SEQ BGs */
> +			cache->last_byte_to_unpin = (u64)-1;
> +			cache->cached = BTRFS_CACHE_FINISHED;
> +			cache->free_space_ctl->free_space =
> +				cache->key.offset - cache->alloc_offset;
> +			cache->unusable = unusable;
> +			free_excluded_extents(cache);
> +			break;
> +		default:
> +			BUG();
> +		}
> +
>  		ret = btrfs_add_block_group_cache(info, cache);
>  		if (ret) {
>  			btrfs_remove_free_space_cache(cache);
> @@ -10425,7 +10772,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  		trace_btrfs_add_block_group(info, cache, 0);
>  		update_space_info(info, cache->flags, found_key.offset,
>  				  btrfs_block_group_used(&cache->item),
> -				  cache->bytes_super, &space_info);
> +				  unusable, &space_info);
>  
>  		cache->space_info = space_info;
>  
> @@ -10438,6 +10785,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  			ASSERT(list_empty(&cache->bg_list));
>  			btrfs_mark_bg_unused(cache);
>  		}
> +
> +		if (cache->wp_broken)
> +			inc_block_group_ro(cache, 1);
>  	}
>  
>  	list_for_each_entry_rcu(space_info, &info->space_info, list) {
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index f74dc259307b..cc69dc71f4c1 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -2326,8 +2326,11 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
>  			   u64 offset, u64 bytes)
>  {
>  	struct btrfs_free_space *info;
> +	struct btrfs_block_group_cache *block_group = ctl->private;
>  	int ret = 0;
>  
> +	WARN_ON(block_group && block_group->alloc_type == BTRFS_ALLOC_SEQ);
> +
>  	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
>  	if (!info)
>  		return -ENOMEM;
> @@ -2376,6 +2379,28 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
>  	return ret;
>  }
>  
> +int __btrfs_add_free_space_seq(struct btrfs_block_group_cache *block_group,
> +			       u64 bytenr, u64 size)
> +{
> +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> +	u64 offset = bytenr - block_group->key.objectid;
> +	u64 to_free, to_unusable;
> +
> +	spin_lock(&ctl->tree_lock);
> +	if (offset >= block_group->alloc_offset)
> +		to_free = size;
> +	else if (offset + size <= block_group->alloc_offset)
> +		to_free = 0;
> +	else
> +		to_free = offset + size - block_group->alloc_offset;
> +	to_unusable = size - to_free;
> +	ctl->free_space += to_free;
> +	block_group->unusable += to_unusable;
> +	spin_unlock(&ctl->tree_lock);
> +	return 0;
> +
> +}
> +
>  int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
>  			    u64 offset, u64 bytes)
>  {
> @@ -2384,6 +2409,8 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
>  	int ret;
>  	bool re_search = false;
>  
> +	WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
> +

These should probably be ASSERT() right?  Want to make sure the developers
really notice a problem when testing.  Thanks,

Josef
David Sterba June 17, 2019, 10:30 p.m. UTC | #2
On Fri, Jun 07, 2019 at 10:10:13PM +0900, Naohiro Aota wrote:
> On HMZONED drives, writes must always be sequential and directed at a block
> group zone write pointer position. Thus, block allocation in a block group
> must also be done sequentially using an allocation pointer equal to the
> block group zone write pointer plus the number of blocks allocated but not
> yet written.
> 
> Sequential allocation function find_free_extent_seq() bypass the checks in
> find_free_extent() and increase the reserved byte counter by itself. It is
> impossible to revert once allocated region in the sequential allocation,
> since it might race with other allocations and leave an allocation hole,
> which breaks the sequential write rule.
> 
> Furthermore, this commit introduce two new variable to struct
> btrfs_block_group_cache. "wp_broken" indicate that write pointer is broken
> (e.g. not synced on a RAID1 block group) and mark that block group read
> only. "unusable" keeps track of the size of once allocated then freed
> region. Such region is never usable until resetting underlying zones.
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> ---
>  fs/btrfs/ctree.h            |  24 +++
>  fs/btrfs/extent-tree.c      | 378 ++++++++++++++++++++++++++++++++++--
>  fs/btrfs/free-space-cache.c |  33 ++++
>  fs/btrfs/free-space-cache.h |   5 +
>  4 files changed, 426 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 6c00101407e4..f4bcd2a6ec12 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -582,6 +582,20 @@ struct btrfs_full_stripe_locks_tree {
>  	struct mutex lock;
>  };
>  
> +/* Block group allocation types */
> +enum btrfs_alloc_type {
> +
> +	/* Regular first fit allocation */
> +	BTRFS_ALLOC_FIT		= 0,
> +
> +	/*
> +	 * Sequential allocation: this is for HMZONED mode and
> +	 * will result in ignoring free space before a block
> +	 * group allocation offset.

Please format the comments to 80 columns

> +	 */
> +	BTRFS_ALLOC_SEQ		= 1,
> +};
> +
>  struct btrfs_block_group_cache {
>  	struct btrfs_key key;
>  	struct btrfs_block_group_item item;
> @@ -592,6 +606,7 @@ struct btrfs_block_group_cache {
>  	u64 reserved;
>  	u64 delalloc_bytes;
>  	u64 bytes_super;
> +	u64 unusable;

'unusable' is specific to the zones, so 'zone_unusable' would make it
clear. The terminilogy around space is confusing already (we have
unused, free, reserved, allocated, slack).

>  	u64 flags;
>  	u64 cache_generation;
>  
> @@ -621,6 +636,7 @@ struct btrfs_block_group_cache {
>  	unsigned int iref:1;
>  	unsigned int has_caching_ctl:1;
>  	unsigned int removed:1;
> +	unsigned int wp_broken:1;
>  
>  	int disk_cache_state;
>  
> @@ -694,6 +710,14 @@ struct btrfs_block_group_cache {
>  
>  	/* Record locked full stripes for RAID5/6 block group */
>  	struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
> +
> +	/*
> +	 * Allocation offset for the block group to implement sequential
> +	 * allocation. This is used only with HMZONED mode enabled and if
> +	 * the block group resides on a sequential zone.
> +	 */
> +	enum btrfs_alloc_type alloc_type;
> +	u64 alloc_offset;
>  };
>  
>  /* delayed seq elem */
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 363db58f56b8..ebd0d6eae038 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -28,6 +28,7 @@
>  #include "sysfs.h"
>  #include "qgroup.h"
>  #include "ref-verify.h"
> +#include "rcu-string.h"
>  
>  #undef SCRAMBLE_DELAYED_REFS
>  
> @@ -590,6 +591,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
>  	struct btrfs_caching_control *caching_ctl;
>  	int ret = 0;
>  
> +	WARN_ON(cache->alloc_type == BTRFS_ALLOC_SEQ);
> +
>  	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
>  	if (!caching_ctl)
>  		return -ENOMEM;
> @@ -6555,6 +6558,19 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
>  	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
>  }
>  
> +static void __btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
> +				       u64 ram_bytes, u64 num_bytes,
> +				       int delalloc)
> +{
> +	struct btrfs_space_info *space_info = cache->space_info;
> +
> +	cache->reserved += num_bytes;
> +	space_info->bytes_reserved += num_bytes;
> +	update_bytes_may_use(space_info, -ram_bytes);
> +	if (delalloc)
> +		cache->delalloc_bytes += num_bytes;
> +}
> +
>  /**
>   * btrfs_add_reserved_bytes - update the block_group and space info counters
>   * @cache:	The cache we are manipulating
> @@ -6573,17 +6589,16 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
>  	struct btrfs_space_info *space_info = cache->space_info;
>  	int ret = 0;
>  
> +	/* should handled by find_free_extent_seq */
> +	WARN_ON(cache->alloc_type == BTRFS_ALLOC_SEQ);
> +
>  	spin_lock(&space_info->lock);
>  	spin_lock(&cache->lock);
> -	if (cache->ro) {
> +	if (cache->ro)
>  		ret = -EAGAIN;
> -	} else {
> -		cache->reserved += num_bytes;
> -		space_info->bytes_reserved += num_bytes;
> -		update_bytes_may_use(space_info, -ram_bytes);
> -		if (delalloc)
> -			cache->delalloc_bytes += num_bytes;
> -	}
> +	else
> +		__btrfs_add_reserved_bytes(cache, ram_bytes, num_bytes,
> +					   delalloc);
>  	spin_unlock(&cache->lock);
>  	spin_unlock(&space_info->lock);
>  	return ret;
> @@ -6701,9 +6716,13 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
>  			cache = btrfs_lookup_block_group(fs_info, start);
>  			BUG_ON(!cache); /* Logic error */
>  
> -			cluster = fetch_cluster_info(fs_info,
> -						     cache->space_info,
> -						     &empty_cluster);
> +			if (cache->alloc_type == BTRFS_ALLOC_FIT)
> +				cluster = fetch_cluster_info(fs_info,
> +							     cache->space_info,
> +							     &empty_cluster);
> +			else
> +				cluster = NULL;
> +
>  			empty_cluster <<= 1;
>  		}
>  
> @@ -6743,7 +6762,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
>  		space_info->max_extent_size = 0;
>  		percpu_counter_add_batch(&space_info->total_bytes_pinned,
>  			    -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
> -		if (cache->ro) {
> +		if (cache->ro || cache->alloc_type == BTRFS_ALLOC_SEQ) {
> +			/* need reset before reusing in ALLOC_SEQ BG */
>  			space_info->bytes_readonly += len;
>  			readonly = true;
>  		}
> @@ -7588,6 +7608,60 @@ static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
>  	return 0;
>  }
>  
> +/*
> + * Simple allocator for sequential only block group. It only allows
> + * sequential allocation. No need to play with trees. This function
> + * also reserve the bytes as in btrfs_add_reserved_bytes.
> + */
> +
> +static int find_free_extent_seq(struct btrfs_block_group_cache *cache,
> +				struct find_free_extent_ctl *ffe_ctl)
> +{
> +	struct btrfs_space_info *space_info = cache->space_info;
> +	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
> +	u64 start = cache->key.objectid;
> +	u64 num_bytes = ffe_ctl->num_bytes;
> +	u64 avail;
> +	int ret = 0;
> +
> +	/* Sanity check */
> +	if (cache->alloc_type != BTRFS_ALLOC_SEQ)
> +		return 1;
> +
> +	spin_lock(&space_info->lock);
> +	spin_lock(&cache->lock);
> +
> +	if (cache->ro) {
> +		ret = -EAGAIN;
> +		goto out;
> +	}
> +
> +	spin_lock(&ctl->tree_lock);
> +	avail = cache->key.offset - cache->alloc_offset;
> +	if (avail < num_bytes) {
> +		ffe_ctl->max_extent_size = avail;
> +		spin_unlock(&ctl->tree_lock);
> +		ret = 1;
> +		goto out;
> +	}
> +
> +	ffe_ctl->found_offset = start + cache->alloc_offset;
> +	cache->alloc_offset += num_bytes;
> +	ctl->free_space -= num_bytes;
> +	spin_unlock(&ctl->tree_lock);
> +
> +	BUG_ON(!IS_ALIGNED(ffe_ctl->found_offset,
> +			   cache->fs_info->stripesize));
> +	ffe_ctl->search_start = ffe_ctl->found_offset;
> +	__btrfs_add_reserved_bytes(cache, ffe_ctl->ram_bytes, num_bytes,
> +				   ffe_ctl->delalloc);
> +
> +out:
> +	spin_unlock(&cache->lock);
> +	spin_unlock(&space_info->lock);
> +	return ret;
> +}
> +
>  /*
>   * Return >0 means caller needs to re-search for free extent
>   * Return 0 means we have the needed free extent.
> @@ -7889,6 +7963,16 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
>  		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
>  			goto loop;
>  
> +		if (block_group->alloc_type == BTRFS_ALLOC_SEQ) {
> +			ret = find_free_extent_seq(block_group, &ffe_ctl);
> +			if (ret)
> +				goto loop;
> +			/* btrfs_find_space_for_alloc_seq should ensure
> +			 * that everything is OK and reserve the extent.
> +			 */

Please use the

/*
 * comment
 */

style

> +			goto nocheck;
> +		}
> +
>  		/*
>  		 * Ok we want to try and use the cluster allocator, so
>  		 * lets look there
> @@ -7944,6 +8028,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
>  					     num_bytes);
>  			goto loop;
>  		}
> +nocheck:
>  		btrfs_inc_block_group_reservations(block_group);
>  
>  		/* we are all good, lets return */
> @@ -9616,7 +9701,8 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
>  	}
>  
>  	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
> -		    cache->bytes_super - btrfs_block_group_used(&cache->item);
> +		    cache->bytes_super - cache->unusable -
> +		    btrfs_block_group_used(&cache->item);
>  	sinfo_used = btrfs_space_info_used(sinfo, true);
>  
>  	if (sinfo_used + num_bytes + min_allocable_bytes <=
> @@ -9766,6 +9852,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
>  	if (!--cache->ro) {
>  		num_bytes = cache->key.offset - cache->reserved -
>  			    cache->pinned - cache->bytes_super -
> +			    cache->unusable -
>  			    btrfs_block_group_used(&cache->item);
>  		sinfo->bytes_readonly -= num_bytes;
>  		list_del_init(&cache->ro_list);
> @@ -10200,11 +10287,240 @@ static void link_block_group(struct btrfs_block_group_cache *cache)
>  	}
>  }
>  
> +static int
> +btrfs_get_block_group_alloc_offset(struct btrfs_block_group_cache *cache)
> +{
> +	struct btrfs_fs_info *fs_info = cache->fs_info;
> +	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
> +	struct extent_map *em;
> +	struct map_lookup *map;
> +	struct btrfs_device *device;
> +	u64 logical = cache->key.objectid;
> +	u64 length = cache->key.offset;
> +	u64 physical = 0;
> +	int ret, alloc_type;
> +	int i, j;
> +	u64 *alloc_offsets = NULL;
> +
> +#define WP_MISSING_DEV ((u64)-1)

Please move the definition to the beginning of the file

> +
> +	/* Sanity check */
> +	if (!IS_ALIGNED(length, fs_info->zone_size)) {
> +		btrfs_err(fs_info, "unaligned block group at %llu + %llu",
> +			  logical, length);
> +		return -EIO;
> +	}
> +
> +	/* Get the chunk mapping */
> +	em_tree = &fs_info->mapping_tree.map_tree;
> +	read_lock(&em_tree->lock);
> +	em = lookup_extent_mapping(em_tree, logical, length);
> +	read_unlock(&em_tree->lock);
> +
> +	if (!em)
> +		return -EINVAL;
> +
> +	map = em->map_lookup;
> +
> +	/*
> +	 * Get the zone type: if the group is mapped to a non-sequential zone,
> +	 * there is no need for the allocation offset (fit allocation is OK).
> +	 */
> +	alloc_type = -1;
> +	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets),
> +				GFP_NOFS);
> +	if (!alloc_offsets) {
> +		free_extent_map(em);
> +		return -ENOMEM;
> +	}
> +
> +	for (i = 0; i < map->num_stripes; i++) {
> +		int is_sequential;

Please use bool instead of int

> +		struct blk_zone zone;
> +
> +		device = map->stripes[i].dev;
> +		physical = map->stripes[i].physical;
> +
> +		if (device->bdev == NULL) {
> +			alloc_offsets[i] = WP_MISSING_DEV;
> +			continue;
> +		}
> +
> +		is_sequential = btrfs_dev_is_sequential(device, physical);
> +		if (alloc_type == -1)
> +			alloc_type = is_sequential ?
> +					BTRFS_ALLOC_SEQ : BTRFS_ALLOC_FIT;
> +
> +		if ((is_sequential && alloc_type != BTRFS_ALLOC_SEQ) ||
> +		    (!is_sequential && alloc_type == BTRFS_ALLOC_SEQ)) {
> +			btrfs_err(fs_info, "found block group of mixed zone types");
> +			ret = -EIO;
> +			goto out;
> +		}
> +
> +		if (!is_sequential)
> +			continue;
> +
> +		/* this zone will be used for allocation, so mark this
> +		 * zone non-empty
> +		 */
> +		clear_bit(physical >> device->zone_size_shift,
> +			  device->empty_zones);
> +
> +		/*
> +		 * The group is mapped to a sequential zone. Get the zone write
> +		 * pointer to determine the allocation offset within the zone.
> +		 */
> +		WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
> +		ret = btrfs_get_dev_zone(device, physical, &zone, GFP_NOFS);
> +		if (ret == -EIO || ret == -EOPNOTSUPP) {
> +			ret = 0;
> +			alloc_offsets[i] = WP_MISSING_DEV;
> +			continue;
> +		} else if (ret) {
> +			goto out;
> +		}
> +
> +
> +		switch (zone.cond) {
> +		case BLK_ZONE_COND_OFFLINE:
> +		case BLK_ZONE_COND_READONLY:
> +			btrfs_err(fs_info, "Offline/readonly zone %llu",
> +				  physical >> device->zone_size_shift);
> +			alloc_offsets[i] = WP_MISSING_DEV;
> +			break;
> +		case BLK_ZONE_COND_EMPTY:
> +			alloc_offsets[i] = 0;
> +			break;
> +		case BLK_ZONE_COND_FULL:
> +			alloc_offsets[i] = fs_info->zone_size;
> +			break;
> +		default:
> +			/* Partially used zone */
> +			alloc_offsets[i] =
> +				((zone.wp - zone.start) << SECTOR_SHIFT);
> +			break;
> +		}
> +	}
> +
> +	if (alloc_type == BTRFS_ALLOC_FIT)
> +		goto out;
> +
> +	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
> +	case 0: /* single */
> +	case BTRFS_BLOCK_GROUP_DUP:
> +	case BTRFS_BLOCK_GROUP_RAID1:
> +		cache->alloc_offset = WP_MISSING_DEV;
> +		for (i = 0; i < map->num_stripes; i++) {
> +			if (alloc_offsets[i] == WP_MISSING_DEV)
> +				continue;
> +			if (cache->alloc_offset == WP_MISSING_DEV)
> +				cache->alloc_offset = alloc_offsets[i];
> +			if (alloc_offsets[i] == cache->alloc_offset)
> +				continue;
> +
> +			btrfs_err(fs_info,
> +				  "write pointer mismatch: block group %llu",
> +				  logical);
> +			cache->wp_broken = 1;
> +		}
> +		break;
> +	case BTRFS_BLOCK_GROUP_RAID0:
> +		cache->alloc_offset = 0;
> +		for (i = 0; i < map->num_stripes; i++) {
> +			if (alloc_offsets[i] == WP_MISSING_DEV) {
> +				btrfs_err(fs_info,
> +					  "cannot recover write pointer: block group %llu",
> +					  logical);
> +				cache->wp_broken = 1;
> +				continue;
> +			}
> +
> +			if (alloc_offsets[0] < alloc_offsets[i]) {
> +				btrfs_err(fs_info,
> +					  "write pointer mismatch: block group %llu",
> +					  logical);
> +				cache->wp_broken = 1;
> +				continue;
> +			}
> +
> +			cache->alloc_offset += alloc_offsets[i];
> +		}
> +		break;
> +	case BTRFS_BLOCK_GROUP_RAID10:
> +		/*
> +		 * Pass1: check write pointer of RAID1 level: each pointer
> +		 * should be equal.
> +		 */
> +		for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
> +			int base = i*map->sub_stripes;

spaces around binary operators

			int base = i * map->sub_stripes;

> +			u64 offset = WP_MISSING_DEV;
> +
> +			for (j = 0; j < map->sub_stripes; j++) {
> +				if (alloc_offsets[base+j] == WP_MISSING_DEV)

here and below

> +					continue;
> +				if (offset == WP_MISSING_DEV)
> +					offset = alloc_offsets[base+j];
> +				if (alloc_offsets[base+j] == offset)
> +					continue;
> +
> +				btrfs_err(fs_info,
> +					  "write pointer mismatch: block group %llu",
> +					  logical);
> +				cache->wp_broken = 1;
> +			}
> +			for (j = 0; j < map->sub_stripes; j++)
> +				alloc_offsets[base+j] = offset;
> +		}
> +
> +		/* Pass2: check write pointer of RAID1 level */
> +		cache->alloc_offset = 0;
> +		for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
> +			int base = i*map->sub_stripes;
> +
> +			if (alloc_offsets[base] == WP_MISSING_DEV) {
> +				btrfs_err(fs_info,
> +					  "cannot recover write pointer: block group %llu",
> +					  logical);
> +				cache->wp_broken = 1;
> +				continue;
> +			}
> +
> +			if (alloc_offsets[0] < alloc_offsets[base]) {
> +				btrfs_err(fs_info,
> +					  "write pointer mismatch: block group %llu",
> +					  logical);
> +				cache->wp_broken = 1;
> +				continue;
> +			}
> +
> +			cache->alloc_offset += alloc_offsets[base];
> +		}
> +		break;
> +	case BTRFS_BLOCK_GROUP_RAID5:
> +	case BTRFS_BLOCK_GROUP_RAID6:
> +		/* RAID5/6 is not supported yet */
> +	default:
> +		btrfs_err(fs_info, "Unsupported profile on HMZONED %llu",
> +			map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +out:
> +	cache->alloc_type = alloc_type;
> +	kfree(alloc_offsets);
> +	free_extent_map(em);
> +
> +	return ret;
> +}
> +
>  static struct btrfs_block_group_cache *
>  btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
>  			       u64 start, u64 size)
>  {
>  	struct btrfs_block_group_cache *cache;
> +	int ret;
>  
>  	cache = kzalloc(sizeof(*cache), GFP_NOFS);
>  	if (!cache)
> @@ -10238,6 +10554,16 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
>  	atomic_set(&cache->trimming, 0);
>  	mutex_init(&cache->free_space_lock);
>  	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
> +	cache->alloc_type = BTRFS_ALLOC_FIT;
> +	cache->alloc_offset = 0;
> +
> +	if (btrfs_fs_incompat(fs_info, HMZONED)) {
> +		ret = btrfs_get_block_group_alloc_offset(cache);
> +		if (ret) {
> +			kfree(cache);
> +			return NULL;
> +		}
> +	}
>  
>  	return cache;
>  }
> @@ -10310,6 +10636,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  	int need_clear = 0;
>  	u64 cache_gen;
>  	u64 feature;
> +	u64 unusable;
>  	int mixed;
>  
>  	feature = btrfs_super_incompat_flags(info->super_copy);
> @@ -10415,6 +10742,26 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  			free_excluded_extents(cache);
>  		}
>  
> +		switch (cache->alloc_type) {
> +		case BTRFS_ALLOC_FIT:
> +			unusable = cache->bytes_super;
> +			break;
> +		case BTRFS_ALLOC_SEQ:
> +			WARN_ON(cache->bytes_super != 0);
> +			unusable = cache->alloc_offset -
> +				btrfs_block_group_used(&cache->item);
> +			/* we only need ->free_space in ALLOC_SEQ BGs */
> +			cache->last_byte_to_unpin = (u64)-1;
> +			cache->cached = BTRFS_CACHE_FINISHED;
> +			cache->free_space_ctl->free_space =
> +				cache->key.offset - cache->alloc_offset;
> +			cache->unusable = unusable;
> +			free_excluded_extents(cache);
> +			break;
> +		default:
> +			BUG();

An unexpeced value of allocation is found, this needs a message and
proper error handling, btrfs_read_block_groups is called from mount path
so the recovery should be possible.

> +		}
> +
>  		ret = btrfs_add_block_group_cache(info, cache);
>  		if (ret) {
>  			btrfs_remove_free_space_cache(cache);
> @@ -10425,7 +10772,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  		trace_btrfs_add_block_group(info, cache, 0);
>  		update_space_info(info, cache->flags, found_key.offset,
>  				  btrfs_block_group_used(&cache->item),
> -				  cache->bytes_super, &space_info);
> +				  unusable, &space_info);
>  
>  		cache->space_info = space_info;
>  
> @@ -10438,6 +10785,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  			ASSERT(list_empty(&cache->bg_list));
>  			btrfs_mark_bg_unused(cache);
>  		}
> +
> +		if (cache->wp_broken)
> +			inc_block_group_ro(cache, 1);
>  	}
>  
>  	list_for_each_entry_rcu(space_info, &info->space_info, list) {
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index f74dc259307b..cc69dc71f4c1 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -2326,8 +2326,11 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
>  			   u64 offset, u64 bytes)
>  {
>  	struct btrfs_free_space *info;
> +	struct btrfs_block_group_cache *block_group = ctl->private;
>  	int ret = 0;
>  
> +	WARN_ON(block_group && block_group->alloc_type == BTRFS_ALLOC_SEQ);
> +
>  	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
>  	if (!info)
>  		return -ENOMEM;
> @@ -2376,6 +2379,28 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
>  	return ret;
>  }
>  
> +int __btrfs_add_free_space_seq(struct btrfs_block_group_cache *block_group,
> +			       u64 bytenr, u64 size)
> +{
> +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> +	u64 offset = bytenr - block_group->key.objectid;
> +	u64 to_free, to_unusable;
> +
> +	spin_lock(&ctl->tree_lock);
> +	if (offset >= block_group->alloc_offset)
> +		to_free = size;
> +	else if (offset + size <= block_group->alloc_offset)
> +		to_free = 0;
> +	else
> +		to_free = offset + size - block_group->alloc_offset;
> +	to_unusable = size - to_free;
> +	ctl->free_space += to_free;
> +	block_group->unusable += to_unusable;
> +	spin_unlock(&ctl->tree_lock);
> +	return 0;
> +
> +}
> +
>  int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
>  			    u64 offset, u64 bytes)
>  {
> @@ -2384,6 +2409,8 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
>  	int ret;
>  	bool re_search = false;
>  
> +	WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
> +
>  	spin_lock(&ctl->tree_lock);
>  
>  again:
> @@ -2619,6 +2646,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
>  	u64 align_gap = 0;
>  	u64 align_gap_len = 0;
>  
> +	WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
> +
>  	spin_lock(&ctl->tree_lock);
>  	entry = find_free_space(ctl, &offset, &bytes_search,
>  				block_group->full_stripe_len, max_extent_size);
> @@ -2738,6 +2767,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
>  	struct rb_node *node;
>  	u64 ret = 0;
>  
> +	WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
> +
>  	spin_lock(&cluster->lock);
>  	if (bytes > cluster->max_size)
>  		goto out;
> @@ -3384,6 +3415,8 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
>  {
>  	int ret;
>  
> +	WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
> +
>  	*trimmed = 0;
>  
>  	spin_lock(&block_group->lock);
> diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
> index 8760acb55ffd..d30667784f73 100644
> --- a/fs/btrfs/free-space-cache.h
> +++ b/fs/btrfs/free-space-cache.h
> @@ -73,10 +73,15 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
>  int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
>  			   struct btrfs_free_space_ctl *ctl,
>  			   u64 bytenr, u64 size);
> +int __btrfs_add_free_space_seq(struct btrfs_block_group_cache *block_group,
> +			       u64 bytenr, u64 size);
>  static inline int
>  btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
>  		     u64 bytenr, u64 size)
>  {
> +	if (block_group->alloc_type == BTRFS_ALLOC_SEQ)
> +		return __btrfs_add_free_space_seq(block_group, bytenr, size);
> +
>  	return __btrfs_add_free_space(block_group->fs_info,
>  				      block_group->free_space_ctl,
>  				      bytenr, size);
> -- 
> 2.21.0
Naohiro Aota June 18, 2019, 8:28 a.m. UTC | #3
On 2019/06/13 23:07, Josef Bacik wrote:
> On Fri, Jun 07, 2019 at 10:10:13PM +0900, Naohiro Aota wrote:
>> @@ -9616,7 +9701,8 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
>>   	}
>>   
>>   	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
>> -		    cache->bytes_super - btrfs_block_group_used(&cache->item);
>> +		    cache->bytes_super - cache->unusable -
>> +		    btrfs_block_group_used(&cache->item);
>>   	sinfo_used = btrfs_space_info_used(sinfo, true);
>>   
>>   	if (sinfo_used + num_bytes + min_allocable_bytes <=
>> @@ -9766,6 +9852,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
>>   	if (!--cache->ro) {
>>   		num_bytes = cache->key.offset - cache->reserved -
>>   			    cache->pinned - cache->bytes_super -
>> +			    cache->unusable -
>>   			    btrfs_block_group_used(&cache->item);
> 
> You've done this in a few places, but not all the places, most notably
> btrfs_space_info_used() which is used in the space reservation code a lot.

I added "unsable" to struct btrfs_block_group_cache, but added
nothing to struct btrfs_space_info. Once extent is allocated and
freed in an ALLOC_SEQ Block Group, such extent is never resued
until we remove the BG. I'm accounting the size of such region
in "cache->unusable" and in "space_info->bytes_readonly". So,
btrfs_space_info_used() does not need the modify.

I admit it's confusing here. I can add "bytes_zone_unusable" to
struct btrfs_space_info, if it's better.

>>   		sinfo->bytes_readonly -= num_bytes;
>>   		list_del_init(&cache->ro_list);
>> @@ -10200,11 +10287,240 @@ static void link_block_group(struct btrfs_block_group_cache *cache)
>>   	}
>>   }
>>   
>> +static int
>> +btrfs_get_block_group_alloc_offset(struct btrfs_block_group_cache *cache)
>> +{
>> +	struct btrfs_fs_info *fs_info = cache->fs_info;
>> +	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
>> +	struct extent_map *em;
>> +	struct map_lookup *map;
>> +	struct btrfs_device *device;
>> +	u64 logical = cache->key.objectid;
>> +	u64 length = cache->key.offset;
>> +	u64 physical = 0;
>> +	int ret, alloc_type;
>> +	int i, j;
>> +	u64 *alloc_offsets = NULL;
>> +
>> +#define WP_MISSING_DEV ((u64)-1)
>> +
>> +	/* Sanity check */
>> +	if (!IS_ALIGNED(length, fs_info->zone_size)) {
>> +		btrfs_err(fs_info, "unaligned block group at %llu + %llu",
>> +			  logical, length);
>> +		return -EIO;
>> +	}
>> +
>> +	/* Get the chunk mapping */
>> +	em_tree = &fs_info->mapping_tree.map_tree;
>> +	read_lock(&em_tree->lock);
>> +	em = lookup_extent_mapping(em_tree, logical, length);
>> +	read_unlock(&em_tree->lock);
>> +
>> +	if (!em)
>> +		return -EINVAL;
>> +
>> +	map = em->map_lookup;
>> +
>> +	/*
>> +	 * Get the zone type: if the group is mapped to a non-sequential zone,
>> +	 * there is no need for the allocation offset (fit allocation is OK).
>> +	 */
>> +	alloc_type = -1;
>> +	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets),
>> +				GFP_NOFS);
>> +	if (!alloc_offsets) {
>> +		free_extent_map(em);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	for (i = 0; i < map->num_stripes; i++) {
>> +		int is_sequential;
>> +		struct blk_zone zone;
>> +
>> +		device = map->stripes[i].dev;
>> +		physical = map->stripes[i].physical;
>> +
>> +		if (device->bdev == NULL) {
>> +			alloc_offsets[i] = WP_MISSING_DEV;
>> +			continue;
>> +		}
>> +
>> +		is_sequential = btrfs_dev_is_sequential(device, physical);
>> +		if (alloc_type == -1)
>> +			alloc_type = is_sequential ?
>> +					BTRFS_ALLOC_SEQ : BTRFS_ALLOC_FIT;
>> +
>> +		if ((is_sequential && alloc_type != BTRFS_ALLOC_SEQ) ||
>> +		    (!is_sequential && alloc_type == BTRFS_ALLOC_SEQ)) {
>> +			btrfs_err(fs_info, "found block group of mixed zone types");
>> +			ret = -EIO;
>> +			goto out;
>> +		}
>> +
>> +		if (!is_sequential)
>> +			continue;
>> +
>> +		/* this zone will be used for allocation, so mark this
>> +		 * zone non-empty
>> +		 */
>> +		clear_bit(physical >> device->zone_size_shift,
>> +			  device->empty_zones);
>> +
>> +		/*
>> +		 * The group is mapped to a sequential zone. Get the zone write
>> +		 * pointer to determine the allocation offset within the zone.
>> +		 */
>> +		WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
>> +		ret = btrfs_get_dev_zone(device, physical, &zone, GFP_NOFS);
>> +		if (ret == -EIO || ret == -EOPNOTSUPP) {
>> +			ret = 0;
>> +			alloc_offsets[i] = WP_MISSING_DEV;
>> +			continue;
>> +		} else if (ret) {
>> +			goto out;
>> +		}
>> +
>> +
>> +		switch (zone.cond) {
>> +		case BLK_ZONE_COND_OFFLINE:
>> +		case BLK_ZONE_COND_READONLY:
>> +			btrfs_err(fs_info, "Offline/readonly zone %llu",
>> +				  physical >> device->zone_size_shift);
>> +			alloc_offsets[i] = WP_MISSING_DEV;
>> +			break;
>> +		case BLK_ZONE_COND_EMPTY:
>> +			alloc_offsets[i] = 0;
>> +			break;
>> +		case BLK_ZONE_COND_FULL:
>> +			alloc_offsets[i] = fs_info->zone_size;
>> +			break;
>> +		default:
>> +			/* Partially used zone */
>> +			alloc_offsets[i] =
>> +				((zone.wp - zone.start) << SECTOR_SHIFT);
>> +			break;
>> +		}
>> +	}
>> +
>> +	if (alloc_type == BTRFS_ALLOC_FIT)
>> +		goto out;
>> +
>> +	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
>> +	case 0: /* single */
>> +	case BTRFS_BLOCK_GROUP_DUP:
>> +	case BTRFS_BLOCK_GROUP_RAID1:
>> +		cache->alloc_offset = WP_MISSING_DEV;
>> +		for (i = 0; i < map->num_stripes; i++) {
>> +			if (alloc_offsets[i] == WP_MISSING_DEV)
>> +				continue;
>> +			if (cache->alloc_offset == WP_MISSING_DEV)
>> +				cache->alloc_offset = alloc_offsets[i];
>> +			if (alloc_offsets[i] == cache->alloc_offset)
>> +				continue;
>> +
>> +			btrfs_err(fs_info,
>> +				  "write pointer mismatch: block group %llu",
>> +				  logical);
>> +			cache->wp_broken = 1;
>> +		}
>> +		break;
>> +	case BTRFS_BLOCK_GROUP_RAID0:
>> +		cache->alloc_offset = 0;
>> +		for (i = 0; i < map->num_stripes; i++) {
>> +			if (alloc_offsets[i] == WP_MISSING_DEV) {
>> +				btrfs_err(fs_info,
>> +					  "cannot recover write pointer: block group %llu",
>> +					  logical);
>> +				cache->wp_broken = 1;
>> +				continue;
>> +			}
>> +
>> +			if (alloc_offsets[0] < alloc_offsets[i]) {
>> +				btrfs_err(fs_info,
>> +					  "write pointer mismatch: block group %llu",
>> +					  logical);
>> +				cache->wp_broken = 1;
>> +				continue;
>> +			}
>> +
>> +			cache->alloc_offset += alloc_offsets[i];
>> +		}
>> +		break;
>> +	case BTRFS_BLOCK_GROUP_RAID10:
>> +		/*
>> +		 * Pass1: check write pointer of RAID1 level: each pointer
>> +		 * should be equal.
>> +		 */
>> +		for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
>> +			int base = i*map->sub_stripes;
>> +			u64 offset = WP_MISSING_DEV;
>> +
>> +			for (j = 0; j < map->sub_stripes; j++) {
>> +				if (alloc_offsets[base+j] == WP_MISSING_DEV)
>> +					continue;
>> +				if (offset == WP_MISSING_DEV)
>> +					offset = alloc_offsets[base+j];
>> +				if (alloc_offsets[base+j] == offset)
>> +					continue;
>> +
>> +				btrfs_err(fs_info,
>> +					  "write pointer mismatch: block group %llu",
>> +					  logical);
>> +				cache->wp_broken = 1;
>> +			}
>> +			for (j = 0; j < map->sub_stripes; j++)
>> +				alloc_offsets[base+j] = offset;
>> +		}
>> +
>> +		/* Pass2: check write pointer of RAID1 level */
>> +		cache->alloc_offset = 0;
>> +		for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
>> +			int base = i*map->sub_stripes;
>> +
>> +			if (alloc_offsets[base] == WP_MISSING_DEV) {
>> +				btrfs_err(fs_info,
>> +					  "cannot recover write pointer: block group %llu",
>> +					  logical);
>> +				cache->wp_broken = 1;
>> +				continue;
>> +			}
>> +
>> +			if (alloc_offsets[0] < alloc_offsets[base]) {
>> +				btrfs_err(fs_info,
>> +					  "write pointer mismatch: block group %llu",
>> +					  logical);
>> +				cache->wp_broken = 1;
>> +				continue;
>> +			}
>> +
>> +			cache->alloc_offset += alloc_offsets[base];
>> +		}
>> +		break;
>> +	case BTRFS_BLOCK_GROUP_RAID5:
>> +	case BTRFS_BLOCK_GROUP_RAID6:
>> +		/* RAID5/6 is not supported yet */
>> +	default:
>> +		btrfs_err(fs_info, "Unsupported profile on HMZONED %llu",
>> +			map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
>> +		ret = -EINVAL;
>> +		goto out;
>> +	}
>> +
>> +out:
>> +	cache->alloc_type = alloc_type;
>> +	kfree(alloc_offsets);
>> +	free_extent_map(em);
>> +
>> +	return ret;
>> +}
>> +
> 
> Move this to the zoned device file that you create.

Sure.

>>   static struct btrfs_block_group_cache *
>>   btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
>>   			       u64 start, u64 size)
>>   {
>>   	struct btrfs_block_group_cache *cache;
>> +	int ret;
>>   
>>   	cache = kzalloc(sizeof(*cache), GFP_NOFS);
>>   	if (!cache)
>> @@ -10238,6 +10554,16 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
>>   	atomic_set(&cache->trimming, 0);
>>   	mutex_init(&cache->free_space_lock);
>>   	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
>> +	cache->alloc_type = BTRFS_ALLOC_FIT;
>> +	cache->alloc_offset = 0;
>> +
>> +	if (btrfs_fs_incompat(fs_info, HMZONED)) {
>> +		ret = btrfs_get_block_group_alloc_offset(cache);
>> +		if (ret) {
>> +			kfree(cache);
>> +			return NULL;
>> +		}
>> +	}
>>   
>>   	return cache;
>>   }
>> @@ -10310,6 +10636,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>   	int need_clear = 0;
>>   	u64 cache_gen;
>>   	u64 feature;
>> +	u64 unusable;
>>   	int mixed;
>>   
>>   	feature = btrfs_super_incompat_flags(info->super_copy);
>> @@ -10415,6 +10742,26 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>   			free_excluded_extents(cache);
>>   		}
>>   
>> +		switch (cache->alloc_type) {
>> +		case BTRFS_ALLOC_FIT:
>> +			unusable = cache->bytes_super;
>> +			break;
>> +		case BTRFS_ALLOC_SEQ:
>> +			WARN_ON(cache->bytes_super != 0);
>> +			unusable = cache->alloc_offset -
>> +				btrfs_block_group_used(&cache->item);
>> +			/* we only need ->free_space in ALLOC_SEQ BGs */
>> +			cache->last_byte_to_unpin = (u64)-1;
>> +			cache->cached = BTRFS_CACHE_FINISHED;
>> +			cache->free_space_ctl->free_space =
>> +				cache->key.offset - cache->alloc_offset;
>> +			cache->unusable = unusable;
>> +			free_excluded_extents(cache);
>> +			break;
>> +		default:
>> +			BUG();
>> +		}
>> +
>>   		ret = btrfs_add_block_group_cache(info, cache);
>>   		if (ret) {
>>   			btrfs_remove_free_space_cache(cache);
>> @@ -10425,7 +10772,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>   		trace_btrfs_add_block_group(info, cache, 0);
>>   		update_space_info(info, cache->flags, found_key.offset,
>>   				  btrfs_block_group_used(&cache->item),
>> -				  cache->bytes_super, &space_info);
>> +				  unusable, &space_info);
>>   
>>   		cache->space_info = space_info;
>>   
>> @@ -10438,6 +10785,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>   			ASSERT(list_empty(&cache->bg_list));
>>   			btrfs_mark_bg_unused(cache);
>>   		}
>> +
>> +		if (cache->wp_broken)
>> +			inc_block_group_ro(cache, 1);
>>   	}
>>   
>>   	list_for_each_entry_rcu(space_info, &info->space_info, list) {
>> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
>> index f74dc259307b..cc69dc71f4c1 100644
>> --- a/fs/btrfs/free-space-cache.c
>> +++ b/fs/btrfs/free-space-cache.c
>> @@ -2326,8 +2326,11 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
>>   			   u64 offset, u64 bytes)
>>   {
>>   	struct btrfs_free_space *info;
>> +	struct btrfs_block_group_cache *block_group = ctl->private;
>>   	int ret = 0;
>>   
>> +	WARN_ON(block_group && block_group->alloc_type == BTRFS_ALLOC_SEQ);
>> +
>>   	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
>>   	if (!info)
>>   		return -ENOMEM;
>> @@ -2376,6 +2379,28 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
>>   	return ret;
>>   }
>>   
>> +int __btrfs_add_free_space_seq(struct btrfs_block_group_cache *block_group,
>> +			       u64 bytenr, u64 size)
>> +{
>> +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
>> +	u64 offset = bytenr - block_group->key.objectid;
>> +	u64 to_free, to_unusable;
>> +
>> +	spin_lock(&ctl->tree_lock);
>> +	if (offset >= block_group->alloc_offset)
>> +		to_free = size;
>> +	else if (offset + size <= block_group->alloc_offset)
>> +		to_free = 0;
>> +	else
>> +		to_free = offset + size - block_group->alloc_offset;
>> +	to_unusable = size - to_free;
>> +	ctl->free_space += to_free;
>> +	block_group->unusable += to_unusable;
>> +	spin_unlock(&ctl->tree_lock);
>> +	return 0;
>> +
>> +}
>> +
>>   int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
>>   			    u64 offset, u64 bytes)
>>   {
>> @@ -2384,6 +2409,8 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
>>   	int ret;
>>   	bool re_search = false;
>>   
>> +	WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
>> +
> 
> These should probably be ASSERT() right?  Want to make sure the developers
> really notice a problem when testing.  Thanks,
> 
> Josef
> 

Agree. I will use ASSERT.
Naohiro Aota June 18, 2019, 8:49 a.m. UTC | #4
On 2019/06/18 7:29, David Sterba wrote:
> On Fri, Jun 07, 2019 at 10:10:13PM +0900, Naohiro Aota wrote:
>> On HMZONED drives, writes must always be sequential and directed at a block
>> group zone write pointer position. Thus, block allocation in a block group
>> must also be done sequentially using an allocation pointer equal to the
>> block group zone write pointer plus the number of blocks allocated but not
>> yet written.
>>
>> Sequential allocation function find_free_extent_seq() bypass the checks in
>> find_free_extent() and increase the reserved byte counter by itself. It is
>> impossible to revert once allocated region in the sequential allocation,
>> since it might race with other allocations and leave an allocation hole,
>> which breaks the sequential write rule.
>>
>> Furthermore, this commit introduce two new variable to struct
>> btrfs_block_group_cache. "wp_broken" indicate that write pointer is broken
>> (e.g. not synced on a RAID1 block group) and mark that block group read
>> only. "unusable" keeps track of the size of once allocated then freed
>> region. Such region is never usable until resetting underlying zones.
>> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
>> ---
>>   fs/btrfs/ctree.h            |  24 +++
>>   fs/btrfs/extent-tree.c      | 378 ++++++++++++++++++++++++++++++++++--
>>   fs/btrfs/free-space-cache.c |  33 ++++
>>   fs/btrfs/free-space-cache.h |   5 +
>>   4 files changed, 426 insertions(+), 14 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 6c00101407e4..f4bcd2a6ec12 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -582,6 +582,20 @@ struct btrfs_full_stripe_locks_tree {
>>   	struct mutex lock;
>>   };
>>   
>> +/* Block group allocation types */
>> +enum btrfs_alloc_type {
>> +
>> +	/* Regular first fit allocation */
>> +	BTRFS_ALLOC_FIT		= 0,
>> +
>> +	/*
>> +	 * Sequential allocation: this is for HMZONED mode and
>> +	 * will result in ignoring free space before a block
>> +	 * group allocation offset.
> 
> Please format the comments to 80 columns
> 
>> +	 */
>> +	BTRFS_ALLOC_SEQ		= 1,
>> +};
>> +
>>   struct btrfs_block_group_cache {
>>   	struct btrfs_key key;
>>   	struct btrfs_block_group_item item;
>> @@ -592,6 +606,7 @@ struct btrfs_block_group_cache {
>>   	u64 reserved;
>>   	u64 delalloc_bytes;
>>   	u64 bytes_super;
>> +	u64 unusable;
> 
> 'unusable' is specific to the zones, so 'zone_unusable' would make it
> clear. The terminilogy around space is confusing already (we have
> unused, free, reserved, allocated, slack).

Sure. I will change the name.

Or, is it better toadd new struct "btrfs_seq_alloc_info" and move all
these variable there? Then, I can just add one pointer to the struct here.

>>   	u64 flags;
>>   	u64 cache_generation;
>>   
>> @@ -621,6 +636,7 @@ struct btrfs_block_group_cache {
>>   	unsigned int iref:1;
>>   	unsigned int has_caching_ctl:1;
>>   	unsigned int removed:1;
>> +	unsigned int wp_broken:1;
>>   
>>   	int disk_cache_state;
>>   
>> @@ -694,6 +710,14 @@ struct btrfs_block_group_cache {
>>   
>>   	/* Record locked full stripes for RAID5/6 block group */
>>   	struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
>> +
>> +	/*
>> +	 * Allocation offset for the block group to implement sequential
>> +	 * allocation. This is used only with HMZONED mode enabled and if
>> +	 * the block group resides on a sequential zone.
>> +	 */
>> +	enum btrfs_alloc_type alloc_type;
>> +	u64 alloc_offset;
>>   };
>>   
>>   /* delayed seq elem */
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index 363db58f56b8..ebd0d6eae038 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -28,6 +28,7 @@
>>   #include "sysfs.h"
>>   #include "qgroup.h"
>>   #include "ref-verify.h"
>> +#include "rcu-string.h"
>>   
>>   #undef SCRAMBLE_DELAYED_REFS
>>   
>> @@ -590,6 +591,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
>>   	struct btrfs_caching_control *caching_ctl;
>>   	int ret = 0;
>>   
>> +	WARN_ON(cache->alloc_type == BTRFS_ALLOC_SEQ);
>> +
>>   	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
>>   	if (!caching_ctl)
>>   		return -ENOMEM;
>> @@ -6555,6 +6558,19 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
>>   	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
>>   }
>>   
>> +static void __btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
>> +				       u64 ram_bytes, u64 num_bytes,
>> +				       int delalloc)
>> +{
>> +	struct btrfs_space_info *space_info = cache->space_info;
>> +
>> +	cache->reserved += num_bytes;
>> +	space_info->bytes_reserved += num_bytes;
>> +	update_bytes_may_use(space_info, -ram_bytes);
>> +	if (delalloc)
>> +		cache->delalloc_bytes += num_bytes;
>> +}
>> +
>>   /**
>>    * btrfs_add_reserved_bytes - update the block_group and space info counters
>>    * @cache:	The cache we are manipulating
>> @@ -6573,17 +6589,16 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
>>   	struct btrfs_space_info *space_info = cache->space_info;
>>   	int ret = 0;
>>   
>> +	/* should handled by find_free_extent_seq */
>> +	WARN_ON(cache->alloc_type == BTRFS_ALLOC_SEQ);
>> +
>>   	spin_lock(&space_info->lock);
>>   	spin_lock(&cache->lock);
>> -	if (cache->ro) {
>> +	if (cache->ro)
>>   		ret = -EAGAIN;
>> -	} else {
>> -		cache->reserved += num_bytes;
>> -		space_info->bytes_reserved += num_bytes;
>> -		update_bytes_may_use(space_info, -ram_bytes);
>> -		if (delalloc)
>> -			cache->delalloc_bytes += num_bytes;
>> -	}
>> +	else
>> +		__btrfs_add_reserved_bytes(cache, ram_bytes, num_bytes,
>> +					   delalloc);
>>   	spin_unlock(&cache->lock);
>>   	spin_unlock(&space_info->lock);
>>   	return ret;
>> @@ -6701,9 +6716,13 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
>>   			cache = btrfs_lookup_block_group(fs_info, start);
>>   			BUG_ON(!cache); /* Logic error */
>>   
>> -			cluster = fetch_cluster_info(fs_info,
>> -						     cache->space_info,
>> -						     &empty_cluster);
>> +			if (cache->alloc_type == BTRFS_ALLOC_FIT)
>> +				cluster = fetch_cluster_info(fs_info,
>> +							     cache->space_info,
>> +							     &empty_cluster);
>> +			else
>> +				cluster = NULL;
>> +
>>   			empty_cluster <<= 1;
>>   		}
>>   
>> @@ -6743,7 +6762,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
>>   		space_info->max_extent_size = 0;
>>   		percpu_counter_add_batch(&space_info->total_bytes_pinned,
>>   			    -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
>> -		if (cache->ro) {
>> +		if (cache->ro || cache->alloc_type == BTRFS_ALLOC_SEQ) {
>> +			/* need reset before reusing in ALLOC_SEQ BG */
>>   			space_info->bytes_readonly += len;
>>   			readonly = true;
>>   		}
>> @@ -7588,6 +7608,60 @@ static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
>>   	return 0;
>>   }
>>   
>> +/*
>> + * Simple allocator for sequential only block group. It only allows
>> + * sequential allocation. No need to play with trees. This function
>> + * also reserve the bytes as in btrfs_add_reserved_bytes.
>> + */
>> +
>> +static int find_free_extent_seq(struct btrfs_block_group_cache *cache,
>> +				struct find_free_extent_ctl *ffe_ctl)
>> +{
>> +	struct btrfs_space_info *space_info = cache->space_info;
>> +	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
>> +	u64 start = cache->key.objectid;
>> +	u64 num_bytes = ffe_ctl->num_bytes;
>> +	u64 avail;
>> +	int ret = 0;
>> +
>> +	/* Sanity check */
>> +	if (cache->alloc_type != BTRFS_ALLOC_SEQ)
>> +		return 1;
>> +
>> +	spin_lock(&space_info->lock);
>> +	spin_lock(&cache->lock);
>> +
>> +	if (cache->ro) {
>> +		ret = -EAGAIN;
>> +		goto out;
>> +	}
>> +
>> +	spin_lock(&ctl->tree_lock);
>> +	avail = cache->key.offset - cache->alloc_offset;
>> +	if (avail < num_bytes) {
>> +		ffe_ctl->max_extent_size = avail;
>> +		spin_unlock(&ctl->tree_lock);
>> +		ret = 1;
>> +		goto out;
>> +	}
>> +
>> +	ffe_ctl->found_offset = start + cache->alloc_offset;
>> +	cache->alloc_offset += num_bytes;
>> +	ctl->free_space -= num_bytes;
>> +	spin_unlock(&ctl->tree_lock);
>> +
>> +	BUG_ON(!IS_ALIGNED(ffe_ctl->found_offset,
>> +			   cache->fs_info->stripesize));
>> +	ffe_ctl->search_start = ffe_ctl->found_offset;
>> +	__btrfs_add_reserved_bytes(cache, ffe_ctl->ram_bytes, num_bytes,
>> +				   ffe_ctl->delalloc);
>> +
>> +out:
>> +	spin_unlock(&cache->lock);
>> +	spin_unlock(&space_info->lock);
>> +	return ret;
>> +}
>> +
>>   /*
>>    * Return >0 means caller needs to re-search for free extent
>>    * Return 0 means we have the needed free extent.
>> @@ -7889,6 +7963,16 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
>>   		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
>>   			goto loop;
>>   
>> +		if (block_group->alloc_type == BTRFS_ALLOC_SEQ) {
>> +			ret = find_free_extent_seq(block_group, &ffe_ctl);
>> +			if (ret)
>> +				goto loop;
>> +			/* btrfs_find_space_for_alloc_seq should ensure
>> +			 * that everything is OK and reserve the extent.
>> +			 */
> 
> Please use the
> 
> /*
>   * comment
>   */
> 
> style
> 
>> +			goto nocheck;
>> +		}
>> +
>>   		/*
>>   		 * Ok we want to try and use the cluster allocator, so
>>   		 * lets look there
>> @@ -7944,6 +8028,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
>>   					     num_bytes);
>>   			goto loop;
>>   		}
>> +nocheck:
>>   		btrfs_inc_block_group_reservations(block_group);
>>   
>>   		/* we are all good, lets return */
>> @@ -9616,7 +9701,8 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
>>   	}
>>   
>>   	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
>> -		    cache->bytes_super - btrfs_block_group_used(&cache->item);
>> +		    cache->bytes_super - cache->unusable -
>> +		    btrfs_block_group_used(&cache->item);
>>   	sinfo_used = btrfs_space_info_used(sinfo, true);
>>   
>>   	if (sinfo_used + num_bytes + min_allocable_bytes <=
>> @@ -9766,6 +9852,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
>>   	if (!--cache->ro) {
>>   		num_bytes = cache->key.offset - cache->reserved -
>>   			    cache->pinned - cache->bytes_super -
>> +			    cache->unusable -
>>   			    btrfs_block_group_used(&cache->item);
>>   		sinfo->bytes_readonly -= num_bytes;
>>   		list_del_init(&cache->ro_list);
>> @@ -10200,11 +10287,240 @@ static void link_block_group(struct btrfs_block_group_cache *cache)
>>   	}
>>   }
>>   
>> +static int
>> +btrfs_get_block_group_alloc_offset(struct btrfs_block_group_cache *cache)
>> +{
>> +	struct btrfs_fs_info *fs_info = cache->fs_info;
>> +	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
>> +	struct extent_map *em;
>> +	struct map_lookup *map;
>> +	struct btrfs_device *device;
>> +	u64 logical = cache->key.objectid;
>> +	u64 length = cache->key.offset;
>> +	u64 physical = 0;
>> +	int ret, alloc_type;
>> +	int i, j;
>> +	u64 *alloc_offsets = NULL;
>> +
>> +#define WP_MISSING_DEV ((u64)-1)
> 
> Please move the definition to the beginning of the file
> 
>> +
>> +	/* Sanity check */
>> +	if (!IS_ALIGNED(length, fs_info->zone_size)) {
>> +		btrfs_err(fs_info, "unaligned block group at %llu + %llu",
>> +			  logical, length);
>> +		return -EIO;
>> +	}
>> +
>> +	/* Get the chunk mapping */
>> +	em_tree = &fs_info->mapping_tree.map_tree;
>> +	read_lock(&em_tree->lock);
>> +	em = lookup_extent_mapping(em_tree, logical, length);
>> +	read_unlock(&em_tree->lock);
>> +
>> +	if (!em)
>> +		return -EINVAL;
>> +
>> +	map = em->map_lookup;
>> +
>> +	/*
>> +	 * Get the zone type: if the group is mapped to a non-sequential zone,
>> +	 * there is no need for the allocation offset (fit allocation is OK).
>> +	 */
>> +	alloc_type = -1;
>> +	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets),
>> +				GFP_NOFS);
>> +	if (!alloc_offsets) {
>> +		free_extent_map(em);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	for (i = 0; i < map->num_stripes; i++) {
>> +		int is_sequential;
> 
> Please use bool instead of int
> 
>> +		struct blk_zone zone;
>> +
>> +		device = map->stripes[i].dev;
>> +		physical = map->stripes[i].physical;
>> +
>> +		if (device->bdev == NULL) {
>> +			alloc_offsets[i] = WP_MISSING_DEV;
>> +			continue;
>> +		}
>> +
>> +		is_sequential = btrfs_dev_is_sequential(device, physical);
>> +		if (alloc_type == -1)
>> +			alloc_type = is_sequential ?
>> +					BTRFS_ALLOC_SEQ : BTRFS_ALLOC_FIT;
>> +
>> +		if ((is_sequential && alloc_type != BTRFS_ALLOC_SEQ) ||
>> +		    (!is_sequential && alloc_type == BTRFS_ALLOC_SEQ)) {
>> +			btrfs_err(fs_info, "found block group of mixed zone types");
>> +			ret = -EIO;
>> +			goto out;
>> +		}
>> +
>> +		if (!is_sequential)
>> +			continue;
>> +
>> +		/* this zone will be used for allocation, so mark this
>> +		 * zone non-empty
>> +		 */
>> +		clear_bit(physical >> device->zone_size_shift,
>> +			  device->empty_zones);
>> +
>> +		/*
>> +		 * The group is mapped to a sequential zone. Get the zone write
>> +		 * pointer to determine the allocation offset within the zone.
>> +		 */
>> +		WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
>> +		ret = btrfs_get_dev_zone(device, physical, &zone, GFP_NOFS);
>> +		if (ret == -EIO || ret == -EOPNOTSUPP) {
>> +			ret = 0;
>> +			alloc_offsets[i] = WP_MISSING_DEV;
>> +			continue;
>> +		} else if (ret) {
>> +			goto out;
>> +		}
>> +
>> +
>> +		switch (zone.cond) {
>> +		case BLK_ZONE_COND_OFFLINE:
>> +		case BLK_ZONE_COND_READONLY:
>> +			btrfs_err(fs_info, "Offline/readonly zone %llu",
>> +				  physical >> device->zone_size_shift);
>> +			alloc_offsets[i] = WP_MISSING_DEV;
>> +			break;
>> +		case BLK_ZONE_COND_EMPTY:
>> +			alloc_offsets[i] = 0;
>> +			break;
>> +		case BLK_ZONE_COND_FULL:
>> +			alloc_offsets[i] = fs_info->zone_size;
>> +			break;
>> +		default:
>> +			/* Partially used zone */
>> +			alloc_offsets[i] =
>> +				((zone.wp - zone.start) << SECTOR_SHIFT);
>> +			break;
>> +		}
>> +	}
>> +
>> +	if (alloc_type == BTRFS_ALLOC_FIT)
>> +		goto out;
>> +
>> +	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
>> +	case 0: /* single */
>> +	case BTRFS_BLOCK_GROUP_DUP:
>> +	case BTRFS_BLOCK_GROUP_RAID1:
>> +		cache->alloc_offset = WP_MISSING_DEV;
>> +		for (i = 0; i < map->num_stripes; i++) {
>> +			if (alloc_offsets[i] == WP_MISSING_DEV)
>> +				continue;
>> +			if (cache->alloc_offset == WP_MISSING_DEV)
>> +				cache->alloc_offset = alloc_offsets[i];
>> +			if (alloc_offsets[i] == cache->alloc_offset)
>> +				continue;
>> +
>> +			btrfs_err(fs_info,
>> +				  "write pointer mismatch: block group %llu",
>> +				  logical);
>> +			cache->wp_broken = 1;
>> +		}
>> +		break;
>> +	case BTRFS_BLOCK_GROUP_RAID0:
>> +		cache->alloc_offset = 0;
>> +		for (i = 0; i < map->num_stripes; i++) {
>> +			if (alloc_offsets[i] == WP_MISSING_DEV) {
>> +				btrfs_err(fs_info,
>> +					  "cannot recover write pointer: block group %llu",
>> +					  logical);
>> +				cache->wp_broken = 1;
>> +				continue;
>> +			}
>> +
>> +			if (alloc_offsets[0] < alloc_offsets[i]) {
>> +				btrfs_err(fs_info,
>> +					  "write pointer mismatch: block group %llu",
>> +					  logical);
>> +				cache->wp_broken = 1;
>> +				continue;
>> +			}
>> +
>> +			cache->alloc_offset += alloc_offsets[i];
>> +		}
>> +		break;
>> +	case BTRFS_BLOCK_GROUP_RAID10:
>> +		/*
>> +		 * Pass1: check write pointer of RAID1 level: each pointer
>> +		 * should be equal.
>> +		 */
>> +		for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
>> +			int base = i*map->sub_stripes;
> 
> spaces around binary operators
> 
> 			int base = i * map->sub_stripes;
> 
>> +			u64 offset = WP_MISSING_DEV;
>> +
>> +			for (j = 0; j < map->sub_stripes; j++) {
>> +				if (alloc_offsets[base+j] == WP_MISSING_DEV)
> 
> here and below
> 
>> +					continue;
>> +				if (offset == WP_MISSING_DEV)
>> +					offset = alloc_offsets[base+j];
>> +				if (alloc_offsets[base+j] == offset)
>> +					continue;
>> +
>> +				btrfs_err(fs_info,
>> +					  "write pointer mismatch: block group %llu",
>> +					  logical);
>> +				cache->wp_broken = 1;
>> +			}
>> +			for (j = 0; j < map->sub_stripes; j++)
>> +				alloc_offsets[base+j] = offset;
>> +		}
>> +
>> +		/* Pass2: check write pointer of RAID1 level */
>> +		cache->alloc_offset = 0;
>> +		for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
>> +			int base = i*map->sub_stripes;
>> +
>> +			if (alloc_offsets[base] == WP_MISSING_DEV) {
>> +				btrfs_err(fs_info,
>> +					  "cannot recover write pointer: block group %llu",
>> +					  logical);
>> +				cache->wp_broken = 1;
>> +				continue;
>> +			}
>> +
>> +			if (alloc_offsets[0] < alloc_offsets[base]) {
>> +				btrfs_err(fs_info,
>> +					  "write pointer mismatch: block group %llu",
>> +					  logical);
>> +				cache->wp_broken = 1;
>> +				continue;
>> +			}
>> +
>> +			cache->alloc_offset += alloc_offsets[base];
>> +		}
>> +		break;
>> +	case BTRFS_BLOCK_GROUP_RAID5:
>> +	case BTRFS_BLOCK_GROUP_RAID6:
>> +		/* RAID5/6 is not supported yet */
>> +	default:
>> +		btrfs_err(fs_info, "Unsupported profile on HMZONED %llu",
>> +			map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
>> +		ret = -EINVAL;
>> +		goto out;
>> +	}
>> +
>> +out:
>> +	cache->alloc_type = alloc_type;
>> +	kfree(alloc_offsets);
>> +	free_extent_map(em);
>> +
>> +	return ret;
>> +}
>> +
>>   static struct btrfs_block_group_cache *
>>   btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
>>   			       u64 start, u64 size)
>>   {
>>   	struct btrfs_block_group_cache *cache;
>> +	int ret;
>>   
>>   	cache = kzalloc(sizeof(*cache), GFP_NOFS);
>>   	if (!cache)
>> @@ -10238,6 +10554,16 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
>>   	atomic_set(&cache->trimming, 0);
>>   	mutex_init(&cache->free_space_lock);
>>   	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
>> +	cache->alloc_type = BTRFS_ALLOC_FIT;
>> +	cache->alloc_offset = 0;
>> +
>> +	if (btrfs_fs_incompat(fs_info, HMZONED)) {
>> +		ret = btrfs_get_block_group_alloc_offset(cache);
>> +		if (ret) {
>> +			kfree(cache);
>> +			return NULL;
>> +		}
>> +	}
>>   
>>   	return cache;
>>   }
>> @@ -10310,6 +10636,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>   	int need_clear = 0;
>>   	u64 cache_gen;
>>   	u64 feature;
>> +	u64 unusable;
>>   	int mixed;
>>   
>>   	feature = btrfs_super_incompat_flags(info->super_copy);
>> @@ -10415,6 +10742,26 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>   			free_excluded_extents(cache);
>>   		}
>>   
>> +		switch (cache->alloc_type) {
>> +		case BTRFS_ALLOC_FIT:
>> +			unusable = cache->bytes_super;
>> +			break;
>> +		case BTRFS_ALLOC_SEQ:
>> +			WARN_ON(cache->bytes_super != 0);
>> +			unusable = cache->alloc_offset -
>> +				btrfs_block_group_used(&cache->item);
>> +			/* we only need ->free_space in ALLOC_SEQ BGs */
>> +			cache->last_byte_to_unpin = (u64)-1;
>> +			cache->cached = BTRFS_CACHE_FINISHED;
>> +			cache->free_space_ctl->free_space =
>> +				cache->key.offset - cache->alloc_offset;
>> +			cache->unusable = unusable;
>> +			free_excluded_extents(cache);
>> +			break;
>> +		default:
>> +			BUG();
> 
> An unexpeced value of allocation is found, this needs a message and
> proper error handling, btrfs_read_block_groups is called from mount path
> so the recovery should be possible.

OK. I will handle this case.

> 
>> +		}
>> +
>>   		ret = btrfs_add_block_group_cache(info, cache);
>>   		if (ret) {
>>   			btrfs_remove_free_space_cache(cache);
>> @@ -10425,7 +10772,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>   		trace_btrfs_add_block_group(info, cache, 0);
>>   		update_space_info(info, cache->flags, found_key.offset,
>>   				  btrfs_block_group_used(&cache->item),
>> -				  cache->bytes_super, &space_info);
>> +				  unusable, &space_info);
>>   
>>   		cache->space_info = space_info;
>>   
>> @@ -10438,6 +10785,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>   			ASSERT(list_empty(&cache->bg_list));
>>   			btrfs_mark_bg_unused(cache);
>>   		}
>> +
>> +		if (cache->wp_broken)
>> +			inc_block_group_ro(cache, 1);
>>   	}
>>   
>>   	list_for_each_entry_rcu(space_info, &info->space_info, list) {
>> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
>> index f74dc259307b..cc69dc71f4c1 100644
>> --- a/fs/btrfs/free-space-cache.c
>> +++ b/fs/btrfs/free-space-cache.c
>> @@ -2326,8 +2326,11 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
>>   			   u64 offset, u64 bytes)
>>   {
>>   	struct btrfs_free_space *info;
>> +	struct btrfs_block_group_cache *block_group = ctl->private;
>>   	int ret = 0;
>>   
>> +	WARN_ON(block_group && block_group->alloc_type == BTRFS_ALLOC_SEQ);
>> +
>>   	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
>>   	if (!info)
>>   		return -ENOMEM;
>> @@ -2376,6 +2379,28 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
>>   	return ret;
>>   }
>>   
>> +int __btrfs_add_free_space_seq(struct btrfs_block_group_cache *block_group,
>> +			       u64 bytenr, u64 size)
>> +{
>> +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
>> +	u64 offset = bytenr - block_group->key.objectid;
>> +	u64 to_free, to_unusable;
>> +
>> +	spin_lock(&ctl->tree_lock);
>> +	if (offset >= block_group->alloc_offset)
>> +		to_free = size;
>> +	else if (offset + size <= block_group->alloc_offset)
>> +		to_free = 0;
>> +	else
>> +		to_free = offset + size - block_group->alloc_offset;
>> +	to_unusable = size - to_free;
>> +	ctl->free_space += to_free;
>> +	block_group->unusable += to_unusable;
>> +	spin_unlock(&ctl->tree_lock);
>> +	return 0;
>> +
>> +}
>> +
>>   int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
>>   			    u64 offset, u64 bytes)
>>   {
>> @@ -2384,6 +2409,8 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
>>   	int ret;
>>   	bool re_search = false;
>>   
>> +	WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
>> +
>>   	spin_lock(&ctl->tree_lock);
>>   
>>   again:
>> @@ -2619,6 +2646,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
>>   	u64 align_gap = 0;
>>   	u64 align_gap_len = 0;
>>   
>> +	WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
>> +
>>   	spin_lock(&ctl->tree_lock);
>>   	entry = find_free_space(ctl, &offset, &bytes_search,
>>   				block_group->full_stripe_len, max_extent_size);
>> @@ -2738,6 +2767,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
>>   	struct rb_node *node;
>>   	u64 ret = 0;
>>   
>> +	WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
>> +
>>   	spin_lock(&cluster->lock);
>>   	if (bytes > cluster->max_size)
>>   		goto out;
>> @@ -3384,6 +3415,8 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
>>   {
>>   	int ret;
>>   
>> +	WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
>> +
>>   	*trimmed = 0;
>>   
>>   	spin_lock(&block_group->lock);
>> diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
>> index 8760acb55ffd..d30667784f73 100644
>> --- a/fs/btrfs/free-space-cache.h
>> +++ b/fs/btrfs/free-space-cache.h
>> @@ -73,10 +73,15 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
>>   int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
>>   			   struct btrfs_free_space_ctl *ctl,
>>   			   u64 bytenr, u64 size);
>> +int __btrfs_add_free_space_seq(struct btrfs_block_group_cache *block_group,
>> +			       u64 bytenr, u64 size);
>>   static inline int
>>   btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
>>   		     u64 bytenr, u64 size)
>>   {
>> +	if (block_group->alloc_type == BTRFS_ALLOC_SEQ)
>> +		return __btrfs_add_free_space_seq(block_group, bytenr, size);
>> +
>>   	return __btrfs_add_free_space(block_group->fs_info,
>>   				      block_group->free_space_ctl,
>>   				      bytenr, size);
>> -- 
>> 2.21.0
>
Josef Bacik June 18, 2019, 1:37 p.m. UTC | #5
On Tue, Jun 18, 2019 at 08:28:07AM +0000, Naohiro Aota wrote:
> On 2019/06/13 23:07, Josef Bacik wrote:
> > On Fri, Jun 07, 2019 at 10:10:13PM +0900, Naohiro Aota wrote:
> >> @@ -9616,7 +9701,8 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
> >>   	}
> >>   
> >>   	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
> >> -		    cache->bytes_super - btrfs_block_group_used(&cache->item);
> >> +		    cache->bytes_super - cache->unusable -
> >> +		    btrfs_block_group_used(&cache->item);
> >>   	sinfo_used = btrfs_space_info_used(sinfo, true);
> >>   
> >>   	if (sinfo_used + num_bytes + min_allocable_bytes <=
> >> @@ -9766,6 +9852,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
> >>   	if (!--cache->ro) {
> >>   		num_bytes = cache->key.offset - cache->reserved -
> >>   			    cache->pinned - cache->bytes_super -
> >> +			    cache->unusable -
> >>   			    btrfs_block_group_used(&cache->item);
> > 
> > You've done this in a few places, but not all the places, most notably
> > btrfs_space_info_used() which is used in the space reservation code a lot.
> 
> I added "unsable" to struct btrfs_block_group_cache, but added
> nothing to struct btrfs_space_info. Once extent is allocated and
> freed in an ALLOC_SEQ Block Group, such extent is never resued
> until we remove the BG. I'm accounting the size of such region
> in "cache->unusable" and in "space_info->bytes_readonly". So,
> btrfs_space_info_used() does not need the modify.
> 
> I admit it's confusing here. I can add "bytes_zone_unusable" to
> struct btrfs_space_info, if it's better.
> 

Ah you're right, sorry I just read it as space_info.  Yes please add
bytes_zone_unusable, I'd like to be as verbose as possible about where our space
actually is.  I know if I go to debug something and see a huge amount in
read_only I'll be confused.  Thanks,

Josef
David Sterba June 27, 2019, 3:28 p.m. UTC | #6
On Tue, Jun 18, 2019 at 08:49:00AM +0000, Naohiro Aota wrote:
> On 2019/06/18 7:29, David Sterba wrote:
> > On Fri, Jun 07, 2019 at 10:10:13PM +0900, Naohiro Aota wrote:
> >> +	u64 unusable;
> > 
> > 'unusable' is specific to the zones, so 'zone_unusable' would make it
> > clear. The terminilogy around space is confusing already (we have
> > unused, free, reserved, allocated, slack).
> 
> Sure. I will change the name.
> 
> Or, is it better toadd new struct "btrfs_seq_alloc_info" and move all
> these variable there? Then, I can just add one pointer to the struct here.

There are 4 new members, but the block group structure is large already
(528 bytes) so adding a few more will not make the allocations worse.
There are also holes or inefficient types used so the size can be
squeezed a bit, but this is unrelated to this patchset.

Patch
diff mbox series

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6c00101407e4..f4bcd2a6ec12 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -582,6 +582,20 @@  struct btrfs_full_stripe_locks_tree {
 	struct mutex lock;
 };
 
+/* Block group allocation types */
+enum btrfs_alloc_type {
+
+	/* Regular first fit allocation */
+	BTRFS_ALLOC_FIT		= 0,
+
+	/*
+	 * Sequential allocation: this is for HMZONED mode and
+	 * will result in ignoring free space before a block
+	 * group allocation offset.
+	 */
+	BTRFS_ALLOC_SEQ		= 1,
+};
+
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
@@ -592,6 +606,7 @@  struct btrfs_block_group_cache {
 	u64 reserved;
 	u64 delalloc_bytes;
 	u64 bytes_super;
+	u64 unusable;
 	u64 flags;
 	u64 cache_generation;
 
@@ -621,6 +636,7 @@  struct btrfs_block_group_cache {
 	unsigned int iref:1;
 	unsigned int has_caching_ctl:1;
 	unsigned int removed:1;
+	unsigned int wp_broken:1;
 
 	int disk_cache_state;
 
@@ -694,6 +710,14 @@  struct btrfs_block_group_cache {
 
 	/* Record locked full stripes for RAID5/6 block group */
 	struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
+
+	/*
+	 * Allocation offset for the block group to implement sequential
+	 * allocation. This is used only with HMZONED mode enabled and if
+	 * the block group resides on a sequential zone.
+	 */
+	enum btrfs_alloc_type alloc_type;
+	u64 alloc_offset;
 };
 
 /* delayed seq elem */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 363db58f56b8..ebd0d6eae038 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -28,6 +28,7 @@ 
 #include "sysfs.h"
 #include "qgroup.h"
 #include "ref-verify.h"
+#include "rcu-string.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
@@ -590,6 +591,8 @@  static int cache_block_group(struct btrfs_block_group_cache *cache,
 	struct btrfs_caching_control *caching_ctl;
 	int ret = 0;
 
+	WARN_ON(cache->alloc_type == BTRFS_ALLOC_SEQ);
+
 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 	if (!caching_ctl)
 		return -ENOMEM;
@@ -6555,6 +6558,19 @@  void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
 	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
 }
 
+static void __btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
+				       u64 ram_bytes, u64 num_bytes,
+				       int delalloc)
+{
+	struct btrfs_space_info *space_info = cache->space_info;
+
+	cache->reserved += num_bytes;
+	space_info->bytes_reserved += num_bytes;
+	update_bytes_may_use(space_info, -ram_bytes);
+	if (delalloc)
+		cache->delalloc_bytes += num_bytes;
+}
+
 /**
  * btrfs_add_reserved_bytes - update the block_group and space info counters
  * @cache:	The cache we are manipulating
@@ -6573,17 +6589,16 @@  static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
 	struct btrfs_space_info *space_info = cache->space_info;
 	int ret = 0;
 
+	/* should handled by find_free_extent_seq */
+	WARN_ON(cache->alloc_type == BTRFS_ALLOC_SEQ);
+
 	spin_lock(&space_info->lock);
 	spin_lock(&cache->lock);
-	if (cache->ro) {
+	if (cache->ro)
 		ret = -EAGAIN;
-	} else {
-		cache->reserved += num_bytes;
-		space_info->bytes_reserved += num_bytes;
-		update_bytes_may_use(space_info, -ram_bytes);
-		if (delalloc)
-			cache->delalloc_bytes += num_bytes;
-	}
+	else
+		__btrfs_add_reserved_bytes(cache, ram_bytes, num_bytes,
+					   delalloc);
 	spin_unlock(&cache->lock);
 	spin_unlock(&space_info->lock);
 	return ret;
@@ -6701,9 +6716,13 @@  static int unpin_extent_range(struct btrfs_fs_info *fs_info,
 			cache = btrfs_lookup_block_group(fs_info, start);
 			BUG_ON(!cache); /* Logic error */
 
-			cluster = fetch_cluster_info(fs_info,
-						     cache->space_info,
-						     &empty_cluster);
+			if (cache->alloc_type == BTRFS_ALLOC_FIT)
+				cluster = fetch_cluster_info(fs_info,
+							     cache->space_info,
+							     &empty_cluster);
+			else
+				cluster = NULL;
+
 			empty_cluster <<= 1;
 		}
 
@@ -6743,7 +6762,8 @@  static int unpin_extent_range(struct btrfs_fs_info *fs_info,
 		space_info->max_extent_size = 0;
 		percpu_counter_add_batch(&space_info->total_bytes_pinned,
 			    -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
-		if (cache->ro) {
+		if (cache->ro || cache->alloc_type == BTRFS_ALLOC_SEQ) {
+			/* need reset before reusing in ALLOC_SEQ BG */
 			space_info->bytes_readonly += len;
 			readonly = true;
 		}
@@ -7588,6 +7608,60 @@  static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
 	return 0;
 }
 
+/*
+ * Simple allocator for sequential only block group. It only allows
+ * sequential allocation. No need to play with trees. This function
+ * also reserve the bytes as in btrfs_add_reserved_bytes.
+ */
+
+static int find_free_extent_seq(struct btrfs_block_group_cache *cache,
+				struct find_free_extent_ctl *ffe_ctl)
+{
+	struct btrfs_space_info *space_info = cache->space_info;
+	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+	u64 start = cache->key.objectid;
+	u64 num_bytes = ffe_ctl->num_bytes;
+	u64 avail;
+	int ret = 0;
+
+	/* Sanity check */
+	if (cache->alloc_type != BTRFS_ALLOC_SEQ)
+		return 1;
+
+	spin_lock(&space_info->lock);
+	spin_lock(&cache->lock);
+
+	if (cache->ro) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	spin_lock(&ctl->tree_lock);
+	avail = cache->key.offset - cache->alloc_offset;
+	if (avail < num_bytes) {
+		ffe_ctl->max_extent_size = avail;
+		spin_unlock(&ctl->tree_lock);
+		ret = 1;
+		goto out;
+	}
+
+	ffe_ctl->found_offset = start + cache->alloc_offset;
+	cache->alloc_offset += num_bytes;
+	ctl->free_space -= num_bytes;
+	spin_unlock(&ctl->tree_lock);
+
+	BUG_ON(!IS_ALIGNED(ffe_ctl->found_offset,
+			   cache->fs_info->stripesize));
+	ffe_ctl->search_start = ffe_ctl->found_offset;
+	__btrfs_add_reserved_bytes(cache, ffe_ctl->ram_bytes, num_bytes,
+				   ffe_ctl->delalloc);
+
+out:
+	spin_unlock(&cache->lock);
+	spin_unlock(&space_info->lock);
+	return ret;
+}
+
 /*
  * Return >0 means caller needs to re-search for free extent
  * Return 0 means we have the needed free extent.
@@ -7889,6 +7963,16 @@  static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
 		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
 			goto loop;
 
+		if (block_group->alloc_type == BTRFS_ALLOC_SEQ) {
+			ret = find_free_extent_seq(block_group, &ffe_ctl);
+			if (ret)
+				goto loop;
+			/* btrfs_find_space_for_alloc_seq should ensure
+			 * that everything is OK and reserve the extent.
+			 */
+			goto nocheck;
+		}
+
 		/*
 		 * Ok we want to try and use the cluster allocator, so
 		 * lets look there
@@ -7944,6 +8028,7 @@  static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
 					     num_bytes);
 			goto loop;
 		}
+nocheck:
 		btrfs_inc_block_group_reservations(block_group);
 
 		/* we are all good, lets return */
@@ -9616,7 +9701,8 @@  static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 	}
 
 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
-		    cache->bytes_super - btrfs_block_group_used(&cache->item);
+		    cache->bytes_super - cache->unusable -
+		    btrfs_block_group_used(&cache->item);
 	sinfo_used = btrfs_space_info_used(sinfo, true);
 
 	if (sinfo_used + num_bytes + min_allocable_bytes <=
@@ -9766,6 +9852,7 @@  void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
 	if (!--cache->ro) {
 		num_bytes = cache->key.offset - cache->reserved -
 			    cache->pinned - cache->bytes_super -
+			    cache->unusable -
 			    btrfs_block_group_used(&cache->item);
 		sinfo->bytes_readonly -= num_bytes;
 		list_del_init(&cache->ro_list);
@@ -10200,11 +10287,240 @@  static void link_block_group(struct btrfs_block_group_cache *cache)
 	}
 }
 
+static int
+btrfs_get_block_group_alloc_offset(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct btrfs_device *device;
+	u64 logical = cache->key.objectid;
+	u64 length = cache->key.offset;
+	u64 physical = 0;
+	int ret, alloc_type;
+	int i, j;
+	u64 *alloc_offsets = NULL;
+
+#define WP_MISSING_DEV ((u64)-1)
+
+	/* Sanity check */
+	if (!IS_ALIGNED(length, fs_info->zone_size)) {
+		btrfs_err(fs_info, "unaligned block group at %llu + %llu",
+			  logical, length);
+		return -EIO;
+	}
+
+	/* Get the chunk mapping */
+	em_tree = &fs_info->mapping_tree.map_tree;
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, length);
+	read_unlock(&em_tree->lock);
+
+	if (!em)
+		return -EINVAL;
+
+	map = em->map_lookup;
+
+	/*
+	 * Get the zone type: if the group is mapped to a non-sequential zone,
+	 * there is no need for the allocation offset (fit allocation is OK).
+	 */
+	alloc_type = -1;
+	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets),
+				GFP_NOFS);
+	if (!alloc_offsets) {
+		free_extent_map(em);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < map->num_stripes; i++) {
+		int is_sequential;
+		struct blk_zone zone;
+
+		device = map->stripes[i].dev;
+		physical = map->stripes[i].physical;
+
+		if (device->bdev == NULL) {
+			alloc_offsets[i] = WP_MISSING_DEV;
+			continue;
+		}
+
+		is_sequential = btrfs_dev_is_sequential(device, physical);
+		if (alloc_type == -1)
+			alloc_type = is_sequential ?
+					BTRFS_ALLOC_SEQ : BTRFS_ALLOC_FIT;
+
+		if ((is_sequential && alloc_type != BTRFS_ALLOC_SEQ) ||
+		    (!is_sequential && alloc_type == BTRFS_ALLOC_SEQ)) {
+			btrfs_err(fs_info, "found block group of mixed zone types");
+			ret = -EIO;
+			goto out;
+		}
+
+		if (!is_sequential)
+			continue;
+
+		/* this zone will be used for allocation, so mark this
+		 * zone non-empty
+		 */
+		clear_bit(physical >> device->zone_size_shift,
+			  device->empty_zones);
+
+		/*
+		 * The group is mapped to a sequential zone. Get the zone write
+		 * pointer to determine the allocation offset within the zone.
+		 */
+		WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
+		ret = btrfs_get_dev_zone(device, physical, &zone, GFP_NOFS);
+		if (ret == -EIO || ret == -EOPNOTSUPP) {
+			ret = 0;
+			alloc_offsets[i] = WP_MISSING_DEV;
+			continue;
+		} else if (ret) {
+			goto out;
+		}
+
+
+		switch (zone.cond) {
+		case BLK_ZONE_COND_OFFLINE:
+		case BLK_ZONE_COND_READONLY:
+			btrfs_err(fs_info, "Offline/readonly zone %llu",
+				  physical >> device->zone_size_shift);
+			alloc_offsets[i] = WP_MISSING_DEV;
+			break;
+		case BLK_ZONE_COND_EMPTY:
+			alloc_offsets[i] = 0;
+			break;
+		case BLK_ZONE_COND_FULL:
+			alloc_offsets[i] = fs_info->zone_size;
+			break;
+		default:
+			/* Partially used zone */
+			alloc_offsets[i] =
+				((zone.wp - zone.start) << SECTOR_SHIFT);
+			break;
+		}
+	}
+
+	if (alloc_type == BTRFS_ALLOC_FIT)
+		goto out;
+
+	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+	case 0: /* single */
+	case BTRFS_BLOCK_GROUP_DUP:
+	case BTRFS_BLOCK_GROUP_RAID1:
+		cache->alloc_offset = WP_MISSING_DEV;
+		for (i = 0; i < map->num_stripes; i++) {
+			if (alloc_offsets[i] == WP_MISSING_DEV)
+				continue;
+			if (cache->alloc_offset == WP_MISSING_DEV)
+				cache->alloc_offset = alloc_offsets[i];
+			if (alloc_offsets[i] == cache->alloc_offset)
+				continue;
+
+			btrfs_err(fs_info,
+				  "write pointer mismatch: block group %llu",
+				  logical);
+			cache->wp_broken = 1;
+		}
+		break;
+	case BTRFS_BLOCK_GROUP_RAID0:
+		cache->alloc_offset = 0;
+		for (i = 0; i < map->num_stripes; i++) {
+			if (alloc_offsets[i] == WP_MISSING_DEV) {
+				btrfs_err(fs_info,
+					  "cannot recover write pointer: block group %llu",
+					  logical);
+				cache->wp_broken = 1;
+				continue;
+			}
+
+			if (alloc_offsets[0] < alloc_offsets[i]) {
+				btrfs_err(fs_info,
+					  "write pointer mismatch: block group %llu",
+					  logical);
+				cache->wp_broken = 1;
+				continue;
+			}
+
+			cache->alloc_offset += alloc_offsets[i];
+		}
+		break;
+	case BTRFS_BLOCK_GROUP_RAID10:
+		/*
+		 * Pass1: check write pointer of RAID1 level: each pointer
+		 * should be equal.
+		 */
+		for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
+			int base = i*map->sub_stripes;
+			u64 offset = WP_MISSING_DEV;
+
+			for (j = 0; j < map->sub_stripes; j++) {
+				if (alloc_offsets[base+j] == WP_MISSING_DEV)
+					continue;
+				if (offset == WP_MISSING_DEV)
+					offset = alloc_offsets[base+j];
+				if (alloc_offsets[base+j] == offset)
+					continue;
+
+				btrfs_err(fs_info,
+					  "write pointer mismatch: block group %llu",
+					  logical);
+				cache->wp_broken = 1;
+			}
+			for (j = 0; j < map->sub_stripes; j++)
+				alloc_offsets[base+j] = offset;
+		}
+
+		/* Pass2: check write pointer of RAID1 level */
+		cache->alloc_offset = 0;
+		for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
+			int base = i*map->sub_stripes;
+
+			if (alloc_offsets[base] == WP_MISSING_DEV) {
+				btrfs_err(fs_info,
+					  "cannot recover write pointer: block group %llu",
+					  logical);
+				cache->wp_broken = 1;
+				continue;
+			}
+
+			if (alloc_offsets[0] < alloc_offsets[base]) {
+				btrfs_err(fs_info,
+					  "write pointer mismatch: block group %llu",
+					  logical);
+				cache->wp_broken = 1;
+				continue;
+			}
+
+			cache->alloc_offset += alloc_offsets[base];
+		}
+		break;
+	case BTRFS_BLOCK_GROUP_RAID5:
+	case BTRFS_BLOCK_GROUP_RAID6:
+		/* RAID5/6 is not supported yet */
+	default:
+		btrfs_err(fs_info, "Unsupported profile on HMZONED %llu",
+			map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+		ret = -EINVAL;
+		goto out;
+	}
+
+out:
+	cache->alloc_type = alloc_type;
+	kfree(alloc_offsets);
+	free_extent_map(em);
+
+	return ret;
+}
+
 static struct btrfs_block_group_cache *
 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
 			       u64 start, u64 size)
 {
 	struct btrfs_block_group_cache *cache;
+	int ret;
 
 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
 	if (!cache)
@@ -10238,6 +10554,16 @@  btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
 	atomic_set(&cache->trimming, 0);
 	mutex_init(&cache->free_space_lock);
 	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
+	cache->alloc_type = BTRFS_ALLOC_FIT;
+	cache->alloc_offset = 0;
+
+	if (btrfs_fs_incompat(fs_info, HMZONED)) {
+		ret = btrfs_get_block_group_alloc_offset(cache);
+		if (ret) {
+			kfree(cache);
+			return NULL;
+		}
+	}
 
 	return cache;
 }
@@ -10310,6 +10636,7 @@  int btrfs_read_block_groups(struct btrfs_fs_info *info)
 	int need_clear = 0;
 	u64 cache_gen;
 	u64 feature;
+	u64 unusable;
 	int mixed;
 
 	feature = btrfs_super_incompat_flags(info->super_copy);
@@ -10415,6 +10742,26 @@  int btrfs_read_block_groups(struct btrfs_fs_info *info)
 			free_excluded_extents(cache);
 		}
 
+		switch (cache->alloc_type) {
+		case BTRFS_ALLOC_FIT:
+			unusable = cache->bytes_super;
+			break;
+		case BTRFS_ALLOC_SEQ:
+			WARN_ON(cache->bytes_super != 0);
+			unusable = cache->alloc_offset -
+				btrfs_block_group_used(&cache->item);
+			/* we only need ->free_space in ALLOC_SEQ BGs */
+			cache->last_byte_to_unpin = (u64)-1;
+			cache->cached = BTRFS_CACHE_FINISHED;
+			cache->free_space_ctl->free_space =
+				cache->key.offset - cache->alloc_offset;
+			cache->unusable = unusable;
+			free_excluded_extents(cache);
+			break;
+		default:
+			BUG();
+		}
+
 		ret = btrfs_add_block_group_cache(info, cache);
 		if (ret) {
 			btrfs_remove_free_space_cache(cache);
@@ -10425,7 +10772,7 @@  int btrfs_read_block_groups(struct btrfs_fs_info *info)
 		trace_btrfs_add_block_group(info, cache, 0);
 		update_space_info(info, cache->flags, found_key.offset,
 				  btrfs_block_group_used(&cache->item),
-				  cache->bytes_super, &space_info);
+				  unusable, &space_info);
 
 		cache->space_info = space_info;
 
@@ -10438,6 +10785,9 @@  int btrfs_read_block_groups(struct btrfs_fs_info *info)
 			ASSERT(list_empty(&cache->bg_list));
 			btrfs_mark_bg_unused(cache);
 		}
+
+		if (cache->wp_broken)
+			inc_block_group_ro(cache, 1);
 	}
 
 	list_for_each_entry_rcu(space_info, &info->space_info, list) {
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f74dc259307b..cc69dc71f4c1 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2326,8 +2326,11 @@  int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
 			   u64 offset, u64 bytes)
 {
 	struct btrfs_free_space *info;
+	struct btrfs_block_group_cache *block_group = ctl->private;
 	int ret = 0;
 
+	WARN_ON(block_group && block_group->alloc_type == BTRFS_ALLOC_SEQ);
+
 	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
 	if (!info)
 		return -ENOMEM;
@@ -2376,6 +2379,28 @@  int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
+int __btrfs_add_free_space_seq(struct btrfs_block_group_cache *block_group,
+			       u64 bytenr, u64 size)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	u64 offset = bytenr - block_group->key.objectid;
+	u64 to_free, to_unusable;
+
+	spin_lock(&ctl->tree_lock);
+	if (offset >= block_group->alloc_offset)
+		to_free = size;
+	else if (offset + size <= block_group->alloc_offset)
+		to_free = 0;
+	else
+		to_free = offset + size - block_group->alloc_offset;
+	to_unusable = size - to_free;
+	ctl->free_space += to_free;
+	block_group->unusable += to_unusable;
+	spin_unlock(&ctl->tree_lock);
+	return 0;
+
+}
+
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			    u64 offset, u64 bytes)
 {
@@ -2384,6 +2409,8 @@  int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 	int ret;
 	bool re_search = false;
 
+	WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
+
 	spin_lock(&ctl->tree_lock);
 
 again:
@@ -2619,6 +2646,8 @@  u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 	u64 align_gap = 0;
 	u64 align_gap_len = 0;
 
+	WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
+
 	spin_lock(&ctl->tree_lock);
 	entry = find_free_space(ctl, &offset, &bytes_search,
 				block_group->full_stripe_len, max_extent_size);
@@ -2738,6 +2767,8 @@  u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 	struct rb_node *node;
 	u64 ret = 0;
 
+	WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
+
 	spin_lock(&cluster->lock);
 	if (bytes > cluster->max_size)
 		goto out;
@@ -3384,6 +3415,8 @@  int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 {
 	int ret;
 
+	WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
+
 	*trimmed = 0;
 
 	spin_lock(&block_group->lock);
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 8760acb55ffd..d30667784f73 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -73,10 +73,15 @@  void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
 int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
 			   struct btrfs_free_space_ctl *ctl,
 			   u64 bytenr, u64 size);
+int __btrfs_add_free_space_seq(struct btrfs_block_group_cache *block_group,
+			       u64 bytenr, u64 size);
 static inline int
 btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 		     u64 bytenr, u64 size)
 {
+	if (block_group->alloc_type == BTRFS_ALLOC_SEQ)
+		return __btrfs_add_free_space_seq(block_group, bytenr, size);
+
 	return __btrfs_add_free_space(block_group->fs_info,
 				      block_group->free_space_ctl,
 				      bytenr, size);