
[v9,15/41] btrfs: emulate write pointer for conventional zones

Message ID: af1830174f9dd9e2651dab213c0b984d90811138.1604065695.git.naohiro.aota@wdc.com
State: New, archived
Series: btrfs: zoned block device support

Commit Message

Naohiro Aota Oct. 30, 2020, 1:51 p.m. UTC
Conventional zones do not have a write pointer, so we cannot use it to
determine the allocation offset if a block group contains a conventional
zone.

Instead, we can use the end of the last allocated extent in the block
group as the allocation offset.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/zoned.c | 119 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 113 insertions(+), 6 deletions(-)
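To make the idea concrete (an illustrative example, not taken from the
patch): for a block group spanning [1 GiB, 1 GiB + 256 MiB) whose last
allocated item is a 16 MiB data extent starting at 1 GiB + 100 MiB, the
emulated write pointer is 100 MiB + 16 MiB = 116 MiB into the block
group. A block group containing no allocated extents gets offset 0.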

Comments

Josef Bacik Nov. 2, 2020, 8:37 p.m. UTC | #1
On 10/30/20 9:51 AM, Naohiro Aota wrote:
> Conventional zones do not have a write pointer, so we cannot use it to
> determine the allocation offset if a block group contains a conventional
> zone.
> 
> Instead, we can use the end of the last allocated extent in the block
> group as the allocation offset.
> 
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> ---
>   fs/btrfs/zoned.c | 119 ++++++++++++++++++++++++++++++++++++++++++++---
>   1 file changed, 113 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index 0aa821893a51..8f58d0853cc3 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -740,6 +740,104 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
>   	return 0;
>   }
>   
> +static int emulate_write_pointer(struct btrfs_block_group *cache,
> +				 u64 *offset_ret)
> +{
> +	struct btrfs_fs_info *fs_info = cache->fs_info;
> +	struct btrfs_root *root = fs_info->extent_root;
> +	struct btrfs_path *path;
> +	struct extent_buffer *leaf;
> +	struct btrfs_key search_key;
> +	struct btrfs_key found_key;
> +	int slot;
> +	int ret;
> +	u64 length;
> +
> +	path = btrfs_alloc_path();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	search_key.objectid = cache->start + cache->length;
> +	search_key.type = 0;
> +	search_key.offset = 0;
> +

You can just use 'key', don't have to use 'search_key'.

Also you don't check for things like BTRFS_TREE_BLOCK_REF_KEY or whatever in the 
case that we don't have an inline extent ref, so this could error out on a fs 
with lots of snapshots and different references.  What you need is to search 
back until you hit a BTRFS_METADATA_ITEM_KEY or a BTRFS_EXTENT_ITEM_KEY, and 
then check the offset of that thing.  Otherwise this will screw up.  Thanks,

Josef
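
Concretely, the backward search Josef describes might look roughly like
the loop below (an illustrative sketch reusing the locals of the posted
function; empty-leaf corner cases are glossed over):

	/*
	 * Walk backward from the end of the block group until an extent
	 * item or a metadata item is found; backref items sharing the
	 * same objectid are simply skipped.
	 */
	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;	/* No previous item at all. */
			if (path->slots[0] ==
			    btrfs_header_nritems(path->nodes[0]))
				path->slots[0]--;
		} else {
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		if (found_key.objectid < cache->start)
			break;	/* Nothing allocated in this block group. */
		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
		    found_key.type == BTRFS_METADATA_ITEM_KEY) {
			length = found_key.type == BTRFS_EXTENT_ITEM_KEY ?
				 found_key.offset : fs_info->nodesize;
			*offset_ret = found_key.objectid + length -
				      cache->start;
			goto out;
		}
	}
	*offset_ret = 0;
	ret = 0;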
Naohiro Aota Nov. 3, 2020, 1:25 a.m. UTC | #2
On Mon, Nov 02, 2020 at 03:37:35PM -0500, Josef Bacik wrote:
>On 10/30/20 9:51 AM, Naohiro Aota wrote:
>>Conventional zones do not have a write pointer, so we cannot use it to
>>determine the allocation offset if a block group contains a conventional
>>zone.
>>
>>Instead, we can use the end of the last allocated extent in the block
>>group as the allocation offset.
>>
>>Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
>>---
>>  fs/btrfs/zoned.c | 119 ++++++++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 113 insertions(+), 6 deletions(-)
>>
>>diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
>>index 0aa821893a51..8f58d0853cc3 100644
>>--- a/fs/btrfs/zoned.c
>>+++ b/fs/btrfs/zoned.c
>>@@ -740,6 +740,104 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
>>  	return 0;
>>  }
>>+static int emulate_write_pointer(struct btrfs_block_group *cache,
>>+				 u64 *offset_ret)
>>+{
>>+	struct btrfs_fs_info *fs_info = cache->fs_info;
>>+	struct btrfs_root *root = fs_info->extent_root;
>>+	struct btrfs_path *path;
>>+	struct extent_buffer *leaf;
>>+	struct btrfs_key search_key;
>>+	struct btrfs_key found_key;
>>+	int slot;
>>+	int ret;
>>+	u64 length;
>>+
>>+	path = btrfs_alloc_path();
>>+	if (!path)
>>+		return -ENOMEM;
>>+
>>+	search_key.objectid = cache->start + cache->length;
>>+	search_key.type = 0;
>>+	search_key.offset = 0;
>>+
>
>You can just use 'key', don't have to use 'search_key'.

Fixed.

>
>Also you don't check for things like BTRFS_TREE_BLOCK_REF_KEY or 
>whatever in the case that we don't have an inline extent ref, so this 
>could error out on a fs with lots of snapshots and different 
>references.  What you need is to search back until you hit a 
>BTRFS_METADATA_ITEM_KEY or a BTRFS_EXTENT_ITEM_KEY, and then check the 
>offset of that thing.  Otherwise this will screw up.  Thanks,
>
>Josef

I overlooked that case. I also just noticed we can use
btrfs_previous_extent_item() here. It looks much simpler now. Thanks.
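
For illustration, a lookup built on btrfs_previous_extent_item() — which
walks backward to the previous EXTENT_ITEM or METADATA_ITEM, skipping
backref items — could end up roughly like this (a sketch, not the
revised patch itself):

static int emulate_write_pointer(struct btrfs_block_group *cache,
				 u64 *offset_ret)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 length;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Search for an artificial key just past the block group. */
	key.objectid = cache->start + cache->length;
	key.type = 0;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	/* An exact match for this key would indicate corruption. */
	if (!ret)
		ret = -EUCLEAN;
	if (ret < 0)
		goto out;

	ret = btrfs_previous_extent_item(root, path, cache->start);
	if (ret) {
		if (ret == 1) {
			/* No allocated extent in this block group. */
			ret = 0;
			*offset_ret = 0;
		}
		goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
		length = found_key.offset;
	else
		length = fs_info->nodesize;

	if (!(found_key.objectid >= cache->start &&
	      found_key.objectid + length <= cache->start + cache->length)) {
		ret = -EUCLEAN;
		goto out;
	}
	*offset_ret = found_key.objectid + length - cache->start;
	ret = 0;

out:
	btrfs_free_path(path);
	return ret;
}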
David Sterba Nov. 3, 2020, 1:32 p.m. UTC | #3
On Fri, Oct 30, 2020 at 10:51:22PM +0900, Naohiro Aota wrote:
> +	struct btrfs_key found_key;
> +	int slot;
> +	int ret;
> +	u64 length;
> +
> +	path = btrfs_alloc_path();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	search_key.objectid = cache->start + cache->length;
> +	search_key.type = 0;
> +	search_key.offset = 0;
> +
> +	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
> +	if (ret < 0)
> +		goto out;
> +	ASSERT(ret != 0);

This should be a runtime check, not an assert; if this happens it's
likely corruption.

> +	slot = path->slots[0];
> +	leaf = path->nodes[0];
> +	ASSERT(slot != 0);

Same
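
For instance, both could become explicit corruption checks, roughly (a
sketch; -EUCLEAN chosen to match the rest of the function):

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	/*
	 * The search key lies past the end of the block group, so an
	 * exact match means the extent tree is corrupted.
	 */
	if (ret == 0) {
		ret = -EUCLEAN;
		goto out;
	}
	/*
	 * Slot 0 would mean there is nothing before the search key, not
	 * even this block group's own item, so treat that as corruption
	 * too.
	 */
	if (path->slots[0] == 0) {
		ret = -EUCLEAN;
		goto out;
	}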

> +	slot--;
> +	btrfs_item_key_to_cpu(leaf, &found_key, slot);
> +
> +	if (found_key.objectid < cache->start) {
> +		*offset_ret = 0;
> +	} else if (found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
> +		struct btrfs_key extent_item_key;
> +
> +		if (found_key.objectid != cache->start) {
> +			ret = -EUCLEAN;
> +			goto out;
> +		}
> +
> +		length = 0;
> +
> +		/* metadata may have METADATA_ITEM_KEY */

		/* Metadata ... */

> +		if (slot == 0) {
> +			btrfs_set_path_blocking(path);
> +			ret = btrfs_prev_leaf(root, path);
> +			if (ret < 0)
> +				goto out;
> +			if (ret == 0) {
> +				slot = btrfs_header_nritems(leaf) - 1;
> +				btrfs_item_key_to_cpu(leaf, &extent_item_key,
> +						      slot);
> +			}
> +		} else {
> +			btrfs_item_key_to_cpu(leaf, &extent_item_key, slot - 1);
> +			ret = 0;
> +		}
> +
> +		if (ret == 0 &&
> +		    extent_item_key.objectid == cache->start) {
> +			if (extent_item_key.type == BTRFS_METADATA_ITEM_KEY)

			if (...) {

> +				length = fs_info->nodesize;

			} else if (...) {

> +			else if (extent_item_key.type == BTRFS_EXTENT_ITEM_KEY)
> +				length = extent_item_key.offset;

			} else {

> +			else {
> +				ret = -EUCLEAN;
> +				goto out;
> +			}
> +		}
> +
> +		*offset_ret = length;
> +	} else if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
> +		   found_key.type == BTRFS_METADATA_ITEM_KEY) {
> +
> +		if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
> +			length = found_key.offset;
> +		else
> +			length = fs_info->nodesize;
> +
> +		if (!(found_key.objectid >= cache->start &&
> +		       found_key.objectid + length <=
> +		       cache->start + cache->length)) {
> +			ret = -EUCLEAN;
> +			goto out;
> +		}
> +		*offset_ret = found_key.objectid + length - cache->start;
> +	} else {
> +		ret = -EUCLEAN;
> +		goto out;
> +	}
> +	ret = 0;
> +
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
>  int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache)
>  {
>  	struct btrfs_fs_info *fs_info = cache->fs_info;
> @@ -754,6 +852,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache)
>  	int i;
>  	unsigned int nofs_flag;
>  	u64 *alloc_offsets = NULL;
> +	u64 emulated_offset = 0;
>  	u32 num_sequential = 0, num_conventional = 0;
>  
>  	if (!btrfs_is_zoned(fs_info))
> @@ -854,12 +953,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache)
>  	}
>  
>  	if (num_conventional > 0) {
> -		/*
> -		 * Since conventional zones does not have write pointer, we
> -		 * cannot determine alloc_offset from the pointer
> -		 */
> -		ret = -EINVAL;
> -		goto out;
> +		ret = emulate_write_pointer(cache, &emulated_offset);
> +		if (ret || map->num_stripes == num_conventional) {
> +			if (!ret)
> +				cache->alloc_offset = emulated_offset;
> +			goto out;
> +		}
>  	}
>  
>  	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
> @@ -881,6 +980,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache)
>  	}
>  
>  out:
> +	/* an extent is allocated after the write pointer */

	/* An ... */

> +	if (num_conventional && emulated_offset > cache->alloc_offset) {
> +		btrfs_err(fs_info,
> +			  "got wrong write pointer in BG %llu: %llu > %llu",

		"zoned: got wrong write pointer in block group %llu found %llu emulated %llu"

> +			  logical, emulated_offset, cache->alloc_offset);
> +		ret = -EIO;
> +	}
> +
>  	kfree(alloc_offsets);
>  	free_extent_map(em);
>  
> -- 
> 2.27.0

Patch

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 0aa821893a51..8f58d0853cc3 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -740,6 +740,104 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
 	return 0;
 }
 
+static int emulate_write_pointer(struct btrfs_block_group *cache,
+				 u64 *offset_ret)
+{
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key search_key;
+	struct btrfs_key found_key;
+	int slot;
+	int ret;
+	u64 length;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	search_key.objectid = cache->start + cache->length;
+	search_key.type = 0;
+	search_key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	ASSERT(ret != 0);
+	slot = path->slots[0];
+	leaf = path->nodes[0];
+	ASSERT(slot != 0);
+	slot--;
+	btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+	if (found_key.objectid < cache->start) {
+		*offset_ret = 0;
+	} else if (found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+		struct btrfs_key extent_item_key;
+
+		if (found_key.objectid != cache->start) {
+			ret = -EUCLEAN;
+			goto out;
+		}
+
+		length = 0;
+
+		/* metadata may have METADATA_ITEM_KEY */
+		if (slot == 0) {
+			btrfs_set_path_blocking(path);
+			ret = btrfs_prev_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			if (ret == 0) {
+				slot = btrfs_header_nritems(leaf) - 1;
+				btrfs_item_key_to_cpu(leaf, &extent_item_key,
+						      slot);
+			}
+		} else {
+			btrfs_item_key_to_cpu(leaf, &extent_item_key, slot - 1);
+			ret = 0;
+		}
+
+		if (ret == 0 &&
+		    extent_item_key.objectid == cache->start) {
+			if (extent_item_key.type == BTRFS_METADATA_ITEM_KEY)
+				length = fs_info->nodesize;
+			else if (extent_item_key.type == BTRFS_EXTENT_ITEM_KEY)
+				length = extent_item_key.offset;
+			else {
+				ret = -EUCLEAN;
+				goto out;
+			}
+		}
+
+		*offset_ret = length;
+	} else if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
+		   found_key.type == BTRFS_METADATA_ITEM_KEY) {
+
+		if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
+			length = found_key.offset;
+		else
+			length = fs_info->nodesize;
+
+		if (!(found_key.objectid >= cache->start &&
+		       found_key.objectid + length <=
+		       cache->start + cache->length)) {
+			ret = -EUCLEAN;
+			goto out;
+		}
+		*offset_ret = found_key.objectid + length - cache->start;
+	} else {
+		ret = -EUCLEAN;
+		goto out;
+	}
+	ret = 0;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -754,6 +852,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache)
 	int i;
 	unsigned int nofs_flag;
 	u64 *alloc_offsets = NULL;
+	u64 emulated_offset = 0;
 	u32 num_sequential = 0, num_conventional = 0;
 
 	if (!btrfs_is_zoned(fs_info))
@@ -854,12 +953,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache)
 	}
 
 	if (num_conventional > 0) {
-		/*
-		 * Since conventional zones does not have write pointer, we
-		 * cannot determine alloc_offset from the pointer
-		 */
-		ret = -EINVAL;
-		goto out;
+		ret = emulate_write_pointer(cache, &emulated_offset);
+		if (ret || map->num_stripes == num_conventional) {
+			if (!ret)
+				cache->alloc_offset = emulated_offset;
+			goto out;
+		}
 	}
 
 	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
@@ -881,6 +980,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache)
 	}
 
 out:
+	/* an extent is allocated after the write pointer */
+	if (num_conventional && emulated_offset > cache->alloc_offset) {
+		btrfs_err(fs_info,
+			  "got wrong write pointer in BG %llu: %llu > %llu",
+			  logical, emulated_offset, cache->alloc_offset);
+		ret = -EIO;
+	}
+
 	kfree(alloc_offsets);
 	free_extent_map(em);