
btrfs: zoned: fix mounting with conventional zones

Message ID 0fef711e5ed1d9df00a5a749aab9e3cd3fe4c14c.1662042048.git.johannes.thumshirn@wdc.com (mailing list archive)
State New, archived
Series btrfs: zoned: fix mounting with conventional zones

Commit Message

Johannes Thumshirn Sept. 1, 2022, 2:21 p.m. UTC
Since commit 6a921de58992 ("btrfs: zoned: introduce
space_info->active_total_bytes"), we're only counting the bytes of a
block-group on an active zone as usable for metadata writes. But on an SMR
drive, we don't have active zones and short-circuit some of the logic.

This leads to an error on mount, because we cannot reserve space for
metadata writes.

Fix this by also setting the BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE bit in the
block-group's runtime flags if the zone is a conventional zone.

Fixes: 6a921de58992 ("btrfs: zoned: introduce space_info->active_total_bytes")
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/zoned.c | 68 ++++++++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 31 deletions(-)

Comments

Naohiro Aota Sept. 2, 2022, 12:41 p.m. UTC | #1
On Thu, Sep 01, 2022 at 07:21:07AM -0700, Johannes Thumshirn wrote:
> Since commit 6a921de58992 ("btrfs: zoned: introduce
> space_info->active_total_bytes"), we're only counting the bytes of a
> block-group on an active zone as usable for metadata writes. But on an SMR
> drive, we don't have active zones and short-circuit some of the logic.
> 
> This leads to an error on mount, because we cannot reserve space for
> metadata writes.
> 
> Fix this by also setting the BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE bit in the
> block-group's runtime flags if the zone is a conventional zone.
> 
> Fixes: 6a921de58992 ("btrfs: zoned: introduce space_info->active_total_bytes")
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>  fs/btrfs/zoned.c | 68 ++++++++++++++++++++++++++----------------------
>  1 file changed, 37 insertions(+), 31 deletions(-)
> 
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index e12c0ca509fb..364f39decb4e 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -1187,7 +1187,7 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
>   * offset.
>   */
>  static int calculate_alloc_pointer(struct btrfs_block_group *cache,
> -				   u64 *offset_ret)
> +				   u64 *offset_ret, bool new)
>  {
>  	struct btrfs_fs_info *fs_info = cache->fs_info;
>  	struct btrfs_root *root;
> @@ -1197,6 +1197,21 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
>  	int ret;
>  	u64 length;
>  
> +	/*
> +	 * Avoid tree lookups for a new BG. They are of no use, as the alloc
> +	 * offset of a new BG must always be 0.
> +	 *
> +	 * Also, we have a lock chain of extent buffer lock -> chunk mutex.
> +	 * For a new BG, this function is called from btrfs_make_block_group()
> +	 * which is already taking the chunk mutex. Thus, we cannot call
> +	 * calculate_alloc_pointer() which takes extent buffer locks to avoid
> +	 * deadlock.
> +	 */
> +	if (new) {
> +		*offset_ret = 0;
> +		return 0;
> +	}
> +
>  	path = btrfs_alloc_path();
>  	if (!path)
>  		return -ENOMEM;
> @@ -1332,6 +1347,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
>  		else
>  			num_conventional++;
>  
> +		/*
> +		 * Consider a zone as active if we can allow any number of
> +		 * active zones.
> +		 */
> +		if (!device->zone_info->max_active_zones)
> +			__set_bit(i, active);
> +
>  		if (!is_sequential) {
>  			alloc_offsets[i] = WP_CONVENTIONAL;
>  			continue;
> @@ -1398,45 +1420,29 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
>  			__set_bit(i, active);
>  			break;
>  		}
> -
> -		/*
> -		 * Consider a zone as active if we can allow any number of
> -		 * active zones.
> -		 */
> -		if (!device->zone_info->max_active_zones)
> -			__set_bit(i, active);
>  	}
>  
>  	if (num_sequential > 0)
>  		cache->seq_zone = true;
>  
>  	if (num_conventional > 0) {
> -		/*
> -		 * Avoid calling calculate_alloc_pointer() for new BG. It
> -		 * is no use for new BG. It must be always 0.
> -		 *
> -		 * Also, we have a lock chain of extent buffer lock ->
> -		 * chunk mutex.  For new BG, this function is called from
> -		 * btrfs_make_block_group() which is already taking the
> -		 * chunk mutex. Thus, we cannot call
> -		 * calculate_alloc_pointer() which takes extent buffer
> -		 * locks to avoid deadlock.
> -		 */
> -
>  		/* Zone capacity is always zone size in emulation */
>  		cache->zone_capacity = cache->length;
> -		if (new) {
> -			cache->alloc_offset = 0;
> +		ret = calculate_alloc_pointer(cache, &last_alloc, new);
> +		if (ret) {
> +			btrfs_err(fs_info,
> +			  "zoned: failed to determine allocation offset of bg %llu",
> +				  cache->start);
>  			goto out;
> -		}
> -		ret = calculate_alloc_pointer(cache, &last_alloc);
> -		if (ret || map->num_stripes == num_conventional) {
> -			if (!ret)
> -				cache->alloc_offset = last_alloc;
> -			else
> -				btrfs_err(fs_info,
> -			"zoned: failed to determine allocation offset of bg %llu",
> -					  cache->start);
> +		} else if (map->num_stripes == num_conventional) {
> +			cache->alloc_offset = last_alloc;
> +			set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
> +				&cache->runtime_flags);
> +			btrfs_get_block_group(cache);
> +			spin_lock(&fs_info->zone_active_bgs_lock);
> +			list_add_tail(&cache->active_bg_list,
> +				      &fs_info->zone_active_bgs);
> +			spin_unlock(&fs_info->zone_active_bgs_lock);

Instead of duplicating the list manipulation code, how about moving the
"out" label one section above? So, it looks like this.

out:
	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
...


Or, it might even be better to move the list manipulation part here, so that
it looks like this.

	if (!ret) {
		cache->meta_write_pointer = cache->alloc_offset + cache->start;
		if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
			btrfs_get_block_group(cache);
			spin_lock(&fs_info->zone_active_bgs_lock);
			list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
			spin_unlock(&fs_info->zone_active_bgs_lock);
		}
	} else {
		kfree(cache->physical_map);
		cache->physical_map = NULL;
	}

>  			goto out;
>  		}
>  	}
> -- 
> 2.37.2
>
David Sterba Sept. 2, 2022, 12:47 p.m. UTC | #2
On Thu, Sep 01, 2022 at 07:21:07AM -0700, Johannes Thumshirn wrote:
> Since commit 6a921de58992 ("btrfs: zoned: introduce
> space_info->active_total_bytes"), we're only counting the bytes of a
> block-group on an active zone as usable for metadata writes. But on an SMR
> drive, we don't have active zones and short-circuit some of the logic.
> 
> This leads to an error on mount, because we cannot reserve space for
> metadata writes.
> 
> Fix this by also setting the BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE bit in the
> block-group's runtime flags if the zone is a conventional zone.
> 
> Fixes: 6a921de58992 ("btrfs: zoned: introduce space_info->active_total_bytes")
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>

Added to misc-next, thanks.
Johannes Thumshirn Sept. 2, 2022, 3:14 p.m. UTC | #3
On 02.09.22 14:41, Naohiro Aota wrote:
> Instead of duplicating the list manipulation code, how about moving the
> "out" label one section above? So, it looks like this.
> 
> out:
> 	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
> ...
> 

I did have that before but didn't really like it.

> Or, it might even be better to move the list manipulation part here, so that
> it looks like this.
> 
> 	if (!ret) {
> 		cache->meta_write_pointer = cache->alloc_offset + cache->start;
> 		if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
> 			btrfs_get_block_group(cache);
> 			spin_lock(&fs_info->zone_active_bgs_lock);
> 			list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
> 			spin_unlock(&fs_info->zone_active_bgs_lock);
> 		}
> 	} else {
> 		kfree(cache->physical_map);
> 		cache->physical_map = NULL;
> 	}
> 
>>  			goto out;

That makes sense. I'll add a follow-up and David can decide if he wants to fold it in
or leave it as a separate cleanup.
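
(For reference, an untested sketch of what that follow-up could look like at the end of
btrfs_load_block_group_zone_info(), essentially applying Naohiro's suggestion above;
the exact surrounding context in the function may differ.)

	if (!ret) {
		cache->meta_write_pointer = cache->alloc_offset + cache->start;
		/* Queue any active block group here, regardless of zone type. */
		if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
			btrfs_get_block_group(cache);
			spin_lock(&fs_info->zone_active_bgs_lock);
			list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
			spin_unlock(&fs_info->zone_active_bgs_lock);
		}
	} else {
		/* On error, don't keep the copied chunk mapping around. */
		kfree(cache->physical_map);
		cache->physical_map = NULL;
	}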

Patch

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index e12c0ca509fb..364f39decb4e 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1187,7 +1187,7 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
  * offset.
  */
 static int calculate_alloc_pointer(struct btrfs_block_group *cache,
-				   u64 *offset_ret)
+				   u64 *offset_ret, bool new)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	struct btrfs_root *root;
@@ -1197,6 +1197,21 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
 	int ret;
 	u64 length;
 
+	/*
+	 * Avoid tree lookups for a new BG. They are of no use, as the alloc
+	 * offset of a new BG must always be 0.
+	 *
+	 * Also, we have a lock chain of extent buffer lock -> chunk mutex.
+	 * For a new BG, this function is called from btrfs_make_block_group()
+	 * which is already taking the chunk mutex. Thus, we cannot call
+	 * calculate_alloc_pointer() which takes extent buffer locks to avoid
+	 * deadlock.
+	 */
+	if (new) {
+		*offset_ret = 0;
+		return 0;
+	}
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -1332,6 +1347,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 		else
 			num_conventional++;
 
+		/*
+		 * Consider a zone as active if we can allow any number of
+		 * active zones.
+		 */
+		if (!device->zone_info->max_active_zones)
+			__set_bit(i, active);
+
 		if (!is_sequential) {
 			alloc_offsets[i] = WP_CONVENTIONAL;
 			continue;
@@ -1398,45 +1420,29 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 			__set_bit(i, active);
 			break;
 		}
-
-		/*
-		 * Consider a zone as active if we can allow any number of
-		 * active zones.
-		 */
-		if (!device->zone_info->max_active_zones)
-			__set_bit(i, active);
 	}
 
 	if (num_sequential > 0)
 		cache->seq_zone = true;
 
 	if (num_conventional > 0) {
-		/*
-		 * Avoid calling calculate_alloc_pointer() for new BG. It
-		 * is no use for new BG. It must be always 0.
-		 *
-		 * Also, we have a lock chain of extent buffer lock ->
-		 * chunk mutex.  For new BG, this function is called from
-		 * btrfs_make_block_group() which is already taking the
-		 * chunk mutex. Thus, we cannot call
-		 * calculate_alloc_pointer() which takes extent buffer
-		 * locks to avoid deadlock.
-		 */
-
 		/* Zone capacity is always zone size in emulation */
 		cache->zone_capacity = cache->length;
-		if (new) {
-			cache->alloc_offset = 0;
+		ret = calculate_alloc_pointer(cache, &last_alloc, new);
+		if (ret) {
+			btrfs_err(fs_info,
+			  "zoned: failed to determine allocation offset of bg %llu",
+				  cache->start);
 			goto out;
-		}
-		ret = calculate_alloc_pointer(cache, &last_alloc);
-		if (ret || map->num_stripes == num_conventional) {
-			if (!ret)
-				cache->alloc_offset = last_alloc;
-			else
-				btrfs_err(fs_info,
-			"zoned: failed to determine allocation offset of bg %llu",
-					  cache->start);
+		} else if (map->num_stripes == num_conventional) {
+			cache->alloc_offset = last_alloc;
+			set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+				&cache->runtime_flags);
+			btrfs_get_block_group(cache);
+			spin_lock(&fs_info->zone_active_bgs_lock);
+			list_add_tail(&cache->active_bg_list,
+				      &fs_info->zone_active_bgs);
+			spin_unlock(&fs_info->zone_active_bgs_lock);
 			goto out;
 		}
 	}