diff mbox series

[RFC] btrfs: zoned: make auto-reclaim less aggressive

Message ID 6e2f241b0f43111efd6fe42d736a90275bb985a9.1644587521.git.johannes.thumshirn@wdc.com (mailing list archive)
State New, archived
Headers show
Series [RFC] btrfs: zoned: make auto-reclaim less aggressive | expand

Commit Message

Johannes Thumshirn Feb. 11, 2022, 1:54 p.m. UTC
The current auto-reclaim algorithm starts reclaiming all block groups
with a zone_unusable value above a configured threshold. This causes a
lot of reclaim IO even if there are enough free zones on the device.

Instead of only accounting a block-group's zone_unusable value, also take
the number of empty zones into account.

Cc: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>

---

RFC because I'm a bit unsure about the user interface. Should we use the
same value / sysfs file for both the number of non-empty zones and the
number of zone_unusable bytes per block_group or add another knob to fine
tune?

 fs/btrfs/block-group.c |  3 +++
 fs/btrfs/zoned.c       | 29 +++++++++++++++++++++++++++++
 fs/btrfs/zoned.h       |  6 ++++++
 3 files changed, 38 insertions(+)

Comments

Naohiro Aota Feb. 14, 2022, 11:34 a.m. UTC | #1
On Fri, Feb 11, 2022 at 05:54:02AM -0800, Johannes Thumshirn wrote:
> The current auto-reclaim algorithm starts reclaiming all block-group's
> with a zone_unusable value above a configured threshold. This is causing a
> lot of reclaim IO even if there would be enough free zones on the device.
> 
> Instead of only accounting a block-group's zone_unusable value, also take
> the number of empty zones into account.
> 
> Cc: Josef Bacik <josef@toxicpanda.com>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> 
> ---
> 
> RFC because I'm a bit unsure about the user interface. Should we use the
> same value / sysfs file for both the number of non-empty zones and the
> number of zone_unusable bytes per block_group or add another knob to fine
> tune?
> 
>  fs/btrfs/block-group.c |  3 +++
>  fs/btrfs/zoned.c       | 29 +++++++++++++++++++++++++++++
>  fs/btrfs/zoned.h       |  6 ++++++
>  3 files changed, 38 insertions(+)
> 
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 3113f6d7f335..c0f38f486deb 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1522,6 +1522,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
>  	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
>  		return;
>  
> +	if (!btrfs_zoned_should_reclaim(fs_info))
> +		return;
> +
>  	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
>  		return;
>  
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index b7b5fac1c779..47204f38f02e 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -15,6 +15,7 @@
>  #include "transaction.h"
>  #include "dev-replace.h"
>  #include "space-info.h"
> +#include "misc.h"
>  
>  /* Maximum number of zones to report per blkdev_report_zones() call */
>  #define BTRFS_REPORT_NR_ZONES   4096
> @@ -2082,3 +2083,31 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
>  	}
>  	mutex_unlock(&fs_devices->device_list_mutex);
>  }
> +
> +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
> +{
> +	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
> +	struct btrfs_device *device;
> +	u64 nr_free = 0;
> +	u64 nr_zones = 0;
> +	u64 factor;
> +
> +	if (!btrfs_is_zoned(fs_info))
> +		return false;
> +
> +	if (!fs_info->bg_reclaim_threshold)
> +		return false;
> +
> +	mutex_lock(&fs_devices->device_list_mutex);
> +	list_for_each_entry(device, &fs_devices->devices, dev_list) {
> +		struct btrfs_zoned_device_info *zone_info = device->zone_info;
> +

We should check "if (!device->bdev)" as we can have a missing device.

> +		nr_zones += zone_info->nr_zones;
> +		nr_free += bitmap_weight(zone_info->empty_zones,
> +					 zone_info->nr_zones);

Here, we can use device->bytes_used / device->disk_total_bytes instead
to see how many bytes are allocated as device extents. This metric is
also usable for regular btrfs.

> +	}
> +	mutex_unlock(&fs_devices->device_list_mutex);
> +
> +	factor = div_factor_fine(nr_free, nr_zones);
> +	return factor >= fs_info->bg_reclaim_threshold;
> +}
> diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
> index cbf016a7bb5d..d0d0e5c02606 100644
> --- a/fs/btrfs/zoned.h
> +++ b/fs/btrfs/zoned.h
> @@ -78,6 +78,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
>  			     u64 length);
>  void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
>  void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
> +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
>  #else /* CONFIG_BLK_DEV_ZONED */
>  static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
>  				     struct blk_zone *zone)
> @@ -236,6 +237,11 @@ static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
>  static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
>  
>  static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
> +
> +static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
> +{
> +	return false;
> +}
>  #endif
>  
>  static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
> -- 
> 2.34.1
Johannes Thumshirn Feb. 14, 2022, 11:36 a.m. UTC | #2
On 14/02/2022 12:34, Naohiro Aota wrote:
> On Fri, Feb 11, 2022 at 05:54:02AM -0800, Johannes Thumshirn wrote:
>> The current auto-reclaim algorithm starts reclaiming all block-group's
>> with a zone_unusable value above a configured threshold. This is causing a
>> lot of reclaim IO even if there would be enough free zones on the device.
>>
>> Instead of only accounting a block-group's zone_unusable value, also take
>> the number of empty zones into account.
>>
>> Cc: Josef Bacik <josef@toxicpanda.com>
>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>
>> ---
>>
>> RFC because I'm a bit unsure about the user interface. Should we use the
>> same value / sysfs file for both the number of non-empty zones and the
>> number of zone_unusable bytes per block_group or add another knob to fine
>> tune?
>>
>>  fs/btrfs/block-group.c |  3 +++
>>  fs/btrfs/zoned.c       | 29 +++++++++++++++++++++++++++++
>>  fs/btrfs/zoned.h       |  6 ++++++
>>  3 files changed, 38 insertions(+)
>>
>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>> index 3113f6d7f335..c0f38f486deb 100644
>> --- a/fs/btrfs/block-group.c
>> +++ b/fs/btrfs/block-group.c
>> @@ -1522,6 +1522,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
>>  	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
>>  		return;
>>  
>> +	if (!btrfs_zoned_should_reclaim(fs_info))
>> +		return;
>> +
>>  	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
>>  		return;
>>  
>> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
>> index b7b5fac1c779..47204f38f02e 100644
>> --- a/fs/btrfs/zoned.c
>> +++ b/fs/btrfs/zoned.c
>> @@ -15,6 +15,7 @@
>>  #include "transaction.h"
>>  #include "dev-replace.h"
>>  #include "space-info.h"
>> +#include "misc.h"
>>  
>>  /* Maximum number of zones to report per blkdev_report_zones() call */
>>  #define BTRFS_REPORT_NR_ZONES   4096
>> @@ -2082,3 +2083,31 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
>>  	}
>>  	mutex_unlock(&fs_devices->device_list_mutex);
>>  }
>> +
>> +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
>> +{
>> +	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
>> +	struct btrfs_device *device;
>> +	u64 nr_free = 0;
>> +	u64 nr_zones = 0;
>> +	u64 factor;
>> +
>> +	if (!btrfs_is_zoned(fs_info))
>> +		return false;
>> +
>> +	if (!fs_info->bg_reclaim_threshold)
>> +		return false;
>> +
>> +	mutex_lock(&fs_devices->device_list_mutex);
>> +	list_for_each_entry(device, &fs_devices->devices, dev_list) {
>> +		struct btrfs_zoned_device_info *zone_info = device->zone_info;
>> +
> 
> We should check "if (!device->bdev)" as we can have a missing device.
> 
>> +		nr_zones += zone_info->nr_zones;
>> +		nr_free += bitmap_weight(zone_info->empty_zones,
>> +					 zone_info->nr_zones);
> 
> Here, we can use device->bytes_used / device->disk_total_bytes instead
> to see how much bytes are allocated as device extents. This metric is
> also usable for regular btrfs.
> 
>> +	}
>> +	mutex_unlock(&fs_devices->device_list_mutex);
>> +
>> +	factor = div_factor_fine(nr_free, nr_zones);
>> +	return factor >= fs_info->bg_reclaim_threshold;

... and we should check that 'factor' is less than or equal to
fs_info->bg_reclaim_threshold, not greater than or equal to it *sigh*

>
Josef Bacik Feb. 14, 2022, 3:04 p.m. UTC | #3
On Fri, Feb 11, 2022 at 05:54:02AM -0800, Johannes Thumshirn wrote:
> The current auto-reclaim algorithm starts reclaiming all block-group's
> with a zone_unusable value above a configured threshold. This is causing a
> lot of reclaim IO even if there would be enough free zones on the device.
> 
> Instead of only accounting a block-group's zone_unusable value, also take
> the number of empty zones into account.
> 
> Cc: Josef Bacik <josef@toxicpanda.com>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> 
> ---
> 
> RFC because I'm a bit unsure about the user interface. Should we use the
> same value / sysfs file for both the number of non-empty zones and the
> number of zone_unusable bytes per block_group or add another knob to fine
> tune?
> 

I want per-space_info thresholds, because we want to never relocate
metadata block groups, but still set a threshold for data.

But I think for this we could have a separate threshold of "don't start
auto-relocate until we are below X threshold for the whole file system" and this
could be the fs wide setting.  Does that make sense?  Thanks,

Josef
diff mbox series

Patch

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 3113f6d7f335..c0f38f486deb 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1522,6 +1522,9 @@  void btrfs_reclaim_bgs_work(struct work_struct *work)
 	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
 		return;
 
+	if (!btrfs_zoned_should_reclaim(fs_info))
+		return;
+
 	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
 		return;
 
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index b7b5fac1c779..47204f38f02e 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -15,6 +15,7 @@ 
 #include "transaction.h"
 #include "dev-replace.h"
 #include "space-info.h"
+#include "misc.h"
 
 /* Maximum number of zones to report per blkdev_report_zones() call */
 #define BTRFS_REPORT_NR_ZONES   4096
@@ -2082,3 +2083,31 @@  void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
 	}
 	mutex_unlock(&fs_devices->device_list_mutex);
 }
+
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *device;
+	u64 nr_free = 0;
+	u64 nr_zones = 0;
+	u64 factor;
+
+	if (!btrfs_is_zoned(fs_info))
+		return false;
+
+	if (!fs_info->bg_reclaim_threshold)
+		return false;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+		nr_zones += zone_info->nr_zones;
+		nr_free += bitmap_weight(zone_info->empty_zones,
+					 zone_info->nr_zones);
+	}
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+	factor = div_factor_fine(nr_free, nr_zones);
+	return factor >= fs_info->bg_reclaim_threshold;
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index cbf016a7bb5d..d0d0e5c02606 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -78,6 +78,7 @@  void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
 			     u64 length);
 void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
 void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
 				     struct blk_zone *zone)
@@ -236,6 +237,11 @@  static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
 static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
 
 static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
+
+static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+	return false;
+}
 #endif
 
 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)