
[5/5] btrfs: zoned: make auto-reclaim less aggressive

Message ID 89824f8b4ba1ac8f05f74c047333c970049e2815.1647878642.git.johannes.thumshirn@wdc.com (mailing list archive)
State New, archived
Series btrfs: rework background block group relocation

Commit Message

Johannes Thumshirn March 21, 2022, 4:14 p.m. UTC
The current auto-reclaim algorithm starts reclaiming all block groups
with a zone_unusable value above a configured threshold. This causes a
lot of reclaim IO even if there would be enough free zones on the device.

Instead of only accounting a block-group's zone_unusable value, also take
the number of empty zones into account.
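
Concretely, the fs-wide gate boils down to a percentage check; a minimal
sketch of the intended logic, using the names from the patch below (with
the default bg_reclaim_threshold of 75, reclaim only starts once the
device is roughly three-quarters used):

	/* Sketch only: skip background reclaim while the file system
	 * as a whole is still mostly empty. */
	u64 factor = div_u64(used * 100, total);

	if (factor < fs_info->bg_reclaim_threshold)
		return;	/* enough empty zones left, don't reclaim yet */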

Cc: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>

---
Changes since v3:
* Remove unused include (Filipe)
* Only call btrfs_zoned_should_reclaim() for zoned FSes (Josef)
* Use btrfs_calc_available_free_space() (Josef)

Changes since v2:
* Use div_u64()

Changes since RFC:
* Fix logic error
* Skip unavailable devices
* Use a different metric that also works for non-zoned devices
---
 fs/btrfs/block-group.c | 10 ++++++++++
 fs/btrfs/zoned.c       | 23 +++++++++++++++++++++++
 fs/btrfs/zoned.h       |  6 ++++++
 3 files changed, 39 insertions(+)

Comments

Pankaj Raghav March 23, 2022, 9:08 a.m. UTC | #1
Hi Johannes,

I tried this patchset and I am noticing a weird behaviour wrt the value
of factor in the btrfs_zoned_should_reclaim() function.

Here is my setup in QEMU:
size 12800M
zoned=true,zoned.zone_capacity=128M,zoned.zone_size=128M

btrfs mkfs:
mkfs.btrfs -d single -m single <znsdev>;  mount -t auto <znsdev>
/mnt/real_scratch

I added a print statement in btrfs_zoned_should_reclaim to get the
values for the factor, used and total params.

When I run the btrfs/237 xfstest, I am noticing the following values:
[   54.829309] btrfs: factor: 4850 used: 19528679424, total: 402653184

The used > total, thereby making the factor greater than 100. This will
force a reclaim even though the drive is almost empty:

  start: 0x000000000, len 0x040000, cap 0x040000, wptr 0x000078 reset:0
non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
  start: 0x000040000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
  start: 0x000080000, len 0x040000, cap 0x040000, wptr 0x0001e0 reset:0
non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
  start: 0x0000c0000, len 0x040000, cap 0x040000, wptr 0x000d80 reset:0
non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
  start: 0x000100000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
  start: 0x000140000, len 0x040000, cap 0x040000, wptr 0x008520 reset:0
non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
  start: 0x000180000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
  start: 0x0001c0000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
  start: 0x000200000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
  start: 0x000240000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
  start: 0x000280000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
  start: 0x0002c0000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
.....
.....

I am also noticing the same behaviour for ZNS drive with size 1280M:

[   86.276409] btrfs: factor: 350 used: 1409286144, total: 402653184

Is something going wrong with the calculation? Or am I missing something
here?
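
(For reference, the printed factor is consistent with its inputs:

	19528679424 * 100 / 402653184 = 4850

so the division itself is fine. What looks off are the operands:
`total` (402653184, i.e. 384M) is far below the 12800M device size,
while `used` exceeds it.)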

On 2022-03-21 17:14, Johannes Thumshirn wrote:

> index 1b1b310c3c51..f2a412427921 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -2079,3 +2079,26 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
>  	}
>  	mutex_unlock(&fs_devices->device_list_mutex);
>  }
> +
> +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
> +{
> +	struct btrfs_space_info *sinfo;
> +	u64 used = 0;
> +	u64 total = 0;
> +	u64 factor;
> +
> +	if (!btrfs_is_zoned(fs_info))
> +		return false;
> +
> +	if (!fs_info->bg_reclaim_threshold)
> +		return false;
> +
> +	list_for_each_entry(sinfo, &fs_info->space_info, list) {
> +		total += sinfo->total_bytes;
> +		used += btrfs_calc_available_free_space(fs_info, sinfo,
> +							BTRFS_RESERVE_NO_FLUSH);
> +	}
> +
> +	factor = div_u64(used * 100, total);
+       pr_info("btrfs: factor: %llu used: %llu, total: %llu", factor,
+               used, total);
> +	return factor >= fs_info->bg_reclaim_threshold;
> +}

>  static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
Johannes Thumshirn March 23, 2022, 9:11 a.m. UTC | #2
On 23/03/2022 10:09, Pankaj Raghav wrote:
> Hi Johannes,
> 
> I tried this patchset and I am noticing a weird behaviour wrt the value
> of factor in the btrfs_zoned_should_reclaim() function.
> 
> Here is my setup in QEMU:
> size 12800M
> zoned=true,zoned.zone_capacity=128M,zoned.zone_size=128M
> 
> btrfs mkfs:
> mkfs.btrfs -d single -m single <znsdev>;  mount -t auto <znsdev>
> /mnt/real_scratch
> 
> I added a print statement in btrfs_zoned_should_reclaim to get the
> values for the factor, used and total params.
> 
> When I run the btrfs/237 xfstest, I am noticing the following values:
> [   54.829309] btrfs: factor: 4850 used: 19528679424, total: 402653184
> 
> The used > total, thereby making the factor greater than 100. This will
> force a reclaim even though the drive is almost empty:
> 
>   start: 0x000000000, len 0x040000, cap 0x040000, wptr 0x000078 reset:0
> non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
>   start: 0x000040000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
>   start: 0x000080000, len 0x040000, cap 0x040000, wptr 0x0001e0 reset:0
> non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
>   start: 0x0000c0000, len 0x040000, cap 0x040000, wptr 0x000d80 reset:0
> non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
>   start: 0x000100000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
>   start: 0x000140000, len 0x040000, cap 0x040000, wptr 0x008520 reset:0
> non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
>   start: 0x000180000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
>   start: 0x0001c0000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
>   start: 0x000200000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
>   start: 0x000240000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
>   start: 0x000280000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
>   start: 0x0002c0000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
> .....
> .....
> 
> I am also noticing the same behaviour for ZNS drive with size 1280M:
> 
> [   86.276409] btrfs: factor: 350 used: 1409286144, total: 402653184
> 
> Is something going wrong with the calculation? Or am I missing something
> here?
> 

Apparently I'm either too dumb for basic maths, or 
btrfs_calc_available_free_space() doesn't give us the values we're expecting.

I'll recheck.
Pankaj Raghav March 23, 2022, 9:14 a.m. UTC | #3
On 2022-03-23 10:11, Johannes Thumshirn wrote:
> On 23/03/2022 10:09, Pankaj Raghav wrote:
>> I am also noticing the same behaviour for ZNS drive with size 1280M:
>>
>> [   86.276409] btrfs: factor: 350 used: 1409286144, total: 402653184
>>
>> Is something going wrong with the calculation? Or am I missing something
>> here?
>>
> 
> Apparently I'm either too dumb for basic maths, or 
> btrfs_calc_available_free_space() doesn't give us the values we're expecting.
> 
I would say the latter :)
> I'll recheck.
Let me know if you can also reproduce the results.
Johannes Thumshirn March 23, 2022, 10:39 a.m. UTC | #4
On 23/03/2022 10:14, Pankaj Raghav wrote:
> 
> 
> On 2022-03-23 10:11, Johannes Thumshirn wrote:
>> On 23/03/2022 10:09, Pankaj Raghav wrote:
>>> I am also noticing the same behaviour for ZNS drive with size 1280M:
>>>
>>> [   86.276409] btrfs: factor: 350 used: 1409286144, total: 402653184
>>>
>>> Is something going wrong with the calculation? Or am I missing something
>>> here?
>>>
>>
>> Apparently I'm either too dumb for basic maths, or 
>> btrfs_calc_available_free_space() doesn't give us the values we're expecting.
>>
> I would say the latter :)
>> I'll recheck.
> Let me know if you can also reproduce the results.
> 

It looks like we can't use btrfs_calc_available_free_space(), can
you try this one on top:

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index f2a412427921..4a6c1f1a7223 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2082,23 +2082,27 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
 
 bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
 {
-       struct btrfs_space_info *sinfo;
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+       struct btrfs_device *device;
        u64 used = 0;
        u64 total = 0;
        u64 factor;
 
-       if (!btrfs_is_zoned(fs_info))
-               return false;
-
        if (!fs_info->bg_reclaim_threshold)
                return false;
 
-       list_for_each_entry(sinfo, &fs_info->space_info, list) {
-               total += sinfo->total_bytes;
-               used += btrfs_calc_available_free_space(fs_info, sinfo,
-                                                       BTRFS_RESERVE_NO_FLUSH);
+
+       mutex_lock(&fs_devices->device_list_mutex);
+       list_for_each_entry(device, &fs_devices->devices, dev_list) {
+               if (!device->bdev)
+                       continue;
+
+               total += device->disk_total_bytes;
+               used += device->bytes_used;
+
        }
+       mutex_unlock(&fs_devices->device_list_mutex);
 
-       factor = div_u64(used * 100, total);
+       factor = div64_u64(used * 100, total);
        return factor >= fs_info->bg_reclaim_threshold;
 }
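
One detail worth flagging in this version: the switch from div_u64() to
div64_u64() is not cosmetic. Per include/linux/math64.h the two helpers
differ in divisor width, and `total` (a sum of disk_total_bytes) easily
exceeds 32 bits:

	/* include/linux/math64.h */
	u64 div_u64(u64 dividend, u32 divisor);   /* divisor truncated to 32 bits */
	u64 div64_u64(u64 dividend, u64 divisor); /* full 64-bit divisor */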
Pankaj Raghav March 23, 2022, 11:24 a.m. UTC | #5
On 2022-03-23 11:39, Johannes Thumshirn wrote:
> 
> It looks like we can't use btrfs_calc_available_free_space(), can
> you try this one on top:
> 
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index f2a412427921..4a6c1f1a7223 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -2082,23 +2082,27 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
>  
>  bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
>  {
> -       struct btrfs_space_info *sinfo;
> +       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
> +       struct btrfs_device *device;
>         u64 used = 0;
>         u64 total = 0;
>         u64 factor;
>  
> -       if (!btrfs_is_zoned(fs_info))
> -               return false;
> -
>         if (!fs_info->bg_reclaim_threshold)
>                 return false;
>  
> -       list_for_each_entry(sinfo, &fs_info->space_info, list) {
> -               total += sinfo->total_bytes;
> -               used += btrfs_calc_available_free_space(fs_info, sinfo,
> -                                                       BTRFS_RESERVE_NO_FLUSH);
> +
> +       mutex_lock(&fs_devices->device_list_mutex);
> +       list_for_each_entry(device, &fs_devices->devices, dev_list) {
> +               if (!device->bdev)
> +                       continue;
> +
> +               total += device->disk_total_bytes;
> +               used += device->bytes_used;
> +
>         }
> +       mutex_unlock(&fs_devices->device_list_mutex);
>  
> -       factor = div_u64(used * 100, total);
> +       factor = div64_u64(used * 100, total);
>         return factor >= fs_info->bg_reclaim_threshold;
>  }
> 
size 1280M:
[   47.511871] btrfs: factor: 30 used: 402653184, total: 1342177280
[   48.542981] btrfs: factor: 30 used: 402653184, total: 1342177280
[   49.576005] btrfs: factor: 30 used: 402653184, total: 1342177280
size: 12800M:
[   33.971009] btrfs: factor: 3 used: 402653184, total: 13421772800
[   34.978602] btrfs: factor: 3 used: 402653184, total: 13421772800
[   35.991784] btrfs: factor: 3 used: 402653184, total: 13421772800
size: 12800M, zcap=96M zsze=128M:
[   64.639103] btrfs: factor: 3 used: 402653184, total: 13421772800
[   65.643778] btrfs: factor: 3 used: 402653184, total: 13421772800
[   66.661920] btrfs: factor: 3 used: 402653184, total: 13421772800

This looks good. And the test btrfs/237 is failing, as it should be
because of the change in reclaim condition. Are you planning to update
this test in fstests later?

I still have one more question: shouldn't we use the usable disk bytes
(zcap * nr_zones) instead of disk_total_bytes (zsze * nr_zones) to
calculate the `total` variable? The `used` value is a part of the usable
disk space so I feel it makes more sense to calculate the `factor` with
the usable disk bytes instead of the disk_total_bytes.
Johannes Thumshirn March 23, 2022, 11:52 a.m. UTC | #6
On 23/03/2022 12:24, Pankaj Raghav wrote:
> 
> 
> On 2022-03-23 11:39, Johannes Thumshirn wrote:
>>
>> It looks like we can't use btrfs_calc_available_free_space(), can
>> you try this one on top:
>>
>> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
>> index f2a412427921..4a6c1f1a7223 100644
>> --- a/fs/btrfs/zoned.c
>> +++ b/fs/btrfs/zoned.c
>> @@ -2082,23 +2082,27 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
>>  
>>  bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
>>  {
>> -       struct btrfs_space_info *sinfo;
>> +       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
>> +       struct btrfs_device *device;
>>         u64 used = 0;
>>         u64 total = 0;
>>         u64 factor;
>>  
>> -       if (!btrfs_is_zoned(fs_info))
>> -               return false;
>> -
>>         if (!fs_info->bg_reclaim_threshold)
>>                 return false;
>>  
>> -       list_for_each_entry(sinfo, &fs_info->space_info, list) {
>> -               total += sinfo->total_bytes;
>> -               used += btrfs_calc_available_free_space(fs_info, sinfo,
>> -                                                       BTRFS_RESERVE_NO_FLUSH);
>> +
>> +       mutex_lock(&fs_devices->device_list_mutex);
>> +       list_for_each_entry(device, &fs_devices->devices, dev_list) {
>> +               if (!device->bdev)
>> +                       continue;
>> +
>> +               total += device->disk_total_bytes;
>> +               used += device->bytes_used;
>> +
>>         }
>> +       mutex_unlock(&fs_devices->device_list_mutex);
>>  
>> -       factor = div_u64(used * 100, total);
>> +       factor = div64_u64(used * 100, total);
>>         return factor >= fs_info->bg_reclaim_threshold;
>>  }
>>
> size 1280M:
> [   47.511871] btrfs: factor: 30 used: 402653184, total: 1342177280
> [   48.542981] btrfs: factor: 30 used: 402653184, total: 1342177280
> [   49.576005] btrfs: factor: 30 used: 402653184, total: 1342177280
> size: 12800M:
> [   33.971009] btrfs: factor: 3 used: 402653184, total: 13421772800
> [   34.978602] btrfs: factor: 3 used: 402653184, total: 13421772800
> [   35.991784] btrfs: factor: 3 used: 402653184, total: 13421772800
> size: 12800M, zcap=96M zsze=128M:
> [   64.639103] btrfs: factor: 3 used: 402653184, total: 13421772800
> [   65.643778] btrfs: factor: 3 used: 402653184, total: 13421772800
> [   66.661920] btrfs: factor: 3 used: 402653184, total: 13421772800
> 
> This looks good. And the test btrfs/237 is failing, as it should be
> because of the change in reclaim condition. Are you planning to update
> this test in fstests later?

Yes, once I have an idea how to do it. Probably just fill the FS until
~75% of the drive is filled and then use the original logic.

> I still have one more question: shouldn't we use the usable disk bytes
> (zcap * nr_zones) instead of disk_total_bytes (zsze * nr_zones) to
> calculate the `total` variable? The `used` value is a part of the usable
> disk space so I feel it makes more sense to calculate the `factor` with
> the usable disk bytes instead of the disk_total_bytes.
> 

disk_total_bytes comes from the value the underlying device driver set
for the gendisk's capacity via set_capacity().
Pankaj Raghav March 23, 2022, 7:37 p.m. UTC | #7
On 2022-03-23 12:52, Johannes Thumshirn wrote:

>> This looks good. And the test btrfs/237 is failing, as it should be
>> because of the change in reclaim condition. Are you planning to update
>> this test in fstests later?
> 
> Yes, once I have an idea how to do it. Probably just fill the FS until
> ~75% of the drive is filled and then use the original logic.
> 
Perfect.
>> I still have one more question: shouldn't we use the usable disk bytes
>> (zcap * nr_zones) instead of disk_total_bytes (zsze * nr_zones) to
>> calculate the `total` variable? The `used` value is a part of the usable
>> disk space so I feel it makes more sense to calculate the `factor` with
>> the usable disk bytes instead of the disk_total_bytes.
>>
> 
> disk_total_bytes comes from the value the underlying device driver set
> for the gendisk's capacity via set_capacity().
Yes, I understand that part. My comment was more about how pedantic we
need to be for reclaim, as the capacity set by the device driver will
include the unusable holes (LBAs between zcap and zsze).

For example, a ZNS drive with a 96M zone capacity, a 128M zone size and
10 zones will have a total disk capacity of 1280M, but the usable
capacity is only 960M.

Let us say the `used` value is 128M, then the `factor` value with the
current approach is 128 / 1280 = 10%.

But if we use the usable capacity of a zns drive, then the `factor`
value will be 128 / 960 = 13.3%.

This might not be a problem for lower values of `used`, but my concern
is when the drive is nearing its capacity.

Let's take the same example where the `used` value is now 900M. Then
the factor with the current approach is 70% (900 / 1280), still below
the default 75 for bg_reclaim_threshold, but when computed against the
usable capacity it is 93% (900 / 960).

So essentially we are postponing the reclaim assuming we have enough
room left, but in reality there is not.

I still don't have a complete understanding of the full stack with btrfs
on zoned devices so please correct me if I am wrong.
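
A hypothetical sketch of what a zcap-based `total` could look like,
summing per-zone capacity with blkdev_report_zones() (the helper names
are made up and error handling is elided; in practice btrfs could
likely derive this from the zone information it already caches at
mount time):

	/* Hypothetical: sum the usable capacity (zcap) of every zone.
	 * blk_zone fields are in 512-byte sectors. */
	static int sum_zone_capacity_cb(struct blk_zone *zone,
					unsigned int idx, void *data)
	{
		u64 *capacity = data;

		*capacity += zone->capacity << SECTOR_SHIFT;
		return 0;
	}

	static u64 device_usable_bytes(struct btrfs_device *device)
	{
		u64 capacity = 0;

		blkdev_report_zones(device->bdev, 0, BLK_ALL_ZONES,
				    sum_zone_capacity_cb, &capacity);
		return capacity;
	}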
Damien Le Moal March 24, 2022, 12:06 a.m. UTC | #8
On 3/23/22 20:52, Johannes Thumshirn wrote:
> On 23/03/2022 12:24, Pankaj Raghav wrote:
>>
>>
>> On 2022-03-23 11:39, Johannes Thumshirn wrote:
>>>
>>> It looks like we can't use btrfs_calc_available_free_space(), can
>>> you try this one on top:
>>>
>>> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
>>> index f2a412427921..4a6c1f1a7223 100644
>>> --- a/fs/btrfs/zoned.c
>>> +++ b/fs/btrfs/zoned.c
>>> @@ -2082,23 +2082,27 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
>>>  
>>>  bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
>>>  {
>>> -       struct btrfs_space_info *sinfo;
>>> +       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
>>> +       struct btrfs_device *device;
>>>         u64 used = 0;
>>>         u64 total = 0;
>>>         u64 factor;
>>>  
>>> -       if (!btrfs_is_zoned(fs_info))
>>> -               return false;
>>> -
>>>         if (!fs_info->bg_reclaim_threshold)
>>>                 return false;
>>>  
>>> -       list_for_each_entry(sinfo, &fs_info->space_info, list) {
>>> -               total += sinfo->total_bytes;
>>> -               used += btrfs_calc_available_free_space(fs_info, sinfo,
>>> -                                                       BTRFS_RESERVE_NO_FLUSH);
>>> +
>>> +       mutex_lock(&fs_devices->device_list_mutex);
>>> +       list_for_each_entry(device, &fs_devices->devices, dev_list) {
>>> +               if (!device->bdev)
>>> +                       continue;
>>> +
>>> +               total += device->disk_total_bytes;
>>> +               used += device->bytes_used;

Does bytes_used include all the unusable blocks between zone cap and
zone size for all zones? If yes, then the calculation will be OK. If
not, then you will get an artificially low factor not reflecting the
need for defrag.

>>> +
>>>         }
>>> +       mutex_unlock(&fs_devices->device_list_mutex);
>>>  
>>> -       factor = div_u64(used * 100, total);
>>> +       factor = div64_u64(used * 100, total);
>>>         return factor >= fs_info->bg_reclaim_threshold;

Not sure if the factor variable is really necessary here.
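That is, the division could presumably be folded straight into the
return statement:

	return div64_u64(used * 100, total) >= fs_info->bg_reclaim_threshold;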

>>>  }
>>>
>> size 1280M:
>> [   47.511871] btrfs: factor: 30 used: 402653184, total: 1342177280
>> [   48.542981] btrfs: factor: 30 used: 402653184, total: 1342177280
>> [   49.576005] btrfs: factor: 30 used: 402653184, total: 1342177280
>> size: 12800M:
>> [   33.971009] btrfs: factor: 3 used: 402653184, total: 13421772800
>> [   34.978602] btrfs: factor: 3 used: 402653184, total: 13421772800
>> [   35.991784] btrfs: factor: 3 used: 402653184, total: 13421772800
>> size: 12800M, zcap=96M zsze=128M:
>> [   64.639103] btrfs: factor: 3 used: 402653184, total: 13421772800
>> [   65.643778] btrfs: factor: 3 used: 402653184, total: 13421772800
>> [   66.661920] btrfs: factor: 3 used: 402653184, total: 13421772800
>>
>> This looks good. And the test btrfs/237 is failing, as it should be
>> because of the change in reclaim condition. Are you planning to update
>> this test in fstests later?
> 
> Yes, once I have an idea how to do it. Probably just fill the FS until
> ~75% of the drive is filled and then use the original logic.
> 
>> I still have one more question: shouldn't we use the usable disk bytes
>> (zcap * nr_zones) instead of disk_total_bytes (zsze * nr_zones) to
>> calculate the `total` variable? The `used` value is a part of the usable
>> disk space so I feel it makes more sense to calculate the `factor` with
>> the usable disk bytes instead of the disk_total_bytes.
>>
> 
> disk_total_bytes comes from the value the underlying device driver set
> for the gendisk's capacity via set_capacity().

Patch

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 628741ecb97b..12454304bb85 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1512,6 +1512,13 @@  static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
 	return bg1->used > bg2->used;
 }
 
+static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+	if (btrfs_is_zoned(fs_info))
+		return btrfs_zoned_should_reclaim(fs_info);
+	return true;
+}
+
 void btrfs_reclaim_bgs_work(struct work_struct *work)
 {
 	struct btrfs_fs_info *fs_info =
@@ -1522,6 +1529,9 @@  void btrfs_reclaim_bgs_work(struct work_struct *work)
 	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
 		return;
 
+	if (!btrfs_should_reclaim(fs_info))
+		return;
+
 	sb_start_write(fs_info->sb);
 
 	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 1b1b310c3c51..f2a412427921 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2079,3 +2079,26 @@  void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
 	}
 	mutex_unlock(&fs_devices->device_list_mutex);
 }
+
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_space_info *sinfo;
+	u64 used = 0;
+	u64 total = 0;
+	u64 factor;
+
+	if (!btrfs_is_zoned(fs_info))
+		return false;
+
+	if (!fs_info->bg_reclaim_threshold)
+		return false;
+
+	list_for_each_entry(sinfo, &fs_info->space_info, list) {
+		total += sinfo->total_bytes;
+		used += btrfs_calc_available_free_space(fs_info, sinfo,
+							BTRFS_RESERVE_NO_FLUSH);
+	}
+
+	factor = div_u64(used * 100, total);
+	return factor >= fs_info->bg_reclaim_threshold;
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index c489c08d7fd5..f2d16395087f 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -74,6 +74,7 @@  void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
 			     u64 length);
 void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
 void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
 				     struct blk_zone *zone)
@@ -232,6 +233,11 @@  static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
 static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
 
 static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
+
+static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+	return false;
+}
 #endif
 
 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
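
A note on the CONFIG_BLK_DEV_ZONED=n stub above: returning false there
is effectively defensive dead code, since btrfs_should_reclaim() only
calls btrfs_zoned_should_reclaim() when btrfs_is_zoned() is true, which
it never is without zoned support. A minimal illustration of the call
flow from the block-group.c hunk:

	if (btrfs_is_zoned(fs_info))	/* false when CONFIG_BLK_DEV_ZONED=n */
		return btrfs_zoned_should_reclaim(fs_info);
	return true;			/* regular (non-zoned) reclaim path */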