[v2,5/6] btrfs: prevent pathological periodic reclaim loops

Message ID 34fe3a28628bcd97e2b7c9659da73f43744f4bdf.1718665689.git.boris@bur.io (mailing list archive)
State New
Series btrfs: dynamic and periodic block_group reclaim

Commit Message

Boris Burkov June 17, 2024, 11:11 p.m. UTC
Periodic reclaim runs the risk of getting stuck in a state where it
keeps reclaiming the same block group over and over. This can happen if:
1. reclaiming that block_group fails
2. reclaiming that block_group fails to move any extents into existing
   block_groups and just allocates a fresh chunk and moves everything.

Currently, 1. is a very tight loop inside the reclaim worker. That is
critical for edge-triggered reclaim, or else we risk forgetting about a
reclaimable group. On the other hand, with level-triggered reclaim we
can break out of that loop and get to it later.

With that fixed, 2. applies to both failures and "successes" with no
progress. If we have done a periodic reclaim pass on a space_info and
nothing has changed in that space_info, there is not much point in
trying again, so don't, until enough space is freed, which we capture
with a heuristic of needing to net free one chunk.

Signed-off-by: Boris Burkov <boris@bur.io>
---
 fs/btrfs/block-group.c | 12 ++++++---
 fs/btrfs/space-info.c  | 56 ++++++++++++++++++++++++++++++++++++------
 fs/btrfs/space-info.h  | 14 +++++++++++
 3 files changed, 71 insertions(+), 11 deletions(-)
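
For illustration, here is a minimal userspace model of the gating
heuristic described above (an editorial sketch, not the kernel code:
locking and the READ_ONCE checks are omitted, and a 1G effective data
chunk plus an initially armed state are assumed; the real logic is in
the space-info.c hunks of the patch below):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Assume a 1G effective data chunk; the kernel computes this per fs. */
#define CHUNK_SZ (1024LL * 1024 * 1024)

struct space_info_model {
	bool periodic_reclaim_ready;	/* may the next sweep do work? */
	int64_t reclaimable_bytes;	/* net bytes freed since last sweep */
};

/* Mirrors btrfs_set_periodic_reclaim_ready(): zero the counter on disarm. */
static void set_ready(struct space_info_model *si, bool ready)
{
	if (ready != si->periodic_reclaim_ready) {
		si->periodic_reclaim_ready = ready;
		if (!ready)
			si->reclaimable_bytes = 0;
	}
}

/* Mirrors btrfs_space_info_update_reclaimable(): re-arm after a net chunk. */
static void update_reclaimable(struct space_info_model *si, int64_t bytes)
{
	si->reclaimable_bytes += bytes;
	if (si->reclaimable_bytes >= CHUNK_SZ)
		set_ready(si, true);
}

/* Mirrors btrfs_should_periodic_reclaim(): readiness is consumed on check. */
static bool should_periodic_reclaim(struct space_info_model *si)
{
	bool ret = si->periodic_reclaim_ready;

	set_ready(si, false);
	return ret;
}

int main(void)
{
	struct space_info_model si = { .periodic_reclaim_ready = true };

	printf("sweep 1: %d\n", should_periodic_reclaim(&si)); /* 1: runs */
	printf("sweep 2: %d\n", should_periodic_reclaim(&si)); /* 0: no net free yet */
	update_reclaimable(&si, CHUNK_SZ);  /* a full chunk's worth freed */
	printf("sweep 3: %d\n", should_periodic_reclaim(&si)); /* 1: runs again */
	return 0;
}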

Comments

Josef Bacik June 24, 2024, 3:23 p.m. UTC | #1
On Mon, Jun 17, 2024 at 04:11:17PM -0700, Boris Burkov wrote:
> [...]
> +void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready)
> +{
> +	assert_spin_locked(&space_info->lock);

This is essentially

BUG_ON(!spin_is_locked(lock));

Instead, use

lockdep_assert_held()

which will just yell at us so we can fix it.  Thanks,

Josef
David Sterba June 24, 2024, 4:05 p.m. UTC | #2
On Mon, Jun 24, 2024 at 11:23:00AM -0400, Josef Bacik wrote:
> On Mon, Jun 17, 2024 at 04:11:17PM -0700, Boris Burkov wrote:
> > [...]
> > +void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready)
> > +{
> > +	assert_spin_locked(&space_info->lock);
> 
> This is essentially
> 
> BUG_ON(!spin_is_locked(lock));
> 
> Instead, use
> 
> lockdep_assert_held()
> 
> which will just yell at us so we can fix it.  Thanks,

Also documented

https://btrfs.readthedocs.io/en/latest/dev/Development-notes.html#locking
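
For reference, the change Josef is suggesting would look roughly like
this (an editorial sketch, not the committed follow-up; the function
body is taken verbatim from the patch below, only the assertion
differs):

void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready)
{
	/* Warns under lockdep instead of crashing if the lock is not held. */
	lockdep_assert_held(&space_info->lock);

	if (!READ_ONCE(space_info->periodic_reclaim))
		return;
	if (ready != space_info->periodic_reclaim_ready) {
		space_info->periodic_reclaim_ready = ready;
		if (!ready)
			space_info->reclaimable_bytes = 0;
	}
}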

Patch

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 6bcf24f2ac79..ba9afb94e7ce 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1933,6 +1933,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 			reclaimed = 0;
 			spin_lock(&space_info->lock);
 			space_info->reclaim_errors++;
+			if (READ_ONCE(space_info->periodic_reclaim))
+				space_info->periodic_reclaim_ready = false;
 			spin_unlock(&space_info->lock);
 		}
 		spin_lock(&space_info->lock);
@@ -1941,7 +1943,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 		spin_unlock(&space_info->lock);
 
 next:
-		if (ret) {
+		if (ret && !READ_ONCE(space_info->periodic_reclaim)) {
 			/* Refcount held by the reclaim_bgs list after splice. */
 			btrfs_get_block_group(bg);
 			list_add_tail(&bg->bg_list, &retry_list);
@@ -3677,6 +3679,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 		space_info->bytes_reserved -= num_bytes;
 		space_info->bytes_used += num_bytes;
 		space_info->disk_used += num_bytes * factor;
+		if (READ_ONCE(space_info->periodic_reclaim))
+			btrfs_space_info_update_reclaimable(space_info, -num_bytes);
 		spin_unlock(&cache->lock);
 		spin_unlock(&space_info->lock);
 	} else {
@@ -3686,8 +3690,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 		btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
 		space_info->bytes_used -= num_bytes;
 		space_info->disk_used -= num_bytes * factor;
-
-		reclaim = should_reclaim_block_group(cache, num_bytes);
+		if (READ_ONCE(space_info->periodic_reclaim))
+			btrfs_space_info_update_reclaimable(space_info, num_bytes);
+		else
+			reclaim = should_reclaim_block_group(cache, num_bytes);
 
 		spin_unlock(&cache->lock);
 		spin_unlock(&space_info->lock);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index ff92ad26ffa2..e7a2aa751f8f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include "linux/spinlock.h"
 #include <linux/minmax.h>
 #include "misc.h"
 #include "ctree.h"
@@ -1899,7 +1900,9 @@ static u64 calc_pct_ratio(u64 x, u64 y)
  */
 static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info)
 {
-	return BTRFS_UNALLOC_BLOCK_GROUP_TARGET * calc_effective_data_chunk_size(fs_info);
+	u64 chunk_sz = calc_effective_data_chunk_size(fs_info);
+
+	return BTRFS_UNALLOC_BLOCK_GROUP_TARGET * chunk_sz;
 }
 
 /*
@@ -1935,14 +1938,13 @@ static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info)
 	u64 unused = alloc - used;
 	u64 want = target > unalloc ? target - unalloc : 0;
 	u64 data_chunk_size = calc_effective_data_chunk_size(fs_info);
-	/* Cast to int is OK because want <= target */
-	int ratio = calc_pct_ratio(want, target);
 
-	/* If we have no unused space, don't bother, it won't work anyway */
+	/* If we have no unused space, don't bother, it won't work anyway. */
 	if (unused < data_chunk_size)
 		return 0;
 
-	return ratio;
+	/* Cast to int is OK because want <= target. */
+	return calc_pct_ratio(want, target);
 }
 
 int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info)
@@ -1984,6 +1986,46 @@ static int do_reclaim_sweep(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes)
+{
+	u64 chunk_sz = calc_effective_data_chunk_size(space_info->fs_info);
+
+	assert_spin_locked(&space_info->lock);
+	space_info->reclaimable_bytes += bytes;
+
+	if (space_info->reclaimable_bytes >= chunk_sz)
+		btrfs_set_periodic_reclaim_ready(space_info, true);
+}
+
+void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready)
+{
+	assert_spin_locked(&space_info->lock);
+	if (!READ_ONCE(space_info->periodic_reclaim))
+		return;
+	if (ready != space_info->periodic_reclaim_ready) {
+		space_info->periodic_reclaim_ready = ready;
+		if (!ready)
+			space_info->reclaimable_bytes = 0;
+	}
+}
+
+bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
+{
+	bool ret;
+
+	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		return false;
+	if (!READ_ONCE(space_info->periodic_reclaim))
+		return false;
+
+	spin_lock(&space_info->lock);
+	ret = space_info->periodic_reclaim_ready;
+	btrfs_set_periodic_reclaim_ready(space_info, false);
+	spin_unlock(&space_info->lock);
+
+	return ret;
+}
+
 int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info)
 {
 	int ret;
@@ -1991,9 +2033,7 @@ int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info)
 	struct btrfs_space_info *space_info;
 
 	list_for_each_entry(space_info, &fs_info->space_info, list) {
-		if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
-			continue;
-		if (!READ_ONCE(space_info->periodic_reclaim))
+		if (!btrfs_should_periodic_reclaim(space_info))
 			continue;
 		for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) {
 			ret = do_reclaim_sweep(fs_info, space_info, raid);
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index ae4a1f7d5856..4db8a0267c16 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -196,6 +196,17 @@ struct btrfs_space_info {
 	 * threshold in the cleaner thread.
 	 */
 	bool periodic_reclaim;
+
+	/*
+	 * Periodic reclaim should be a no-op if a space_info hasn't
+	 * freed any space since the last time we tried.
+	 */
+	bool periodic_reclaim_ready;
+
+	/*
+	 * Net bytes freed or allocated since the last reclaim pass.
+	 */
+	s64 reclaimable_bytes;
 };
 
 struct reserve_ticket {
@@ -278,6 +289,9 @@ void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
 void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
 
+void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes);
+void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready);
+bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info);
 int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info);
 int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info);