
[2/5] btrfs: fix bg->bg_list list_del refcount races

Message ID 8ba94e9758ff9d5278ed86fcff2acdd429d5deee.1741306938.git.boris@bur.io
State New
Series btrfs: block_group refcounting fixes

Commit Message

Boris Burkov March 7, 2025, 12:29 a.m. UTC
If there is any chance at all of racing with mark_bg_unused, better safe
than sorry.

Otherwise we risk the following interleaving (bg_list refcount in parens)

T1 (some random op)                         T2 (mark_bg_unused)
                                        !list_empty(&bg->bg_list); (1)
list_del_init(&bg->bg_list); (1)
                                        list_move_tail (1)
btrfs_put_block_group (0)
                                        btrfs_delete_unused_bgs
                                             bg = list_first_entry
                                             list_del_init(&bg->bg_list);
                                             btrfs_put_block_group(bg); (-1)

Ultimately, this results in a broken ref count that hits zero one deref
early and the real final deref underflows the refcount, resulting in a WARNING.

Signed-off-by: Boris Burkov <boris@bur.io>
---
 fs/btrfs/extent-tree.c | 3 +++
 fs/btrfs/transaction.c | 5 +++++
 2 files changed, 8 insertions(+)
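
[Editor's note: for context, the T2 side of the interleaving above corresponds roughly to the list_empty() check in btrfs_mark_bg_unused(). The following is a simplified sketch, not the verbatim kernel function (the real one has extra runtime-flag checks and tracepoints); it only illustrates why an unlocked list_del_init() elsewhere breaks its assumption.]

void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->unused_bgs_lock);
	if (list_empty(&bg->bg_list)) {
		/* Not on any list: take the reference owned by unused_bgs. */
		btrfs_get_block_group(bg);
		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
	} else {
		/*
		 * Already on some list, which is assumed to hold a
		 * reference, so only move the entry and do not take a
		 * new reference.  A concurrent list_del_init() +
		 * btrfs_put_block_group() done without the lock
		 * invalidates that assumption and leads to the
		 * underflow described above.
		 */
		list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}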

Comments

Qu Wenruo March 7, 2025, 6:52 a.m. UTC | #1
On 2025/3/7 10:59, Boris Burkov wrote:
> If there is any chance at all of racing with mark_bg_unused, better safe
> than sorry.
> 
> Otherwise we risk the following interleaving (bg_list refcount in parens)
> 
> T1 (some random op)                         T2 (mark_bg_unused)
>                                          !list_empty(&bg->bg_list); (1)
> list_del_init(&bg->bg_list); (1)
>                                          list_move_tail (1)
> btrfs_put_block_group (0)
>                                          btrfs_delete_unused_bgs
>                                               bg = list_first_entry
>                                               list_del_init(&bg->bg_list);
>                                               btrfs_put_block_group(bg); (-1)
> 
> Ultimately, this results in a broken ref count that hits zero one deref
> early and the real final deref underflows the refcount, resulting in a WARNING.
> 
> Signed-off-by: Boris Burkov <boris@bur.io>

Reviewed-by: Qu Wenruo <wqu@suse.com>

Thanks,
Qu
> ---
>   fs/btrfs/extent-tree.c | 3 +++
>   fs/btrfs/transaction.c | 5 +++++
>   2 files changed, 8 insertions(+)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 5de1a1293c93..80560065cc1b 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2868,7 +2868,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
>   						   block_group->length,
>   						   &trimmed);
>   
> +		spin_lock(&fs_info->unused_bgs_lock);
>   		list_del_init(&block_group->bg_list);
> +		spin_unlock(&fs_info->unused_bgs_lock);
> +
>   		btrfs_unfreeze_block_group(block_group);
>   		btrfs_put_block_group(block_group);
>   
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index db8fe291d010..c98a8efd1bea 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -160,7 +160,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
>   			cache = list_first_entry(&transaction->deleted_bgs,
>   						 struct btrfs_block_group,
>   						 bg_list);
> +			spin_lock(&transaction->fs_info->unused_bgs_lock);
>   			list_del_init(&cache->bg_list);
> +			spin_unlock(&transaction->fs_info->unused_bgs_lock);
>   			btrfs_unfreeze_block_group(cache);
>   			btrfs_put_block_group(cache);
>   		}
> @@ -2096,7 +2098,10 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
>   
>          list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
>                  btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
> +	       spin_lock(&fs_info->unused_bgs_lock);
>                  list_del_init(&block_group->bg_list);
> +	       btrfs_put_block_group(block_group);
> +	       spin_unlock(&fs_info->unused_bgs_lock);
>          }
>   }
>
Filipe Manana March 7, 2025, 2:24 p.m. UTC | #2
On Fri, Mar 7, 2025 at 12:31 AM Boris Burkov <boris@bur.io> wrote:
>
> If there is any chance at all of racing with mark_bg_unused, better safe
> than sorry.
>
> Otherwise we risk the following interleaving (bg_list refcount in parens)
>
> T1 (some random op)                         T2 (mark_bg_unused)

mark_bg_unused -> btrfs_mark_bg_unused

Please use complete names everywhere.

>                                         !list_empty(&bg->bg_list); (1)
> list_del_init(&bg->bg_list); (1)
>                                         list_move_tail (1)
> btrfs_put_block_group (0)
>                                         btrfs_delete_unused_bgs
>                                              bg = list_first_entry
>                                              list_del_init(&bg->bg_list);
>                                              btrfs_put_block_group(bg); (-1)
>
> Ultimately, this results in a broken ref count that hits zero one deref
> early and the real final deref underflows the refcount, resulting in a WARNING.
>
> Signed-off-by: Boris Burkov <boris@bur.io>
> ---
>  fs/btrfs/extent-tree.c | 3 +++
>  fs/btrfs/transaction.c | 5 +++++
>  2 files changed, 8 insertions(+)
>
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 5de1a1293c93..80560065cc1b 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2868,7 +2868,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
>                                                    block_group->length,
>                                                    &trimmed);
>
> +               spin_lock(&fs_info->unused_bgs_lock);
>                 list_del_init(&block_group->bg_list);
> +               spin_unlock(&fs_info->unused_bgs_lock);

This shouldn't need the lock, because block groups added to the
transaction->deleted_bgs list were made RO only before at
btrfs_delete_unused_bgs().

> +
>                 btrfs_unfreeze_block_group(block_group);
>                 btrfs_put_block_group(block_group);
>
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index db8fe291d010..c98a8efd1bea 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -160,7 +160,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
>                         cache = list_first_entry(&transaction->deleted_bgs,
>                                                  struct btrfs_block_group,
>                                                  bg_list);
> +                       spin_lock(&transaction->fs_info->unused_bgs_lock);
>                         list_del_init(&cache->bg_list);
> +                       spin_unlock(&transaction->fs_info->unused_bgs_lock);

In the transaction abort path no one else should be messing with the list either.

>                         btrfs_unfreeze_block_group(cache);
>                         btrfs_put_block_group(cache);
>                 }
> @@ -2096,7 +2098,10 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
>
>         list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
>                 btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
> +              spin_lock(&fs_info->unused_bgs_lock);
>                 list_del_init(&block_group->bg_list);
> +              btrfs_put_block_group(block_group);
> +              spin_unlock(&fs_info->unused_bgs_lock);

What's this new btrfs_put_block_group() for? I don't see a matching
ref count increase in the patch.
Or is this fixing a separate bug? Where's the matching ref count
increase in the codebase?

As before, we're in the transaction abort path, so no one else should be
messing with the list either.

I don't mind adding the locking just to be safe, but leaving a comment
to mention it shouldn't be needed and why there's concurrency expected
in these cases would be nice.

Thanks.

>         }
>  }

>
> --
> 2.48.1
>
>
Boris Burkov March 7, 2025, 9:37 p.m. UTC | #3
On Fri, Mar 07, 2025 at 02:24:34PM +0000, Filipe Manana wrote:
> On Fri, Mar 7, 2025 at 12:31 AM Boris Burkov <boris@bur.io> wrote:
> >
> > If there is any chance at all of racing with mark_bg_unused, better safe
> > than sorry.
> >
> > Otherwise we risk the following interleaving (bg_list refcount in parens)
> >
> > T1 (some random op)                         T2 (mark_bg_unused)
> 
> mark_bg_unused -> btrfs_mark_bg_unused
> 
> Please use complete names everywhere.
> 
> >                                         !list_empty(&bg->bg_list); (1)
> > list_del_init(&bg->bg_list); (1)
> >                                         list_move_tail (1)
> > btrfs_put_block_group (0)
> >                                         btrfs_delete_unused_bgs
> >                                              bg = list_first_entry
> >                                              list_del_init(&bg->bg_list);
> >                                              btrfs_put_block_group(bg); (-1)
> >
> > Ultimately, this results in a broken ref count that hits zero one deref
> > early and the real final deref underflows the refcount, resulting in a WARNING.
> >
> > Signed-off-by: Boris Burkov <boris@bur.io>
> > ---
> >  fs/btrfs/extent-tree.c | 3 +++
> >  fs/btrfs/transaction.c | 5 +++++
> >  2 files changed, 8 insertions(+)
> >
> > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> > index 5de1a1293c93..80560065cc1b 100644
> > --- a/fs/btrfs/extent-tree.c
> > +++ b/fs/btrfs/extent-tree.c
> > @@ -2868,7 +2868,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
> >                                                    block_group->length,
> >                                                    &trimmed);
> >
> > +               spin_lock(&fs_info->unused_bgs_lock);
> >                 list_del_init(&block_group->bg_list);
> > +               spin_unlock(&fs_info->unused_bgs_lock);
> 
> This shouldn't need the lock, because block groups added to the
> transaction->deleted_bgs list were made RO only before at
> btrfs_delete_unused_bgs().
> 

My thinking is that it is a lot easier to reason about this if we also
lock it when modifying the bg_list. That will insulate us against any
possible bugs with different uses of bg_list attaching to various lists.

When hunting for "broken refcount maybe" this time around, I had to dig
through all of these and carefully analyze them.

I agree with you that these are probably not strictly necessary in any
way, which is partly why I made them a separate patch from the one I
think is a bug. Perhaps I should retitle the patch to not use the terms
"fix" and "race" and instead call it something like:
"harden uses of bg_list against possible races" or something?

> > +
> >                 btrfs_unfreeze_block_group(block_group);
> >                 btrfs_put_block_group(block_group);
> >
> > diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> > index db8fe291d010..c98a8efd1bea 100644
> > --- a/fs/btrfs/transaction.c
> > +++ b/fs/btrfs/transaction.c
> > @@ -160,7 +160,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
> >                         cache = list_first_entry(&transaction->deleted_bgs,
> >                                                  struct btrfs_block_group,
> >                                                  bg_list);
> > +                       spin_lock(&transaction->fs_info->unused_bgs_lock);
> >                         list_del_init(&cache->bg_list);
> > +                       spin_unlock(&transaction->fs_info->unused_bgs_lock);
> 
> In the transaction abort path no else should be messing up with the list too.
> 
> >                         btrfs_unfreeze_block_group(cache);
> >                         btrfs_put_block_group(cache);
> >                 }
> > @@ -2096,7 +2098,10 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
> >
> >         list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
> >                 btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
> > +              spin_lock(&fs_info->unused_bgs_lock);
> >                 list_del_init(&block_group->bg_list);
> > +              btrfs_put_block_group(block_group);
> > +              spin_unlock(&fs_info->unused_bgs_lock);
> 
> What's this new btrfs_put_block_group() for? I don't see a matching
> ref count increase in the patch.
> Or is this fixing a separate bug? Where's the matching ref count
> increase in the codebase?

Sloppy to include it here, sorry. I can pull it out separately if you
like.

The intention of this change is to simply be disciplined about
maintaining the invariant that "bg is linked to a list via bg_list iff
bg refcount is incremented". That way we can confidently assert that the
list should be empty upon deletion, and catch more bugs, for example.

It certainly matters the least on transaction abort, when the state is
messed up anyway.
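
[Editor's note: for illustration only, a sketch of that invariant expressed as paired helpers. The bg_list_add()/bg_list_del() names are hypothetical and not part of this patch; btrfs_get_block_group(), btrfs_put_block_group() and unused_bgs_lock are the existing names.]

static void bg_list_add(struct btrfs_fs_info *fs_info,
			struct btrfs_block_group *bg,
			struct list_head *list)
{
	spin_lock(&fs_info->unused_bgs_lock);
	if (list_empty(&bg->bg_list)) {
		/* List membership owns exactly one reference. */
		btrfs_get_block_group(bg);
		list_add_tail(&bg->bg_list, list);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

static void bg_list_del(struct btrfs_fs_info *fs_info,
			struct btrfs_block_group *bg)
{
	spin_lock(&fs_info->unused_bgs_lock);
	if (!list_empty(&bg->bg_list)) {
		list_del_init(&bg->bg_list);
		/* Drop the reference that the list membership owned. */
		btrfs_put_block_group(bg);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

[Under that discipline, every list_del_init() of bg_list is paired with a put under the lock, which is the pattern the hunks in this patch move toward.]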

> 
> As before, we're in the transaction abort path, no one should be
> messing with the list too.
> 
> I don't mind adding the locking just to be safe, but leaving a comment
> to mention it shouldn't be needed and why there's concurrency expected
> in these cases would be nice.

I can definitely add a comment to the ones where we don't expect it to
matter. (As well as the re-titling I suggested above.)

> 
> Thanks.
> 
> >         }
> >  }
> 
> >
> > --
> > 2.48.1
> >
> >

Patch

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5de1a1293c93..80560065cc1b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2868,7 +2868,10 @@  int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
 						   block_group->length,
 						   &trimmed);
 
+		spin_lock(&fs_info->unused_bgs_lock);
 		list_del_init(&block_group->bg_list);
+		spin_unlock(&fs_info->unused_bgs_lock);
+
 		btrfs_unfreeze_block_group(block_group);
 		btrfs_put_block_group(block_group);
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index db8fe291d010..c98a8efd1bea 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -160,7 +160,9 @@  void btrfs_put_transaction(struct btrfs_transaction *transaction)
 			cache = list_first_entry(&transaction->deleted_bgs,
 						 struct btrfs_block_group,
 						 bg_list);
+			spin_lock(&transaction->fs_info->unused_bgs_lock);
 			list_del_init(&cache->bg_list);
+			spin_unlock(&transaction->fs_info->unused_bgs_lock);
 			btrfs_unfreeze_block_group(cache);
 			btrfs_put_block_group(cache);
 		}
@@ -2096,7 +2098,10 @@  static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
 
        list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
                btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
+	       spin_lock(&fs_info->unused_bgs_lock);
                list_del_init(&block_group->bg_list);
+	       btrfs_put_block_group(block_group);
+	       spin_unlock(&fs_info->unused_bgs_lock);
        }
 }