diff mbox series

btrfs: use atomic64_t for free_objectid

Message ID 20250122122459.1148668-1-maharmstone@fb.com (mailing list archive)
State New
Headers show
Series btrfs: use atomic64_t for free_objectid | expand

Commit Message

Mark Harmstone Jan. 22, 2025, 12:21 p.m. UTC
Currently btrfs_get_free_objectid() uses a mutex to protect
free_objectid; I'm guessing this was because of the inode cache that we
used to have. The inode cache is no more, so simplify things by
replacing it with an atomic.

There's no issues with ordering: free_objectid gets set to an initial
value, then calls to btrfs_get_free_objectid() return a monotonically
increasing value.

This change means that btrfs_get_free_objectid() will no longer
potentially sleep, which was a blocker for adding a non-blocking mode
for inode and subvol creation.

Signed-off-by: Mark Harmstone <maharmstone@fb.com>
---
 fs/btrfs/ctree.h    |  4 +---
 fs/btrfs/disk-io.c  | 43 ++++++++++++++++++-------------------------
 fs/btrfs/qgroup.c   | 11 ++++++-----
 fs/btrfs/tree-log.c |  3 ---
 4 files changed, 25 insertions(+), 36 deletions(-)

Comments

Filipe Manana Jan. 22, 2025, 12:39 p.m. UTC | #1
On Wed, Jan 22, 2025 at 12:25 PM Mark Harmstone <maharmstone@fb.com> wrote:
>
> Currently btrfs_get_free_objectid() uses a mutex to protect
> free_objectid; I'm guessing this was because of the inode cache that we
> used to have. The inode cache is no more, so simplify things by
> replacing it with an atomic.
>
> There's no issues with ordering: free_objectid gets set to an initial
> value, then calls to btrfs_get_free_objectid() return a monotonically
> increasing value.
>
> This change means that btrfs_get_free_objectid() will no longer
> potentially sleep, which was a blocker for adding a non-blocking mode
> for inode and subvol creation.
>
> Signed-off-by: Mark Harmstone <maharmstone@fb.com>
> ---
>  fs/btrfs/ctree.h    |  4 +---
>  fs/btrfs/disk-io.c  | 43 ++++++++++++++++++-------------------------
>  fs/btrfs/qgroup.c   | 11 ++++++-----
>  fs/btrfs/tree-log.c |  3 ---
>  4 files changed, 25 insertions(+), 36 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 1096a80a64e7..04175698525b 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -179,8 +179,6 @@ struct btrfs_root {
>         struct btrfs_fs_info *fs_info;
>         struct extent_io_tree dirty_log_pages;
>
> -       struct mutex objectid_mutex;
> -
>         spinlock_t accounting_lock;
>         struct btrfs_block_rsv *block_rsv;
>
> @@ -214,7 +212,7 @@ struct btrfs_root {
>
>         u64 last_trans;
>
> -       u64 free_objectid;
> +       atomic64_t free_objectid;
>
>         struct btrfs_key defrag_progress;
>         struct btrfs_key defrag_max;
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index f09db62e61a1..0543d9c3f8c0 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -659,7 +659,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
>         RB_CLEAR_NODE(&root->rb_node);
>
>         btrfs_set_root_last_trans(root, 0);
> -       root->free_objectid = 0;
> +       atomic64_set(&root->free_objectid, 0);
>         root->nr_delalloc_inodes = 0;
>         root->nr_ordered_extents = 0;
>         xa_init(&root->inodes);
> @@ -678,7 +678,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
>         spin_lock_init(&root->ordered_extent_lock);
>         spin_lock_init(&root->accounting_lock);
>         spin_lock_init(&root->qgroup_meta_rsv_lock);
> -       mutex_init(&root->objectid_mutex);
>         mutex_init(&root->log_mutex);
>         mutex_init(&root->ordered_extent_mutex);
>         mutex_init(&root->delalloc_mutex);
> @@ -1133,16 +1132,12 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
>                 }
>         }
>
> -       mutex_lock(&root->objectid_mutex);
>         ret = btrfs_init_root_free_objectid(root);
> -       if (ret) {
> -               mutex_unlock(&root->objectid_mutex);
> +       if (ret)
>                 goto fail;
> -       }
>
> -       ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
> -
> -       mutex_unlock(&root->objectid_mutex);
> +       ASSERT((u64)atomic64_read(&root->free_objectid) <=
> +               BTRFS_LAST_FREE_OBJECTID);
>
>         return 0;
>  fail:
> @@ -2730,8 +2725,9 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
>                 }
>
>                 /*
> -                * No need to hold btrfs_root::objectid_mutex since the fs
> -                * hasn't been fully initialised and we are the only user
> +                * No need to worry about atomic ordering of btrfs_root::free_objectid
> +                * since the fs hasn't been fully initialised and we are the
> +                * only user
>                  */
>                 ret = btrfs_init_root_free_objectid(tree_root);
>                 if (ret < 0) {
> @@ -2739,7 +2735,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
>                         continue;
>                 }
>
> -               ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
> +               ASSERT((u64)atomic64_read(&tree_root->free_objectid) <=
> +                       BTRFS_LAST_FREE_OBJECTID);
>
>                 ret = btrfs_read_roots(fs_info);
>                 if (ret < 0) {
> @@ -4931,10 +4928,11 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
>                 slot = path->slots[0] - 1;
>                 l = path->nodes[0];
>                 btrfs_item_key_to_cpu(l, &found_key, slot);
> -               root->free_objectid = max_t(u64, found_key.objectid + 1,
> -                                           BTRFS_FIRST_FREE_OBJECTID);
> +               atomic64_set(&root->free_objectid,
> +                            max_t(u64, found_key.objectid + 1,
> +                                  BTRFS_FIRST_FREE_OBJECTID));
>         } else {
> -               root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
> +               atomic64_set(&root->free_objectid, BTRFS_FIRST_FREE_OBJECTID);
>         }
>         ret = 0;
>  error:
> @@ -4944,20 +4942,15 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
>
>  int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
>  {
> -       int ret;
> -       mutex_lock(&root->objectid_mutex);
> +       u64 val = atomic64_inc_return(&root->free_objectid) - 1;
>
> -       if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
> +       if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) {
>                 btrfs_warn(root->fs_info,
>                            "the objectid of root %llu reaches its highest value",
>                            btrfs_root_id(root));
> -               ret = -ENOSPC;
> -               goto out;
> +               return -ENOSPC;
>         }
>
> -       *objectid = root->free_objectid++;
> -       ret = 0;

So this gives different semantics now.

Before we increment free_objectid only if it's less than
BTRFS_LAST_FREE_OBJECTID, so once we reach that value we can't assign
more object IDs and must return -ENOSPC.

But now we always increment and then do the check, so after some calls
to btrfs_get_free_objectid() we wrap around the counter due to
overflow and eventually allow reusing already assigned object IDs.

I'm afraid the lock is still needed because of that. To make it more
lightweight maybe switch the mutex to a spinlock.

Thanks.

> -out:
> -       mutex_unlock(&root->objectid_mutex);
> -       return ret;
> +       *objectid = val;
> +       return 0;
>  }
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index aaf16019d829..b41e06d5d2fb 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -472,18 +472,19 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
>                          *
>                          * Ensure that we skip any such subvol ids.
>                          *
> -                        * We don't need to lock because this is only called
> -                        * during mount before we start doing things like creating
> -                        * subvolumes.
> +                        * We don't need to worry about atomic ordering because
> +                        * this is only called during mount before we start
> +                        * doing things like creating subvolumes.
>                          */
>                         if (is_fstree(qgroup->qgroupid) &&
> -                           qgroup->qgroupid > tree_root->free_objectid)
> +                           qgroup->qgroupid > (u64)atomic64_read(&tree_root->free_objectid))
>                                 /*
>                                  * Don't need to check against BTRFS_LAST_FREE_OBJECTID,
>                                  * as it will get checked on the next call to
>                                  * btrfs_get_free_objectid.
>                                  */
> -                               tree_root->free_objectid = qgroup->qgroupid + 1;
> +                               atomic64_set(&tree_root->free_objectid,
> +                                            qgroup->qgroupid + 1);
>                 }
>                 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
>                 if (ret < 0)
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index 955d1677e865..9d19528fee17 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -7325,9 +7325,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
>                          * We have just replayed everything, and the highest
>                          * objectid of fs roots probably has changed in case
>                          * some inode_item's got replayed.
> -                        *
> -                        * root->objectid_mutex is not acquired as log replay
> -                        * could only happen during mount.
>                          */
>                         ret = btrfs_init_root_free_objectid(root);
>                         if (ret)
> --
> 2.45.2
>
>
Mark Harmstone Jan. 22, 2025, 12:59 p.m. UTC | #2
On 22/1/25 12:39, Filipe Manana wrote:
> On Wed, Jan 22, 2025 at 12:25 PM Mark Harmstone <maharmstone@fb.com> wrote:
>> @@ -4944,20 +4942,15 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
>>
>>   int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
>>   {
>> -       int ret;
>> -       mutex_lock(&root->objectid_mutex);
>> +       u64 val = atomic64_inc_return(&root->free_objectid) - 1;
>>
>> -       if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
>> +       if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) {
>>                  btrfs_warn(root->fs_info,
>>                             "the objectid of root %llu reaches its highest value",
>>                             btrfs_root_id(root));
>> -               ret = -ENOSPC;
>> -               goto out;
>> +               return -ENOSPC;
>>          }
>>
>> -       *objectid = root->free_objectid++;
>> -       ret = 0;
> 
> So this gives different semantics now.
> 
> Before we increment free_objectid only if it's less than
> BTRFS_LAST_FREE_OBJECTID, so once we reach that value we can't assign
> more object IDs and must return -ENOSPC.
> 
> But now we always increment and then do the check, so after some calls
> to btrfs_get_free_objectid() we wrap around the counter due to
> overflow and eventually allow reusing already assigned object IDs.
> 
> I'm afraid the lock is still needed because of that. To make it more
> lightweight maybe switch the mutex to a spinlock.
> 
> Thanks.

Thanks Filipe. Do we even need the check, really? Even a denial of 
service attack wouldn't be able to practically call the function ~2^64 
times. And there's no way to create an inode with an arbitrary number, 
short of manually hex-editing the disk.

Mark
Filipe Manana Jan. 22, 2025, 1:04 p.m. UTC | #3
On Wed, Jan 22, 2025 at 12:59 PM Mark Harmstone <maharmstone@meta.com> wrote:
>
> On 22/1/25 12:39, Filipe Manana wrote:
> > On Wed, Jan 22, 2025 at 12:25 PM Mark Harmstone <maharmstone@fb.com> wrote:
> >> @@ -4944,20 +4942,15 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
> >>
> >>   int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
> >>   {
> >> -       int ret;
> >> -       mutex_lock(&root->objectid_mutex);
> >> +       u64 val = atomic64_inc_return(&root->free_objectid) - 1;
> >>
> >> -       if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
> >> +       if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) {
> >>                  btrfs_warn(root->fs_info,
> >>                             "the objectid of root %llu reaches its highest value",
> >>                             btrfs_root_id(root));
> >> -               ret = -ENOSPC;
> >> -               goto out;
> >> +               return -ENOSPC;
> >>          }
> >>
> >> -       *objectid = root->free_objectid++;
> >> -       ret = 0;
> >
> > So this gives different semantics now.
> >
> > Before we increment free_objectid only if it's less than
> > BTRFS_LAST_FREE_OBJECTID, so once we reach that value we can't assign
> > more object IDs and must return -ENOSPC.
> >
> > But now we always increment and then do the check, so after some calls
> > to btrfs_get_free_objectid() we wrap around the counter due to
> > overflow and eventually allow reusing already assigned object IDs.
> >
> > I'm afraid the lock is still needed because of that. To make it more
> > lightweight maybe switch the mutex to a spinlock.
> >
> > Thanks.
>
> Thanks Filipe. Do we even need the check, really? Even a denial of
> service attack wouldn't be able to practically call the function ~2^64
> times. And there's no way to create an inode with an arbitrary number,
> short of manually hex-editing the disk.

Well, we do such limit checks everywhere; why would we ignore them only here?
Even if they are very unlikely to be hit in practice, if they happen,
we can get into serious trouble.

So I don't think it's wise to ignore the limit.

Thanks.

>
> Mark
Daniel Vacek Jan. 22, 2025, 1:51 p.m. UTC | #4
On Wed, 22 Jan 2025 at 13:40, Filipe Manana <fdmanana@kernel.org> wrote:
>
> On Wed, Jan 22, 2025 at 12:25 PM Mark Harmstone <maharmstone@fb.com> wrote:
> >
> > Currently btrfs_get_free_objectid() uses a mutex to protect
> > free_objectid; I'm guessing this was because of the inode cache that we
> > used to have. The inode cache is no more, so simplify things by
> > replacing it with an atomic.
> >
> > There's no issues with ordering: free_objectid gets set to an initial
> > value, then calls to btrfs_get_free_objectid() return a monotonically
> > increasing value.
> >
> > This change means that btrfs_get_free_objectid() will no longer
> > potentially sleep, which was a blocker for adding a non-blocking mode
> > for inode and subvol creation.
> >
> > Signed-off-by: Mark Harmstone <maharmstone@fb.com>
> > ---
> >  fs/btrfs/ctree.h    |  4 +---
> >  fs/btrfs/disk-io.c  | 43 ++++++++++++++++++-------------------------
> >  fs/btrfs/qgroup.c   | 11 ++++++-----
> >  fs/btrfs/tree-log.c |  3 ---
> >  4 files changed, 25 insertions(+), 36 deletions(-)
> >
> > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> > index 1096a80a64e7..04175698525b 100644
> > --- a/fs/btrfs/ctree.h
> > +++ b/fs/btrfs/ctree.h
> > @@ -179,8 +179,6 @@ struct btrfs_root {
> >         struct btrfs_fs_info *fs_info;
> >         struct extent_io_tree dirty_log_pages;
> >
> > -       struct mutex objectid_mutex;
> > -
> >         spinlock_t accounting_lock;
> >         struct btrfs_block_rsv *block_rsv;
> >
> > @@ -214,7 +212,7 @@ struct btrfs_root {
> >
> >         u64 last_trans;
> >
> > -       u64 free_objectid;
> > +       atomic64_t free_objectid;
> >
> >         struct btrfs_key defrag_progress;
> >         struct btrfs_key defrag_max;
> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> > index f09db62e61a1..0543d9c3f8c0 100644
> > --- a/fs/btrfs/disk-io.c
> > +++ b/fs/btrfs/disk-io.c
> > @@ -659,7 +659,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
> >         RB_CLEAR_NODE(&root->rb_node);
> >
> >         btrfs_set_root_last_trans(root, 0);
> > -       root->free_objectid = 0;
> > +       atomic64_set(&root->free_objectid, 0);
> >         root->nr_delalloc_inodes = 0;
> >         root->nr_ordered_extents = 0;
> >         xa_init(&root->inodes);
> > @@ -678,7 +678,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
> >         spin_lock_init(&root->ordered_extent_lock);
> >         spin_lock_init(&root->accounting_lock);
> >         spin_lock_init(&root->qgroup_meta_rsv_lock);
> > -       mutex_init(&root->objectid_mutex);
> >         mutex_init(&root->log_mutex);
> >         mutex_init(&root->ordered_extent_mutex);
> >         mutex_init(&root->delalloc_mutex);
> > @@ -1133,16 +1132,12 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
> >                 }
> >         }
> >
> > -       mutex_lock(&root->objectid_mutex);
> >         ret = btrfs_init_root_free_objectid(root);
> > -       if (ret) {
> > -               mutex_unlock(&root->objectid_mutex);
> > +       if (ret)
> >                 goto fail;
> > -       }
> >
> > -       ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
> > -
> > -       mutex_unlock(&root->objectid_mutex);
> > +       ASSERT((u64)atomic64_read(&root->free_objectid) <=
> > +               BTRFS_LAST_FREE_OBJECTID);
> >
> >         return 0;
> >  fail:
> > @@ -2730,8 +2725,9 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
> >                 }
> >
> >                 /*
> > -                * No need to hold btrfs_root::objectid_mutex since the fs
> > -                * hasn't been fully initialised and we are the only user
> > +                * No need to worry about atomic ordering of btrfs_root::free_objectid
> > +                * since the fs hasn't been fully initialised and we are the
> > +                * only user
> >                  */
> >                 ret = btrfs_init_root_free_objectid(tree_root);
> >                 if (ret < 0) {
> > @@ -2739,7 +2735,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
> >                         continue;
> >                 }
> >
> > -               ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
> > +               ASSERT((u64)atomic64_read(&tree_root->free_objectid) <=
> > +                       BTRFS_LAST_FREE_OBJECTID);
> >
> >                 ret = btrfs_read_roots(fs_info);
> >                 if (ret < 0) {
> > @@ -4931,10 +4928,11 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
> >                 slot = path->slots[0] - 1;
> >                 l = path->nodes[0];
> >                 btrfs_item_key_to_cpu(l, &found_key, slot);
> > -               root->free_objectid = max_t(u64, found_key.objectid + 1,
> > -                                           BTRFS_FIRST_FREE_OBJECTID);
> > +               atomic64_set(&root->free_objectid,
> > +                            max_t(u64, found_key.objectid + 1,
> > +                                  BTRFS_FIRST_FREE_OBJECTID));
> >         } else {
> > -               root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
> > +               atomic64_set(&root->free_objectid, BTRFS_FIRST_FREE_OBJECTID);
> >         }
> >         ret = 0;
> >  error:
> > @@ -4944,20 +4942,15 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
> >
> >  int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
> >  {
> > -       int ret;
> > -       mutex_lock(&root->objectid_mutex);
> > +       u64 val = atomic64_inc_return(&root->free_objectid) - 1;
> >
> > -       if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
> > +       if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) {
> >                 btrfs_warn(root->fs_info,
> >                            "the objectid of root %llu reaches its highest value",
> >                            btrfs_root_id(root));
> > -               ret = -ENOSPC;
> > -               goto out;
> > +               return -ENOSPC;
> >         }
> >
> > -       *objectid = root->free_objectid++;
> > -       ret = 0;
>
> So this gives different semantics now.
>
> Before we increment free_objectid only if it's less than
> BTRFS_LAST_FREE_OBJECTID, so once we reach that value we can't assign
> more object IDs and must return -ENOSPC.
>
> But now we always increment and then do the check, so after some calls
> to btrfs_get_free_objectid() we wrap around the counter due to
> overflow and eventually allow reusing already assigned object IDs.
>
> I'm afraid the lock is still needed because of that. To make it more
> lightweight maybe switch the mutex to a spinlock.

How about this?

```
retry:  val = atomic64_read(&root->free_objectid);
        ....;
        if (atomic64_cmpxchg(&root->free_objectid, val, val+1) != val)
                goto retry;
        *objectid = val;
        return 0;
```

> Thanks.
>
> > -out:
> > -       mutex_unlock(&root->objectid_mutex);
> > -       return ret;
> > +       *objectid = val;
> > +       return 0;
> >  }
> > diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> > index aaf16019d829..b41e06d5d2fb 100644
> > --- a/fs/btrfs/qgroup.c
> > +++ b/fs/btrfs/qgroup.c
> > @@ -472,18 +472,19 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
> >                          *
> >                          * Ensure that we skip any such subvol ids.
> >                          *
> > -                        * We don't need to lock because this is only called
> > -                        * during mount before we start doing things like creating
> > -                        * subvolumes.
> > +                        * We don't need to worry about atomic ordering because
> > +                        * this is only called during mount before we start
> > +                        * doing things like creating subvolumes.
> >                          */
> >                         if (is_fstree(qgroup->qgroupid) &&
> > -                           qgroup->qgroupid > tree_root->free_objectid)
> > +                           qgroup->qgroupid > (u64)atomic64_read(&tree_root->free_objectid))
> >                                 /*
> >                                  * Don't need to check against BTRFS_LAST_FREE_OBJECTID,
> >                                  * as it will get checked on the next call to
> >                                  * btrfs_get_free_objectid.
> >                                  */
> > -                               tree_root->free_objectid = qgroup->qgroupid + 1;
> > +                               atomic64_set(&tree_root->free_objectid,
> > +                                            qgroup->qgroupid + 1);
> >                 }
> >                 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
> >                 if (ret < 0)
> > diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> > index 955d1677e865..9d19528fee17 100644
> > --- a/fs/btrfs/tree-log.c
> > +++ b/fs/btrfs/tree-log.c
> > @@ -7325,9 +7325,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
> >                          * We have just replayed everything, and the highest
> >                          * objectid of fs roots probably has changed in case
> >                          * some inode_item's got replayed.
> > -                        *
> > -                        * root->objectid_mutex is not acquired as log replay
> > -                        * could only happen during mount.
> >                          */
> >                         ret = btrfs_init_root_free_objectid(root);
> >                         if (ret)
> > --
> > 2.45.2
> >
> >
>
David Sterba Jan. 22, 2025, 3:07 p.m. UTC | #5
On Wed, Jan 22, 2025 at 12:21:28PM +0000, Mark Harmstone wrote:
> Currently btrfs_get_free_objectid() uses a mutex to protect
> free_objectid; I'm guessing this was because of the inode cache that we
> used to have. The inode cache is no more, so simplify things by
> replacing it with an atomic.
> 
> There's no issues with ordering: free_objectid gets set to an initial
> value, then calls to btrfs_get_free_objectid() return a monotonically
> increasing value.
> 
> This change means that btrfs_get_free_objectid() will no longer
> potentially sleep, which was a blocker for adding a non-blocking mode
> for inode and subvol creation.
> 
> Signed-off-by: Mark Harmstone <maharmstone@fb.com>
> ---
>  fs/btrfs/ctree.h    |  4 +---
>  fs/btrfs/disk-io.c  | 43 ++++++++++++++++++-------------------------
>  fs/btrfs/qgroup.c   | 11 ++++++-----
>  fs/btrfs/tree-log.c |  3 ---
>  4 files changed, 25 insertions(+), 36 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 1096a80a64e7..04175698525b 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -179,8 +179,6 @@ struct btrfs_root {
>  	struct btrfs_fs_info *fs_info;
>  	struct extent_io_tree dirty_log_pages;
>  
> -	struct mutex objectid_mutex;
> -
>  	spinlock_t accounting_lock;
>  	struct btrfs_block_rsv *block_rsv;
>  
> @@ -214,7 +212,7 @@ struct btrfs_root {
>  
>  	u64 last_trans;
>  
> -	u64 free_objectid;
> +	atomic64_t free_objectid;

Besides the other things pointed out, this also changes the type from
u64 to s64 (or requires casts), so we should keep u64, as that is what
the on-disk format defines.
Mark Harmstone Jan. 22, 2025, 6:19 p.m. UTC | #6
On 22/1/25 15:07, David Sterba wrote:
> > 
> On Wed, Jan 22, 2025 at 12:21:28PM +0000, Mark Harmstone wrote:
>> Currently btrfs_get_free_objectid() uses a mutex to protect
>> free_objectid; I'm guessing this was because of the inode cache that we
>> used to have. The inode cache is no more, so simplify things by
>> replacing it with an atomic.
>>
>> There's no issues with ordering: free_objectid gets set to an initial
>> value, then calls to btrfs_get_free_objectid() return a monotonically
>> increasing value.
>>
>> This change means that btrfs_get_free_objectid() will no longer
>> potentially sleep, which was a blocker for adding a non-blocking mode
>> for inode and subvol creation.
>>
>> Signed-off-by: Mark Harmstone <maharmstone@fb.com>
>> ---
>>   fs/btrfs/ctree.h    |  4 +---
>>   fs/btrfs/disk-io.c  | 43 ++++++++++++++++++-------------------------
>>   fs/btrfs/qgroup.c   | 11 ++++++-----
>>   fs/btrfs/tree-log.c |  3 ---
>>   4 files changed, 25 insertions(+), 36 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 1096a80a64e7..04175698525b 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -179,8 +179,6 @@ struct btrfs_root {
>>   	struct btrfs_fs_info *fs_info;
>>   	struct extent_io_tree dirty_log_pages;
>>   
>> -	struct mutex objectid_mutex;
>> -
>>   	spinlock_t accounting_lock;
>>   	struct btrfs_block_rsv *block_rsv;
>>   
>> @@ -214,7 +212,7 @@ struct btrfs_root {
>>   
>>   	u64 last_trans;
>>   
>> -	u64 free_objectid;
>> +	atomic64_t free_objectid;
> 
> Besides the other things pointed out, this also changes the type from
> u64 to s64 or requiring casts so we keep u64 as this is what the on-disk
> format defines.

It does, but there are casts to u64 every time it's read (which, asserts 
aside, is only in btrfs_get_free_objectid).
diff mbox series

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1096a80a64e7..04175698525b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -179,8 +179,6 @@  struct btrfs_root {
 	struct btrfs_fs_info *fs_info;
 	struct extent_io_tree dirty_log_pages;
 
-	struct mutex objectid_mutex;
-
 	spinlock_t accounting_lock;
 	struct btrfs_block_rsv *block_rsv;
 
@@ -214,7 +212,7 @@  struct btrfs_root {
 
 	u64 last_trans;
 
-	u64 free_objectid;
+	atomic64_t free_objectid;
 
 	struct btrfs_key defrag_progress;
 	struct btrfs_key defrag_max;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f09db62e61a1..0543d9c3f8c0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -659,7 +659,7 @@  static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	RB_CLEAR_NODE(&root->rb_node);
 
 	btrfs_set_root_last_trans(root, 0);
-	root->free_objectid = 0;
+	atomic64_set(&root->free_objectid, 0);
 	root->nr_delalloc_inodes = 0;
 	root->nr_ordered_extents = 0;
 	xa_init(&root->inodes);
@@ -678,7 +678,6 @@  static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	spin_lock_init(&root->ordered_extent_lock);
 	spin_lock_init(&root->accounting_lock);
 	spin_lock_init(&root->qgroup_meta_rsv_lock);
-	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
 	mutex_init(&root->ordered_extent_mutex);
 	mutex_init(&root->delalloc_mutex);
@@ -1133,16 +1132,12 @@  static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
 		}
 	}
 
-	mutex_lock(&root->objectid_mutex);
 	ret = btrfs_init_root_free_objectid(root);
-	if (ret) {
-		mutex_unlock(&root->objectid_mutex);
+	if (ret)
 		goto fail;
-	}
 
-	ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
-
-	mutex_unlock(&root->objectid_mutex);
+	ASSERT((u64)atomic64_read(&root->free_objectid) <=
+		BTRFS_LAST_FREE_OBJECTID);
 
 	return 0;
 fail:
@@ -2730,8 +2725,9 @@  static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
 		}
 
 		/*
-		 * No need to hold btrfs_root::objectid_mutex since the fs
-		 * hasn't been fully initialised and we are the only user
+		 * No need to worry about atomic ordering of btrfs_root::free_objectid
+		 * since the fs hasn't been fully initialised and we are the
+		 * only user
 		 */
 		ret = btrfs_init_root_free_objectid(tree_root);
 		if (ret < 0) {
@@ -2739,7 +2735,8 @@  static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
 			continue;
 		}
 
-		ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
+		ASSERT((u64)atomic64_read(&tree_root->free_objectid) <=
+			BTRFS_LAST_FREE_OBJECTID);
 
 		ret = btrfs_read_roots(fs_info);
 		if (ret < 0) {
@@ -4931,10 +4928,11 @@  int btrfs_init_root_free_objectid(struct btrfs_root *root)
 		slot = path->slots[0] - 1;
 		l = path->nodes[0];
 		btrfs_item_key_to_cpu(l, &found_key, slot);
-		root->free_objectid = max_t(u64, found_key.objectid + 1,
-					    BTRFS_FIRST_FREE_OBJECTID);
+		atomic64_set(&root->free_objectid,
+			     max_t(u64, found_key.objectid + 1,
+				   BTRFS_FIRST_FREE_OBJECTID));
 	} else {
-		root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
+		atomic64_set(&root->free_objectid, BTRFS_FIRST_FREE_OBJECTID);
 	}
 	ret = 0;
 error:
@@ -4944,20 +4942,15 @@  int btrfs_init_root_free_objectid(struct btrfs_root *root)
 
 int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
 {
-	int ret;
-	mutex_lock(&root->objectid_mutex);
+	u64 val = atomic64_inc_return(&root->free_objectid) - 1;
 
-	if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+	if (unlikely(val >= BTRFS_LAST_FREE_OBJECTID)) {
 		btrfs_warn(root->fs_info,
 			   "the objectid of root %llu reaches its highest value",
 			   btrfs_root_id(root));
-		ret = -ENOSPC;
-		goto out;
+		return -ENOSPC;
 	}
 
-	*objectid = root->free_objectid++;
-	ret = 0;
-out:
-	mutex_unlock(&root->objectid_mutex);
-	return ret;
+	*objectid = val;
+	return 0;
 }
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index aaf16019d829..b41e06d5d2fb 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -472,18 +472,19 @@  int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
 			 *
 			 * Ensure that we skip any such subvol ids.
 			 *
-			 * We don't need to lock because this is only called
-			 * during mount before we start doing things like creating
-			 * subvolumes.
+			 * We don't need to worry about atomic ordering because
+			 * this is only called during mount before we start
+			 * doing things like creating subvolumes.
 			 */
 			if (is_fstree(qgroup->qgroupid) &&
-			    qgroup->qgroupid > tree_root->free_objectid)
+			    qgroup->qgroupid > (u64)atomic64_read(&tree_root->free_objectid))
 				/*
 				 * Don't need to check against BTRFS_LAST_FREE_OBJECTID,
 				 * as it will get checked on the next call to
 				 * btrfs_get_free_objectid.
 				 */
-				tree_root->free_objectid = qgroup->qgroupid + 1;
+				atomic64_set(&tree_root->free_objectid,
+					     qgroup->qgroupid + 1);
 		}
 		ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
 		if (ret < 0)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 955d1677e865..9d19528fee17 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -7325,9 +7325,6 @@  int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 			 * We have just replayed everything, and the highest
 			 * objectid of fs roots probably has changed in case
 			 * some inode_item's got replayed.
-			 *
-			 * root->objectid_mutex is not acquired as log replay
-			 * could only happen during mount.
 			 */
 			ret = btrfs_init_root_free_objectid(root);
 			if (ret)