diff mbox series

btrfs: keep `priv` struct on stack for sync reads in `btrfs_encoded_read_regular_fill_pages()`

Message ID 20250108114326.1729250-1-neelx@suse.com (mailing list archive)
State New
Headers show
Series btrfs: keep `priv` struct on stack for sync reads in `btrfs_encoded_read_regular_fill_pages()` | expand

Commit Message

Daniel Vacek Jan. 8, 2025, 11:43 a.m. UTC
Only allocate the `priv` struct from slab for asynchronous mode.

There's no need to allocate an object from slab in the synchronous mode. In
such a case stack can be happily used as it used to be before 68d3b27e05c7
("btrfs: move priv off stack in btrfs_encoded_read_regular_fill_pages()")
which was a preparation for the async mode.

While at it, fix the comment to reflect the atomic => refcount change in
d29662695ed7 ("btrfs: fix use-after-free waiting for encoded read endios").

Signed-off-by: Daniel Vacek <neelx@suse.com>
---
 fs/btrfs/inode.c | 62 +++++++++++++++++++++++++-----------------------
 1 file changed, 32 insertions(+), 30 deletions(-)

Comments

Johannes Thumshirn Jan. 8, 2025, 12:42 p.m. UTC | #1
On 08.01.25 12:44, Daniel Vacek wrote:
> Only allocate the `priv` struct from slab for asynchronous mode.
> 
> There's no need to allocate an object from slab in the synchronous mode. In
> such a case stack can be happily used as it used to be before 68d3b27e05c7
> ("btrfs: move priv off stack in btrfs_encoded_read_regular_fill_pages()")
> which was a preparation for the async mode.
> 
> While at it, fix the comment to reflect the atomic => refcount change in
> d29662695ed7 ("btrfs: fix use-after-free waiting for encoded read endios").


Generally I'm not a huge fan of conditional allocation/freeing. It just 
complicates matters. I get it in case of the bio's bi_inline_vecs where 
it's an optimization, but I fail to see why it would make a difference in 
this case.

If we're really going down that route, there should at least be a 
justification other than "no need" to.

>   struct btrfs_encoded_read_private {
> -	struct completion done;
> +	struct completion *sync_reads;
>   	void *uring_ctx;
> -	refcount_t pending_refs;
> +	refcount_t pending_reads;
>   	blk_status_t status;
>   };

These renames just make the diff harder to read (and yes I shouldn't 
have renamed pending to pending_refs but that at least also changed the 
type).


> -	if (refcount_dec_and_test(&priv->pending_refs)) {
> -		int err = blk_status_to_errno(READ_ONCE(priv->status));
> -
> +	if (refcount_dec_and_test(&priv->pending_reads)) {
>   		if (priv->uring_ctx) {
> +			int err = blk_status_to_errno(READ_ONCE(priv->status));

Missing newline after the declaration.

>   	unsigned long i = 0;
>   	struct btrfs_bio *bbio;
> -	int ret;

That seems unrelated.


> @@ -9155,25 +9159,23 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
>   		disk_io_size -= bytes;
>   	} while (disk_io_size);
>   
> -	refcount_inc(&priv->pending_refs);
> +	refcount_inc(&priv->pending_reads);
>   	btrfs_submit_bbio(bbio, 0);
>   
>   	if (uring_ctx) {
> -		if (refcount_dec_and_test(&priv->pending_refs)) {
> -			ret = blk_status_to_errno(READ_ONCE(priv->status));
> -			btrfs_uring_read_extent_endio(uring_ctx, ret);
> +		if (refcount_dec_and_test(&priv->pending_reads)) {
> +			int err = blk_status_to_errno(READ_ONCE(priv->status));
> +			btrfs_uring_read_extent_endio(uring_ctx, err);
>   			kfree(priv);

Missing newline after the declaration, but still why can't we just keep ret?
Daniel Vacek Jan. 8, 2025, 3:24 p.m. UTC | #2
On Wed, 8 Jan 2025 at 13:42, Johannes Thumshirn
<Johannes.Thumshirn@wdc.com> wrote:
>
> On 08.01.25 12:44, Daniel Vacek wrote:
> > Only allocate the `priv` struct from slab for asynchronous mode.
> >
> > There's no need to allocate an object from slab in the synchronous mode. In
> > such a case stack can be happily used as it used to be before 68d3b27e05c7
> > ("btrfs: move priv off stack in btrfs_encoded_read_regular_fill_pages()")
> > which was a preparation for the async mode.
> >
> > While at it, fix the comment to reflect the atomic => refcount change in
> > d29662695ed7 ("btrfs: fix use-after-free waiting for encoded read endios").
>
>
> Generally I'm not a huge fan of conditional allocation/freeing. It just
> complicates matters. I get it in case of the bio's bi_inline_vecs where
> it's a optimization, but I fail to see why it would make a difference in
> this case.
>
> If we're really going down that route, there should at least be a
> justification other than "no need" to.

Well the main motivation was not to needlessly exercise the slab
allocator when IO uring is not used. It is a bit of an overhead,
though the object is not really big so I guess it's not a big deal
after all (the slab should manage just fine even under low memory
conditions).

68d3b27e05c7 added the allocation for the async mode but also changed
the original behavior of the sync mode which was using stack before.
The async mode indeed requires the allocation as the object's lifetime
extends over the function's one. The sync mode is perfectly contained
within as it always was.

Simply, I tend not to do any allocations which are not strictly
needed. If you prefer to simply allocate the object unconditionally,
we can just drop this patch.

> >   struct btrfs_encoded_read_private {
> > -     struct completion done;
> > +     struct completion *sync_reads;
> >       void *uring_ctx;
> > -     refcount_t pending_refs;
> > +     refcount_t pending_reads;
> >       blk_status_t status;
> >   };
>
> These renames just make the diff harder to read (and yes I shouldn't
> have renamed pending to pending_refs but that at least also changed the
> type).

I see. But also the completion is changed to a pointer here for a
reason and I tried to make it clear it's only used for sync reads,
hence the rename.

> > -     if (refcount_dec_and_test(&priv->pending_refs)) {
> > -             int err = blk_status_to_errno(READ_ONCE(priv->status));
> > -
> > +     if (refcount_dec_and_test(&priv->pending_reads)) {
> >               if (priv->uring_ctx) {
> > +                     int err = blk_status_to_errno(READ_ONCE(priv->status));
>
> Missing newline after the declaration.

Right. Clearly I did not run checkpatch before sending. Sorry.
I was just trying to match this block to the same one later, which did
not have the newline. (But it also did not have the declaration
before.)

> >       unsigned long i = 0;
> >       struct btrfs_bio *bbio;
> > -     int ret;
>
> That seems unrelated.

It's only used in the async branch. I prefer to have it local to that
branch for readability. Also it matches the same block from the
previous function.

> > @@ -9155,25 +9159,23 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
> >               disk_io_size -= bytes;
> >       } while (disk_io_size);
> >
> > -     refcount_inc(&priv->pending_refs);
> > +     refcount_inc(&priv->pending_reads);
> >       btrfs_submit_bbio(bbio, 0);
> >
> >       if (uring_ctx) {
> > -             if (refcount_dec_and_test(&priv->pending_refs)) {
> > -                     ret = blk_status_to_errno(READ_ONCE(priv->status));
> > -                     btrfs_uring_read_extent_endio(uring_ctx, ret);
> > +             if (refcount_dec_and_test(&priv->pending_reads)) {
> > +                     int err = blk_status_to_errno(READ_ONCE(priv->status));
> > +                     btrfs_uring_read_extent_endio(uring_ctx, err);
> >                       kfree(priv);
>
> Missing newline after the declaration, but still why can't we just keep ret?

The new line was not there before. But back then it was not a
declaration, right?

The rename to `err` just happened as I copied this block from
`btrfs_encoded_read_endio()`. It's a matching code, we could even
factor it out eventually. Again, it's just a bit more descriptive that
the returned value is an error code. It's a matter of preference. I
was just polishing the code a bit while already touching it.

--nX
David Sterba Jan. 8, 2025, 6:27 p.m. UTC | #3
On Wed, Jan 08, 2025 at 04:24:19PM +0100, Daniel Vacek wrote:
> On Wed, 8 Jan 2025 at 13:42, Johannes Thumshirn
> <Johannes.Thumshirn@wdc.com> wrote:
> > On 08.01.25 12:44, Daniel Vacek wrote:
> > > Only allocate the `priv` struct from slab for asynchronous mode.
> > >
> > > There's no need to allocate an object from slab in the synchronous mode. In
> > > such a case stack can be happily used as it used to be before 68d3b27e05c7
> > > ("btrfs: move priv off stack in btrfs_encoded_read_regular_fill_pages()")
> > > which was a preparation for the async mode.
> > >
> > > While at it, fix the comment to reflect the atomic => refcount change in
> > > d29662695ed7 ("btrfs: fix use-after-free waiting for encoded read endios").
> >
> > Generally I'm not a huge fan of conditional allocation/freeing. It just
> > complicates matters. I get it in case of the bio's bi_inline_vecs where
> > it's a optimization, but I fail to see why it would make a difference in
> > this case.
> >
> > If we're really going down that route, there should at least be a
> > justification other than "no need" to.
> 
> Well the main motivation was not to needlessly exercise the slab
> allocator when IO uring is not used. It is a bit of an overhead,
> though the object is not really big so I guess it's not a big deal
> after all (the slab should manage just fine even under low memory
> conditions).
> 
> 68d3b27e05c7 added the allocation for the async mode but also changed
> the original behavior of the sync mode which was using stack before.
> The async mode indeed requires the allocation as the object's lifetime
> extends over the function's one. The sync mode is perfectly contained
> within as it always was.
> 
> Simply, I tend not to do any allocations which are not strictly
> needed.

Nothing wrong about that and I think in kernel it's preferred to avoid
allocations in such cases. The sync case is calling the ioctl
repeatedly too, so reducing the allocator overhead makes sense, at the
slight cost of code complexity. The worst case is when the allocator has to
free memory by flushing or blocking the tasks, so we're actively
avoiding and reducing the chances for that.

> If you prefer to simply allocate the object unconditionally,
> we can just drop this patch.
> 
> > >   struct btrfs_encoded_read_private {
> > > -     struct completion done;
> > > +     struct completion *sync_reads;
> > >       void *uring_ctx;
> > > -     refcount_t pending_refs;
> > > +     refcount_t pending_reads;
> > >       blk_status_t status;
> > >   };
> >
> > These renames just make the diff harder to read (and yes I shouldn't
> > have renamed pending to pending_refs but that at least also changed the
> > type).
> 
> I see. But also the completion is changed to a pointer here for a
> reason and I tried to make it clear it's only used for sync reads,
> hence the rename.

Functional and cleanup/cosmetic changes are better kept separate. It's
keeping the scope of changes minimal and usually the fixes can get
backported so the cleanup changes are not wanted.

> > > -     if (refcount_dec_and_test(&priv->pending_refs)) {
> > > -             int err = blk_status_to_errno(READ_ONCE(priv->status));
> > > -
> > > +     if (refcount_dec_and_test(&priv->pending_reads)) {
> > >               if (priv->uring_ctx) {
> > > +                     int err = blk_status_to_errno(READ_ONCE(priv->status));
> >
> > Missing newline after the declaration.
> 
> Right. Clearly I did not run checkpatch before sending. Sorry.
> I was just trying to match this block to the same one later, which did
> not have the newline. (But it also did not have the declaration
> before.)

We don't stick to what checkpatch reports, I'm fixing a lot of coding
style when merging patches that would not pass checkpatch, the general
CodingStyle applies but we have a local fs/btrfs style too, loosely
documented at https://btrfs.readthedocs.io/en/latest/dev/Development-notes.html .
Johannes Thumshirn Jan. 8, 2025, 6:29 p.m. UTC | #4
On 08.01.25 16:25, Daniel Vacek wrote:
> On Wed, 8 Jan 2025 at 13:42, Johannes Thumshirn
> <Johannes.Thumshirn@wdc.com> wrote:
>>
>> On 08.01.25 12:44, Daniel Vacek wrote:
>>> Only allocate the `priv` struct from slab for asynchronous mode.
>>>
>>> There's no need to allocate an object from slab in the synchronous mode. In
>>> such a case stack can be happily used as it used to be before 68d3b27e05c7
>>> ("btrfs: move priv off stack in btrfs_encoded_read_regular_fill_pages()")
>>> which was a preparation for the async mode.
>>>
>>> While at it, fix the comment to reflect the atomic => refcount change in
>>> d29662695ed7 ("btrfs: fix use-after-free waiting for encoded read endios").
>>
>>
>> Generally I'm not a huge fan of conditional allocation/freeing. It just
>> complicates matters. I get it in case of the bio's bi_inline_vecs where
>> it's a optimization, but I fail to see why it would make a difference in
>> this case.
>>
>> If we're really going down that route, there should at least be a
>> justification other than "no need" to.
> 
> Well the main motivation was not to needlessly exercise the slab
> allocator when IO uring is not used. It is a bit of an overhead,
> though the object is not really big so I guess it's not a big deal
> after all (the slab should manage just fine even under low memory
> conditions).
> 
> 68d3b27e05c7 added the allocation for the async mode but also changed
> the original behavior of the sync mode which was using stack before.
> The async mode indeed requires the allocation as the object's lifetime
> extends over the function's one. The sync mode is perfectly contained
> within as it always was.
> 
> Simply, I tend not to do any allocations which are not strictly
> needed. If you prefer to simply allocate the object unconditionally,
> we can just drop this patch.

At the end of the day it's David's call, he's the maintainer. I'm just 
not sure if skipping the allocator for a small short lived object is 
worth the special casing. Especially as I got bitten by this in the past 
when hunting down kmemleak reports. Conditional allocation is like 
conditional locking, sometimes OK but it raises suspicion.
David Sterba Jan. 8, 2025, 6:35 p.m. UTC | #5
On Wed, Jan 08, 2025 at 06:29:24PM +0000, Johannes Thumshirn wrote:
> On 08.01.25 16:25, Daniel Vacek wrote:
> > On Wed, 8 Jan 2025 at 13:42, Johannes Thumshirn
> > <Johannes.Thumshirn@wdc.com> wrote:
> >>
> >> On 08.01.25 12:44, Daniel Vacek wrote:
> >>> Only allocate the `priv` struct from slab for asynchronous mode.
> >>>
> >>> There's no need to allocate an object from slab in the synchronous mode. In
> >>> such a case stack can be happily used as it used to be before 68d3b27e05c7
> >>> ("btrfs: move priv off stack in btrfs_encoded_read_regular_fill_pages()")
> >>> which was a preparation for the async mode.
> >>>
> >>> While at it, fix the comment to reflect the atomic => refcount change in
> >>> d29662695ed7 ("btrfs: fix use-after-free waiting for encoded read endios").
> >>
> >>
> >> Generally I'm not a huge fan of conditional allocation/freeing. It just
> >> complicates matters. I get it in case of the bio's bi_inline_vecs where
> >> it's a optimization, but I fail to see why it would make a difference in
> >> this case.
> >>
> >> If we're really going down that route, there should at least be a
> >> justification other than "no need" to.
> > 
> > Well the main motivation was not to needlessly exercise the slab
> > allocator when IO uring is not used. It is a bit of an overhead,
> > though the object is not really big so I guess it's not a big deal
> > after all (the slab should manage just fine even under low memory
> > conditions).
> > 
> > 68d3b27e05c7 added the allocation for the async mode but also changed
> > the original behavior of the sync mode which was using stack before.
> > The async mode indeed requires the allocation as the object's lifetime
> > extends over the function's one. The sync mode is perfectly contained
> > within as it always was.
> > 
> > Simply, I tend not to do any allocations which are not strictly
> > needed. If you prefer to simply allocate the object unconditionally,
> > we can just drop this patch.
> 
> At the end of the day it's David's call, he's the maintainer. I'm just 
> not sure if skipping the allocator for a small short lived object is 
> worth the special casing. Especially as I got bitten by this in the past 
> when hunting down kmemleak reports. Conditional allocation is like 
> conditional locking, sometimes OK but it raises suspicion.

Yeah, in this case it's the uring that makes the allocation/freeing
times different. The "normal" ioctl case does not need it so I think
it's keeping the scope clear while the uring has its own specialities
like returning to user space with inode lock held
(22d2e48e318564f8c9b09faf03ecb4f03fb44dd5).
Daniel Vacek Jan. 9, 2025, 8:31 a.m. UTC | #6
On Wed, 8 Jan 2025 at 19:27, David Sterba <dsterba@suse.cz> wrote:
>
> On Wed, Jan 08, 2025 at 04:24:19PM +0100, Daniel Vacek wrote:
> > On Wed, 8 Jan 2025 at 13:42, Johannes Thumshirn
> > <Johannes.Thumshirn@wdc.com> wrote:
> > > On 08.01.25 12:44, Daniel Vacek wrote:
> > > > Only allocate the `priv` struct from slab for asynchronous mode.
> > > >
> > > > There's no need to allocate an object from slab in the synchronous mode. In
> > > > such a case stack can be happily used as it used to be before 68d3b27e05c7
> > > > ("btrfs: move priv off stack in btrfs_encoded_read_regular_fill_pages()")
> > > > which was a preparation for the async mode.
> > > >
> > > > While at it, fix the comment to reflect the atomic => refcount change in
> > > > d29662695ed7 ("btrfs: fix use-after-free waiting for encoded read endios").
> > >
> > > Generally I'm not a huge fan of conditional allocation/freeing. It just
> > > complicates matters. I get it in case of the bio's bi_inline_vecs where
> > > it's a optimization, but I fail to see why it would make a difference in
> > > this case.
> > >
> > > If we're really going down that route, there should at least be a
> > > justification other than "no need" to.
> >
> > Well the main motivation was not to needlessly exercise the slab
> > allocator when IO uring is not used. It is a bit of an overhead,
> > though the object is not really big so I guess it's not a big deal
> > after all (the slab should manage just fine even under low memory
> > conditions).
> >
> > 68d3b27e05c7 added the allocation for the async mode but also changed
> > the original behavior of the sync mode which was using stack before.
> > The async mode indeed requires the allocation as the object's lifetime
> > extends over the function's one. The sync mode is perfectly contained
> > within as it always was.
> >
> > Simply, I tend not to do any allocations which are not strictly
> > needed.
>
> Nothing wrong about that and I think in kernel it's preferred to avoid
> allocations in such cases. The sync case is calling the ioctl and
> repeatedly too, so reducing the allocator overhead makes sense, at the
> slight cost of code complexity. The worst case is when allocator has to
> free memory by flushing or blocking the tasks, so we're actively
> avoiding and reducing the chances for that.
>
> > If you prefer to simply allocate the object unconditionally,
> > we can just drop this patch.
> >
> > > >   struct btrfs_encoded_read_private {
> > > > -     struct completion done;
> > > > +     struct completion *sync_reads;
> > > >       void *uring_ctx;
> > > > -     refcount_t pending_refs;
> > > > +     refcount_t pending_reads;
> > > >       blk_status_t status;
> > > >   };
> > >
> > > These renames just make the diff harder to read (and yes I shouldn't
> > > have renamed pending to pending_refs but that at least also changed the
> > > type).
> >
> > I see. But also the completion is changed to a pointer here for a
> > reason and I tried to make it clear it's only used for sync reads,
> > hence the rename.
>
> Functional and cleanup/cosmetic changes are better kept separate. It's
> keeping the scope of changes minimal and usually the fixes can get
> backported so the cleanup changes are not wanted.

Right, I see. Noted.

Though in this case there is no fix and I don't see a reason for
backporting this one.
But I guess you were talking about eventual other possible fixes in the area.

> > > > -     if (refcount_dec_and_test(&priv->pending_refs)) {
> > > > -             int err = blk_status_to_errno(READ_ONCE(priv->status));
> > > > -
> > > > +     if (refcount_dec_and_test(&priv->pending_reads)) {
> > > >               if (priv->uring_ctx) {
> > > > +                     int err = blk_status_to_errno(READ_ONCE(priv->status));
> > >
> > > Missing newline after the declaration.
> >
> > Right. Clearly I did not run checkpatch before sending. Sorry.
> > I was just trying to match this block to the same one later, which did
> > not have the newline. (But it also did not have the declaration
> > before.)
>
> We don't stick to what checkpatch reports, I'm fixing a lot of coding
> style when merging patches that would not pass checkpatch, the general
> CodingStyle applies but we have a local fs/btrfs style too, loosely
> documented at https://btrfs.readthedocs.io/en/latest/dev/Development-notes.html .

Bookmarked, thanks.

--nX

On Wed, 8 Jan 2025 at 19:27, David Sterba <dsterba@suse.cz> wrote:
>
> On Wed, Jan 08, 2025 at 04:24:19PM +0100, Daniel Vacek wrote:
> > On Wed, 8 Jan 2025 at 13:42, Johannes Thumshirn
> > <Johannes.Thumshirn@wdc.com> wrote:
> > > On 08.01.25 12:44, Daniel Vacek wrote:
> > > > Only allocate the `priv` struct from slab for asynchronous mode.
> > > >
> > > > There's no need to allocate an object from slab in the synchronous mode. In
> > > > such a case stack can be happily used as it used to be before 68d3b27e05c7
> > > > ("btrfs: move priv off stack in btrfs_encoded_read_regular_fill_pages()")
> > > > which was a preparation for the async mode.
> > > >
> > > > While at it, fix the comment to reflect the atomic => refcount change in
> > > > d29662695ed7 ("btrfs: fix use-after-free waiting for encoded read endios").
> > >
> > > Generally I'm not a huge fan of conditional allocation/freeing. It just
> > > complicates matters. I get it in case of the bio's bi_inline_vecs where
> > > it's a optimization, but I fail to see why it would make a difference in
> > > this case.
> > >
> > > If we're really going down that route, there should at least be a
> > > justification other than "no need" to.
> >
> > Well the main motivation was not to needlessly exercise the slab
> > allocator when IO uring is not used. It is a bit of an overhead,
> > though the object is not really big so I guess it's not a big deal
> > after all (the slab should manage just fine even under low memory
> > conditions).
> >
> > 68d3b27e05c7 added the allocation for the async mode but also changed
> > the original behavior of the sync mode which was using stack before.
> > The async mode indeed requires the allocation as the object's lifetime
> > extends over the function's one. The sync mode is perfectly contained
> > within as it always was.
> >
> > Simply, I tend not to do any allocations which are not strictly
> > needed.
>
> Nothing wrong about that and I think in kernel it's preferred to avoid
> allocations in such cases. The sync case is calling the ioctl and
> repeatedly too, so reducing the allocator overhead makes sense, at the
> slight cost of code complexity. The worst case is when allocator has to
> free memory by flushing or blocking the tasks, so we're actively
> avoiding and reducing the chances for that.
>
> > If you prefer to simply allocate the object unconditionally,
> > we can just drop this patch.
> >
> > > >   struct btrfs_encoded_read_private {
> > > > -     struct completion done;
> > > > +     struct completion *sync_reads;
> > > >       void *uring_ctx;
> > > > -     refcount_t pending_refs;
> > > > +     refcount_t pending_reads;
> > > >       blk_status_t status;
> > > >   };
> > >
> > > These renames just make the diff harder to read (and yes I shouldn't
> > > have renamed pending to pending_refs but that at least also changed the
> > > type).
> >
> > I see. But also the completion is changed to a pointer here for a
> > reason and I tried to make it clear it's only used for sync reads,
> > hence the rename.
>
> Functional and cleanup/cosmetic changes are better kept separate. It's
> keeping the scope of changes minimal and usually the fixes can get
> backported so the cleanup changes are not wanted.
>
> > > > -     if (refcount_dec_and_test(&priv->pending_refs)) {
> > > > -             int err = blk_status_to_errno(READ_ONCE(priv->status));
> > > > -
> > > > +     if (refcount_dec_and_test(&priv->pending_reads)) {
> > > >               if (priv->uring_ctx) {
> > > > +                     int err = blk_status_to_errno(READ_ONCE(priv->status));
> > >
> > > Missing newline after the declaration.
> >
> > Right. Clearly I did not run checkpatch before sending. Sorry.
> > I was just trying to match this block to the same one later, which did
> > not have the newline. (But it also did not have the declaration
> > before.)
>
> We don't stick to what checkpatch reports, I'm fixing a lot of coding
> style when merging patches that would not pass checkpatch, the general
> CodingStyle applies but we have a local fs/btrfs style too, loosely
> documented at https://btrfs.readthedocs.io/en/latest/dev/Development-notes.html .
Daniel Vacek Jan. 9, 2025, 8:41 a.m. UTC | #7
On Wed, 8 Jan 2025 at 19:29, Johannes Thumshirn
<Johannes.Thumshirn@wdc.com> wrote:
>
> On 08.01.25 16:25, Daniel Vacek wrote:
> > On Wed, 8 Jan 2025 at 13:42, Johannes Thumshirn
> > <Johannes.Thumshirn@wdc.com> wrote:
> >>
> >> On 08.01.25 12:44, Daniel Vacek wrote:
> >>> Only allocate the `priv` struct from slab for asynchronous mode.
> >>>
> >>> There's no need to allocate an object from slab in the synchronous mode. In
> >>> such a case stack can be happily used as it used to be before 68d3b27e05c7
> >>> ("btrfs: move priv off stack in btrfs_encoded_read_regular_fill_pages()")
> >>> which was a preparation for the async mode.
> >>>
> >>> While at it, fix the comment to reflect the atomic => refcount change in
> >>> d29662695ed7 ("btrfs: fix use-after-free waiting for encoded read endios").
> >>
> >>
> >> Generally I'm not a huge fan of conditional allocation/freeing. It just
> >> complicates matters. I get it in case of the bio's bi_inline_vecs where
> >> it's a optimization, but I fail to see why it would make a difference in
> >> this case.
> >>
> >> If we're really going down that route, there should at least be a
> >> justification other than "no need" to.
> >
> > Well the main motivation was not to needlessly exercise the slab
> > allocator when IO uring is not used. It is a bit of an overhead,
> > though the object is not really big so I guess it's not a big deal
> > after all (the slab should manage just fine even under low memory
> > conditions).
> >
> > 68d3b27e05c7 added the allocation for the async mode but also changed
> > the original behavior of the sync mode which was using stack before.
> > The async mode indeed requires the allocation as the object's lifetime
> > extends over the function's one. The sync mode is perfectly contained
> > within as it always was.
> >
> > Simply, I tend not to do any allocations which are not strictly
> > needed. If you prefer to simply allocate the object unconditionally,
> > we can just drop this patch.
>
> At the end of the day it's David's call, he's the maintainer. I'm just
> not sure if skipping the allocator for a small short lived object is
> worth the special casing. Especially as I got bitten by this in the past
> when hunting down kmemleak reports. Conditional allocation is like
> conditional locking, sometimes OK but it raises suspicion.

Well in this case the scope is really limited just to those two
functions which you can fit on the screen. It looks quite readable to
me.

And the async/iouring case is already treated conditionally here.

In the end this patch only makes the sync case behave as it was before
introducing the async changes.

--nX
diff mbox series

Patch

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 27b2fe7f735d5..4d30857df4bcb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9078,9 +9078,9 @@  static ssize_t btrfs_encoded_read_inline(
 }
 
 struct btrfs_encoded_read_private {
-	struct completion done;
+	struct completion *sync_reads;
 	void *uring_ctx;
-	refcount_t pending_refs;
+	refcount_t pending_reads;
 	blk_status_t status;
 };
 
@@ -9090,23 +9090,22 @@  static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
 
 	if (bbio->bio.bi_status) {
 		/*
-		 * The memory barrier implied by the atomic_dec_return() here
-		 * pairs with the memory barrier implied by the
-		 * atomic_dec_return() or io_wait_event() in
-		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
-		 * write is observed before the load of status in
-		 * btrfs_encoded_read_regular_fill_pages().
+		 * The memory barrier implied by the
+		 * refcount_dec_and_test() here pairs with the memory
+		 * barrier implied by the refcount_dec_and_test() in
+		 * btrfs_encoded_read_regular_fill_pages() to ensure
+		 * that this write is observed before the load of
+		 * status in btrfs_encoded_read_regular_fill_pages().
 		 */
 		WRITE_ONCE(priv->status, bbio->bio.bi_status);
 	}
-	if (refcount_dec_and_test(&priv->pending_refs)) {
-		int err = blk_status_to_errno(READ_ONCE(priv->status));
-
+	if (refcount_dec_and_test(&priv->pending_reads)) {
 		if (priv->uring_ctx) {
+			int err = blk_status_to_errno(READ_ONCE(priv->status));
 			btrfs_uring_read_extent_endio(priv->uring_ctx, err);
 			kfree(priv);
 		} else {
-			complete(&priv->done);
+			complete(priv->sync_reads);
 		}
 	}
 	bio_put(&bbio->bio);
@@ -9117,17 +9116,22 @@  int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 					  struct page **pages, void *uring_ctx)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	struct btrfs_encoded_read_private *priv;
+	struct btrfs_encoded_read_private *priv, sync_priv;
+	struct completion sync_reads;
 	unsigned long i = 0;
 	struct btrfs_bio *bbio;
-	int ret;
 
-	priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
-	if (!priv)
-		return -ENOMEM;
+	if (uring_ctx) {
+		priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
+		if (!priv)
+			return -ENOMEM;
+	} else {
+		priv = &sync_priv;
+		init_completion(&sync_reads);
+		priv->sync_reads = &sync_reads;
+	}
 
-	init_completion(&priv->done);
-	refcount_set(&priv->pending_refs, 1);
+	refcount_set(&priv->pending_reads, 1);
 	priv->status = 0;
 	priv->uring_ctx = uring_ctx;
 
@@ -9140,7 +9144,7 @@  int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
 
 		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
-			refcount_inc(&priv->pending_refs);
+			refcount_inc(&priv->pending_reads);
 			btrfs_submit_bbio(bbio, 0);
 
 			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
@@ -9155,25 +9159,23 @@  int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 		disk_io_size -= bytes;
 	} while (disk_io_size);
 
-	refcount_inc(&priv->pending_refs);
+	refcount_inc(&priv->pending_reads);
 	btrfs_submit_bbio(bbio, 0);
 
 	if (uring_ctx) {
-		if (refcount_dec_and_test(&priv->pending_refs)) {
-			ret = blk_status_to_errno(READ_ONCE(priv->status));
-			btrfs_uring_read_extent_endio(uring_ctx, ret);
+		if (refcount_dec_and_test(&priv->pending_reads)) {
+			int err = blk_status_to_errno(READ_ONCE(priv->status));
+			btrfs_uring_read_extent_endio(uring_ctx, err);
 			kfree(priv);
-			return ret;
+			return err;
 		}
 
 		return -EIOCBQUEUED;
 	} else {
-		if (!refcount_dec_and_test(&priv->pending_refs))
-			wait_for_completion_io(&priv->done);
+		if (!refcount_dec_and_test(&priv->pending_reads))
+			wait_for_completion_io(&sync_reads);
 		/* See btrfs_encoded_read_endio() for ordering. */
-		ret = blk_status_to_errno(READ_ONCE(priv->status));
-		kfree(priv);
-		return ret;
+		return blk_status_to_errno(READ_ONCE(priv->status));
 	}
 }