diff mbox series

xfs: _{attr,data}_map_shared should take ILOCK_EXCL until iread_extents is completely done

Message ID 20230411010638.GF360889@frogsfrogsfrogs (mailing list archive)
State Superseded
Headers show
Series xfs: _{attr,data}_map_shared should take ILOCK_EXCL until iread_extents is completely done | expand

Commit Message

Darrick J. Wong April 11, 2023, 1:06 a.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

While fuzzing the data fork extent count on a btree-format directory
with xfs/375, I observed the following (excerpted) splat:

XFS: Assertion failed: xfs_isilocked(ip, XFS_ILOCK_EXCL), file: fs/xfs/libxfs/xfs_bmap.c, line: 1208
------------[ cut here ]------------
WARNING: CPU: 0 PID: 43192 at fs/xfs/xfs_message.c:104 assfail+0x46/0x4a [xfs]
Call Trace:
 <TASK>
 xfs_iread_extents+0x1af/0x210 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
 xchk_dir_walk+0xb8/0x190 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
 xchk_parent_count_parent_dentries+0x41/0x80 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
 xchk_parent_validate+0x199/0x2e0 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
 xchk_parent+0xdf/0x130 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
 xfs_scrub_metadata+0x2b8/0x730 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
 xfs_scrubv_metadata+0x38b/0x4d0 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
 xfs_ioc_scrubv_metadata+0x111/0x160 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
 xfs_file_ioctl+0x367/0xf50 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
 __x64_sys_ioctl+0x82/0xa0
 do_syscall_64+0x2b/0x80
 entry_SYSCALL_64_after_hwframe+0x46/0xb0

The cause of this is a race condition in xfs_ilock_data_map_shared,
which performs an unlocked access to the data fork to guess which lock
mode it needs:

Thread 0                          Thread 1

xfs_need_iread_extents
<observe no iext tree>
xfs_ilock(..., ILOCK_EXCL)
xfs_iread_extents
<observe no iext tree>
<check ILOCK_EXCL>
<load bmbt extents into iext>
<notice iext size doesn't
 match nextents>
                                  xfs_need_iread_extents
                                  <observe iext tree>
                                  xfs_ilock(..., ILOCK_SHARED)
<tear down iext tree>
xfs_iunlock(..., ILOCK_EXCL)
                                  xfs_iread_extents
                                  <observe no iext tree>
                                  <check ILOCK_EXCL>
                                  *BOOM*

Fix this race by adding a flag to the xfs_ifork structure to indicate
that we have not yet read in the extent records and changing the
predicate to look at the flag state, not if_height.  The memory barrier
ensures that the flag will not be set until the very end of the
function.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.c       |    2 ++
 fs/xfs/libxfs/xfs_inode_fork.c |    2 ++
 fs/xfs/libxfs/xfs_inode_fork.h |    3 ++-
 3 files changed, 6 insertions(+), 1 deletion(-)

Comments

Dave Chinner April 11, 2023, 3:20 a.m. UTC | #1
On Mon, Apr 10, 2023 at 06:06:38PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> While fuzzing the data fork extent count on a btree-format directory
> with xfs/375, I observed the following (excerpted) splat:
> 
> XFS: Assertion failed: xfs_isilocked(ip, XFS_ILOCK_EXCL), file: fs/xfs/libxfs/xfs_bmap.c, line: 1208
> ------------[ cut here ]------------
> WARNING: CPU: 0 PID: 43192 at fs/xfs/xfs_message.c:104 assfail+0x46/0x4a [xfs]
> Call Trace:
>  <TASK>
>  xfs_iread_extents+0x1af/0x210 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
>  xchk_dir_walk+0xb8/0x190 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
>  xchk_parent_count_parent_dentries+0x41/0x80 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
>  xchk_parent_validate+0x199/0x2e0 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
>  xchk_parent+0xdf/0x130 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
>  xfs_scrub_metadata+0x2b8/0x730 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
>  xfs_scrubv_metadata+0x38b/0x4d0 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
>  xfs_ioc_scrubv_metadata+0x111/0x160 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
>  xfs_file_ioctl+0x367/0xf50 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
>  __x64_sys_ioctl+0x82/0xa0
>  do_syscall_64+0x2b/0x80
>  entry_SYSCALL_64_after_hwframe+0x46/0xb0
> 
> The cause of this is a race condition in xfs_ilock_data_map_shared,
> which performs an unlocked access to the data fork to guess which lock
> mode it needs:
> 
> Thread 0                          Thread 1
> 
> xfs_need_iread_extents
> <observe no iext tree>
> xfs_ilock(..., ILOCK_EXCL)
> xfs_iread_extents
> <observe no iext tree>
> <check ILOCK_EXCL>
> <load bmbt extents into iext>
> <notice iext size doesn't
>  match nextents>
>                                   xfs_need_iread_extents
>                                   <observe iext tree>
>                                   xfs_ilock(..., ILOCK_SHARED)
> <tear down iext tree>
> xfs_iunlock(..., ILOCK_EXCL)
>                                   xfs_iread_extents
>                                   <observe no iext tree>
>                                   <check ILOCK_EXCL>
>                                   *BOOM*
> 
> Fix this race by adding a flag to the xfs_ifork structure to indicate
> that we have not yet read in the extent records and changing the
> predicate to look at the flag state, not if_height.  The memory barrier
> ensures that the flag will not be set until the very end of the
> function.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  fs/xfs/libxfs/xfs_bmap.c       |    2 ++
>  fs/xfs/libxfs/xfs_inode_fork.c |    2 ++
>  fs/xfs/libxfs/xfs_inode_fork.h |    3 ++-
>  3 files changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 34de6e6898c4..5f96e7ce7b4a 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -1171,6 +1171,8 @@ xfs_iread_extents(
>  		goto out;
>  	}
>  	ASSERT(ir.loaded == xfs_iext_count(ifp));
> +	smp_mb();
> +	ifp->if_needextents = 0;

Hmmm - if this is to ensure that everything above is completed
before the clearing of this flag is visible everywhere else, then we
should be able to use load_acquire/store_release semantics? i.e. the
above is

	smp_store_release(ifp->if_needextents, 0);

and we use

	smp_load_acquire(ifp->if_needextents)

when we need to read the value to ensure that all the changes made
before the relevant stores are also visible?

>  	return 0;
>  out:
>  	xfs_iext_destroy(ifp);
> diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
> index 6b21760184d9..eadae924dc42 100644
> --- a/fs/xfs/libxfs/xfs_inode_fork.c
> +++ b/fs/xfs/libxfs/xfs_inode_fork.c
> @@ -174,6 +174,8 @@ xfs_iformat_btree(
>  	int			level;
>  
>  	ifp = xfs_ifork_ptr(ip, whichfork);
> +	ifp->if_needextents = 1;

Hmmm - what's the guarantee that the reader will see ifp->if_format
set correctly if they if_needextents = 1?

Wouldn't it be better to set this at the same time we set the
ifp->if_format value? We clear it unconditionally above in
xfs_iread_extents(), so why not set it unconditionally there, too,
before we start. i.e.

	/*
	 * Set the format before we set needsextents with release
	 * semantics. This ensures that we can use acquire semantics
	 * on needextents in xfs_need_iread_extents() and be
	 * guaranteed to see a valid format value after that load.
	 */
	ifp->if_format = dip->di_format;
	smp_store_release(ifp->if_needextents, 1);

That then means xfs_need_iread_extents() is guaranteed to see a
valid ifp->if_format if ifp->if_needextents is set if we do:

/* returns true if the fork has extents but they are not read in yet. */
static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
{

	/* see xfs_iread_extents() for needextents semantics */
	return smp_load_acquire(ifp->if_needextents) &&
			ifp->if_format == XFS_DINODE_FMT_BTREE;
}

Cheers,

Dave.
Darrick J. Wong April 11, 2023, 4:18 a.m. UTC | #2
On Tue, Apr 11, 2023 at 01:20:35PM +1000, Dave Chinner wrote:
> On Mon, Apr 10, 2023 at 06:06:38PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > While fuzzing the data fork extent count on a btree-format directory
> > with xfs/375, I observed the following (excerpted) splat:
> > 
> > XFS: Assertion failed: xfs_isilocked(ip, XFS_ILOCK_EXCL), file: fs/xfs/libxfs/xfs_bmap.c, line: 1208
> > ------------[ cut here ]------------
> > WARNING: CPU: 0 PID: 43192 at fs/xfs/xfs_message.c:104 assfail+0x46/0x4a [xfs]
> > Call Trace:
> >  <TASK>
> >  xfs_iread_extents+0x1af/0x210 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
> >  xchk_dir_walk+0xb8/0x190 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
> >  xchk_parent_count_parent_dentries+0x41/0x80 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
> >  xchk_parent_validate+0x199/0x2e0 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
> >  xchk_parent+0xdf/0x130 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
> >  xfs_scrub_metadata+0x2b8/0x730 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
> >  xfs_scrubv_metadata+0x38b/0x4d0 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
> >  xfs_ioc_scrubv_metadata+0x111/0x160 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
> >  xfs_file_ioctl+0x367/0xf50 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
> >  __x64_sys_ioctl+0x82/0xa0
> >  do_syscall_64+0x2b/0x80
> >  entry_SYSCALL_64_after_hwframe+0x46/0xb0
> > 
> > The cause of this is a race condition in xfs_ilock_data_map_shared,
> > which performs an unlocked access to the data fork to guess which lock
> > mode it needs:
> > 
> > Thread 0                          Thread 1
> > 
> > xfs_need_iread_extents
> > <observe no iext tree>
> > xfs_ilock(..., ILOCK_EXCL)
> > xfs_iread_extents
> > <observe no iext tree>
> > <check ILOCK_EXCL>
> > <load bmbt extents into iext>
> > <notice iext size doesn't
> >  match nextents>
> >                                   xfs_need_iread_extents
> >                                   <observe iext tree>
> >                                   xfs_ilock(..., ILOCK_SHARED)
> > <tear down iext tree>
> > xfs_iunlock(..., ILOCK_EXCL)
> >                                   xfs_iread_extents
> >                                   <observe no iext tree>
> >                                   <check ILOCK_EXCL>
> >                                   *BOOM*
> > 
> > Fix this race by adding a flag to the xfs_ifork structure to indicate
> > that we have not yet read in the extent records and changing the
> > predicate to look at the flag state, not if_height.  The memory barrier
> > ensures that the flag will not be set until the very end of the
> > function.
> > 
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > ---
> >  fs/xfs/libxfs/xfs_bmap.c       |    2 ++
> >  fs/xfs/libxfs/xfs_inode_fork.c |    2 ++
> >  fs/xfs/libxfs/xfs_inode_fork.h |    3 ++-
> >  3 files changed, 6 insertions(+), 1 deletion(-)
> > 
> > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> > index 34de6e6898c4..5f96e7ce7b4a 100644
> > --- a/fs/xfs/libxfs/xfs_bmap.c
> > +++ b/fs/xfs/libxfs/xfs_bmap.c
> > @@ -1171,6 +1171,8 @@ xfs_iread_extents(
> >  		goto out;
> >  	}
> >  	ASSERT(ir.loaded == xfs_iext_count(ifp));
> > +	smp_mb();
> > +	ifp->if_needextents = 0;
> 
> Hmmm - if this is to ensure that everything above is completed
> before the clearing of this flag is visible everywhere else, then we
> should be able to use load_acquire/store_release semantics? i.e. the
> above is
> 
> 	smp_store_release(ifp->if_needextents, 0);
> 
> and we use
> 
> 	smp_load_acquire(ifp->if_needextents)
> 
> when we need to read the value to ensure that all the changes made
> before the relevant stores are also visible?

I think we can; see below.

> >  	return 0;
> >  out:
> >  	xfs_iext_destroy(ifp);
> > diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
> > index 6b21760184d9..eadae924dc42 100644
> > --- a/fs/xfs/libxfs/xfs_inode_fork.c
> > +++ b/fs/xfs/libxfs/xfs_inode_fork.c
> > @@ -174,6 +174,8 @@ xfs_iformat_btree(
> >  	int			level;
> >  
> >  	ifp = xfs_ifork_ptr(ip, whichfork);
> > +	ifp->if_needextents = 1;
> 
> Hmmm - what's the guarantee that the reader will see ifp->if_format
> set correctly if they if_needextents = 1?

At this point in the iget miss path, the only thread that can see the
xfs_inode object is the one currently running the miss path.  I think
the spin_lock call to add the inode to the radix tree is sufficient to
guarantee that both if_format and if_needextents are set consistently
when any other thread gains the ability to find the inode in the radix
tree.

That said, smp_store_release/smp_load_acquire would make that more
explicit.

How will we port this to userspace libxfs?

> Wouldn't it be better to set this at the same time we set the
> ifp->if_format value? We clear it unconditionally above in
> xfs_iread_extents(), so why not set it unconditionally there, too,
> before we start. i.e.
> 
> 	/*
> 	 * Set the format before we set needsextents with release
> 	 * semantics. This ensures that we can use acquire semantics
> 	 * on needextents in xfs_need_iread_extents() and be
> 	 * guaranteed to see a valid format value after that load.
> 	 */
> 	ifp->if_format = dip->di_format;
> 	smp_store_release(ifp->if_needextents, 1);
> 
> That then means xfs_need_iread_extents() is guaranteed to see a
> valid ifp->if_format if ifp->if_needextents is set if we do:

I think we should be smp_stor'ing needextents as appropriate for
if_format, so that...

> /* returns true if the fork has extents but they are not read in yet. */
> static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
> {
> 
> 	/* see xfs_iread_extents() for needextents semantics */
> 	return smp_load_acquire(ifp->if_needextents) &&
> 			ifp->if_format == XFS_DINODE_FMT_BTREE;

...then we don't need this FMT_BTREE check at all anymore.

--D

> }
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Christoph Hellwig April 11, 2023, 5:06 a.m. UTC | #3
On Mon, Apr 10, 2023 at 06:06:38PM -0700, Darrick J. Wong wrote:
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -1171,6 +1171,8 @@ xfs_iread_extents(
>  		goto out;
>  	}
>  	ASSERT(ir.loaded == xfs_iext_count(ifp));
> +	smp_mb();
> +	ifp->if_needextents = 0;

Shouldn't this be a WRITE_ONCE?

> diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
> index 6b21760184d9..eadae924dc42 100644
> --- a/fs/xfs/libxfs/xfs_inode_fork.c
> +++ b/fs/xfs/libxfs/xfs_inode_fork.c
> @@ -174,6 +174,8 @@ xfs_iformat_btree(
>  	int			level;
>  
>  	ifp = xfs_ifork_ptr(ip, whichfork);
> +	ifp->if_needextents = 1;

Same here.

>  	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
>  	size = XFS_BMAP_BROOT_SPACE(mp, dfp);
>  	nrecs = be16_to_cpu(dfp->bb_numrecs);
> diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
> index d3943d6ad0b9..e69c78c35c96 100644
> --- a/fs/xfs/libxfs/xfs_inode_fork.h
> +++ b/fs/xfs/libxfs/xfs_inode_fork.h
> @@ -24,6 +24,7 @@ struct xfs_ifork {
>  	xfs_extnum_t		if_nextents;	/* # of extents in this fork */
>  	short			if_broot_bytes;	/* bytes allocated for root */
>  	int8_t			if_format;	/* format of this fork */
> +	int8_t			if_needextents;	/* extents have not been read */
>  };

I don't think this should be signed.

>  static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
>  {
> -	return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_height == 0;
> +	return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_needextents;

.. and this should use READ_ONCE and drop the if_format check given
that if_needextents is only set in xfs_iformat_btree.
Christoph Hellwig April 11, 2023, 5:11 a.m. UTC | #4
On Tue, Apr 11, 2023 at 01:20:35PM +1000, Dave Chinner wrote:
> >  	ASSERT(ir.loaded == xfs_iext_count(ifp));
> > +	smp_mb();
> > +	ifp->if_needextents = 0;
> 
> Hmmm - if this is to ensure that everything above is completed
> before the clearing of this flag is visible everywhere else, then we
> should be able to use load_acquire/store_release semantics? i.e. the
> above is
> 
> 	smp_store_release(ifp->if_needextents, 0);
> 
> and we use
> 
> 	smp_load_acquire(ifp->if_needextents)

Yeah, that's probably better than my READ_ONCE/WRITE_ONCE suggestions
as it also orders vs the previous assignments.

> >  	ifp = xfs_ifork_ptr(ip, whichfork);
> > +	ifp->if_needextents = 1;
> 
> Hmmm - what's the guarantee that the reader will see ifp->if_format
> set correctly if they if_needextents = 1?
> 
> Wouldn't it be better to set this at the same time we set the
> ifp->if_format value? We clear it unconditionally above in
> xfs_iread_extents(), so why not set it unconditionally there, too,
> before we start. i.e.
> 
> 	/*
> 	 * Set the format before we set needsextents with release
> 	 * semantics. This ensures that we can use acquire semantics
> 	 * on needextents in xfs_need_iread_extents() and be
> 	 * guaranteed to see a valid format value after that load.
> 	 */
> 	ifp->if_format = dip->di_format;
> 	smp_store_release(ifp->if_needextents, 1);
> 
> That then means xfs_need_iread_extents() is guaranteed to see a
> valid ifp->if_format if ifp->if_needextents is set if we do:

I'd just drop the if_format check in xfs_need_iread_extents,
which together with the memory barriers should fix all this.
Dave Chinner April 11, 2023, 9:58 p.m. UTC | #5
On Mon, Apr 10, 2023 at 09:18:58PM -0700, Darrick J. Wong wrote:
> On Tue, Apr 11, 2023 at 01:20:35PM +1000, Dave Chinner wrote:
> > On Mon, Apr 10, 2023 at 06:06:38PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <djwong@kernel.org>
> > > 
> > > While fuzzing the data fork extent count on a btree-format directory
> > > with xfs/375, I observed the following (excerpted) splat:

....

> > > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> > > index 34de6e6898c4..5f96e7ce7b4a 100644
> > > --- a/fs/xfs/libxfs/xfs_bmap.c
> > > +++ b/fs/xfs/libxfs/xfs_bmap.c
> > > @@ -1171,6 +1171,8 @@ xfs_iread_extents(
> > >  		goto out;
> > >  	}
> > >  	ASSERT(ir.loaded == xfs_iext_count(ifp));
> > > +	smp_mb();
> > > +	ifp->if_needextents = 0;
> > 
> > Hmmm - if this is to ensure that everything above is completed
> > before the clearing of this flag is visible everywhere else, then we
> > should be able to use load_acquire/store_release semantics? i.e. the
> > above is
> > 
> > 	smp_store_release(ifp->if_needextents, 0);
> > 
> > and we use
> > 
> > 	smp_load_acquire(ifp->if_needextents)
> > 
> > when we need to read the value to ensure that all the changes made
> > before the relevant stores are also visible?
> 
> I think we can; see below.
> 
> > >  	return 0;
> > >  out:
> > >  	xfs_iext_destroy(ifp);
> > > diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
> > > index 6b21760184d9..eadae924dc42 100644
> > > --- a/fs/xfs/libxfs/xfs_inode_fork.c
> > > +++ b/fs/xfs/libxfs/xfs_inode_fork.c
> > > @@ -174,6 +174,8 @@ xfs_iformat_btree(
> > >  	int			level;
> > >  
> > >  	ifp = xfs_ifork_ptr(ip, whichfork);
> > > +	ifp->if_needextents = 1;
> > 
> > Hmmm - what's the guarantee that the reader will see ifp->if_format
> > set correctly if they if_needextents = 1?
> 
> At this point in the iget miss path, the only thread that can see the
> xfs_inode object is the one currently running the miss path.  I think
> the spin_lock call to add the inode to the radix tree is sufficient to
> guarantee that both if_format and if_needextents are set consistently
> when any other thread gains the ability to find the inode in the radix
> tree.
> 
> That said, smp_store_release/smp_load_acquire would make that more
> explicit.
> 
> How will we port this to userspace libxfs?

Add liburcu support. This brings across all the kernel side atomic
operations and memory model support, along with RCU. Basically, we
can then use the kernel side concurrency algorithms with almost no
change to the libxfs userspace at all...

See:

https://github.com/urcu/userspace-rcu/blob/master/include/urcu/arch/x86.h

Then we can just add two line wrappers that use cmm_rmb() and
cmm_wmb() to provide store_release/load_acquire semantics to the
required operations...

I've attached the bitrotted patch I have from the old xfs_buf port
to userspace I have sitting around. That series has other
libxfs implementations of various kernel concurrency APIs, too...

> > Wouldn't it be better to set this at the same time we set the
> > ifp->if_format value? We clear it unconditionally above in
> > xfs_iread_extents(), so why not set it unconditionally there, too,
> > before we start. i.e.
> > 
> > 	/*
> > 	 * Set the format before we set needsextents with release
> > 	 * semantics. This ensures that we can use acquire semantics
> > 	 * on needextents in xfs_need_iread_extents() and be
> > 	 * guaranteed to see a valid format value after that load.
> > 	 */
> > 	ifp->if_format = dip->di_format;
> > 	smp_store_release(ifp->if_needextents, 1);
> > 
> > That then means xfs_need_iread_extents() is guaranteed to see a
> > valid ifp->if_format if ifp->if_needextents is set if we do:
> 
> I think we should be smp_stor'ing needextents as appropriate for
> if_format, so that...
> 
> > /* returns true if the fork has extents but they are not read in yet. */
> > static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
> > {
> > 
> > 	/* see xfs_iread_extents() for needextents semantics */
> > 	return smp_load_acquire(ifp->if_needextents) &&
> > 			ifp->if_format == XFS_DINODE_FMT_BTREE;
> 
> ...then we don't need this FMT_BTREE check at all anymore.

Yup, as Christoph also pointed out. That definitely simplifies
things. :)

-Dave.
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 34de6e6898c4..5f96e7ce7b4a 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1171,6 +1171,8 @@  xfs_iread_extents(
 		goto out;
 	}
 	ASSERT(ir.loaded == xfs_iext_count(ifp));
+	smp_mb();
+	ifp->if_needextents = 0;
 	return 0;
 out:
 	xfs_iext_destroy(ifp);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 6b21760184d9..eadae924dc42 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -174,6 +174,8 @@  xfs_iformat_btree(
 	int			level;
 
 	ifp = xfs_ifork_ptr(ip, whichfork);
+	ifp->if_needextents = 1;
+
 	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
 	size = XFS_BMAP_BROOT_SPACE(mp, dfp);
 	nrecs = be16_to_cpu(dfp->bb_numrecs);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index d3943d6ad0b9..e69c78c35c96 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -24,6 +24,7 @@  struct xfs_ifork {
 	xfs_extnum_t		if_nextents;	/* # of extents in this fork */
 	short			if_broot_bytes;	/* bytes allocated for root */
 	int8_t			if_format;	/* format of this fork */
+	int8_t			if_needextents;	/* extents have not been read */
 };
 
 /*
@@ -262,7 +263,7 @@  int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
 /* returns true if the fork has extents but they are not read in yet. */
 static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
 {
-	return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_height == 0;
+	return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_needextents;
 }
 
 #endif	/* __XFS_INODE_FORK_H__ */