diff mbox series

[4/4] xfs: reactivate XFS_NEED_INACTIVE inodes from xfs_iget

Message ID 20240319001707.3430251-5-david@fromorbit.com (mailing list archive)
State New
Headers show
Series xfs: recycle inactive inodes immediately | expand

Commit Message

Dave Chinner March 19, 2024, 12:16 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

When xfs_iget() finds an inode that is queued for inactivation, it
issues an inodegc flush to trigger the inactivation work and then
retries the lookup.

However, when the filesystem is frozen, inodegc is turned off and
the flush does nothing and does not block. This results in lookup
spinning on NEED_INACTIVE inodes and being unable to make progress
until the filesystem is thawed. This is less than ideal.

The only reason we can't immediately recycle the inode is that it is
queued on a lockless list we can't remove it from. However, those
lists now support lazy removal, and so we can now modify the lookup
code to reactivate inodes queued for inactivation. The process is
identical to how we recycle reclaimable inodes from xfs_iget(), so
this ends up being a relatively simple change to make.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_icache.c | 110 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 87 insertions(+), 23 deletions(-)

Comments

Darrick J. Wong March 19, 2024, 6:11 p.m. UTC | #1
On Tue, Mar 19, 2024 at 11:16:00AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> When xfs_iget() finds an inode that is queued for inactivation, it
> issues an inodegc flush to trigger the inactivation work and then
> retries the lookup.
> 
> However, when the filesystem is frozen, inodegc is turned off and
> the flush does nothing and does not block. This results in lookup
> spinning on NEED_INACTIVE inodes and being unable to make progress
> until the filesystem is thawed. This is less than ideal.
> 
> The only reason we can't immediately recycle the inode is that it
> queued on a lockless list we can't remove it from. However, those
> lists now support lazy removal, and so we can now modify the lookup
> code to reactivate inode queued for inactivation. The process is
> identical to how we recycle reclaimable inodes from xfs_iget(), so
> this ends up being a relatively simple change to make.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_icache.c | 110 +++++++++++++++++++++++++++++++++++---------
>  1 file changed, 87 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index 7359753b892b..56de3e843df2 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -63,6 +63,8 @@ static int xfs_icwalk_ag(struct xfs_perag *pag,
>  					 XFS_ICWALK_FLAG_RECLAIM_SICK | \
>  					 XFS_ICWALK_FLAG_UNION)
>  
> +static void xfs_inodegc_queue(struct xfs_inode *ip);
> +
>  /*
>   * Allocate and initialise an xfs_inode.
>   */
> @@ -325,6 +327,7 @@ xfs_reinit_inode(
>  	return error;
>  }
>  
> +
>  /*
>   * Carefully nudge an inode whose VFS state has been torn down back into a
>   * usable state.  Drops the i_flags_lock and the rcu read lock.
> @@ -388,7 +391,82 @@ xfs_iget_recycle(
>  	inode->i_state = I_NEW;
>  	spin_unlock(&ip->i_flags_lock);
>  	spin_unlock(&pag->pag_ici_lock);
> +	XFS_STATS_INC(mp, xs_ig_frecycle);
> +	return 0;
> +}
>  
> +static int
> +xfs_iget_reactivate(
> +	struct xfs_perag	*pag,
> +	struct xfs_inode	*ip) __releases(&ip->i_flags_lock)
> +{
> +	struct xfs_mount	*mp = ip->i_mount;
> +	struct inode		*inode = VFS_I(ip);
> +	int			error;
> +
> +	trace_xfs_iget_recycle(ip);
> +
> +	/*
> +	 * If the inode has been unlinked, then the lookup must not find it
> +	 * until inactivation has actually freed the inode.
> +	 */
> +	if (VFS_I(ip)->i_nlink == 0) {
> +		spin_unlock(&ip->i_flags_lock);
> +		rcu_read_unlock();
> +		return -ENOENT;
> +	}
> +
> +	/*
> +	 * Take the ILOCK here to serialise against lookup races with putting
> +	 * the inode back on the inodegc queue during error handling.
> +	 */
> +	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
> +		return -EAGAIN;
> +
> +	/*
> +	 * Move the state to inactivating so both inactivation and racing
> +	 * lookups will skip over this inode until we've finished reactivating
> +	 * it and can return it to the XFS_INEW state.
> +	 */
> +	ip->i_flags &= ~XFS_NEED_INACTIVE;
> +	ip->i_flags |= XFS_INACTIVATING;
> +	spin_unlock(&ip->i_flags_lock);
> +	rcu_read_unlock();
> +
> +	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
> +	error = xfs_reinit_inode(mp, inode);
> +	if (error) {
> +		/*
> +		 * Well, that sucks. Put the inode back on the inactive queue.
> +		 * Do this while still under the ILOCK so that we can set the
> +		 * NEED_INACTIVE flag and clear the INACTIVATING flag an not

The sentence structure here is a little funky to me.  How about:

"...and clear the INACTIVATING flag without another lookup racing with us..."

?

With that changed,
Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D


> +		 * have another lookup race with us before we've finished
> +		 * putting the inode back on the inodegc queue.
> +		 */
> +		spin_unlock(&ip->i_flags_lock);
> +		ip->i_flags |= XFS_NEED_INACTIVE;
> +		ip->i_flags &= ~XFS_INACTIVATING;
> +		spin_unlock(&ip->i_flags_lock);
> +
> +		xfs_inodegc_queue(ip);
> +		xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +
> +		trace_xfs_iget_recycle_fail(ip);
> +		return error;
> +	}
> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +
> +	/*
> +	 * Reset the inode state to new so that xfs_iget() will complete
> +	 * the required remaining inode initialisation before it returns the
> +	 * inode to the caller.
> +	 */
> +	spin_lock(&ip->i_flags_lock);
> +	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
> +	ip->i_flags |= XFS_INEW;
> +	inode->i_state = I_NEW;
> +	spin_unlock(&ip->i_flags_lock);
> +	XFS_STATS_INC(mp, xs_ig_frecycle);
>  	return 0;
>  }
>  
> @@ -526,15 +604,6 @@ xfs_iget_cache_hit(
>  	if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
>  		goto out_skip;
>  
> -	if (ip->i_flags & XFS_NEED_INACTIVE) {
> -		/* Unlinked inodes cannot be re-grabbed. */
> -		if (VFS_I(ip)->i_nlink == 0) {
> -			error = -ENOENT;
> -			goto out_error;
> -		}
> -		goto out_inodegc_flush;
> -	}
> -
>  	/*
>  	 * Check the inode free state is valid. This also detects lookup
>  	 * racing with unlinks.
> @@ -545,11 +614,18 @@ xfs_iget_cache_hit(
>  
>  	/* Skip inodes that have no vfs state. */
>  	if ((flags & XFS_IGET_INCORE) &&
> -	    (ip->i_flags & XFS_IRECLAIMABLE))
> +	    (ip->i_flags & (XFS_IRECLAIMABLE | XFS_NEED_INACTIVE)))
>  		goto out_skip;
>  
>  	/* The inode fits the selection criteria; process it. */
> -	if (ip->i_flags & XFS_IRECLAIMABLE) {
> +	if (ip->i_flags & XFS_NEED_INACTIVE) {
> +		/* Drops i_flags_lock and RCU read lock. */
> +		error = xfs_iget_reactivate(pag, ip);
> +		if (error == -EAGAIN)
> +			goto out_skip;
> +		if (error)
> +			return error;
> +	} else if (ip->i_flags & XFS_IRECLAIMABLE) {
>  		/* Drops i_flags_lock and RCU read lock. */
>  		error = xfs_iget_recycle(pag, ip);
>  		if (error == -EAGAIN)
> @@ -578,23 +654,11 @@ xfs_iget_cache_hit(
>  
>  out_skip:
>  	trace_xfs_iget_skip(ip);
> -	XFS_STATS_INC(mp, xs_ig_frecycle);
>  	error = -EAGAIN;
>  out_error:
>  	spin_unlock(&ip->i_flags_lock);
>  	rcu_read_unlock();
>  	return error;
> -
> -out_inodegc_flush:
> -	spin_unlock(&ip->i_flags_lock);
> -	rcu_read_unlock();
> -	/*
> -	 * Do not wait for the workers, because the caller could hold an AGI
> -	 * buffer lock.  We're just going to sleep in a loop anyway.
> -	 */
> -	if (xfs_is_inodegc_enabled(mp))
> -		xfs_inodegc_queue_all(mp);
> -	return -EAGAIN;
>  }
>  
>  static int
> -- 
> 2.43.0
> 
>
Andre Noll March 20, 2024, 8:39 a.m. UTC | #2
On Tue, Mar 19, 11:16, Dave Chinner wrote
> +		/*
> +		 * Well, that sucks. Put the inode back on the inactive queue.
> +		 * Do this while still under the ILOCK so that we can set the
> +		 * NEED_INACTIVE flag and clear the INACTIVATING flag an not
> +		 * have another lookup race with us before we've finished
> +		 * putting the inode back on the inodegc queue.
> +		 */
> +		spin_unlock(&ip->i_flags_lock);
> +		ip->i_flags |= XFS_NEED_INACTIVE;
> +		ip->i_flags &= ~XFS_INACTIVATING;
> +		spin_unlock(&ip->i_flags_lock);

This doesn't look right. Shouldn't the first spin_unlock() be spin_lock()?

Also, there's a typo in the comment (s/an/and).

Best
Andre
Darrick J. Wong March 20, 2024, 2:53 p.m. UTC | #3
On Wed, Mar 20, 2024 at 09:39:57AM +0100, Andre Noll wrote:
> On Tue, Mar 19, 11:16, Dave Chinner wrote
> > +		/*
> > +		 * Well, that sucks. Put the inode back on the inactive queue.
> > +		 * Do this while still under the ILOCK so that we can set the
> > +		 * NEED_INACTIVE flag and clear the INACTIVATING flag an not
> > +		 * have another lookup race with us before we've finished
> > +		 * putting the inode back on the inodegc queue.
> > +		 */
> > +		spin_unlock(&ip->i_flags_lock);
> > +		ip->i_flags |= XFS_NEED_INACTIVE;
> > +		ip->i_flags &= ~XFS_INACTIVATING;
> > +		spin_unlock(&ip->i_flags_lock);
> 
> This doesn't look right. Shouldn't the first spin_unlock() be spin_lock()?

Yes.  So much for my hand inspection of code. :(

(Doesn't simple lock debugging catch these sorts of things?)

((It sure would be nice if locking returned a droppable "object" to do
the unlock ala Rust and then spin_lock could be __must_check.))

--D

> Also, there's a typo in the comment (s/an/and).
> Best
> Andre
> -- 
> Max Planck Institute for Biology
> Tel: (+49) 7071 601 829
> Max-Planck-Ring 5, 72076 Tübingen, Germany
> http://people.tuebingen.mpg.de/maan/
Andre Noll March 20, 2024, 4:58 p.m. UTC | #4
On Wed, Mar 20, 07:53, Darrick J. Wong wrote
> On Wed, Mar 20, 2024 at 09:39:57AM +0100, Andre Noll wrote:
> > On Tue, Mar 19, 11:16, Dave Chinner wrote
> > > +		/*
> > > +		 * Well, that sucks. Put the inode back on the inactive queue.
> > > +		 * Do this while still under the ILOCK so that we can set the
> > > +		 * NEED_INACTIVE flag and clear the INACTIVATING flag an not
> > > +		 * have another lookup race with us before we've finished
> > > +		 * putting the inode back on the inodegc queue.
> > > +		 */
> > > +		spin_unlock(&ip->i_flags_lock);
> > > +		ip->i_flags |= XFS_NEED_INACTIVE;
> > > +		ip->i_flags &= ~XFS_INACTIVATING;
> > > +		spin_unlock(&ip->i_flags_lock);
> > 
> > This doesn't look right. Shouldn't the first spin_unlock() be spin_lock()?
> 
> Yes.  So much for my hand inspection of code. :(

Given enough hand inspections, all bugs are shallow :)

> (Doesn't simple lock debugging catch these sorts of things?)

Maybe this error path doesn't get exercised because xfs_reinit_inode()
never fails. AFAICT, it can only fail if security_inode_alloc()
can't allocate the composite inode blob.

> ((It sure would be nice if locking returned a droppable "object" to do
> the unlock ala Rust and then spin_lock could be __must_check.))

There's the *LOCK_GUARD* macros which employ gcc's cleanup attribute
to automatically call e.g. spin_unlock() when a variable goes out of
scope (see 54da6a0924311).

Best
Andre
Dave Chinner March 20, 2024, 9:58 p.m. UTC | #5
On Wed, Mar 20, 2024 at 09:39:57AM +0100, Andre Noll wrote:
> On Tue, Mar 19, 11:16, Dave Chinner wrote
> > +		/*
> > +		 * Well, that sucks. Put the inode back on the inactive queue.
> > +		 * Do this while still under the ILOCK so that we can set the
> > +		 * NEED_INACTIVE flag and clear the INACTIVATING flag an not
> > +		 * have another lookup race with us before we've finished
> > +		 * putting the inode back on the inodegc queue.
> > +		 */
> > +		spin_unlock(&ip->i_flags_lock);
> > +		ip->i_flags |= XFS_NEED_INACTIVE;
> > +		ip->i_flags &= ~XFS_INACTIVATING;
> > +		spin_unlock(&ip->i_flags_lock);
> 
> This doesn't look right. Shouldn't the first spin_unlock() be spin_lock()?

Good catch. Fixed.

-Dave.
Dave Chinner March 20, 2024, 10:51 p.m. UTC | #6
On Wed, Mar 20, 2024 at 05:58:53PM +0100, Andre Noll wrote:
> On Wed, Mar 20, 07:53, Darrick J. Wong wrote
> > On Wed, Mar 20, 2024 at 09:39:57AM +0100, Andre Noll wrote:
> > > On Tue, Mar 19, 11:16, Dave Chinner wrote
> > > > +		/*
> > > > +		 * Well, that sucks. Put the inode back on the inactive queue.
> > > > +		 * Do this while still under the ILOCK so that we can set the
> > > > +		 * NEED_INACTIVE flag and clear the INACTIVATING flag an not
> > > > +		 * have another lookup race with us before we've finished
> > > > +		 * putting the inode back on the inodegc queue.
> > > > +		 */
> > > > +		spin_unlock(&ip->i_flags_lock);
> > > > +		ip->i_flags |= XFS_NEED_INACTIVE;
> > > > +		ip->i_flags &= ~XFS_INACTIVATING;
> > > > +		spin_unlock(&ip->i_flags_lock);
> > > 
> > > This doesn't look right. Shouldn't the first spin_unlock() be spin_lock()?
> > 
> > Yes.  So much for my hand inspection of code. :(
> 
> Given enough hand inspections, all bugs are shallow :)

Sparse should have found that, if I ran it. :/

Ah, but sparse gets confused by the fact that the return from the
function may or may not have unlocked stuff:

fs/xfs/xfs_icache.c:355:9: warning: context imbalance in 'xfs_iget_recycle' - unexpected unlock
fs/xfs/xfs_icache.c:414:28: warning: context imbalance in 'xfs_iget_reactivate' - unexpected unlock
fs/xfs/xfs_icache.c:656:28: warning: context imbalance in 'xfs_iget_cache_hit' - different lock contexts for basic block

So if I fix that (that'll be patch 5 for this series), i get:

  CC      fs/xfs/xfs_icache.o
  CHECK   fs/xfs/xfs_icache.c
fs/xfs/xfs_icache.c:459:28: warning: context imbalance in 'xfs_iget_reactivate' - unexpected unlock

Yup, sparse now catches the unbalanced locking.

I just haven't thought to run sparse on XFS recently - running
sparse on a full kernel build is just .... awful. I think I'll
change my build script so that when I do an '--xfs-only' built it
also enables sparse as it's only rebuilding fs/xfs at that point....

> > (Doesn't simple lock debugging catch these sorts of things?)
> 
> Maybe this error path doesn't get exercised because xfs_reinit_inode()
> never fails. AFAICT, it can only fail if security_inode_alloc()
> can't allocate the composite inode blob.

Which syzkaller triggers every so often. I also do all my testing
with selinux enabled, so security_inode_alloc() is actually being
exercised and definitely has the potential to fail on my small
memory configs...

> > ((It sure would be nice if locking returned a droppable "object" to do
> > the unlock ala Rust and then spin_lock could be __must_check.))
> 
> There's the *LOCK_GUARD* macros which employ gcc's cleanup attribute
> to automatically call e.g. spin_unlock() when a variable goes out of
> scope (see 54da6a0924311).

IMO, the LOCK_GUARD stuff is an awful anti-pattern. It means some
error paths -look broken- because they lack unlocks, and we have to
explicitly change code to return from functions with the guarded
locks held. This is a diametrically opposed locking pattern to the
existing non-guarded locking patterns - correct behaviour in one
pattern is broken behaviour in the other, and vice versa.

That's just -insane- from a code maintenance point of view.

And they are completely useless for anything complex like these
XFS icache functions because the lock scope is not balanced across
functions.

The lock can also be taken by functions called within the guard
scope, and so using guarded lock scoping would result in deadlocks.
i.e. xfs_inodegc_queue() needs to take the i_flags_lock, so it must
be dropped before we call that.

So, yeah, lock guards seem to me to be largely just a "look ma, no
need for rust because we can mightily abuse the C preprocessor!"
anti-pattern looking for a problem to solve.

-Dave.
Andre Noll March 21, 2024, 9:59 a.m. UTC | #7
On Thu, Mar 21, 09:51, Dave Chinner wrote
> I just haven't thought to run sparse on XFS recently - running
> sparse on a full kernel build is just .... awful. I think I'll
> change my build script so that when I do an '--xfs-only' built it
> also enables sparse as it's only rebuilding fs/xfs at that point....

Would it be less awful to run coccinelle with a selected set of
semantic patches that catch defective patterns such as double
unlock/free?

> > > (Doesn't simple lock debugging catch these sorts of things?)
> > 
> > Maybe this error path doesn't get exercised because xfs_reinit_inode()
> > never fails. AFAICT, it can only fail if security_inode_alloc()
> > can't allocate the composite inode blob.
> 
> Which syzkaller triggers every so often. I also do all my testing
> with selinux enabled, so security_inode_alloc() is actually being
> exercised and definitely has the potential to fail on my small
> memory configs...

One could try to trigger ENOMEM more easily in functions like this
by allocating bigger slab caches for debug builds.

> > > ((It sure would be nice if locking returned a droppable "object" to do
> > > the unlock ala Rust and then spin_lock could be __must_check.))
> > 
> > There's the *LOCK_GUARD* macros which employ gcc's cleanup attribute
> > to automatically call e.g. spin_unlock() when a variable goes out of
> > scope (see 54da6a0924311).
> 
> IMO, the LOCK_GUARD stuff is an awful anti-pattern. It means some
> error paths -look broken- because they lack unlocks, and we have to
> explicitly change code to return from functions with the guarded
> locks held. This is a diametrically opposed locking pattern to the
> existing non-guarded lockign patterns - correct behaviour in one
> pattern is broken behaviour in the other, and vice versa.
> 
> That's just -insane- from a code maintenance point of view.

Converting all locks in fs/xfs in one go is not an option either, as
this would be too big to review, and non-trivial to begin with. There
are 180+ calls to spin_lock(), and that's just the spinlocks. Also
these patches would interfere badly with ongoing work.

> And they are completely useless for anythign complex like these
> XFS icache functions because the lock scope is not balanced across
> functions.
>
> The lock can also be taken by functions called within the guard
> scope, and so using guarded lock scoping would result in deadlocks.
> i.e. xfs_inodegc_queue() needs to take the i_flags_lock, so it must
> be dropped before we call that.

Yup, these can't use the LOCK_GUARD macros, which leads to an unholy
mix of guarded and unguarded locks.

> So, yeah, lock guards seem to me to be largely just a "look ma, no
> need for rust because we can mightily abuse the C preprocessor!"
> anti-pattern looking for a problem to solve.

Do you think there is a valid use case for the cleanup attribute,
or do you believe that the whole concept is mis-designed?

Thanks for sharing your opinions.
Andre
Dave Chinner March 22, 2024, 1:09 a.m. UTC | #8
On Thu, Mar 21, 2024 at 10:59:22AM +0100, Andre Noll wrote:
> On Thu, Mar 21, 09:51, Dave Chinner wrote
> > I just haven't thought to run sparse on XFS recently - running
> > sparse on a full kernel build is just .... awful. I think I'll
> > change my build script so that when I do an '--xfs-only' built it
> > also enables sparse as it's only rebuilding fs/xfs at that point....
> 
> Would it be less awful to run coccinelle with a selected set of
> semantic patches that catch defective patterns such as double
> unlock/free?

Much more awful - because then I have to write scripts to do this
checking rather than just add a command line parameter to the build.

> > > > (Doesn't simple lock debugging catch these sorts of things?)
> > > 
> > > Maybe this error path doesn't get exercised because xfs_reinit_inode()
> > > never fails. AFAICT, it can only fail if security_inode_alloc()
> > > can't allocate the composite inode blob.
> > 
> > Which syzkaller triggers every so often. I also do all my testing
> > with selinux enabled, so security_inode_alloc() is actually being
> > exercised and definitely has the potential to fail on my small
> > memory configs...
> 
> One could try to trigger ENOMEM more easily in functions like this
> by allocating bigger slab caches for debug builds.

That doesn't solve the problem - people keep trying to tell us that
all we need it "better testing" when the right solution to the
problem is for memory allocation to *never fail* unless the caller
says it is OK to fail. Better error injection and/or forced failures
don't actually help us all that much because of the massive scope of
the error checking that has to be done. Getting rid of the need for
error checking altogether is a much better long term solution to
this problem...

> > > > ((It sure would be nice if locking returned a droppable "object" to do
> > > > the unlock ala Rust and then spin_lock could be __must_check.))
> > > 
> > > There's the *LOCK_GUARD* macros which employ gcc's cleanup attribute
> > > to automatically call e.g. spin_unlock() when a variable goes out of
> > > scope (see 54da6a0924311).
> > 
> > IMO, the LOCK_GUARD stuff is an awful anti-pattern. It means some
> > error paths -look broken- because they lack unlocks, and we have to
> > explicitly change code to return from functions with the guarded
> > locks held. This is a diametrically opposed locking pattern to the
> > existing non-guarded lockign patterns - correct behaviour in one
> > pattern is broken behaviour in the other, and vice versa.
> > 
> > That's just -insane- from a code maintenance point of view.
> 
> Converting all locks in fs/xfs in one go is not an option either, as
> this would be too big to review, and non-trivial to begin with.

It's simply not possible because of the issues I mentioned, plus
others.

> There
> are 180+ calls to spin_lock(), and that's just the spinlocks. Also
> these patches would interfere badly with ongoing work.

Anywhere you have unbalanced lock contexts, non-trivial nested
locking, reverse order locking (via trylocks), children doing unlock
and lock to change lock contexts, etc then this "guarded lock scope"
does not work. XFS is -full- of these non-trivial locking
algorithms, so it's just not a good idea to even start trying to do
a conversion...

> > And they are completely useless for anythign complex like these
> > XFS icache functions because the lock scope is not balanced across
> > functions.
> >
> > The lock can also be taken by functions called within the guard
> > scope, and so using guarded lock scoping would result in deadlocks.
> > i.e. xfs_inodegc_queue() needs to take the i_flags_lock, so it must
> > be dropped before we call that.
> 
> Yup, these can't use the LOCK_GUARD macros, which leads to an unholy
> mix of guarded and unguarded locks.

Exactly my point.

> > So, yeah, lock guards seem to me to be largely just a "look ma, no
> > need for rust because we can mightily abuse the C preprocessor!"
> > anti-pattern looking for a problem to solve.
> 
> Do you think there is a valid use case for the cleanup attribute,
> or do you believe that the whole concept is mis-designed?

Sure, there's plenty of cases where scoped cleanup attributes really
does make the code better.  e.g. we had XFS changes that used this
attribute in a complex loop iterator rejected back before it became
accepted so that this lock guard template thingy could be
implemented with it.

-Dave.
diff mbox series

Patch

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 7359753b892b..56de3e843df2 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -63,6 +63,8 @@  static int xfs_icwalk_ag(struct xfs_perag *pag,
 					 XFS_ICWALK_FLAG_RECLAIM_SICK | \
 					 XFS_ICWALK_FLAG_UNION)
 
+static void xfs_inodegc_queue(struct xfs_inode *ip);
+
 /*
  * Allocate and initialise an xfs_inode.
  */
@@ -325,6 +327,7 @@  xfs_reinit_inode(
 	return error;
 }
 
+
 /*
  * Carefully nudge an inode whose VFS state has been torn down back into a
  * usable state.  Drops the i_flags_lock and the rcu read lock.
@@ -388,7 +391,82 @@  xfs_iget_recycle(
 	inode->i_state = I_NEW;
 	spin_unlock(&ip->i_flags_lock);
 	spin_unlock(&pag->pag_ici_lock);
+	XFS_STATS_INC(mp, xs_ig_frecycle);
+	return 0;
+}
 
+static int
+xfs_iget_reactivate(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip) __releases(&ip->i_flags_lock)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct inode		*inode = VFS_I(ip);
+	int			error;
+
+	trace_xfs_iget_recycle(ip);
+
+	/*
+	 * If the inode has been unlinked, then the lookup must not find it
+	 * until inactivation has actually freed the inode.
+	 */
+	if (VFS_I(ip)->i_nlink == 0) {
+		spin_unlock(&ip->i_flags_lock);
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+
+	/*
+	 * Take the ILOCK here to serialise against lookup races with putting
+	 * the inode back on the inodegc queue during error handling.
+	 */
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+		return -EAGAIN;
+
+	/*
+	 * Move the state to inactivating so both inactivation and racing
+	 * lookups will skip over this inode until we've finished reactivating
+	 * it and can return it to the XFS_INEW state.
+	 */
+	ip->i_flags &= ~XFS_NEED_INACTIVE;
+	ip->i_flags |= XFS_INACTIVATING;
+	spin_unlock(&ip->i_flags_lock);
+	rcu_read_unlock();
+
+	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
+	error = xfs_reinit_inode(mp, inode);
+	if (error) {
+		/*
+		 * Well, that sucks. Put the inode back on the inactive queue.
+		 * Do this while still under the ILOCK so that we can set the
+		 * NEED_INACTIVE flag and clear the INACTIVATING flag without
+		 * another lookup racing with us before we've finished putting
+		 * the inode back on the inodegc queue.
+		 */
+		spin_lock(&ip->i_flags_lock);
+		ip->i_flags |= XFS_NEED_INACTIVE;
+		ip->i_flags &= ~XFS_INACTIVATING;
+		spin_unlock(&ip->i_flags_lock);
+
+		xfs_inodegc_queue(ip);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+		trace_xfs_iget_recycle_fail(ip);
+		return error;
+	}
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * Reset the inode state to new so that xfs_iget() will complete
+	 * the required remaining inode initialisation before it returns the
+	 * inode to the caller.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+	ip->i_flags |= XFS_INEW;
+	inode->i_state = I_NEW;
+	spin_unlock(&ip->i_flags_lock);
+	XFS_STATS_INC(mp, xs_ig_frecycle);
+	return 0;
+}
 
@@ -526,15 +604,6 @@  xfs_iget_cache_hit(
 	if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
 		goto out_skip;
 
-	if (ip->i_flags & XFS_NEED_INACTIVE) {
-		/* Unlinked inodes cannot be re-grabbed. */
-		if (VFS_I(ip)->i_nlink == 0) {
-			error = -ENOENT;
-			goto out_error;
-		}
-		goto out_inodegc_flush;
-	}
-
 	/*
 	 * Check the inode free state is valid. This also detects lookup
 	 * racing with unlinks.
@@ -545,11 +614,18 @@  xfs_iget_cache_hit(
 
 	/* Skip inodes that have no vfs state. */
 	if ((flags & XFS_IGET_INCORE) &&
-	    (ip->i_flags & XFS_IRECLAIMABLE))
+	    (ip->i_flags & (XFS_IRECLAIMABLE | XFS_NEED_INACTIVE)))
 		goto out_skip;
 
 	/* The inode fits the selection criteria; process it. */
-	if (ip->i_flags & XFS_IRECLAIMABLE) {
+	if (ip->i_flags & XFS_NEED_INACTIVE) {
+		/* Drops i_flags_lock and RCU read lock. */
+		error = xfs_iget_reactivate(pag, ip);
+		if (error == -EAGAIN)
+			goto out_skip;
+		if (error)
+			return error;
+	} else if (ip->i_flags & XFS_IRECLAIMABLE) {
 		/* Drops i_flags_lock and RCU read lock. */
 		error = xfs_iget_recycle(pag, ip);
 		if (error == -EAGAIN)
@@ -578,23 +654,11 @@  xfs_iget_cache_hit(
 
 out_skip:
 	trace_xfs_iget_skip(ip);
-	XFS_STATS_INC(mp, xs_ig_frecycle);
 	error = -EAGAIN;
 out_error:
 	spin_unlock(&ip->i_flags_lock);
 	rcu_read_unlock();
 	return error;
-
-out_inodegc_flush:
-	spin_unlock(&ip->i_flags_lock);
-	rcu_read_unlock();
-	/*
-	 * Do not wait for the workers, because the caller could hold an AGI
-	 * buffer lock.  We're just going to sleep in a loop anyway.
-	 */
-	if (xfs_is_inodegc_enabled(mp))
-		xfs_inodegc_queue_all(mp);
-	return -EAGAIN;
 }
 
 static int