diff mbox series

[2/4] xfs: add log item precommit operation

Message ID 20200623095015.1934171-3-david@fromorbit.com (mailing list archive)
State Superseded
Headers show
Series xfs: in memory inode unlink log items | expand

Commit Message

Dave Chinner June 23, 2020, 9:50 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

For inodes that are dirty, we have an attached cluster buffer that
we want to use to track the dirty inode through the AIL.
Unfortunately, locking the cluster buffer and adding it to the
transaction when the inode is first logged in a transaction leads to
buffer lock ordering inversions.

The specific problem is ordering against the AGI buffer. When
modifying unlinked lists, the buffer lock order is AGI -> inode
cluster buffer as the AGI buffer lock serialises all access to the
unlinked lists. Unfortunately, functionality like xfs_droplink()
logs the inode before calling xfs_iunlink(), as do various directory
manipulation functions. The inode can be logged way down in the
stack as far as the bmapi routines and hence, without a major
rewrite of lots of APIs there's no way we can avoid the inode being
logged by something until after the AGI has been logged.

As we are going to be using ordered buffers for inode AIL tracking,
there isn't a need to actually lock that buffer against modification
as all the modifications are captured by logging the inode item
itself. Hence we don't actually need to join the cluster buffer into
the transaction until just before it is committed. This means we do
not perturb any of the existing buffer lock orders in transactions,
and the inode cluster buffer is always locked last in a transaction
that doesn't otherwise touch inode cluster buffers.

We do this by introducing a precommit log item method. A log item
method is used because it is likely dquots will be moved to this
same ordered buffer tracking scheme and hence will need a similar
callout. This commit just introduces the mechanism; the inode item
implementation is in followup commits.

The precommit items need to be sorted into consistent order as we
may be locking multiple items here. Hence if we have two dirty
inodes in cluster buffers A and B, and some other transaction has
two separate dirty inodes in the same cluster buffers, locking them
in different orders opens us up to ABBA deadlocks. Hence we sort the
items on the transaction based on the presence of a sort log item
method.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_icache.c |  1 +
 fs/xfs/xfs_trans.c  | 90 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_trans.h  |  6 ++-
 3 files changed, 95 insertions(+), 2 deletions(-)

Comments

Darrick J. Wong June 30, 2020, 6:06 p.m. UTC | #1
On Tue, Jun 23, 2020 at 07:50:13PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> For inodes that are dirty, we have an attached cluster buffer that
> we want to use to track the dirty inode through the AIL.
> Unfortunately, locking the cluster buffer and adding it to the
> transaction when the inode is first logged in a transaction leads to
> buffer lock ordering inversions.
> 
> The specific problem is ordering against the AGI buffer. When
> modifying unlinked lists, the buffer lock order is AGI -> inode
> cluster buffer as the AGI buffer lock serialises all access to the
> unlinked lists. Unfortunately, functionality like xfs_droplink()
> logs the inode before calling xfs_iunlink(), as do various directory
> manipulation functions. The inode can be logged way down in the
> stack as far as the bmapi routines and hence, without a major
> rewrite of lots of APIs there's no way we can avoid the inode being
> logged by something until after the AGI has been logged.
> 
> As we are going to be using ordered buffers for inode AIL tracking,
> there isn't a need to actually lock that buffer against modification
> as all the modifications are captured by logging the inode item
> itself. Hence we don't actually need to join the cluster buffer into
> the transaction until just before it is committed. This means we do
> not perturb any of the existing buffer lock orders in transactions,
> and the inode cluster buffer is always locked last in a transaction
> that doesn't otherwise touch inode cluster buffers.
> 
> We do this by introducing a precommit log item method. A log item
> method is used because it is likely dquots will be moved to this
> same ordered buffer tracking scheme and hence will need a similar
> callout. This commit just introduces the mechanism; the inode item
> implementation is in followup commits.
> 
> The precommit items need to be sorted into consistent order as we
> may be locking multiple items here. Hence if we have two dirty
> inodes in cluster buffers A and B, and some other transaction has
> two separate dirty inodes in the same cluster buffers, locking them
> in different orders opens us up to ABBA deadlocks. Hence we sort the
> items on the transaction based on the presence of a sort log item
> method.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_icache.c |  1 +
>  fs/xfs/xfs_trans.c  | 90 +++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_trans.h  |  6 ++-
>  3 files changed, 95 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index 0d73559f2d58..1c744dbb313f 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -1077,6 +1077,7 @@ xfs_reclaim_inode(
>  	ip->i_ino = 0;
>  	spin_unlock(&ip->i_flags_lock);
>  
> +	ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  
>  	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
> diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
> index 3c94e5ff4316..6f350490f84b 100644
> --- a/fs/xfs/xfs_trans.c
> +++ b/fs/xfs/xfs_trans.c
> @@ -799,6 +799,89 @@ xfs_trans_committed_bulk(
>  	spin_unlock(&ailp->ail_lock);
>  }
>  
> +/*
> + * Sort transaction items prior to running precommit operations. This will
> + * attempt to order the items such that they will always be locked in the same
> + * order. Items that have no sort function are moved to the end of the list
> + * and so are locked last (XXX: need to check the logic matches the comment).
> + *
> + * This may need refinement as different types of objects add sort functions.
> + *
> + * Function is more complex than it needs to be because we are comparing 64 bit
> + * values and the function only returns 32 bit values.
> + */
> +static int
> +xfs_trans_precommit_sort(
> +	void			*unused_arg,
> +	struct list_head	*a,
> +	struct list_head	*b)
> +{
> +	struct xfs_log_item	*lia = container_of(a,
> +					struct xfs_log_item, li_trans);
> +	struct xfs_log_item	*lib = container_of(b,
> +					struct xfs_log_item, li_trans);
> +	int64_t			diff;
> +
> +	if (!lia->li_ops->iop_sort && !lib->li_ops->iop_sort)
> +		return 0;
> +	if (!lia->li_ops->iop_sort)
> +		return 1;
> +	if (!lib->li_ops->iop_sort)
> +		return -1;
> +
> +	diff = lia->li_ops->iop_sort(lia) - lib->li_ops->iop_sort(lib);
> +	if (diff < 0)
> +		return -1;
> +	if (diff > 0)
> +		return 1;
> +	return 0;
> +}
> +
> +/*
> + * Run transaction precommit functions.
> + *
> + * If there is an error in any of the callouts, then stop immediately and
> + * trigger a shutdown to abort the transaction. There is no recovery possible
> + * from errors at this point as the transaction is dirty....
> + */
> +static int
> +xfs_trans_run_precommits(
> +	struct xfs_trans	*tp)
> +{
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	struct xfs_log_item	*lip, *n;
> +	int			error = 0;
> +
> +	if (XFS_FORCED_SHUTDOWN(mp))
> +		return -EIO;
> +
> +	/*
> +	 * Sort the item list to avoid ABBA deadlocks with other transactions
> +	 * running precommit operations that lock multiple shared items such as
> +	 * inode cluster buffers.
> +	 */
> +	list_sort(NULL, &tp->t_items, xfs_trans_precommit_sort);
> +
> +	/*
> +	 * Precommit operations can remove the log item from the transaction
> +	 * if the log item exists purely to delay modifications until they
> +	 * can be ordered against other operations. Hence we have to use
> +	 * list_for_each_entry_safe() here.
> +	 */
> +	list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
> +		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
> +			continue;
> +		if (lip->li_ops->iop_precommit) {
> +			error = lip->li_ops->iop_precommit(tp, lip);
> +			if (error)
> +				break;
> +		}
> +	}
> +	if (error)
> +		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
> +	return error;
> +}
> +
>  /*
>   * Commit the given transaction to the log.
>   *
> @@ -823,6 +906,13 @@ __xfs_trans_commit(
>  
>  	trace_xfs_trans_commit(tp, _RET_IP_);
>  
> +	error = xfs_trans_run_precommits(tp);
> +	if (error) {
> +		if (tp->t_flags & XFS_TRANS_PERM_LOG_RES)
> +			xfs_defer_cancel(tp);
> +		goto out_unreserve;
> +	}
> +
>  	/*
>  	 * Finish deferred items on final commit. Only permanent transactions
>  	 * should ever have deferred ops.
> diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
> index b752501818d2..26ea19bd0621 100644
> --- a/fs/xfs/xfs_trans.h
> +++ b/fs/xfs/xfs_trans.h
> @@ -70,10 +70,12 @@ struct xfs_item_ops {
>  	void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *);
>  	void (*iop_pin)(struct xfs_log_item *);
>  	void (*iop_unpin)(struct xfs_log_item *, int remove);
> -	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
> +	uint64_t (*iop_sort)(struct xfs_log_item *);

Shouldn't the return value be signed, since this is a comparison
function...?  Or does the ->iop_sort function itself do sorting??

--D

> +	int (*iop_precommit)(struct xfs_trans *, struct xfs_log_item *);
>  	void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
> -	void (*iop_release)(struct xfs_log_item *);
>  	xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
> +	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
> +	void (*iop_release)(struct xfs_log_item *);
>  	int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp);
>  	bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
>  };
> -- 
> 2.26.2.761.g0e0b3e54be
>
Brian Foster July 1, 2020, 2:30 p.m. UTC | #2
On Tue, Jun 23, 2020 at 07:50:13PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> For inodes that are dirty, we have an attached cluster buffer that
> we want to use to track the dirty inode through the AIL.
> Unfortunately, locking the cluster buffer and adding it to the
> transaction when the inode is first logged in a transaction leads to
> buffer lock ordering inversions.
> 
> The specific problem is ordering against the AGI buffer. When
> modifying unlinked lists, the buffer lock order is AGI -> inode
> cluster buffer as the AGI buffer lock serialises all access to the
> unlinked lists. Unfortunately, functionality like xfs_droplink()
> logs the inode before calling xfs_iunlink(), as do various directory
> manipulation functions. The inode can be logged way down in the
> stack as far as the bmapi routines and hence, without a major
> rewrite of lots of APIs there's no way we can avoid the inode being
> logged by something until after the AGI has been logged.
> 
> As we are going to be using ordered buffers for inode AIL tracking,
> there isn't a need to actually lock that buffer against modification
> as all the modifications are captured by logging the inode item
> itself. Hence we don't actually need to join the cluster buffer into
> the transaction until just before it is committed. This means we do
> not perturb any of the existing buffer lock orders in transactions,
> and the inode cluster buffer is always locked last in a transaction
> that doesn't otherwise touch inode cluster buffers.
> 
> We do this by introducing a precommit log item method. A log item
> method is used because it is likely dquots will be moved to this
> same ordered buffer tracking scheme and hence will need a similar
> callout. This commit just introduces the mechanism; the inode item
> implementation is in followup commits.
> 
> The precommit items need to be sorted into consistent order as we
> may be locking multiple items here. Hence if we have two dirty
> inodes in cluster buffers A and B, and some other transaction has
> two separate dirty inodes in the same cluster buffers, locking them
> in different orders opens us up to ABBA deadlocks. Hence we sort the
> items on the transaction based on the presence of a sort log item
> method.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---

Seems like a nice abstraction, particularly when you consider the other
use cases you described that should fall into place over time. A couple
minor comments..

>  fs/xfs/xfs_icache.c |  1 +
>  fs/xfs/xfs_trans.c  | 90 +++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_trans.h  |  6 ++-
>  3 files changed, 95 insertions(+), 2 deletions(-)
> 
...
> diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
> index 3c94e5ff4316..6f350490f84b 100644
> --- a/fs/xfs/xfs_trans.c
> +++ b/fs/xfs/xfs_trans.c
> @@ -799,6 +799,89 @@ xfs_trans_committed_bulk(
>  	spin_unlock(&ailp->ail_lock);
>  }
>  
> +/*
> + * Sort transaction items prior to running precommit operations. This will
> + * attempt to order the items such that they will always be locked in the same
> + * order. Items that have no sort function are moved to the end of the list
> + * and so are locked last (XXX: need to check the logic matches the comment).
> + *

Heh, I was going to ask what the expected behavior was with the various
!iop_sort() cases and whether we can really expect those items to be
isolated at the end of the list.

> + * This may need refinement as different types of objects add sort functions.
> + *
> + * Function is more complex than it needs to be because we are comparing 64 bit
> + * values and the function only returns 32 bit values.
> + */
> +static int
> +xfs_trans_precommit_sort(
> +	void			*unused_arg,
> +	struct list_head	*a,
> +	struct list_head	*b)
> +{
> +	struct xfs_log_item	*lia = container_of(a,
> +					struct xfs_log_item, li_trans);
> +	struct xfs_log_item	*lib = container_of(b,
> +					struct xfs_log_item, li_trans);
> +	int64_t			diff;
> +
> +	if (!lia->li_ops->iop_sort && !lib->li_ops->iop_sort)
> +		return 0;
> +	if (!lia->li_ops->iop_sort)
> +		return 1;
> +	if (!lib->li_ops->iop_sort)
> +		return -1;

I'm a little confused on what these values are supposed to mean if one
of the two items is non-sortable. Is the purpose of this simply to move
sortable items to the head and non-sortable toward the tail, as noted
above?

> +
> +	diff = lia->li_ops->iop_sort(lia) - lib->li_ops->iop_sort(lib);
> +	if (diff < 0)
> +		return -1;
> +	if (diff > 0)
> +		return 1;
> +	return 0;
> +}
> +
> +/*
> + * Run transaction precommit functions.
> + *
> + * If there is an error in any of the callouts, then stop immediately and
> + * trigger a shutdown to abort the transaction. There is no recovery possible
> + * from errors at this point as the transaction is dirty....
> + */
> +static int
> +xfs_trans_run_precommits(
> +	struct xfs_trans	*tp)
> +{
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	struct xfs_log_item	*lip, *n;
> +	int			error = 0;
> +
> +	if (XFS_FORCED_SHUTDOWN(mp))
> +		return -EIO;
> +

I'd rather not change behavior here. This effectively overrides the
shutdown check in the caller because we get here regardless of whether
the transaction has any pre-commit callouts or not. It seems like this
is unnecessary, at least for the time being, if the precommit is
primarily focused on sorting.

Brian

> +	/*
> +	 * Sort the item list to avoid ABBA deadlocks with other transactions
> +	 * running precommit operations that lock multiple shared items such as
> +	 * inode cluster buffers.
> +	 */
> +	list_sort(NULL, &tp->t_items, xfs_trans_precommit_sort);
> +
> +	/*
> +	 * Precommit operations can remove the log item from the transaction
> +	 * if the log item exists purely to delay modifications until they
> +	 * can be ordered against other operations. Hence we have to use
> +	 * list_for_each_entry_safe() here.
> +	 */
> +	list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
> +		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
> +			continue;
> +		if (lip->li_ops->iop_precommit) {
> +			error = lip->li_ops->iop_precommit(tp, lip);
> +			if (error)
> +				break;
> +		}
> +	}
> +	if (error)
> +		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
> +	return error;
> +}
> +
>  /*
>   * Commit the given transaction to the log.
>   *
> @@ -823,6 +906,13 @@ __xfs_trans_commit(
>  
>  	trace_xfs_trans_commit(tp, _RET_IP_);
>  
> +	error = xfs_trans_run_precommits(tp);
> +	if (error) {
> +		if (tp->t_flags & XFS_TRANS_PERM_LOG_RES)
> +			xfs_defer_cancel(tp);
> +		goto out_unreserve;
> +	}
> +
>  	/*
>  	 * Finish deferred items on final commit. Only permanent transactions
>  	 * should ever have deferred ops.
> diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
> index b752501818d2..26ea19bd0621 100644
> --- a/fs/xfs/xfs_trans.h
> +++ b/fs/xfs/xfs_trans.h
> @@ -70,10 +70,12 @@ struct xfs_item_ops {
>  	void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *);
>  	void (*iop_pin)(struct xfs_log_item *);
>  	void (*iop_unpin)(struct xfs_log_item *, int remove);
> -	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
> +	uint64_t (*iop_sort)(struct xfs_log_item *);
> +	int (*iop_precommit)(struct xfs_trans *, struct xfs_log_item *);
>  	void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
> -	void (*iop_release)(struct xfs_log_item *);
>  	xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
> +	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
> +	void (*iop_release)(struct xfs_log_item *);
>  	int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp);
>  	bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
>  };
> -- 
> 2.26.2.761.g0e0b3e54be
>
Dave Chinner July 1, 2020, 10:02 p.m. UTC | #3
On Wed, Jul 01, 2020 at 10:30:57AM -0400, Brian Foster wrote:
> On Tue, Jun 23, 2020 at 07:50:13PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > For inodes that are dirty, we have an attached cluster buffer that
> > we want to use to track the dirty inode through the AIL.
> > Unfortunately, locking the cluster buffer and adding it to the
> > transaction when the inode is first logged in a transaction leads to
> > buffer lock ordering inversions.
> > 
> > The specific problem is ordering against the AGI buffer. When
> > modifying unlinked lists, the buffer lock order is AGI -> inode
> > cluster buffer as the AGI buffer lock serialises all access to the
> > unlinked lists. Unfortunately, functionality like xfs_droplink()
> > logs the inode before calling xfs_iunlink(), as do various directory
> > manipulation functions. The inode can be logged way down in the
> > stack as far as the bmapi routines and hence, without a major
> > rewrite of lots of APIs there's no way we can avoid the inode being
> > logged by something until after the AGI has been logged.
> > 
> > As we are going to be using ordered buffers for inode AIL tracking,
> > there isn't a need to actually lock that buffer against modification
> > as all the modifications are captured by logging the inode item
> > itself. Hence we don't actually need to join the cluster buffer into
> > the transaction until just before it is committed. This means we do
> > not perturb any of the existing buffer lock orders in transactions,
> > and the inode cluster buffer is always locked last in a transaction
> > that doesn't otherwise touch inode cluster buffers.
> > 
> > We do this by introducing a precommit log item method. A log item
> > method is used because it is likely dquots will be moved to this
> > same ordered buffer tracking scheme and hence will need a similar
> > callout. This commit just introduces the mechanism; the inode item
> > implementation is in followup commits.
> > 
> > The precommit items need to be sorted into consistent order as we
> > may be locking multiple items here. Hence if we have two dirty
> > inodes in cluster buffers A and B, and some other transaction has
> > two separate dirty inodes in the same cluster buffers, locking them
> > in different orders opens us up to ABBA deadlocks. Hence we sort the
> > items on the transaction based on the presence of a sort log item
> > method.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> 
> Seems like a nice abstraction, particularly when you consider the other
> use cases you described that should fall into place over time. A couple
> minor comments..
> 
> >  fs/xfs/xfs_icache.c |  1 +
> >  fs/xfs/xfs_trans.c  | 90 +++++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/xfs_trans.h  |  6 ++-
> >  3 files changed, 95 insertions(+), 2 deletions(-)
> > 
> ...
> > diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
> > index 3c94e5ff4316..6f350490f84b 100644
> > --- a/fs/xfs/xfs_trans.c
> > +++ b/fs/xfs/xfs_trans.c
> > @@ -799,6 +799,89 @@ xfs_trans_committed_bulk(
> >  	spin_unlock(&ailp->ail_lock);
> >  }
> >  
> > +/*
> > + * Sort transaction items prior to running precommit operations. This will
> > + * attempt to order the items such that they will always be locked in the same
> > + * order. Items that have no sort function are moved to the end of the list
> > + * and so are locked last (XXX: need to check the logic matches the comment).
> > + *
> 
> Heh, I was going to ask what the expected behavior was with the various
> !iop_sort() cases and whether we can really expect those items to be
> isolated at the end of the list.
> 
> > + * This may need refinement as different types of objects add sort functions.
> > + *
> > + * Function is more complex than it needs to be because we are comparing 64 bit
> > + * values and the function only returns 32 bit values.
> > + */
> > +static int
> > +xfs_trans_precommit_sort(
> > +	void			*unused_arg,
> > +	struct list_head	*a,
> > +	struct list_head	*b)
> > +{
> > +	struct xfs_log_item	*lia = container_of(a,
> > +					struct xfs_log_item, li_trans);
> > +	struct xfs_log_item	*lib = container_of(b,
> > +					struct xfs_log_item, li_trans);
> > +	int64_t			diff;
> > +
> > +	if (!lia->li_ops->iop_sort && !lib->li_ops->iop_sort)
> > +		return 0;
> > +	if (!lia->li_ops->iop_sort)
> > +		return 1;
> > +	if (!lib->li_ops->iop_sort)
> > +		return -1;
> 
> I'm a little confused on what these values are supposed to mean if one
> of the two items is non-sortable. Is the purpose of this simply to move
> sortable items to the head and non-sortable toward the tail, as noted
> above?

If the log item doesn't have a sort function, it implies the object
is already locked and modified and there's no pre-commit operation
going to be performed on it. In that case, I decided to move them to
the tail of the list so that it would be easier to verify that the
items that need sorting were, indeed, sorted into the correct order.

The choice was arbitrary - they could be moved to the head of the
list or they could be left where they are and everything else is
ordered around them, but I went for the behaviour that is easy to
verify visually with debug output or via a list walk in a debugger...

> > +static int
> > +xfs_trans_run_precommits(
> > +	struct xfs_trans	*tp)
> > +{
> > +	struct xfs_mount	*mp = tp->t_mountp;
> > +	struct xfs_log_item	*lip, *n;
> > +	int			error = 0;
> > +
> > +	if (XFS_FORCED_SHUTDOWN(mp))
> > +		return -EIO;
> > +
> 
> I'd rather not change behavior here. This effectively overrides the
> shutdown check in the caller because we get here regardless of whether
> the transaction has any pre-commit callouts or not. It seems like this
> is unnecessary, at least for the time being, if the precommit is
> primarily focused on sorting.

I put that there because if we are already shut down then there's no
point in even sorting or running pre-commits - they are going to
error out trying to access the objects they need to modify anyway.

It really isn't critical, just seemed superfluous to run code that
we already know will be cancelled and/or error out...

Cheers,

Dave.
diff mbox series

Patch

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0d73559f2d58..1c744dbb313f 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1077,6 +1077,7 @@  xfs_reclaim_inode(
 	ip->i_ino = 0;
 	spin_unlock(&ip->i_flags_lock);
 
+	ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
 	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 3c94e5ff4316..6f350490f84b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -799,6 +799,89 @@  xfs_trans_committed_bulk(
 	spin_unlock(&ailp->ail_lock);
 }
 
+/*
+ * Sort transaction items prior to running precommit operations. This will
+ * attempt to order the items such that they will always be locked in the same
+ * order. Items that have no sort function are moved to the end of the list
+ * and so are locked last (XXX: need to check the logic matches the comment).
+ *
+ * This may need refinement as different types of objects add sort functions.
+ *
+ * Function is more complex than it needs to be because we are comparing 64 bit
+ * values and the function only returns 32 bit values.
+ */
+static int
+xfs_trans_precommit_sort(
+	void			*unused_arg,
+	struct list_head	*a,
+	struct list_head	*b)
+{
+	struct xfs_log_item	*lia = container_of(a,
+					struct xfs_log_item, li_trans);
+	struct xfs_log_item	*lib = container_of(b,
+					struct xfs_log_item, li_trans);
+	int64_t			diff;
+
+	if (!lia->li_ops->iop_sort && !lib->li_ops->iop_sort)
+		return 0;
+	if (!lia->li_ops->iop_sort)
+		return 1;
+	if (!lib->li_ops->iop_sort)
+		return -1;
+
+	diff = lia->li_ops->iop_sort(lia) - lib->li_ops->iop_sort(lib);
+	if (diff < 0)
+		return -1;
+	if (diff > 0)
+		return 1;
+	return 0;
+}
+
+/*
+ * Run transaction precommit functions.
+ *
+ * If there is an error in any of the callouts, then stop immediately and
+ * trigger a shutdown to abort the transaction. There is no recovery possible
+ * from errors at this point as the transaction is dirty....
+ */
+static int
+xfs_trans_run_precommits(
+	struct xfs_trans	*tp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_log_item	*lip, *n;
+	int			error = 0;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/*
+	 * Sort the item list to avoid ABBA deadlocks with other transactions
+	 * running precommit operations that lock multiple shared items such as
+	 * inode cluster buffers.
+	 */
+	list_sort(NULL, &tp->t_items, xfs_trans_precommit_sort);
+
+	/*
+	 * Precommit operations can remove the log item from the transaction
+	 * if the log item exists purely to delay modifications until they
+	 * can be ordered against other operations. Hence we have to use
+	 * list_for_each_entry_safe() here.
+	 */
+	list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
+		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
+			continue;
+		if (lip->li_ops->iop_precommit) {
+			error = lip->li_ops->iop_precommit(tp, lip);
+			if (error)
+				break;
+		}
+	}
+	if (error)
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+	return error;
+}
+
 /*
  * Commit the given transaction to the log.
  *
@@ -823,6 +906,13 @@  __xfs_trans_commit(
 
 	trace_xfs_trans_commit(tp, _RET_IP_);
 
+	error = xfs_trans_run_precommits(tp);
+	if (error) {
+		if (tp->t_flags & XFS_TRANS_PERM_LOG_RES)
+			xfs_defer_cancel(tp);
+		goto out_unreserve;
+	}
+
 	/*
 	 * Finish deferred items on final commit. Only permanent transactions
 	 * should ever have deferred ops.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index b752501818d2..26ea19bd0621 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -70,10 +70,12 @@  struct xfs_item_ops {
 	void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *);
 	void (*iop_pin)(struct xfs_log_item *);
 	void (*iop_unpin)(struct xfs_log_item *, int remove);
-	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
+	uint64_t (*iop_sort)(struct xfs_log_item *);
+	int (*iop_precommit)(struct xfs_trans *, struct xfs_log_item *);
 	void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
-	void (*iop_release)(struct xfs_log_item *);
 	xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
+	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
+	void (*iop_release)(struct xfs_log_item *);
 	int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp);
 	bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
 };