
xfs: Fix agi&agf ABBA deadlock when performing rename with RENAME_WHITEOUT flag

Message ID 5f2ab55c-c1ef-a8f2-5662-b35e0838b979@gmail.com (mailing list archive)
State Superseded
Series xfs: Fix agi&agf ABBA deadlock when performing rename with RENAME_WHITEOUT flag

Commit Message

Kaixu Xia Aug. 13, 2019, 11:17 a.m. UTC
When performing a rename operation with the RENAME_WHITEOUT flag, we
first hold the AGF lock to allocate or free extents while manipulating
the dirents, and then do the xfs_iunlink_remove() call last, which takes
the AGI lock to update the tmpfile; so here we lock the AGF before the AGI.

The big problem here is that we have an ordering constraint on AGF
and AGI locking - inode allocation locks the AGI, then can allocate
a new extent for new inodes, locking the AGF after the AGI. Hence
the ordering that is imposed by other parts of the code is AGI before
AGF. So we get the ABBA agi&agf deadlock here.
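
In short, the two lock orders collide:

  Process A (inode allocation):        AGI -> AGF
  Process B (RENAME_WHITEOUT rename):  AGF -> AGI

A holds the AGI and waits for the AGF while B holds the AGF and waits
for the AGI, so neither can make progress.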

Process A:
Call trace:
  ? __schedule+0x2bd/0x620
  schedule+0x33/0x90
  schedule_timeout+0x17d/0x290
  __down_common+0xef/0x125
  ? xfs_buf_find+0x215/0x6c0 [xfs]
  down+0x3b/0x50
  xfs_buf_lock+0x34/0xf0 [xfs]
  xfs_buf_find+0x215/0x6c0 [xfs]
  xfs_buf_get_map+0x37/0x230 [xfs]
  xfs_buf_read_map+0x29/0x190 [xfs]
  xfs_trans_read_buf_map+0x13d/0x520 [xfs]
  xfs_read_agf+0xa6/0x180 [xfs]
  ? schedule_timeout+0x17d/0x290
  xfs_alloc_read_agf+0x52/0x1f0 [xfs]
  xfs_alloc_fix_freelist+0x432/0x590 [xfs]
  ? down+0x3b/0x50
  ? xfs_buf_lock+0x34/0xf0 [xfs]
  ? xfs_buf_find+0x215/0x6c0 [xfs]
  xfs_alloc_vextent+0x301/0x6c0 [xfs]
  xfs_ialloc_ag_alloc+0x182/0x700 [xfs]
  ? _xfs_trans_bjoin+0x72/0xf0 [xfs]
  xfs_dialloc+0x116/0x290 [xfs]
  xfs_ialloc+0x6d/0x5e0 [xfs]
  ? xfs_log_reserve+0x165/0x280 [xfs]
  xfs_dir_ialloc+0x8c/0x240 [xfs]
  xfs_create+0x35a/0x610 [xfs]
  xfs_generic_create+0x1f1/0x2f0 [xfs]
  ...

Process B:
Call trace:
  ? __schedule+0x2bd/0x620
  ? xfs_bmapi_allocate+0x245/0x380 [xfs]
  schedule+0x33/0x90
  schedule_timeout+0x17d/0x290
  ? xfs_buf_find+0x1fd/0x6c0 [xfs]
  __down_common+0xef/0x125
  ? xfs_buf_get_map+0x37/0x230 [xfs]
  ? xfs_buf_find+0x215/0x6c0 [xfs]
  down+0x3b/0x50
  xfs_buf_lock+0x34/0xf0 [xfs]
  xfs_buf_find+0x215/0x6c0 [xfs]
  xfs_buf_get_map+0x37/0x230 [xfs]
  xfs_buf_read_map+0x29/0x190 [xfs]
  xfs_trans_read_buf_map+0x13d/0x520 [xfs]
  xfs_read_agi+0xa8/0x160 [xfs]
  xfs_iunlink_remove+0x6f/0x2a0 [xfs]
  ? current_time+0x46/0x80
  ? xfs_trans_ichgtime+0x39/0xb0 [xfs]
  xfs_rename+0x57a/0xae0 [xfs]
  xfs_vn_rename+0xe4/0x150 [xfs]
  ...

In this patch we make the unlinked list removal a deferred operation,
i.e. log an iunlink remove intent and then do the removal after the
RENAME_WHITEOUT transaction has committed; the iunlink remove intent
(IRI) and done (IRD) log items are provided for this.

Change the ordering of the operations in the xfs_rename() function
so that the AGF lock is held in the RENAME_WHITEOUT transaction and
the AGI lock is held in its own transaction, matching the rest of the code.
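
After this change the sequence is, roughly:

  tp1 (tmpfile):   allocate the whiteout inode and leave it on the
                   unlinked list (takes the AGI)
  tp2 (rename):    dirent updates and block alloc/free (takes the
                   AGF), log the IRI intent
  tp3 (deferred):  xfs_bumplink(wip) and xfs_iunlink_remove(wip)
                   (takes the AGI), log the IRD done item

so no single transaction locks the AGF before the AGI.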

Signed-off-by: kaixuxia <kaixuxia@tencent.com>
---
  fs/xfs/Makefile                |   1 +
  fs/xfs/libxfs/xfs_defer.c      |   1 +
  fs/xfs/libxfs/xfs_defer.h      |   2 +
  fs/xfs/libxfs/xfs_log_format.h |  27 ++-
  fs/xfs/xfs_inode.c             |  36 +---
  fs/xfs/xfs_inode.h             |   3 +
  fs/xfs/xfs_iunlinkrm_item.c    | 458 +++++++++++++++++++++++++++++++++++++++++
  fs/xfs/xfs_iunlinkrm_item.h    |  67 ++++++
  fs/xfs/xfs_log.c               |   2 +
  fs/xfs/xfs_log_recover.c       | 148 +++++++++++++
  fs/xfs/xfs_super.c             |  17 ++
  fs/xfs/xfs_trans.h             |   2 +
  12 files changed, 733 insertions(+), 31 deletions(-)
  create mode 100644 fs/xfs/xfs_iunlinkrm_item.c
  create mode 100644 fs/xfs/xfs_iunlinkrm_item.h

Comments

Brian Foster Aug. 13, 2019, 1:36 p.m. UTC | #1
On Tue, Aug 13, 2019 at 07:17:33PM +0800, kaixuxia wrote:
> When performing a rename operation with the RENAME_WHITEOUT flag, we
> first hold the AGF lock to allocate or free extents while manipulating
> the dirents, and then do the xfs_iunlink_remove() call last, which takes
> the AGI lock to update the tmpfile; so here we lock the AGF before the AGI.
> 

IIUC, the whiteout use case is that we're renaming a file, but the
source dentry must be replaced with a magic whiteout inode rather than
be removed. Therefore, xfs_rename() allocates the whiteout inode as a
tmpfile first in a separate transaction, updates the target dentry with
the source inode, replaces the source dentry to point to the whiteout
inode and finally removes the whiteout inode from the unlinked list
(since it is a tmpfile). This leads to the problem described below
because the rename transaction ends up doing directory block allocs
(locking the AGF) followed by the unlinked list remove (locking the
AGI).

My understanding from reading the code is that this is primarily to
cleanly handle error scenarios. If anything fails after we've allocated
the whiteout tmpfile, it's simply left on the unlinked list and so the
filesystem remains in a consistent/recoverable state. Given that, the
solution here seems like overkill to me. For one, I thought background
unlinked list removal was already on our roadmap (Darrick might have
been looking at that and may already have a prototype as well). Also,
unlinked list removal occurs at log recovery time already. That's
somewhat of an existing purpose of the list, which makes a deferred
unlinked list removal operation superfluous in more traditional cases
where unlinked list removal doesn't require consistency with a directory
operation.

Functional discussion aside.. from a complexity standpoint I'm wondering
if we could do something much more simple like acquire the AGI lock for
a whiteout inode earlier in xfs_rename(). For example, suppose we did
something like:

	/*
	 * Acquire the whiteout agi to preserve locking order in anticipation of
	 * unlinked list removal.
	 */
	if (wip)
		xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, wip->i_ino), &agibp);

... after we allocate the transaction but before we do any directory ops
that can result in block allocations. Would that prevent the problem
you've observed?

Brian

> The big problem here is that we have an ordering constraint on AGF
> and AGI locking - inode allocation locks the AGI, then can allocate
> a new extent for new inodes, locking the AGF after the AGI. Hence
> the ordering that is imposed by other parts of the code is AGI before
> AGF. So we get the ABBA agi&agf deadlock here.
> 
> Process A:
> Call trace:
>  ? __schedule+0x2bd/0x620
>  schedule+0x33/0x90
>  schedule_timeout+0x17d/0x290
>  __down_common+0xef/0x125
>  ? xfs_buf_find+0x215/0x6c0 [xfs]
>  down+0x3b/0x50
>  xfs_buf_lock+0x34/0xf0 [xfs]
>  xfs_buf_find+0x215/0x6c0 [xfs]
>  xfs_buf_get_map+0x37/0x230 [xfs]
>  xfs_buf_read_map+0x29/0x190 [xfs]
>  xfs_trans_read_buf_map+0x13d/0x520 [xfs]
>  xfs_read_agf+0xa6/0x180 [xfs]
>  ? schedule_timeout+0x17d/0x290
>  xfs_alloc_read_agf+0x52/0x1f0 [xfs]
>  xfs_alloc_fix_freelist+0x432/0x590 [xfs]
>  ? down+0x3b/0x50
>  ? xfs_buf_lock+0x34/0xf0 [xfs]
>  ? xfs_buf_find+0x215/0x6c0 [xfs]
>  xfs_alloc_vextent+0x301/0x6c0 [xfs]
>  xfs_ialloc_ag_alloc+0x182/0x700 [xfs]
>  ? _xfs_trans_bjoin+0x72/0xf0 [xfs]
>  xfs_dialloc+0x116/0x290 [xfs]
>  xfs_ialloc+0x6d/0x5e0 [xfs]
>  ? xfs_log_reserve+0x165/0x280 [xfs]
>  xfs_dir_ialloc+0x8c/0x240 [xfs]
>  xfs_create+0x35a/0x610 [xfs]
>  xfs_generic_create+0x1f1/0x2f0 [xfs]
>  ...
> 
> Process B:
> Call trace:
>  ? __schedule+0x2bd/0x620
>  ? xfs_bmapi_allocate+0x245/0x380 [xfs]
>  schedule+0x33/0x90
>  schedule_timeout+0x17d/0x290
>  ? xfs_buf_find+0x1fd/0x6c0 [xfs]
>  __down_common+0xef/0x125
>  ? xfs_buf_get_map+0x37/0x230 [xfs]
>  ? xfs_buf_find+0x215/0x6c0 [xfs]
>  down+0x3b/0x50
>  xfs_buf_lock+0x34/0xf0 [xfs]
>  xfs_buf_find+0x215/0x6c0 [xfs]
>  xfs_buf_get_map+0x37/0x230 [xfs]
>  xfs_buf_read_map+0x29/0x190 [xfs]
>  xfs_trans_read_buf_map+0x13d/0x520 [xfs]
>  xfs_read_agi+0xa8/0x160 [xfs]
>  xfs_iunlink_remove+0x6f/0x2a0 [xfs]
>  ? current_time+0x46/0x80
>  ? xfs_trans_ichgtime+0x39/0xb0 [xfs]
>  xfs_rename+0x57a/0xae0 [xfs]
>  xfs_vn_rename+0xe4/0x150 [xfs]
>  ...
> 
> In this patch we make the unlinked list removal a deferred operation,
> i.e. log an iunlink remove intent and then do the removal after the
> RENAME_WHITEOUT transaction has committed; the iunlink remove intent
> (IRI) and done (IRD) log items are provided for this.
> 
> Change the ordering of the operations in the xfs_rename() function
> so that the AGF lock is held in the RENAME_WHITEOUT transaction and
> the AGI lock is held in its own transaction, matching the rest of the code.
> 
> Signed-off-by: kaixuxia <kaixuxia@tencent.com>
> ---
>  fs/xfs/Makefile                |   1 +
>  fs/xfs/libxfs/xfs_defer.c      |   1 +
>  fs/xfs/libxfs/xfs_defer.h      |   2 +
>  fs/xfs/libxfs/xfs_log_format.h |  27 ++-
>  fs/xfs/xfs_inode.c             |  36 +---
>  fs/xfs/xfs_inode.h             |   3 +
>  fs/xfs/xfs_iunlinkrm_item.c    | 458 +++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_iunlinkrm_item.h    |  67 ++++++
>  fs/xfs/xfs_log.c               |   2 +
>  fs/xfs/xfs_log_recover.c       | 148 +++++++++++++
>  fs/xfs/xfs_super.c             |  17 ++
>  fs/xfs/xfs_trans.h             |   2 +
>  12 files changed, 733 insertions(+), 31 deletions(-)
>  create mode 100644 fs/xfs/xfs_iunlinkrm_item.c
>  create mode 100644 fs/xfs/xfs_iunlinkrm_item.h
> 
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 06b68b6..9d5012e 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -106,6 +106,7 @@ xfs-y				+= xfs_log.o \
>  				   xfs_inode_item.o \
>  				   xfs_refcount_item.o \
>  				   xfs_rmap_item.o \
> +				   xfs_iunlinkrm_item.o \
>  				   xfs_log_recover.o \
>  				   xfs_trans_ail.o \
>  				   xfs_trans_buf.o
> diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
> index eb2be2a..a0f0a3d 100644
> --- a/fs/xfs/libxfs/xfs_defer.c
> +++ b/fs/xfs/libxfs/xfs_defer.c
> @@ -176,6 +176,7 @@
>  	[XFS_DEFER_OPS_TYPE_RMAP]	= &xfs_rmap_update_defer_type,
>  	[XFS_DEFER_OPS_TYPE_FREE]	= &xfs_extent_free_defer_type,
>  	[XFS_DEFER_OPS_TYPE_AGFL_FREE]	= &xfs_agfl_free_defer_type,
> +	[XFS_DEFER_OPS_TYPE_IUNRE]	= &xfs_iunlink_remove_defer_type,
>  };
> 
>  /*
> diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
> index 7c28d76..9e91a36 100644
> --- a/fs/xfs/libxfs/xfs_defer.h
> +++ b/fs/xfs/libxfs/xfs_defer.h
> @@ -17,6 +17,7 @@ enum xfs_defer_ops_type {
>  	XFS_DEFER_OPS_TYPE_RMAP,
>  	XFS_DEFER_OPS_TYPE_FREE,
>  	XFS_DEFER_OPS_TYPE_AGFL_FREE,
> +	XFS_DEFER_OPS_TYPE_IUNRE,
>  	XFS_DEFER_OPS_TYPE_MAX,
>  };
> 
> @@ -60,5 +61,6 @@ struct xfs_defer_op_type {
>  extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
>  extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
>  extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
> +extern const struct xfs_defer_op_type xfs_iunlink_remove_defer_type;
> 
>  #endif /* __XFS_DEFER_H__ */
> diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
> index e5f97c6..dc85b28 100644
> --- a/fs/xfs/libxfs/xfs_log_format.h
> +++ b/fs/xfs/libxfs/xfs_log_format.h
> @@ -117,7 +117,9 @@ struct xfs_unmount_log_format {
>  #define XLOG_REG_TYPE_CUD_FORMAT	24
>  #define XLOG_REG_TYPE_BUI_FORMAT	25
>  #define XLOG_REG_TYPE_BUD_FORMAT	26
> -#define XLOG_REG_TYPE_MAX		26
> +#define XLOG_REG_TYPE_IRI_FORMAT	27
> +#define XLOG_REG_TYPE_IRD_FORMAT	28
> +#define XLOG_REG_TYPE_MAX		28
> 
>  /*
>   * Flags to log operation header
> @@ -240,6 +242,8 @@ struct xfs_unmount_log_format {
>  #define	XFS_LI_CUD		0x1243
>  #define	XFS_LI_BUI		0x1244	/* bmbt update intent */
>  #define	XFS_LI_BUD		0x1245
> +#define	XFS_LI_IRI		0x1246	/* iunlink remove intent */
> +#define	XFS_LI_IRD		0x1247
> 
>  #define XFS_LI_TYPE_DESC \
>  	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
> @@ -255,7 +259,9 @@ struct xfs_unmount_log_format {
>  	{ XFS_LI_CUI,		"XFS_LI_CUI" }, \
>  	{ XFS_LI_CUD,		"XFS_LI_CUD" }, \
>  	{ XFS_LI_BUI,		"XFS_LI_BUI" }, \
> -	{ XFS_LI_BUD,		"XFS_LI_BUD" }
> +	{ XFS_LI_BUD,		"XFS_LI_BUD" }, \
> +	{ XFS_LI_IRI,		"XFS_LI_IRI" }, \
> +	{ XFS_LI_IRD,		"XFS_LI_IRD" }
> 
>  /*
>   * Inode Log Item Format definitions.
> @@ -773,6 +779,23 @@ struct xfs_bud_log_format {
>  };
> 
>  /*
> + * This is the structure used to lay out iri&ird log items in the log.
> + */
> +typedef struct xfs_iri_log_format {
> +	uint16_t		iri_type;	/* iri log item type */
> +	uint16_t		iri_size;	/* size of this item */
> +	uint64_t		iri_id;		/* id of corresponding iri */
> +	uint64_t		wip_ino;	/* inode number */
> +} xfs_iri_log_format_t;
> +
> +typedef struct xfs_ird_log_format {
> +	uint16_t		ird_type;	/* ird log item type */
> +	uint16_t		ird_size;	/* size of this item */
> +	uint64_t		ird_iri_id;	/* id of corresponding iri */
> +	uint64_t		wip_ino;	/* inode number */
> +} xfs_ird_log_format_t;
> +
> +/*
>   * Dquot Log format definitions.
>   *
>   * The first two fields must be the type and size fitting into
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 6467d5e..7bb3102 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -35,6 +35,7 @@
>  #include "xfs_log.h"
>  #include "xfs_bmap_btree.h"
>  #include "xfs_reflink.h"
> +#include "xfs_iunlinkrm_item.h"
> 
>  kmem_zone_t *xfs_inode_zone;
> 
> @@ -46,7 +47,6 @@
> 
>  STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
>  STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
> -STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
> 
>  /*
>   * helper function to extract extent size hint from inode
> @@ -1110,7 +1110,7 @@
>  /*
>   * Increment the link count on an inode & log the change.
>   */
> -static void
> +void
>  xfs_bumplink(
>  	xfs_trans_t *tp,
>  	xfs_inode_t *ip)
> @@ -2406,7 +2406,7 @@ struct xfs_iunlink {
>  /*
>   * Pull the on-disk inode from the AGI unlinked list.
>   */
> -STATIC int
> +int
>  xfs_iunlink_remove(
>  	struct xfs_trans	*tp,
>  	struct xfs_inode	*ip)
> @@ -3261,8 +3261,6 @@ struct xfs_iunlink {
>  	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
>  	if (target_ip)
>  		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
> -	if (wip)
> -		xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
> 
>  	/*
>  	 * If we are using project inheritance, we only allow renames
> @@ -3417,35 +3415,15 @@ struct xfs_iunlink {
>  	if (error)
>  		goto out_trans_cancel;
> 
> -	/*
> -	 * For whiteouts, we need to bump the link count on the whiteout inode.
> -	 * This means that failures all the way up to this point leave the inode
> -	 * on the unlinked list and so cleanup is a simple matter of dropping
> -	 * the remaining reference to it. If we fail here after bumping the link
> -	 * count, we're shutting down the filesystem so we'll never see the
> -	 * intermediate state on disk.
> -	 */
> -	if (wip) {
> -		ASSERT(VFS_I(wip)->i_nlink == 0);
> -		xfs_bumplink(tp, wip);
> -		error = xfs_iunlink_remove(tp, wip);
> -		if (error)
> -			goto out_trans_cancel;
> -		xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
> -
> -		/*
> -		 * Now we have a real link, clear the "I'm a tmpfile" state
> -		 * flag from the inode so it doesn't accidentally get misused in
> -		 * future.
> -		 */
> -		VFS_I(wip)->i_state &= ~I_LINKABLE;
> -	}
> -
>  	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
>  	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
>  	if (new_parent)
>  		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
> 
> +	/* add the iunlink remove intent to the tp */
> +	if (wip)
> +		xfs_iunlink_remove_add(tp, wip);
> +
>  	error = xfs_finish_rename(tp);
>  	if (wip)
>  		xfs_irele(wip);
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 558173f..f8c30ca 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -20,6 +20,7 @@
>  struct xfs_mount;
>  struct xfs_trans;
>  struct xfs_dquot;
> +struct xfs_trans;
> 
>  typedef struct xfs_inode {
>  	/* Inode linking and identification information. */
> @@ -414,6 +415,7 @@ enum layout_break_reason {
>  void		xfs_inactive(struct xfs_inode *ip);
>  int		xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
>  			   struct xfs_inode **ipp, struct xfs_name *ci_name);
> +void		xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
>  int		xfs_create(struct xfs_inode *dp, struct xfs_name *name,
>  			   umode_t mode, dev_t rdev, struct xfs_inode **ipp);
>  int		xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode,
> @@ -436,6 +438,7 @@ int		xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
>  uint		xfs_ilock_attr_map_shared(struct xfs_inode *);
> 
>  uint		xfs_ip2xflags(struct xfs_inode *);
> +int		xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
>  int		xfs_ifree(struct xfs_trans *, struct xfs_inode *);
>  int		xfs_itruncate_extents_flags(struct xfs_trans **,
>  				struct xfs_inode *, int, xfs_fsize_t, int);
> diff --git a/fs/xfs/xfs_iunlinkrm_item.c b/fs/xfs/xfs_iunlinkrm_item.c
> new file mode 100644
> index 0000000..4e38329
> --- /dev/null
> +++ b/fs/xfs/xfs_iunlinkrm_item.c
> @@ -0,0 +1,458 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * Copyright (C) 2019 Tencent.  All Rights Reserved.
> + * Author: Kaixuxia <kaixuxia@tencent.com>
> + */
> +#include "xfs.h"
> +#include "xfs_fs.h"
> +#include "xfs_format.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_bit.h"
> +#include "xfs_shared.h"
> +#include "xfs_mount.h"
> +#include "xfs_defer.h"
> +#include "xfs_trans.h"
> +#include "xfs_trans_priv.h"
> +#include "xfs_log.h"
> +#include "xfs_alloc.h"
> +#include "xfs_inode.h"
> +#include "xfs_icache.h"
> +#include "xfs_iunlinkrm_item.h"
> +
> +kmem_zone_t	*xfs_iri_zone;
> +kmem_zone_t	*xfs_ird_zone;
> +
> +static inline struct xfs_iri_log_item *IRI_ITEM(struct xfs_log_item *lip)
> +{
> +	return container_of(lip, struct xfs_iri_log_item, iri_item);
> +}
> +
> +void
> +xfs_iri_item_free(
> +	struct xfs_iri_log_item *irip)
> +{
> +	kmem_zone_free(xfs_iri_zone, irip);
> +}
> +
> +/*
> + * Freeing the iri requires that we remove it from the AIL if it has already
> + * been placed there. However, the IRI may not yet have been placed in the AIL
> + * when called by xfs_iri_release() from IRD processing due to the ordering of
> + * committed vs unpin operations in bulk insert operations. Hence the reference
> + * count to ensure only the last caller frees the IRI.
> + */
> +void
> +xfs_iri_release(
> +	struct xfs_iri_log_item *irip)
> +{
> +	ASSERT(atomic_read(&irip->iri_refcount) > 0);
> +	if (atomic_dec_and_test(&irip->iri_refcount)) {
> +		xfs_trans_ail_remove(&irip->iri_item, SHUTDOWN_LOG_IO_ERROR);
> +		xfs_iri_item_free(irip);
> +	}
> +}
> +
> +static inline int
> +xfs_iri_item_sizeof(
> +	struct xfs_iri_log_item *irip)
> +{
> +	return sizeof(struct xfs_iri_log_format);
> +}
> +
> +STATIC void
> +xfs_iri_item_size(
> +	struct xfs_log_item	*lip,
> +	int			*nvecs,
> +	int			*nbytes)
> +{
> +	*nvecs += 1;
> +	*nbytes += xfs_iri_item_sizeof(IRI_ITEM(lip));
> +}
> +
> +STATIC void
> +xfs_iri_item_format(
> +	struct xfs_log_item	*lip,
> +	struct xfs_log_vec	*lv)
> +{
> +	struct xfs_iri_log_item	*irip = IRI_ITEM(lip);
> +	struct xfs_log_iovec	*vecp = NULL;
> +
> +	irip->iri_format.iri_type = XFS_LI_IRI;
> +	irip->iri_format.iri_size = 1;
> +
> +	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_IRI_FORMAT,
> +			&irip->iri_format,
> +			xfs_iri_item_sizeof(irip));
> +}
> +
> +/*
> + * The unpin operation is the last place an IRI is manipulated in the log. It is
> + * either inserted in the AIL or aborted in the event of a log I/O error. In
> + * either case, the IRI transaction has been successfully committed to make it
> + * this far. Therefore, we expect whoever committed the IRI to either construct
> + * and commit the IRD or drop the IRD's reference in the event of error. Simply
> + * drop the log's IRI reference now that the log is done with it.
> + */
> +STATIC void
> +xfs_iri_item_unpin(
> +	struct xfs_log_item	*lip,
> +	int			remove)
> +{
> +	struct xfs_iri_log_item *irip = IRI_ITEM(lip);
> +	xfs_iri_release(irip);
> +}
> +
> +/*
> + * The IRI has been either committed or aborted if the transaction has been
> + * cancelled. If the transaction was cancelled, an IRD isn't going to be
> + * constructed and thus we free the IRI here directly.
> + */
> +STATIC void
> +xfs_iri_item_release(
> +	struct xfs_log_item     *lip)
> +{
> +	xfs_iri_release(IRI_ITEM(lip));
> +}
> +
> +/*
> + * This is the ops vector shared by all iri log items.
> + */
> +static const struct xfs_item_ops xfs_iri_item_ops = {
> +	.iop_size	= xfs_iri_item_size,
> +	.iop_format	= xfs_iri_item_format,
> +	.iop_unpin	= xfs_iri_item_unpin,
> +	.iop_release	= xfs_iri_item_release,
> +};
> +
> +/*
> + * Allocate and initialize an iri item with the given wip ino.
> + */
> +struct xfs_iri_log_item *
> +xfs_iri_init(struct xfs_mount  *mp,
> +	     uint		count)
> +{
> +	struct xfs_iri_log_item *irip;
> +
> +	irip = kmem_zone_zalloc(xfs_iri_zone, KM_SLEEP);
> +
> +	xfs_log_item_init(mp, &irip->iri_item, XFS_LI_IRI, &xfs_iri_item_ops);
> +	irip->iri_format.iri_id = (uintptr_t)(void *)irip;
> +	atomic_set(&irip->iri_refcount, 2);
> +
> +	return irip;
> +}
> +
> +static inline struct xfs_ird_log_item *IRD_ITEM(struct xfs_log_item *lip)
> +{
> +	return container_of(lip, struct xfs_ird_log_item, ird_item);
> +}
> +
> +STATIC void
> +xfs_ird_item_free(struct xfs_ird_log_item *irdp)
> +{
> +	kmem_zone_free(xfs_ird_zone, irdp);
> +}
> +
> +/*
> + * This returns the number of iovecs needed to log the given ird item.
> + * We only need 1 iovec for an ird item.  It just logs the ird_log_format
> + * structure.
> + */
> +STATIC void
> +xfs_ird_item_size(
> +	struct xfs_log_item	*lip,
> +	int			*nvecs,
> +	int			*nbytes)
> +{
> +	*nvecs += 1;
> +	*nbytes += sizeof(struct xfs_ird_log_format);
> +}
> +
> +STATIC void
> +xfs_ird_item_format(
> +	struct xfs_log_item	*lip,
> +	struct xfs_log_vec	*lv)
> +{
> +	struct xfs_ird_log_item *irdp = IRD_ITEM(lip);
> +	struct xfs_log_iovec	*vecp = NULL;
> +
> +	irdp->ird_format.ird_type = XFS_LI_IRD;
> +	irdp->ird_format.ird_size = 1;
> +
> +	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_IRD_FORMAT, &irdp->ird_format,
> +			sizeof(struct xfs_ird_log_format));
> +}
> +
> +/*
> + * The IRD is either committed or aborted if the transaction is cancelled. If
> + * the transaction is cancelled, drop our reference to the IRI and free the
> + * IRD.
> + */
> +STATIC void
> +xfs_ird_item_release(
> +	struct xfs_log_item	*lip)
> +{
> +	struct xfs_ird_log_item	*irdp = IRD_ITEM(lip);
> +
> +	xfs_iri_release(irdp->ird_irip);
> +	xfs_ird_item_free(irdp);
> +}
> +
> +static const struct xfs_item_ops xfs_ird_item_ops = {
> +	.flags		= XFS_ITEM_RELEASE_WHEN_COMMITTED,
> +	.iop_size	= xfs_ird_item_size,
> +	.iop_format	= xfs_ird_item_format,
> +	.iop_release	= xfs_ird_item_release,
> +};
> +
> +static struct xfs_ird_log_item *
> +xfs_trans_get_ird(
> +	struct xfs_trans		*tp,
> +	struct xfs_iri_log_item		*irip)
> +{
> +	xfs_ird_log_item_t	*irdp;
> +
> +	ASSERT(tp != NULL);
> +
> +	irdp = kmem_zone_zalloc(xfs_ird_zone, KM_SLEEP);
> +	xfs_log_item_init(tp->t_mountp, &irdp->ird_item, XFS_LI_IRD,
> +			  &xfs_ird_item_ops);
> +	irdp->ird_irip = irip;
> +	irdp->ird_format.wip_ino = irip->iri_format.wip_ino;
> +	irdp->ird_format.ird_iri_id = irip->iri_format.iri_id;
> +
> +	xfs_trans_add_item(tp, &irdp->ird_item);
> +	return irdp;
> +}
> +
> +/* Record an iunlink remove intent. */
> +int
> +xfs_iunlink_remove_add(
> +	struct xfs_trans	*tp,
> +	struct xfs_inode	*wip)
> +{
> +	struct xfs_iunlink_remove_intent	*ii;
> +
> +	ii = kmem_alloc(sizeof(struct xfs_iunlink_remove_intent),
> +			KM_SLEEP | KM_NOFS);
> +	INIT_LIST_HEAD(&ii->ri_list);
> +	ii->wip = wip;
> +
> +	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_IUNRE, &ii->ri_list);
> +	return 0;
> +}
> +
> +/* Sort iunlink remove intents by AG. */
> +static int
> +xfs_iunlink_remove_diff_items(
> +	void				*priv,
> +	struct list_head		*a,
> +	struct list_head		*b)
> +{
> +	struct xfs_mount			*mp = priv;
> +	struct xfs_iunlink_remove_intent	*ra;
> +	struct xfs_iunlink_remove_intent	*rb;
> +
> +	ra = container_of(a, struct xfs_iunlink_remove_intent, ri_list);
> +	rb = container_of(b, struct xfs_iunlink_remove_intent, ri_list);
> +	return	XFS_INO_TO_AGNO(mp, ra->wip->i_ino) -
> +		XFS_INO_TO_AGNO(mp, rb->wip->i_ino);
> +}
> +
> +/* Get an IRI */
> +STATIC void *
> +xfs_iunlink_remove_create_intent(
> +	struct xfs_trans		*tp,
> +	unsigned int			count)
> +{
> +	struct xfs_iri_log_item		*irip;
> +
> +	ASSERT(tp != NULL);
> +	ASSERT(count == 1);
> +
> +	irip = xfs_iri_init(tp->t_mountp, count);
> +	ASSERT(irip != NULL);
> +
> +	/*
> +	 * Get a log_item_desc to point at the new item.
> +	 */
> +	xfs_trans_add_item(tp, &irip->iri_item);
> +	return irip;
> +}
> +
> +/* Log an iunlink remove to the intent item. */
> +STATIC void
> +xfs_iunlink_remove_log_item(
> +	struct xfs_trans		*tp,
> +	void				*intent,
> +	struct list_head		*item)
> +{
> +	struct xfs_iri_log_item			*irip = intent;
> +	struct xfs_iunlink_remove_intent	*iunre;
> +
> +	iunre = container_of(item, struct xfs_iunlink_remove_intent, ri_list);
> +
> +	tp->t_flags |= XFS_TRANS_DIRTY;
> +	set_bit(XFS_LI_DIRTY, &irip->iri_item.li_flags);
> +
> +	irip->iri_format.wip_ino = (uint64_t)(iunre->wip->i_ino);
> +}
> +
> +/* Get an IRD so we can process all the deferred iunlink remove. */
> +STATIC void *
> +xfs_iunlink_remove_create_done(
> +	struct xfs_trans		*tp,
> +	void				*intent,
> +	unsigned int			count)
> +{
> +	return xfs_trans_get_ird(tp, intent);
> +}
> +
> +/*
> + * For whiteouts, we need to bump the link count on the whiteout inode.
> + * This means that failures all the way up to this point leave the inode
> + * on the unlinked list and so cleanup is a simple matter of dropping
> + * the remaining reference to it. If we fail here after bumping the link
> + * count, we're shutting down the filesystem so we'll never see the
> + * intermediate state on disk.
> + */
> +static int
> +xfs_trans_log_finish_iunlink_remove(
> +	struct xfs_trans		*tp,
> +	struct xfs_ird_log_item		*irdp,
> +	struct xfs_inode		*wip)
> +{
> +	int 	error;
> +
> +	ASSERT(xfs_isilocked(wip, XFS_ILOCK_EXCL));
> +
> +	xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
> +
> +	ASSERT(VFS_I(wip)->i_nlink == 0);
> +	xfs_bumplink(tp, wip);
> +	error = xfs_iunlink_remove(tp, wip);
> +	xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
> +	/*
> +	 * Now we have a real link, clear the "I'm a tmpfile" state
> +	 * flag from the inode so it doesn't accidentally get misused in
> +	 * future.
> +	 */
> +	VFS_I(wip)->i_state &= ~I_LINKABLE;
> +
> +	/*
> +	 * Mark the transaction dirty, even on error. This ensures the
> +	 * transaction is aborted, which:
> +	 *
> +	 * 1.) releases the IRI and frees the IRD
> +	 * 2.) shuts down the filesystem
> +	 */
> +	tp->t_flags |= XFS_TRANS_DIRTY;
> +	set_bit(XFS_LI_DIRTY, &irdp->ird_item.li_flags);
> +
> +	return error;
> +}
> +
> +/* Process a deferred iunlink remove. */
> +STATIC int
> +xfs_iunlink_remove_finish_item(
> +	struct xfs_trans		*tp,
> +	struct list_head		*item,
> +	void				*done_item,
> +	void				**state)
> +{
> +	struct xfs_iunlink_remove_intent	*iunre;
> +	int					error;
> +
> +	iunre = container_of(item, struct xfs_iunlink_remove_intent, ri_list);
> +	error = xfs_trans_log_finish_iunlink_remove(tp, done_item,
> +			iunre->wip);
> +	kmem_free(iunre);
> +	return error;
> +}
> +
> +/* Abort all pending IRIs. */
> +STATIC void
> +xfs_iunlink_remove_abort_intent(
> +	void		*intent)
> +{
> +	xfs_iri_release(intent);
> +}
> +
> +/* Cancel a deferred iunlink remove. */
> +STATIC void
> +xfs_iunlink_remove_cancel_item(
> +	struct list_head		*item)
> +{
> +	struct xfs_iunlink_remove_intent	*iunre;
> +
> +	iunre = container_of(item, struct xfs_iunlink_remove_intent, ri_list);
> +	kmem_free(iunre);
> +}
> +
> +const struct xfs_defer_op_type xfs_iunlink_remove_defer_type = {
> +	.diff_items	= xfs_iunlink_remove_diff_items,
> +	.create_intent	= xfs_iunlink_remove_create_intent,
> +	.abort_intent	= xfs_iunlink_remove_abort_intent,
> +	.log_item	= xfs_iunlink_remove_log_item,
> +	.create_done	= xfs_iunlink_remove_create_done,
> +	.finish_item	= xfs_iunlink_remove_finish_item,
> +	.cancel_item	= xfs_iunlink_remove_cancel_item,
> +};
> +
> +/*
> + * Process an iunlink remove intent item that was recovered from the log.
> + */
> +int
> +xfs_iri_recover(
> +	struct xfs_trans		*parent_tp,
> +	struct xfs_iri_log_item		*irip)
> +{
> +	int				error = 0;
> +	struct xfs_trans		*tp;
> +	xfs_ino_t			ino;
> +	struct xfs_inode		*ip;
> +	struct xfs_mount		*mp = parent_tp->t_mountp;
> +	struct xfs_ird_log_item		*irdp;
> +
> +	ASSERT(!test_bit(XFS_IRI_RECOVERED, &irip->iri_flags));
> +
> +	ino = irip->iri_format.wip_ino;
> +	if (ino == NULLFSINO || !xfs_verify_dir_ino(mp, ino)) {
> +		xfs_alert(mp, "IRI recover used bad inode ino 0x%llx!", ino);
> +		set_bit(XFS_IRI_RECOVERED, &irip->iri_flags);
> +		xfs_iri_release(irip);
> +		return -EIO;
> +	}
> +	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
> +	if (error)
> +		return error;
> +
> +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
> +	if (error)
> +		return error;
> +	irdp = xfs_trans_get_ird(tp, irip);
> +
> +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> +	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
> +
> +	ASSERT(VFS_I(ip)->i_nlink == 0);
> +	VFS_I(ip)->i_state |= I_LINKABLE;
> +	xfs_bumplink(tp, ip);
> +	error = xfs_iunlink_remove(tp, ip);
> +	if (error)
> +		goto abort_error;
> +	VFS_I(ip)->i_state &= ~I_LINKABLE;
> +	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> +
> +	tp->t_flags |= XFS_TRANS_DIRTY;
> +	set_bit(XFS_LI_DIRTY, &irdp->ird_item.li_flags);
> +
> +	set_bit(XFS_IRI_RECOVERED, &irip->iri_flags);
> +	error = xfs_trans_commit(tp);
> +	return error;
> +
> +abort_error:
> +	xfs_trans_cancel(tp);
> +	return error;
> +}
> diff --git a/fs/xfs/xfs_iunlinkrm_item.h b/fs/xfs/xfs_iunlinkrm_item.h
> new file mode 100644
> index 0000000..54c4ca3
> --- /dev/null
> +++ b/fs/xfs/xfs_iunlinkrm_item.h
> @@ -0,0 +1,67 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * Copyright (C) 2019 Tencent.  All Rights Reserved.
> + * Author: Kaixuxia <kaixuxia@tencent.com>
> + */
> +#ifndef	__XFS_IUNLINKRM_ITEM_H__
> +#define	__XFS_IUNLINKRM_ITEM_H__
> +
> +/*
> + * When performing a rename operation with the RENAME_WHITEOUT flag, we first hold the
> + * AGF lock to allocate or free extents while manipulating the dirents, and then do the
> + * xfs_iunlink_remove() call last, which takes the AGI lock to update the tmpfile; so
> + * here we lock the AGF before the AGI.
> + *
> + * The big problem here is that we have an ordering constraint on AGF and AGI locking -
> + * inode allocation locks the AGI, then can allocate a new extent for new inodes, locking
> + * the AGF after the AGI. Hence the ordering that is imposed by other parts of the code
> + * is AGI before AGF. So we get the ABBA agi&agf deadlock here.
> + *
> + * So make the unlinked list removal a deferred operation, i.e. log an iunlink remove
> + * intent and then do the removal after the RENAME_WHITEOUT transaction has committed;
> + * the iunlink remove intent (IRI) and done (IRD) log items are provided for this.
> + */
> +
> +/* kernel only IRI/IRD definitions */
> +
> +struct xfs_mount;
> +struct kmem_zone;
> +struct xfs_inode;
> +
> +/*
> + * Define IRI flag bits. Manipulated by set/clear/test_bit operators.
> + */
> +#define	XFS_IRI_RECOVERED		1
> +
> +/* This is the "iunlink remove intention" log item. It is used in conjunction
> + * with the "iunlink remove done" log item described below.
> + */
> +typedef struct xfs_iri_log_item {
> +	struct xfs_log_item	iri_item;
> +	atomic_t		iri_refcount;
> +	unsigned long		iri_flags;
> +	xfs_iri_log_format_t	iri_format;
> +} xfs_iri_log_item_t;
> +
> +/* This is the "iunlink remove done" log item. */
> +typedef struct xfs_ird_log_item {
> +	struct xfs_log_item	ird_item;
> +	xfs_iri_log_item_t	*ird_irip;
> +	xfs_ird_log_format_t	ird_format;
> +} xfs_ird_log_item_t;
> +
> +struct xfs_iunlink_remove_intent {
> +	struct list_head		ri_list;
> +	struct xfs_inode		*wip;
> +};
> +
> +extern struct kmem_zone	*xfs_iri_zone;
> +extern struct kmem_zone	*xfs_ird_zone;
> +
> +struct xfs_iri_log_item	*xfs_iri_init(struct xfs_mount *, uint);
> +void xfs_iri_item_free(struct xfs_iri_log_item *);
> +void xfs_iri_release(struct xfs_iri_log_item *);
> +int xfs_iri_recover(struct xfs_trans *, struct xfs_iri_log_item *);
> +int xfs_iunlink_remove_add(struct xfs_trans *, struct xfs_inode *);
> +
> +#endif	/* __XFS_IUNLINKRM_ITEM_H__ */
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index 00e9f5c..f87f510 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -2005,6 +2005,8 @@ STATIC void xlog_state_done_syncing(
>  	    REG_TYPE_STR(CUD_FORMAT, "cud_format"),
>  	    REG_TYPE_STR(BUI_FORMAT, "bui_format"),
>  	    REG_TYPE_STR(BUD_FORMAT, "bud_format"),
> +	    REG_TYPE_STR(IRI_FORMAT, "iri_format"),
> +	    REG_TYPE_STR(IRD_FORMAT, "ird_format"),
>  	};
>  	BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
>  #undef REG_TYPE_STR
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index 13d1d3e..a916f40 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -33,6 +33,7 @@
>  #include "xfs_buf_item.h"
>  #include "xfs_refcount_item.h"
>  #include "xfs_bmap_item.h"
> +#include "xfs_iunlinkrm_item.h"
> 
>  #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
> 
> @@ -1885,6 +1886,8 @@ struct xfs_buf_cancel {
>  		case XFS_LI_CUD:
>  		case XFS_LI_BUI:
>  		case XFS_LI_BUD:
> +		case XFS_LI_IRI:
> +		case XFS_LI_IRD:
>  			trace_xfs_log_recover_item_reorder_tail(log,
>  							trans, item, pass);
>  			list_move_tail(&item->ri_list, &inode_list);
> @@ -3752,6 +3755,96 @@ struct xfs_buf_cancel {
>  }
> 
>  /*
> + * This routine is called to create an in-core iunlink remove intent
> + * item from the iri format structure which was logged on disk.
> + * It allocates an in-core iri, copies the inode from the format
> + * structure into it, and adds the iri to the AIL with the given
> + * LSN.
> + */
> +STATIC int
> +xlog_recover_iri_pass2(
> +	struct xlog			*log,
> +	struct xlog_recover_item	*item,
> +	xfs_lsn_t			lsn)
> +{
> +	xfs_mount_t		*mp = log->l_mp;
> +	xfs_iri_log_item_t	*irip;
> +	xfs_iri_log_format_t	*iri_formatp;
> +
> +	iri_formatp = item->ri_buf[0].i_addr;
> +
> +	irip = xfs_iri_init(mp, 1);
> +	if (item->ri_buf[0].i_len != sizeof(xfs_iri_log_format_t)) {
> +		xfs_iri_item_free(irip);
> +		return -EFSCORRUPTED;
> +	}
> +	irip->iri_format = *iri_formatp;
> +
> +	spin_lock(&log->l_ailp->ail_lock);
> +	/*
> +	 * The IRI has two references. One for the IRD and one for IRI to ensure
> +	 * it makes it into the AIL. Insert the IRI into the AIL directly and
> +	 * drop the IRI reference. Note that xfs_trans_ail_update() drops the
> +	 * AIL lock.
> +	 */
> +	xfs_trans_ail_update(log->l_ailp, &irip->iri_item, lsn);
> +	xfs_iri_release(irip);
> +	return 0;
> +}
> +
> +/*
> + * This routine is called when an IRD format structure is found in a committed
> + * transaction in the log. Its purpose is to cancel the corresponding IRI if it
> + * was still in the log. To do this it searches the AIL for the IRI with an id
> + * equal to that in the IRD format structure. If we find it we drop the IRD
> + * reference, which removes the IRI from the AIL and frees it.
> + */
> +STATIC int
> +xlog_recover_ird_pass2(
> +	struct xlog			*log,
> +	struct xlog_recover_item	*item)
> +{
> +	xfs_ird_log_format_t	*ird_formatp;
> +	xfs_iri_log_item_t	*irip = NULL;
> +	struct xfs_log_item	*lip;
> +	uint64_t		iri_id;
> +	struct xfs_ail_cursor	cur;
> +	struct xfs_ail		*ailp = log->l_ailp;
> +
> +	ird_formatp = item->ri_buf[0].i_addr;
> +	if (item->ri_buf[0].i_len != sizeof(xfs_ird_log_format_t))
> +		return -EFSCORRUPTED;
> +	iri_id = ird_formatp->ird_iri_id;
> +
> +	/*
> +	 * Search for the iri with the id in the ird format structure
> +	 * in the AIL.
> +	 */
> +	spin_lock(&ailp->ail_lock);
> +	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
> +	while (lip != NULL) {
> +		if (lip->li_type == XFS_LI_IRI) {
> +			irip = (xfs_iri_log_item_t *)lip;
> +			if (irip->iri_format.iri_id == iri_id) {
> +				/*
> +				 * Drop the IRD reference to the IRI. This
> +				 * removes the IRI from the AIL and frees it.
> +				 */
> +				spin_unlock(&ailp->ail_lock);
> +				xfs_iri_release(irip);
> +				spin_lock(&ailp->ail_lock);
> +				break;
> +			}
> +		}
> +		lip = xfs_trans_ail_cursor_next(ailp, &cur);
> +	}
> +	xfs_trans_ail_cursor_done(&cur);
> +	spin_unlock(&ailp->ail_lock);
> +
> +	return 0;
> +}
> +
> +/*
>   * This routine is called when an inode create format structure is found in a
>   * committed transaction in the log.  It's purpose is to initialise the inodes
>   * being allocated on disk. This requires us to get inode cluster buffers that
> @@ -3981,6 +4074,8 @@ struct xfs_buf_cancel {
>  	case XFS_LI_CUD:
>  	case XFS_LI_BUI:
>  	case XFS_LI_BUD:
> +	case XFS_LI_IRI:
> +	case XFS_LI_IRD:
>  	default:
>  		break;
>  	}
> @@ -4010,6 +4105,8 @@ struct xfs_buf_cancel {
>  	case XFS_LI_CUD:
>  	case XFS_LI_BUI:
>  	case XFS_LI_BUD:
> +	case XFS_LI_IRI:
> +	case XFS_LI_IRD:
>  		/* nothing to do in pass 1 */
>  		return 0;
>  	default:
> @@ -4052,6 +4149,10 @@ struct xfs_buf_cancel {
>  		return xlog_recover_bui_pass2(log, item, trans->r_lsn);
>  	case XFS_LI_BUD:
>  		return xlog_recover_bud_pass2(log, item);
> +	case XFS_LI_IRI:
> +		return xlog_recover_iri_pass2(log, item, trans->r_lsn);
> +	case XFS_LI_IRD:
> +		return xlog_recover_ird_pass2(log, item);
>  	case XFS_LI_DQUOT:
>  		return xlog_recover_dquot_pass2(log, buffer_list, item,
>  						trans->r_lsn);
> @@ -4721,6 +4822,46 @@ struct xfs_buf_cancel {
>  	spin_lock(&ailp->ail_lock);
>  }
> 
> +/* Recover the IRI if necessary. */
> +STATIC int
> +xlog_recover_process_iri(
> +	struct xfs_trans		*parent_tp,
> +	struct xfs_ail			*ailp,
> +	struct xfs_log_item		*lip)
> +{
> +	struct xfs_iri_log_item		*irip;
> +	int				error;
> +
> +	/*
> +	 * Skip IRIs that we've already processed.
> +	 */
> +	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
> +	if (test_bit(XFS_IRI_RECOVERED, &irip->iri_flags))
> +		return 0;
> +
> +	spin_unlock(&ailp->ail_lock);
> +	error = xfs_iri_recover(parent_tp, irip);
> +	spin_lock(&ailp->ail_lock);
> +
> +	return error;
> +}
> +
> +/* Release the IRI since we're cancelling everything. */
> +STATIC void
> +xlog_recover_cancel_iri(
> +	struct xfs_mount		*mp,
> +	struct xfs_ail			*ailp,
> +	struct xfs_log_item		*lip)
> +{
> +	struct xfs_iri_log_item         *irip;
> +
> +	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
> +
> +	spin_unlock(&ailp->ail_lock);
> +	xfs_iri_release(irip);
> +	spin_lock(&ailp->ail_lock);
> +}
> +
>  /* Is this log item a deferred action intent? */
>  static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
>  {
> @@ -4729,6 +4870,7 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
>  	case XFS_LI_RUI:
>  	case XFS_LI_CUI:
>  	case XFS_LI_BUI:
> +	case XFS_LI_IRI:
>  		return true;
>  	default:
>  		return false;
> @@ -4856,6 +4998,9 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
>  		case XFS_LI_BUI:
>  			error = xlog_recover_process_bui(parent_tp, ailp, lip);
>  			break;
> +		case XFS_LI_IRI:
> +			error = xlog_recover_process_iri(parent_tp, ailp, lip);
> +			break;
>  		}
>  		if (error)
>  			goto out;
> @@ -4912,6 +5057,9 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
>  		case XFS_LI_BUI:
>  			xlog_recover_cancel_bui(log->l_mp, ailp, lip);
>  			break;
> +		case XFS_LI_IRI:
> +			xlog_recover_cancel_iri(log->l_mp, ailp, lip);
> +			break;
>  		}
> 
>  		lip = xfs_trans_ail_cursor_next(ailp, &cur);
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index f945023..66742b7 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -34,6 +34,7 @@
>  #include "xfs_rmap_item.h"
>  #include "xfs_refcount_item.h"
>  #include "xfs_bmap_item.h"
> +#include "xfs_iunlinkrm_item.h"
>  #include "xfs_reflink.h"
> 
>  #include <linux/magic.h>
> @@ -1957,8 +1958,22 @@ struct proc_xfs_info {
>  	if (!xfs_bui_zone)
>  		goto out_destroy_bud_zone;
> 
> +	xfs_ird_zone = kmem_zone_init(sizeof(xfs_ird_log_item_t),
> +			"xfs_ird_item");
> +	if (!xfs_ird_zone)
> +		goto out_destroy_bui_zone;
> +
> +	xfs_iri_zone = kmem_zone_init(sizeof(xfs_iri_log_item_t),
> +			"xfs_iri_item");
> +	if (!xfs_iri_zone)
> +		goto out_destroy_ird_zone;
> +
>  	return 0;
> 
> + out_destroy_ird_zone:
> +	kmem_zone_destroy(xfs_ird_zone);
> + out_destroy_bui_zone:
> +	kmem_zone_destroy(xfs_bui_zone);
>   out_destroy_bud_zone:
>  	kmem_zone_destroy(xfs_bud_zone);
>   out_destroy_cui_zone:
> @@ -2007,6 +2022,8 @@ struct proc_xfs_info {
>  	 * destroy caches.
>  	 */
>  	rcu_barrier();
> +	kmem_zone_destroy(xfs_iri_zone);
> +	kmem_zone_destroy(xfs_ird_zone);
>  	kmem_zone_destroy(xfs_bui_zone);
>  	kmem_zone_destroy(xfs_bud_zone);
>  	kmem_zone_destroy(xfs_cui_zone);
> diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
> index 64d7f17..dd63eaa 100644
> --- a/fs/xfs/xfs_trans.h
> +++ b/fs/xfs/xfs_trans.h
> @@ -26,6 +26,8 @@
>  struct xfs_cud_log_item;
>  struct xfs_bui_log_item;
>  struct xfs_bud_log_item;
> +struct xfs_iri_log_item;
> +struct xfs_ird_log_item;
> 
>  struct xfs_log_item {
>  	struct list_head		li_ail;		/* AIL pointers */
> -- 
> 1.8.3.1
> 
> -- 
> kaixuxia
Darrick J. Wong Aug. 13, 2019, 2:20 p.m. UTC | #2
On Tue, Aug 13, 2019 at 09:36:14AM -0400, Brian Foster wrote:
> On Tue, Aug 13, 2019 at 07:17:33PM +0800, kaixuxia wrote:
> > > When performing a rename operation with the RENAME_WHITEOUT flag, we
> > > first hold the AGF lock to allocate or free extents while manipulating
> > > the dirents, and then do the xfs_iunlink_remove() call last, which takes
> > > the AGI lock to update the tmpfile; so here we lock the AGF before the AGI.
> > 
> 
> IIUC, the whiteout use case is that we're renaming a file, but the
> source dentry must be replaced with a magic whiteout inode rather than
> be removed. Therefore, xfs_rename() allocates the whiteout inode as a
> tmpfile first in a separate transaction, updates the target dentry with
> the source inode, replaces the source dentry to point to the whiteout
> inode and finally removes the whiteout inode from the unlinked list
> (since it is a tmpfile). This leads to the problem described below
> because the rename transaction ends up doing directory block allocs
> (locking the AGF) followed by the unlinked list remove (locking the
> AGI).
> 
> My understanding from reading the code is that this is primarily to
> cleanly handle error scenarios. If anything fails after we've allocated
> the whiteout tmpfile, it's simply left on the unlinked list and so the
> filesystem remains in a consistent/recoverable state. Given that, the
> solution here seems like overkill to me. For one, I thought background
> unlinked list removal was already on our roadmap (Darrick might have
> been looking at that and may already have a prototype as well). Also,
> unlinked list removal occurs at log recovery time already. That's
> somewhat of an existing purpose of the list, which makes a deferred
> unlinked list removal operation superfluous in more traditional cases
> where unlinked list removal doesn't require consistency with a directory
> operation.

Not to mention this doesn't fix the problem for existing filesystems,
because adding new log item types changes the on-disk log format and
therefore requires a log incompat feature bit to prevent old kernels
from trying to recover the log.

> Functional discussion aside.. from a complexity standpoint I'm wondering
> if we could do something much more simple like acquire the AGI lock for
> a whiteout inode earlier in xfs_rename(). For example, suppose we did
> something like:
> 
> 	/*
> 	 * Acquire the whiteout agi to preserve locking order in anticipation of
> 	 * unlinked list removal.
> 	 */
> 	if (wip)
> 		xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, wip->i_ino), &agibp);
> 
> ... after we allocate the transaction but before we do any directory ops
> that can result in block allocations. Would that prevent the problem
> you've observed?

I had the same thought, but fun question: if @wip is allocated in AG 1
but the dirent blocks come from AG 0, is that a problem?

Would it make more sense to expand the directory in one transaction,
roll it, and add the actual directory entry after that?
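
(For illustration only -- a rough, untested sketch of that idea;
xfs_dir_grow_one() is a hypothetical name for the expansion step, while
xfs_trans_roll() and xfs_dir_createname() are existing interfaces:)

	/*
	 * Expand the directory (AGF work) in its own transaction and
	 * roll, then add the dirent, so a later unlinked list removal
	 * can take the AGI without the AGF already being held.
	 */
	error = xfs_dir_grow_one(tp, target_dp);	/* hypothetical */
	if (error)
		goto out_trans_cancel;
	error = xfs_trans_roll(&tp);
	if (error)
		goto out_trans_cancel;
	error = xfs_dir_createname(tp, target_dp, target_name,
				   src_ip->i_ino, spaceres);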

--D

> Brian
> 
> > The big problem here is that we have an ordering constraint on AGF
> > and AGI locking - inode allocation locks the AGI, then can allocate
> > a new extent for new inodes, locking the AGF after the AGI. Hence
> > the ordering that is imposed by other parts of the code is AGI before
> > AGF. So we get the ABBA agi&agf deadlock here.
> > 
> > Process A:
> > Call trace:
> >  ? __schedule+0x2bd/0x620
> >  schedule+0x33/0x90
> >  schedule_timeout+0x17d/0x290
> >  __down_common+0xef/0x125
> >  ? xfs_buf_find+0x215/0x6c0 [xfs]
> >  down+0x3b/0x50
> >  xfs_buf_lock+0x34/0xf0 [xfs]
> >  xfs_buf_find+0x215/0x6c0 [xfs]
> >  xfs_buf_get_map+0x37/0x230 [xfs]
> >  xfs_buf_read_map+0x29/0x190 [xfs]
> >  xfs_trans_read_buf_map+0x13d/0x520 [xfs]
> >  xfs_read_agf+0xa6/0x180 [xfs]
> >  ? schedule_timeout+0x17d/0x290
> >  xfs_alloc_read_agf+0x52/0x1f0 [xfs]
> >  xfs_alloc_fix_freelist+0x432/0x590 [xfs]
> >  ? down+0x3b/0x50
> >  ? xfs_buf_lock+0x34/0xf0 [xfs]
> >  ? xfs_buf_find+0x215/0x6c0 [xfs]
> >  xfs_alloc_vextent+0x301/0x6c0 [xfs]
> >  xfs_ialloc_ag_alloc+0x182/0x700 [xfs]
> >  ? _xfs_trans_bjoin+0x72/0xf0 [xfs]
> >  xfs_dialloc+0x116/0x290 [xfs]
> >  xfs_ialloc+0x6d/0x5e0 [xfs]
> >  ? xfs_log_reserve+0x165/0x280 [xfs]
> >  xfs_dir_ialloc+0x8c/0x240 [xfs]
> >  xfs_create+0x35a/0x610 [xfs]
> >  xfs_generic_create+0x1f1/0x2f0 [xfs]
> >  ...
> > 
> > Process B:
> > Call trace:
> >  ? __schedule+0x2bd/0x620
> >  ? xfs_bmapi_allocate+0x245/0x380 [xfs]
> >  schedule+0x33/0x90
> >  schedule_timeout+0x17d/0x290
> >  ? xfs_buf_find+0x1fd/0x6c0 [xfs]
> >  __down_common+0xef/0x125
> >  ? xfs_buf_get_map+0x37/0x230 [xfs]
> >  ? xfs_buf_find+0x215/0x6c0 [xfs]
> >  down+0x3b/0x50
> >  xfs_buf_lock+0x34/0xf0 [xfs]
> >  xfs_buf_find+0x215/0x6c0 [xfs]
> >  xfs_buf_get_map+0x37/0x230 [xfs]
> >  xfs_buf_read_map+0x29/0x190 [xfs]
> >  xfs_trans_read_buf_map+0x13d/0x520 [xfs]
> >  xfs_read_agi+0xa8/0x160 [xfs]
> >  xfs_iunlink_remove+0x6f/0x2a0 [xfs]
> >  ? current_time+0x46/0x80
> >  ? xfs_trans_ichgtime+0x39/0xb0 [xfs]
> >  xfs_rename+0x57a/0xae0 [xfs]
> >  xfs_vn_rename+0xe4/0x150 [xfs]
> >  ...
> > 
> > > In this patch we make the unlinked list removal a deferred operation,
> > > i.e. log an iunlink remove intent and then do the removal after the
> > > RENAME_WHITEOUT transaction has committed; the iunlink remove intent
> > > (IRI) and done (IRD) log items are provided for this.
> > 
> > > Change the ordering of the operations in the xfs_rename() function
> > > so that the AGF lock is held in the RENAME_WHITEOUT transaction and
> > > the AGI lock is held in its own transaction, matching the rest of the code.
> > 
> > Signed-off-by: kaixuxia <kaixuxia@tencent.com>
> > ---
> >  fs/xfs/Makefile                |   1 +
> >  fs/xfs/libxfs/xfs_defer.c      |   1 +
> >  fs/xfs/libxfs/xfs_defer.h      |   2 +
> >  fs/xfs/libxfs/xfs_log_format.h |  27 ++-
> >  fs/xfs/xfs_inode.c             |  36 +---
> >  fs/xfs/xfs_inode.h             |   3 +
> >  fs/xfs/xfs_iunlinkrm_item.c    | 458 +++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/xfs_iunlinkrm_item.h    |  67 ++++++
> >  fs/xfs/xfs_log.c               |   2 +
> >  fs/xfs/xfs_log_recover.c       | 148 +++++++++++++
> >  fs/xfs/xfs_super.c             |  17 ++
> >  fs/xfs/xfs_trans.h             |   2 +
> >  12 files changed, 733 insertions(+), 31 deletions(-)
> >  create mode 100644 fs/xfs/xfs_iunlinkrm_item.c
> >  create mode 100644 fs/xfs/xfs_iunlinkrm_item.h
> > 
> > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> > index 06b68b6..9d5012e 100644
> > --- a/fs/xfs/Makefile
> > +++ b/fs/xfs/Makefile
> > @@ -106,6 +106,7 @@ xfs-y				+= xfs_log.o \
> >  				   xfs_inode_item.o \
> >  				   xfs_refcount_item.o \
> >  				   xfs_rmap_item.o \
> > +				   xfs_iunlinkrm_item.o \
> >  				   xfs_log_recover.o \
> >  				   xfs_trans_ail.o \
> >  				   xfs_trans_buf.o
> > diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
> > index eb2be2a..a0f0a3d 100644
> > --- a/fs/xfs/libxfs/xfs_defer.c
> > +++ b/fs/xfs/libxfs/xfs_defer.c
> > @@ -176,6 +176,7 @@
> >  	[XFS_DEFER_OPS_TYPE_RMAP]	= &xfs_rmap_update_defer_type,
> >  	[XFS_DEFER_OPS_TYPE_FREE]	= &xfs_extent_free_defer_type,
> >  	[XFS_DEFER_OPS_TYPE_AGFL_FREE]	= &xfs_agfl_free_defer_type,
> > +	[XFS_DEFER_OPS_TYPE_IUNRE]	= &xfs_iunlink_remove_defer_type,
> >  };
> > 
> >  /*
> > diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
> > index 7c28d76..9e91a36 100644
> > --- a/fs/xfs/libxfs/xfs_defer.h
> > +++ b/fs/xfs/libxfs/xfs_defer.h
> > @@ -17,6 +17,7 @@ enum xfs_defer_ops_type {
> >  	XFS_DEFER_OPS_TYPE_RMAP,
> >  	XFS_DEFER_OPS_TYPE_FREE,
> >  	XFS_DEFER_OPS_TYPE_AGFL_FREE,
> > +	XFS_DEFER_OPS_TYPE_IUNRE,
> >  	XFS_DEFER_OPS_TYPE_MAX,
> >  };
> > 
> > @@ -60,5 +61,6 @@ struct xfs_defer_op_type {
> >  extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
> >  extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
> >  extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
> > +extern const struct xfs_defer_op_type xfs_iunlink_remove_defer_type;
> > 
> >  #endif /* __XFS_DEFER_H__ */
> > diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
> > index e5f97c6..dc85b28 100644
> > --- a/fs/xfs/libxfs/xfs_log_format.h
> > +++ b/fs/xfs/libxfs/xfs_log_format.h
> > @@ -117,7 +117,9 @@ struct xfs_unmount_log_format {
> >  #define XLOG_REG_TYPE_CUD_FORMAT	24
> >  #define XLOG_REG_TYPE_BUI_FORMAT	25
> >  #define XLOG_REG_TYPE_BUD_FORMAT	26
> > -#define XLOG_REG_TYPE_MAX		26
> > +#define XLOG_REG_TYPE_IRI_FORMAT	27
> > +#define XLOG_REG_TYPE_IRD_FORMAT	28
> > +#define XLOG_REG_TYPE_MAX		28
> > 
> >  /*
> >   * Flags to log operation header
> > @@ -240,6 +242,8 @@ struct xfs_unmount_log_format {
> >  #define	XFS_LI_CUD		0x1243
> >  #define	XFS_LI_BUI		0x1244	/* bmbt update intent */
> >  #define	XFS_LI_BUD		0x1245
> > +#define	XFS_LI_IRI		0x1246	/* iunlink remove intent */
> > +#define	XFS_LI_IRD		0x1247
> > 
> >  #define XFS_LI_TYPE_DESC \
> >  	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
> > @@ -255,7 +259,9 @@ struct xfs_unmount_log_format {
> >  	{ XFS_LI_CUI,		"XFS_LI_CUI" }, \
> >  	{ XFS_LI_CUD,		"XFS_LI_CUD" }, \
> >  	{ XFS_LI_BUI,		"XFS_LI_BUI" }, \
> > -	{ XFS_LI_BUD,		"XFS_LI_BUD" }
> > +	{ XFS_LI_BUD,		"XFS_LI_BUD" }, \
> > +	{ XFS_LI_IRI,		"XFS_LI_IRI" }, \
> > +	{ XFS_LI_IRD,		"XFS_LI_IRD" }
> > 
> >  /*
> >   * Inode Log Item Format definitions.
> > @@ -773,6 +779,23 @@ struct xfs_bud_log_format {
> >  };
> > 
> >  /*
> > + * This is the structure used to lay out iri&ird log items in the log.
> > + */
> > +typedef struct xfs_iri_log_format {
> > +	uint16_t		iri_type;	/* iri log item type */
> > +	uint16_t		iri_size;	/* size of this item */
> > +	uint64_t		iri_id;		/* id of corresponding iri */
> > +	uint64_t		wip_ino;	/* inode number */
> > +} xfs_iri_log_format_t;
> > +
> > +typedef struct xfs_ird_log_format {
> > +	uint16_t		ird_type;	/* ird log item type */
> > +	uint16_t		ird_size;	/* size of this item */
> > +	uint64_t		ird_iri_id;	/* id of corresponding iri */
> > +	uint64_t		wip_ino;	/* inode number */
> > +} xfs_ird_log_format_t;
> > +
> > +/*
> >   * Dquot Log format definitions.
> >   *
> >   * The first two fields must be the type and size fitting into
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 6467d5e..7bb3102 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -35,6 +35,7 @@
> >  #include "xfs_log.h"
> >  #include "xfs_bmap_btree.h"
> >  #include "xfs_reflink.h"
> > +#include "xfs_iunlinkrm_item.h"
> > 
> >  kmem_zone_t *xfs_inode_zone;
> > 
> > @@ -46,7 +47,6 @@
> > 
> >  STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
> >  STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
> > -STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
> > 
> >  /*
> >   * helper function to extract extent size hint from inode
> > @@ -1110,7 +1110,7 @@
> >  /*
> >   * Increment the link count on an inode & log the change.
> >   */
> > -static void
> > +void
> >  xfs_bumplink(
> >  	xfs_trans_t *tp,
> >  	xfs_inode_t *ip)
> > @@ -2406,7 +2406,7 @@ struct xfs_iunlink {
> >  /*
> >   * Pull the on-disk inode from the AGI unlinked list.
> >   */
> > -STATIC int
> > +int
> >  xfs_iunlink_remove(
> >  	struct xfs_trans	*tp,
> >  	struct xfs_inode	*ip)
> > @@ -3261,8 +3261,6 @@ struct xfs_iunlink {
> >  	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
> >  	if (target_ip)
> >  		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
> > -	if (wip)
> > -		xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
> > 
> >  	/*
> >  	 * If we are using project inheritance, we only allow renames
> > @@ -3417,35 +3415,15 @@ struct xfs_iunlink {
> >  	if (error)
> >  		goto out_trans_cancel;
> > 
> > -	/*
> > -	 * For whiteouts, we need to bump the link count on the whiteout inode.
> > -	 * This means that failures all the way up to this point leave the inode
> > -	 * on the unlinked list and so cleanup is a simple matter of dropping
> > -	 * the remaining reference to it. If we fail here after bumping the link
> > -	 * count, we're shutting down the filesystem so we'll never see the
> > -	 * intermediate state on disk.
> > -	 */
> > -	if (wip) {
> > -		ASSERT(VFS_I(wip)->i_nlink == 0);
> > -		xfs_bumplink(tp, wip);
> > -		error = xfs_iunlink_remove(tp, wip);
> > -		if (error)
> > -			goto out_trans_cancel;
> > -		xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
> > -
> > -		/*
> > -		 * Now we have a real link, clear the "I'm a tmpfile" state
> > -		 * flag from the inode so it doesn't accidentally get misused in
> > -		 * future.
> > -		 */
> > -		VFS_I(wip)->i_state &= ~I_LINKABLE;
> > -	}
> > -
> >  	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
> >  	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
> >  	if (new_parent)
> >  		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
> > 
> > +	/* add the iunlink remove intent to the tp */
> > +	if (wip)
> > +		xfs_iunlink_remove_add(tp, wip);
> > +
> >  	error = xfs_finish_rename(tp);
> >  	if (wip)
> >  		xfs_irele(wip);
> > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > index 558173f..f8c30ca 100644
> > --- a/fs/xfs/xfs_inode.h
> > +++ b/fs/xfs/xfs_inode.h
> > @@ -20,6 +20,7 @@
> >  struct xfs_mount;
> >  struct xfs_trans;
> >  struct xfs_dquot;
> > +struct xfs_trans;
> > 
> >  typedef struct xfs_inode {
> >  	/* Inode linking and identification information. */
> > @@ -414,6 +415,7 @@ enum layout_break_reason {
> >  void		xfs_inactive(struct xfs_inode *ip);
> >  int		xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
> >  			   struct xfs_inode **ipp, struct xfs_name *ci_name);
> > +void		xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
> >  int		xfs_create(struct xfs_inode *dp, struct xfs_name *name,
> >  			   umode_t mode, dev_t rdev, struct xfs_inode **ipp);
> >  int		xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode,
> > @@ -436,6 +438,7 @@ int		xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
> >  uint		xfs_ilock_attr_map_shared(struct xfs_inode *);
> > 
> >  uint		xfs_ip2xflags(struct xfs_inode *);
> > +int		xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
> >  int		xfs_ifree(struct xfs_trans *, struct xfs_inode *);
> >  int		xfs_itruncate_extents_flags(struct xfs_trans **,
> >  				struct xfs_inode *, int, xfs_fsize_t, int);
> > diff --git a/fs/xfs/xfs_iunlinkrm_item.c b/fs/xfs/xfs_iunlinkrm_item.c
> > new file mode 100644
> > index 0000000..4e38329
> > --- /dev/null
> > +++ b/fs/xfs/xfs_iunlinkrm_item.c
> > @@ -0,0 +1,458 @@
> > +// SPDX-License-Identifier: GPL-2.0+
> > +/*
> > + * Copyright (C) 2019 Tencent.  All Rights Reserved.
> > + * Author: Kaixuxia <kaixuxia@tencent.com>
> > + */
> > +#include "xfs.h"
> > +#include "xfs_fs.h"
> > +#include "xfs_format.h"
> > +#include "xfs_log_format.h"
> > +#include "xfs_trans_resv.h"
> > +#include "xfs_bit.h"
> > +#include "xfs_shared.h"
> > +#include "xfs_mount.h"
> > +#include "xfs_defer.h"
> > +#include "xfs_trans.h"
> > +#include "xfs_trans_priv.h"
> > +#include "xfs_log.h"
> > +#include "xfs_alloc.h"
> > +#include "xfs_inode.h"
> > +#include "xfs_icache.h"
> > +#include "xfs_iunlinkrm_item.h"
> > +
> > +kmem_zone_t	*xfs_iri_zone;
> > +kmem_zone_t	*xfs_ird_zone;
> > +
> > +static inline struct xfs_iri_log_item *IRI_ITEM(struct xfs_log_item *lip)
> > +{
> > +	return container_of(lip, struct xfs_iri_log_item, iri_item);
> > +}
> > +
> > +void
> > +xfs_iri_item_free(
> > +	struct xfs_iri_log_item *irip)
> > +{
> > +	kmem_zone_free(xfs_iri_zone, irip);
> > +}
> > +
> > +/*
> > + * Freeing the iri requires that we remove it from the AIL if it has already
> > + * been placed there. However, the IRI may not yet have been placed in the AIL
> > + * when called by xfs_iri_release() from IRD processing due to the ordering of
> > + * committed vs unpin operations in bulk insert operations. Hence the reference
> > + * count to ensure only the last caller frees the IRI.
> > + */
> > +void
> > +xfs_iri_release(
> > +	struct xfs_iri_log_item *irip)
> > +{
> > +	ASSERT(atomic_read(&irip->iri_refcount) > 0);
> > +	if (atomic_dec_and_test(&irip->iri_refcount)) {
> > +		xfs_trans_ail_remove(&irip->iri_item, SHUTDOWN_LOG_IO_ERROR);
> > +		xfs_iri_item_free(irip);
> > +	}
> > +}
> > +
> > +static inline int
> > +xfs_iri_item_sizeof(
> > +	struct xfs_iri_log_item *irip)
> > +{
> > +	return sizeof(struct xfs_iri_log_format);
> > +}
> > +
> > +STATIC void
> > +xfs_iri_item_size(
> > +	struct xfs_log_item	*lip,
> > +	int			*nvecs,
> > +	int			*nbytes)
> > +{
> > +	*nvecs += 1;
> > +	*nbytes += xfs_iri_item_sizeof(IRI_ITEM(lip));
> > +}
> > +
> > +STATIC void
> > +xfs_iri_item_format(
> > +	struct xfs_log_item	*lip,
> > +	struct xfs_log_vec	*lv)
> > +{
> > +	struct xfs_iri_log_item	*irip = IRI_ITEM(lip);
> > +	struct xfs_log_iovec	*vecp = NULL;
> > +
> > +	irip->iri_format.iri_type = XFS_LI_IRI;
> > +	irip->iri_format.iri_size = 1;
> > +
> > +	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_IRI_FORMAT,
> > +			&irip->iri_format,
> > +			xfs_iri_item_sizeof(irip));
> > +}
> > +
> > +/*
> > + * The unpin operation is the last place an IRI is manipulated in the log. It is
> > + * either inserted in the AIL or aborted in the event of a log I/O error. In
> > + * either case, the IRI transaction has been successfully committed to make it
> > + * this far. Therefore, we expect whoever committed the IRI to either construct
> > + * and commit the IRD or drop the IRD's reference in the event of error. Simply
> > + * drop the log's IRI reference now that the log is done with it.
> > + */
> > +STATIC void
> > +xfs_iri_item_unpin(
> > +	struct xfs_log_item	*lip,
> > +	int			remove)
> > +{
> > +	struct xfs_iri_log_item *irip = IRI_ITEM(lip);
> > +	xfs_iri_release(irip);
> > +}
> > +
> > +/*
> > + * The IRI has been either committed or aborted if the transaction has been
> > + * cancelled. If the transaction was cancelled, an IRD isn't going to be
> > + * constructed and thus we free the IRI here directly.
> > + */
> > +STATIC void
> > +xfs_iri_item_release(
> > +	struct xfs_log_item     *lip)
> > +{
> > +	xfs_iri_release(IRI_ITEM(lip));
> > +}
> > +
> > +/*
> > + * This is the ops vector shared by all iri log items.
> > + */
> > +static const struct xfs_item_ops xfs_iri_item_ops = {
> > +	.iop_size	= xfs_iri_item_size,
> > +	.iop_format	= xfs_iri_item_format,
> > +	.iop_unpin	= xfs_iri_item_unpin,
> > +	.iop_release	= xfs_iri_item_release,
> > +};
> > +
> > +/*
> > + * Allocate and initialize an IRI item; the wip ino is filled in later
> > + * by xfs_iunlink_remove_log_item().
> > + */
> > +struct xfs_iri_log_item *
> > +xfs_iri_init(
> > +	struct xfs_mount	*mp,
> > +	uint			count)
> > +{
> > +	struct xfs_iri_log_item *irip;
> > +
> > +	irip = kmem_zone_zalloc(xfs_iri_zone, KM_SLEEP);
> > +
> > +	xfs_log_item_init(mp, &irip->iri_item, XFS_LI_IRI, &xfs_iri_item_ops);
> > +	irip->iri_format.iri_id = (uintptr_t)(void *)irip;
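> > +	/* two refs: one for the IRD, one to ensure the IRI reaches the AIL */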
> > +	atomic_set(&irip->iri_refcount, 2);
> > +
> > +	return irip;
> > +}
> > +
> > +static inline struct xfs_ird_log_item *IRD_ITEM(struct xfs_log_item *lip)
> > +{
> > +	return container_of(lip, struct xfs_ird_log_item, ird_item);
> > +}
> > +
> > +STATIC void
> > +xfs_ird_item_free(struct xfs_ird_log_item *irdp)
> > +{
> > +	kmem_zone_free(xfs_ird_zone, irdp);
> > +}
> > +
> > +/*
> > + * This returns the number of iovecs needed to log the given ird item.
> > + * We only need 1 iovec for an ird item.  It just logs the ird_log_format
> > + * structure.
> > + */
> > +STATIC void
> > +xfs_ird_item_size(
> > +	struct xfs_log_item	*lip,
> > +	int			*nvecs,
> > +	int			*nbytes)
> > +{
> > +	*nvecs += 1;
> > +	*nbytes += sizeof(struct xfs_ird_log_format);
> > +}
> > +
> > +STATIC void
> > +xfs_ird_item_format(
> > +	struct xfs_log_item	*lip,
> > +	struct xfs_log_vec	*lv)
> > +{
> > +	struct xfs_ird_log_item *irdp = IRD_ITEM(lip);
> > +	struct xfs_log_iovec	*vecp = NULL;
> > +
> > +	irdp->ird_format.ird_type = XFS_LI_IRD;
> > +	irdp->ird_format.ird_size = 1;
> > +
> > +	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_IRD_FORMAT, &irdp->ird_format,
> > +			sizeof(struct xfs_ird_log_format));
> > +}
> > +
> > +/*
> > + * The IRD is either committed or aborted if the transaction is cancelled. If
> > + * the transaction is cancelled, drop our reference to the IRI and free the
> > + * IRD.
> > + */
> > +STATIC void
> > +xfs_ird_item_release(
> > +	struct xfs_log_item	*lip)
> > +{
> > +	struct xfs_ird_log_item	*irdp = IRD_ITEM(lip);
> > +
> > +	xfs_iri_release(irdp->ird_irip);
> > +	xfs_ird_item_free(irdp);
> > +}
> > +
> > +static const struct xfs_item_ops xfs_ird_item_ops = {
> > +	.flags		= XFS_ITEM_RELEASE_WHEN_COMMITTED,
> > +	.iop_size	= xfs_ird_item_size,
> > +	.iop_format	= xfs_ird_item_format,
> > +	.iop_release	= xfs_ird_item_release,
> > +};
> > +
> > +static struct xfs_ird_log_item *
> > +xfs_trans_get_ird(
> > +	struct xfs_trans		*tp,
> > +	struct xfs_iri_log_item		*irip)
> > +{
> > +	xfs_ird_log_item_t	*irdp;
> > +
> > +	ASSERT(tp != NULL);
> > +
> > +	irdp = kmem_zone_zalloc(xfs_ird_zone, KM_SLEEP);
> > +	xfs_log_item_init(tp->t_mountp, &irdp->ird_item, XFS_LI_IRD,
> > +			  &xfs_ird_item_ops);
> > +	irdp->ird_irip = irip;
> > +	irdp->ird_format.wip_ino = irip->iri_format.wip_ino;
> > +	irdp->ird_format.ird_iri_id = irip->iri_format.iri_id;
> > +
> > +	xfs_trans_add_item(tp, &irdp->ird_item);
> > +	return irdp;
> > +}
> > +
> > +/* Record an iunlink remove intent. */
> > +int
> > +xfs_iunlink_remove_add(
> > +	struct xfs_trans	*tp,
> > +	struct xfs_inode	*wip)
> > +{
> > +	struct xfs_iunlink_remove_intent	*ii;
> > +
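> > +	/* KM_NOFS: we may be called with a transaction context held */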
> > +	ii = kmem_alloc(sizeof(struct xfs_iunlink_remove_intent),
> > +			KM_SLEEP | KM_NOFS);
> > +	INIT_LIST_HEAD(&ii->ri_list);
> > +	ii->wip = wip;
> > +
> > +	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_IUNRE, &ii->ri_list);
> > +	return 0;
> > +}
> > +
> > +/* Sort iunlink remove intents by AG. */
> > +static int
> > +xfs_iunlink_remove_diff_items(
> > +	void				*priv,
> > +	struct list_head		*a,
> > +	struct list_head		*b)
> > +{
> > +	struct xfs_mount			*mp = priv;
> > +	struct xfs_iunlink_remove_intent	*ra;
> > +	struct xfs_iunlink_remove_intent	*rb;
> > +
> > +	ra = container_of(a, struct xfs_iunlink_remove_intent, ri_list);
> > +	rb = container_of(b, struct xfs_iunlink_remove_intent, ri_list);
> > +	return	XFS_INO_TO_AGNO(mp, ra->wip->i_ino) -
> > +		XFS_INO_TO_AGNO(mp, rb->wip->i_ino);
> > +}
> > +
> > +/* Get an IRI */
> > +STATIC void *
> > +xfs_iunlink_remove_create_intent(
> > +	struct xfs_trans		*tp,
> > +	unsigned int			count)
> > +{
> > +	struct xfs_iri_log_item		*irip;
> > +
> > +	ASSERT(tp != NULL);
> > +	ASSERT(count == 1);
> > +
> > +	irip = xfs_iri_init(tp->t_mountp, count);
> > +	ASSERT(irip != NULL);
> > +
> > +	/*
> > +	 * Get a log_item_desc to point at the new item.
> > +	 */
> > +	xfs_trans_add_item(tp, &irip->iri_item);
> > +	return irip;
> > +}
> > +
> > +/* Log an iunlink remove to the intent item. */
> > +STATIC void
> > +xfs_iunlink_remove_log_item(
> > +	struct xfs_trans		*tp,
> > +	void				*intent,
> > +	struct list_head		*item)
> > +{
> > +	struct xfs_iri_log_item			*irip = intent;
> > +	struct xfs_iunlink_remove_intent	*iunre;
> > +
> > +	iunre = container_of(item, struct xfs_iunlink_remove_intent, ri_list);
> > +
> > +	tp->t_flags |= XFS_TRANS_DIRTY;
> > +	set_bit(XFS_LI_DIRTY, &irip->iri_item.li_flags);
> > +
> > +	irip->iri_format.wip_ino = (uint64_t)(iunre->wip->i_ino);
> > +}
> > +
> > +/* Get an IRD so we can process all the deferred iunlink removes. */
> > +STATIC void *
> > +xfs_iunlink_remove_create_done(
> > +	struct xfs_trans		*tp,
> > +	void				*intent,
> > +	unsigned int			count)
> > +{
> > +	return xfs_trans_get_ird(tp, intent);
> > +}
> > +
> > +/*
> > + * For whiteouts, we need to bump the link count on the whiteout inode.
> > + * This means that failures all the way up to this point leave the inode
> > + * on the unlinked list and so cleanup is a simple matter of dropping
> > + * the remaining reference to it. If we fail here after bumping the link
> > + * count, we're shutting down the filesystem so we'll never see the
> > + * intermediate state on disk.
> > + */
> > +static int
> > +xfs_trans_log_finish_iunlink_remove(
> > +	struct xfs_trans		*tp,
> > +	struct xfs_ird_log_item		*irdp,
> > +	struct xfs_inode		*wip)
> > +{
> > +	int	error;
> > +
> > +	ASSERT(xfs_isilocked(wip, XFS_ILOCK_EXCL));
> > +
> > +	xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
> > +
> > +	ASSERT(VFS_I(wip)->i_nlink == 0);
> > +	xfs_bumplink(tp, wip);
> > +	error = xfs_iunlink_remove(tp, wip);
> > +	xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
> > +	/*
> > +	 * Now we have a real link, clear the "I'm a tmpfile" state
> > +	 * flag from the inode so it doesn't accidentally get misused in
> > +	 * future.
> > +	 */
> > +	VFS_I(wip)->i_state &= ~I_LINKABLE;
> > +
> > +	/*
> > +	 * Mark the transaction dirty, even on error. This ensures the
> > +	 * transaction is aborted, which:
> > +	 *
> > +	 * 1.) releases the IRI and frees the IRD
> > +	 * 2.) shuts down the filesystem
> > +	 */
> > +	tp->t_flags |= XFS_TRANS_DIRTY;
> > +	set_bit(XFS_LI_DIRTY, &irdp->ird_item.li_flags);
> > +
> > +	return error;
> > +}
> > +
> > +/* Process a deferred iunlink remove. */
> > +STATIC int
> > +xfs_iunlink_remove_finish_item(
> > +	struct xfs_trans		*tp,
> > +	struct list_head		*item,
> > +	void				*done_item,
> > +	void				**state)
> > +{
> > +	struct xfs_iunlink_remove_intent	*iunre;
> > +	int					error;
> > +
> > +	iunre = container_of(item, struct xfs_iunlink_remove_intent, ri_list);
> > +	error = xfs_trans_log_finish_iunlink_remove(tp, done_item,
> > +			iunre->wip);
> > +	kmem_free(iunre);
> > +	return error;
> > +}
> > +
> > +/* Abort all pending IRIs. */
> > +STATIC void
> > +xfs_iunlink_remove_abort_intent(
> > +	void		*intent)
> > +{
> > +	xfs_iri_release(intent);
> > +}
> > +
> > +/* Cancel a deferred iunlink remove. */
> > +STATIC void
> > +xfs_iunlink_remove_cancel_item(
> > +	struct list_head		*item)
> > +{
> > +	struct xfs_iunlink_remove_intent	*iunre;
> > +
> > +	iunre = container_of(item, struct xfs_iunlink_remove_intent, ri_list);
> > +	kmem_free(iunre);
> > +}
> > +
> > +const struct xfs_defer_op_type xfs_iunlink_remove_defer_type = {
> > +	.diff_items	= xfs_iunlink_remove_diff_items,
> > +	.create_intent	= xfs_iunlink_remove_create_intent,
> > +	.abort_intent	= xfs_iunlink_remove_abort_intent,
> > +	.log_item	= xfs_iunlink_remove_log_item,
> > +	.create_done	= xfs_iunlink_remove_create_done,
> > +	.finish_item	= xfs_iunlink_remove_finish_item,
> > +	.cancel_item	= xfs_iunlink_remove_cancel_item,
> > +};
> > +
> > +/*
> > + * Process a iunlink remove intent item that was recovered from the log.
> > + */
> > +int
> > +xfs_iri_recover(
> > +	struct xfs_trans		*parent_tp,
> > +	struct xfs_iri_log_item		*irip)
> > +{
> > +	int				error = 0;
> > +	struct xfs_trans		*tp;
> > +	xfs_ino_t			ino;
> > +	struct xfs_inode		*ip;
> > +	struct xfs_mount		*mp = parent_tp->t_mountp;
> > +	struct xfs_ird_log_item		*irdp;
> > +
> > +	ASSERT(!test_bit(XFS_IRI_RECOVERED, &irip->iri_flags));
> > +
> > +	ino = irip->iri_format.wip_ino;
> > +	if (ino == NULLFSINO || !xfs_verify_dir_ino(mp, ino)) {
> > +		xfs_alert(mp, "IRI recovery found bad inode 0x%llx!", ino);
> > +		set_bit(XFS_IRI_RECOVERED, &irip->iri_flags);
> > +		xfs_iri_release(irip);
> > +		return -EIO;
> > +	}
> > +	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
> > +	if (error)
> > +		return error;
> > +
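> > +	/*
> > +	 * Run the removal in its own transaction, just as the deferred op
> > +	 * does at runtime; the AGI is the only AG header locked here.
> > +	 */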
> > +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
> > +	if (error) {
> > +		xfs_irele(ip);
> > +		return error;
> > +	}
> > +	irdp = xfs_trans_get_ird(tp, irip);
> > +
> > +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> > +	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
> > +
> > +	ASSERT(VFS_I(ip)->i_nlink == 0);
> > +	VFS_I(ip)->i_state |= I_LINKABLE;
> > +	xfs_bumplink(tp, ip);
> > +	error = xfs_iunlink_remove(tp, ip);
> > +	if (error)
> > +		goto abort_error;
> > +	VFS_I(ip)->i_state &= ~I_LINKABLE;
> > +	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> > +
> > +	tp->t_flags |= XFS_TRANS_DIRTY;
> > +	set_bit(XFS_LI_DIRTY, &irdp->ird_item.li_flags);
> > +
> > +	set_bit(XFS_IRI_RECOVERED, &irip->iri_flags);
> > +	error = xfs_trans_commit(tp);
> > +	xfs_irele(ip);	/* drop the xfs_iget() reference */
> > +	return error;
> > +
> > +abort_error:
> > +	xfs_trans_cancel(tp);
> > +	xfs_irele(ip);
> > +	return error;
> > +}
> > diff --git a/fs/xfs/xfs_iunlinkrm_item.h b/fs/xfs/xfs_iunlinkrm_item.h
> > new file mode 100644
> > index 0000000..54c4ca3
> > --- /dev/null
> > +++ b/fs/xfs/xfs_iunlinkrm_item.h
> > @@ -0,0 +1,67 @@
> > +/* SPDX-License-Identifier: GPL-2.0+ */
> > +/*
> > + * Copyright (C) 2019 Tencent.  All Rights Reserved.
> > + * Author: Kaixuxia <kaixuxia@tencent.com>
> > + */
> > +#ifndef	__XFS_IUNLINKRM_ITEM_H__
> > +#define	__XFS_IUNLINKRM_ITEM_H__
> > +
> > +/*
> > + * When performing a rename operation with the RENAME_WHITEOUT flag, we
> > + * first hold the AGF lock to allocate or free extents while manipulating
> > + * the dirents, and only then make the xfs_iunlink_remove() call, which
> > + * takes the AGI lock to modify the tmpfile info, so this path ends up
> > + * taking the locks in AGF->AGI order.
> > + *
> > + * The big problem here is that we have an ordering constraint on AGF and
> > + * AGI locking - inode allocation locks the AGI, then can allocate a new
> > + * extent for new inodes, locking the AGF after the AGI. Hence the
> > + * ordering imposed by other parts of the code is AGI before AGF, and we
> > + * get an ABBA agi&agf deadlock.
> > + *
> > + * So make the unlinked list removal a deferred operation: log an iunlink
> > + * remove intent (IRI) in the RENAME_WHITEOUT transaction, do the actual
> > + * removal after that transaction has committed, and pair the intent with
> > + * an iunlink remove done (IRD) log item.
> > + */
> > +
> > +/* kernel only IRI/IRD definitions */
> > +
> > +struct xfs_mount;
> > +struct kmem_zone;
> > +struct xfs_inode;
> > +
> > +/*
> > + * Define IRI flag bits. Manipulated by set/clear/test_bit operators.
> > + */
> > +#define	XFS_IRI_RECOVERED		1
> > +
> > +/*
> > + * This is the "iunlink remove intent" (IRI) log item. It is used in
> > + * conjunction with the "iunlink remove done" (IRD) log item described
> > + * below.
> > + */
> > +typedef struct xfs_iri_log_item {
> > +	struct xfs_log_item	iri_item;
> > +	atomic_t		iri_refcount;
> > +	unsigned long		iri_flags;
> > +	xfs_iri_log_format_t	iri_format;
> > +} xfs_iri_log_item_t;
> > +
> > +/* This is the "iunlink remove done" log item. */
> > +typedef struct xfs_ird_log_item {
> > +	struct xfs_log_item	ird_item;
> > +	xfs_iri_log_item_t	*ird_irip;
> > +	xfs_ird_log_format_t	ird_format;
> > +} xfs_ird_log_item_t;
> > +
> > +struct xfs_iunlink_remove_intent {
> > +	struct list_head		ri_list;
> > +	struct xfs_inode		*wip;
> > +};
> > +
> > +extern struct kmem_zone	*xfs_iri_zone;
> > +extern struct kmem_zone	*xfs_ird_zone;
> > +
> > +struct xfs_iri_log_item	*xfs_iri_init(struct xfs_mount *, uint);
> > +void xfs_iri_item_free(struct xfs_iri_log_item *);
> > +void xfs_iri_release(struct xfs_iri_log_item *);
> > +int xfs_iri_recover(struct xfs_trans *, struct xfs_iri_log_item *);
> > +int xfs_iunlink_remove_add(struct xfs_trans *, struct xfs_inode *);
> > +
> > +#endif	/* __XFS_IUNLINKRM_ITEM_H__ */
> > diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> > index 00e9f5c..f87f510 100644
> > --- a/fs/xfs/xfs_log.c
> > +++ b/fs/xfs/xfs_log.c
> > @@ -2005,6 +2005,8 @@ STATIC void xlog_state_done_syncing(
> >  	    REG_TYPE_STR(CUD_FORMAT, "cud_format"),
> >  	    REG_TYPE_STR(BUI_FORMAT, "bui_format"),
> >  	    REG_TYPE_STR(BUD_FORMAT, "bud_format"),
> > +	    REG_TYPE_STR(IRI_FORMAT, "iri_format"),
> > +	    REG_TYPE_STR(IRD_FORMAT, "ird_format"),
> >  	};
> >  	BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
> >  #undef REG_TYPE_STR
> > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> > index 13d1d3e..a916f40 100644
> > --- a/fs/xfs/xfs_log_recover.c
> > +++ b/fs/xfs/xfs_log_recover.c
> > @@ -33,6 +33,7 @@
> >  #include "xfs_buf_item.h"
> >  #include "xfs_refcount_item.h"
> >  #include "xfs_bmap_item.h"
> > +#include "xfs_iunlinkrm_item.h"
> > 
> >  #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
> > 
> > @@ -1885,6 +1886,8 @@ struct xfs_buf_cancel {
> >  		case XFS_LI_CUD:
> >  		case XFS_LI_BUI:
> >  		case XFS_LI_BUD:
> > +		case XFS_LI_IRI:
> > +		case XFS_LI_IRD:
> >  			trace_xfs_log_recover_item_reorder_tail(log,
> >  							trans, item, pass);
> >  			list_move_tail(&item->ri_list, &inode_list);
> > @@ -3752,6 +3755,96 @@ struct xfs_buf_cancel {
> >  }
> > 
> >  /*
> > + * This routine is called to create an in-core iunlink remove intent
> > + * item from the iri format structure which was logged on disk.
> > + * It allocates an in-core iri, copies the inode from the format
> > + * structure into it, and adds the iri to the AIL with the given
> > + * LSN.
> > + */
> > +STATIC int
> > +xlog_recover_iri_pass2(
> > +	struct xlog			*log,
> > +	struct xlog_recover_item	*item,
> > +	xfs_lsn_t			lsn)
> > +{
> > +	xfs_mount_t		*mp = log->l_mp;
> > +	xfs_iri_log_item_t	*irip;
> > +	xfs_iri_log_format_t	*iri_formatp;
> > +
> > +	iri_formatp = item->ri_buf[0].i_addr;
> > +
> > +	if (item->ri_buf[0].i_len != sizeof(xfs_iri_log_format_t))
> > +		return -EFSCORRUPTED;
> > +
> > +	irip = xfs_iri_init(mp, 1);
> > +	irip->iri_format = *iri_formatp;
> > +
> > +	spin_lock(&log->l_ailp->ail_lock);
> > +	/*
> > +	 * The IRI has two references. One for the IRD and one for IRI to ensure
> > +	 * it makes it into the AIL. Insert the IRI into the AIL directly and
> > +	 * drop the IRI reference. Note that xfs_trans_ail_update() drops the
> > +	 * AIL lock.
> > +	 */
> > +	xfs_trans_ail_update(log->l_ailp, &irip->iri_item, lsn);
> > +	xfs_iri_release(irip);
> > +	return 0;
> > +}
> > +
> > +/*
> > + * This routine is called when an IRD format structure is found in a committed
> > + * transaction in the log. Its purpose is to cancel the corresponding IRI if it
> > + * was still in the log. To do this it searches the AIL for the IRI with an id
> > + * equal to that in the IRD format structure. If we find it we drop the IRD
> > + * reference, which removes the IRI from the AIL and frees it.
> > + */
> > +STATIC int
> > +xlog_recover_ird_pass2(
> > +	struct xlog			*log,
> > +	struct xlog_recover_item	*item)
> > +{
> > +	xfs_ird_log_format_t	*ird_formatp;
> > +	xfs_iri_log_item_t	*irip = NULL;
> > +	struct xfs_log_item	*lip;
> > +	uint64_t		iri_id;
> > +	struct xfs_ail_cursor	cur;
> > +	struct xfs_ail		*ailp = log->l_ailp;
> > +
> > +	ird_formatp = item->ri_buf[0].i_addr;
> > +	if (item->ri_buf[0].i_len != sizeof(xfs_ird_log_format_t))
> > +		return -EFSCORRUPTED;
> > +	iri_id = ird_formatp->ird_iri_id;
> > +
> > +	/*
> > +	 * Search for the iri with the id in the ird format structure
> > +	 * in the AIL.
> > +	 */
> > +	spin_lock(&ailp->ail_lock);
> > +	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
> > +	while (lip != NULL) {
> > +		if (lip->li_type == XFS_LI_IRI) {
> > +			irip = (xfs_iri_log_item_t *)lip;
> > +			if (irip->iri_format.iri_id == iri_id) {
> > +				/*
> > +				 * Drop the IRD reference to the IRI. This
> > +				 * removes the IRI from the AIL and frees it.
> > +				 */
> > +				spin_unlock(&ailp->ail_lock);
> > +				xfs_iri_release(irip);
> > +				spin_lock(&ailp->ail_lock);
> > +				break;
> > +			}
> > +		}
> > +		lip = xfs_trans_ail_cursor_next(ailp, &cur);
> > +	}
> > +	xfs_trans_ail_cursor_done(&cur);
> > +	spin_unlock(&ailp->ail_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> >   * This routine is called when an inode create format structure is found in a
> >   * committed transaction in the log.  It's purpose is to initialise the inodes
> >   * being allocated on disk. This requires us to get inode cluster buffers that
> > @@ -3981,6 +4074,8 @@ struct xfs_buf_cancel {
> >  	case XFS_LI_CUD:
> >  	case XFS_LI_BUI:
> >  	case XFS_LI_BUD:
> > +	case XFS_LI_IRI:
> > +	case XFS_LI_IRD:
> >  	default:
> >  		break;
> >  	}
> > @@ -4010,6 +4105,8 @@ struct xfs_buf_cancel {
> >  	case XFS_LI_CUD:
> >  	case XFS_LI_BUI:
> >  	case XFS_LI_BUD:
> > +	case XFS_LI_IRI:
> > +	case XFS_LI_IRD:
> >  		/* nothing to do in pass 1 */
> >  		return 0;
> >  	default:
> > @@ -4052,6 +4149,10 @@ struct xfs_buf_cancel {
> >  		return xlog_recover_bui_pass2(log, item, trans->r_lsn);
> >  	case XFS_LI_BUD:
> >  		return xlog_recover_bud_pass2(log, item);
> > +	case XFS_LI_IRI:
> > +		return xlog_recover_iri_pass2(log, item, trans->r_lsn);
> > +	case XFS_LI_IRD:
> > +		return xlog_recover_ird_pass2(log, item);
> >  	case XFS_LI_DQUOT:
> >  		return xlog_recover_dquot_pass2(log, buffer_list, item,
> >  						trans->r_lsn);
> > @@ -4721,6 +4822,46 @@ struct xfs_buf_cancel {
> >  	spin_lock(&ailp->ail_lock);
> >  }
> > 
> > +/* Recover the IRI if necessary. */
> > +STATIC int
> > +xlog_recover_process_iri(
> > +	struct xfs_trans		*parent_tp,
> > +	struct xfs_ail			*ailp,
> > +	struct xfs_log_item		*lip)
> > +{
> > +	struct xfs_iri_log_item		*irip;
> > +	int				error;
> > +
> > +	/*
> > +	 * Skip IRIs that we've already processed.
> > +	 */
> > +	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
> > +	if (test_bit(XFS_IRI_RECOVERED, &irip->iri_flags))
> > +		return 0;
> > +
> > +	spin_unlock(&ailp->ail_lock);
> > +	error = xfs_iri_recover(parent_tp, irip);
> > +	spin_lock(&ailp->ail_lock);
> > +
> > +	return error;
> > +}
> > +
> > +/* Release the IRI since we're cancelling everything. */
> > +STATIC void
> > +xlog_recover_cancel_iri(
> > +	struct xfs_mount		*mp,
> > +	struct xfs_ail			*ailp,
> > +	struct xfs_log_item		*lip)
> > +{
> > +	struct xfs_iri_log_item		*irip;
> > +
> > +	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
> > +
> > +	spin_unlock(&ailp->ail_lock);
> > +	xfs_iri_release(irip);
> > +	spin_lock(&ailp->ail_lock);
> > +}
> > +
> >  /* Is this log item a deferred action intent? */
> >  static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
> >  {
> > @@ -4729,6 +4870,7 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
> >  	case XFS_LI_RUI:
> >  	case XFS_LI_CUI:
> >  	case XFS_LI_BUI:
> > +	case XFS_LI_IRI:
> >  		return true;
> >  	default:
> >  		return false;
> > @@ -4856,6 +4998,9 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
> >  		case XFS_LI_BUI:
> >  			error = xlog_recover_process_bui(parent_tp, ailp, lip);
> >  			break;
> > +		case XFS_LI_IRI:
> > +			error = xlog_recover_process_iri(parent_tp, ailp, lip);
> > +			break;
> >  		}
> >  		if (error)
> >  			goto out;
> > @@ -4912,6 +5057,9 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
> >  		case XFS_LI_BUI:
> >  			xlog_recover_cancel_bui(log->l_mp, ailp, lip);
> >  			break;
> > +		case XFS_LI_IRI:
> > +			xlog_recover_cancel_iri(log->l_mp, ailp, lip);
> > +			break;
> >  		}
> > 
> >  		lip = xfs_trans_ail_cursor_next(ailp, &cur);
> > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > index f945023..66742b7 100644
> > --- a/fs/xfs/xfs_super.c
> > +++ b/fs/xfs/xfs_super.c
> > @@ -34,6 +34,7 @@
> >  #include "xfs_rmap_item.h"
> >  #include "xfs_refcount_item.h"
> >  #include "xfs_bmap_item.h"
> > +#include "xfs_iunlinkrm_item.h"
> >  #include "xfs_reflink.h"
> > 
> >  #include <linux/magic.h>
> > @@ -1957,8 +1958,22 @@ struct proc_xfs_info {
> >  	if (!xfs_bui_zone)
> >  		goto out_destroy_bud_zone;
> > 
> > +	xfs_ird_zone = kmem_zone_init(sizeof(xfs_ird_log_item_t),
> > +			"xfs_ird_item");
> > +	if (!xfs_ird_zone)
> > +		goto out_destroy_bui_zone;
> > +
> > +	xfs_iri_zone = kmem_zone_init(sizeof(xfs_iri_log_item_t),
> > +			"xfs_iri_item");
> > +	if (!xfs_iri_zone)
> > +		goto out_destroy_ird_zone;
> > +
> >  	return 0;
> > 
> > + out_destroy_ird_zone:
> > +	kmem_zone_destroy(xfs_ird_zone);
> > + out_destroy_bui_zone:
> > +	kmem_zone_destroy(xfs_bui_zone);
> >   out_destroy_bud_zone:
> >  	kmem_zone_destroy(xfs_bud_zone);
> >   out_destroy_cui_zone:
> > @@ -2007,6 +2022,8 @@ struct proc_xfs_info {
> >  	 * destroy caches.
> >  	 */
> >  	rcu_barrier();
> > +	kmem_zone_destroy(xfs_iri_zone);
> > +	kmem_zone_destroy(xfs_ird_zone);
> >  	kmem_zone_destroy(xfs_bui_zone);
> >  	kmem_zone_destroy(xfs_bud_zone);
> >  	kmem_zone_destroy(xfs_cui_zone);
> > diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
> > index 64d7f17..dd63eaa 100644
> > --- a/fs/xfs/xfs_trans.h
> > +++ b/fs/xfs/xfs_trans.h
> > @@ -26,6 +26,8 @@
> >  struct xfs_cud_log_item;
> >  struct xfs_bui_log_item;
> >  struct xfs_bud_log_item;
> > +struct xfs_iri_log_item;
> > +struct xfs_ird_log_item;
> > 
> >  struct xfs_log_item {
> >  	struct list_head		li_ail;		/* AIL pointers */
> > -- 
> > 1.8.3.1
> > 
> > -- 
> > kaixuxia
Brian Foster Aug. 13, 2019, 2:57 p.m. UTC | #3
On Tue, Aug 13, 2019 at 07:20:46AM -0700, Darrick J. Wong wrote:
> On Tue, Aug 13, 2019 at 09:36:14AM -0400, Brian Foster wrote:
> > On Tue, Aug 13, 2019 at 07:17:33PM +0800, kaixuxia wrote:
> > > When performing a rename operation with the RENAME_WHITEOUT flag, we
> > > first hold the AGF lock to allocate or free extents while manipulating
> > > the dirents, and then make the xfs_iunlink_remove() call last, taking
> > > the AGI lock to modify the tmpfile info, so this path takes the locks
> > > in AGF->AGI order.
> > > 
> > 
> > IIUC, the whiteout use case is that we're renaming a file, but the
> > source dentry must be replaced with a magic whiteout inode rather than
> > be removed. Therefore, xfs_rename() allocates the whiteout inode as a
> > tmpfile first in a separate transaction, updates the target dentry with
> > the source inode, replaces the source dentry to point to the whiteout
> > inode and finally removes the whiteout inode from the unlinked list
> > (since it is a tmpfile). This leads to the problem described below
> > because the rename transaction ends up doing directory block allocs
> > (locking the AGF) followed by the unlinked list remove (locking the
> > AGI).
> > 
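> > Schematically, the two orderings at play are roughly:
> > 
> >   rename w/ RENAME_WHITEOUT          inode allocation
> >   -------------------------          ----------------
> >   1) AGF (dir block alloc)           1) AGI (xfs_dialloc)
> >   2) AGI (xfs_iunlink_remove)        2) AGF (inode chunk alloc)
> > 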
> > My understanding from reading the code is that this is primarily to
> > cleanly handle error scenarios. If anything fails after we've allocated
> > the whiteout tmpfile, it's simply left on the unlinked list and so the
> > filesystem remains in a consistent/recoverable state. Given that, the
> > solution here seems like overkill to me. For one, I thought background
> > unlinked list removal was already on our roadmap (Darrick might have
> > been looking at that and may already have a prototype as well). Also,
> > unlinked list removal occurs at log recovery time already. That's
> > somewhat of an existing purpose of the list, which makes a deferred
> > unlinked list removal operation superfluous in more traditional cases
> > where unlinked list removal doesn't require consistency with a directory
> > operation.
> 
> Not to mention this doesn't fix the problem for existing filesystems,
> because adding new log item types changes the on-disk log format and
> therefore requires a log incompat feature bit to prevent old kernels
> from trying to recover the log.
> 
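> The guard that enforces this on old kernels is already in
> xlog_recover() -- roughly (from memory, details may differ):
> 
> 	if (xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
> 			XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
> 		xfs_warn(log->l_mp,
> 			"unknown incompat log features, cannot recover");
> 		return -EINVAL;
> 	}
> 
> so the new IRI/IRD types would have to be accompanied by a new incompat
> log feature bit that gets set before the first such item is logged.
> 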

Yeah..

> > Functional discussion aside.. from a complexity standpoint I'm wondering
> > if we could do something much simpler, like acquiring the AGI lock for
> > a whiteout inode earlier in xfs_rename(). For example, suppose we did
> > something like:
> > 
> > 	/*
> > 	 * Acquire the whiteout agi to preserve locking order in
> > 	 * anticipation of unlinked list removal.
> > 	 */
> > 	if (wip) {
> > 		error = xfs_read_agi(mp, tp,
> > 				XFS_INO_TO_AGNO(mp, wip->i_ino), &agibp);
> > 		if (error)
> > 			goto out_trans_cancel;
> > 	}
> > 
> > ... after we allocate the transaction but before we do any directory ops
> > that can result in block allocations. Would that prevent the problem
> > you've observed?
> 
> I had the same thought, but fun question: if @wip is allocated in AG 1
> but the dirent blocks come from AG 0, is that a problem?
> 

Not sure.. I was thinking that locking order is only a problem here if
we are locking the AGI/AGF of the same AG; otherwise AG locking order
only comes into play when you're locking the same type of AG buffer
across multiple AGs. IOW, the above hack wouldn't have been necessary if
they end up in different AGs (but we know we'll be locking the AGI
anyways), and it's not clear to me that it's a problem when they do.

> Would it make more sense to expand the directory in one transaction,
> roll it, and add the actual directory entry after that?
> 

That might be a cleaner approach from a design standpoint so long as we
can hold whatever locks, etc. to guarantee the recently added space can
only be used by the pending dir operation. I suppose we'd have to handle
potential format conversions, too..
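
Roughly what that might look like (sketch only -- xfs_dir_grow() is a
made-up placeholder for whatever preallocates the directory space, and
lock/quota details are elided):

	error = xfs_dir_grow(tp, target_dp);	/* may lock the AGF */
	if (error)
		goto out_trans_cancel;
	error = xfs_trans_roll(&tp);		/* commit the allocation */
	if (error)
		goto out_trans_cancel;
	/* the dirent insert should no longer need to allocate */
	error = xfs_dir_createname(tp, target_dp, target_name,
				   src_ip->i_ino, spaceres);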

Brian

> --D
> 
> > Brian
> > 
> > > The big problem here is that we have an ordering constraint on AGF
> > > and AGI locking - inode allocation locks the AGI, then can allocate
> > > a new extent for new inodes, locking the AGF after the AGI. Hence
> > > the ordering that is imposed by other parts of the code is AGI before
> > > AGF. So we get the ABBA agi&agf deadlock here.
> > > 
> > > Process A:
> > > Call trace:
> > >  ? __schedule+0x2bd/0x620
> > >  schedule+0x33/0x90
> > >  schedule_timeout+0x17d/0x290
> > >  __down_common+0xef/0x125
> > >  ? xfs_buf_find+0x215/0x6c0 [xfs]
> > >  down+0x3b/0x50
> > >  xfs_buf_lock+0x34/0xf0 [xfs]
> > >  xfs_buf_find+0x215/0x6c0 [xfs]
> > >  xfs_buf_get_map+0x37/0x230 [xfs]
> > >  xfs_buf_read_map+0x29/0x190 [xfs]
> > >  xfs_trans_read_buf_map+0x13d/0x520 [xfs]
> > >  xfs_read_agf+0xa6/0x180 [xfs]
> > >  ? schedule_timeout+0x17d/0x290
> > >  xfs_alloc_read_agf+0x52/0x1f0 [xfs]
> > >  xfs_alloc_fix_freelist+0x432/0x590 [xfs]
> > >  ? down+0x3b/0x50
> > >  ? xfs_buf_lock+0x34/0xf0 [xfs]
> > >  ? xfs_buf_find+0x215/0x6c0 [xfs]
> > >  xfs_alloc_vextent+0x301/0x6c0 [xfs]
> > >  xfs_ialloc_ag_alloc+0x182/0x700 [xfs]
> > >  ? _xfs_trans_bjoin+0x72/0xf0 [xfs]
> > >  xfs_dialloc+0x116/0x290 [xfs]
> > >  xfs_ialloc+0x6d/0x5e0 [xfs]
> > >  ? xfs_log_reserve+0x165/0x280 [xfs]
> > >  xfs_dir_ialloc+0x8c/0x240 [xfs]
> > >  xfs_create+0x35a/0x610 [xfs]
> > >  xfs_generic_create+0x1f1/0x2f0 [xfs]
> > >  ...
> > > 
> > > Process B:
> > > Call trace:
> > >  ? __schedule+0x2bd/0x620
> > >  ? xfs_bmapi_allocate+0x245/0x380 [xfs]
> > >  schedule+0x33/0x90
> > >  schedule_timeout+0x17d/0x290
> > >  ? xfs_buf_find+0x1fd/0x6c0 [xfs]
> > >  __down_common+0xef/0x125
> > >  ? xfs_buf_get_map+0x37/0x230 [xfs]
> > >  ? xfs_buf_find+0x215/0x6c0 [xfs]
> > >  down+0x3b/0x50
> > >  xfs_buf_lock+0x34/0xf0 [xfs]
> > >  xfs_buf_find+0x215/0x6c0 [xfs]
> > >  xfs_buf_get_map+0x37/0x230 [xfs]
> > >  xfs_buf_read_map+0x29/0x190 [xfs]
> > >  xfs_trans_read_buf_map+0x13d/0x520 [xfs]
> > >  xfs_read_agi+0xa8/0x160 [xfs]
> > >  xfs_iunlink_remove+0x6f/0x2a0 [xfs]
> > >  ? current_time+0x46/0x80
> > >  ? xfs_trans_ichgtime+0x39/0xb0 [xfs]
> > >  xfs_rename+0x57a/0xae0 [xfs]
> > >  xfs_vn_rename+0xe4/0x150 [xfs]
> > >  ...
> > > 
> > > In this patch we make the unlinked list removal a deferred operation,
> > > i.e. log an iunlink remove intent and then do it after the RENAME_WHITEOUT
> > > transaction has committed, and the iunlink remove intention and done
> > > log items are provided.
> > > 
> > > Change the ordering of the operations in the xfs_rename() function
> > > to hold the AGF lock in the RENAME_WHITEOUT transaction and hold the
> > > AGI lock in it's own transaction to match that of the rest of the code.
> > > 
> > > Signed-off-by: kaixuxia <kaixuxia@tencent.com>
> > > ---
> > >  fs/xfs/Makefile                |   1 +
> > >  fs/xfs/libxfs/xfs_defer.c      |   1 +
> > >  fs/xfs/libxfs/xfs_defer.h      |   2 +
> > >  fs/xfs/libxfs/xfs_log_format.h |  27 ++-
> > >  fs/xfs/xfs_inode.c             |  36 +---
> > >  fs/xfs/xfs_inode.h             |   3 +
> > >  fs/xfs/xfs_iunlinkrm_item.c    | 458 +++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/xfs_iunlinkrm_item.h    |  67 ++++++
> > >  fs/xfs/xfs_log.c               |   2 +
> > >  fs/xfs/xfs_log_recover.c       | 148 +++++++++++++
> > >  fs/xfs/xfs_super.c             |  17 ++
> > >  fs/xfs/xfs_trans.h             |   2 +
> > >  12 files changed, 733 insertions(+), 31 deletions(-)
> > >  create mode 100644 fs/xfs/xfs_iunlinkrm_item.c
> > >  create mode 100644 fs/xfs/xfs_iunlinkrm_item.h
> > > 
> > > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> > > index 06b68b6..9d5012e 100644
> > > --- a/fs/xfs/Makefile
> > > +++ b/fs/xfs/Makefile
> > > @@ -106,6 +106,7 @@ xfs-y				+= xfs_log.o \
> > >  				   xfs_inode_item.o \
> > >  				   xfs_refcount_item.o \
> > >  				   xfs_rmap_item.o \
> > > +				   xfs_iunlinkrm_item.o \
> > >  				   xfs_log_recover.o \
> > >  				   xfs_trans_ail.o \
> > >  				   xfs_trans_buf.o
> > > diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
> > > index eb2be2a..a0f0a3d 100644
> > > --- a/fs/xfs/libxfs/xfs_defer.c
> > > +++ b/fs/xfs/libxfs/xfs_defer.c
> > > @@ -176,6 +176,7 @@
> > >  	[XFS_DEFER_OPS_TYPE_RMAP]	= &xfs_rmap_update_defer_type,
> > >  	[XFS_DEFER_OPS_TYPE_FREE]	= &xfs_extent_free_defer_type,
> > >  	[XFS_DEFER_OPS_TYPE_AGFL_FREE]	= &xfs_agfl_free_defer_type,
> > > +	[XFS_DEFER_OPS_TYPE_IUNRE]	= &xfs_iunlink_remove_defer_type,
> > >  };
> > > 
> > >  /*
> > > diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
> > > index 7c28d76..9e91a36 100644
> > > --- a/fs/xfs/libxfs/xfs_defer.h
> > > +++ b/fs/xfs/libxfs/xfs_defer.h
> > > @@ -17,6 +17,7 @@ enum xfs_defer_ops_type {
> > >  	XFS_DEFER_OPS_TYPE_RMAP,
> > >  	XFS_DEFER_OPS_TYPE_FREE,
> > >  	XFS_DEFER_OPS_TYPE_AGFL_FREE,
> > > +	XFS_DEFER_OPS_TYPE_IUNRE,
> > >  	XFS_DEFER_OPS_TYPE_MAX,
> > >  };
> > > 
> > > @@ -60,5 +61,6 @@ struct xfs_defer_op_type {
> > >  extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
> > >  extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
> > >  extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
> > > +extern const struct xfs_defer_op_type xfs_iunlink_remove_defer_type;
> > > 
> > >  #endif /* __XFS_DEFER_H__ */
> > > diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
> > > index e5f97c6..dc85b28 100644
> > > --- a/fs/xfs/libxfs/xfs_log_format.h
> > > +++ b/fs/xfs/libxfs/xfs_log_format.h
> > > @@ -117,7 +117,9 @@ struct xfs_unmount_log_format {
> > >  #define XLOG_REG_TYPE_CUD_FORMAT	24
> > >  #define XLOG_REG_TYPE_BUI_FORMAT	25
> > >  #define XLOG_REG_TYPE_BUD_FORMAT	26
> > > -#define XLOG_REG_TYPE_MAX		26
> > > +#define XLOG_REG_TYPE_IRI_FORMAT	27
> > > +#define XLOG_REG_TYPE_IRD_FORMAT	28
> > > +#define XLOG_REG_TYPE_MAX		28
> > > 
> > >  /*
> > >   * Flags to log operation header
> > > @@ -240,6 +242,8 @@ struct xfs_unmount_log_format {
> > >  #define	XFS_LI_CUD		0x1243
> > >  #define	XFS_LI_BUI		0x1244	/* bmbt update intent */
> > >  #define	XFS_LI_BUD		0x1245
> > > +#define	XFS_LI_IRI		0x1246	/* iunlink remove intent */
> > > +#define	XFS_LI_IRD		0x1247
> > > 
> > >  #define XFS_LI_TYPE_DESC \
> > >  	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
> > > @@ -255,7 +259,9 @@ struct xfs_unmount_log_format {
> > >  	{ XFS_LI_CUI,		"XFS_LI_CUI" }, \
> > >  	{ XFS_LI_CUD,		"XFS_LI_CUD" }, \
> > >  	{ XFS_LI_BUI,		"XFS_LI_BUI" }, \
> > > -	{ XFS_LI_BUD,		"XFS_LI_BUD" }
> > > +	{ XFS_LI_BUD,		"XFS_LI_BUD" }, \
> > > +	{ XFS_LI_IRI,		"XFS_LI_IRI" }, \
> > > +	{ XFS_LI_IRD,		"XFS_LI_IRD" }
> > > 
> > >  /*
> > >   * Inode Log Item Format definitions.
> > > @@ -773,6 +779,23 @@ struct xfs_bud_log_format {
> > >  };
> > > 
> > >  /*
> > > + * This is the structure used to lay out iri&ird log item in the log.
> > > + */
> > > +typedef struct xfs_iri_log_format {
> > > +	uint16_t		iri_type;	/* iri log item type */
> > > +	uint16_t		iri_size;	/* size of this item */
> > > +	uint64_t		iri_id;		/* id of corresponding iri */
> > > +	uint64_t		wip_ino;	/* inode number */
> > > +} xfs_iri_log_format_t;
> > > +
> > > +typedef struct xfs_ird_log_format {
> > > +	uint16_t		ird_type;	/* ird log item type */
> > > +	uint16_t		ird_size;	/* size of this item */
> > > +	uint64_t		ird_iri_id;	/* id of corresponding iri */
> > > +	uint64_t		wip_ino;	/* inode number */
> > > +} xfs_ird_log_format_t;
> > > +
> > > +/*
> > >   * Dquot Log format definitions.
> > >   *
> > >   * The first two fields must be the type and size fitting into
> > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > index 6467d5e..7bb3102 100644
> > > --- a/fs/xfs/xfs_inode.c
> > > +++ b/fs/xfs/xfs_inode.c
> > > @@ -35,6 +35,7 @@
> > >  #include "xfs_log.h"
> > >  #include "xfs_bmap_btree.h"
> > >  #include "xfs_reflink.h"
> > > +#include "xfs_iunlinkrm_item.h"
> > > 
> > >  kmem_zone_t *xfs_inode_zone;
> > > 
> > > @@ -46,7 +47,6 @@
> > > 
> > >  STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
> > >  STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
> > > -STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
> > > 
> > >  /*
> > >   * helper function to extract extent size hint from inode
> > > @@ -1110,7 +1110,7 @@
> > >  /*
> > >   * Increment the link count on an inode & log the change.
> > >   */
> > > -static void
> > > +void
> > >  xfs_bumplink(
> > >  	xfs_trans_t *tp,
> > >  	xfs_inode_t *ip)
> > > @@ -2406,7 +2406,7 @@ struct xfs_iunlink {
> > >  /*
> > >   * Pull the on-disk inode from the AGI unlinked list.
> > >   */
> > > -STATIC int
> > > +int
> > >  xfs_iunlink_remove(
> > >  	struct xfs_trans	*tp,
> > >  	struct xfs_inode	*ip)
> > > @@ -3261,8 +3261,6 @@ struct xfs_iunlink {
> > >  	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
> > >  	if (target_ip)
> > >  		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
> > > -	if (wip)
> > > -		xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
> > > 
> > >  	/*
> > >  	 * If we are using project inheritance, we only allow renames
> > > @@ -3417,35 +3415,15 @@ struct xfs_iunlink {
> > >  	if (error)
> > >  		goto out_trans_cancel;
> > > 
> > > -	/*
> > > -	 * For whiteouts, we need to bump the link count on the whiteout inode.
> > > -	 * This means that failures all the way up to this point leave the inode
> > > -	 * on the unlinked list and so cleanup is a simple matter of dropping
> > > -	 * the remaining reference to it. If we fail here after bumping the link
> > > -	 * count, we're shutting down the filesystem so we'll never see the
> > > -	 * intermediate state on disk.
> > > -	 */
> > > -	if (wip) {
> > > -		ASSERT(VFS_I(wip)->i_nlink == 0);
> > > -		xfs_bumplink(tp, wip);
> > > -		error = xfs_iunlink_remove(tp, wip);
> > > -		if (error)
> > > -			goto out_trans_cancel;
> > > -		xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
> > > -
> > > -		/*
> > > -		 * Now we have a real link, clear the "I'm a tmpfile" state
> > > -		 * flag from the inode so it doesn't accidentally get misused in
> > > -		 * future.
> > > -		 */
> > > -		VFS_I(wip)->i_state &= ~I_LINKABLE;
> > > -	}
> > > -
> > >  	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
> > >  	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
> > >  	if (new_parent)
> > >  		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
> > > 
> > > +	/* add the iunlink remove intent to the tp */
> > > +	if (wip)
> > > +		xfs_iunlink_remove_add(tp, wip);
> > > +
> > >  	error = xfs_finish_rename(tp);
> > >  	if (wip)
> > >  		xfs_irele(wip);
> > > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > > index 558173f..f8c30ca 100644
> > > --- a/fs/xfs/xfs_inode.h
> > > +++ b/fs/xfs/xfs_inode.h
> > > @@ -20,6 +20,7 @@
> > >  struct xfs_mount;
> > >  struct xfs_trans;
> > >  struct xfs_dquot;
> > > +struct xfs_trans;
> > > 
> > >  typedef struct xfs_inode {
> > >  	/* Inode linking and identification information. */
> > > @@ -414,6 +415,7 @@ enum layout_break_reason {
> > >  void		xfs_inactive(struct xfs_inode *ip);
> > >  int		xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
> > >  			   struct xfs_inode **ipp, struct xfs_name *ci_name);
> > > +void		xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
> > >  int		xfs_create(struct xfs_inode *dp, struct xfs_name *name,
> > >  			   umode_t mode, dev_t rdev, struct xfs_inode **ipp);
> > >  int		xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode,
> > > @@ -436,6 +438,7 @@ int		xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
> > >  uint		xfs_ilock_attr_map_shared(struct xfs_inode *);
> > > 
> > >  uint		xfs_ip2xflags(struct xfs_inode *);
> > > +int		xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
> > >  int		xfs_ifree(struct xfs_trans *, struct xfs_inode *);
> > >  int		xfs_itruncate_extents_flags(struct xfs_trans **,
> > >  				struct xfs_inode *, int, xfs_fsize_t, int);
> > > diff --git a/fs/xfs/xfs_iunlinkrm_item.c b/fs/xfs/xfs_iunlinkrm_item.c
> > > new file mode 100644
> > > index 0000000..4e38329
> > > --- /dev/null
> > > +++ b/fs/xfs/xfs_iunlinkrm_item.c
> > > @@ -0,0 +1,458 @@
> > > +// SPDX-License-Identifier: GPL-2.0+
> > > +/*
> > > + * Copyright (C) 2019 Tencent.  All Rights Reserved.
> > > + * Author: Kaixuxia <kaixuxia@tencent.com>
> > > + */
> > > +#include "xfs.h"
> > > +#include "xfs_fs.h"
> > > +#include "xfs_format.h"
> > > +#include "xfs_log_format.h"
> > > +#include "xfs_trans_resv.h"
> > > +#include "xfs_bit.h"
> > > +#include "xfs_shared.h"
> > > +#include "xfs_mount.h"
> > > +#include "xfs_defer.h"
> > > +#include "xfs_trans.h"
> > > +#include "xfs_trans_priv.h"
> > > +#include "xfs_log.h"
> > > +#include "xfs_alloc.h"
> > > +#include "xfs_inode.h"
> > > +#include "xfs_icache.h"
> > > +#include "xfs_iunlinkrm_item.h"
> > > +
> > > +kmem_zone_t	*xfs_iri_zone;
> > > +kmem_zone_t	*xfs_ird_zone;
> > > +
> > > +static inline struct xfs_iri_log_item *IRI_ITEM(struct xfs_log_item *lip)
> > > +{
> > > +	return container_of(lip, struct xfs_iri_log_item, iri_item);
> > > +}
> > > +
> > > +void
> > > +xfs_iri_item_free(
> > > +	struct xfs_iri_log_item *irip)
> > > +{
> > > +	kmem_zone_free(xfs_iri_zone, irip);
> > > +}
> > > +
> > > +/*
> > > + * Freeing the iri requires that we remove it from the AIL if it has already
> > > + * been placed there. However, the IRI may not yet have been placed in the AIL
> > > + * when called by xfs_iri_release() from IRD processing due to the ordering of
> > > + * committed vs unpin operations in bulk insert operations. Hence the reference
> > > + * count to ensure only the last caller frees the IRI.
> > > + */
> > > +void
> > > +xfs_iri_release(
> > > +	struct xfs_iri_log_item *irip)
> > > +{
> > > +	ASSERT(atomic_read(&irip->iri_refcount) > 0);
> > > +	if (atomic_dec_and_test(&irip->iri_refcount)) {
> > > +		xfs_trans_ail_remove(&irip->iri_item, SHUTDOWN_LOG_IO_ERROR);
> > > +		xfs_iri_item_free(irip);
> > > +	}
> > > +}
> > > +
> > > +static inline int
> > > +xfs_iri_item_sizeof(
> > > +	struct xfs_iri_log_item *irip)
> > > +{
> > > +	return sizeof(struct xfs_iri_log_format);
> > > +}
> > > +
> > > +STATIC void
> > > +xfs_iri_item_size(
> > > +	struct xfs_log_item	*lip,
> > > +	int			*nvecs,
> > > +	int			*nbytes)
> > > +{
> > > +	*nvecs += 1;
> > > +	*nbytes += xfs_iri_item_sizeof(IRI_ITEM(lip));
> > > +}
> > > +
> > > +STATIC void
> > > +xfs_iri_item_format(
> > > +	struct xfs_log_item	*lip,
> > > +	struct xfs_log_vec	*lv)
> > > +{
> > > +	struct xfs_iri_log_item	*irip = IRI_ITEM(lip);
> > > +	struct xfs_log_iovec	*vecp = NULL;
> > > +
> > > +	irip->iri_format.iri_type = XFS_LI_IRI;
> > > +	irip->iri_format.iri_size = 1;
> > > +
> > > +	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_IRI_FORMAT,
> > > +			&irip->iri_format,
> > > +			xfs_iri_item_sizeof(irip));
> > > +}
> > > +
> > > +/*
> > > + * The unpin operation is the last place an IRI is manipulated in the log. It is
> > > + * either inserted in the AIL or aborted in the event of a log I/O error. In
> > > + * either case, the IRI transaction has been successfully committed to make it
> > > + * this far. Therefore, we expect whoever committed the IRI to either construct
> > > + * and commit the IRD or drop the IRD's reference in the event of error. Simply
> > > + * drop the log's IRI reference now that the log is done with it.
> > > + */
> > > +STATIC void
> > > +xfs_iri_item_unpin(
> > > +	struct xfs_log_item	*lip,
> > > +	int			remove)
> > > +{
> > > +	struct xfs_iri_log_item *irip = IRI_ITEM(lip);
> > > +	xfs_iri_release(irip);
> > > +}
> > > +
> > > +/*
> > > + * The IRI has been either committed or aborted if the transaction has been
> > > + * cancelled. If the transaction was cancelled, an IRD isn't going to be
> > > + * constructed and thus we free the IRI here directly.
> > > + */
> > > +STATIC void
> > > +xfs_iri_item_release(
> > > +	struct xfs_log_item     *lip)
> > > +{
> > > +	xfs_iri_release(IRI_ITEM(lip));
> > > +}
> > > +
> > > +/*
> > > + * This is the ops vector shared by all iri log items.
> > > + */
> > > +static const struct xfs_item_ops xfs_iri_item_ops = {
> > > +	.iop_size	= xfs_iri_item_size,
> > > +	.iop_format	= xfs_iri_item_format,
> > > +	.iop_unpin	= xfs_iri_item_unpin,
> > > +	.iop_release	= xfs_iri_item_release,
> > > +};
> > > +
> > > +/*
> > > + * Allocate and initialize an iri item with the given wip ino.
> > > + */
> > > +struct xfs_iri_log_item *
> > > +xfs_iri_init(struct xfs_mount  *mp,
> > > +	     uint		count)
> > > +{
> > > +	struct xfs_iri_log_item *irip;
> > > +
> > > +	irip = kmem_zone_zalloc(xfs_iri_zone, KM_SLEEP);
> > > +
> > > +	xfs_log_item_init(mp, &irip->iri_item, XFS_LI_IRI, &xfs_iri_item_ops);
> > > +	irip->iri_format.iri_id = (uintptr_t)(void *)irip;
> > > +	atomic_set(&irip->iri_refcount, 2);
> > > +
> > > +	return irip;
> > > +}
> > > +
> > > +static inline struct xfs_ird_log_item *IRD_ITEM(struct xfs_log_item *lip)
> > > +{
> > > +	return container_of(lip, struct xfs_ird_log_item, ird_item);
> > > +}
> > > +
> > > +STATIC void
> > > +xfs_ird_item_free(struct xfs_ird_log_item *irdp)
> > > +{
> > > +	kmem_zone_free(xfs_ird_zone, irdp);
> > > +}
> > > +
> > > +/*
> > > + * This returns the number of iovecs needed to log the given ird item.
> > > + * We only need 1 iovec for an ird item.  It just logs the ird_log_format
> > > + * structure.
> > > + */
> > > +STATIC void
> > > +xfs_ird_item_size(
> > > +	struct xfs_log_item	*lip,
> > > +	int			*nvecs,
> > > +	int			*nbytes)
> > > +{
> > > +	*nvecs += 1;
> > > +	*nbytes += sizeof(struct xfs_ird_log_format);
> > > +}
> > > +
> > > +STATIC void
> > > +xfs_ird_item_format(
> > > +	struct xfs_log_item	*lip,
> > > +	struct xfs_log_vec	*lv)
> > > +{
> > > +	struct xfs_ird_log_item *irdp = IRD_ITEM(lip);
> > > +	struct xfs_log_iovec	*vecp = NULL;
> > > +
> > > +	irdp->ird_format.ird_type = XFS_LI_IRD;
> > > +	irdp->ird_format.ird_size = 1;
> > > +
> > > +	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_IRD_FORMAT, &irdp->ird_format,
> > > +			sizeof(struct xfs_ird_log_format));
> > > +}
> > > +
> > > +/*
> > > + * The IRD is either committed or aborted if the transaction is cancelled. If
> > > + * the transaction is cancelled, drop our reference to the IRI and free the
> > > + * IRD.
> > > + */
> > > +STATIC void
> > > +xfs_ird_item_release(
> > > +	struct xfs_log_item	*lip)
> > > +{
> > > +	struct xfs_ird_log_item	*irdp = IRD_ITEM(lip);
> > > +
> > > +	xfs_iri_release(irdp->ird_irip);
> > > +	xfs_ird_item_free(irdp);
> > > +}
> > > +
> > > +static const struct xfs_item_ops xfs_ird_item_ops = {
> > > +	.flags		= XFS_ITEM_RELEASE_WHEN_COMMITTED,
> > > +	.iop_size	= xfs_ird_item_size,
> > > +	.iop_format	= xfs_ird_item_format,
> > > +	.iop_release	= xfs_ird_item_release,
> > > +};
> > > +
> > > +static struct xfs_ird_log_item *
> > > +xfs_trans_get_ird(
> > > +	struct xfs_trans		*tp,
> > > +	struct xfs_iri_log_item		*irip)
> > > +{
> > > +	xfs_ird_log_item_t	*irdp;
> > > +
> > > +	ASSERT(tp != NULL);
> > > +
> > > +	irdp = kmem_zone_zalloc(xfs_ird_zone, KM_SLEEP);
> > > +	xfs_log_item_init(tp->t_mountp, &irdp->ird_item, XFS_LI_IRD,
> > > +			  &xfs_ird_item_ops);
> > > +	irdp->ird_irip = irip;
> > > +	irdp->ird_format.wip_ino = irip->iri_format.wip_ino;
> > > +	irdp->ird_format.ird_iri_id = irip->iri_format.iri_id;
> > > +
> > > +	xfs_trans_add_item(tp, &irdp->ird_item);
> > > +	return irdp;
> > > +}
> > > +
> > > +/* record a iunlink remove intent */
> > > +int
> > > +xfs_iunlink_remove_add(
> > > +	struct xfs_trans	*tp,
> > > +	struct xfs_inode	*wip)
> > > +{
> > > +	struct xfs_iunlink_remove_intent	*ii;
> > > +
> > > +	ii = kmem_alloc(sizeof(struct xfs_iunlink_remove_intent),
> > > +			KM_SLEEP | KM_NOFS);
> > > +	INIT_LIST_HEAD(&ii->ri_list);
> > > +	ii->wip = wip;
> > > +
> > > +	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_IUNRE, &ii->ri_list);
> > > +	return 0;
> > > +}
> > > +
> > > +/* Sort iunlink remove intents by AG. */
> > > +static int
> > > +xfs_iunlink_remove_diff_items(
> > > +	void				*priv,
> > > +	struct list_head		*a,
> > > +	struct list_head		*b)
> > > +{
> > > +	struct xfs_mount			*mp = priv;
> > > +	struct xfs_iunlink_remove_intent	*ra;
> > > +	struct xfs_iunlink_remove_intent	*rb;
> > > +
> > > +	ra = container_of(a, struct xfs_iunlink_remove_intent, ri_list);
> > > +	rb = container_of(b, struct xfs_iunlink_remove_intent, ri_list);
> > > +	return	XFS_INO_TO_AGNO(mp, ra->wip->i_ino) -
> > > +		XFS_INO_TO_AGNO(mp, rb->wip->i_ino);
> > > +}
> > > +
> > > +/* Get an IRI */
> > > +STATIC void *
> > > +xfs_iunlink_remove_create_intent(
> > > +	struct xfs_trans		*tp,
> > > +	unsigned int			count)
> > > +{
> > > +	struct xfs_iri_log_item		*irip;
> > > +
> > > +	ASSERT(tp != NULL);
> > > +	ASSERT(count == 1);
> > > +
> > > +	irip = xfs_iri_init(tp->t_mountp, count);
> > > +	ASSERT(irip != NULL);
> > > +
> > > +	/*
> > > +	 * Get a log_item_desc to point at the new item.
> > > +	 */
> > > +	xfs_trans_add_item(tp, &irip->iri_item);
> > > +	return irip;
> > > +}
> > > +
> > > +/* Log a iunlink remove to the intent item. */
> > > +STATIC void
> > > +xfs_iunlink_remove_log_item(
> > > +	struct xfs_trans		*tp,
> > > +	void				*intent,
> > > +	struct list_head		*item)
> > > +{
> > > +	struct xfs_iri_log_item			*irip = intent;
> > > +	struct xfs_iunlink_remove_intent	*iunre;
> > > +
> > > +	iunre = container_of(item, struct xfs_iunlink_remove_intent, ri_list);
> > > +
> > > +	tp->t_flags |= XFS_TRANS_DIRTY;
> > > +	set_bit(XFS_LI_DIRTY, &irip->iri_item.li_flags);
> > > +
> > > +	irip->iri_format.wip_ino = (uint64_t)(iunre->wip->i_ino);
> > > +}
> > > +
> > > +/* Get an IRD so we can process all the deferred iunlink remove. */
> > > +STATIC void *
> > > +xfs_iunlink_remove_create_done(
> > > +	struct xfs_trans		*tp,
> > > +	void				*intent,
> > > +	unsigned int			count)
> > > +{
> > > +	return xfs_trans_get_ird(tp, intent);
> > > +}
> > > +
> > > +/*
> > > + * For whiteouts, we need to bump the link count on the whiteout inode.
> > > + * This means that failures all the way up to this point leave the inode
> > > + * on the unlinked list and so cleanup is a simple matter of dropping
> > > + * the remaining reference to it. If we fail here after bumping the link
> > > + * count, we're shutting down the filesystem so we'll never see the
> > > + * intermediate state on disk.
> > > + */
> > > +static int
> > > +xfs_trans_log_finish_iunlink_remove(
> > > +	struct xfs_trans		*tp,
> > > +	struct xfs_ird_log_item		*irdp,
> > > +	struct xfs_inode		*wip)
> > > +{
> > > +	int 	error;
> > > +
> > > +	ASSERT(xfs_isilocked(wip, XFS_ILOCK_EXCL));
> > > +
> > > +	xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
> > > +
> > > +	ASSERT(VFS_I(wip)->i_nlink == 0);
> > > +	xfs_bumplink(tp, wip);
> > > +	error = xfs_iunlink_remove(tp, wip);
> > > +	xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
> > > +	/*
> > > +	 * Now we have a real link, clear the "I'm a tmpfile" state
> > > +	 * flag from the inode so it doesn't accidentally get misused in
> > > +	 * future.
> > > +	 */
> > > +	VFS_I(wip)->i_state &= ~I_LINKABLE;
> > > +
> > > +	/*
> > > +	 * Mark the transaction dirty, even on error. This ensures the
> > > +	 * transaction is aborted, which:
> > > +	 *
> > > +	 * 1.) releases the IRI and frees the IRD
> > > +	 * 2.) shuts down the filesystem
> > > +	 */
> > > +	tp->t_flags |= XFS_TRANS_DIRTY;
> > > +	set_bit(XFS_LI_DIRTY, &irdp->ird_item.li_flags);
> > > +
> > > +	return error;
> > > +}
> > > +
> > > +/* Process a deferred iunlink remove. */
> > > +STATIC int
> > > +xfs_iunlink_remove_finish_item(
> > > +	struct xfs_trans		*tp,
> > > +	struct list_head		*item,
> > > +	void				*done_item,
> > > +	void				**state)
> > > +{
> > > +	struct xfs_iunlink_remove_intent	*iunre;
> > > +	int					error;
> > > +
> > > +	iunre = container_of(item, struct xfs_iunlink_remove_intent, ri_list);
> > > +	error = xfs_trans_log_finish_iunlink_remove(tp, done_item,
> > > +			iunre->wip);
> > > +	kmem_free(iunre);
> > > +	return error;
> > > +}
> > > +
> > > +/* Abort all pending IRIs. */
> > > +STATIC void
> > > +xfs_iunlink_remove_abort_intent(
> > > +	void		*intent)
> > > +{
> > > +	xfs_iri_release(intent);
> > > +}
> > > +
> > > +/* Cancel a deferred iunlink remove. */
> > > +STATIC void
> > > +xfs_iunlink_remove_cancel_item(
> > > +	struct list_head		*item)
> > > +{
> > > +	struct xfs_iunlink_remove_intent	*iunre;
> > > +
> > > +	iunre = container_of(item, struct xfs_iunlink_remove_intent, ri_list);
> > > +	kmem_free(iunre);
> > > +}
> > > +
> > > +const struct xfs_defer_op_type xfs_iunlink_remove_defer_type = {
> > > +	.diff_items	= xfs_iunlink_remove_diff_items,
> > > +	.create_intent	= xfs_iunlink_remove_create_intent,
> > > +	.abort_intent	= xfs_iunlink_remove_abort_intent,
> > > +	.log_item	= xfs_iunlink_remove_log_item,
> > > +	.create_done	= xfs_iunlink_remove_create_done,
> > > +	.finish_item	= xfs_iunlink_remove_finish_item,
> > > +	.cancel_item	= xfs_iunlink_remove_cancel_item,
> > > +};
> > > +
> > > +/*
> > > + * Process a iunlink remove intent item that was recovered from the log.
> > > + */
> > > +int
> > > +xfs_iri_recover(
> > > +	struct xfs_trans		*parent_tp,
> > > +	struct xfs_iri_log_item		*irip)
> > > +{
> > > +	int				error = 0;
> > > +	struct xfs_trans		*tp;
> > > +	xfs_ino_t			ino;
> > > +	struct xfs_inode		*ip;
> > > +	struct xfs_mount		*mp = parent_tp->t_mountp;
> > > +	struct xfs_ird_log_item		*irdp;
> > > +
> > > +	ASSERT(!test_bit(XFS_IRI_RECOVERED, &irip->iri_flags));
> > > +
> > > +	ino = irip->iri_format.wip_ino;
> > > +	if (ino == NULLFSINO || !xfs_verify_dir_ino(mp, ino)) {
> > > +		xfs_alert(mp, "IRI recover used bad inode ino 0x%llx!", ino);
> > > +		set_bit(XFS_IRI_RECOVERED, &irip->iri_flags);
> > > +		xfs_iri_release(irip);
> > > +		return -EIO;
> > > +	}
> > > +	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
> > > +	if (error)
> > > +		return error;
> > > +	irdp = xfs_trans_get_ird(tp, irip);
> > > +
> > > +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> > > +	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
> > > +
> > > +	ASSERT(VFS_I(ip)->i_nlink == 0);
> > > +	VFS_I(ip)->i_state |= I_LINKABLE;
> > > +	xfs_bumplink(tp, ip);
> > > +	error = xfs_iunlink_remove(tp, ip);
> > > +	if (error)
> > > +		goto abort_error;
> > > +	VFS_I(ip)->i_state &= ~I_LINKABLE;
> > > +	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> > > +
> > > +	tp->t_flags |= XFS_TRANS_DIRTY;
> > > +	set_bit(XFS_LI_DIRTY, &irdp->ird_item.li_flags);
> > > +
> > > +	set_bit(XFS_IRI_RECOVERED, &irip->iri_flags);
> > > +	error = xfs_trans_commit(tp);
> > > +	return error;
> > > +
> > > +abort_error:
> > > +	xfs_trans_cancel(tp);
> > > +	return error;
> > > +}
> > > diff --git a/fs/xfs/xfs_iunlinkrm_item.h b/fs/xfs/xfs_iunlinkrm_item.h
> > > new file mode 100644
> > > index 0000000..54c4ca3
> > > --- /dev/null
> > > +++ b/fs/xfs/xfs_iunlinkrm_item.h
> > > @@ -0,0 +1,67 @@
> > > +// SPDX-License-Identifier: GPL-2.0+
> > > +/*
> > > + * Copyright (C) 2019 Tencent.  All Rights Reserved.
> > > + * Author: Kaixuxia <kaixuxia@tencent.com>
> > > + */
> > > +#ifndef	__XFS_IUNLINKRM_ITEM_H__
> > > +#define	__XFS_IUNLINKRM_ITEM_H__
> > > +
> > > +/*
> > > + * When performing rename operation with RENAME_WHITEOUT flag, we will hold AGF lock to
> > > + * allocate or free extents in manipulating the dirents firstly, and then doing the
> > > + * xfs_iunlink_remove() call last to hold AGI lock to modify the tmpfile info, so we the
> > > + * lock order AGI->AGF.
> > > + *
> > > + * The big problem here is that we have an ordering constraint on AGF and AGI locking -
> > > + * inode allocation locks the AGI, then can allocate a new extent for new inodes, locking
> > > + * the AGF after the AGI. Hence the ordering that is imposed by other parts of the code
> > > + * is AGI before AGF. So we get the ABBA agi&agf deadlock here.
> > > + *
> > > + * So make the unlinked list removal a deferred operation, i.e. log an iunlink remove
> > > + * intent and then do it after the RENAME_WHITEOUT transaction has committed; the
> > > + * iunlink remove intent (IRI) and done (IRD) log items are provided for this.
> > > + */
> > > +
> > > +/* kernel only IRI/IRD definitions */
> > > +
> > > +struct xfs_mount;
> > > +struct kmem_zone;
> > > +struct xfs_inode;
> > > +
> > > +/*
> > > + * Define IRI flag bits. Manipulated by set/clear/test_bit operators.
> > > + */
> > > +#define	XFS_IRI_RECOVERED		1
> > > +
> > > +/* This is the "iunlink remove intention" log item. It is used in conjunction
> > > + * with the "iunlink remove done" log item described below.
> > > + */
> > > +typedef struct xfs_iri_log_item {
> > > +	struct xfs_log_item	iri_item;
> > > +	atomic_t		iri_refcount;
> > > +	unsigned long		iri_flags;
> > > +	xfs_iri_log_format_t	iri_format;
> > > +} xfs_iri_log_item_t;
> > > +
> > > +/* This is the "iunlink remove done" log item. */
> > > +typedef struct xfs_ird_log_item {
> > > +	struct xfs_log_item	ird_item;
> > > +	xfs_iri_log_item_t	*ird_irip;
> > > +	xfs_ird_log_format_t	ird_format;
> > > +} xfs_ird_log_item_t;
> > > +
> > > +struct xfs_iunlink_remove_intent {
> > > +	struct list_head		ri_list;
> > > +	struct xfs_inode		*wip;
> > > +};
> > > +
> > > +extern struct kmem_zone	*xfs_iri_zone;
> > > +extern struct kmem_zone	*xfs_ird_zone;
> > > +
> > > +struct xfs_iri_log_item	*xfs_iri_init(struct xfs_mount *, uint);
> > > +void xfs_iri_item_free(struct xfs_iri_log_item *);
> > > +void xfs_iri_release(struct xfs_iri_log_item *);
> > > +int xfs_iri_recover(struct xfs_trans *, struct xfs_iri_log_item *);
> > > +int xfs_iunlink_remove_add(struct xfs_trans *, struct xfs_inode *);
> > > +
> > > +#endif	/* __XFS_IUNLINKRM_ITEM_H__ */
> > > diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> > > index 00e9f5c..f87f510 100644
> > > --- a/fs/xfs/xfs_log.c
> > > +++ b/fs/xfs/xfs_log.c
> > > @@ -2005,6 +2005,8 @@ STATIC void xlog_state_done_syncing(
> > >  	    REG_TYPE_STR(CUD_FORMAT, "cud_format"),
> > >  	    REG_TYPE_STR(BUI_FORMAT, "bui_format"),
> > >  	    REG_TYPE_STR(BUD_FORMAT, "bud_format"),
> > > +	    REG_TYPE_STR(IRI_FORMAT, "iri_format"),
> > > +	    REG_TYPE_STR(IRD_FORMAT, "ird_format"),
> > >  	};
> > >  	BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
> > >  #undef REG_TYPE_STR
> > > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> > > index 13d1d3e..a916f40 100644
> > > --- a/fs/xfs/xfs_log_recover.c
> > > +++ b/fs/xfs/xfs_log_recover.c
> > > @@ -33,6 +33,7 @@
> > >  #include "xfs_buf_item.h"
> > >  #include "xfs_refcount_item.h"
> > >  #include "xfs_bmap_item.h"
> > > +#include "xfs_iunlinkrm_item.h"
> > > 
> > >  #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
> > > 
> > > @@ -1885,6 +1886,8 @@ struct xfs_buf_cancel {
> > >  		case XFS_LI_CUD:
> > >  		case XFS_LI_BUI:
> > >  		case XFS_LI_BUD:
> > > +		case XFS_LI_IRI:
> > > +		case XFS_LI_IRD:
> > >  			trace_xfs_log_recover_item_reorder_tail(log,
> > >  							trans, item, pass);
> > >  			list_move_tail(&item->ri_list, &inode_list);
> > > @@ -3752,6 +3755,96 @@ struct xfs_buf_cancel {
> > >  }
> > > 
> > >  /*
> > > + * This routine is called to create an in-core iunlink remove intent
> > > + * item from the iri format structure which was logged on disk.
> > > + * It allocates an in-core iri, copies the inode from the format
> > > + * structure into it, and adds the iri to the AIL with the given
> > > + * LSN.
> > > + */
> > > +STATIC int
> > > +xlog_recover_iri_pass2(
> > > +	struct xlog			*log,
> > > +	struct xlog_recover_item	*item,
> > > +	xfs_lsn_t			lsn)
> > > +{
> > > +	xfs_mount_t		*mp = log->l_mp;
> > > +	xfs_iri_log_item_t	*irip;
> > > +	xfs_iri_log_format_t	*iri_formatp;
> > > +
> > > +	iri_formatp = item->ri_buf[0].i_addr;
> > > +	if (item->ri_buf[0].i_len != sizeof(xfs_iri_log_format_t))
> > > +		return -EFSCORRUPTED;
> > > +
> > > +	irip = xfs_iri_init(mp, 1);
> > > +	irip->iri_format = *iri_formatp;
> > > +
> > > +	spin_lock(&log->l_ailp->ail_lock);
> > > +	/*
> > > +	 * The IRI has two references: one for the IRD and one for the IRI to ensure
> > > +	 * it makes it into the AIL. Insert the IRI into the AIL directly and
> > > +	 * drop the IRI reference. Note that xfs_trans_ail_update() drops the
> > > +	 * AIL lock.
> > > +	 */
> > > +	xfs_trans_ail_update(log->l_ailp, &irip->iri_item, lsn);
> > > +	xfs_iri_release(irip);
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * This routine is called when an IRD format structure is found in a committed
> > > + * transaction in the log. Its purpose is to cancel the corresponding IRI if it
> > > + * was still in the log. To do this it searches the AIL for the IRI with an id
> > > + * equal to that in the IRD format structure. If we find it we drop the IRD
> > > + * reference, which removes the IRI from the AIL and frees it.
> > > + */
> > > +STATIC int
> > > +xlog_recover_ird_pass2(
> > > +	struct xlog			*log,
> > > +	struct xlog_recover_item	*item)
> > > +{
> > > +	xfs_ird_log_format_t	*ird_formatp;
> > > +	xfs_iri_log_item_t	*irip = NULL;
> > > +	struct xfs_log_item	*lip;
> > > +	uint64_t		iri_id;
> > > +	struct xfs_ail_cursor	cur;
> > > +	struct xfs_ail		*ailp = log->l_ailp;
> > > +
> > > +	ird_formatp = item->ri_buf[0].i_addr;
> > > +	if (item->ri_buf[0].i_len != sizeof(xfs_ird_log_format_t))
> > > +		return -EFSCORRUPTED;
> > > +	iri_id = ird_formatp->ird_iri_id;
> > > +
> > > +	/*
> > > +	 * Search for the iri with the id in the ird format structure
> > > +	 * in the AIL.
> > > +	 */
> > > +	spin_lock(&ailp->ail_lock);
> > > +	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
> > > +	while (lip != NULL) {
> > > +		if (lip->li_type == XFS_LI_IRI) {
> > > +			irip = (xfs_iri_log_item_t *)lip;
> > > +			if (irip->iri_format.iri_id == iri_id) {
> > > +				/*
> > > +				 * Drop the IRD reference to the IRI. This
> > > +				 * removes the IRI from the AIL and frees it.
> > > +				 */
> > > +				spin_unlock(&ailp->ail_lock);
> > > +				xfs_iri_release(irip);
> > > +				spin_lock(&ailp->ail_lock);
> > > +				break;
> > > +			}
> > > +		}
> > > +		lip = xfs_trans_ail_cursor_next(ailp, &cur);
> > > +	}
> > > +	xfs_trans_ail_cursor_done(&cur);
> > > +	spin_unlock(&ailp->ail_lock);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > >   * This routine is called when an inode create format structure is found in a
> > >   * committed transaction in the log.  It's purpose is to initialise the inodes
> > >   * being allocated on disk. This requires us to get inode cluster buffers that
> > > @@ -3981,6 +4074,8 @@ struct xfs_buf_cancel {
> > >  	case XFS_LI_CUD:
> > >  	case XFS_LI_BUI:
> > >  	case XFS_LI_BUD:
> > > +	case XFS_LI_IRI:
> > > +	case XFS_LI_IRD:
> > >  	default:
> > >  		break;
> > >  	}
> > > @@ -4010,6 +4105,8 @@ struct xfs_buf_cancel {
> > >  	case XFS_LI_CUD:
> > >  	case XFS_LI_BUI:
> > >  	case XFS_LI_BUD:
> > > +	case XFS_LI_IRI:
> > > +	case XFS_LI_IRD:
> > >  		/* nothing to do in pass 1 */
> > >  		return 0;
> > >  	default:
> > > @@ -4052,6 +4149,10 @@ struct xfs_buf_cancel {
> > >  		return xlog_recover_bui_pass2(log, item, trans->r_lsn);
> > >  	case XFS_LI_BUD:
> > >  		return xlog_recover_bud_pass2(log, item);
> > > +	case XFS_LI_IRI:
> > > +		return xlog_recover_iri_pass2(log, item, trans->r_lsn);
> > > +	case XFS_LI_IRD:
> > > +		return xlog_recover_ird_pass2(log, item);
> > >  	case XFS_LI_DQUOT:
> > >  		return xlog_recover_dquot_pass2(log, buffer_list, item,
> > >  						trans->r_lsn);
> > > @@ -4721,6 +4822,46 @@ struct xfs_buf_cancel {
> > >  	spin_lock(&ailp->ail_lock);
> > >  }
> > > 
> > > +/* Recover the IRI if necessary. */
> > > +STATIC int
> > > +xlog_recover_process_iri(
> > > +	struct xfs_trans		*parent_tp,
> > > +	struct xfs_ail			*ailp,
> > > +	struct xfs_log_item		*lip)
> > > +{
> > > +	struct xfs_iri_log_item		*irip;
> > > +	int				error;
> > > +
> > > +	/*
> > > +	 * Skip IRIs that we've already processed.
> > > +	 */
> > > +	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
> > > +	if (test_bit(XFS_IRI_RECOVERED, &irip->iri_flags))
> > > +		return 0;
> > > +
> > > +	spin_unlock(&ailp->ail_lock);
> > > +	error = xfs_iri_recover(parent_tp, irip);
> > > +	spin_lock(&ailp->ail_lock);
> > > +
> > > +	return error;
> > > +}
> > > +
> > > +/* Release the IRI since we're cancelling everything. */
> > > +STATIC void
> > > +xlog_recover_cancel_iri(
> > > +	struct xfs_mount		*mp,
> > > +	struct xfs_ail			*ailp,
> > > +	struct xfs_log_item		*lip)
> > > +{
> > > +	struct xfs_iri_log_item         *irip;
> > > +
> > > +	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
> > > +
> > > +	spin_unlock(&ailp->ail_lock);
> > > +	xfs_iri_release(irip);
> > > +	spin_lock(&ailp->ail_lock);
> > > +}
> > > +
> > >  /* Is this log item a deferred action intent? */
> > >  static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
> > >  {
> > > @@ -4729,6 +4870,7 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
> > >  	case XFS_LI_RUI:
> > >  	case XFS_LI_CUI:
> > >  	case XFS_LI_BUI:
> > > +	case XFS_LI_IRI:
> > >  		return true;
> > >  	default:
> > >  		return false;
> > > @@ -4856,6 +4998,9 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
> > >  		case XFS_LI_BUI:
> > >  			error = xlog_recover_process_bui(parent_tp, ailp, lip);
> > >  			break;
> > > +		case XFS_LI_IRI:
> > > +			error = xlog_recover_process_iri(parent_tp, ailp, lip);
> > > +			break;
> > >  		}
> > >  		if (error)
> > >  			goto out;
> > > @@ -4912,6 +5057,9 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
> > >  		case XFS_LI_BUI:
> > >  			xlog_recover_cancel_bui(log->l_mp, ailp, lip);
> > >  			break;
> > > +		case XFS_LI_IRI:
> > > +			xlog_recover_cancel_iri(log->l_mp, ailp, lip);
> > > +			break;
> > >  		}
> > > 
> > >  		lip = xfs_trans_ail_cursor_next(ailp, &cur);
> > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > > index f945023..66742b7 100644
> > > --- a/fs/xfs/xfs_super.c
> > > +++ b/fs/xfs/xfs_super.c
> > > @@ -34,6 +34,7 @@
> > >  #include "xfs_rmap_item.h"
> > >  #include "xfs_refcount_item.h"
> > >  #include "xfs_bmap_item.h"
> > > +#include "xfs_iunlinkrm_item.h"
> > >  #include "xfs_reflink.h"
> > > 
> > >  #include <linux/magic.h>
> > > @@ -1957,8 +1958,22 @@ struct proc_xfs_info {
> > >  	if (!xfs_bui_zone)
> > >  		goto out_destroy_bud_zone;
> > > 
> > > +	xfs_ird_zone = kmem_zone_init(sizeof(xfs_ird_log_item_t),
> > > +			"xfs_ird_item");
> > > +	if (!xfs_ird_zone)
> > > +		goto out_destroy_bui_zone;
> > > +
> > > +	xfs_iri_zone = kmem_zone_init(sizeof(xfs_iri_log_item_t),
> > > +			"xfs_iri_item");
> > > +	if (!xfs_iri_zone)
> > > +		goto out_destroy_ird_zone;
> > > +
> > >  	return 0;
> > > 
> > > + out_destroy_ird_zone:
> > > +	kmem_zone_destroy(xfs_ird_zone);
> > > + out_destroy_bui_zone:
> > > +	kmem_zone_destroy(xfs_bui_zone);
> > >   out_destroy_bud_zone:
> > >  	kmem_zone_destroy(xfs_bud_zone);
> > >   out_destroy_cui_zone:
> > > @@ -2007,6 +2022,8 @@ struct proc_xfs_info {
> > >  	 * destroy caches.
> > >  	 */
> > >  	rcu_barrier();
> > > +	kmem_zone_destroy(xfs_iri_zone);
> > > +	kmem_zone_destroy(xfs_ird_zone);
> > >  	kmem_zone_destroy(xfs_bui_zone);
> > >  	kmem_zone_destroy(xfs_bud_zone);
> > >  	kmem_zone_destroy(xfs_cui_zone);
> > > diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
> > > index 64d7f17..dd63eaa 100644
> > > --- a/fs/xfs/xfs_trans.h
> > > +++ b/fs/xfs/xfs_trans.h
> > > @@ -26,6 +26,8 @@
> > >  struct xfs_cud_log_item;
> > >  struct xfs_bui_log_item;
> > >  struct xfs_bud_log_item;
> > > +struct xfs_iri_log_item;
> > > +struct xfs_ird_log_item;
> > > 
> > >  struct xfs_log_item {
> > >  	struct list_head		li_ail;		/* AIL pointers */
> > > -- 
> > > 1.8.3.1
> > > 
> > > -- 
> > > kaixuxia
Kaixu Xia Aug. 14, 2019, 2:32 a.m. UTC | #4
On 2019/8/13 22:20, Darrick J. Wong wrote:
> On Tue, Aug 13, 2019 at 09:36:14AM -0400, Brian Foster wrote:
>> On Tue, Aug 13, 2019 at 07:17:33PM +0800, kaixuxia wrote:
>>> When performing a rename operation with the RENAME_WHITEOUT flag, we
>>> first hold the AGF lock to allocate or free extents while manipulating
>>> the dirents, and then make the xfs_iunlink_remove() call last, which
>>> takes the AGI lock to modify the tmpfile info, so we end up with the
>>> lock order AGF->AGI.
>>>
>>
>> IIUC, the whiteout use case is that we're renaming a file, but the
>> source dentry must be replaced with a magic whiteout inode rather than
>> be removed. Therefore, xfs_rename() allocates the whiteout inode as a
>> tmpfile first in a separate transaction, updates the target dentry with
>> the source inode, replaces the source dentry to point to the whiteout
>> inode and finally removes the whiteout inode from the unlinked list
>> (since it is a tmpfile). This leads to the problem described below
>> because the rename transaction ends up doing directory block allocs
>> (locking the AGF) followed by the unlinked list remove (locking the
>> AGI).
>>
>> My understanding from reading the code is that this is primarily to
>> cleanly handle error scenarios. If anything fails after we've allocated
>> the whiteout tmpfile, it's simply left on the unlinked list and so the
>> filesystem remains in a consistent/recoverable state. Given that, the
>> solution here seems like overkill to me. For one, I thought background
>> unlinked list removal was already on our roadmap (Darrick might have
>> been looking at that and may already have a prototype as well). Also,
>> unlinked list removal occurs at log recovery time already. That's
>> somewhat of an existing purpose of the list, which makes a deferred
>> unlinked list removal operation superfluous in more traditional cases
>> where unlinked list removal doesn't require consistency with a directory
>> operation.
> 
> Not to mention this doesn't fix the problem for existing filesystems,
> because adding new log item types changes the on-disk log format and
> therefore requires a log incompat feature bit to prevent old kernels
> from trying to recover the log.

Yeah, right, we need a log incompat feature bit, otherwise old kernels
would hit an unrecognized log operation type error and log recovery would
fail. You know, this bug has been in xfs for many years, so we really need
to find a solution, whether this deferred unlinked list removal operation
or another approach, like expanding the directory in one transaction as
mentioned below.
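
A minimal sketch of what such a gate could look like, assuming a
hypothetical XFS_SB_FEAT_INCOMPAT_LOG_IUNRE bit for the new IRI/IRD item
types (xfs_sb_has_incompat_log_feature() is the existing v5 superblock
helper; the mount-time wiring to set and clear the bit is omitted):

	/* hypothetical log incompat bit covering the IRI/IRD item types */
	#define XFS_SB_FEAT_INCOMPAT_LOG_IUNRE	(1 << 0)

	/*
	 * Old kernels refuse to recover a log that carries unknown
	 * incompat bits, so setting this bit before logging the first
	 * IRI keeps them from choking on the unrecognized item types.
	 */
	static bool
	xfs_iunre_log_enabled(
		struct xfs_mount	*mp)
	{
		return xfs_sb_has_incompat_log_feature(&mp->m_sb,
				XFS_SB_FEAT_INCOMPAT_LOG_IUNRE);
	}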

> 
>> Functional discussion aside.. from a complexity standpoint I'm wondering
>> if we could do something much more simple like acquire the AGI lock for
>> a whiteout inode earlier in xfs_rename(). For example, suppose we did
>> something like:
>>
>> 	/*
>> 	 * Acquire the whiteout agi to preserve locking order in anticipation of
>> 	 * unlinked list removal.
>> 	 */
>> 	if (wip)
>> 		xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, wip->i_ino), &agibp);
>>
>> ... after we allocate the transaction but before we do any directory ops
>> that can result in block allocations. Would that prevent the problem
>> you've observed?
> 
> I had the same thought, but fun question: if @wip is allocated in AG 1
> but the dirent blocks come from AG 0, is that a problem?
> 
> Would it make more sense to expand the directory in one transaction,
> roll it, and add the actual directory entry after that?
> 
> --D
> 
>> Brian
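
A rough sketch of the "expand the directory in one transaction, roll it,
then add the entry" idea Darrick raises above; xfs_trans_roll() is the
existing primitive for committing a permanent transaction and continuing
with a fresh one, while the xfs_dir_expand() helper named here is
hypothetical:

	/* txn 1: allocate any blocks the new dirent will need (AGF) */
	error = xfs_dir_expand(tp, target_dp, target_name); /* hypothetical */
	if (error)
		goto out_trans_cancel;

	/* commit the allocation and continue in a clean transaction */
	error = xfs_trans_roll(&tp);
	if (error)
		goto out_trans_cancel;

	/* txn 2: insert the dirent; no allocation, hence no AGF lock */
	error = xfs_dir_createname(tp, target_dp, target_name,
				   src_ip->i_ino, 0);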
Dave Chinner Aug. 15, 2019, 11:10 p.m. UTC | #5
On Tue, Aug 13, 2019 at 09:36:14AM -0400, Brian Foster wrote:
> On Tue, Aug 13, 2019 at 07:17:33PM +0800, kaixuxia wrote:
> > When performing a rename operation with the RENAME_WHITEOUT flag, we
> > first hold the AGF lock to allocate or free extents while manipulating
> > the dirents, and then make the xfs_iunlink_remove() call last, which
> > takes the AGI lock to modify the tmpfile info, so we end up with the
> > lock order AGF->AGI.
> > 
> 
> IIUC, the whiteout use case is that we're renaming a file, but the
> source dentry must be replaced with a magic whiteout inode rather than
> be removed. Therefore, xfs_rename() allocates the whiteout inode as a
> tmpfile first in a separate transaction, updates the target dentry with
> the source inode, replaces the source dentry to point to the whiteout
> inode and finally removes the whiteout inode from the unlinked list
> (since it is a tmpfile). This leads to the problem described below
> because the rename transaction ends up doing directory block allocs
> (locking the AGF) followed by the unlinked list remove (locking the
> AGI).
> 
> My understanding from reading the code is that this is primarily to
> cleanly handle error scenarios. If anything fails after we've allocated
> the whiteout tmpfile, it's simply left on the unlinked list and so the
> filesystem remains in a consistent/recoverable state. Given that, the
> solution here seems like overkill to me. For one, I thought background
> unlinked list removal was already on our roadmap (Darrick might have
> been looking at that and may already have a prototype as well). Also,
> unlinked list removal occurs at log recovery time already. That's
> somewhat of an existing purpose of the list, which makes a deferred
> unlinked list removal operation superfluous in more traditional cases
> where unlinked list removal doesn't require consistency with a directory
> operation.
> 
> Functional discussion aside.. from a complexity standpoint I'm wondering
> if we could do something much more simple like acquire the AGI lock for
> a whiteout inode earlier in xfs_rename(). For example, suppose we did
> something like:
> 
> 	/*
> 	 * Acquire the whiteout agi to preserve locking order in anticipation of
> 	 * unlinked list removal.
> 	 */
> 	if (wip)
> 		xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, wip->i_ino), &agibp);
> 
> ... after we allocate the transaction but before we do any directory ops
> that can result in block allocations. Would that prevent the problem
> you've observed?

I'd prefer that we just do things in an order that doesn't invert
the locking. For a whiteout, we only allocate blocks when modifying
the target directory, and we do a check to see if that will succeed
before actually doing the directory modification. That means the
directory modification will only fail due to an IO error or
corruption, both of which have a high probability of causing the
filesystem to be shut down. Any error after the directory mod will
cause a shutdown because the transaction is dirty.

Further, the operation that will lock the AGF is the target
directory modification if blocks need to be allocated, and the whole
point of the "check before execution" is to abort if ENOSPC would
occur as a result of trying to allocate blocks and we don't have a
space reservation for those blocks because we are very, very
close to ENOSPC already.

If we fail the xfs_iunlink_remove() operation, we're shutting down
the filesystem. If we fail the xfs_dir_createname(target) call, we
are most likely going to be shutting down the filesystem. So the
premise that locating xfs_iunlink_remove() at the end to make error
handling easy is not really true - transaction cancel will clean
both of them up and shut the filesystem down.

Hence I think the right thing to do is to move the
xfs_iunlink_remove() call to between xfs_dir_canenter() and
xfs_dir_createname(). This means ENOSPC will abort with a clean
transaction and all is good, otherwise a failure is most likely
going to shut down the filesystem and it doesn't matter if we do
xfs_iunlink_remove() or xfs_dir_createname() first.

And by doing xfs_iunlink_remove() first, we remove the AGI/AGF
lock inversion problem....
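
To make that concrete, here is a minimal sketch of the reordering in
xfs_rename() for the target_ip == NULL case - the surrounding context
and error labels are assumed from the current code, this is not the
actual patch:

	if (target_ip == NULL) {
		/*
		 * Check the target dir can take the new entry while the
		 * transaction is still clean.
		 */
		if (!spaceres) {
			error = xfs_dir_canenter(tp, target_dp, target_name);
			if (error)
				goto out_trans_cancel;
		}

		/*
		 * Pull the whiteout inode off the unlinked list before
		 * the directory modification, so the AGI is locked
		 * before the AGF as everywhere else. A failure here
		 * still cancels a clean transaction.
		 */
		if (wip) {
			error = xfs_iunlink_remove(tp, wip);
			if (error)
				goto out_trans_cancel;
		}

		error = xfs_dir_createname(tp, target_dp, target_name,
					   src_ip->i_ino, spaceres);
		...
	}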

I think this holds together, but I might have missed something in
the tangle of different rename operation cases. So it's worth
checking, but it looks to me like a better solution than having
a bare AGI lock in the middle of the function to work around error
handling logic we didn't think through clearly enough at the time (hindsight
and all that jazz)....

Thoughts?

-Dave.
Dave Chinner Aug. 15, 2019, 11:36 p.m. UTC | #6
On Tue, Aug 13, 2019 at 07:17:33PM +0800, kaixuxia wrote:
> In this patch we make the unlinked list removal a deferred operation,
> i.e. log an iunlink remove intent and then perform the removal after the
> RENAME_WHITEOUT transaction has committed; the iunlink remove intent and
> done log items are provided for this.

I really like the idea of doing this, not just for the inode unlink
list removal, but for all the high level complex metadata
modifications such as create, unlink, etc.

The reason I like this is that it moves us closer to being able to
do operations almost completely asynchronously once the first intent
has been logged.

Once we have committed the intent, we can treat the rest of the
operation like recovery - all the information needed to perform the
operation is in the intent and all the objects that need to be
locked across the entire operation are locked and joined to the
defer structure. If the intent hits the log then we guarantee that it
will be completed atomically and in the correct sequence order.
Hence it doesn't matter once the intent is built and committed what
context actually completes the rest of the transaction.

If we have to do a sync transaction, because XFS_MOUNT_SYNC,
XFS_MOUNT_DIRSYNC, or there's a sync flag on the inode(s), we can
add a waitqueue_head to the struct xfs_defer and have the context
issuing the transaction attach itself and wait for the defer ops to
complete and wake it....
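
Purely as a sketch, that might look like the following - the
xfs_defer_wait naming is invented here, only the waitqueue primitives
are existing kernel interfaces:

	struct xfs_defer_wait {
		wait_queue_head_t	dfw_wait;	/* issuer sleeps here */
		bool			dfw_done;	/* set by the worker */
	};

	/* issuing context, once the intent transaction has committed */
	wait_event(dw->dfw_wait, dw->dfw_done);

	/* deferred op context, after the final transaction commits */
	dw->dfw_done = true;
	wake_up(&dw->dfw_wait);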


.....

> @@ -3752,6 +3755,96 @@ struct xfs_buf_cancel {
>  }
> 
>  /*
> + * This routine is called to create an in-core iunlink remove intent
> + * item from the iri format structure which was logged on disk.
> + * It allocates an in-core iri, copies the inode from the format
> + * structure into it, and adds the iri to the AIL with the given
> + * LSN.
> + */
> +STATIC int
> +xlog_recover_iri_pass2(
> +	struct xlog			*log,
> +	struct xlog_recover_item	*item,
> +	xfs_lsn_t			lsn)
> +{
> +	xfs_mount_t		*mp = log->l_mp;
> +	xfs_iri_log_item_t	*irip;
> +	xfs_iri_log_format_t	*iri_formatp;
> +
> +	iri_formatp = item->ri_buf[0].i_addr;
> +
> +	irip = xfs_iri_init(mp, 1);
> +	irip->iri_format = *iri_formatp;
> +	if (item->ri_buf[0].i_len != sizeof(xfs_iri_log_format_t)) {
> +		xfs_iri_item_free(irip);
> +		return EFSCORRUPTED;
> +	}
> +
> +	spin_lock(&log->l_ailp->ail_lock);
> +	/*
> +	 * The IRI has two references. One for the IRD and one for IRI to ensure
> +	 * it makes it into the AIL. Insert the IRI into the AIL directly and
> +	 * drop the IRI reference. Note that xfs_trans_ail_update() drops the
> +	 * AIL lock.
> +	 */
> +	xfs_trans_ail_update(log->l_ailp, &irip->iri_item, lsn);
> +	xfs_iri_release(irip);
> +	return 0;
> +}

These intent recovery functions all do very, very similar things.
We already have 4 copies of this almost identical code - I think
there needs to be some factoring/abstracting done here rather than
continuing to copy/paste this code...
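
e.g. a single helper of roughly this shape could replace all the
copies - the name and callback signatures are invented for
illustration, this is not an existing interface:

STATIC int
xlog_recover_intent_pass2(
	struct xlog			*log,
	struct xlog_recover_item	*item,
	xfs_lsn_t			lsn,
	size_t				fmt_size,
	struct xfs_log_item		*(*init_fn)(struct xfs_mount *, void *),
	void				(*release_fn)(struct xfs_log_item *))
{
	struct xfs_log_item	*lip;

	/* validate the format size before copying the payload anywhere */
	if (item->ri_buf[0].i_len != fmt_size)
		return -EFSCORRUPTED;

	lip = init_fn(log->l_mp, item->ri_buf[0].i_addr);

	/*
	 * The intent has two references, one for the done item and one
	 * to ensure it makes it into the AIL. Insert it into the AIL
	 * directly and drop the extra reference; note that
	 * xfs_trans_ail_update() drops the AIL lock.
	 */
	spin_lock(&log->l_ailp->ail_lock);
	xfs_trans_ail_update(log->l_ailp, lip, lsn);
	release_fn(lip);
	return 0;
}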

> @@ -3981,6 +4074,8 @@ struct xfs_buf_cancel {
>  	case XFS_LI_CUD:
>  	case XFS_LI_BUI:
>  	case XFS_LI_BUD:
> +	case XFS_LI_IRI:
> +	case XFS_LI_IRD:
>  	default:
>  		break;
>  	}
> @@ -4010,6 +4105,8 @@ struct xfs_buf_cancel {
>  	case XFS_LI_CUD:
>  	case XFS_LI_BUI:
>  	case XFS_LI_BUD:
> +	case XFS_LI_IRI:
> +	case XFS_LI_IRD:
>  		/* nothing to do in pass 1 */
>  		return 0;
>  	default:
> @@ -4052,6 +4149,10 @@ struct xfs_buf_cancel {
>  		return xlog_recover_bui_pass2(log, item, trans->r_lsn);
>  	case XFS_LI_BUD:
>  		return xlog_recover_bud_pass2(log, item);
> +	case XFS_LI_IRI:
> +		return xlog_recover_iri_pass2(log, item, trans->r_lsn);
> +	case XFS_LI_IRD:
> +		return xlog_recover_ird_pass2(log, item);
>  	case XFS_LI_DQUOT:
>  		return xlog_recover_dquot_pass2(log, buffer_list, item,
>  						trans->r_lsn);

As can be seen by the increasing size of this table....

> @@ -4721,6 +4822,46 @@ struct xfs_buf_cancel {
>  	spin_lock(&ailp->ail_lock);
>  }
> 
> +/* Recover the IRI if necessary. */
> +STATIC int
> +xlog_recover_process_iri(
> +	struct xfs_trans		*parent_tp,
> +	struct xfs_ail			*ailp,
> +	struct xfs_log_item		*lip)
> +{
> +	struct xfs_iri_log_item		*irip;
> +	int				error;
> +
> +	/*
> +	 * Skip IRIs that we've already processed.
> +	 */
> +	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
> +	if (test_bit(XFS_IRI_RECOVERED, &irip->iri_flags))
> +		return 0;
> +
> +	spin_unlock(&ailp->ail_lock);
> +	error = xfs_iri_recover(parent_tp, irip);
> +	spin_lock(&ailp->ail_lock);
> +
> +	return error;
> +}
> +
> +/* Release the IRI since we're cancelling everything. */
> +STATIC void
> +xlog_recover_cancel_iri(
> +	struct xfs_mount		*mp,
> +	struct xfs_ail			*ailp,
> +	struct xfs_log_item		*lip)
> +{
> +	struct xfs_iri_log_item         *irip;
> +
> +	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
> +
> +	spin_unlock(&ailp->ail_lock);
> +	xfs_iri_release(irip);
> +	spin_lock(&ailp->ail_lock);
> +}

More cookie cutter code.

> @@ -4856,6 +4998,9 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
>  		case XFS_LI_BUI:
>  			error = xlog_recover_process_bui(parent_tp, ailp, lip);
>  			break;
> +		case XFS_LI_IRI:
> +			error = xlog_recover_process_iri(parent_tp, ailp, lip);
> +			break;
>  		}
>  		if (error)
>  			goto out;
> @@ -4912,6 +5057,9 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
>  		case XFS_LI_BUI:
>  			xlog_recover_cancel_bui(log->l_mp, ailp, lip);
>  			break;
> +		case XFS_LI_IRI:
> +			xlog_recover_cancel_iri(log->l_mp, ailp, lip);
> +			break;
>  		}

And the table that drives it....

I guess what I'm saying is that I'd really like to see an abstract
type specifically for intent log items and generic infrastructure to
manipulate them before we go adding more of them...
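
i.e. something shaped roughly like this (names invented, purely
illustrative):

	struct xfs_intent_item_ops {
		uint16_t	iio_type;	/* XFS_LI_* value */
		int		(*iio_recover)(struct xfs_trans *tp,
					       struct xfs_log_item *lip);
		void		(*iio_cancel)(struct xfs_log_item *lip);
	};

so that recovery dispatches through a table of these instead of every
new intent type growing all of the switch statements above.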

Cheers,

Dave.
Kaixu Xia Aug. 16, 2019, 8:09 a.m. UTC | #7
On 2019/8/16 7:36, Dave Chinner wrote:
> On Tue, Aug 13, 2019 at 07:17:33PM +0800, kaixuxia wrote:
>> In this patch we make the unlinked list removal a deferred operation,
>> i.e. log an iunlink remove intent and then perform the removal after the
>> RENAME_WHITEOUT transaction has committed; the iunlink remove intent and
>> done log items are provided for this.
> 
> I really like the idea of doing this, not just for the inode unlink
> list removal, but for all the high level complex metadata
> modifications such as create, unlink, etc.
> 
> The reason I like this is that it moves us closer to being able to
> do operations almost completely asynchronously once the first intent
> has been logged.
> 

Thanks a lot for your comments.
Yeah, sometimes the complex metadata modifications correspond to
long and complex transactions that hold many locks or other common
resources, so deferred operations may be a better choice than just
changing the order within one transaction.

> Once we have committed the intent, we can treat the rest of the
> operation like recovery - all the information needed to perform the
> operation is in the intent and all the objects that need to be
> locked across the entire operation are locked and joined to the
> defer structure. If the intent hits the log then we guarantee that it
> will be completed atomically and in the correct sequence order.
> Hence it doesn't matter once the intent is built and committed what
> context actually completes the rest of the transaction.
> 
> If we have to do a sync transaction, because XFS_MOUNT_SYNC,
> XFS_MOUNT_DIRSYNC, or there's a sync flag on the inode(s), we can
> add a waitqueue_head to the struct xfs_defer and have the context
> issuing the transaction attach itself and wait for the defer ops to
> complete and wake it....
> 
> 
> .....
> 
>> @@ -3752,6 +3755,96 @@ struct xfs_buf_cancel {
>>   }
>>
>>   /*
>> + * This routine is called to create an in-core iunlink remove intent
>> + * item from the iri format structure which was logged on disk.
>> + * It allocates an in-core iri, copies the inode from the format
>> + * structure into it, and adds the iri to the AIL with the given
>> + * LSN.
>> + */
>> +STATIC int
>> +xlog_recover_iri_pass2(
>> +	struct xlog			*log,
>> +	struct xlog_recover_item	*item,
>> +	xfs_lsn_t			lsn)
>> +{
>> +	xfs_mount_t		*mp = log->l_mp;
>> +	xfs_iri_log_item_t	*irip;
>> +	xfs_iri_log_format_t	*iri_formatp;
>> +
>> +	iri_formatp = item->ri_buf[0].i_addr;
>> +
>> +	irip = xfs_iri_init(mp, 1);
>> +	irip->iri_format = *iri_formatp;
>> +	if (item->ri_buf[0].i_len != sizeof(xfs_iri_log_format_t)) {
>> +		xfs_iri_item_free(irip);
>> +		return EFSCORRUPTED;
>> +	}
>> +
>> +	spin_lock(&log->l_ailp->ail_lock);
>> +	/*
>> +	 * The IRI has two references. One for the IRD and one for IRI to ensure
>> +	 * it makes it into the AIL. Insert the IRI into the AIL directly and
>> +	 * drop the IRI reference. Note that xfs_trans_ail_update() drops the
>> +	 * AIL lock.
>> +	 */
>> +	xfs_trans_ail_update(log->l_ailp, &irip->iri_item, lsn);
>> +	xfs_iri_release(irip);
>> +	return 0;
>> +}
> 
> These intent recovery functions all do very, very similar things.
> We already have 4 copies of this almost identical code - I think
> there needs to be some factoring/abstracting done here rather than
> continuing to copy/paste this code...

Factoring/abstracting is better than just copy/paste...
A log incompat feature bit is also needed because we are adding new
log item types (IRI & IRD)...
Anyway, I will send a V2 patch addressing all the review comments.
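
To illustrate the log incompat bit mentioned above (a sketch only - the
IUNLINKRM flag name and value are invented here, while
xfs_sb_has_incompat_log_feature() is the existing helper):

	#define XFS_SB_FEAT_INCOMPAT_LOG_IUNLINKRM	(1 << 0)  /* invented */

	static inline bool
	xfs_sb_version_hasiunlinkrm(struct xfs_sb *sbp)
	{
		return xfs_sb_has_incompat_log_feature(sbp,
				XFS_SB_FEAT_INCOMPAT_LOG_IUNLINKRM);
	}

so that an older kernel refuses to recover a log that may contain
IRI/IRD items it does not understand.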

> 
>> @@ -3981,6 +4074,8 @@ struct xfs_buf_cancel {
>>   	case XFS_LI_CUD:
>>   	case XFS_LI_BUI:
>>   	case XFS_LI_BUD:
>> +	case XFS_LI_IRI:
>> +	case XFS_LI_IRD:
>>   	default:
>>   		break;
>>   	}
>> @@ -4010,6 +4105,8 @@ struct xfs_buf_cancel {
>>   	case XFS_LI_CUD:
>>   	case XFS_LI_BUI:
>>   	case XFS_LI_BUD:
>> +	case XFS_LI_IRI:
>> +	case XFS_LI_IRD:
>>   		/* nothing to do in pass 1 */
>>   		return 0;
>>   	default:
>> @@ -4052,6 +4149,10 @@ struct xfs_buf_cancel {
>>   		return xlog_recover_bui_pass2(log, item, trans->r_lsn);
>>   	case XFS_LI_BUD:
>>   		return xlog_recover_bud_pass2(log, item);
>> +	case XFS_LI_IRI:
>> +		return xlog_recover_iri_pass2(log, item, trans->r_lsn);
>> +	case XFS_LI_IRD:
>> +		return xlog_recover_ird_pass2(log, item);
>>   	case XFS_LI_DQUOT:
>>   		return xlog_recover_dquot_pass2(log, buffer_list, item,
>>   						trans->r_lsn);
> 
> As can be seen by the increasing size of this table....
> 
>> @@ -4721,6 +4822,46 @@ struct xfs_buf_cancel {
>>   	spin_lock(&ailp->ail_lock);
>>   }
>>
>> +/* Recover the IRI if necessary. */
>> +STATIC int
>> +xlog_recover_process_iri(
>> +	struct xfs_trans		*parent_tp,
>> +	struct xfs_ail			*ailp,
>> +	struct xfs_log_item		*lip)
>> +{
>> +	struct xfs_iri_log_item		*irip;
>> +	int				error;
>> +
>> +	/*
>> +	 * Skip IRIs that we've already processed.
>> +	 */
>> +	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
>> +	if (test_bit(XFS_IRI_RECOVERED, &irip->iri_flags))
>> +		return 0;
>> +
>> +	spin_unlock(&ailp->ail_lock);
>> +	error = xfs_iri_recover(parent_tp, irip);
>> +	spin_lock(&ailp->ail_lock);
>> +
>> +	return error;
>> +}
>> +
>> +/* Release the IRI since we're cancelling everything. */
>> +STATIC void
>> +xlog_recover_cancel_iri(
>> +	struct xfs_mount		*mp,
>> +	struct xfs_ail			*ailp,
>> +	struct xfs_log_item		*lip)
>> +{
>> +	struct xfs_iri_log_item         *irip;
>> +
>> +	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
>> +
>> +	spin_unlock(&ailp->ail_lock);
>> +	xfs_iri_release(irip);
>> +	spin_lock(&ailp->ail_lock);
>> +}
> 
> More cookie cutter code.
> 
>> @@ -4856,6 +4998,9 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
>>   		case XFS_LI_BUI:
>>   			error = xlog_recover_process_bui(parent_tp, ailp, lip);
>>   			break;
>> +		case XFS_LI_IRI:
>> +			error = xlog_recover_process_iri(parent_tp, ailp, lip);
>> +			break;
>>   		}
>>   		if (error)
>>   			goto out;
>> @@ -4912,6 +5057,9 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
>>   		case XFS_LI_BUI:
>>   			xlog_recover_cancel_bui(log->l_mp, ailp, lip);
>>   			break;
>> +		case XFS_LI_IRI:
>> +			xlog_recover_cancel_iri(log->l_mp, ailp, lip);
>> +			break;
>>   		}
> 
> And the table that drives it....
> 
> I guess what I'm saying is that I'd really like to see an abstract
> type specifically for intent log items and generic infrastructure to
> manipulate them before we go adding more of them...
> 
> Cheers,
> 
> Dave.
>
Brian Foster Aug. 16, 2019, 2:30 p.m. UTC | #8
On Fri, Aug 16, 2019 at 09:10:01AM +1000, Dave Chinner wrote:
> On Tue, Aug 13, 2019 at 09:36:14AM -0400, Brian Foster wrote:
> > On Tue, Aug 13, 2019 at 07:17:33PM +0800, kaixuxia wrote:
> > > When performing a rename operation with the RENAME_WHITEOUT flag, we
> > > first hold the AGF lock to allocate or free extents while manipulating
> > > the dirents, and then make the xfs_iunlink_remove() call last, which
> > > takes the AGI lock to modify the tmpfile info, so the lock order here
> > > is AGF->AGI.
> > > 
> > 
> > IIUC, the whiteout use case is that we're renaming a file, but the
> > source dentry must be replaced with a magic whiteout inode rather than
> > be removed. Therefore, xfs_rename() allocates the whiteout inode as a
> > tmpfile first in a separate transaction, updates the target dentry with
> > the source inode, replaces the source dentry to point to the whiteout
> > inode and finally removes the whiteout inode from the unlinked list
> > (since it is a tmpfile). This leads to the problem described below
> > because the rename transaction ends up doing directory block allocs
> > (locking the AGF) followed by the unlinked list remove (locking the
> > AGI).
> > 
> > My understanding from reading the code is that this is primarily to
> > cleanly handle error scenarios. If anything fails after we've allocated
> > the whiteout tmpfile, it's simply left on the unlinked list and so the
> > filesystem remains in a consistent/recoverable state. Given that, the
> > solution here seems like overkill to me. For one, I thought background
> > unlinked list removal was already on our roadmap (Darrick might have
> > been looking at that and may already have a prototype as well). Also,
> > unlinked list removal occurs at log recovery time already. That's
> > somewhat of an existing purpose of the list, which makes a deferred
> > unlinked list removal operation superfluous in more traditional cases
> > where unlinked list removal doesn't require consistency with a directory
> > operation.
> > 
> > Functional discussion aside.. from a complexity standpoint I'm wondering
> > if we could do something much more simple like acquire the AGI lock for
> > a whiteout inode earlier in xfs_rename(). For example, suppose we did
> > something like:
> > 
> > 	/*
> > 	 * Acquire the whiteout agi to preserve locking order in anticipation of
> > 	 * unlinked list removal.
> > 	 */
> > 	if (wip)
> > 		xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, wip->i_ino), &agibp);
> > 
> > ... after we allocate the transaction but before we do any directory ops
> > that can result in block allocations. Would that prevent the problem
> > you've observed?
> 
> I'd prefer that we just do things in an order that doesn't invert
> the locking. For a whiteout, we only allocate blocks when modifying
> the target directory, and we do a check to see if that will succeed
> before actually doing the directory modification. That means the
> directory modification will only fail due to an IO error or
> corruption, both of which have a high probability of causing the
> filesystem to be shut down. Any error after the directory mod will
> cause a shutdown because the transaction is dirty.
> 
> Further, the operation that will lock the AGF is the target
> directory modification if blocks need to be allocated, and the whole
> point of the "check before execution" is to abort if ENOSPC would
> occur as a result of trying to allocate blocks and we don't have a
> space reservation for those blocks because we are very, very
> close to ENOSPC already.
> 
> If we fail the xfs_iunlink_remove() operation, we're shutting down
> the filesystem. If we fail the xfs_dir_createname(target) call, we
> are most likely going to be shutting down the filesystem. So the
> premise that locating xfs_iunlink_remove() at the end makes error
> handling easy is not really true - transaction cancel will clean
> both of them up and shut the filesystem down.
> 

Yeah, though I guess it depends on whether it's considered correct to
leave out the error handling. In this case, it sounds like you'd prefer
to do that since we can infer the transaction is dirty and so the
filesystem is shutting down anyways. That sounds reasonable to me given
the quirky circumstances of this particular operation, provided we don't
leave anything around in too bogus of a state to cause problems even
with a shut down fs (which I don't think would be the case, but it
should be tested).

> Hence I think the right thing to do is to move the
> xfs_iunlink_remove() call to between xfs_dir_canenter() and
> xfs_dir_createname(). This means ENOSPC will abort with a clean
> transaction and all is good, otherwise a failure is most likely
> going to shut down the filesystem and it doesn't matter if we do
> xfs_iunlink_remove() or xfs_dir_createname() first.
> 

Note that the canenter() call is currently only used in the target_ip ==
NULL case (and only if we couldn't get a block res). Perhaps we don't
care about the AGF lock in the other case, but we still need to fix up
the whiteout tmpfile for both. For the target_ip != NULL case, we'd want
to make sure we handle things like the -EEXIST error check in there
right now before we dirty the transaction with a whiteout inode tweak so
an invalid request from userspace doesn't shut down the fs.

Those nits aside, I think the iunlink_remove()/bumplink() combination is
going to always dirty the transaction and so guarantee a cancel after
that point shuts down the fs.

> And by doing xfs_iunlink_remove() first, we remove the AGI/AGF
> lock inversion problem....
> 
> I think this holds together, but I might have missed something in
> the tangle of different rename operation cases. So it's worth
> checking, but it looks to me like a better solution than having
> a bare AGI lock in the middle of the function to work around error
> handling logic we didn't think through clearly enough at the time (hindsight
> and all that jazz)....
> 
> Thoughts?
> 

If we explicitly ignore error handling as such because shutdown cleans
up the mess, then I'd just like to make sure we have some combination of
asserts and/or comments to verify that remains the case for future
changes. Otherwise somebody could insert a transaction roll or something
a couple years down the line and introduce a corruption vector that none
of us remember. With that angle covered, the approach sounds reasonable
to me.
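
For instance, a guard of roughly this shape after the whiteout
directory modification would pin the assumption down (placement
sketched here, not taken from the patch):

	/*
	 * From here on we rely on the transaction being dirty, so that
	 * any failure cancels to a filesystem shutdown rather than
	 * leaking a half-modified whiteout inode.
	 */
	ASSERT(tp->t_flags & XFS_TRANS_DIRTY);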

I have no terribly strong preference between the three alternative
options discussed so far. Refactoring the dir code would be a bit more
work in the way of fixing a bug, which is fine, but if we want/need a
backportable fix it might be better to consider that a follow-up fixup
after taking one of the other two approaches to address the lock order
issue.

Brian

> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Brian Foster Aug. 16, 2019, 2:53 p.m. UTC | #9
On Fri, Aug 16, 2019 at 04:09:39PM +0800, kaixuxia wrote:
> 
> 
> On 2019/8/16 7:36, Dave Chinner wrote:
> > On Tue, Aug 13, 2019 at 07:17:33PM +0800, kaixuxia wrote:
> > > In this patch we make the unlinked list removal a deferred operation,
> > > i.e. log an iunlink remove intent and then perform the removal after the
> > > RENAME_WHITEOUT transaction has committed; the iunlink remove intent and
> > > done log items are provided for this.
> > 
> > I really like the idea of doing this, not just for the inode unlink
> > list removal, but for all the high level complex metadata
> > modifications such as create, unlink, etc.
> > 
> > The reason I like this is that it moves us closer to being able to
> > do operations almost completely asynchronously once the first intent
> > has been logged.
> > 
> 
> Thanks a lot for your comments.
> Yeah, sometimes the complex metadata modifications correspond to
> long and complex transactions that hold many locks or other common
> resources, so deferred operations may be a better choice than just
> changing the order within one transaction.
> 

I can't speak for Dave (who can of course chime in again..) or others,
but I don't think he's saying that this approach is preferred to the
various alternative approaches discussed in the other subthread. Note
that he also replied there with another potential solution that doesn't
involve deferred operations.

Rather, I think he's viewing this in a much longer term context around
changing more of the filesystem to be async in architecture. Personally,
I'd have a ton more questions around the context of what something like
that looks like before I'd support starting to switch over less complex
operations to be deferred operations based on the current dfops
mechanism. The mechanism works and solves real problems, but it also has
tradeoffs that IMO warrant the current model of selective use. Further,
it's nearly impossible to determine what other fundamental
incompatibilities might exist without context on bigger picture design.
IOW, this topic really needs a separate thread that starts with a
high level architectural description for others to reason about, because
I think it's already caused confusion.

In short, while it might be worth keeping this patch around for future
use, I still think this is overkill (and insufficient as Darrick already
noted) for fixing the originally reported problem... 

Brian

> > Once we have committed the intent, we can treat the rest of the
> > operation like recovery - all the information needed to perform the
> > operation is in the intent and all the objects that need to be
> > locked across the entire operation are locked and joined to the
> > defer structure. If the intent hits the log then we guarantee that it
> > will be completed atomically and in the correct sequence order.
> > Hence it doesn't matter once the intent is built and committed what
> > context actually completes the rest of the transaction.
> > 
> > If we have to do a sync transaction, because XFS_MOUNT_SYNC,
> > XFS_MOUNT_DIRSYNC, or there's a sync flag on the inode(s), we can
> > add a waitqueue_head to the struct xfs_defer and have the context
> > issuing the transaction attach itself and wait for the defer ops to
> > complete and wake it....
> > 
> > 
> > .....
> > 
> > > @@ -3752,6 +3755,96 @@ struct xfs_buf_cancel {
> > >   }
> > > 
> > >   /*
> > > + * This routine is called to create an in-core iunlink remove intent
> > > + * item from the iri format structure which was logged on disk.
> > > + * It allocates an in-core iri, copies the inode from the format
> > > + * structure into it, and adds the iri to the AIL with the given
> > > + * LSN.
> > > + */
> > > +STATIC int
> > > +xlog_recover_iri_pass2(
> > > +	struct xlog			*log,
> > > +	struct xlog_recover_item	*item,
> > > +	xfs_lsn_t			lsn)
> > > +{
> > > +	xfs_mount_t		*mp = log->l_mp;
> > > +	xfs_iri_log_item_t	*irip;
> > > +	xfs_iri_log_format_t	*iri_formatp;
> > > +
> > > +	iri_formatp = item->ri_buf[0].i_addr;
> > > +
> > > +	irip = xfs_iri_init(mp, 1);
> > > +	irip->iri_format = *iri_formatp;
> > > +	if (item->ri_buf[0].i_len != sizeof(xfs_iri_log_format_t)) {
> > > +		xfs_iri_item_free(irip);
> > > +		return EFSCORRUPTED;
> > > +	}
> > > +
> > > +	spin_lock(&log->l_ailp->ail_lock);
> > > +	/*
> > > +	 * The IRI has two references. One for the IRD and one for IRI to ensure
> > > +	 * it makes it into the AIL. Insert the IRI into the AIL directly and
> > > +	 * drop the IRI reference. Note that xfs_trans_ail_update() drops the
> > > +	 * AIL lock.
> > > +	 */
> > > +	xfs_trans_ail_update(log->l_ailp, &irip->iri_item, lsn);
> > > +	xfs_iri_release(irip);
> > > +	return 0;
> > > +}
> > 
> > These intent recovery functions all do very, very similar things.
> > We already have 4 copies of this almost identical code - I think
> > there needs to be some factoring/abstracting done here rather than
> > continuing to copy/paste this code...
> 
> Factoring/abstracting is better than just copy/paste...
> A log incompat feature bit is also needed because we are adding new
> log item types (IRI & IRD)...
> Anyway, I will send a V2 patch addressing all the review comments.
> 
> > 
> > > @@ -3981,6 +4074,8 @@ struct xfs_buf_cancel {
> > >   	case XFS_LI_CUD:
> > >   	case XFS_LI_BUI:
> > >   	case XFS_LI_BUD:
> > > +	case XFS_LI_IRI:
> > > +	case XFS_LI_IRD:
> > >   	default:
> > >   		break;
> > >   	}
> > > @@ -4010,6 +4105,8 @@ struct xfs_buf_cancel {
> > >   	case XFS_LI_CUD:
> > >   	case XFS_LI_BUI:
> > >   	case XFS_LI_BUD:
> > > +	case XFS_LI_IRI:
> > > +	case XFS_LI_IRD:
> > >   		/* nothing to do in pass 1 */
> > >   		return 0;
> > >   	default:
> > > @@ -4052,6 +4149,10 @@ struct xfs_buf_cancel {
> > >   		return xlog_recover_bui_pass2(log, item, trans->r_lsn);
> > >   	case XFS_LI_BUD:
> > >   		return xlog_recover_bud_pass2(log, item);
> > > +	case XFS_LI_IRI:
> > > +		return xlog_recover_iri_pass2(log, item, trans->r_lsn);
> > > +	case XFS_LI_IRD:
> > > +		return xlog_recover_ird_pass2(log, item);
> > >   	case XFS_LI_DQUOT:
> > >   		return xlog_recover_dquot_pass2(log, buffer_list, item,
> > >   						trans->r_lsn);
> > 
> > As can be seen by the increasing size of this table....
> > 
> > > @@ -4721,6 +4822,46 @@ struct xfs_buf_cancel {
> > >   	spin_lock(&ailp->ail_lock);
> > >   }
> > > 
> > > +/* Recover the IRI if necessary. */
> > > +STATIC int
> > > +xlog_recover_process_iri(
> > > +	struct xfs_trans		*parent_tp,
> > > +	struct xfs_ail			*ailp,
> > > +	struct xfs_log_item		*lip)
> > > +{
> > > +	struct xfs_iri_log_item		*irip;
> > > +	int				error;
> > > +
> > > +	/*
> > > +	 * Skip IRIs that we've already processed.
> > > +	 */
> > > +	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
> > > +	if (test_bit(XFS_IRI_RECOVERED, &irip->iri_flags))
> > > +		return 0;
> > > +
> > > +	spin_unlock(&ailp->ail_lock);
> > > +	error = xfs_iri_recover(parent_tp, irip);
> > > +	spin_lock(&ailp->ail_lock);
> > > +
> > > +	return error;
> > > +}
> > > +
> > > +/* Release the IRI since we're cancelling everything. */
> > > +STATIC void
> > > +xlog_recover_cancel_iri(
> > > +	struct xfs_mount		*mp,
> > > +	struct xfs_ail			*ailp,
> > > +	struct xfs_log_item		*lip)
> > > +{
> > > +	struct xfs_iri_log_item         *irip;
> > > +
> > > +	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
> > > +
> > > +	spin_unlock(&ailp->ail_lock);
> > > +	xfs_iri_release(irip);
> > > +	spin_lock(&ailp->ail_lock);
> > > +}
> > 
> > More cookie cutter code.
> > 
> > > @@ -4856,6 +4998,9 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
> > >   		case XFS_LI_BUI:
> > >   			error = xlog_recover_process_bui(parent_tp, ailp, lip);
> > >   			break;
> > > +		case XFS_LI_IRI:
> > > +			error = xlog_recover_process_iri(parent_tp, ailp, lip);
> > > +			break;
> > >   		}
> > >   		if (error)
> > >   			goto out;
> > > @@ -4912,6 +5057,9 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
> > >   		case XFS_LI_BUI:
> > >   			xlog_recover_cancel_bui(log->l_mp, ailp, lip);
> > >   			break;
> > > +		case XFS_LI_IRI:
> > > +			xlog_recover_cancel_iri(log->l_mp, ailp, lip);
> > > +			break;
> > >   		}
> > 
> > And the table that drives it....
> > 
> > I guess what I'm saying is that I'd really like to see an abstract
> > type specifically for intent log items and generic infrastructure to
> > manipulate them before we go adding more of them...
> > 
> > Cheers,
> > 
> > Dave.
> > 
> 
> -- 
> kaixuxia
Dave Chinner Aug. 17, 2019, 1:40 a.m. UTC | #10
On Fri, Aug 16, 2019 at 10:53:10AM -0400, Brian Foster wrote:
> On Fri, Aug 16, 2019 at 04:09:39PM +0800, kaixuxia wrote:
> > 
> > 
> > On 2019/8/16 7:36, Dave Chinner wrote:
> > > On Tue, Aug 13, 2019 at 07:17:33PM +0800, kaixuxia wrote:
> > > > In this patch we make the unlinked list removal a deferred operation,
> > > > i.e. log an iunlink remove intent and then perform the removal after the
> > > > RENAME_WHITEOUT transaction has committed; the iunlink remove intent and
> > > > done log items are provided for this.
> > > 
> > > I really like the idea of doing this, not just for the inode unlink
> > > list removal, but for all the high level complex metadata
> > > modifications such as create, unlink, etc.
> > > 
> > > The reason I like this is that it moves us closer to being able to
> > > do operations almost completely asynchronously once the first intent
> > > has been logged.
> > > 
> > 
> > Thanks a lot for your comments.
> > Yeah, sometimes the complex metadata modifications correspond to
> > long and complex transactions that hold many locks or other common
> > resources, so deferred operations may be a better choice than just
> > changing the order within one transaction.
> > 
> 
> I can't speak for Dave (who can of course chime in again..) or others,
> but I don't think he's saying that this approach is preferred to the
> various alternative approaches discussed in the other subthread. Note
> that he also replied there with another potential solution that doesn't
> involve deferred operations.

Right, two separate things. One: fixing the bug doesn't require
deferred operations.

Two: async deferred operations is the direction we've been heading
in for a long, long time.

> Rather, I think he's viewing this in a much longer term context around
> changing more of the filesystem to be async in architecture.

Right, in terms of longer term context.

> Personally,
> I'd have a ton more questions around the context of what something like
> that looks like before I'd support starting to switch over less complex
> operations to be deferred operations based on the current dfops
> mechanism.
>
> The mechanism works and solves real problems, but it also has
> tradeoffs that IMO warrant the current model of selective use. Further,
> it's nearly impossible to determine what other fundamental
> incompatibilities might exist without context on bigger picture design.

The "bigger picture" takes up a lot of space in my head, and it has
for a long time. However, here you are worrying about implementation
details around the dfops mechanisms - that's not big picture
thinking.

Big picture thinking is about how all the pieces fit together, not
how a specific piece of the picture is implemented. The design and
implementation of the dfops mechanism is going to change over time,
but the architectural function it performs will not change.

The architectural problem the "intent and deferral" mechanism solves
is that XFS's original "huge complex transaction" model broke down
when we started trying to add more functionality to each individual
transaction. This first came to light in the late 90s, when HSMs
required attributes and attributes could not be added atomically in
creation operations. So all sorts of problems occurred on crashes,
which meant HSMs had to scan filesystems after a crash to find files
with inconsistent attributes (hence bulkstat!). The problem still
exists today with security attributes, default acls, etc.

And then we started wanting to add parent pointers. Which require
atomic manipulation of attributes in directory modification
transactions. Oh dear.  And then came desires to add rmap, which
needed their own atomic additions to every transaction in the
filesystem that allocated or freed space. And then reflink, with
its requirements.

The transaction model basically broke down - it couldn't do what we
needed.  You can see some of the ideas I had more than 10 years ago
about how we'd need to morph XFS to support the flexibility in
transactional modifications we needed here:

http://xfs.org/index.php/Improving_Metadata_Performance_By_Reducing_Journal_Overhead#Operation_Based_Logging
http://xfs.org/index.php/Improving_Metadata_Performance_By_Reducing_Journal_Overhead#Atomic_Multi-Transaction_Operations

The "operation based logging" mechanism is essentially how we are
using intents in deferred operations. Another example is the icreate
item, which just logs the location of the inode chunk we need
to initialise, rather than logging the physical initialisation
directly.
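
For reference, the icreate format in fs/xfs/libxfs/xfs_log_format.h is
roughly this - just enough information to redo the initialisation
during recovery, rather than the initialised buffers themselves:

	struct xfs_icreate_log {
		uint16_t	icl_type;	/* type of log format structure */
		uint16_t	icl_size;	/* size of log format structure */
		__be32		icl_ag;		/* ag we are creating inodes in */
		__be32		icl_agbno;	/* start block of inode range */
		__be32		icl_count;	/* number of inodes to initialise */
		__be32		icl_isize;	/* size of inodes */
		__be32		icl_length;	/* length of extent to initialise */
		__be32		icl_gen;	/* inode generation number to use */
	};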

The problem that Darrick solved with the deferred operations was the
"atomic multi-transaction operations" problem - i.e. how to link all
these smaller, individual atomic modifications into a much larger
fail-safe atomic operation without blowing out the log reservation
to cover every single possible change that could be made.

Now, keep in mind that the potential mechanisms/implementations I
talk about in those links are way out of date. It's the concepts and
direction - the bigger picture - that I'm demonstrating here. So
don't get stuck on "but that mechanism won't work", rather see it
for what it actually is - ideas for how we go from complex, massive
transactions to flexible aggregated chains of small, individual
intent-based transactions.

IOWs, dfops is just infrastructure to provide the intent chaining
functionality required by the "aggregated chains" modification
architecture. If we have to modify the dfops infrastructure to solve
problems along the way, then thats just fine. It's just a mechanism
we used to implement a piece of the bigger picture - dfops is not a
feature in the bigger picture at all.....

In terms of the bigger picture, the work Allison is doing to
re-architect the attribute manipulations around deferred operations
for parent pointers is breaking the new ground here. It's slow going
because it's the first major conversion to the "new way", but it's
telling us about all the things the dfops mechanism doesn't provide.
Conversions of other operations will be simpler as the dfops
infrastructure will be more capable as a result of the attribute
conversion.

But keep in mind that it is the conversion of attribute modification
to chained intents that is the big picture work here - dfops is just
the mechanism it uses. i.e. It's the conversion to the "operation
based logging + atomic multi-transaction" architecture that allows
us to add attribute modifications into directory operations and
guarantee the dir and attr mods are atomic.

From that perspective, dfops is just the implementation mechanism that
makes the architectural big picture change possible. dfops will need
change and morph as necessary to support these changes, but those
changes are not architectural or big picture items - they are just
implementation details....

I like this patch because it means we are starting to reach the
end-game of this architectural change.  This patch indicates that
people are starting to understand the end goal of this work: to
break up big transactions into atomic chains of smaller, simpler
linked transactions.  And they are doing so without needing to be
explicitly told "this is how we want complex modifications to be
done". This is _really good_. :)

And that leads me to start thinking about the next step after that,
which I'd always planned it to be, and that is async processing of
the "atomic multi-transaction operations". That, at the time, was
based on the observation that we had supercomputers with thousands
of CPUs banging on the one filesystem and we always had CPUs to
spare. That's even more true these days: lots of filesystem
operations are still single threaded so we have huge amounts of idle CPU
to spare. We could be using that to speed up things like rsync,
tarball extraction, rm -rf, etc.

I mapped out 10-15 years worth of work for XFS back in 2008, and
we've been regularly ticking off boxes on the implementation
checklist ever since. We're actually tracking fairly well on the
"done in 15yrs" timeline at the moment. Async operation is at the
end of that checklist...

What I'm trying to say is that the bigger picture here has been out
there for a long time, but you have to look past the trees to see
it. Hopefully now that I've pointed out the forest, it will be
easier to see :)

Cheers,

Dave.
Brian Foster Aug. 17, 2019, 1:20 p.m. UTC | #11
On Sat, Aug 17, 2019 at 11:40:23AM +1000, Dave Chinner wrote:
> On Fri, Aug 16, 2019 at 10:53:10AM -0400, Brian Foster wrote:
> > On Fri, Aug 16, 2019 at 04:09:39PM +0800, kaixuxia wrote:
> > > 
> > > 
> > > On 2019/8/16 7:36, Dave Chinner wrote:
> > > > On Tue, Aug 13, 2019 at 07:17:33PM +0800, kaixuxia wrote:
> > > > > In this patch we make the unlinked list removal a deferred operation,
> > > > > i.e. log an iunlink remove intent and then perform the removal after the
> > > > > RENAME_WHITEOUT transaction has committed; the iunlink remove intent and
> > > > > done log items are provided for this.
> > > > 
> > > > I really like the idea of doing this, not just for the inode unlink
> > > > list removal, but for all the high level complex metadata
> > > > modifications such as create, unlink, etc.
> > > > 
> > > > The reason I like this is that it moves us closer to being able to
> > > > do operations almost completely asynchronously once the first intent
> > > > has been logged.
> > > > 
> > > 
> > > Thanks a lot for your comments.
> > > Yeah, sometimes the complex metadata modifications correspond to
> > > long and complex transactions that hold many locks or other common
> > > resources, so deferred operations may be a better choice than just
> > > changing the order within one transaction.
> > > 
> > 
> > I can't speak for Dave (who can of course chime in again..) or others,
> > but I don't think he's saying that this approach is preferred to the
> > various alternative approaches discussed in the other subthread. Note
> > that he also replied there with another potential solution that doesn't
> > involve deferred operations.
> 
> Right, two separate things. One: fixing the bug doesn't require
> deferred operations.
> 
> Two: async deferred operations is the direction we've been heading
> in for a long, long time.
> 

Ok, we're on the same page in terms of these being separate issues.

I think we might have different thoughts on the application of the term
async, because deferred operations != async in my mind. The latter makes
me think of various potential (and semi-crazy) behavior changes that I'm
not going to go into because I'd rather ask for context than guess at
what you're thinking.

> > Rather, I think he's viewing this in a much longer term context around
> > changing more of the filesystem to be async in architecture.
> 
> Right, in terms of longer term context.
> 
> > Personally,
> > I'd have a ton more questions around the context of what something like
> > that looks like before I'd support starting to switch over less complex
> > operations to be deferred operations based on the current dfops
> > mechanism.
> >
> > The mechanism works and solves real problems, but it also has
> > tradeoffs that IMO warrant the current model of selective use. Further,
> > it's nearly impossible to determine what other fundamental
> > incompatibilities might exist without context on bigger picture design.
> 
> The "bigger picture" takes up a lot of space in my head, and it has
> for a long time. However, here you are worrying about implementation
> details around the dfops mechanisms - that's not big picture
> thinking.
> 

Right, I was intentionally trying to focus on the bug because I got the
impression that the patch author believes adding a log incompat bit and
some refactoring makes this patch acceptable for merge to fix the lock
ordering bug.

That of course doesn't mean this intent patch might not be useful
towards such unrelated and bigger changes down the road, particularly if
the author wanted to dig into that effort..

> Big picture thinking is about how all the pieces fit together, not
> how a specific piece of the picture is implemented. The design and
> implementation of the dfops mechanism is going to change over time,
> but the architectural function it performs will not change.
> 
> The architectural problem the "intent and deferral" mechanism solves
> is that XFS's original "huge complex transaction" model broke down
> when we started trying to add more functionality to each individual
> transaction. This first came to light in the late 90s, when HSMs
> required attributes and attributes could not be added atomically in
> creation operations. So all sorts of problems occurred on crashes,
> which meant HSMs had to scan filesystems after a crash to find files
> with inconsistent attributes (hence bulkstat!). The problem still
> exists today with security attributes, default acls, etc.
> 
> And then we started wanting to add parent pointers. Which require
> atomic manipulation of attributes in directory modification
> transactions. Oh dear.  And then came desires to add rmap, which
> needed their own atomic additions to every transaction in the
> filesystem that allocated or freed space. And then reflink, with
> its requirements.
> 

Yep, I'm pretty familiar with most of this at this point.

> The transaction model basically broke down - it couldn't do what we
> needed.  You can see some of the ideas I had more than 10 years ago
> about how we'd need to morph XFS to support the flexibility in
> transactional modifications we needed here:
> 
> http://xfs.org/index.php/Improving_Metadata_Performance_By_Reducing_Journal_Overhead#Operation_Based_Logging
> http://xfs.org/index.php/Improving_Metadata_Performance_By_Reducing_Journal_Overhead#Atomic_Multi-Transaction_Operations
> 

I don't recall going through this, though, at least any time recent
enough that I'd probably be able to grok it. Thanks for the reference,
I'll give it a read.

> The "operation based logging" mechanism is essentially how we are
> using intents in deferred operations. Another example is the icreate
> item, which just logs the location of the inode chunk we need
> to initialise, rather than logging the physical initialisation
> directly.
> 
> The problem that Darrick solved with the deferred operations was the
> "atomic multi-transaction operations" problem - i.e. how to link all
> these smaller, individual atomic modifications into a much larger
> fail-safe atomic operation without blowing out the log reservation
> to cover every single possible change that could be made.
> 
> Now, keep in mind that the potential mechanisms/implementations I
> talk about in those links are way out of date. It's the concepts and
> direction - the bigger picture - that I'm demonstrating here. So
> don't get stuck on "but that mechanism won't work", rather see it
> for what it actually is - ideas for how we go from complex, massive
> transactions to flexible aggregated chains of small, individual
> intent-based transactions.
> 

Sure. I think you misinterpret my response about not wanting to start
deferring arbitrary operations right now as a reflection on
thoughts of future use of said mechanism. I'm 1.) indicating that I don't
think the current mechanism warrants arbitrary use (because that's what
this patch we're discussing right now fundamentally does) and 2.) asking
for context around that longer term vision, because right now I have
_zero_.

IOW, my feedback is on this patch and not a reflection on some future
design that I have no notion of in my head. I'm very much interested in
hearing/reading more about the longer term vision here, but I'm also
very much against this patch (right now, for its documented purpose). :)

> IOWs, dfops is just infrastructure to provide the intent chaining
> functionality required by the "aggregated chains" modification
> architecture. If we have to modify the dfops infrastructure to solve
> problems along the way, then thats just fine. It's just a mechanism
> we used to implement a piece of the bigger picture - dfops is not a
> feature in the bigger picture at all.....
> 

*nod*

> In terms of the bigger picture, the work Allison is doing to
> re-architect the attribute manipulations around deferred operations
> for parent pointers is breaking the new ground here. It's slow going
> because it's the first major conversion to the "new way", but it's
> telling us about all the things the dfops mechanism doesn't provide.
> Conversions of other operations will be simpler as the dfops
> infrastructure will be more capable as a result of the attribute
> conversion.
> 

I'm aware, I've been reviewing that code.

> But keep in mind that it is the conversion of attribute modification
> to chained intents that is the big picture work here - dfops is just
> the mechanism it uses. i.e. It's the conversion to the "operation
> based logging + atomic multi-transaction" architecture that allows
> us to add attribute modifications into directory operations and
> guarantee the dir and attr mods are atomic.
> 
> From that perspective, dfops is just the implementation mechanism that
> makes the architectural big picture change possible. dfops will need
> change and morph as necessary to support these changes, but those
> changes are not architectural or big picture items - they are just
> implementation details....
> 
> I like this patch because it means we are starting to reach the
> end-game of this architectural change.  This patch indicates that
> people are starting to understand the end goal of this work: to
> break up big transactions into atomic chains of smaller, simpler
> linked transactions.  And they are doing so without needing to be
> explicitly told "this is how we want complex modifications to be
> done". This is _really good_. :)
> 
> And that leads me to start thinking about the next step after that,
> which I'd always planned it to be, and that is async processing of
> the "atomic multi-transaction operations". That, at the time, was
> based on the observation that we had supercomputers with thousands
> of CPUs banging on the one filesystem and we always had CPUs to
> spare. That's even more true these days: lots of filesystem
> operations are still single threaded so we have huge amounts of idle CPU
> to spare. We could be using that to speed up things like rsync,
> tarball extraction, rm -rf, etc.
> 

I haven't read back through the links yet, but on a skim the "async"
part of this sounds like a gap in what is described in the sections
referenced above (which sounds more like changing log formats to
something more logical than physical). I'm pretty familiar with all of
the dfops bits to this point, the async bit is what I'm asking about...

What exactly are you thinking about making async that isn't already? Are
you talking about separating in-core changes from backend
ordering/logging in general and across the board? Or opportunistically
making certain deferred operations async if the result of such
operations is not required to be complete by the time the issuing
operation returns to userspace? For example, a hole punch needs to
modify the associated file before it returns, but we might not care if
the associated block freeing operation has completed or not before the
punch returns (as long as the intent is logged) because that's not a
hard requirement of the higher level operation. Whereas the current
behavior is that the extent free operation is deferred, but it is not
necessarily async at the operational level (i.e. the async logging
nature of the CIL notwithstanding). Hm?

Brian

> I mapped out 10-15 years worth of work for XFS back in 2008, and
> we've been regularly ticking off boxes on the implementation
> checklist ever since. We're actually tracking fairly well on the
> "done in 15yrs" timeline at the moment. Async operation is at the
> end of that checklist...
> 
> What I'm trying to say is that the bigger picture here has been out
> there for a long time, but you have to look past the trees to see
> it. Hopefully now that I've pointed out the forest, it will be
> easier to see :)
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Kaixu Xia Aug. 19, 2019, 7:49 a.m. UTC | #12
On 2019/8/16 22:53, Brian Foster wrote:
> On Fri, Aug 16, 2019 at 04:09:39PM +0800, kaixuxia wrote:
>>
>>
>> On 2019/8/16 7:36, Dave Chinner wrote:
>>> On Tue, Aug 13, 2019 at 07:17:33PM +0800, kaixuxia wrote:
>>>> In this patch we make the unlinked list removal a deferred operation,
>>>> i.e. log an iunlink remove intent and then perform the removal after the
>>>> RENAME_WHITEOUT transaction has committed; the iunlink remove intent and
>>>> done log items are provided for this.
>>>
>>> I really like the idea of doing this, not just for the inode unlink
>>> list removal, but for all the high level complex metadata
>>> modifications such as create, unlink, etc.
>>>
>>> The reason I like this is that it moves us closer to being able to
>>> do operations almost completely asynchronously once the first intent
>>> has been logged.
>>>
>>
>> Thanks a lot for your comments.
>> Yeah, sometimes the complex metadata modifications correspond to
>> long and complex transactions that hold many locks or other common
>> resources, so deferred operations may be a better choice than just
>> changing the order within one transaction.
>>
> 
> I can't speak for Dave (who can of course chime in again..) or others,
> but I don't think he's saying that this approach is preferred to the
> various alternative approaches discussed in the other subthread. Note
> that he also replied there with another potential solution that doesn't
> involve deferred operations.
> 
> Rather, I think he's viewing this in a much longer term context around
> changing more of the filesystem to be async in architecture. Personally,
> I'd have a ton more questions around the context of what something like
> that looks like before I'd support starting to switch over less complex
> operations to be deferred operations based on the current dfops
> mechanism. The mechanism works and solves real problems, but it also has
> tradeoffs that IMO warrant the current model of selective use. Further,
> it's nearly impossible to determine what other fundamental
> incompatibilities might exist without context on bigger picture design.
> IOW, this topic really needs a separate thread that starts with a
> high level architectural description for others to reason about, because
> I think it's already caused confusion.
> 
> In short, while it might be worth keeping this patch around for future
> use, I still think this is overkill (and insufficient as Darrick already
> noted) for fixing the originally reported problem...

Yep.. Putting the async deferred operations aside - that is probably too
big a topic for this patch - and focusing on the bug: we do have
alternative solutions if this patch is overkill even with the log
incompat bit added. I will try the more lightweight approaches discussed
in the other subthread, for example refactoring the dir code and moving
the xfs_iunlink_remove() call to between xfs_dir_canenter() and
xfs_dir_createname().

> 
> Brian
> 
>>> Once we have committed the intent, we can treat the rest of the
>>> operation like recovery - all the information needed to perform the
>>> operation is in the intent and all the objects that need to be
>>> locked across the entire operation are locked and joined to the
>>> defer structure. If the intent hits the log then we guarantee that it
>>> will be completed atomically and in the correct sequence order.
>>> Hence it doesn't matter once the intent is built and committed what
>>> context actually completes the rest of the transaction.
>>>
>>> If we have to do a sync transaction, because XFS_MOUNT_SYNC,
>>> XFS_MOUNT_DIRSYNC, or there's a sync flag on the inode(s), we can
>>> add a waitqueue_head to the struct xfs_defer and have the context
>>> issuing the transaction attach itself and wait for the defer ops to
>>> complete and wake it....
>>>
>>>
>>> .....
>>>
>>>> @@ -3752,6 +3755,96 @@ struct xfs_buf_cancel {
>>>>    }
>>>>
>>>>    /*
>>>> + * This routine is called to create an in-core iunlink remove intent
>>>> + * item from the iri format structure which was logged on disk.
>>>> + * It allocates an in-core iri, copies the inode from the format
>>>> + * structure into it, and adds the iri to the AIL with the given
>>>> + * LSN.
>>>> + */
>>>> +STATIC int
>>>> +xlog_recover_iri_pass2(
>>>> +	struct xlog			*log,
>>>> +	struct xlog_recover_item	*item,
>>>> +	xfs_lsn_t			lsn)
>>>> +{
>>>> +	xfs_mount_t		*mp = log->l_mp;
>>>> +	xfs_iri_log_item_t	*irip;
>>>> +	xfs_iri_log_format_t	*iri_formatp;
>>>> +
>>>> +	iri_formatp = item->ri_buf[0].i_addr;
>>>> +
>>>> +	irip = xfs_iri_init(mp, 1);
>>>> +	irip->iri_format = *iri_formatp;
>>>> +	if (item->ri_buf[0].i_len != sizeof(xfs_iri_log_format_t)) {
>>>> +		xfs_iri_item_free(irip);
>>>> +		return EFSCORRUPTED;
>>>> +	}
>>>> +
>>>> +	spin_lock(&log->l_ailp->ail_lock);
>>>> +	/*
>>>> +	 * The IRI has two references. One for the IRD and one for IRI to ensure
>>>> +	 * it makes it into the AIL. Insert the IRI into the AIL directly and
>>>> +	 * drop the IRI reference. Note that xfs_trans_ail_update() drops the
>>>> +	 * AIL lock.
>>>> +	 */
>>>> +	xfs_trans_ail_update(log->l_ailp, &irip->iri_item, lsn);
>>>> +	xfs_iri_release(irip);
>>>> +	return 0;
>>>> +}
>>>
>>> These intent recovery functions all do very, very similar things.
>>> We already have 4 copies of this almost identical code - I think
>>> there needs to be some factoring/abstracting done here rather than
>>> continuing to copy/paste this code...
>>
>> Factoring/abstracting is better than just copy/paste...
>> A log incompat feature bit is also needed because we are adding new
>> log item types (IRI & IRD)...
>> Anyway, I will send a V2 patch addressing all the review comments.
>>
>>>
>>>> @@ -3981,6 +4074,8 @@ struct xfs_buf_cancel {
>>>>    	case XFS_LI_CUD:
>>>>    	case XFS_LI_BUI:
>>>>    	case XFS_LI_BUD:
>>>> +	case XFS_LI_IRI:
>>>> +	case XFS_LI_IRD:
>>>>    	default:
>>>>    		break;
>>>>    	}
>>>> @@ -4010,6 +4105,8 @@ struct xfs_buf_cancel {
>>>>    	case XFS_LI_CUD:
>>>>    	case XFS_LI_BUI:
>>>>    	case XFS_LI_BUD:
>>>> +	case XFS_LI_IRI:
>>>> +	case XFS_LI_IRD:
>>>>    		/* nothing to do in pass 1 */
>>>>    		return 0;
>>>>    	default:
>>>> @@ -4052,6 +4149,10 @@ struct xfs_buf_cancel {
>>>>    		return xlog_recover_bui_pass2(log, item, trans->r_lsn);
>>>>    	case XFS_LI_BUD:
>>>>    		return xlog_recover_bud_pass2(log, item);
>>>> +	case XFS_LI_IRI:
>>>> +		return xlog_recover_iri_pass2(log, item, trans->r_lsn);
>>>> +	case XFS_LI_IRD:
>>>> +		return xlog_recover_ird_pass2(log, item);
>>>>    	case XFS_LI_DQUOT:
>>>>    		return xlog_recover_dquot_pass2(log, buffer_list, item,
>>>>    						trans->r_lsn);
>>>
>>> As can be seen by the increasing size of this table....
>>>
>>>> @@ -4721,6 +4822,46 @@ struct xfs_buf_cancel {
>>>>    	spin_lock(&ailp->ail_lock);
>>>>    }
>>>>
>>>> +/* Recover the IRI if necessary. */
>>>> +STATIC int
>>>> +xlog_recover_process_iri(
>>>> +	struct xfs_trans		*parent_tp,
>>>> +	struct xfs_ail			*ailp,
>>>> +	struct xfs_log_item		*lip)
>>>> +{
>>>> +	struct xfs_iri_log_item		*irip;
>>>> +	int				error;
>>>> +
>>>> +	/*
>>>> +	 * Skip IRIs that we've already processed.
>>>> +	 */
>>>> +	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
>>>> +	if (test_bit(XFS_IRI_RECOVERED, &irip->iri_flags))
>>>> +		return 0;
>>>> +
>>>> +	spin_unlock(&ailp->ail_lock);
>>>> +	error = xfs_iri_recover(parent_tp, irip);
>>>> +	spin_lock(&ailp->ail_lock);
>>>> +
>>>> +	return error;
>>>> +}
>>>> +
>>>> +/* Release the IRI since we're cancelling everything. */
>>>> +STATIC void
>>>> +xlog_recover_cancel_iri(
>>>> +	struct xfs_mount		*mp,
>>>> +	struct xfs_ail			*ailp,
>>>> +	struct xfs_log_item		*lip)
>>>> +{
>>>> +	struct xfs_iri_log_item         *irip;
>>>> +
>>>> +	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
>>>> +
>>>> +	spin_unlock(&ailp->ail_lock);
>>>> +	xfs_iri_release(irip);
>>>> +	spin_lock(&ailp->ail_lock);
>>>> +}
>>>
>>> More cookie cutter code.
>>>
>>>> @@ -4856,6 +4998,9 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
>>>>    		case XFS_LI_BUI:
>>>>    			error = xlog_recover_process_bui(parent_tp, ailp, lip);
>>>>    			break;
>>>> +		case XFS_LI_IRI:
>>>> +			error = xlog_recover_process_iri(parent_tp, ailp, lip);
>>>> +			break;
>>>>    		}
>>>>    		if (error)
>>>>    			goto out;
>>>> @@ -4912,6 +5057,9 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
>>>>    		case XFS_LI_BUI:
>>>>    			xlog_recover_cancel_bui(log->l_mp, ailp, lip);
>>>>    			break;
>>>> +		case XFS_LI_IRI:
>>>> +			xlog_recover_cancel_iri(log->l_mp, ailp, lip);
>>>> +			break;
>>>>    		}
>>>
>>> And the table that drives it....
>>>
>>> I guess what I'm saying is that I'd really like to see an abstract
>>> type specifically for intent log items and generic infrastructure to
>>> manipulate them before we go adding more of them...
>>>
>>> Cheers,
>>>
>>> Dave.
>>>
>>
>> -- 
>> kaixuxia
Dave Chinner Aug. 19, 2019, 10:20 a.m. UTC | #13
On Sat, Aug 17, 2019 at 09:20:06AM -0400, Brian Foster wrote:
> On Sat, Aug 17, 2019 at 11:40:23AM +1000, Dave Chinner wrote:
> > I like this patch because it means we are starting to reach the
> > end-game of this architectural change.  This patch indicates that
> > people are starting to understand the end goal of this work: to
> > break up big transactions into atomic chains of smaller, simpler
> > linked transactions.  And they are doing so without needing to be
> > explicitly told "this is how we want complex modifications to be
> > done". This is _really good_. :)
> > 
> > And that leads me to start thinking about the next step after that,
> > which I'd always planned it to be, and that is async processing of
> > the "atomic multi-transaction operations". That, at the time, was
> > based on the observation that we had supercomputers with thousands
> > of CPUs banging on the one filesystem and we always had CPUs to
> > spare. That's even more true these days: lots of filesystem
> > operations are still single threaded, so we have huge amounts of idle CPU
> > to spare. We could be using that to speed up things like rsync,
> > tarball extraction, rm -rf, etc.
> > 
> 
> I haven't read back through the links yet, but on a skim the "async"
> part of this sounds like a gap in what is described in the sections
> referenced above (which sounds more like changing log formats to
> something more logical than physical). I'm pretty familiar with all of
> the dfops bits to this point, the async bit is what I'm asking about...
> 
> What exactly are you thinking about making async that isn't already? Are
> you talking about separating in-core changes from backend
> ordering/logging in general and across the board?

Yup, separating the work we have to do from the process context that
needs it to be done.

Think about a buffered write. All we need to do in process context
is reserve space and copy the data into the kernel. The rest of it
is done asynchronously in the background, and can be expedited by
fsync().
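
In miniature, that split is visible with nothing more than the stock
POSIX calls - write() returns once the copy into the page cache is
done, and fsync() expedites the background work and waits for it:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("datafile", O_CREAT | O_WRONLY | O_TRUNC, 0644);

	if (fd < 0)
		return 1;

	/* process context: reserve space, copy into the page cache */
	if (write(fd, "hello\n", 6) != 6)
		return 1;

	/* writeback happens asynchronously in the background;
	 * fsync() expedites it and waits for completion */
	if (fsync(fd) < 0)
		return 1;
	return close(fd);
}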

Basically applying that to create, rename, etc. It's more complex
because we have to guarantee ordering of operations, but
fundamentally there is nothing stopping us from doing something like
this on create:

here's a synchronous create, but with async transaction processing:

	DEFINE_WAIT(wait);

	trans alloc
	lock dir inode
	log intent {
		dir = dp
		op = file create
		name = <xfs_name>
		mode = mode
		wait = wait
	}
	xfs_defer_finish(intent, wait)
		-> commits intent
		-> punts rest of work to worker thread
			-> when all is done, will wakeup(wait)
		-> sleeps on wait
	unlock dir

This could eventually become an async create by restructuring it
kinda like this:

	ip = xfs_inode_alloc();

	<initialise and set up inode, leave XFS_INEW/I_NEW set>

	grab dir sequence number
	trans alloc
	log intent {
		dir = dp
		seq = dir_seq
		op = file create
		name = <xfs_name>
		mode = mode
		ip = ip
	}
	xfs_defer_finish(intent)
		-> commits intent
		-> punts rest of creation work to worker thread
			when complete, will clear XFS_INEW/I_NEW

	return instantiated inode to caller

Anyone who looks this inode up after creation will block
on XFS_INEW/I_NEW flag bits. The caller that created the inode
will be able to operate on it straight away....

So converting to async processing really requires several steps.

	1. convert everything to intent logging and defer
	   operations
	2. start every modification with an intent and commit
	3. add wait events to each dfops chain
	4. run dfops in worker threads, calling wakeups when done
	5. convert high level code to do in-core modifications,
	   dfops runs on-disk transactions only
	6. get rid of high level waits for ops that don't need
	   to wait for transactional changes.
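
To make the handoff in steps 3 and 4 concrete, here's a toy userspace
model - all the names are invented, none of this is XFS code - where
"committing the intent" is just queueing the work and sleeping on a
completion the worker signals:

/* toy model only: pthread completion standing in for a dfops wait event */
#include <pthread.h>
#include <stdio.h>

struct intent {
	void		(*work)(void);	/* the deferred transaction chain */
	pthread_mutex_t	lock;
	pthread_cond_t	wait;		/* step 3: wait event on the chain */
	int		done;
};

/* step 4: the dfops chain runs in a worker and wakes the committer */
static void *intent_worker(void *arg)
{
	struct intent *in = arg;

	in->work();
	pthread_mutex_lock(&in->lock);
	in->done = 1;
	pthread_cond_signal(&in->wait);
	pthread_mutex_unlock(&in->lock);
	return NULL;
}

static void create_transactions(void)
{
	printf("worker: running the rest of the create\n");
}

int main(void)
{
	struct intent in = {
		.work = create_transactions,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.wait = PTHREAD_COND_INITIALIZER,
	};
	pthread_t t;

	/* "commit the intent" and punt the remaining work... */
	pthread_create(&t, NULL, intent_worker, &in);

	/* ...then sleep on the wait event, as a sync caller would */
	pthread_mutex_lock(&in.lock);
	while (!in.done)
		pthread_cond_wait(&in.wait, &in.lock);
	pthread_mutex_unlock(&in.lock);

	return pthread_join(t, NULL);
}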

> Or opportunistically
> making certain deferred operations async if the result of such
> operations is not required to be complete by the time the issuing
> operation returns to userspace?

Well, that's obvious for things like unlink. But what such async
processing allows is things like bulk directory modifications
(e.g. rm -rf detection because the dir inode gets unlinked before
we've started processing any of the dirent removal ops) which can
greatly speed up operations.

e.g. rm -rf becomes "load all the inodes into memory as we log
dirent removal, when the dir unlink is logged, truncate the dir
inode - they are all gone. Sort all the inodes into the same
cluster/chunk groups, free all the inodes in a single inobt/finobt
record
update...."
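
The batch processing at the tail of that is little more than sort and
group. A toy userspace sketch - made-up inode numbers, but XFS's real
64 inodes per chunk:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define INODES_PER_CHUNK	64	/* one inobt record's worth */

static int cmp_ino(const void *a, const void *b)
{
	uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;
	return (x > y) - (x < y);
}

int main(void)
{
	/* inodes collected while logging dirent removal, in dirent order */
	uint64_t ino[] = { 262211, 131, 262208, 129, 262254, 180 };
	size_t n = sizeof(ino) / sizeof(ino[0]), i;

	/* sort into cluster/chunk order... */
	qsort(ino, n, sizeof(ino[0]), cmp_ino);

	/* ...then one inobt/finobt record update per chunk */
	for (i = 0; i < n; ) {
		uint64_t chunk = ino[i] / INODES_PER_CHUNK;
		uint64_t mask = 0;

		while (i < n && ino[i] / INODES_PER_CHUNK == chunk)
			mask |= 1ULL << (ino[i++] % INODES_PER_CHUNK);
		printf("free chunk %llu, mask 0x%016llx\n",
		       (unsigned long long)chunk,
		       (unsigned long long)mask);
	}
	return 0;
}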

IOWs, moving to intent based logging allows us to dynamically change
the way we do operations - the intent defines what needs to be done,
but it doesn't define how it gets done. As such, bulk processing
optimisations become possible and those optimisations can be done
completely independently of the front end that logs the initial
intent.

> For example, a hole punch needs to
> modify the associated file before it returns, but we might not care if
> the associated block freeing operation has completed or not before the
> punch returns (as long as the intent is logged) because that's not a
> hard requirement of the higher level operation. Whereas the current
> behavior is that the extent free operation is deferred, but it is not
> necessarily async at the operational level (i.e. the async logging
> nature of the CIL notwithstanding). Hm?

Yup, exactly. Nothing says the extent has to be free by the time the
hole punch returns. The only rules we need to play by is that it
looks to userspace like there's hole, and if they run fsync then
there really is a hole.  Otherwise the scheduling of the work is
largely up to us.
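
That contract is exactly what userspace can observe today with a
plain fallocate(2) - nothing hypothetical in this sketch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("punchfile", O_CREAT | O_RDWR | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	memset(buf, 'x', sizeof(buf));
	if (write(fd, buf, sizeof(buf)) != sizeof(buf))
		return 1;

	/* userspace-visible contract: this range now reads as zeroes */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, 2048) < 0)
		return 1;

	/* after this there really is a hole on disk; when the freed
	 * blocks hit the free space btrees in between is up to the
	 * filesystem */
	if (fsync(fd) < 0)
		return 1;
	return close(fd);
}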

Split front/back async processing like this isn't new - it's
something Daniel Phillips was trying to do with tux3. It deferred as
much as it could to the back end processing threads and did as
little as possible in the syscall contexts. See slide 14:

https://events.static.linuxfound.org/sites/events/files/slides/tux3.linuxcon.pdf

So the concept has largely been proven in other filesystems, it's
just that if you don't design something from scratch to be
asynchronous it can be difficult to retrofit...

Cheers,

Dave.
Brian Foster Aug. 19, 2019, 2:28 p.m. UTC | #14
On Mon, Aug 19, 2019 at 08:20:17PM +1000, Dave Chinner wrote:
> On Sat, Aug 17, 2019 at 09:20:06AM -0400, Brian Foster wrote:
> > On Sat, Aug 17, 2019 at 11:40:23AM +1000, Dave Chinner wrote:
> > > I like this patch because it means we are starting to reach the
> > > end-game of this architectural change.  This patch indicates that
> > > people are starting to understand the end goal of this work: to
> > > break up big transactions into atomic chains of smaller, simpler
> > > linked transactions.  And they are doing so without needing to be
> > > explicitly told "this is how we want complex modifications to be
> > > done". This is _really good_. :)
> > > 
> > > And that leads me to start thinking about the next step after that,
> > > which I'd always planned it to be, and that is async processing of
> > > the "atomic multi-transaction operations". That, at the time, was
> > > based on the observation that we had supercomputers with thousands
> > > of CPUs banging on the one filesystem and we always had CPUs to
> > > spare. That's even more true these days: lots of filesystem
> > > operations are still single threaded, so we have huge amounts of idle CPU
> > > to spare. We could be using that to speed up things like rsync,
> > > tarball extraction, rm -rf, etc.
> > > 
> > 
> > I haven't read back through the links yet, but on a skim the "async"
> > part of this sounds like a gap in what is described in the sections
> > referenced above (which sounds more like changing log formats to
> > something more logical than physical). I'm pretty familiar with all of
> > the dfops bits to this point, the async bit is what I'm asking about...
> > 
> > What exactly are you thinking about making async that isn't already? Are
> > you talking about separating in-core changes from backend
> > ordering/logging in general and across the board?
> 
> Yup, separating the work we have to do from the process context that
> needs it to be done.
> 
> Think about a buffered write. All we need to do in process context
> is reserve space and copy the data into the kernel. The rest of it
> is done asynchronously in the background, and can be expedited by
> fsync().
> 

Yep, makes sense.

> Basically applying that to create, rename, etc. It's more complex
> because we have to guarantee ordering of operations, but
> fundamentally there is nothing stopping us from doing something like
> this on create:
> 

Right, my next big question was going to be around maintaining/enforcing
ordering (if necessary) around whatever operations end up async
deferred. I didn't want to go there because I wasn't really clear on the
goal.

> here's a synchronous create, but with async transaction processing:
> 
> 	DEFINE_WAIT(wait);
> 
> 	trans alloc
> 	lock dir inode
> 	log intent {
> 		dir = dp
> 		op = file create
> 		name = <xfs_name>
> 		mode = mode
> 		wait = wait
> 	}
> 	xfs_defer_finish(intent, wait)
> 		-> commits intent
> 		-> punts rest of work to worker thread
> 			-> when all is done, will wakeup(wait)
> 		-> sleeps on wait
> 	unlock dir
> 
> This could eventually become an async create by restructuring it
> kinda like this:
> 
> 	ip = xfs_inode_alloc();
> 
> 	<initialise and set up inode, leave XFS_INEW/I_NEW set>
> 
> 	grab dir sequence number
> 	trans alloc
> 	log intent {
> 		dir = dp
> 		seq = dir_seq
> 		op = file create
> 		name = <xfs_name>
> 		mode = mode
> 		ip = ip
> 	}
> 	xfs_defer_finish(intent)
> 		-> commits intent
> 		-> punts rest of creation work to worker thread
> 			when complete, will clear XFS_INEW/I_NEW
> 
> 	return instantiated inode to caller
> 

So for this example, is this async to userspace or to some intermediate
level in XFS to facilitate creation batching or some such? It sounds
kind of like a delayed inode allocation mechanism, though that might be
a challenge to expose to userspace if we don't do enough to at least
acquire a physical inode number.

> Anyone who looks this inode up after creation will block
> on XFS_INEW/I_NEW flag bits. The caller that created the inode
> will be able to operate on it straight away....
> 
> So converting to async processing really requires several steps.
> 
> 	1. convert everything to intent logging and defer
> 	   operations

The approach makes sense, this all sounds like mostly mechanism. What's
still lacking is a high level design description to justify large scale
developmental decisions like "converting everything to intent logging."
Particularly since whatever new intents we create will need to be
supported indefinitely once they're released in the wild. Note that I'm
sure you're working/thinking in design -> implement mode and can see
some of these decisions more clearly, but the rest of us shouldn't have
to piece together the design story from implementation details. ;)

For example, even with the examples discussed here it's not clear to me
we need to define an intent for unlinked list removal. We've already
discussed methods to make removal further asynchronous than it already
is at a higher level that doesn't depend on asynchronous transaction
processing at all. Even if unlinked removal did end up deferred in
certain use cases, that should be dictated by redesign of particular
higher level operations (which may or may not require that granular of
an intent).

> 	2. start every modification with an intent and commit
> 	3. add wait events to each dfops chain
> 	4. run dfops in worker threads, calling wakeups when done
> 	5. convert high level code to do in-core modifications,
> 	   dfops runs on-disk transactions only
> 	6. get rid of high level waits for ops that don't need
> 	   to wait for transactional changes.
> 
> > Or opportunistically
> > making certain deferred operations async if the result of such
> > operations is not required to be complete by the time the issuing
> > operation returns to userspace?
> 
> Well, that's obvious for things like unlink. But what such async
> processing allows is things like bulk directory modifications
> (e.g. rm -rf detection because the dir inode gets unlinked before
> we've started processing any of the dirent removal ops) which can
> greatly speed up operations.
> 
> e.g. rm -rf becomes "load all the inodes into memory as we log
> dirent removal, when the dir unlink is logged, truncate the dir
> inode - they are all gone. Sort all the inodes into the same
> cluster/chunk groups, free all the inodes in a single inobt/finobt
> record
> update...."
> 

This sounds like a great example to run through a design description for
or (even better) an RFC. It clearly depends on some mechanical things
like conversion of some operations to use intents and enhancements to
deferred processing to provide an async execution context. It also may
depend on some less mechanical things like ordering rules and a batching
detection heuristic, but things like that can be easily fudged for an
RFC.

It also presents an opportunity to demonstrate value. While this all
sounds like a nice approach in theory, there are still interesting
questions like how much this improves performance over the aggregation
provided by delayed logging, what object locking looks like across the
new separation between process transaction processing and deferred/async
transaction processing, whether intent granularity needs to
fundamentally change from the approach we've taken so far (which hasn't
been designed with async processing in mind, etc.).

Brian

> IOWs, moving to intent based logging allows us to dynamically change
> the way we do operations - the intent defines what needs to be done,
> but it doesn't define how it gets done. As such, bulk processing
> optimisations become possible and those optimisations can be done
> completely independently of the front end that logs the initial
> intent.
> 
> > For example, a hole punch needs to
> > modify the associated file before it returns, but we might not care if
> > the associated block freeing operation has completed or not before the
> > punch returns (as long as the intent is logged) because that's not a
> > hard requirement of the higher level operation. Whereas the current
> > behavior is that the extent free operation is deferred, but it is not
> > necessarily async at the operational level (i.e. the async logging
> > nature of the CIL notwithstanding). Hm?
> 
> Yup, exactly. Nothing says the extent has to be free by the time the
> hole punch returns. The only rules we need to play by is that it
> looks to userspace like there's hole, and if they run fsync then
> there really is a hole.  Otherwise the scheduling of the work is
> largely up to us.
> 
> Split front/back async processing like this isn't new - it's
> something Daniel Phillips was trying to do with tux3. It deferred as
> much as it could to the back end processing threads and did as
> little as possible in the syscall contexts. See slide 14:
> 
> https://events.static.linuxfound.org/sites/events/files/slides/tux3.linuxcon.pdf
> 
> So the concept has largely been proven in other filesystems, it's
> just that if you don't design something from scratch to be
> asynchronous it can be difficult to retrofit...
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner Aug. 20, 2019, 1:04 a.m. UTC | #15
On Mon, Aug 19, 2019 at 10:28:09AM -0400, Brian Foster wrote:
> On Mon, Aug 19, 2019 at 08:20:17PM +1000, Dave Chinner wrote:
> > On Sat, Aug 17, 2019 at 09:20:06AM -0400, Brian Foster wrote:
> > > On Sat, Aug 17, 2019 at 11:40:23AM +1000, Dave Chinner wrote:
> > > > I like this patch because it means we are starting to reach the
> > > > end-game of this architectural change.  This patch indicates that
> > > > people are starting to understand the end goal of this work: to
> > > > break up big transactions into atomic chains of smaller, simpler
> > > > linked transactions.  And they are doing so without needing to be
> > > > explicitly told "this is how we want complex modifications to be
> > > > done". This is _really good_. :)
> > > > 
> > > > And that leads me to start thinking about the next step after that,
> > > > which I'd always planned it to be, and that is async processing of
> > > > the "atomic multi-transaction operations". That, at the time, was
> > > > based on the observation that we had supercomputers with thousands
> > > > of CPUs banging on the one filesystem and we always had CPUs to
> > > > spare. That's even more true these days: lots of filesytem
> > > > operations still single threaded so we have huge amounts of idle CPU
> > > > to spare. We could be using that to speed up things like rsync,
> > > > tarball extraction, rm -rf, etc.
> > > > 
> > > 
> > > I haven't read back through the links yet, but on a skim the "async"
> > > part of this sounds like a gap in what is described in the sections
> > > referenced above (which sounds more like changing log formats to
> > > something more logical than physical). I'm pretty familiar with all of
> > > the dfops bits to this point, the async bit is what I'm asking about...
> > > 
> > > What exactly are you thinking about making async that isn't already? Are
> > > you talking about separating in-core changes from backend
> > > ordering/logging in general and across the board?
> > 
> > Yup, separating the work we have to do from the process context that
> > needs it to be done.
> > 
> > Think about a buffered write. All we need to do in process context
> > is reserve space and copy the data into the kernel. The rest of it
> > is done asynchronously in the background, and can be expedited by
> > fsync().
> > 
> 
> Yep, makes sense.
> 
> > Basically applying that to create, rename, etc. It's more complex
> > because we have to guarantee ordering of operations, but
> > fundamentally there is nothing stopping us from doing something like
> > this on create:
> > 
> 
> Right, my next big question was going to be around maintaining/enforcing
> ordering (if necessary) around whatever operations end up async
> deferred. I didn't want to go there because I wasn't really clear on the
> goal.

Stuff like ordering of operations is an implementation issue, not
really anything to do with the big picture. Big picture says "user
visible ordering should be maintained", how to achieve that with
async processing is something we'll work out how deal with when we
actually start prototyping async functionality. That's a long way
off yet, so you're not going to get specific answers to "how do we
implement this specific thing" - the best you'll get is "sequence
numbering of some sort"....

> > here's a synchronous create, but with async transaction processing:
> > 
> > 	DEFINE_WAIT(wait);
> > 
> > 	trans alloc
> > 	lock dir inode
> > 	log intent {
> > 		dir = dp
> > 		op = file create
> > 		name = <xfs_name>
> > 		mode = mode
> > 		wait = wait
> > 	}
> > 	xfs_defer_finish(intent, wait)
> > 		-> commits intent
> > 		-> punts rest of work to worker thread
> > 			-> when all is done, will wakeup(wait)
> > 		-> sleeps on wait
> > 	unlock dir
> > 
> > This could eventually become an async create by restructuring it
> > kinda like this:
> > 
> > 	ip = xfs_inode_alloc();
> > 
> > 	<initialise and set up inode, leave XFS_INEW/I_NEW set>
> > 
> > 	grab dir sequence number
> > 	trans alloc
> > 	log intent {
> > 		dir = dp
> > 		seq = dir_seq
> > 		op = file create
> > 		name = <xfs_name>
> > 		mode = mode
> > 		ip = ip
> > 	}
> > 	xfs_defer_finish(intent)
> > 		-> commits intent
> > 		-> punts rest of creation work to worker thread
> > 			when complete, will clear XFS_INEW/I_NEW
> > 
> > 	return instantiated inode to caller
> > 
> 
> So for this example, is this async to userspace or to some intermediate
> level in XFS to facilitate creation batching or some such?

What needs to be done async is determined by the initial intent
setup and then xfs_defer_finish() will run what needs to be done
sync to keep userspace happy and defer everything else.  e.g. we
might need to run dirent mods sync so lookup/readdir behave
appropriately, but then everything else can be deferred to an async
context. 

Don't ask for specifics, because there aren't any. General answers
is all you are going to get at this point because that's all the
high level architecture provides. The details of any specific answer
will change as we slowly change the code to make use of new in
memory tracking structures and fine-grained intent based operations.

> It sounds
> kind of like a delayed inode allocation mechanism, though that might be
> a challenge to expose to userspace if we don't do enough to at least
> acquire a physical inode number.

That's not a new problem - the VFS already solves that problem
on lookup with the I_NEW flag. i.e. new references will block until
the inode is fully instantiated and the I_NEW flag is cleared. We
also have XFS_INEW to do the same thing internally in XFS. IOWs, we
already have infrastructure to block async lookups if we need
information that is only instantiated during allocation....
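
Abbreviated, that existing handshake looks like this (these are the
real VFS interfaces, sketched in kernel context):

	struct inode *inode = iget_locked(sb, ino);

	if (inode && (inode->i_state & I_NEW)) {
		/* we're the creator: set the inode up, then... */
		unlock_new_inode(inode);   /* ...wake the I_NEW waiters */
	}

Any other iget_locked() caller on that inode number sleeps until
I_NEW is cleared, so an async creator can publish the inode early and
finish instantiating it in the background.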

> > Anyone who looks this inode up after creation will block
> > on XFS_INEW/I_NEW flag bits. The caller that created the inode
> > will be able to operate on it straight away....
> > 
> > So converting to async processing really requires several steps.
> > 
> > 	1. convert everything to intent logging and defer
> > 	   operations
> 
> The approach makes sense, this all sounds like mostly mechanism.

I wasn't detailing the high level design - I was describing the
process we'd need to undertake to get to async processing to
indicate how far off it actually is. :)

> What's
> still lacking is a high level design description to justify large scale
> developmental decisions like "converting everything to intent logging."
> Particularly since whatever new intents we create will need to be
> supported indefinitely once they're released in the wild. Note that I'm
> sure you're working/thinking in design -> implement mode and can see
> some of these decisions more clearly, but the rest of us shouldn't have
> to piece together the design story from implementation details. ;)

What justification is needed? We went through all that years ago -
I've got historical records from XFS meetings in 2004 that document
all these problems and many of the potential solutions. e.g. delayed
logging is one of the things brought up in those documents. IOWs,
we've proven time and time again that the limitation of
metadata performance in XFS is physical logging, and using
fine-grained intent logging has long been seen as a solution to the
problem for almost as long.

That is reflected in the title of the document I pointed you at:
"Improving metadata performance by reducing journal overhead". That
whole section of the document is based around one thing -
improving journalling efficiency by reducing the journalling
overhead.

The biggest issue we have is physical journalling is expensive in
terms of CPU, memory and IO bandwidth. Delayed logging knocked IO
bandwidth down by a factor of 10x, but it came at the cost of
substantial additional CPU and more than doubled the memory
footprint of the transaction subsystem. That's because it added
another layer of buffering (2 layers if you count shadow buffers)
to aggregate all the physical changes that have been made to
objects.

You can see how this affects performance by comparing 4kB block size
performance with 64kB block size performance. 4kB is far faster
because physical logging results in a 10x increase in log IO bandwidth
for the same operations, a substantial increase in memory
footprint and a huge increase in CPU usage as a result of larger
contiguous allocations and memcpy()s when logging changes. So
despite the fact we do less IOPS, the added overhead of everything
means 64kB block size only achieves 30-50% of the performance of 4kB
block size filesystems under heavy metadata modification workloads.

IOWs, the performance of the filesystem is still largely limited by
the physical logging of metadata in the journalling subsystem, and
the only way around that is to avoid physical logging of objects.

That's the architectural problem that intent logging solves. We move
away from the CPU, memory and bandwidth requirements of physical
logging to create atomic transactions and replace it with small,
logical changes that are chained together to create larger atomic
operations.

I mentioned icreate earlier as an example - perhaps adding numbers
to this will make it clear the gains we can realise via intent
logging. Creating an inode chunk on a v5 filesystem with physical
logging requires 64 inode core regions to be physically logged.
In terms of log item size for a single inode chunk:

	size = 64 * (2 * 128 + sizeof(ophdr))
	     = ~20kB per inode chunk

This requires a 20kB allocation, a heap of CPU processing to walk
the format bitmap multiple times, and then 20kB of log space to
write it.

Now do 100,000 inode creates/s - that's 15,000 inode clusters, so we
are talking 15,000 * 20kB = 300MB/s log bandwidth for the physical
inode clusters alone. And delayed logging can't mitigate this
because we don't relog inode buffers. So the icreate item replaced
this 20kB of physical log bandwidth with about 50 bytes of log
bandwidth and almost no CPU or log item memory overhead during
transaction commit.

On my 32p test machine, I'm seeing inode creates rates of over
500,000/s, and that's using maybe 200MB/s of log bandwidth with
icreate. Without the icreate transaction, we'd need 1.5-2GB/s of log
bandwidth to get this sort of performance.

It should be clear from this example just how much overhead we can
avoid by logging intents rather than physical changes. This is not
speculation, intent logging benefits are well proven theory....

> For example, even with the examples discussed here it's not clear to me
> we need to define an intent for unlinked list removal. We've already
> discussed methods to make removal further asynchronous than it already
> is at a higher level that doesn't depend on asynchronous transaction
> processing at all. Even if unlinked removal did end up deferred in
> certain use cases, that should be dictated by redesign of particular
> higher level operations (which may or may not require that granular of
> an intent).

The part of inode unlink Darrick is working on making async is the
part when userspace has dropped its final reference to the unlinked
inode. i.e. after userspace is done with it. The unlinked inode is
in reclaim at that point, and we can do what we want with it. i.e.
it's an inode reclaim optimisation, not something that userspace
will ever see or notice.

That is, the actual part of unlinking the inode from the directory
and adding it to the AGI unlinked list is still synchronous, and
still blocks userspace. We want that part async, too, so we can get
all the entries removed from directories and the directory unlinked
before we start processing any of the inode unlinks...

And, FWIW, AGI unlinked list addition/removal should really be made
an intent rather than physically logging the iunlink field in the
inode buffer.
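
Purely illustrative - no such item exists today and these names are
invented - but modelled on the IRI format in this patch, such an
intent wouldn't need to carry much:

#include <stdint.h>

/* hypothetical on-disk log format for an AGI unlinked list update */
struct xfs_iunlink_update_format {
	uint16_t	iuu_type;	/* log item type */
	uint16_t	iuu_size;	/* size of this item */
	uint32_t	iuu_op;		/* add to or remove from the list */
	uint64_t	iuu_ino;	/* inode being linked/unlinked */
	uint64_t	iuu_id;		/* ties the intent to its done item */
};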

> > e.g. rm -rf becomes "load all the inodes into memory as we log
> > dirent removal, when the dir unlink is logged, truncate the dir
> > inode - they are all gone. Sort all the inodes into the same
> > cluster/chunk groups, free all the inodes in a single inobt/finobt
> > record
> > update...."
> > 
> 
> This sounds like a great example to run through a design description for
> or (even better) an RFC. It clearly depends on some mechanical things
> like conversion of some operations to use intents and enhancements to
> deferred processing to provide an async execution context. It also may
> depend on some less mechanical things like ordering rules and a batching
> detection heuristic, but things like that can be easily fudged for an
> RFC.

Sure, that will all come in time. You're asking about things that
are still a few years away from being realised. There's still a lot
of work to do before we get anywhere near the "lets start making
things async and batching" point in time.

> It also presents an opportunity to demonstrate value. While this all
> sounds like a nice approach in theory, there are still interesting
> questions like how much this improves performance over the aggregation
> provided by delayed logging, what object locking looks like across the
> new separation between process transaction processing and deferred/async
> transaction processing, whether intent granularity needs to
> fundamentally change from the approach we've taken so far (which hasn't
> been designed with async processing in mind, etc.).

Intent based logging is easily an order of magnitude, maybe two
orders of magnitude more efficient than physical logging for 64kB
block size filesystems. You can see from above the gains that are
realised, and there is plenty of other filesystems and research out
there that demonstrate the efficiency gains associated with logical
intent logging.  IOWs, the path we are already on (moving to intent
based logging) is well grounded in both theory and reality, and so
it really does not need any further justification.

The async processing side of things is the next step beyond - saying
that "we've got a lot of things to consider here" is stating the
bleeding obvious. It's the big picture goal I've had in mind for a
long time, but there's no point in getting into any sort of detailed
design while we are still several steps away from having a platform
we can start actually prototyping functionality on. 

Hence I haven't spent any time on trying to document it because we
haven't set all the foundations it needs in concrete yet. That we
are starting to talk about it is indicative that we're getting close
to needing a new set of "Ideas for XFS" that lay out this stuff in a
more concrete fashion, but we aren't quite there yet....

And, realistically, we have to be aware that async processing by
itself may not realise any gains at all. The gains I see being
significant come from batching multiple async modifications into
single operations, not from a direct conversion to async processing.
But we can't do batching if we don't have infrastructure that allows
large numbers of operations to be performed in an asynchronous
manner.

That's where comparisons with delayed logging fall down. Delayed
logging is possible because it sits behind an interface that allows
async processing of operations. i.e. The CIL is the batch processing
mechanism built underneath an async interface, it's not an async
mechanism itself. We've got to add the equivalent of the async
transaction commit interface before we can even think about how
batching operations could be done. Async first, batching later.

Cheers,

Dave.
Brian Foster Aug. 20, 2019, 2:04 p.m. UTC | #16
On Tue, Aug 20, 2019 at 11:04:55AM +1000, Dave Chinner wrote:
> On Mon, Aug 19, 2019 at 10:28:09AM -0400, Brian Foster wrote:
> > On Mon, Aug 19, 2019 at 08:20:17PM +1000, Dave Chinner wrote:
> > > On Sat, Aug 17, 2019 at 09:20:06AM -0400, Brian Foster wrote:
> > > > On Sat, Aug 17, 2019 at 11:40:23AM +1000, Dave Chinner wrote:
> > > > > I like this patch because it means we are starting to reach the
> > > > > end-game of this architectural change.  This patch indicates that
> > > > > people are starting to understand the end goal of this work: to
> > > > > break up big transactions into atomic chains of smaller, simpler
> > > > > linked transactions.  And they are doing so without needing to be
> > > > > explicitly told "this is how we want complex modifications to be
> > > > > done". This is _really good_. :)
> > > > > 
> > > > > And that leads me to start thinking about the next step after that,
> > > > > which I'd always planned it to be, and that is async processing of
> > > > > the "atomic multi-transaction operations". That, at the time, was
> > > > > based on the observation that we had supercomputers with thousands
> > > > > of CPUs banging on the one filesystem and we always had CPUs to
> > > > > spare. That's even more true these days: lots of filesystem
> > > > > operations are still single threaded, so we have huge amounts of idle CPU
> > > > > to spare. We could be using that to speed up things like rsync,
> > > > > tarball extraction, rm -rf, etc.
> > > > > 
> > > > 
...
> > > Basically applying that to create, rename, etc. It's more complex
> > > because we have to guarantee ordering of operations, but
> > > fundamentally there is nothing stopping us from doing something like
> > > this on create:
> > > 
> > 
> > Right, my next big question was going to be around maintaining/enforcing
> > ordering (if necessary) around whatever operations end up async
> > deferred. I didn't want to go there because I wasn't really clear on the
> > goal.
> 
> Stuff like ordering of operations is an implementation issue, not
> really anything to do with the big picture. Big picture says "user
> visible ordering should be maintained", how to achieve that with
> async processing is something we'll work out how to deal with when we
> actually start prototyping async functionality. That's a long way
> off yet, so you're not going to get specific answers to "how do we
> implement this specific thing" - the best you'll get is "sequence
> numbering of some sort"....
> 

Either way, that sounds more like "requirements" to me. All I'm saying
is there needs to be some kind of broader design description before
implementation.

> > > here's a synchronous create, but with async transaction processing:
> > > 
> > > 	DEFINE_WAIT(wait);
> > > 
> > > 	trans alloc
> > > 	lock dir inode
> > > 	log intent {
> > > 		dir = dp
> > > 		op = file create
> > > 		name = <xfs_name>
> > > 		mode = mode
> > > 		wait = wait
> > > 	}
> > > 	xfs_defer_finish(intent, wait)
> > > 		-> commits intent
> > > 		-> punts rest of work to worker thread
> > > 			-> when all is done, will wakeup(wait)
> > > 		-> sleeps on wait
> > > 	unlock dir
> > > 
> > > This could eventually become an async create by restructuring it
> > > kinda like this:
> > > 
> > > 	ip = xfs_inode_alloc();
> > > 
> > > 	<initialise and set up inode, leave XFS_INEW/I_NEW set>
> > > 
> > > 	grab dir sequence number
> > > 	trans alloc
> > > 	log intent {
> > > 		dir = dp
> > > 		seq = dir_seq
> > > 		op = file create
> > > 		name = <xfs_name>
> > > 		mode = mode
> > > 		ip = ip
> > > 	}
> > > 	xfs_defer_finish(intent)
> > > 		-> commits intent
> > > 		-> punts rest of creation work to worker thread
> > > 			when complete, will clear XFS_INEW/I_NEW
> > > 
> > > 	return instantiated inode to caller
> > > 
> > 
> > So for this example, is this async to userspace or to some intermediate
> > level in XFS to facilitate creation batching or some such?
> 
> What needs to be done async is determined by the initial intent
> setup and then xfs_defer_finish() will run what needs to be done
> sync to keep userspace happy and defer everything else.  e.g. we
> might need to run dirent mods sync so lookup/readdir behave
> appropriately, but then everything else can be deferred to an async
> context. 
> 
> Don't ask for specifics, because there aren't any. General answers
> is all you are going to get at this point because that's all the
> high level architecture provides. The details of any specific answer
> will change as we slowly change the code to make use of new in
> memory tracking structures and fine-grained intent based operations.
> 

I'm not asking for specifics. I'm asking for clarification on observable
behavior of the pseudocode you posted above. The bit you've described
below around how I_NEW works indirectly answers the question, for the
most part.

> > It sounds
> > kind of like a delayed inode allocation mechanism, though that might be
> > a challenge to expose to userspace if we don't do enough to at least
> > acquire a physical inode number.
> 
> That's not a new problem - the VFS already solves that problem
> on lookup with the I_NEW flag. i.e. new references will block until
> the inode is fully instantiated and the I_NEW flag is cleared. We
> also have XFS_INEW to do the same thing internally in XFS. IOWs, we
> already have infrastructure to block async lookups if we need
> information that is only instantiated during allocation....
> 
> > > Anyone who looks this inode up after creation will block
> > > on XFS_INEW/I_NEW flag bits. The caller that created the inode
> > > will be able to operate on it straight away....
> > > 
> > > So converting to async processing really requires several steps.
> > > 
> > > 	1. convert everything to intent logging and defer
> > > 	   operations
> > 
> > The approach makes sense, this all sounds like mostly mechanism.
> 
> I wasn't detailing the high level design - I was describing the
> process we'd need to undertake to get to async processing to
> indicate how far off it actually is. :)
> 

Ok.

> > What's
> > still lacking is a high level design description to justify large scale
> > developmental decisions like "converting everything to intent logging."
> > Particularly since whatever new intents we create will need to be
> > supported indefinitely once they're released in the wild. Note that I'm
> > sure you're working/thinking in design -> implement mode and can see
> > some of these decisions more clearly, but the rest of us shouldn't have
> > to piece together the design story from implementation details. ;)
> 
> What justification is needed? We went through all that years ago -
> I've got historical records from XFS meetings in 2004 that document
> all these problems and many of the potential solutions. e.g. delayed
> logging is one of the things brought up in those documents. IOWs,
> we've proven time and time again that the limitation of
> metadata performance in XFS is physical logging, and using
> fine-grained intent logging has long been seen as a solution to the
> problem for almost as long.
> 

The same justification that's required for every patch we merge...? I
think you're misreading me here. I'm saying if you wanted to pose the
idea of starting to turn arbitrary ops into deferred ops for the purpose
of the grand async future, I haven't to this point seen any kind of
design description or some such that justifies making that kind of
change.

> That is reflected in the title of the document I pointed you at:
> "Improving metadata performance by reducing journal overhead". That
> whole section of the document is based around one thing -
> improving journalling efficiency by reducing the journalling
> overhead.
> 

Logical logging is unrelated to my questions around the idea of async
deferred operations. Note that I'm not saying these two things aren't
part of a progression or aren't technically interdependent or anything
of that nature. I'm saying the logical logging approach is documented,
has quantifiable value and we have examples of it today, as you've gone
into detail on below. There's no need to argue any of that here.

I do note that you're referring to the explicit intent/done instances
(like what this patch creates for unlinked list removal) and instances
of logical logging (such as icreate) interchangeably. I'm not going to
quibble over terminology (i.e. no need to go into that here), but note
that I'm only digging at the notion of creating more instances of the
former for the purpose of async deferred operations. The icreate
transaction isn't a deferred operation and intent logging as it relates
to logical logging is orthogonal to this patch.

In fact, I'd argue this patch is more likely to increase transaction
overhead than reduce it because it's (ab)using a mechanism to break down
complex transactions into multiple transactions for something that isn't
complex enough to require it. IOW, we've added a transaction roll and an
intent/done item pair that must be logged without any corresponding
changes to reduce log overhead of the core operation via logical
logging.
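
To put rough numbers on that, using the formats defined in this very
patch (natural C struct layout):

	sizeof(struct xfs_iri_log_format)	= 24 bytes
	sizeof(struct xfs_ird_log_format)	= 24 bytes

That's ~48 bytes of new intent/done items, plus an op header apiece
and the extra transaction roll, while the rename itself still
physically logs everything it did before.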

...
> 
> > For example, even with the examples discussed here it's not clear to me
> > we need to define an intent for unlinked list removal. We've already
> > discussed methods to make removal further asynchronous than it already
> > is at a higher level that doesn't depend on asynchronous transaction
> > processing at all. Even if unlinked removal did end up deferred in
> > certain use cases, that should be dictated by redesign of particular
> > higher level operations (which may or may not require that granular of
> > an intent).
> 
> The part of inode unlink Darrick is working on making async is the
> part when userspace has dropped its final reference to the unlinked
> inode. i.e. after userspace is done with it. The unlinked inode is
> in reclaim at that point, and we can do what we want with it. i.e.
> it's an inode reclaim optimisation, not something that userspace
> will ever see or notice.
> 

That's precisely my point. That's the operation this patch turns into a
deferred operation (and intent/done sequence), and that's why I'm trying
to understand why/how this relates to some future async dfops processing
scheme.

> That is, the actual part of unlinking the inode from the directory
> and adding it to the AGI unlinked list is still synchronous, and
> still blocks userspace. We want that part async, too, so we can get
> all the entries removed from directories and the directory unlinked
> before we start processing any of the inode unlinks...
> 
> And, FWIW, AGI unlinked list addition/removal should really be made
> an intent rather than physically logging the iunlink field in the
> inode buffer.
> 
> > > e.g. rm -rf becomes "load all the inodes into memory as we log
> > > dirent removal, when the dir unlink is logged, truncate the dir
> > > inode - they are all gone. Sort all the inodes into the same
> > > cluster/chunk groups, free all the inodes in a single inobt/finobt
> > > record
> > > update...."
> > > 
> > 
> > This sounds like a great example to run through a design description for
> > or (even better) an RFC. It clearly depends on some mechanical things
> > like conversion of some operations to use intents and enhancements to
> > deferred processing to provide an async execution context. It also may
> > depend on some less mechanical things like ordering rules and a batching
> > detection heuristic, but things like that can be easily fudged for an
> > RFC.
> 
> Sure, that will all come in time. You're asking about things that
> are still a few years away from being realised. There's still a lot
> of work to do before we get anywhere near the "lets start making
> things async and batching" point in time.
> 
> > It also presents an opportunity to demonstrate value. While this all
> > sounds like a nice approach in theory, there are still interesting
> > questions like how much this improves performance over the aggregation
> > provided by delayed logging, what object locking looks like across the
> > new separation between process transaction processing and deferred/async
> > transaction processing, whether intent granularity needs to
> > fundamentally change from the approach we've taken so far (that hasn't
> > been with consideration for async processing, etc).
> 
> Intent based logging is easily an order of magnitude, maybe two
> orders of magnitude more efficient than physical logging for 64kB
> block size filesystems. You can see from above the gains that are
> realised, and there is plenty of other filesystems and research out
> there that demonstrate the efficiency gains associated with logical
> intent logging.  IOWs, the path we are already on (moving to intent
> based logging) is well grounded in both theory and reality, and so
> it really does not need any further justification.
> 
> The async processing side of things is the next step beyond - saying
> that "we've got a lot of things to consider here" is stating the
> bleeding obvious. It's the big picture goal I've had in mind for a
> long time, but there's no point in getting into any sort of detailed
> design while we are still several steps away from having a platform
> we can start actually prototyping functionality on. 
> 

It's perfectly reasonable to me to say that it's far too early to reason
about how the grand async future is going to work. To be fair, you're
the one who brought it up. :) I'm just following up with questions to
try and understand what you're talking about when you use that to say
that you like this patch. I'm not asking for detailed design. I'm simply
asking you provide enough to relate this patch to your "big picture
goal." Most of what I'm getting back is either very nebulous or
complaints about the questions, so my opinion on the patch hasn't really
changed.

Again, that's all fine if it's too early to answer these questions..
just don't expect to convince me in the same thread that something as
detailed as an unlinked list removal deferred operation is a fait
accompli to support a design vision that is too far out to describe an
association with. You might have been thinking about this for a long
time and so see it that way, and I suppose that's somewhat informative
with you being a steward of XFS architecture and all (which is part of
why I'm asking and not ignoring :), but that otherwise doesn't make for
a useful discussion or a compelling argument for people who might want
to understand the goal and can't read your mind.

FWIW, the rm -rf example makes a lot more sense and helps explain much
more of a general idea of what you're talking about than what I had
before. The only conclusion I can draw from this as it relates to this
patch, however, is that it might be useful down the road, or it might
not. That's good enough for me for the time being.

> Hence I haven't spent any time on trying to document it because we
> haven't set all the foundations it needs in concrete yet. That we
> are starting to talk about it is indicative that we're getting close
> to needing a new set of "Ideas for XFS" that lay out this stuff in a
> more concrete fashion, but we aren't quite there yet....
> 
> And, realistically, we have to be aware that async processing by
> itself may not realise any gains at all. The gains I see being
> significant come from batching multiple async modifications into
> single operations, not from a direct conversion to async processing.
> But we can't do batching if we don't have infrastructure that allows
> large numbers of operations to be performed in an asynchronous
> manner.
> 

Yep, that's another open question to me: whether there's a tradeoff
between too much asynchrony vs. the current appoach, whether it be with
too much context switching and transaction churn on boxes with fewer
CPUs, or driving too deep queues of async ops, etc. Big picture or not,
implementation detail or not, categorize questions like that however you
like. To me, those are fundamental factors into decisions over things
like the granularity/design of async tasks and associated intents. If we
can't practically reason about that yet, then it's kind of hard to
reason about deferring some random low level operation in service of
that undefined model.

Brian

> That's where comparisons with delayed logging fall down. Delayed
> logging is possible because it sits behind an interface that allows
> async processing of operations. i.e. The CIL is the batch processing
> mechanism built underneath an async interface, it's not an async
> mechanism itself. We've got to add the equivalent of the async
> transaction commit interface before we can even think about how
> batching operations could be done. Async first, batching later.
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
diff mbox series

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 06b68b6..9d5012e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -106,6 +106,7 @@  xfs-y				+= xfs_log.o \
  				   xfs_inode_item.o \
  				   xfs_refcount_item.o \
  				   xfs_rmap_item.o \
+				   xfs_iunlinkrm_item.o \
  				   xfs_log_recover.o \
  				   xfs_trans_ail.o \
  				   xfs_trans_buf.o
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index eb2be2a..a0f0a3d 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -176,6 +176,7 @@ 
  	[XFS_DEFER_OPS_TYPE_RMAP]	= &xfs_rmap_update_defer_type,
  	[XFS_DEFER_OPS_TYPE_FREE]	= &xfs_extent_free_defer_type,
  	[XFS_DEFER_OPS_TYPE_AGFL_FREE]	= &xfs_agfl_free_defer_type,
+	[XFS_DEFER_OPS_TYPE_IUNRE]	= &xfs_iunlink_remove_defer_type,
  };

  /*
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 7c28d76..9e91a36 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -17,6 +17,7 @@  enum xfs_defer_ops_type {
  	XFS_DEFER_OPS_TYPE_RMAP,
  	XFS_DEFER_OPS_TYPE_FREE,
  	XFS_DEFER_OPS_TYPE_AGFL_FREE,
+	XFS_DEFER_OPS_TYPE_IUNRE,
  	XFS_DEFER_OPS_TYPE_MAX,
  };

@@ -60,5 +61,6 @@  struct xfs_defer_op_type {
  extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
  extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
  extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+extern const struct xfs_defer_op_type xfs_iunlink_remove_defer_type;

  #endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index e5f97c6..dc85b28 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -117,7 +117,9 @@  struct xfs_unmount_log_format {
  #define XLOG_REG_TYPE_CUD_FORMAT	24
  #define XLOG_REG_TYPE_BUI_FORMAT	25
  #define XLOG_REG_TYPE_BUD_FORMAT	26
-#define XLOG_REG_TYPE_MAX		26
+#define XLOG_REG_TYPE_IRI_FORMAT	27
+#define XLOG_REG_TYPE_IRD_FORMAT	28
+#define XLOG_REG_TYPE_MAX		28

  /*
   * Flags to log operation header
@@ -240,6 +242,8 @@  struct xfs_unmount_log_format {
  #define	XFS_LI_CUD		0x1243
  #define	XFS_LI_BUI		0x1244	/* bmbt update intent */
  #define	XFS_LI_BUD		0x1245
+#define	XFS_LI_IRI		0x1246	/* iunlink remove intent */
+#define	XFS_LI_IRD		0x1247

  #define XFS_LI_TYPE_DESC \
  	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -255,7 +259,9 @@  struct xfs_unmount_log_format {
  	{ XFS_LI_CUI,		"XFS_LI_CUI" }, \
  	{ XFS_LI_CUD,		"XFS_LI_CUD" }, \
  	{ XFS_LI_BUI,		"XFS_LI_BUI" }, \
-	{ XFS_LI_BUD,		"XFS_LI_BUD" }
+	{ XFS_LI_BUD,		"XFS_LI_BUD" }, \
+	{ XFS_LI_IRI,		"XFS_LI_IRI" }, \
+	{ XFS_LI_IRD,		"XFS_LI_IRD" }

  /*
   * Inode Log Item Format definitions.
@@ -773,6 +779,23 @@  struct xfs_bud_log_format {
  };

  /*
+ * This is the structure used to lay out the iri and ird log items in the log.
+ */
+typedef struct xfs_iri_log_format {
+	uint16_t		iri_type;	/* iri log item type */
+	uint16_t		iri_size;	/* size of this item */
+	uint64_t		iri_id;		/* id of corresponding iri */
+	uint64_t		wip_ino;	/* inode number */
+} xfs_iri_log_format_t;
+
+typedef struct xfs_ird_log_format {
+	uint16_t		ird_type;	/* ird log item type */
+	uint16_t		ird_size;	/* size of this item */
+	uint64_t		ird_iri_id;	/* id of corresponding iri */
+	uint64_t		wip_ino;	/* inode number */
+} xfs_ird_log_format_t;
+
+/*
   * Dquot Log format definitions.
   *
   * The first two fields must be the type and size fitting into
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6467d5e..7bb3102 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -35,6 +35,7 @@ 
  #include "xfs_log.h"
  #include "xfs_bmap_btree.h"
  #include "xfs_reflink.h"
+#include "xfs_iunlinkrm_item.h"

  kmem_zone_t *xfs_inode_zone;

@@ -46,7 +47,6 @@ 

  STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
  STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
-STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);

  /*
   * helper function to extract extent size hint from inode
@@ -1110,7 +1110,7 @@ 
  /*
   * Increment the link count on an inode & log the change.
   */
-static void
+void
  xfs_bumplink(
  	xfs_trans_t *tp,
  	xfs_inode_t *ip)
@@ -2406,7 +2406,7 @@  struct xfs_iunlink {
  /*
   * Pull the on-disk inode from the AGI unlinked list.
   */
-STATIC int
+int
  xfs_iunlink_remove(
  	struct xfs_trans	*tp,
  	struct xfs_inode	*ip)
@@ -3261,8 +3261,6 @@  struct xfs_iunlink {
  	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
  	if (target_ip)
  		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
-	if (wip)
-		xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);

  	/*
  	 * If we are using project inheritance, we only allow renames
@@ -3417,35 +3415,15 @@  struct xfs_iunlink {
  	if (error)
  		goto out_trans_cancel;

-	/*
-	 * For whiteouts, we need to bump the link count on the whiteout inode.
-	 * This means that failures all the way up to this point leave the inode
-	 * on the unlinked list and so cleanup is a simple matter of dropping
-	 * the remaining reference to it. If we fail here after bumping the link
-	 * count, we're shutting down the filesystem so we'll never see the
-	 * intermediate state on disk.
-	 */
-	if (wip) {
-		ASSERT(VFS_I(wip)->i_nlink == 0);
-		xfs_bumplink(tp, wip);
-		error = xfs_iunlink_remove(tp, wip);
-		if (error)
-			goto out_trans_cancel;
-		xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
-
-		/*
-		 * Now we have a real link, clear the "I'm a tmpfile" state
-		 * flag from the inode so it doesn't accidentally get misused in
-		 * future.
-		 */
-		VFS_I(wip)->i_state &= ~I_LINKABLE;
-	}
-
  	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
  	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
  	if (new_parent)
  		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);

+	/* add the iunlink remove intent to the tp */
+	if (wip)
+		xfs_iunlink_remove_add(tp, wip);
+
  	error = xfs_finish_rename(tp);
  	if (wip)
  		xfs_irele(wip);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 558173f..f8c30ca 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -20,6 +20,7 @@ 
  struct xfs_mount;
  struct xfs_trans;
  struct xfs_dquot;
+struct xfs_trans;

  typedef struct xfs_inode {
  	/* Inode linking and identification information. */
@@ -414,6 +415,7 @@  enum layout_break_reason {
  void		xfs_inactive(struct xfs_inode *ip);
  int		xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
  			   struct xfs_inode **ipp, struct xfs_name *ci_name);
+void		xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
  int		xfs_create(struct xfs_inode *dp, struct xfs_name *name,
  			   umode_t mode, dev_t rdev, struct xfs_inode **ipp);
  int		xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode,
@@ -436,6 +438,7 @@  int		xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
  uint		xfs_ilock_attr_map_shared(struct xfs_inode *);

  uint		xfs_ip2xflags(struct xfs_inode *);
+int		xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
  int		xfs_ifree(struct xfs_trans *, struct xfs_inode *);
  int		xfs_itruncate_extents_flags(struct xfs_trans **,
  				struct xfs_inode *, int, xfs_fsize_t, int);
diff --git a/fs/xfs/xfs_iunlinkrm_item.c b/fs/xfs/xfs_iunlinkrm_item.c
new file mode 100644
index 0000000..4e38329
--- /dev/null
+++ b/fs/xfs/xfs_iunlinkrm_item.c
@@ -0,0 +1,458 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Tencent.  All Rights Reserved.
+ * Author: Kaixuxia <kaixuxia@tencent.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_alloc.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_iunlinkrm_item.h"
+
+kmem_zone_t	*xfs_iri_zone;
+kmem_zone_t	*xfs_ird_zone;
+
+static inline struct xfs_iri_log_item *IRI_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_iri_log_item, iri_item);
+}
+
+void
+xfs_iri_item_free(
+	struct xfs_iri_log_item *irip)
+{
+	kmem_zone_free(xfs_iri_zone, irip);
+}
+
+/*
+ * Freeing the iri requires that we remove it from the AIL if it has already
+ * been placed there. However, the IRI may not yet have been placed in the AIL
+ * when called by xfs_iri_release() from IRD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the IRI.
+ */
+void
+xfs_iri_release(
+	struct xfs_iri_log_item *irip)
+{
+	ASSERT(atomic_read(&irip->iri_refcount) > 0);
+	if (atomic_dec_and_test(&irip->iri_refcount)) {
+		xfs_trans_ail_remove(&irip->iri_item, SHUTDOWN_LOG_IO_ERROR);
+		xfs_iri_item_free(irip);
+	}
+}
+
+static inline int
+xfs_iri_item_sizeof(
+	struct xfs_iri_log_item *irip)
+{
+	return sizeof(struct xfs_iri_log_format);
+}
+
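+/*
+ * This returns the number of iovecs needed to log the given iri item.
+ * We only need 1 iovec for an iri item.  It just logs the iri_log_format
+ * structure.
+ */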
+STATIC void
+xfs_iri_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	*nvecs += 1;
+	*nbytes += xfs_iri_item_sizeof(IRI_ITEM(lip));
+}
+
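+/*
+ * This is called to fill in the log iovecs for the given iri item. We copy
+ * the iri_log_format structure into a single iovec.
+ */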
+STATIC void
+xfs_iri_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_iri_log_item	*irip = IRI_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+
+	irip->iri_format.iri_type = XFS_LI_IRI;
+	irip->iri_format.iri_size = 1;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_IRI_FORMAT,
+			&irip->iri_format,
+			xfs_iri_item_sizeof(irip));
+}
+
+/*
+ * The unpin operation is the last place an IRI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the IRI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the IRI to either construct
+ * and commit the IRD or drop the IRD's reference in the event of error. Simply
+ * drop the log's IRI reference now that the log is done with it.
+ */
+STATIC void
+xfs_iri_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+	struct xfs_iri_log_item *irip = IRI_ITEM(lip);
+	xfs_iri_release(irip);
+}
+
+/*
+ * The IRI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an IRD isn't going to be
+ * constructed and thus we free the IRI here directly.
+ */
+STATIC void
+xfs_iri_item_release(
+	struct xfs_log_item     *lip)
+{
+	xfs_iri_release(IRI_ITEM(lip));
+}
+
+/*
+ * This is the ops vector shared by all iri log items.
+ */
+static const struct xfs_item_ops xfs_iri_item_ops = {
+	.iop_size	= xfs_iri_item_size,
+	.iop_format	= xfs_iri_item_format,
+	.iop_unpin	= xfs_iri_item_unpin,
+	.iop_release	= xfs_iri_item_release,
+};
+
+/*
+ * Allocate and initialize an iri item. The wip ino is filled in
+ * later, when the intent is logged via ->log_item.
+ */
+struct xfs_iri_log_item *
+xfs_iri_init(struct xfs_mount  *mp,
+	     uint		count)
+{
+	struct xfs_iri_log_item *irip;
+
+	irip = kmem_zone_zalloc(xfs_iri_zone, KM_SLEEP);
+
+	xfs_log_item_init(mp, &irip->iri_item, XFS_LI_IRI, &xfs_iri_item_ops);
+	irip->iri_format.iri_id = (uintptr_t)(void *)irip;
+	atomic_set(&irip->iri_refcount, 2);
+
+	return irip;
+}
+
+static inline struct xfs_ird_log_item *IRD_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_ird_log_item, ird_item);
+}
+
+STATIC void
+xfs_ird_item_free(struct xfs_ird_log_item *irdp)
+{
+	kmem_zone_free(xfs_ird_zone, irdp);
+}
+
+/*
+ * This returns the number of iovecs needed to log the given ird item.
+ * We only need 1 iovec for an ird item.  It just logs the ird_log_format
+ * structure.
+ */
+STATIC void
+xfs_ird_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	*nvecs += 1;
+	*nbytes += sizeof(struct xfs_ird_log_format);
+}
+
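+/*
+ * This is called to fill in the log iovecs for the given ird item. We copy
+ * the ird_log_format structure into a single iovec.
+ */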
+STATIC void
+xfs_ird_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_ird_log_item *irdp = IRD_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+
+	irdp->ird_format.ird_type = XFS_LI_IRD;
+	irdp->ird_format.ird_size = 1;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_IRD_FORMAT, &irdp->ird_format,
+			sizeof(struct xfs_ird_log_format));
+}
+
+/*
+ * The IRD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the IRI and free the
+ * IRD.
+ */
+STATIC void
+xfs_ird_item_release(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_ird_log_item	*irdp = IRD_ITEM(lip);
+
+	xfs_iri_release(irdp->ird_irip);
+	xfs_ird_item_free(irdp);
+}
+
+static const struct xfs_item_ops xfs_ird_item_ops = {
+	.flags		= XFS_ITEM_RELEASE_WHEN_COMMITTED,
+	.iop_size	= xfs_ird_item_size,
+	.iop_format	= xfs_ird_item_format,
+	.iop_release	= xfs_ird_item_release,
+};
+
+static struct xfs_ird_log_item *
+xfs_trans_get_ird(
+	struct xfs_trans		*tp,
+	struct xfs_iri_log_item		*irip)
+{
+	struct xfs_ird_log_item	*irdp;
+
+	ASSERT(tp != NULL);
+
+	irdp = kmem_zone_zalloc(xfs_ird_zone, KM_SLEEP);
+	xfs_log_item_init(tp->t_mountp, &irdp->ird_item, XFS_LI_IRD,
+			  &xfs_ird_item_ops);
+	irdp->ird_irip = irip;
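+	/*
+	 * Stash the wip ino and the intent id so that log recovery can
+	 * match this IRD to its IRI.
+	 */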
+	irdp->ird_format.wip_ino = irip->iri_format.wip_ino;
+	irdp->ird_format.ird_iri_id = irip->iri_format.iri_id;
+
+	xfs_trans_add_item(tp, &irdp->ird_item);
+	return irdp;
+}
+
+/* record an iunlink remove intent */
+int
+xfs_iunlink_remove_add(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*wip)
+{
+	struct xfs_iunlink_remove_intent	*ii;
+
+	ii = kmem_alloc(sizeof(struct xfs_iunlink_remove_intent),
+			KM_SLEEP | KM_NOFS);
+	INIT_LIST_HEAD(&ii->ri_list);
+	ii->wip = wip;
+
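+	/* defer the unlinked list removal until after the tp commits */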
+	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_IUNRE, &ii->ri_list);
+	return 0;
+}
+
+/* Sort iunlink remove intents by AG. */
+static int
+xfs_iunlink_remove_diff_items(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_mount			*mp = priv;
+	struct xfs_iunlink_remove_intent	*ra;
+	struct xfs_iunlink_remove_intent	*rb;
+
+	ra = container_of(a, struct xfs_iunlink_remove_intent, ri_list);
+	rb = container_of(b, struct xfs_iunlink_remove_intent, ri_list);
+	return	XFS_INO_TO_AGNO(mp, ra->wip->i_ino) -
+		XFS_INO_TO_AGNO(mp, rb->wip->i_ino);
+}
+
+/* Get an IRI */
+STATIC void *
+xfs_iunlink_remove_create_intent(
+	struct xfs_trans		*tp,
+	unsigned int			count)
+{
+	struct xfs_iri_log_item		*irip;
+
+	ASSERT(tp != NULL);
+	ASSERT(count == 1);
+
+	irip = xfs_iri_init(tp->t_mountp, count);
+	ASSERT(irip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &irip->iri_item);
+	return irip;
+}
+
+/* Log an iunlink remove to the intent item. */
+STATIC void
+xfs_iunlink_remove_log_item(
+	struct xfs_trans		*tp,
+	void				*intent,
+	struct list_head		*item)
+{
+	struct xfs_iri_log_item			*irip = intent;
+	struct xfs_iunlink_remove_intent	*iunre;
+
+	iunre = container_of(item, struct xfs_iunlink_remove_intent, ri_list);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &irip->iri_item.li_flags);
+
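+	/* record the whiteout inode number in the intent log format */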
+	irip->iri_format.wip_ino = (uint64_t)(iunre->wip->i_ino);
+}
+
+/* Get an IRD so we can process all the deferred iunlink removes. */
+STATIC void *
+xfs_iunlink_remove_create_done(
+	struct xfs_trans		*tp,
+	void				*intent,
+	unsigned int			count)
+{
+	return xfs_trans_get_ird(tp, intent);
+}
+
+/*
+ * For whiteouts, we need to bump the link count on the whiteout inode.
+ * This means that failures all the way up to this point leave the inode
+ * on the unlinked list and so cleanup is a simple matter of dropping
+ * the remaining reference to it. If we fail here after bumping the link
+ * count, we're shutting down the filesystem so we'll never see the
+ * intermediate state on disk.
+ */
+static int
+xfs_trans_log_finish_iunlink_remove(
+	struct xfs_trans		*tp,
+	struct xfs_ird_log_item		*irdp,
+	struct xfs_inode		*wip)
+{
+	int	error;
+
+	ASSERT(xfs_isilocked(wip, XFS_ILOCK_EXCL));
+
+	xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
+
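+	/* bump the link count and pull wip off the unlinked list */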
+	ASSERT(VFS_I(wip)->i_nlink == 0);
+	xfs_bumplink(tp, wip);
+	error = xfs_iunlink_remove(tp, wip);
+	xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
+	/*
+	 * Now we have a real link, clear the "I'm a tmpfile" state
+	 * flag from the inode so it doesn't accidentally get misused in
+	 * future.
+	 */
+	VFS_I(wip)->i_state &= ~I_LINKABLE;
+
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the IRI and frees the IRD
+	 * 2.) shuts down the filesystem
+	 */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &irdp->ird_item.li_flags);
+
+	return error;
+}
+
+/* Process a deferred iunlink remove. */
+STATIC int
+xfs_iunlink_remove_finish_item(
+	struct xfs_trans		*tp,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_iunlink_remove_intent	*iunre;
+	int					error;
+
+	iunre = container_of(item, struct xfs_iunlink_remove_intent, ri_list);
+	error = xfs_trans_log_finish_iunlink_remove(tp, done_item,
+			iunre->wip);
+	kmem_free(iunre);
+	return error;
+}
+
+/* Abort all pending IRIs. */
+STATIC void
+xfs_iunlink_remove_abort_intent(
+	void		*intent)
+{
+	xfs_iri_release(intent);
+}
+
+/* Cancel a deferred iunlink remove. */
+STATIC void
+xfs_iunlink_remove_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_iunlink_remove_intent	*iunre;
+
+	iunre = container_of(item, struct xfs_iunlink_remove_intent, ri_list);
+	kmem_free(iunre);
+}
+
+const struct xfs_defer_op_type xfs_iunlink_remove_defer_type = {
+	.diff_items	= xfs_iunlink_remove_diff_items,
+	.create_intent	= xfs_iunlink_remove_create_intent,
+	.abort_intent	= xfs_iunlink_remove_abort_intent,
+	.log_item	= xfs_iunlink_remove_log_item,
+	.create_done	= xfs_iunlink_remove_create_done,
+	.finish_item	= xfs_iunlink_remove_finish_item,
+	.cancel_item	= xfs_iunlink_remove_cancel_item,
+};
+
+/*
+ * Process an iunlink remove intent item that was recovered from the log.
+ */
+int
+xfs_iri_recover(
+	struct xfs_trans		*parent_tp,
+	struct xfs_iri_log_item		*irip)
+{
+	int				error = 0;
+	struct xfs_trans		*tp;
+	xfs_ino_t			ino;
+	struct xfs_inode		*ip;
+	struct xfs_mount		*mp = parent_tp->t_mountp;
+	struct xfs_ird_log_item		*irdp;
+
+	ASSERT(!test_bit(XFS_IRI_RECOVERED, &irip->iri_flags));
+
+	ino = irip->iri_format.wip_ino;
+	if (ino == NULLFSINO || !xfs_verify_dir_ino(mp, ino)) {
+		xfs_alert(mp, "IRI recover used bad inode ino 0x%llx!", ino);
+		set_bit(XFS_IRI_RECOVERED, &irip->iri_flags);
+		xfs_iri_release(irip);
+		return -EIO;
+	}
+	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
+	if (error)
+		return error;
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
+	if (error) {
+		xfs_irele(ip);
+		return error;
+	}
+	irdp = xfs_trans_get_ird(tp, irip);
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
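+	/* redo the unlinked list removal for the recovered whiteout inode */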
+	ASSERT(VFS_I(ip)->i_nlink == 0);
+	VFS_I(ip)->i_state |= I_LINKABLE;
+	xfs_bumplink(tp, ip);
+	error = xfs_iunlink_remove(tp, ip);
+	if (error)
+		goto abort_error;
+	VFS_I(ip)->i_state &= ~I_LINKABLE;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &irdp->ird_item.li_flags);
+
+	set_bit(XFS_IRI_RECOVERED, &irip->iri_flags);
+	error = xfs_trans_commit(tp);
+	xfs_irele(ip);
+	return error;
+
+abort_error:
+	xfs_trans_cancel(tp);
+	xfs_irele(ip);
+	return error;
+}
diff --git a/fs/xfs/xfs_iunlinkrm_item.h b/fs/xfs/xfs_iunlinkrm_item.h
new file mode 100644
index 0000000..54c4ca3
--- /dev/null
+++ b/fs/xfs/xfs_iunlinkrm_item.h
@@ -0,0 +1,67 @@ 
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2019 Tencent.  All Rights Reserved.
+ * Author: Kaixuxia <kaixuxia@tencent.com>
+ */
+#ifndef	__XFS_IUNLINKRM_ITEM_H__
+#define	__XFS_IUNLINKRM_ITEM_H__
+
+/*
+ * When performing a rename operation with the RENAME_WHITEOUT flag, we
+ * first hold the AGF lock while allocating or freeing extents to
+ * manipulate the dirents, and then call xfs_iunlink_remove() last, which
+ * holds the AGI lock to modify the tmpfile info, so the lock order here
+ * is AGF->AGI.
+ *
+ * The big problem is that we have an ordering constraint on AGF and AGI
+ * locking - inode allocation locks the AGI, then can allocate a new extent
+ * for new inodes, locking the AGF after the AGI. Hence the ordering that
+ * is imposed by other parts of the code is AGI before AGF, and we get the
+ * ABBA agi&agf deadlock here.
+ *
+ * So make the unlinked list removal a deferred operation, i.e. log an
+ * iunlink remove intent and then do it after the RENAME_WHITEOUT
+ * transaction has committed, with the iunlink remove intent (IRI) and
+ * done (IRD) log items provided.
+ */
+
+/* kernel only IRI/IRD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+struct xfs_inode;
+
+/*
+ * Define IRI flag bits. Manipulated by set/clear/test_bit operators.
+ */
+#define	XFS_IRI_RECOVERED		1
+
+/*
+ * This is the "iunlink remove intent" (IRI) log item. It is used in
+ * conjunction with the "iunlink remove done" (IRD) log item described
+ * below.
+ */
+typedef struct xfs_iri_log_item {
+	struct xfs_log_item	iri_item;
+	atomic_t		iri_refcount;
+	unsigned long		iri_flags;
+	xfs_iri_log_format_t	iri_format;
+} xfs_iri_log_item_t;
+
+/* This is the "iunlink remove done" log item. */
+typedef struct xfs_ird_log_item {
+	struct xfs_log_item	ird_item;
+	xfs_iri_log_item_t	*ird_irip;
+	xfs_ird_log_format_t	ird_format;
+} xfs_ird_log_item_t;
+
+struct xfs_iunlink_remove_intent {
+	struct list_head		ri_list;
+	struct xfs_inode		*wip;
+};
+
+extern struct kmem_zone	*xfs_iri_zone;
+extern struct kmem_zone	*xfs_ird_zone;
+
+struct xfs_iri_log_item	*xfs_iri_init(struct xfs_mount *, uint);
+void xfs_iri_item_free(struct xfs_iri_log_item *);
+void xfs_iri_release(struct xfs_iri_log_item *);
+int xfs_iri_recover(struct xfs_trans *, struct xfs_iri_log_item *);
+int xfs_iunlink_remove_add(struct xfs_trans *, struct xfs_inode *);
+
+#endif	/* __XFS_IUNLINKRM_ITEM_H__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 00e9f5c..f87f510 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2005,6 +2005,8 @@  STATIC void xlog_state_done_syncing(
  	    REG_TYPE_STR(CUD_FORMAT, "cud_format"),
  	    REG_TYPE_STR(BUI_FORMAT, "bui_format"),
  	    REG_TYPE_STR(BUD_FORMAT, "bud_format"),
+	    REG_TYPE_STR(IRI_FORMAT, "iri_format"),
+	    REG_TYPE_STR(IRD_FORMAT, "ird_format"),
  	};
  	BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
  #undef REG_TYPE_STR
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 13d1d3e..a916f40 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -33,6 +33,7 @@ 
  #include "xfs_buf_item.h"
  #include "xfs_refcount_item.h"
  #include "xfs_bmap_item.h"
+#include "xfs_iunlinkrm_item.h"

  #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)

@@ -1885,6 +1886,8 @@  struct xfs_buf_cancel {
  		case XFS_LI_CUD:
  		case XFS_LI_BUI:
  		case XFS_LI_BUD:
+		case XFS_LI_IRI:
+		case XFS_LI_IRD:
  			trace_xfs_log_recover_item_reorder_tail(log,
  							trans, item, pass);
  			list_move_tail(&item->ri_list, &inode_list);
@@ -3752,6 +3755,96 @@  struct xfs_buf_cancel {
  }

  /*
+ * This routine is called to create an in-core iunlink remove intent
+ * item from the iri format structure which was logged on disk.
+ * It allocates an in-core iri, copies the format structure into it,
+ * and adds the iri to the AIL with the given LSN.
+ */
+STATIC int
+xlog_recover_iri_pass2(
+	struct xlog			*log,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	xfs_mount_t		*mp = log->l_mp;
+	xfs_iri_log_item_t	*irip;
+	xfs_iri_log_format_t	*iri_formatp;
+
+	iri_formatp = item->ri_buf[0].i_addr;
+	if (item->ri_buf[0].i_len != sizeof(xfs_iri_log_format_t))
+		return -EFSCORRUPTED;
+
+	irip = xfs_iri_init(mp, 1);
+	irip->iri_format = *iri_formatp;
+
+	spin_lock(&log->l_ailp->ail_lock);
+	/*
+	 * The IRI has two references. One for the IRD and one for IRI to ensure
+	 * it makes it into the AIL. Insert the IRI into the AIL directly and
+	 * drop the IRI reference. Note that xfs_trans_ail_update() drops the
+	 * AIL lock.
+	 */
+	xfs_trans_ail_update(log->l_ailp, &irip->iri_item, lsn);
+	xfs_iri_release(irip);
+	return 0;
+}
+
+/*
+ * This routine is called when an IRD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding IRI if it
+ * was still in the log. To do this it searches the AIL for the IRI with an id
+ * equal to that in the IRD format structure. If we find it we drop the IRD
+ * reference, which removes the IRI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_ird_pass2(
+	struct xlog			*log,
+	struct xlog_recover_item	*item)
+{
+	xfs_ird_log_format_t	*ird_formatp;
+	xfs_iri_log_item_t	*irip = NULL;
+	struct xfs_log_item	*lip;
+	uint64_t		iri_id;
+	struct xfs_ail_cursor	cur;
+	struct xfs_ail		*ailp = log->l_ailp;
+
+	ird_formatp = item->ri_buf[0].i_addr;
+	if (item->ri_buf[0].i_len != sizeof(xfs_ird_log_format_t))
+		return -EFSCORRUPTED;
+	iri_id = ird_formatp->ird_iri_id;
+
+	/*
+	 * Search for the iri with the id in the ird format structure
+	 * in the AIL.
+	 */
+	spin_lock(&ailp->ail_lock);
+	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+	while (lip != NULL) {
+		if (lip->li_type == XFS_LI_IRI) {
+			irip = (xfs_iri_log_item_t *)lip;
+			if (irip->iri_format.iri_id == iri_id) {
+				/*
+				 * Drop the IRD reference to the IRI. This
+				 * removes the IRI from the AIL and frees it.
+				 */
+				spin_unlock(&ailp->ail_lock);
+				xfs_iri_release(irip);
+				spin_lock(&ailp->ail_lock);
+				break;
+			}
+		}
+		lip = xfs_trans_ail_cursor_next(ailp, &cur);
+	}
+	xfs_trans_ail_cursor_done(&cur);
+	spin_unlock(&ailp->ail_lock);
+
+	return 0;
+}
+
+/*
   * This routine is called when an inode create format structure is found in a
   * committed transaction in the log.  It's purpose is to initialise the inodes
   * being allocated on disk. This requires us to get inode cluster buffers that
@@ -3981,6 +4074,8 @@  struct xfs_buf_cancel {
  	case XFS_LI_CUD:
  	case XFS_LI_BUI:
  	case XFS_LI_BUD:
+	case XFS_LI_IRI:
+	case XFS_LI_IRD:
  	default:
  		break;
  	}
@@ -4010,6 +4105,8 @@  struct xfs_buf_cancel {
  	case XFS_LI_CUD:
  	case XFS_LI_BUI:
  	case XFS_LI_BUD:
+	case XFS_LI_IRI:
+	case XFS_LI_IRD:
  		/* nothing to do in pass 1 */
  		return 0;
  	default:
@@ -4052,6 +4149,10 @@  struct xfs_buf_cancel {
  		return xlog_recover_bui_pass2(log, item, trans->r_lsn);
  	case XFS_LI_BUD:
  		return xlog_recover_bud_pass2(log, item);
+	case XFS_LI_IRI:
+		return xlog_recover_iri_pass2(log, item, trans->r_lsn);
+	case XFS_LI_IRD:
+		return xlog_recover_ird_pass2(log, item);
  	case XFS_LI_DQUOT:
  		return xlog_recover_dquot_pass2(log, buffer_list, item,
  						trans->r_lsn);
@@ -4721,6 +4822,46 @@  struct xfs_buf_cancel {
  	spin_lock(&ailp->ail_lock);
  }

+/* Recover the IRI if necessary. */
+STATIC int
+xlog_recover_process_iri(
+	struct xfs_trans		*parent_tp,
+	struct xfs_ail			*ailp,
+	struct xfs_log_item		*lip)
+{
+	struct xfs_iri_log_item		*irip;
+	int				error;
+
+	/*
+	 * Skip IRIs that we've already processed.
+	 */
+	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
+	if (test_bit(XFS_IRI_RECOVERED, &irip->iri_flags))
+		return 0;
+
+	spin_unlock(&ailp->ail_lock);
+	error = xfs_iri_recover(parent_tp, irip);
+	spin_lock(&ailp->ail_lock);
+
+	return error;
+}
+
+/* Release the IRI since we're cancelling everything. */
+STATIC void
+xlog_recover_cancel_iri(
+	struct xfs_mount		*mp,
+	struct xfs_ail			*ailp,
+	struct xfs_log_item		*lip)
+{
+	struct xfs_iri_log_item		*irip;
+
+	irip = container_of(lip, struct xfs_iri_log_item, iri_item);
+
+	spin_unlock(&ailp->ail_lock);
+	xfs_iri_release(irip);
+	spin_lock(&ailp->ail_lock);
+}
+
  /* Is this log item a deferred action intent? */
  static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
  {
@@ -4729,6 +4870,7 @@  static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
  	case XFS_LI_RUI:
  	case XFS_LI_CUI:
  	case XFS_LI_BUI:
+	case XFS_LI_IRI:
  		return true;
  	default:
  		return false;
@@ -4856,6 +4998,9 @@  static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
  		case XFS_LI_BUI:
  			error = xlog_recover_process_bui(parent_tp, ailp, lip);
  			break;
+		case XFS_LI_IRI:
+			error = xlog_recover_process_iri(parent_tp, ailp, lip);
+			break;
  		}
  		if (error)
  			goto out;
@@ -4912,6 +5057,9 @@  static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
  		case XFS_LI_BUI:
  			xlog_recover_cancel_bui(log->l_mp, ailp, lip);
  			break;
+		case XFS_LI_IRI:
+			xlog_recover_cancel_iri(log->l_mp, ailp, lip);
+			break;
  		}

  		lip = xfs_trans_ail_cursor_next(ailp, &cur);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f945023..66742b7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -34,6 +34,7 @@ 
  #include "xfs_rmap_item.h"
  #include "xfs_refcount_item.h"
  #include "xfs_bmap_item.h"
+#include "xfs_iunlinkrm_item.h"
  #include "xfs_reflink.h"

  #include <linux/magic.h>
@@ -1957,8 +1958,22 @@  struct proc_xfs_info {
  	if (!xfs_bui_zone)
  		goto out_destroy_bud_zone;

+	xfs_ird_zone = kmem_zone_init(sizeof(xfs_ird_log_item_t),
+			"xfs_ird_item");
+	if (!xfs_ird_zone)
+		goto out_destroy_bui_zone;
+
+	xfs_iri_zone = kmem_zone_init(sizeof(xfs_iri_log_item_t),
+			"xfs_iri_item");
+	if (!xfs_iri_zone)
+		goto out_destroy_ird_zone;
+
  	return 0;

+ out_destroy_ird_zone:
+	kmem_zone_destroy(xfs_ird_zone);
+ out_destroy_bui_zone:
+	kmem_zone_destroy(xfs_bui_zone);
   out_destroy_bud_zone:
  	kmem_zone_destroy(xfs_bud_zone);
   out_destroy_cui_zone:
@@ -2007,6 +2022,8 @@  struct proc_xfs_info {
  	 * destroy caches.
  	 */
  	rcu_barrier();
+	kmem_zone_destroy(xfs_iri_zone);
+	kmem_zone_destroy(xfs_ird_zone);
  	kmem_zone_destroy(xfs_bui_zone);
  	kmem_zone_destroy(xfs_bud_zone);
  	kmem_zone_destroy(xfs_cui_zone);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 64d7f17..dd63eaa 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -26,6 +26,8 @@ 
  struct xfs_cud_log_item;
  struct xfs_bui_log_item;
  struct xfs_bud_log_item;
+struct xfs_iri_log_item;
+struct xfs_ird_log_item;

  struct xfs_log_item {
  	struct list_head		li_ail;		/* AIL pointers */