From patchwork Tue Jun 23 09:50:12 2020
X-Patchwork-Submitter: Dave Chinner
X-Patchwork-Id: 11620209
From: Dave Chinner
To: linux-xfs@vger.kernel.org
Subject: [PATCH 1/4] xfs: xfs_iflock is no longer a completion
Date: Tue, 23 Jun 2020 19:50:12 +1000
Message-Id: <20200623095015.1934171-2-david@fromorbit.com>
In-Reply-To: <20200623095015.1934171-1-david@fromorbit.com>
References: <20200623095015.1934171-1-david@fromorbit.com>

From: Dave Chinner

With the recent rework of the inode cluster flushing, we no longer ever
wait on the inode flush "lock". It was never a lock in the first place,
just a completion to allow callers to wait for inode IO to complete. We
now never wait for flush completion as all inode flushing is
non-blocking. Hence we can get rid of all the iflock infrastructure and
instead just set and check a state flag.

Rename the XFS_IFLOCK flag to XFS_IFLUSHING, convert all the
xfs_iflock_nowait() calls to test-and-set operations on that flag, and
replace all the xfs_ifunlock() calls with clear operations.
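As a rough illustration of the flag discipline this patch moves to, here
is a minimal userspace sketch using C11 atomics. It is a stand-in, not
the kernel code: XFS does the equivalent with xfs_iflags_test_and_set()
and xfs_iflags_clear(), which operate under ip->i_flags_lock.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define IFLUSHING	(1u << 7)	/* stand-in for XFS_IFLUSHING */

struct inode_state {
	_Atomic unsigned int	flags;
};

/* Claim the flushing state; fails if a flush is already in progress. */
static bool flush_start(struct inode_state *ip)
{
	return !(atomic_fetch_or(&ip->flags, IFLUSHING) & IFLUSHING);
}

/* Flush IO has completed; drop the flushing state. */
static void flush_done(struct inode_state *ip)
{
	atomic_fetch_and(&ip->flags, ~IFLUSHING);
}

int main(void)
{
	struct inode_state inode = { 0 };

	if (flush_start(&inode))
		printf("flushing inode\n");
	if (!flush_start(&inode))
		printf("skipped: already flushing\n");
	flush_done(&inode);
	return 0;
}

The point of the conversion is that nothing ever sleeps waiting for the
flag to clear any more, so the waitqueue and wake_up_bit() machinery of
the old flush "lock" can go away entirely.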
Signed-off-by: Dave Chinner --- fs/xfs/xfs_icache.c | 19 ++++++------ fs/xfs/xfs_inode.c | 67 +++++++++++++++-------------------------- fs/xfs/xfs_inode.h | 33 +------------------- fs/xfs/xfs_inode_item.c | 6 ++-- fs/xfs/xfs_inode_item.h | 4 +-- 5 files changed, 39 insertions(+), 90 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index a973f180c6cd..0d73559f2d58 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -54,7 +54,6 @@ xfs_inode_alloc( XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); /* initialise the xfs inode */ @@ -125,7 +124,7 @@ void xfs_inode_free( struct xfs_inode *ip) { - ASSERT(!xfs_isiflocked(ip)); + ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING)); /* * Because we use RCU freeing we need to ensure the inode always @@ -999,7 +998,7 @@ xfs_reclaim_inode_grab( * Do unlocked checks to see if the inode already is being flushed or in * reclaim to avoid lock traffic. */ - if (__xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) + if (__xfs_iflags_test(ip, XFS_IFLUSHING | XFS_IRECLAIM)) return 1; /* @@ -1045,23 +1044,23 @@ xfs_reclaim_inode( if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) goto out; - if (!xfs_iflock_nowait(ip)) + if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) goto out_iunlock; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip); + ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING)); goto reclaim; } if (xfs_ipincount(ip)) - goto out_ifunlock; + goto out_clear_flush; if (!xfs_inode_clean(ip)) - goto out_ifunlock; + goto out_clear_flush; - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); reclaim: - ASSERT(!xfs_isiflocked(ip)); /* * Because we use RCU freeing we need to ensure the inode always appears @@ -1111,8 +1110,8 @@ xfs_reclaim_inode( __xfs_inode_free(ip); return true; -out_ifunlock: - xfs_ifunlock(ip); +out_clear_flush: + xfs_iflags_clear(ip, XFS_IFLUSHING); out_iunlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); out: diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bae84b3eeb9a..ba86d27b5226 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -598,22 +598,6 @@ xfs_lock_two_inodes( } } -void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wq_entry); -} - STATIC uint _xfs_dic2xflags( uint16_t di_flags, @@ -2547,11 +2531,8 @@ xfs_ifree_mark_inode_stale( * valid, the wrong inode or stale. */ spin_lock(&ip->i_flags_lock); - if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - return; - } + if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) + goto out_iflags_unlock; /* * Don't try to lock/unlock the current inode, but we _cannot_ skip the @@ -2568,16 +2549,14 @@ xfs_ifree_mark_inode_stale( } } ip->i_flags |= XFS_ISTALE; - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); /* - * If we can't get the flush lock, the inode is already attached. All + * If the inode is flushing, it is already attached to the buffer. All * we needed to do here is mark the inode stale so buffer IO completion * will remove it from the AIL. 
*/ iip = ip->i_itemp; - if (!xfs_iflock_nowait(ip)) { + if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { ASSERT(!list_empty(&iip->ili_item.li_bio_list)); ASSERT(iip->ili_last_fields); goto out_iunlock; @@ -2589,10 +2568,12 @@ xfs_ifree_mark_inode_stale( * commit as the flock synchronises removal of the inode from the * cluster buffer against inode reclaim. */ - if (!iip || list_empty(&iip->ili_item.li_bio_list)) { - xfs_ifunlock(ip); + if (!iip || list_empty(&iip->ili_item.li_bio_list)) goto out_iunlock; - } + + __xfs_iflags_set(ip, XFS_IFLUSHING); + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); /* we have a dirty inode in memory that has not yet been flushed. */ spin_lock(&iip->ili_lock); @@ -2602,9 +2583,16 @@ xfs_ifree_mark_inode_stale( spin_unlock(&iip->ili_lock); ASSERT(iip->ili_last_fields); + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return; + out_iunlock: if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_iflags_unlock: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); } /* @@ -3459,7 +3447,7 @@ xfs_iflush( int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); + ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING)); ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip->ili_item.li_buf == bp); @@ -3629,7 +3617,7 @@ xfs_iflush_cluster( /* * Quick and dirty check to avoid locks if possible. */ - if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) continue; if (xfs_ipincount(ip)) continue; @@ -3643,7 +3631,7 @@ xfs_iflush_cluster( */ spin_lock(&ip->i_flags_lock); ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); - if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) { + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { spin_unlock(&ip->i_flags_lock); continue; } @@ -3651,23 +3639,16 @@ xfs_iflush_cluster( /* * ILOCK will pin the inode against reclaim and prevent * concurrent transactions modifying the inode while we are - * flushing the inode. + * flushing the inode. If we get the lock, set the flushing + * state before we drop the i_flags_lock. */ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { spin_unlock(&ip->i_flags_lock); continue; } + __xfs_iflags_set(ip, XFS_IFLUSHING); spin_unlock(&ip->i_flags_lock); - /* - * Skip inodes that are already flush locked as they have - * already been written to the buffer. - */ - if (!xfs_iflock_nowait(ip)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - continue; - } - /* * Abort flushing this inode if we are shut down because the * inode may not currently be in the AIL. 
This can occur when @@ -3686,7 +3667,7 @@ xfs_iflush_cluster( /* don't block waiting on a log force to unpin dirty inodes */ if (xfs_ipincount(ip)) { - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); xfs_iunlock(ip, XFS_ILOCK_SHARED); continue; } @@ -3694,7 +3675,7 @@ xfs_iflush_cluster( if (!xfs_inode_clean(ip)) error = xfs_iflush(ip, bp); else - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) break; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7a8adb76c17f..991ef00370d5 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -211,8 +211,7 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) #define XFS_INEW (1 << __XFS_INEW_BIT) #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ -#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ -#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) +#define XFS_IFLUSHING (1 << 7) /* inode is being flushed */ #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */ @@ -233,36 +232,6 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) -/* - * Synchronize processes attempting to flush the in-core inode back to disk. - */ - -static inline int xfs_isiflocked(struct xfs_inode *ip) -{ - return xfs_iflags_test(ip, XFS_IFLOCK); -} - -extern void __xfs_iflock(struct xfs_inode *ip); - -static inline int xfs_iflock_nowait(struct xfs_inode *ip) -{ - return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); -} - -static inline void xfs_iflock(struct xfs_inode *ip) -{ - if (!xfs_iflock_nowait(ip)) - __xfs_iflock(ip); -} - -static inline void xfs_ifunlock(struct xfs_inode *ip) -{ - ASSERT(xfs_isiflocked(ip)); - xfs_iflags_clear(ip, XFS_IFLOCK); - smp_mb(); - wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); -} - /* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 3840117f8a5e..0494b907c63d 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -492,7 +492,7 @@ xfs_inode_item_push( return XFS_ITEM_PINNED; /* If the inode is already flush locked, we're already flushing. */ - if (xfs_isiflocked(ip)) + if (xfs_iflags_test(ip, XFS_IFLUSHING)) return XFS_ITEM_FLUSHING; if (!xfs_buf_trylock(bp)) @@ -702,7 +702,7 @@ xfs_iflush_finish( iip->ili_last_fields = 0; iip->ili_flush_lsn = 0; spin_unlock(&iip->ili_lock); - xfs_ifunlock(iip->ili_inode); + xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING); if (drop_buffer) xfs_buf_rele(bp); } @@ -789,7 +789,7 @@ xfs_iflush_abort( list_del_init(&iip->ili_item.li_bio_list); spin_unlock(&iip->ili_lock); } - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); if (bp) xfs_buf_rele(bp); } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 048b5e7dee90..23a7b4928727 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -25,8 +25,8 @@ struct xfs_inode_log_item { * * We need atomic changes between inode dirtying, inode flushing and * inode completion, but these all hold different combinations of - * ILOCK and iflock and hence we need some other method of serialising - * updates to the flush state. + * ILOCK and IFLUSHING and hence we need some other method of + * serialising updates to the flush state. 
*/ spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */

From patchwork Tue Jun 23 09:50:13 2020
X-Patchwork-Submitter: Dave Chinner
X-Patchwork-Id: 11620207
From: Dave Chinner
To: linux-xfs@vger.kernel.org
Subject: [PATCH 2/4] xfs: add log item precommit operation
Date: Tue, 23 Jun 2020 19:50:13 +1000
Message-Id: <20200623095015.1934171-3-david@fromorbit.com>
In-Reply-To: <20200623095015.1934171-1-david@fromorbit.com>
References: <20200623095015.1934171-1-david@fromorbit.com>

From: Dave Chinner

For inodes that are dirty, we have an attached cluster buffer that we
want to use to track the dirty inode through the AIL. Unfortunately,
locking the cluster buffer and adding it to the transaction when the
inode is first logged in a transaction leads to buffer lock ordering
inversions.

The specific problem is ordering against the AGI buffer. When modifying
unlinked lists, the buffer lock order is AGI -> inode cluster buffer,
as the AGI buffer lock serialises all access to the unlinked lists.
Unfortunately, functionality like xfs_droplink() logs the inode before
calling xfs_iunlink(), as do various directory manipulation functions.
The inode can be logged way down in the stack, as far as the bmapi
routines, and hence, without a major rewrite of lots of APIs, there's
no way we can avoid the inode being logged by something until after the
AGI has been logged.

As we are going to be using ordered buffers for inode AIL tracking,
there isn't a need to actually lock that buffer against modification as
all the modifications are captured by logging the inode item itself.
Hence we don't actually need to join the cluster buffer into the
transaction until just before it is committed.
This means we do not perturb any of the existing buffer lock orders in transactions, and the inode cluster buffer is always locked last in a transaction that doesn't otherwise touch inode cluster buffers. We do this by introducing a precommit log item method. A log item method is used because it is likely dquots will be moved to this same ordered buffer tracking scheme and hence will need a similar callout. This commit just introduces the mechanism; the inode item implementation is in followup commits. The precommit items need to be sorted into consistent order as we may be locking multiple items here. Hence if we have two dirty inodes in cluster buffers A and B, and some other transaction has two separate dirty inodes in the same cluster buffers, locking them in different orders opens us up to ABBA deadlocks. Hence we sort the items on the transaction based on the presence of a sort log item method. Signed-off-by: Dave Chinner --- fs/xfs/xfs_icache.c | 1 + fs/xfs/xfs_trans.c | 90 +++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trans.h | 6 ++- 3 files changed, 95 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0d73559f2d58..1c744dbb313f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1077,6 +1077,7 @@ xfs_reclaim_inode( ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); + ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 3c94e5ff4316..6f350490f84b 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -799,6 +799,89 @@ xfs_trans_committed_bulk( spin_unlock(&ailp->ail_lock); } +/* + * Sort transaction items prior to running precommit operations. This will + * attempt to order the items such that they will always be locked in the same + * order. Items that have no sort function are moved to the end of the list + * and so are locked last (XXX: need to check the logic matches the comment). + * + * This may need refinement as different types of objects add sort functions. + * + * Function is more complex than it needs to be because we are comparing 64 bit + * values and the function only returns 32 bit values. + */ +static int +xfs_trans_precommit_sort( + void *unused_arg, + struct list_head *a, + struct list_head *b) +{ + struct xfs_log_item *lia = container_of(a, + struct xfs_log_item, li_trans); + struct xfs_log_item *lib = container_of(b, + struct xfs_log_item, li_trans); + int64_t diff; + + if (!lia->li_ops->iop_sort && !lib->li_ops->iop_sort) + return 0; + if (!lia->li_ops->iop_sort) + return 1; + if (!lib->li_ops->iop_sort) + return -1; + + diff = lia->li_ops->iop_sort(lia) - lib->li_ops->iop_sort(lib); + if (diff < 0) + return -1; + if (diff > 0) + return 1; + return 0; +} + +/* + * Run transaction precommit functions. + * + * If there is an error in any of the callouts, then stop immediately and + * trigger a shutdown to abort the transaction. There is no recovery possible + * from errors at this point as the transaction is dirty.... + */ +static int +xfs_trans_run_precommits( + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_log_item *lip, *n; + int error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* + * Sort the item list to avoid ABBA deadlocks with other transactions + * running precommit operations that lock multiple shared items such as + * inode cluster buffers. 
+ */ + list_sort(NULL, &tp->t_items, xfs_trans_precommit_sort); + + /* + * Precommit operations can remove the log item from the transaction + * if the log item exists purely to delay modifications until they + * can be ordered against other operations. Hence we have to use + * list_for_each_entry_safe() here. + */ + list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) { + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) + continue; + if (lip->li_ops->iop_precommit) { + error = lip->li_ops->iop_precommit(tp, lip); + if (error) + break; + } + } + if (error) + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; +} + /* * Commit the given transaction to the log. * @@ -823,6 +906,13 @@ __xfs_trans_commit( trace_xfs_trans_commit(tp, _RET_IP_); + error = xfs_trans_run_precommits(tp); + if (error) { + if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) + xfs_defer_cancel(tp); + goto out_unreserve; + } + /* * Finish deferred items on final commit. Only permanent transactions * should ever have deferred ops. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index b752501818d2..26ea19bd0621 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -70,10 +70,12 @@ struct xfs_item_ops { void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *); void (*iop_pin)(struct xfs_log_item *); void (*iop_unpin)(struct xfs_log_item *, int remove); - uint (*iop_push)(struct xfs_log_item *, struct list_head *); + uint64_t (*iop_sort)(struct xfs_log_item *); + int (*iop_precommit)(struct xfs_trans *, struct xfs_log_item *); void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn); - void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); + uint (*iop_push)(struct xfs_log_item *, struct list_head *); + void (*iop_release)(struct xfs_log_item *); int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); };

From patchwork Tue Jun 23 09:50:14 2020
X-Patchwork-Submitter: Dave Chinner
X-Patchwork-Id: 11620211
From: Dave Chinner
To: linux-xfs@vger.kernel.org
Subject: [PATCH 3/4] xfs: track unlinked inodes in core inode
Date: Tue, 23 Jun 2020 19:50:14 +1000
Message-Id: <20200623095015.1934171-4-david@fromorbit.com>
In-Reply-To: <20200623095015.1934171-1-david@fromorbit.com>
References: <20200623095015.1934171-1-david@fromorbit.com>

From: Dave Chinner

Currently we cache unlinked inode list backrefs through a separate
cache which has to be maintained via memory allocation and a hash
table. When the inode is on the unlinked list, we have an existence
guarantee for the inode in memory.

That is, if the inode is on the unlinked list, there must be a
reference to the inode from the core VFS because dropping the last
reference to the inode causes it to be removed from the unlinked list.
Hence if we hold the AGI locked, we guarantee that any inode on the
unlinked list is pinned in memory. That means we can actually track the
entire unlinked list on the inode itself and use unreferenced inode
cache lookups to update the list pointers as needed.

However, we don't use this relationship because log recovery has no
in-memory state and so has to work directly from buffers. But because
unlink recovery only removes inodes from the head of the list, we can
easily fake the in-memory state: the inode we read in from the AGI
bucket has a pointer to the next inode. Hence we can play reference
leapfrog in the recovery loop, always reading the second inode on the
list and updating pointers before dropping the reference to the first
inode. Hence the in-memory state is always valid for recovery, too.

This means we can tear out the old inode unlinked list cache and update
mechanisms and replace them with much simpler "insert" and "remove"
functions that use in-memory inode state rather than on-buffer state to
track the list. The diffstat speaks for itself.

Food for thought: this obviates the need for the on-disk AGI unlinked
hash. Because we track the list as a doubly linked list in memory, we
don't need to keep hash chains on disk short to minimise previous inode
lookup overhead on list removal. Hence we probably should just convert
the unlinked list code to use a single list on disk...
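To make the new list discipline concrete before the diff, here is a
compressed userspace sketch of the in-memory pointer manipulation. It
is illustrative only: the real xfs_iunlink_insert_inode() and
xfs_iunlink_remove_inode() below also validate the pointers against
corruption and propagate next_unlinked updates to disk through the
transaction, and the flat array here is just a stand-in for the per-AG
inode cache lookup.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NULLAGINO	((uint32_t)-1)	/* list terminator, as in XFS */
#define NBUCKETS	64		/* XFS_AGI_UNLINKED_BUCKETS */

struct ino {
	uint32_t	agino;
	uint32_t	prev_unlinked;	/* NULLAGINO when at the head */
	uint32_t	next_unlinked;	/* NULLAGINO when at the tail */
};

static uint32_t bucket_head[NBUCKETS];	/* models agi_unlinked[] */
static struct ino *itab[256];		/* stand-in for the inode cache */

static struct ino *ilookup(uint32_t agino)
{
	return itab[agino];
}

/* Insert at the head of the bucket; only the old head needs fixup. */
static void iunlink_insert(struct ino *ip)
{
	int b = ip->agino % NBUCKETS;

	ip->prev_unlinked = NULLAGINO;
	ip->next_unlinked = bucket_head[b];
	if (ip->next_unlinked != NULLAGINO)
		ilookup(ip->next_unlinked)->prev_unlinked = ip->agino;
	bucket_head[b] = ip->agino;
}

/* Remove from anywhere in the list by fixing up both neighbours. */
static void iunlink_remove(struct ino *ip)
{
	int b = ip->agino % NBUCKETS;

	if (ip->prev_unlinked == NULLAGINO) {
		assert(bucket_head[b] == ip->agino);
		bucket_head[b] = ip->next_unlinked;
	} else {
		ilookup(ip->prev_unlinked)->next_unlinked = ip->next_unlinked;
	}
	if (ip->next_unlinked != NULLAGINO)
		ilookup(ip->next_unlinked)->prev_unlinked = ip->prev_unlinked;
	ip->prev_unlinked = ip->next_unlinked = NULLAGINO;
}

int main(void)
{
	struct ino a = { .agino = 1 }, b = { .agino = 65 };
	int i;

	for (i = 0; i < NBUCKETS; i++)
		bucket_head[i] = NULLAGINO;
	itab[1] = &a;
	itab[65] = &b;

	iunlink_insert(&a);	/* bucket 1: 1 */
	iunlink_insert(&b);	/* bucket 1: 65 -> 1 */
	iunlink_remove(&a);	/* unlink from the tail */
	printf("bucket 1 head: %u\n", (unsigned)bucket_head[1]); /* 65 */
	return 0;
}

Insertion is always at the head, so only the old head's prev pointer
needs updating; removal can happen anywhere in the list, which is
exactly why carrying the prev pointer in memory pays off.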
Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_buf.c | 1 + fs/xfs/xfs_icache.c | 2 + fs/xfs/xfs_inode.c | 672 ++++++++++------------------------ fs/xfs/xfs_inode.h | 5 +- fs/xfs/xfs_log_recover.c | 166 +++++---- fs/xfs/xfs_mount.c | 5 - 6 files changed, 297 insertions(+), 554 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 6b6f67595bf4..74f713e4a6b5 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -225,6 +225,7 @@ xfs_inode_from_disk( to->di_dmevmask = be32_to_cpu(from->di_dmevmask); to->di_dmstate = be16_to_cpu(from->di_dmstate); to->di_flags = be16_to_cpu(from->di_flags); + ip->i_next_unlinked = be32_to_cpu(from->di_next_unlinked); if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { inode_set_iversion_queried(inode, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 1c744dbb313f..dadf417fb9b4 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -71,6 +71,8 @@ xfs_inode_alloc( INIT_WORK(&ip->i_ioend_work, xfs_end_io); INIT_LIST_HEAD(&ip->i_ioend_list); spin_lock_init(&ip->i_ioend_lock); + ip->i_next_unlinked = NULLAGINO; + ip->i_prev_unlinked = NULLAGINO; return ip; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ba86d27b5226..1f1c8819330b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1876,211 +1876,123 @@ xfs_inactive( } /* - * In-Core Unlinked List Lookups - * ============================= - * - * Every inode is supposed to be reachable from some other piece of metadata - * with the exception of the root directory. Inodes with a connection to a - * file descriptor but not linked from anywhere in the on-disk directory tree - * are collectively known as unlinked inodes, though the filesystem itself - * maintains links to these inodes so that on-disk metadata are consistent. - * - * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI - * header contains a number of buckets that point to an inode, and each inode - * record has a pointer to the next inode in the hash chain. This - * singly-linked list causes scaling problems in the iunlink remove function - * because we must walk that list to find the inode that points to the inode - * being removed from the unlinked hash bucket list. - * - * What if we modelled the unlinked list as a collection of records capturing - * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd - * have a fast way to look up unlinked list predecessors, which avoids the - * slow list walk. That's exactly what we do here (in-core) with a per-AG - * rhashtable. - * - * Because this is a backref cache, we ignore operational failures since the - * iunlink code can fall back to the slow bucket walk. The only errors that - * should bubble out are for obviously incorrect situations. - * - * All users of the backref cache MUST hold the AGI buffer lock to serialize - * access or have otherwise provided for concurrency control. - */ - -/* Capture a "X.next_unlinked = Y" relationship. 
*/ -struct xfs_iunlink { - struct rhash_head iu_rhash_head; - xfs_agino_t iu_agino; /* X */ - xfs_agino_t iu_next_unlinked; /* Y */ -}; - -/* Unlinked list predecessor lookup hashtable construction */ -static int -xfs_iunlink_obj_cmpfn( - struct rhashtable_compare_arg *arg, - const void *obj) -{ - const xfs_agino_t *key = arg->key; - const struct xfs_iunlink *iu = obj; - - if (iu->iu_next_unlinked != *key) - return 1; - return 0; -} - -static const struct rhashtable_params xfs_iunlink_hash_params = { - .min_size = XFS_AGI_UNLINKED_BUCKETS, - .key_len = sizeof(xfs_agino_t), - .key_offset = offsetof(struct xfs_iunlink, - iu_next_unlinked), - .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head), - .automatic_shrinking = true, - .obj_cmpfn = xfs_iunlink_obj_cmpfn, -}; - -/* - * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such - * relation is found. + * Find an inode on the unlinked list. This does not take references to the + * inode, we have existence guarantees by holding the AGI buffer lock and + * that only unlinked, referenced inodes can be on the unlinked inode list. + * If we don't find the inode in cache, then let the caller handle the + * situation. */ -static xfs_agino_t -xfs_iunlink_lookup_backref( +static struct xfs_inode * +xfs_iunlink_ilookup( struct xfs_perag *pag, xfs_agino_t agino) { - struct xfs_iunlink *iu; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *ip; - iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, - xfs_iunlink_hash_params); - return iu ? iu->iu_agino : NULLAGINO; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + + /* Inode not in memory, nothing to do */ + if (!ip) { + rcu_read_unlock(); + return NULL; + } + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino) || + __xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* Uh-oh! */ + ip = NULL; + } + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + return ip; } /* - * Take ownership of an iunlink cache entry and insert it into the hash table. - * If successful, the entry will be owned by the cache; if not, it is freed. - * Either way, the caller does not own @iu after this call. + * Return the inode before @ip on the unlinked list without taking a reference + * to it. Caller must hold the AGI buffer locked to guarantee existence of the + * inode. Returns the inode or NULL if at the head of the list. + * If a lookup fails, return corruption error. */ -static int -xfs_iunlink_insert_backref( +static struct xfs_inode * +xfs_iunlink_lookup_prev( struct xfs_perag *pag, - struct xfs_iunlink *iu) + struct xfs_inode *ip) { - int error; - - error = rhashtable_insert_fast(&pag->pagi_unlinked_hash, - &iu->iu_rhash_head, xfs_iunlink_hash_params); - /* - * Fail loudly if there already was an entry because that's a sign of - * corruption of in-memory data. Also fail loudly if we see an error - * code we didn't anticipate from the rhashtable code. Currently we - * only anticipate ENOMEM. - */ - if (error) { - WARN(error != -ENOMEM, "iunlink cache insert error %d", error); - kmem_free(iu); - } - /* - * Absorb any runtime errors that aren't a result of corruption because - * this is a cache and we can always fall back to bucket list scanning. 
- */ - if (error != 0 && error != -EEXIST) - error = 0; - return error; + struct xfs_inode *prev_ip; + + if (ip->i_prev_unlinked == NULLAGINO) + return NULL; + prev_ip = xfs_iunlink_ilookup(pag, ip->i_prev_unlinked); + if (!prev_ip) + return ERR_PTR(-EFSCORRUPTED); + return prev_ip; } -/* Remember that @prev_agino.next_unlinked = @this_agino. */ -static int -xfs_iunlink_add_backref( +static struct xfs_inode * +xfs_iunlink_lookup_next( struct xfs_perag *pag, - xfs_agino_t prev_agino, - xfs_agino_t this_agino) + struct xfs_inode *ip) { - struct xfs_iunlink *iu; - - if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) - return 0; - - iu = kmem_zalloc(sizeof(*iu), KM_NOFS); - iu->iu_agino = prev_agino; - iu->iu_next_unlinked = this_agino; - - return xfs_iunlink_insert_backref(pag, iu); + struct xfs_inode *next_ip; + + if (ip->i_next_unlinked == NULLAGINO) + return NULL; + next_ip = xfs_iunlink_ilookup(pag, ip->i_next_unlinked); + if (!next_ip) + return ERR_PTR(-EFSCORRUPTED); + return next_ip; } -/* - * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. - * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there - * wasn't any such entry then we don't bother. - */ -static int -xfs_iunlink_change_backref( - struct xfs_perag *pag, +/* Set an on-disk inode's next_unlinked pointer. */ +STATIC void +xfs_iunlink_update_dinode( + struct xfs_trans *tp, + xfs_agnumber_t agno, xfs_agino_t agino, - xfs_agino_t next_unlinked) + struct xfs_buf *ibp, + struct xfs_dinode *dip, + struct xfs_imap *imap, + xfs_agino_t next_agino) { - struct xfs_iunlink *iu; - int error; - - /* Look up the old entry; if there wasn't one then exit. */ - iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, - xfs_iunlink_hash_params); - if (!iu) - return 0; - - /* - * Remove the entry. This shouldn't ever return an error, but if we - * couldn't remove the old entry we don't want to add it again to the - * hash table, and if the entry disappeared on us then someone's - * violated the locking rules and we need to fail loudly. Either way - * we cannot remove the inode because internal state is or would have - * been corrupt. - */ - error = rhashtable_remove_fast(&pag->pagi_unlinked_hash, - &iu->iu_rhash_head, xfs_iunlink_hash_params); - if (error) - return error; - - /* If there is no new next entry just free our item and return. */ - if (next_unlinked == NULLAGINO) { - kmem_free(iu); - return 0; - } + struct xfs_mount *mp = tp->t_mountp; + int offset; - /* Update the entry and re-add it to the hash table. */ - iu->iu_next_unlinked = next_unlinked; - return xfs_iunlink_insert_backref(pag, iu); -} + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); -/* Set up the in-core predecessor structures. */ -int -xfs_iunlink_init( - struct xfs_perag *pag) -{ - return rhashtable_init(&pag->pagi_unlinked_hash, - &xfs_iunlink_hash_params); -} + trace_xfs_iunlink_update_dinode(mp, agno, agino, + be32_to_cpu(dip->di_next_unlinked), next_agino); -/* Free the in-core predecessor structures. 
*/ -static void -xfs_iunlink_free_item( - void *ptr, - void *arg) -{ - struct xfs_iunlink *iu = ptr; - bool *freed_anything = arg; + dip->di_next_unlinked = cpu_to_be32(next_agino); + offset = imap->im_boffset + + offsetof(struct xfs_dinode, di_next_unlinked); - *freed_anything = true; - kmem_free(iu); + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); } -void -xfs_iunlink_destroy( - struct xfs_perag *pag) +/* Set an in-core inode's unlinked pointer and return the old value. */ +static int +xfs_iunlink_update_inode( + struct xfs_trans *tp, + struct xfs_inode *ip) { - bool freed_anything = false; + struct xfs_mount *mp = tp->t_mountp; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + struct xfs_dinode *dip; + struct xfs_buf *ibp; + int error; - rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, - xfs_iunlink_free_item, &freed_anything); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0); + if (error) + return error; - ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount)); + xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), + ibp, dip, &ip->i_imap, ip->i_next_unlinked); + return 0; } /* @@ -2122,120 +2034,25 @@ xfs_iunlink_update_bucket( return 0; } -/* Set an on-disk inode's next_unlinked pointer. */ -STATIC void -xfs_iunlink_update_dinode( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agino_t agino, - struct xfs_buf *ibp, - struct xfs_dinode *dip, - struct xfs_imap *imap, - xfs_agino_t next_agino) -{ - struct xfs_mount *mp = tp->t_mountp; - int offset; - - ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); - - trace_xfs_iunlink_update_dinode(mp, agno, agino, - be32_to_cpu(dip->di_next_unlinked), next_agino); - - dip->di_next_unlinked = cpu_to_be32(next_agino); - offset = imap->im_boffset + - offsetof(struct xfs_dinode, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); -} - -/* Set an in-core inode's unlinked pointer and return the old value. */ -STATIC int -xfs_iunlink_update_inode( - struct xfs_trans *tp, - struct xfs_inode *ip, - xfs_agnumber_t agno, - xfs_agino_t next_agino, - xfs_agino_t *old_next_agino) -{ - struct xfs_mount *mp = tp->t_mountp; - struct xfs_dinode *dip; - struct xfs_buf *ibp; - xfs_agino_t old_value; - int error; - - ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); - - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0); - if (error) - return error; - - /* Make sure the old pointer isn't garbage. */ - old_value = be32_to_cpu(dip->di_next_unlinked); - if (!xfs_verify_agino_or_null(mp, agno, old_value)) { - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, - sizeof(*dip), __this_address); - error = -EFSCORRUPTED; - goto out; - } - - /* - * Since we're updating a linked list, we should never find that the - * current pointer is the same as the new value, unless we're - * terminating the list. - */ - *old_next_agino = old_value; - if (old_value == next_agino) { - if (next_agino != NULLAGINO) { - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, - dip, sizeof(*dip), __this_address); - error = -EFSCORRUPTED; - } - goto out; - } - - /* Ok, update the new pointer. 
*/ - xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), - ibp, dip, &ip->i_imap, next_agino); - return 0; -out: - xfs_trans_brelse(tp, ibp); - return error; -} - /* - * This is called when the inode's link count has gone to 0 or we are creating - * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. - * - * We place the on-disk inode on a list in the AGI. It will be pulled from this - * list when the inode is freed. + * Always insert at the head, so we only have to do a next inode lookup to + * update it's prev pointer. The AGI bucket will point at the one we are + * inserting. */ -STATIC int -xfs_iunlink( +static int +xfs_iunlink_insert_inode( struct xfs_trans *tp, + struct xfs_buf *agibp, struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi; - struct xfs_buf *agibp; - xfs_agino_t next_agino; - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + struct xfs_agi *agi = agibp->b_addr; + xfs_agino_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t next_agino; short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; - ASSERT(VFS_I(ip)->i_nlink == 0); - ASSERT(VFS_I(ip)->i_mode != 0); - trace_xfs_iunlink(ip); - - /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(mp, tp, agno, &agibp); - if (error) - return error; - agi = agibp->b_addr; - /* * Get the index into the agi hash table for the list this inode will * go on. Make sure the pointer isn't garbage and that this inode @@ -2248,27 +2065,24 @@ xfs_iunlink( return -EFSCORRUPTED; } - if (next_agino != NULLAGINO) { - struct xfs_perag *pag; - xfs_agino_t old_agino; + ip->i_prev_unlinked = NULLAGINO; + ip->i_next_unlinked = next_agino; + if (ip->i_next_unlinked != NULLAGINO) { + struct xfs_inode *nip; - /* - * There is already another inode in the bucket, so point this - * inode to the current head of the list. - */ - error = xfs_iunlink_update_inode(tp, ip, agno, next_agino, - &old_agino); - if (error) - return error; - ASSERT(old_agino == NULLAGINO); + nip = xfs_iunlink_lookup_next(agibp->b_pag, ip); + if (IS_ERR_OR_NULL(nip)) + return -EFSCORRUPTED; - /* - * agino has been unlinked, add a backref from the next inode - * back to agino. - */ - pag = xfs_perag_get(mp, agno); - error = xfs_iunlink_add_backref(pag, agino, next_agino); - xfs_perag_put(pag); + if (nip->i_prev_unlinked != NULLAGINO) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, + NULL, 0, __this_address); + return -EFSCORRUPTED; + } + nip->i_prev_unlinked = agino; + + /* update the on disk inode now */ + error = xfs_iunlink_update_inode(tp, ip); if (error) return error; } @@ -2277,118 +2091,110 @@ xfs_iunlink( return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino); } -/* Return the imap, dinode pointer, and buffer for an inode. */ -STATIC int -xfs_iunlink_map_ino( +/* + * Remove can be from anywhere in the list, so we have to do two adjacent inode + * lookups here so we can update list pointers. We may be at the head or the + * tail of the list, so we have to handle those cases as well. 
+ */ +static int +xfs_iunlink_remove_inode( struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agino_t agino, - struct xfs_imap *imap, - struct xfs_dinode **dipp, - struct xfs_buf **bpp) + struct xfs_buf *agibp, + struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = agibp->b_addr; + xfs_agino_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t next_agino = ip->i_next_unlinked; + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; - imap->im_blkno = 0; - error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap returned error %d.", - __func__, error); - return error; + if (ip->i_prev_unlinked == NULLAGINO) { + /* remove from head of list */ + if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino) { + xfs_buf_mark_corrupt(agibp); + return -EFSCORRUPTED; + } + if (next_agino == agino || + !xfs_verify_agino_or_null(mp, agno, next_agino)) + return -EFSCORRUPTED; + + error = xfs_iunlink_update_bucket(tp, agno, agibp, + bucket_index, next_agino); + if (error) + return -EFSCORRUPTED; + } else { + /* lookup previous inode and update to point at next */ + struct xfs_inode *pip; + + pip = xfs_iunlink_lookup_prev(agibp->b_pag, ip); + if (IS_ERR_OR_NULL(pip)) + return -EFSCORRUPTED; + + if (pip->i_next_unlinked != agino) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, + NULL, 0, __this_address); + return -EFSCORRUPTED; + } + + /* update the on disk inode now */ + pip->i_next_unlinked = next_agino; + error = xfs_iunlink_update_inode(tp, pip); + if (error) + return error; } - error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; + /* lookup the next inode and update to point at prev */ + if (ip->i_next_unlinked != NULLAGINO) { + struct xfs_inode *nip; + + nip = xfs_iunlink_lookup_next(agibp->b_pag, ip); + if (IS_ERR_OR_NULL(nip)) + return -EFSCORRUPTED; + + if (nip->i_prev_unlinked != agino) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, + NULL, 0, __this_address); + return -EFSCORRUPTED; + } + /* in memory update only */ + nip->i_prev_unlinked = ip->i_prev_unlinked; } - return 0; + /* now clear prev/next from this inode and update on disk */ + ip->i_prev_unlinked = NULLAGINO; + ip->i_next_unlinked = NULLAGINO; + return xfs_iunlink_update_inode(tp, ip); } /* - * Walk the unlinked chain from @head_agino until we find the inode that - * points to @target_agino. Return the inode number, map, dinode pointer, - * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp. - * - * @tp, @pag, @head_agino, and @target_agino are input parameters. - * @agino, @imap, @dipp, and @bpp are all output parameters. + * This is called when the inode's link count has gone to 0 or we are creating + * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. * - * Do not call this function if @target_agino is the head of the list. + * We place the on-disk inode on a list in the AGI. It will be pulled from this + * list when the inode is freed. 
*/ STATIC int -xfs_iunlink_map_prev( +xfs_iunlink( struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agino_t head_agino, - xfs_agino_t target_agino, - xfs_agino_t *agino, - struct xfs_imap *imap, - struct xfs_dinode **dipp, - struct xfs_buf **bpp, - struct xfs_perag *pag) + struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; - xfs_agino_t next_agino; + struct xfs_buf *agibp; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); int error; - ASSERT(head_agino != target_agino); - *bpp = NULL; - - /* See if our backref cache can find it faster. */ - *agino = xfs_iunlink_lookup_backref(pag, target_agino); - if (*agino != NULLAGINO) { - error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp); - if (error) - return error; - - if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino) - return 0; - - /* - * If we get here the cache contents were corrupt, so drop the - * buffer and fall back to walking the bucket list. - */ - xfs_trans_brelse(tp, *bpp); - *bpp = NULL; - WARN_ON_ONCE(1); - } - - trace_xfs_iunlink_map_prev_fallback(mp, agno); - - /* Otherwise, walk the entire bucket until we find it. */ - next_agino = head_agino; - while (next_agino != target_agino) { - xfs_agino_t unlinked_agino; - - if (*bpp) - xfs_trans_brelse(tp, *bpp); - - *agino = next_agino; - error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp, - bpp); - if (error) - return error; + ASSERT(VFS_I(ip)->i_nlink == 0); + ASSERT(VFS_I(ip)->i_mode != 0); + trace_xfs_iunlink(ip); - unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked); - /* - * Make sure this pointer is valid and isn't an obvious - * infinite loop. - */ - if (!xfs_verify_agino(mp, agno, unlinked_agino) || - next_agino == unlinked_agino) { - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, mp, - *dipp, sizeof(**dipp)); - error = -EFSCORRUPTED; - return error; - } - next_agino = unlinked_agino; - } + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(mp, tp, agno, &agibp); + if (error) + return error; - return 0; + return xfs_iunlink_insert_inode(tp, agibp, ip); } /* @@ -2400,16 +2206,8 @@ xfs_iunlink_remove( struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi; struct xfs_buf *agibp; - struct xfs_buf *last_ibp; - struct xfs_dinode *last_dip = NULL; - struct xfs_perag *pag = NULL; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); - xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - xfs_agino_t next_agino; - xfs_agino_t head_agino; - short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; trace_xfs_iunlink_remove(ip); @@ -2418,84 +2216,8 @@ xfs_iunlink_remove( error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = agibp->b_addr; - - /* - * Get the index into the agi hash table for the list this inode will - * go on. Make sure the head pointer isn't garbage. - */ - head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - if (!xfs_verify_agino(mp, agno, head_agino)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - agi, sizeof(*agi)); - return -EFSCORRUPTED; - } - - /* - * Set our inode's next_unlinked pointer to NULL and then return - * the old pointer value so that we can update whatever was previous - * to us in the list to point to whatever was next in the list. - */ - error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino); - if (error) - return error; - - /* - * If there was a backref pointing from the next inode back to this - * one, remove it because we've removed this inode from the list. 
- * - * Later, if this inode was in the middle of the list we'll update - * this inode's backref to point from the next inode. - */ - if (next_agino != NULLAGINO) { - pag = xfs_perag_get(mp, agno); - error = xfs_iunlink_change_backref(pag, next_agino, - NULLAGINO); - if (error) - goto out; - } - - if (head_agino == agino) { - /* Point the head of the list to the next unlinked inode. */ - error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, - next_agino); - if (error) - goto out; - } else { - struct xfs_imap imap; - xfs_agino_t prev_agino; - - if (!pag) - pag = xfs_perag_get(mp, agno); - - /* We need to search the list for the inode being freed. */ - error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, - &prev_agino, &imap, &last_dip, &last_ibp, - pag); - if (error) - goto out; - /* Point the previous inode on the list to the next inode. */ - xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, - last_dip, &imap, next_agino); - - /* - * Now we deal with the backref for this inode. If this inode - * pointed at a real inode, change the backref that pointed to - * us to point to our old next. If this inode was the end of - * the list, delete the backref that pointed to us. Note that - * change_backref takes care of deleting the backref if - * next_agino is NULLAGINO. - */ - error = xfs_iunlink_change_backref(pag, agino, next_agino); - if (error) - goto out; - } - -out: - if (pag) - xfs_perag_put(pag); - return error; + return xfs_iunlink_remove_inode(tp, agibp, ip); } /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 991ef00370d5..c3fb62fbdeb1 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -56,6 +56,8 @@ typedef struct xfs_inode { uint64_t i_delayed_blks; /* count of delay alloc blks */ struct xfs_icdinode i_d; /* most of ondisk inode */ + xfs_agino_t i_prev_unlinked; + xfs_agino_t i_next_unlinked; /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ @@ -463,9 +465,6 @@ extern struct kmem_zone *xfs_inode_zone; /* The default CoW extent size hint. */ #define XFS_DEFAULT_COWEXTSZ_HINT 32 -int xfs_iunlink_init(struct xfs_perag *pag); -void xfs_iunlink_destroy(struct xfs_perag *pag); - void xfs_end_io(struct work_struct *work); #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 52a65a74208f..d47eea31c165 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2682,15 +2682,13 @@ xlog_recover_clear_agi_bucket( return; } -STATIC xfs_agino_t -xlog_recover_process_one_iunlink( +static struct xfs_inode * +xlog_recover_get_one_iunlink( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, int bucket) { - struct xfs_buf *ibp; - struct xfs_dinode *dip; struct xfs_inode *ip; xfs_ino_t ino; int error; @@ -2698,45 +2696,20 @@ xlog_recover_process_one_iunlink( ino = XFS_AGINO_TO_INO(mp, agno, agino); error = xfs_iget(mp, NULL, ino, 0, 0, &ip); if (error) - goto fail; - - /* - * Get the on disk inode to find the next inode in the bucket. - */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0); - if (error) - goto fail_iput; + return NULL; xfs_iflags_clear(ip, XFS_IRECOVERY); ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); - /* setup for the next pass */ - agino = be32_to_cpu(dip->di_next_unlinked); - xfs_buf_relse(ibp); - /* * Prevent any DMAPI event from being sent when the reference on * the inode is dropped. 
*/ ip->i_d.di_dmevmask = 0; - xfs_irele(ip); - return agino; + return ip; - fail_iput: - xfs_irele(ip); - fail: - /* - * We can't read in the inode this bucket points to, or this inode - * is messed up. Just ditch this bucket of inodes. We will lose - * some inodes and space, but at least we won't hang. - * - * Call xlog_recover_clear_agi_bucket() to perform a transaction to - * clear the inode pointer in the bucket. - */ - xlog_recover_clear_agi_bucket(mp, agno, bucket); - return NULLAGINO; } /* @@ -2762,56 +2735,107 @@ xlog_recover_process_one_iunlink( * scheduled on this CPU to ensure other scheduled work can run without undue * latency. */ -STATIC void -xlog_recover_process_iunlinks( - struct xlog *log) +static int +xlog_recover_iunlink_ag( + struct xfs_mount *mp, + xfs_agnumber_t agno) { - xfs_mount_t *mp; - xfs_agnumber_t agno; - xfs_agi_t *agi; - xfs_buf_t *agibp; - xfs_agino_t agino; - int bucket; - int error; + struct xfs_agi *agi; + struct xfs_buf *agibp; + int bucket; + int error; - mp = log->l_mp; + /* + * Find the agi for this ag. + */ + error = xfs_read_agi(mp, NULL, agno, &agibp); + if (error) { - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { /* - * Find the agi for this ag. + * AGI is b0rked. Don't process it. + * + * We should probably mark the filesystem as corrupt after we've + * recovered all the ag's we can.... */ - error = xfs_read_agi(mp, NULL, agno, &agibp); + return 0; + } + + /* + * Unlock the buffer so that it can be acquired in the normal course of + * the transaction to truncate and free each inode. Because we are not + * racing with anyone else here for the AGI buffer, we don't even need + * to hold it locked to read the initial unlinked bucket entries out of + * the buffer. We keep buffer reference though, so that it stays pinned + * in memory while we need the buffer. + */ + agi = agibp->b_addr; + xfs_buf_unlock(agibp); + + /* + * The unlinked inode list is maintained on incore inodes as a double + * linked list. We don't have any of that state in memory, so we have to + * create it as we go. This is simple as we are only removing from the + * head of the list and that means we only need to pull the current + * inode in and the next inode. Inodes are unlinked when their + * reference count goes to zero, so we can overlap the xfs_iget() and + * xfs_irele() calls so we always have the first two inodes on the list + * in memory. Hence we can fake up the necessary in memory state for the + * unlink to "just work". + */ + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { + struct xfs_inode *ip, *prev_ip = NULL; + xfs_agino_t agino, prev_agino = NULLAGINO; + + agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (agino != NULLAGINO) { + ip = xlog_recover_get_one_iunlink(mp, agno, agino, + bucket); + if (!ip) { + /* + * something busted, but still got to release + * prev_ip, so make it look like it's at the end + * of the list before it gets released. + */ + error = -EFSCORRUPTED; + if (prev_ip) + prev_ip->i_next_unlinked = NULLAGINO; + break; + } + if (prev_ip) { + ip->i_prev_unlinked = prev_agino; + xfs_irele(prev_ip); + } + prev_agino = agino; + prev_ip = ip; + agino = ip->i_next_unlinked; + cond_resched(); + } + if (prev_ip) + xfs_irele(prev_ip); if (error) { /* - * AGI is b0rked. Don't process it. - * - * We should probably mark the filesystem as corrupt - * after we've recovered all the ag's we can.... + * We can't read an inode this bucket points to, or an + * inode is messed up. Just ditch this bucket of + * inodes. 
We will lose some inodes and space, but at + * least we won't hang. */ - continue; - } - /* - * Unlock the buffer so that it can be acquired in the normal - * course of the transaction to truncate and free each inode. - * Because we are not racing with anyone else here for the AGI - * buffer, we don't even need to hold it locked to read the - * initial unlinked bucket entries out of the buffer. We keep - * buffer reference though, so that it stays pinned in memory - * while we need the buffer. - */ - agi = agibp->b_addr; - xfs_buf_unlock(agibp); - - for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { - agino = be32_to_cpu(agi->agi_unlinked[bucket]); - while (agino != NULLAGINO) { - agino = xlog_recover_process_one_iunlink(mp, - agno, agino, bucket); - cond_resched(); - } + xlog_recover_clear_agi_bucket(mp, agno, bucket); + break; } - xfs_buf_rele(agibp); } + xfs_buf_rele(agibp); + return error; +} + +STATIC void +xlog_recover_process_iunlinks( + struct xlog *log) +{ + struct xfs_mount *mp = log->l_mp; + xfs_agnumber_t agno; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) + xlog_recover_iunlink_ag(mp, agno); } STATIC void diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c8ae49a1e99c..031e96ff022d 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -146,7 +146,6 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); - xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); } @@ -223,9 +222,6 @@ xfs_initialize_perag( /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; - error = xfs_iunlink_init(pag); - if (error) - goto out_hash_destroy; spin_lock_init(&pag->pag_state_lock); } @@ -248,7 +244,6 @@ xfs_initialize_perag( if (!pag) break; xfs_buf_hash_destroy(pag); - xfs_iunlink_destroy(pag); kmem_free(pag); } return error; From patchwork Tue Jun 23 09:50:15 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Dave Chinner X-Patchwork-Id: 11620203 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id F31FB1392 for ; Tue, 23 Jun 2020 09:50:20 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id D0E98206C0 for ; Tue, 23 Jun 2020 09:50:20 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1732167AbgFWJuU (ORCPT ); Tue, 23 Jun 2020 05:50:20 -0400 Received: from mail104.syd.optusnet.com.au ([211.29.132.246]:60082 "EHLO mail104.syd.optusnet.com.au" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1731996AbgFWJuU (ORCPT ); Tue, 23 Jun 2020 05:50:20 -0400 Received: from dread.disaster.area (pa49-180-124-177.pa.nsw.optusnet.com.au [49.180.124.177]) by mail104.syd.optusnet.com.au (Postfix) with ESMTPS id D6550821849 for ; Tue, 23 Jun 2020 19:50:16 +1000 (AEST) Received: from discord.disaster.area ([192.168.253.110]) by dread.disaster.area with esmtp (Exim 4.92.3) (envelope-from ) id 1jnfZI-0004gs-2F for linux-xfs@vger.kernel.org; Tue, 23 Jun 2020 19:50:16 +1000 Received: from dave by discord.disaster.area with local (Exim 4.93) (envelope-from ) id 1jnfZH-0087BK-Py for linux-xfs@vger.kernel.org; Tue, 23 Jun 2020 19:50:15 +1000 From: Dave Chinner To: linux-xfs@vger.kernel.org Subject: [PATCH 4/4] xfs: introduce inode unlink log item Date: Tue, 
Message-Id: <20200623095015.1934171-5-david@fromorbit.com> In-Reply-To: <20200623095015.1934171-1-david@fromorbit.com> References: <20200623095015.1934171-1-david@fromorbit.com> From: Dave Chinner

Tracking dirty inodes via cluster buffers creates lock ordering issues with logging unlinked inode updates directly to the cluster buffer. The unlinked inode list is unordered, so we can end up locking cluster buffers in random order, and that causes deadlocks. To solve this problem, we really want to delay locking the cluster buffers until the pre-commit phase, where we can order the buffers correctly along with all the other inode cluster buffers that are locked by the transaction. However, to do this we need to be able to tell the transaction which inodes need their unlinked list updated and what the new value should be.

We can delay the buffer update to the pre-commit phase because all unlinked inode list updates are serialised by the AGI buffer. It will be locked into the transaction before the list update starts, and will remain locked until the transaction commits. Hence we can lock and update the cluster buffers safely at any time during the transaction and still be safe from other racing unlinked list updates.

The iunlink log item currently only exists in memory. We need a log item to attach information to the transaction, but its context is completely owned by the transaction. Hence it is never formatted or inserted into the CIL, nor is it seen by the journal, the AIL or log recovery. This makes it a very simple log item, and the net result of processing it is the addition of buffer log items to the transaction. Hence once the iunlink log item has run its pre-commit operation, it can be dropped by the transaction and released.

The creation of this in-memory intent does not prevent us from extending it to the journal in future to replace buffer-based logging of the unlinked list. Changing the format of the items we write to the on-disk journal is beyond the scope of this patchset, hence we limit it to being in-memory only.
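To illustrate the ordering mechanism described above, here is a minimal userspace sketch. This is not XFS code and every name in it is invented: it only shows intents being collected in arbitrary (list-insertion) order and then applied in sorted order at pre-commit, which is what makes the buffer locking order deterministic:

	#include <stdio.h>
	#include <stdlib.h>

	/* invented stand-in for the iunlink intent: which inode, what to write */
	struct intent {
		unsigned long	ino;		/* sort key, like ->iop_sort() */
		unsigned int	next_unlinked;	/* value applied at pre-commit */
	};

	static int intent_cmp(const void *a, const void *b)
	{
		const struct intent *ia = a, *ib = b;

		return (ia->ino > ib->ino) - (ia->ino < ib->ino);
	}

	int main(void)
	{
		/* intents accumulate in whatever order the unlinked list gave us */
		struct intent pending[] = {
			{ 517, 12 },
			{ 129, 34 },
			{ 260, 56 },
		};
		size_t i, n = sizeof(pending) / sizeof(pending[0]);

		/* pre-commit: sort, then "lock and log" each buffer in order */
		qsort(pending, n, sizeof(pending[0]), intent_cmp);
		for (i = 0; i < n; i++)
			printf("precommit: ino %lu next_unlinked -> %u\n",
			       pending[i].ino, pending[i].next_unlinked);
		return 0;
	}

With real cluster buffers, applying the intents in sorted order means two racing transactions always take the buffer locks in the same sequence, so the ABBA deadlock described above cannot occur.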
Signed-off-by: Dave Chinner --- fs/xfs/Makefile | 1 + fs/xfs/xfs_inode.c | 70 +++---------------- fs/xfs/xfs_inode_item.c | 3 +- fs/xfs/xfs_iunlink_item.c | 141 ++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_iunlink_item.h | 24 +++++++ fs/xfs/xfs_super.c | 10 +++ 6 files changed, 189 insertions(+), 60 deletions(-) create mode 100644 fs/xfs/xfs_iunlink_item.c create mode 100644 fs/xfs/xfs_iunlink_item.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 04611a1068b4..febdf034ca94 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -105,6 +105,7 @@ xfs-y += xfs_log.o \ xfs_icreate_item.o \ xfs_inode_item.o \ xfs_inode_item_recover.o \ + xfs_iunlink_item.o \ xfs_refcount_item.o \ xfs_rmap_item.o \ xfs_log_recover.o \ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1f1c8819330b..ab288424764c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -35,6 +35,7 @@ #include "xfs_log.h" #include "xfs_bmap_btree.h" #include "xfs_reflink.h" +#include "xfs_iunlink_item.h" kmem_zone_t *xfs_inode_zone; @@ -1945,56 +1946,6 @@ xfs_iunlink_lookup_next( return next_ip; } -/* Set an on-disk inode's next_unlinked pointer. */ -STATIC void -xfs_iunlink_update_dinode( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agino_t agino, - struct xfs_buf *ibp, - struct xfs_dinode *dip, - struct xfs_imap *imap, - xfs_agino_t next_agino) -{ - struct xfs_mount *mp = tp->t_mountp; - int offset; - - ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); - - trace_xfs_iunlink_update_dinode(mp, agno, agino, - be32_to_cpu(dip->di_next_unlinked), next_agino); - - dip->di_next_unlinked = cpu_to_be32(next_agino); - offset = imap->im_boffset + - offsetof(struct xfs_dinode, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); -} - -/* Set an in-core inode's unlinked pointer and return the old value. */ -static int -xfs_iunlink_update_inode( - struct xfs_trans *tp, - struct xfs_inode *ip) -{ - struct xfs_mount *mp = tp->t_mountp; - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); - struct xfs_dinode *dip; - struct xfs_buf *ibp; - int error; - - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0); - if (error) - return error; - - xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), - ibp, dip, &ip->i_imap, ip->i_next_unlinked); - return 0; -} - /* * Point the AGI unlinked bucket at an inode and log the results. The caller * is responsible for validating the old value. @@ -2051,7 +2002,6 @@ xfs_iunlink_insert_inode( xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t next_agino; short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - int error; /* * Get the index into the agi hash table for the list this inode will @@ -2082,9 +2032,7 @@ xfs_iunlink_insert_inode( nip->i_prev_unlinked = agino; /* update the on disk inode now */ - error = xfs_iunlink_update_inode(tp, ip); - if (error) - return error; + xfs_iunlink_log(tp, ip); } /* Point the head of the list to point to this inode. 
*/ @@ -2140,9 +2088,7 @@ xfs_iunlink_remove_inode( /* update the on disk inode now */ pip->i_next_unlinked = next_agino; - error = xfs_iunlink_update_inode(tp, pip); - if (error) - return error; + xfs_iunlink_log(tp, pip); } /* lookup the next inode and update to point at prev */ @@ -2162,10 +2108,15 @@ xfs_iunlink_remove_inode( nip->i_prev_unlinked = ip->i_prev_unlinked; } - /* now clear prev/next from this inode and update on disk */ + /* + * Now clear prev/next from this inode and update on disk if we + * need to clear the on-disk link. + */ ip->i_prev_unlinked = NULLAGINO; ip->i_next_unlinked = NULLAGINO; - return xfs_iunlink_update_inode(tp, ip); + if (next_agino != NULLAGINO) + xfs_iunlink_log(tp, ip); + return 0; } /* @@ -2185,6 +2136,7 @@ xfs_iunlink( xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); int error; + ASSERT(ip->i_next_unlinked == NULLAGINO); ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); trace_xfs_iunlink(ip); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 0494b907c63d..bc1970c37edc 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -488,8 +488,9 @@ xfs_inode_item_push( ASSERT(iip->ili_item.li_buf); if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || - (ip->i_flags & XFS_ISTALE)) + (ip->i_flags & XFS_ISTALE)) { return XFS_ITEM_PINNED; + } /* If the inode is already flush locked, we're already flushing. */ if (xfs_iflags_test(ip, XFS_IFLUSHING)) diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c new file mode 100644 index 000000000000..83f1dc81133b --- /dev/null +++ b/fs/xfs/xfs_iunlink_item.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020, Red Hat, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_iunlink_item.h" +#include "xfs_trace.h" + +kmem_zone_t *xfs_iunlink_zone; + +static inline struct xfs_iunlink_item *IUL_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_iunlink_item, iu_item); +} + +static void +xfs_iunlink_item_release( + struct xfs_log_item *lip) +{ + kmem_cache_free(xfs_iunlink_zone, IUL_ITEM(lip)); +} + + +static uint64_t +xfs_iunlink_item_sort( + struct xfs_log_item *lip) +{ + return IUL_ITEM(lip)->iu_ino; +} + +/* + * On precommit, we grab the inode cluster buffer for the inode number + * we were passed, then update the next unlinked field for that inode in + * the buffer and log the buffer. This ensures that the inode cluster buffer + * was logged in the correct order w.r.t. other inode cluster buffers. + * + * Note: if the inode cluster buffer is marked stale, this transaction is + * actually freeing the inode cluster. In that case, do not relog the buffer + * as this removes the stale state from it. That then causes the post-commit + * processing that is dependent on the cluster buffer being stale to go wrong + * and we'll leave stale inodes in the AIL that cannot be removed, hanging the + * log. 
+ */ +static int +xfs_iunlink_item_precommit( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_iunlink_item *iup = IUL_ITEM(lip); + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, iup->iu_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, iup->iu_ino); + struct xfs_dinode *dip; + struct xfs_buf *bp; + int offset; + int error; + + error = xfs_imap_to_bp(mp, tp, &iup->iu_imap, &dip, &bp, 0); + if (error) + goto out_remove; + + trace_xfs_iunlink_update_dinode(mp, agno, agino, + be32_to_cpu(dip->di_next_unlinked), + iup->iu_next_unlinked); + + /* + * Don't bother updating the unlinked field on stale buffers as + * it will never get to disk anyway. + */ + if (bp->b_flags & XBF_STALE) + goto out_remove; + + dip->di_next_unlinked = cpu_to_be32(iup->iu_next_unlinked); + offset = iup->iu_imap.im_boffset + + offsetof(struct xfs_dinode, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, bp); + xfs_trans_log_buf(tp, bp, offset, offset + sizeof(xfs_agino_t) - 1); + +out_remove: + /* + * This log item only exists to perform this action. We now remove + * it from the transaction and free it as it should never reach the + * CIL. + */ + list_del(&lip->li_trans); + xfs_iunlink_item_release(lip); + return error; +} + +static const struct xfs_item_ops xfs_iunlink_item_ops = { + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .iop_release = xfs_iunlink_item_release, + .iop_sort = xfs_iunlink_item_sort, + .iop_precommit = xfs_iunlink_item_precommit, +}; + + +/* + * Record the inode's current unlinked list pointer in a new iunlink log + * item and attach it to the transaction. + * + * This joins the item to the transaction and marks it dirty so + * that we don't need a separate call to do this, nor does the + * caller need to know anything about the iunlink item. + */ +void +xfs_iunlink_log( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct xfs_iunlink_item *iup; + + iup = kmem_zone_zalloc(xfs_iunlink_zone, 0); + + xfs_log_item_init(tp->t_mountp, &iup->iu_item, XFS_LI_IUNLINK, + &xfs_iunlink_item_ops); + + iup->iu_ino = ip->i_ino; + iup->iu_next_unlinked = ip->i_next_unlinked; + iup->iu_imap = ip->i_imap; + + xfs_trans_add_item(tp, &iup->iu_item); + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &iup->iu_item.li_flags); +} diff --git a/fs/xfs/xfs_iunlink_item.h b/fs/xfs/xfs_iunlink_item.h new file mode 100644 index 000000000000..c9e58acf4ccf --- /dev/null +++ b/fs/xfs/xfs_iunlink_item.h @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020, Red Hat, Inc. + * All Rights Reserved.
+ */ +#ifndef XFS_IUNLINK_ITEM_H +#define XFS_IUNLINK_ITEM_H 1 + +struct xfs_trans; +struct xfs_inode; + +/* in memory log item structure */ +struct xfs_iunlink_item { + struct xfs_log_item iu_item; + struct xfs_imap iu_imap; + xfs_ino_t iu_ino; + xfs_agino_t iu_next_unlinked; +}; + +extern kmem_zone_t *xfs_iunlink_zone; + +void xfs_iunlink_log(struct xfs_trans *tp, struct xfs_inode *ip); + +#endif /* XFS_IUNLINK_ITEM_H */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 5a5d9453cf51..a36dfb0e7e5b 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -35,6 +35,7 @@ #include "xfs_refcount_item.h" #include "xfs_bmap_item.h" #include "xfs_reflink.h" +#include "xfs_iunlink_item.h" #include #include @@ -1955,8 +1956,16 @@ xfs_init_zones(void) if (!xfs_bui_zone) goto out_destroy_bud_zone; + xfs_iunlink_zone = kmem_cache_create("xfs_iul_item", + sizeof(struct xfs_iunlink_item), + 0, 0, NULL); + if (!xfs_iunlink_zone) + goto out_destroy_bui_zone; + return 0; + out_destroy_bui_zone: + kmem_cache_destroy(xfs_bui_zone); out_destroy_bud_zone: kmem_cache_destroy(xfs_bud_zone); out_destroy_cui_zone: @@ -2003,6 +2012,7 @@ xfs_destroy_zones(void) * destroy caches. */ rcu_barrier(); + kmem_cache_destroy(xfs_iunlink_zone); kmem_cache_destroy(xfs_bui_zone); kmem_cache_destroy(xfs_bud_zone); kmem_cache_destroy(xfs_cui_zone);
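The xfs_init_zones() hunk above extends the usual goto-based unwind chain: each newly created cache gains a matching destroy label, so a later failure tears down everything built so far in reverse creation order. A minimal standalone sketch of that idiom follows; it is not XFS code and all names in it are invented:

	#include <stdio.h>
	#include <stdlib.h>

	static void *zone_a, *zone_b, *zone_c;

	/* invented stand-in for kmem_cache_create() */
	static void *create_cache(const char *name)
	{
		printf("create %s\n", name);
		return malloc(64);
	}

	static int init_zones(void)
	{
		zone_a = create_cache("zone_a");
		if (!zone_a)
			goto out;
		zone_b = create_cache("zone_b");
		if (!zone_b)
			goto out_destroy_a;
		zone_c = create_cache("zone_c");
		if (!zone_c)
			goto out_destroy_b;
		return 0;		/* success: caches stay alive */

	out_destroy_b:			/* unwind runs in reverse creation order */
		free(zone_b);
	out_destroy_a:
		free(zone_a);
	out:
		return -1;
	}

	int main(void)
	{
		return init_zones() ? 1 : 0;
	}

The new xfs_iunlink_zone simply slots into the top of this chain: its failure path destroys the previously created xfs_bui_zone, and the teardown side gains the matching kmem_cache_destroy() call.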