[8/8] xfs: cache unlinked pointers in an rhashtable

Message ID 154897672012.26065.1375987197453969157.stgit@magnolia (mailing list archive)
State Superseded
Series xfs: incore unlinked list

Commit Message

Darrick J. Wong Jan. 31, 2019, 11:18 p.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Use a rhashtable to cache the unlinked list incore.  This should speed
up unlinked processing considerably when there are a lot of inodes on
the unlinked list because iunlink_remove no longer has to traverse an
entire bucket list to find which inode points to the one being removed.

The incore list structure records "X.next_unlinked = Y" relations, with
the rhashtable using Y to index the records.  This makes finding the
inode X that points to an inode Y very quick.  If our cache fails to find
anything we can always fall back on the old method.
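
To illustrate, here is a toy userspace model of the idea (simplified,
made-up names; the real patch uses a per-AG rhashtable for this, as
below, and falls back to the on-disk bucket walk on a cache miss):

/*
 * Toy model of the backref cache: each record remembers one
 * "prev.next_unlinked = next" relation and is hashed on "next", so
 * the predecessor of any unlinked inode is a single lookup instead
 * of a walk from the AGI bucket head.
 */
#include <stdio.h>
#include <stdlib.h>

typedef unsigned int agino_t;
#define NULLAGINO	((agino_t)-1)
#define NBUCKETS	64

struct backref {
	struct backref	*hash_next;	/* hash chain */
	agino_t		prev;		/* X in "X.next_unlinked = Y" */
	agino_t		next;		/* Y, the lookup key */
};

static struct backref *buckets[NBUCKETS];

static unsigned int hash_agino(agino_t agino)
{
	return agino % NBUCKETS;
}

/* Remember that prev.next_unlinked = next. */
static void add_backref(agino_t prev, agino_t next)
{
	struct backref *b = malloc(sizeof(*b));

	b->prev = prev;
	b->next = next;
	b->hash_next = buckets[hash_agino(next)];
	buckets[hash_agino(next)] = b;
}

/* Return X where X.next_unlinked == next, or NULLAGINO if not cached. */
static agino_t lookup_backref(agino_t next)
{
	struct backref *b;

	for (b = buckets[hash_agino(next)]; b != NULL; b = b->hash_next)
		if (b->next == next)
			return b->prev;
	return NULLAGINO;
}

int main(void)
{
	/* Unlinked chain in one AGI bucket: 12 -> 34 -> 56. */
	add_backref(12, 34);
	add_backref(34, 56);

	/* Removing inode 56 needs its predecessor: one lookup, no walk. */
	printf("prev of 56 is %u\n", lookup_backref(56));	/* 34 */
	printf("prev of 12 is %u\n", lookup_backref(12));	/* NULLAGINO */
	return 0;
}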

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c       |  216 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_inode.h       |   10 ++
 fs/xfs/xfs_log_recover.c |   12 ++-
 fs/xfs/xfs_mount.c       |    7 +
 fs/xfs/xfs_mount.h       |    1 
 5 files changed, 245 insertions(+), 1 deletion(-)

Comments

Christoph Hellwig Feb. 1, 2019, 8:03 a.m. UTC | #1
On Thu, Jan 31, 2019 at 03:18:40PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Use a rhashtable to cache the unlinked list incore.  This should speed
> up unlinked processing considerably when there are a lot of inodes on
> the unlinked list because iunlink_remove no longer has to traverse an
> entire bucket list to find which inode points to the one being removed.

This sounds pretty reasonable and a real review will follow.  But can
you quantify the considerable speedups for real-life workloads?
Brian Foster Feb. 1, 2019, 7:29 p.m. UTC | #2
On Thu, Jan 31, 2019 at 03:18:40PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Use a rhashtable to cache the unlinked list incore.  This should speed
> up unlinked processing considerably when there are a lot of inodes on
> the unlinked list because iunlink_remove no longer has to traverse an
> entire bucket list to find which inode points to the one being removed.
> 
> The incore list structure records "X.next_unlinked = Y" relations, with
> the rhashtable using Y to index the records.  This makes finding the
> inode X that points to an inode Y very quick.  If our cache fails to find
> anything we can always fall back on the old method.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

I still need to take a closer look at this one, but I want to make sure
I grok what's happening from the unlinked list perspective...

>  fs/xfs/xfs_inode.c       |  216 ++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_inode.h       |   10 ++
>  fs/xfs/xfs_log_recover.c |   12 ++-
>  fs/xfs/xfs_mount.c       |    7 +
>  fs/xfs/xfs_mount.h       |    1 
>  5 files changed, 245 insertions(+), 1 deletion(-)
> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 56349497d75b..eec3c6109fc6 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
...
> @@ -2072,6 +2255,9 @@ xfs_iunlink(
>  		if (error)
>  			goto out_unlock;
>  		ASSERT(old_agino == NULLAGINO);
> +		error = xfs_iunlink_add_backref(pag, agino, next_agino);
> +		if (error)
> +			goto out_unlock;

agino has been unlinked; add a backref from the next inode (i.e., the
one we point to) back to agino.

>  	}
>  
>  	/* Point the head of the list to point to this inode. */
> @@ -2144,6 +2330,17 @@ xfs_iunlink_map_prev(
>  
>  	ASSERT(head_agino != target_agino);
>  
> +	/* See if our backref cache can find it faster. */
> +	error = xfs_iunlink_lookup_backref(pag, target_agino, &next_agino);
> +	if (error == 0) {
> +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> +		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
> +				&last_dip, &last_ibp);
> +		if (error)
> +			return error;
> +		goto out;
> +	}
> +
>  	next_agino = head_agino;
>  	while (next_agino != target_agino) {
>  		xfs_agino_t	unlinked_agino;
> @@ -2169,6 +2366,7 @@ xfs_iunlink_map_prev(
>  		next_agino = unlinked_agino;
>  	}
>  
> +out:
>  	/* Should never happen, but don't return garbage. */
>  	if (next_ino == NULLFSINO)
>  		return -EFSCORRUPTED;
> @@ -2243,6 +2441,12 @@ xfs_iunlink_remove(
>  		if (error)
>  			goto out_unlock;
>  
> +		if (next_agino != NULLAGINO) {
> +			error = xfs_iunlink_forget_backref(pag, next_agino);
> +			if (error)
> +				goto out_unlock;
> +		}

Removed from the head. We thus had no inode pointing to us, but the next
inode had a backref recording that we pointed to it. Drop that since
we're gone and that inode is now the head.

> +
>  		/* Point the head of the list to the next unlinked inode. */
>  		error = xfs_iunlink_update_bucket(tp, agibp, bucket_index,
>  				next_agino);
> @@ -2265,10 +2469,22 @@ xfs_iunlink_remove(
>  				&next_agino);
>  		if (error)
>  			goto out_unlock;
> +		if (next_agino != NULLAGINO) {
> +			error = xfs_iunlink_forget_backref(pag, next_agino);
> +			if (error)
> +				goto out_unlock;
> +		}

Removed from the middle. next_agino had a backref to us, so we drop that
since we're going away.

>  
>  		/* Point the previous inode on the list to the next inode. */
>  		xfs_iunlink_update_dinode(tp, last_ibp, last_dip, &imap,
>  				next_ino, next_agino);
> +		if (next_agino == NULLAGINO)
> +			error = xfs_iunlink_forget_backref(pag, agino);
> +		else
> +			error = xfs_iunlink_change_backref(pag, agino,
> +					next_agino);

Now we deal with the backref for us (i.e., to whatever pointed at us and
now points at something else). If it now points at a real inode, change
the ref that pointed to us to point to our old next. If it doesn't,
delete the ref that pointed to us. Am I following that correctly?

Hmm, I realize this may be what's happening behind these wrappers, but
I'm wondering if the logic at this level would be easier to grok if we
just dropped the backref that points to our ip (IIUC we should always
have one here) and then, if we point to something, dropped the backref
for that and added a new one for the prev->next link we've just created.

But I still need to take a closer look at all of this... it might be
more clear with some comments.

Brian

> +		if (error)
> +			goto out_unlock;
>  	}
>  	pag->pagi_unlinked_count--;
>  
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index be2014520155..9690f0d32e33 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -500,4 +500,14 @@ extern struct kmem_zone	*xfs_inode_zone;
>  
>  bool xfs_inode_verify_forks(struct xfs_inode *ip);
>  
> +int xfs_iunlink_init(struct xfs_perag *pag);
> +void xfs_iunlink_destroy(struct xfs_perag *pag);
> +int xfs_iunlink_lookup_backref(struct xfs_perag *pag, xfs_agino_t agino,
> +		xfs_agino_t *prev_agino);
> +int xfs_iunlink_add_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> +		xfs_agino_t this_agino);
> +int xfs_iunlink_forget_backref(struct xfs_perag *pag, xfs_agino_t agino);
> +int xfs_iunlink_change_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> +		xfs_agino_t this_agino);
> +
>  #endif	/* __XFS_INODE_H__ */
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index c920b8aeba01..909b70550aa8 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -5078,12 +5078,22 @@ xlog_recover_process_one_iunlink(
>  	agino = be32_to_cpu(dip->di_next_unlinked);
>  	xfs_buf_relse(ibp);
>  
> -	/* Make sure the in-core data knows about this unlinked inode. */
> +	/*
> +	 * Make sure the in-core data knows about this unlinked inode.  Since
> +	 * our iunlinks recovery basically just deletes the head of a bucket
> +	 * list until the bucket is empty, we need only to add the backref from
> +	 * the current list item to the next one, if this isn't the list tail.
> +	 */
>  	pag = xfs_perag_get(mp, agno);
>  	mutex_lock(&pag->pagi_unlinked_lock);
>  	pag->pagi_unlinked_count++;
> +	if (agino != NULLAGINO)
> +		error = xfs_iunlink_add_backref(pag, XFS_INO_TO_AGINO(mp, ino),
> +				agino);
>  	mutex_unlock(&pag->pagi_unlinked_lock);
>  	xfs_perag_put(pag);
> +	if (error)
> +		goto fail_iput;
>  
>  	/*
>  	 * Prevent any DMAPI event from being sent when the reference on
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 6bfc985669e0..4eb97ddc915e 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -151,6 +151,7 @@ xfs_free_perag(
>  		ASSERT(atomic_read(&pag->pag_ref) == 0);
>  		ASSERT(pag->pagi_unlinked_count == 0 ||
>  		       XFS_FORCED_SHUTDOWN(mp));
> +		xfs_iunlink_destroy(pag);
>  		mutex_destroy(&pag->pagi_unlinked_lock);
>  		xfs_buf_hash_destroy(pag);
>  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> @@ -231,6 +232,9 @@ xfs_initialize_perag(
>  		if (first_initialised == NULLAGNUMBER)
>  			first_initialised = index;
>  		mutex_init(&pag->pagi_unlinked_lock);
> +		error = xfs_iunlink_init(pag);
> +		if (error)
> +			goto out_iunlink_mutex;
>  	}
>  
>  	index = xfs_set_inode_alloc(mp, agcount);
> @@ -241,6 +245,8 @@ xfs_initialize_perag(
>  	mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
>  	return 0;
>  
> +out_iunlink_mutex:
> +	mutex_destroy(&pag->pagi_unlinked_lock);
>  out_hash_destroy:
>  	xfs_buf_hash_destroy(pag);
>  out_free_pag:
> @@ -253,6 +259,7 @@ xfs_initialize_perag(
>  		if (!pag)
>  			break;
>  		xfs_buf_hash_destroy(pag);
> +		xfs_iunlink_destroy(pag);
>  		mutex_destroy(&pag->pagi_unlinked_lock);
>  		mutex_destroy(&pag->pag_ici_reclaim_lock);
>  		kmem_free(pag);
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 0fcc6b6a4f67..27a433e93d32 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -392,6 +392,7 @@ typedef struct xfs_perag {
>  	/* unlinked inodes */
>  	struct mutex		pagi_unlinked_lock;
>  	uint32_t		pagi_unlinked_count;
> +	struct rhashtable	pagi_unlinked_hash;
>  } xfs_perag_t;
>  
>  static inline struct xfs_ag_resv *
>
Darrick J. Wong Feb. 1, 2019, 7:40 p.m. UTC | #3
On Fri, Feb 01, 2019 at 02:29:02PM -0500, Brian Foster wrote:
> On Thu, Jan 31, 2019 at 03:18:40PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Use a rhashtable to cache the unlinked list incore.  This should speed
> > up unlinked processing considerably when there are a lot of inodes on
> > the unlinked list because iunlink_remove no longer has to traverse an
> > entire bucket list to find which inode points to the one being removed.
> > 
> > The incore list structure records "X.next_unlinked = Y" relations, with
> > the rhashtable using Y to index the records.  This makes finding the
> > inode X that points to an inode Y very quick.  If our cache fails to find
> > anything we can always fall back on the old method.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> 
> I still need to take a closer look at this one, but I want to make sure
> I grok what's happening from the unlinked list perspective...
> 
> >  fs/xfs/xfs_inode.c       |  216 ++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/xfs_inode.h       |   10 ++
> >  fs/xfs/xfs_log_recover.c |   12 ++-
> >  fs/xfs/xfs_mount.c       |    7 +
> >  fs/xfs/xfs_mount.h       |    1 
> >  5 files changed, 245 insertions(+), 1 deletion(-)
> > 
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 56349497d75b..eec3c6109fc6 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> ...
> > @@ -2072,6 +2255,9 @@ xfs_iunlink(
> >  		if (error)
> >  			goto out_unlock;
> >  		ASSERT(old_agino == NULLAGINO);
> > +		error = xfs_iunlink_add_backref(pag, agino, next_agino);
> > +		if (error)
> > +			goto out_unlock;
> 
> agino has been unlinked; add a backref from the next inode (i.e., the
> one we point to) back to agino.
> 
> >  	}
> >  
> >  	/* Point the head of the list to point to this inode. */
> > @@ -2144,6 +2330,17 @@ xfs_iunlink_map_prev(
> >  
> >  	ASSERT(head_agino != target_agino);
> >  
> > +	/* See if our backref cache can find it faster. */
> > +	error = xfs_iunlink_lookup_backref(pag, target_agino, &next_agino);
> > +	if (error == 0) {
> > +		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> > +		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
> > +				&last_dip, &last_ibp);
> > +		if (error)
> > +			return error;
> > +		goto out;
> > +	}
> > +
> >  	next_agino = head_agino;
> >  	while (next_agino != target_agino) {
> >  		xfs_agino_t	unlinked_agino;
> > @@ -2169,6 +2366,7 @@ xfs_iunlink_map_prev(
> >  		next_agino = unlinked_agino;
> >  	}
> >  
> > +out:
> >  	/* Should never happen, but don't return garbage. */
> >  	if (next_ino == NULLFSINO)
> >  		return -EFSCORRUPTED;
> > @@ -2243,6 +2441,12 @@ xfs_iunlink_remove(
> >  		if (error)
> >  			goto out_unlock;
> >  
> > +		if (next_agino != NULLAGINO) {
> > +			error = xfs_iunlink_forget_backref(pag, next_agino);
> > +			if (error)
> > +				goto out_unlock;
> > +		}
> 
> Removed from the head. We thus had no inode pointing to us, but the next
> inode had a backref recording that we pointed to it. Drop that since
> we're gone and that inode is now the head.
> 
> > +
> >  		/* Point the head of the list to the next unlinked inode. */
> >  		error = xfs_iunlink_update_bucket(tp, agibp, bucket_index,
> >  				next_agino);
> > @@ -2265,10 +2469,22 @@ xfs_iunlink_remove(
> >  				&next_agino);
> >  		if (error)
> >  			goto out_unlock;
> > +		if (next_agino != NULLAGINO) {
> > +			error = xfs_iunlink_forget_backref(pag, next_agino);
> > +			if (error)
> > +				goto out_unlock;
> > +		}
> 
> Removed from the middle. next_agino had a backref to us, so we drop that
> since we're going away.
> 
> >  
> >  		/* Point the previous inode on the list to the next inode. */
> >  		xfs_iunlink_update_dinode(tp, last_ibp, last_dip, &imap,
> >  				next_ino, next_agino);
> > +		if (next_agino == NULLAGINO)
> > +			error = xfs_iunlink_forget_backref(pag, agino);
> > +		else
> > +			error = xfs_iunlink_change_backref(pag, agino,
> > +					next_agino);
> 
> Now we deal with the backref for us (i.e., to whatever pointed at us and
> now points at something else). If it now points at a real inode, change
> the ref that pointed to us to point to our old next. If it doesn't,
> delete the ref that pointed to us. Am I following that correctly?
> 
> Hmm, I realize this may be what's happening behind these wrappers, but
> I'm wondering if the logic at this level would be easier to grok if we
> just dropped the backref that points to our ip (IIUC we should always
> have one here) and then, if we point to something, dropped the backref
> for that and added a new one for the prev->next link we've just created.
> 
> But I still need to take a closer look at all of this... it might be
> more clear with some comments.

Yep, your understanding of what we're doing with the backrefs is correct.
I'll add some comments to make what we're doing clearer.

FWIW the reason for having a separate change_backref is to avoid freeing
the old backref only to allocate memory for a new backref immediately.
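
As a toy illustration of that trade-off (plain C, not the kernel code;
in the real helper the record also has to be removed from and
reinserted into the rhashtable because its key changes, but the
allocation itself is reused):

#include <stdio.h>
#include <stdlib.h>

typedef unsigned int agino_t;

/* One cached "prev.next_unlinked = next" relation. */
struct backref {
	agino_t	prev;
	agino_t	next;	/* the lookup key in the real hashtable */
};

/* forget + add: free the old record, then allocate a new one right away. */
static struct backref *forget_then_add(struct backref *old, agino_t prev,
		agino_t next)
{
	struct backref *b;

	free(old);
	b = malloc(sizeof(*b));
	b->prev = prev;
	b->next = next;
	return b;
}

/* change in place: rekey the record we already have, no free/alloc cycle. */
static struct backref *change_backref(struct backref *b, agino_t next)
{
	b->next = next;
	return b;
}

int main(void)
{
	struct backref *b = malloc(sizeof(*b));

	b->prev = 12;
	b->next = 34;			/* cache "12.next_unlinked = 34" */

	b = change_backref(b, 56);	/* rekey in place: 12 -> 56 */
	printf("in place:   %u.next_unlinked = %u\n", b->prev, b->next);

	b = forget_then_add(b, 12, 78);	/* same effect via free + malloc */
	printf("free+alloc: %u.next_unlinked = %u\n", b->prev, b->next);

	free(b);
	return 0;
}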

--D

> Brian
> 
> > +		if (error)
> > +			goto out_unlock;
> >  	}
> >  	pag->pagi_unlinked_count--;
> >  
> > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > index be2014520155..9690f0d32e33 100644
> > --- a/fs/xfs/xfs_inode.h
> > +++ b/fs/xfs/xfs_inode.h
> > @@ -500,4 +500,14 @@ extern struct kmem_zone	*xfs_inode_zone;
> >  
> >  bool xfs_inode_verify_forks(struct xfs_inode *ip);
> >  
> > +int xfs_iunlink_init(struct xfs_perag *pag);
> > +void xfs_iunlink_destroy(struct xfs_perag *pag);
> > +int xfs_iunlink_lookup_backref(struct xfs_perag *pag, xfs_agino_t agino,
> > +		xfs_agino_t *prev_agino);
> > +int xfs_iunlink_add_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> > +		xfs_agino_t this_agino);
> > +int xfs_iunlink_forget_backref(struct xfs_perag *pag, xfs_agino_t agino);
> > +int xfs_iunlink_change_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> > +		xfs_agino_t this_agino);
> > +
> >  #endif	/* __XFS_INODE_H__ */
> > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> > index c920b8aeba01..909b70550aa8 100644
> > --- a/fs/xfs/xfs_log_recover.c
> > +++ b/fs/xfs/xfs_log_recover.c
> > @@ -5078,12 +5078,22 @@ xlog_recover_process_one_iunlink(
> >  	agino = be32_to_cpu(dip->di_next_unlinked);
> >  	xfs_buf_relse(ibp);
> >  
> > -	/* Make sure the in-core data knows about this unlinked inode. */
> > +	/*
> > +	 * Make sure the in-core data knows about this unlinked inode.  Since
> > +	 * our iunlinks recovery basically just deletes the head of a bucket
> > +	 * list until the bucket is empty, we need only to add the backref from
> > +	 * the current list item to the next one, if this isn't the list tail.
> > +	 */
> >  	pag = xfs_perag_get(mp, agno);
> >  	mutex_lock(&pag->pagi_unlinked_lock);
> >  	pag->pagi_unlinked_count++;
> > +	if (agino != NULLAGINO)
> > +		error = xfs_iunlink_add_backref(pag, XFS_INO_TO_AGINO(mp, ino),
> > +				agino);
> >  	mutex_unlock(&pag->pagi_unlinked_lock);
> >  	xfs_perag_put(pag);
> > +	if (error)
> > +		goto fail_iput;
> >  
> >  	/*
> >  	 * Prevent any DMAPI event from being sent when the reference on
> > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > index 6bfc985669e0..4eb97ddc915e 100644
> > --- a/fs/xfs/xfs_mount.c
> > +++ b/fs/xfs/xfs_mount.c
> > @@ -151,6 +151,7 @@ xfs_free_perag(
> >  		ASSERT(atomic_read(&pag->pag_ref) == 0);
> >  		ASSERT(pag->pagi_unlinked_count == 0 ||
> >  		       XFS_FORCED_SHUTDOWN(mp));
> > +		xfs_iunlink_destroy(pag);
> >  		mutex_destroy(&pag->pagi_unlinked_lock);
> >  		xfs_buf_hash_destroy(pag);
> >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> > @@ -231,6 +232,9 @@ xfs_initialize_perag(
> >  		if (first_initialised == NULLAGNUMBER)
> >  			first_initialised = index;
> >  		mutex_init(&pag->pagi_unlinked_lock);
> > +		error = xfs_iunlink_init(pag);
> > +		if (error)
> > +			goto out_iunlink_mutex;
> >  	}
> >  
> >  	index = xfs_set_inode_alloc(mp, agcount);
> > @@ -241,6 +245,8 @@ xfs_initialize_perag(
> >  	mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
> >  	return 0;
> >  
> > +out_iunlink_mutex:
> > +	mutex_destroy(&pag->pagi_unlinked_lock);
> >  out_hash_destroy:
> >  	xfs_buf_hash_destroy(pag);
> >  out_free_pag:
> > @@ -253,6 +259,7 @@ xfs_initialize_perag(
> >  		if (!pag)
> >  			break;
> >  		xfs_buf_hash_destroy(pag);
> > +		xfs_iunlink_destroy(pag);
> >  		mutex_destroy(&pag->pagi_unlinked_lock);
> >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> >  		kmem_free(pag);
> > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > index 0fcc6b6a4f67..27a433e93d32 100644
> > --- a/fs/xfs/xfs_mount.h
> > +++ b/fs/xfs/xfs_mount.h
> > @@ -392,6 +392,7 @@ typedef struct xfs_perag {
> >  	/* unlinked inodes */
> >  	struct mutex		pagi_unlinked_lock;
> >  	uint32_t		pagi_unlinked_count;
> > +	struct rhashtable	pagi_unlinked_hash;
> >  } xfs_perag_t;
> >  
> >  static inline struct xfs_ag_resv *
> >
Dave Chinner Feb. 1, 2019, 11:59 p.m. UTC | #4
On Fri, Feb 01, 2019 at 12:03:43AM -0800, Christoph Hellwig wrote:
> On Thu, Jan 31, 2019 at 03:18:40PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Use a rhashtable to cache the unlinked list incore.  This should speed
> > up unlinked processing considerably when there are a lot of inodes on
> > the unlinked list because iunlink_remove no longer has to traverse an
> > entire bucket list to find which inode points to the one being removed.
> 
> This sounds pretty reasonable and a real review will follow.  But can
> you quantify the considerable speedups for real-life workloads?

When you have more than a few hundred open-but-unlinked inodes in an
AG, the unlinked list walking starts to take a significant amount of
time when the final reference to an inode goes away. It's
essentially an O(N) buffer walk, so it takes a lot of CPU time when a
list gets more than a few inodes long.

With Darrick's background inactivation, the lists grow to tens of
thousands of inodes very quickly. I was seeing >95% of CPU time
being spent in unlinked list walking on large scale 'rm -rf'
workloads and performance absolutely dropped off a cliff. My initial
code (from years and years ago) used a doubly linked list, and when
I forward ported that and ran it up, the CPU overhead of unlink list
removal was cut to almost nothing and all the performance
regressions with background inactivation went away. i.e. Unlink list
removal went from an O(N) overhead to always being O(1), so the
length of the list didn't affect performance any more.

Darrick's code replaces the list_head in each inode I used with a
rhashtable - it removes the 16 bytes per-inode memory footprint my
implementation required, but otherwise is identical.  Hence I expect
that it'll show exactly the same performance characteristics as the
existing code.

IOWs, it's really required for backgrounding and parallelising inode
inactivation, but otherwise it won't make any noticeable/measurable
difference to most workloads because the unlinked lists almost never
grow more than one inode in length and so O(N) ~= O(1) almost all
the time...

Cheers,

Dave.
Darrick J. Wong Feb. 2, 2019, 4:31 a.m. UTC | #5
On Sat, Feb 02, 2019 at 10:59:00AM +1100, Dave Chinner wrote:
> On Fri, Feb 01, 2019 at 12:03:43AM -0800, Christoph Hellwig wrote:
> > On Thu, Jan 31, 2019 at 03:18:40PM -0800, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Use a rhashtable to cache the unlinked list incore.  This should speed
> > > up unlinked processing considerably when there are a lot of inodes on
> > > the unlinked list because iunlink_remove no longer has to traverse an
> > > entire bucket list to find which inode points to the one being removed.
> > 
> > This sounds pretty reasonable and a real review will follow.  But can
> > you quantify the considerable speedups for real-life workloads?
> 
> When you have more than a few hundred open-but-unlinked inodes in an
> AG, the unlinked list walking starts to take a significant amount of
> time when the final reference to an inode goes away. It's
> essentially an O(N) buffer walk, so it takes a lot of CPU time when a
> list gets more than a few inodes long.
> 
> With Darrick's background inactivation, the lists grow to tens of
> thousands of inodes very quickly. I was seeing >95% of CPU time
> being spent in unlinked list walking on large scale 'rm -rf'
> workloads and performance absolutely dropped off a cliff. My initial
> code (from years and years ago) used a doubly linked list, and when
> I forward ported that and ran it up, the CPU overhead of unlink list
> removal was cut to almost nothing and all the performance
> regressions with background inactivation went away. i.e. Unlink list
> removal went from an O(N) overhead to always being O(1), so the
> length of the list didn't affect performance any more.
> 
> Darrick's code replaces the list_head in each inode I used with a
> rhashtable - it removes the 16 bytes per-inode memory footprint my
> implementation required, but otherwise is identical.  Hence I expect
> that it'll show exactly the same performance characteristics as the
> existing code.
> 
> IOWs, it's really required for backgrounding and parallelising inode
> inactivation, but otherwise it won't make any noticeable/measurable
> difference to most workloads because the unlinked lists almost never
> grow more than one inode in length and so O(N) ~= O(1) almost all
> the time...

I wrote a silly program to open as many O_TMPFILE files as it can and
then close them all.  I wrapped that in a script to crank up ulimit -n
to fs.file-max (which on this VM was about 194000 files) and came up
with this on a 5.0-rc4 kernel with deferred inactivation and iunlink
backref caching:

+ /d/t/tmpfile/tmpfile
Opened 193519 files in 6.44s.
Closed 193519 files in 1.37s

real    0m7.818s
user    0m0.083s
sys     0m7.373s
+ cd /
+ umount /mnt

real    0m5.681s
user    0m0.008s
sys     0m0.000s

I then repeated the experiment with a vanilla 5.0-rc2 kernel I had lying
around and saw much slower results:

+ /d/t/tmpfile/tmpfile
Opened 193588 files in 6.35s.
Closed 193588 files in 751.61s

real    12m38.853s
user    0m0.084s
sys     12m34.470s
+ cd /
+ umount /mnt

real    0m0.086s
user    0m0.000s
sys     0m0.060s

The VM had 2G of RAM and a 3GB SCSI disk backed by a tmpfs file.

--D

Wrapper script:

#!/bin/bash -x

dir="$(dirname "$0")"

umount /mnt
rmmod xfs
mkfs.xfs -f /dev/sda
mount /dev/sda /mnt

ulimit -n $(cat /proc/sys/fs/file-max)

cd /mnt
time $dir/tmpfile
cd /
time umount /mnt

and tmpfile.c:

/* Open files and leak them. */
#define _GNU_SOURCE
#include <time.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>

static int min_fd = -1;
static int max_fd = -1;
static unsigned int nr_opened = 0;
static float start_time;

void clock_time(float *time)
{
	static clockid_t clkid = CLOCK_MONOTONIC;
	struct timespec ts;
	int ret;

retry:
	ret = clock_gettime(clkid, &ts);
	if (ret) {
		if (clkid == CLOCK_MONOTONIC) {
			clkid = CLOCK_REALTIME;
			goto retry;
		}
		perror("clock_gettime");
		exit(2);
	}
	*time = ts.tv_sec + ((float)ts.tv_nsec / 1000000000);
}

/*
 * Exit the program due to an error.
 *
 * If we've exhausted all the file descriptors, make sure we close all the
 * open fds in the order we received them so as to exploit a quirk of ext4
 * and xfs where the oldest unlinked inodes are at the /end/ of the unlinked
 * lists, which will make removing the unlinked files maximally painful.
 *
 * If it's some other error, just die and let the kernel sort it out.
 */
void die(void)
{
	float end_time;
	int fd;

	switch (errno) {
	case EMFILE:
	case ENFILE:
	case ENOSPC:
		clock_time(&end_time);
		printf("Opened %u files in %.2fs.\n", nr_opened,
				end_time - start_time);
		clock_time(&start_time);

		for (fd = min_fd; fd <= max_fd; fd++)
			close(fd);
		clock_time(&end_time);
		printf("Closed %u files in %.2fs\n", nr_opened,
				end_time - start_time);
		exit(0);
		break;
	default:
		perror("open?");
		exit(2);
		break;
	}
}

/* Remember how many file we open and all that. */
void remember_fd(int fd)
{
	if (min_fd == -1 || min_fd > fd)
		min_fd = fd;
	if (max_fd == -1 || max_fd < fd)
		max_fd = fd;
	nr_opened++;
}

/* Put an opened file on the unlinked list and leak the fd. */
void leak_tmpfile(void)
{
	int fd = -1;
	int ret;

	/* Try to create an O_TMPFILE and leak the fd. */
#ifdef O_TMPFILE
	fd = open(".", O_TMPFILE | O_RDWR, 0644);
	if (fd >= 0) {
		remember_fd(fd);
		return;
	}
	if (fd < 0)
		die();
#endif

	/* Oh well, create a new file, unlink it, and leak the fd. */
	fd = open("./moo", O_CREAT | O_RDWR, 0644);
	if (fd < 0)
		die();
	ret = unlink("./moo");
	if (ret)
		die();
	remember_fd(fd);
}

/* Try to put as many files on the unlinked list and then kill them. */
int main(int argc, char *argv[])
{
	clock_time(&start_time);
	while (1)
		leak_tmpfile();
	return 0;
}
Christoph Hellwig Feb. 2, 2019, 4:07 p.m. UTC | #6
On Fri, Feb 01, 2019 at 08:31:59PM -0800, Darrick J. Wong wrote:
> 
> I wrote a silly program to open as many O_TMPFILE files as it can and
> then close them all.  I wrapped that in a script to crank up ulimit -n
> to fs.file-max (which on this VM was about 194000 files) and came up
> with this on a 5.0-rc4 kernel with deferred inactivation and iunlink
> backref caching:

Ah, cool.  Maybe throw a one-line summary of the results into the commit
log for the next version.
Christoph Hellwig Feb. 2, 2019, 5:01 p.m. UTC | #7
On Thu, Jan 31, 2019 at 03:18:40PM -0800, Darrick J. Wong wrote:
> +/*
> + * Faster In-Core Unlinked List Lookups
> + * ====================================

I'd drop the "Faster " here - the whole point of a separate in-core
lookup is generally that it is faster.

> +struct xfs_iunlink_key {
> +	xfs_agino_t		iu_next_unlinked;
> +};

I don't think we need this structure at all.  We can just directly
pass a pointer to the xfs_agino_t instead.

> +	.obj_cmpfn		= _xfs_iunlink_obj_cmp,

No real need for the _ prefix.  Also it would generally be nice
if the method implementation name were prefix + method name.

> +int
> +xfs_iunlink_lookup_backref(
> +	struct xfs_perag	*pag,
> +	xfs_agino_t		agino,
> +	xfs_agino_t		*prev_agino)
> +{
> +	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
> +	struct xfs_iunlink	*iu;
> +
> +	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
> +			xfs_iunlink_hash_params);
> +	if (!iu)
> +		return -ENOENT;
> +	*prev_agino = iu->iu_agino;
> +	return 0;

I think we can just return the xfs_agino_t as the return value, with
NULLAGINO as the not found return value.

> +
> +/* Forget that X.next_unlinked = @agino. */
> +int
> +xfs_iunlink_forget_backref(
> +	struct xfs_perag	*pag,
> +	xfs_agino_t		agino)
> +{
> +	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
> +	struct xfs_iunlink	*iu;
> +	int			error;
> +
> +	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
> +			xfs_iunlink_hash_params);
> +	if (!iu)
> +		return -ENOENT;
> +
> +	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
> +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> +	if (error)
> +		return error;
> +
> +	kmem_free(iu);
> +	return 0;

This mostly duplicates the change_backref helper.  If we just free
the iu for a NULLAGINO next_unlinked there, we can merge the two
functions easily.

> +int xfs_iunlink_init(struct xfs_perag *pag);
> +void xfs_iunlink_destroy(struct xfs_perag *pag);
> +int xfs_iunlink_lookup_backref(struct xfs_perag *pag, xfs_agino_t agino,
> +		xfs_agino_t *prev_agino);
> +int xfs_iunlink_add_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> +		xfs_agino_t this_agino);
> +int xfs_iunlink_forget_backref(struct xfs_perag *pag, xfs_agino_t agino);
> +int xfs_iunlink_change_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
> +		xfs_agino_t this_agino);

Half of these aren't used outside xfs_inode.c.

> +		xfs_iunlink_destroy(pag);
>  		mutex_destroy(&pag->pagi_unlinked_lock);
>  		xfs_buf_hash_destroy(pag);
>  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> @@ -231,6 +232,9 @@ xfs_initialize_perag(
>  		if (first_initialised == NULLAGNUMBER)
>  			first_initialised = index;
>  		mutex_init(&pag->pagi_unlinked_lock);
> +		error = xfs_iunlink_init(pag);
> +		if (error)
> +			goto out_iunlink_mutex;

Any good reason pagi_unlinked_lock isn't handled in
xfs_iunlink_init / xfs_iunlink_destroy?
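
i.e. something like this (untested sketch, reusing the helpers and
fields from this patch):

int
xfs_iunlink_init(
	struct xfs_perag	*pag)
{
	int			error;

	error = rhashtable_init(&pag->pagi_unlinked_hash,
			&xfs_iunlink_hash_params);
	if (error)
		return error;
	mutex_init(&pag->pagi_unlinked_lock);
	return 0;
}

void
xfs_iunlink_destroy(
	struct xfs_perag	*pag)
{
	bool			freed_anything = false;

	rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
			xfs_iunlink_free_item, &freed_anything);
	ASSERT(freed_anything == false ||
	       XFS_FORCED_SHUTDOWN(pag->pag_mount));
	mutex_destroy(&pag->pagi_unlinked_lock);
}

and then xfs_initialize_perag/xfs_free_perag could drop their separate
mutex_init/mutex_destroy calls.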

Untested patch for most of my comments above:

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e797b11ad842..f8da8ce48348 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1907,7 +1907,7 @@ xfs_inactive(
 }
 
 /*
- * Faster In-Core Unlinked List Lookups
+ * In-Core Unlinked List Lookups
  * ====================================
  *
  * Every inode is supposed to be reachable from some other piece of metadata
@@ -1937,20 +1937,16 @@ struct xfs_iunlink {
 	xfs_agino_t		iu_next_unlinked;
 };
 
-struct xfs_iunlink_key {
-	xfs_agino_t		iu_next_unlinked;
-};
-
 /* Unlinked list predecessor lookup hashtable construction */
 static int
-xfs_iunlink_obj_cmp(
+xfs_iunlink_obj_cmpfn(
 	struct rhashtable_compare_arg	*arg,
 	const void			*obj)
 {
-	const struct xfs_iunlink_key	*key = arg->key;
+	const xfs_agino_t		*key = arg->key;
 	const struct xfs_iunlink	*iu = obj;
 
-	if (iu->iu_next_unlinked != key->iu_next_unlinked)
+	if (iu->iu_next_unlinked != *key)
 		return 1;
 	return 0;
 }
@@ -1962,28 +1958,25 @@ static const struct rhashtable_params xfs_iunlink_hash_params = {
 	.key_offset		= offsetof(struct xfs_iunlink, iu_next_unlinked),
 	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
 	.automatic_shrinking	= true,
-	.obj_cmpfn		= xfs_iunlink_obj_cmp,
+	.obj_cmpfn		= xfs_iunlink_obj_cmpfn,
 };
 
 /*
- * Return X (in @prev_agino), where X.next_unlinked == @agino.  Returns -ENOENT
- * if no such relation is found.
+ * Return X, where X.next_unlinked == @agino.  Returns NULLAGINO if no such
+ * relation is found.
  */
-static int
+static xfs_agino_t
 xfs_iunlink_lookup_backref(
 	struct xfs_perag	*pag,
-	xfs_agino_t		agino,
-	xfs_agino_t		*prev_agino)
+	xfs_agino_t		agino)
 {
-	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
 	struct xfs_iunlink	*iu;
 
-	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
 			xfs_iunlink_hash_params);
 	if (!iu)
-		return -ENOENT;
-	*prev_agino = iu->iu_agino;
-	return 0;
+		return NULLAGINO;
+	return iu->iu_agino;
 }
 
 /* Remember that @prev_agino.next_unlinked = @this_agino. */
@@ -2003,31 +1996,6 @@ xfs_iunlink_add_backref(
 			&iu->iu_rhash_head, xfs_iunlink_hash_params);
 }
 
-/* Forget that X.next_unlinked = @agino. */
-static int
-xfs_iunlink_forget_backref(
-	struct xfs_perag	*pag,
-	xfs_agino_t		agino)
-{
-	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
-	struct xfs_iunlink	*iu;
-	int			error;
-
-	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
-			xfs_iunlink_hash_params);
-	if (!iu)
-		return -ENOENT;
-
-	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
-			&iu->iu_rhash_head, xfs_iunlink_hash_params);
-	if (error)
-		return error;
-
-	kmem_free(iu);
-	return 0;
-}
-
-
 /* Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. */
 static int
 xfs_iunlink_change_backref(
@@ -2035,11 +2003,10 @@ xfs_iunlink_change_backref(
 	xfs_agino_t		agino,
 	xfs_agino_t		next_unlinked)
 {
-	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
 	struct xfs_iunlink	*iu;
 	int			error;
 
-	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
 			xfs_iunlink_hash_params);
 	if (!iu)
 		return -ENOENT;
@@ -2049,8 +2016,18 @@ xfs_iunlink_change_backref(
 	if (error)
 		return error;
 
-	iu->iu_next_unlinked = next_unlinked;
+	/*
+	 * If there is no new next entry just free our item and return.
+	 */
+	if (next_unlinked == NULLAGINO) {
+		kmem_free(iu);
+		return 0;
+	}
 
+	/*
+	 * Else update the hash table to the new entry.
+	 */
+	iu->iu_next_unlinked = next_unlinked;
 	return rhashtable_insert_fast(&pag->pagi_unlinked_hash,
 			&iu->iu_rhash_head, xfs_iunlink_hash_params);
 }
@@ -2357,8 +2334,8 @@ xfs_iunlink_map_prev(
 	ASSERT(head_agino != target_agino);
 
 	/* See if our backref cache can find it faster. */
-	error = xfs_iunlink_lookup_backref(pag, target_agino, &next_agino);
-	if (error == 0) {
+	next_agino = xfs_iunlink_lookup_backref(pag, target_agino);
+	if (next_agino != NULLAGINO) {
 		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
 		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
 				&last_dip, &last_ibp);
@@ -2468,7 +2445,8 @@ xfs_iunlink_remove(
 			goto out_unlock;
 
 		if (next_agino != NULLAGINO) {
-			error = xfs_iunlink_forget_backref(pag, next_agino);
+			error = xfs_iunlink_change_backref(pag, next_agino,
+					NULLAGINO);
 			if (error)
 				goto out_unlock;
 		}
@@ -2496,7 +2474,8 @@ xfs_iunlink_remove(
 		if (error)
 			goto out_unlock;
 		if (next_agino != NULLAGINO) {
-			error = xfs_iunlink_forget_backref(pag, next_agino);
+			error = xfs_iunlink_change_backref(pag, next_agino,
+					NULLAGINO);
 			if (error)
 				goto out_unlock;
 		}
@@ -2504,11 +2483,7 @@ xfs_iunlink_remove(
 		/* Point the previous inode on the list to the next inode. */
 		xfs_iunlink_update_dinode(tp, last_ibp, last_dip, &imap,
 				next_ino, next_agino);
-		if (next_agino == NULLAGINO)
-			error = xfs_iunlink_forget_backref(pag, agino);
-		else
-			error = xfs_iunlink_change_backref(pag, agino,
-					next_agino);
+		error = xfs_iunlink_change_backref(pag, agino, next_agino);
 		if (error)
 			goto out_unlock;
 	}

Patch

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 56349497d75b..eec3c6109fc6 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1880,6 +1880,189 @@  xfs_inactive(
 	xfs_qm_dqdetach(ip);
 }
 
+/*
+ * Faster In-Core Unlinked List Lookups
+ * ====================================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory.  Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain.  This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * What if we modelled the unlinked list as a collection of records capturing
+ * "X.next_unlinked = Y" relations?  If we indexed those records on Y, we'd
+ * have a fast way to look up unlinked list predecessors, which avoids the
+ * slow list walk.  That's exactly what we do here (in-core) with a per-AG
+ * rhashtable.
+ */
+
+/* Capture a "X.next_unlinked = Y" relationship. */
+struct xfs_iunlink {
+	struct rhash_head	iu_rhash_head;
+	xfs_agino_t		iu_agino;
+	xfs_agino_t		iu_next_unlinked;
+};
+
+struct xfs_iunlink_key {
+	xfs_agino_t		iu_next_unlinked;
+};
+
+/* Unlinked list predecessor lookup hashtable construction */
+static int
+_xfs_iunlink_obj_cmp(
+	struct rhashtable_compare_arg	*arg,
+	const void			*obj)
+{
+	const struct xfs_iunlink_key	*key = arg->key;
+	const struct xfs_iunlink	*iu = obj;
+
+	if (iu->iu_next_unlinked != key->iu_next_unlinked)
+		return 1;
+	return 0;
+}
+
+static const struct rhashtable_params xfs_iunlink_hash_params = {
+	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
+	.nelem_hint		= 512,
+	.key_len		= sizeof(xfs_agino_t),
+	.key_offset		= offsetof(struct xfs_iunlink, iu_next_unlinked),
+	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
+	.automatic_shrinking	= true,
+	.obj_cmpfn		= _xfs_iunlink_obj_cmp,
+};
+
+/*
+ * Return X (in @prev_agino), where X.next_unlinked == @agino.  Returns -ENOENT
+ * if no such relation is found.
+ */
+int
+xfs_iunlink_lookup_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		agino,
+	xfs_agino_t		*prev_agino)
+{
+	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
+	struct xfs_iunlink	*iu;
+
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
+			xfs_iunlink_hash_params);
+	if (!iu)
+		return -ENOENT;
+	*prev_agino = iu->iu_agino;
+	return 0;
+}
+
+/* Remember that @prev_agino.next_unlinked = @this_agino. */
+int
+xfs_iunlink_add_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		prev_agino,
+	xfs_agino_t		this_agino)
+{
+	struct xfs_iunlink	*iu;
+
+	iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
+	iu->iu_agino = prev_agino;
+	iu->iu_next_unlinked = this_agino;
+
+	return rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+}
+
+/* Forget that X.next_unlinked = @agino. */
+int
+xfs_iunlink_forget_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		agino)
+{
+	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
+	struct xfs_iunlink	*iu;
+	int			error;
+
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
+			xfs_iunlink_hash_params);
+	if (!iu)
+		return -ENOENT;
+
+	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+	if (error)
+		return error;
+
+	kmem_free(iu);
+	return 0;
+}
+
+
+/* Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. */
+int
+xfs_iunlink_change_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		agino,
+	xfs_agino_t		next_unlinked)
+{
+	struct xfs_iunlink_key	key = { .iu_next_unlinked = agino };
+	struct xfs_iunlink	*iu;
+	int			error;
+
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &key,
+			xfs_iunlink_hash_params);
+	if (!iu)
+		return -ENOENT;
+
+	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+	if (error)
+		return error;
+
+	iu->iu_next_unlinked = next_unlinked;
+
+	return rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+}
+
+/* Set up the in-core predecessor structures. */
+int
+xfs_iunlink_init(
+	struct xfs_perag	*pag)
+{
+	return rhashtable_init(&pag->pagi_unlinked_hash,
+			&xfs_iunlink_hash_params);
+}
+
+/* Free the in-core predecessor structures. */
+static void
+xfs_iunlink_free_item(
+	void			*ptr,
+	void			*arg)
+{
+	struct xfs_iunlink	*iu = ptr;
+	bool			*freed_anything = arg;
+
+	*freed_anything = true;
+	kmem_free(iu);
+}
+
+void
+xfs_iunlink_destroy(
+	struct xfs_perag	*pag)
+{
+	bool			freed_anything = false;
+
+	rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
+			xfs_iunlink_free_item, &freed_anything);
+
+	ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+}
+
 /*
  * Point the AGI unlinked bucket at an inode and log the results.  The caller
  * is responsible for validating the old value.
@@ -2072,6 +2255,9 @@  xfs_iunlink(
 		if (error)
 			goto out_unlock;
 		ASSERT(old_agino == NULLAGINO);
+		error = xfs_iunlink_add_backref(pag, agino, next_agino);
+		if (error)
+			goto out_unlock;
 	}
 
 	/* Point the head of the list to point to this inode. */
@@ -2144,6 +2330,17 @@  xfs_iunlink_map_prev(
 
 	ASSERT(head_agino != target_agino);
 
+	/* See if our backref cache can find it faster. */
+	error = xfs_iunlink_lookup_backref(pag, target_agino, &next_agino);
+	if (error == 0) {
+		next_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
+		error = xfs_iunlink_map_ino(tp, next_ino, &last_imap,
+				&last_dip, &last_ibp);
+		if (error)
+			return error;
+		goto out;
+	}
+
 	next_agino = head_agino;
 	while (next_agino != target_agino) {
 		xfs_agino_t	unlinked_agino;
@@ -2169,6 +2366,7 @@  xfs_iunlink_map_prev(
 		next_agino = unlinked_agino;
 	}
 
+out:
 	/* Should never happen, but don't return garbage. */
 	if (next_ino == NULLFSINO)
 		return -EFSCORRUPTED;
@@ -2243,6 +2441,12 @@  xfs_iunlink_remove(
 		if (error)
 			goto out_unlock;
 
+		if (next_agino != NULLAGINO) {
+			error = xfs_iunlink_forget_backref(pag, next_agino);
+			if (error)
+				goto out_unlock;
+		}
+
 		/* Point the head of the list to the next unlinked inode. */
 		error = xfs_iunlink_update_bucket(tp, agibp, bucket_index,
 				next_agino);
@@ -2265,10 +2469,22 @@  xfs_iunlink_remove(
 				&next_agino);
 		if (error)
 			goto out_unlock;
+		if (next_agino != NULLAGINO) {
+			error = xfs_iunlink_forget_backref(pag, next_agino);
+			if (error)
+				goto out_unlock;
+		}
 
 		/* Point the previous inode on the list to the next inode. */
 		xfs_iunlink_update_dinode(tp, last_ibp, last_dip, &imap,
 				next_ino, next_agino);
+		if (next_agino == NULLAGINO)
+			error = xfs_iunlink_forget_backref(pag, agino);
+		else
+			error = xfs_iunlink_change_backref(pag, agino,
+					next_agino);
+		if (error)
+			goto out_unlock;
 	}
 	pag->pagi_unlinked_count--;
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index be2014520155..9690f0d32e33 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -500,4 +500,14 @@  extern struct kmem_zone	*xfs_inode_zone;
 
 bool xfs_inode_verify_forks(struct xfs_inode *ip);
 
+int xfs_iunlink_init(struct xfs_perag *pag);
+void xfs_iunlink_destroy(struct xfs_perag *pag);
+int xfs_iunlink_lookup_backref(struct xfs_perag *pag, xfs_agino_t agino,
+		xfs_agino_t *prev_agino);
+int xfs_iunlink_add_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
+		xfs_agino_t this_agino);
+int xfs_iunlink_forget_backref(struct xfs_perag *pag, xfs_agino_t agino);
+int xfs_iunlink_change_backref(struct xfs_perag *pag, xfs_agino_t prev_agino,
+		xfs_agino_t this_agino);
+
 #endif	/* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c920b8aeba01..909b70550aa8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -5078,12 +5078,22 @@  xlog_recover_process_one_iunlink(
 	agino = be32_to_cpu(dip->di_next_unlinked);
 	xfs_buf_relse(ibp);
 
-	/* Make sure the in-core data knows about this unlinked inode. */
+	/*
+	 * Make sure the in-core data knows about this unlinked inode.  Since
+	 * our iunlinks recovery basically just deletes the head of a bucket
+	 * list until the bucket is empty, we need only to add the backref from
+	 * the current list item to the next one, if this isn't the list tail.
+	 */
 	pag = xfs_perag_get(mp, agno);
 	mutex_lock(&pag->pagi_unlinked_lock);
 	pag->pagi_unlinked_count++;
+	if (agino != NULLAGINO)
+		error = xfs_iunlink_add_backref(pag, XFS_INO_TO_AGINO(mp, ino),
+				agino);
 	mutex_unlock(&pag->pagi_unlinked_lock);
 	xfs_perag_put(pag);
+	if (error)
+		goto fail_iput;
 
 	/*
 	 * Prevent any DMAPI event from being sent when the reference on
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6bfc985669e0..4eb97ddc915e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -151,6 +151,7 @@  xfs_free_perag(
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		ASSERT(pag->pagi_unlinked_count == 0 ||
 		       XFS_FORCED_SHUTDOWN(mp));
+		xfs_iunlink_destroy(pag);
 		mutex_destroy(&pag->pagi_unlinked_lock);
 		xfs_buf_hash_destroy(pag);
 		mutex_destroy(&pag->pag_ici_reclaim_lock);
@@ -231,6 +232,9 @@  xfs_initialize_perag(
 		if (first_initialised == NULLAGNUMBER)
 			first_initialised = index;
 		mutex_init(&pag->pagi_unlinked_lock);
+		error = xfs_iunlink_init(pag);
+		if (error)
+			goto out_iunlink_mutex;
 	}
 
 	index = xfs_set_inode_alloc(mp, agcount);
@@ -241,6 +245,8 @@  xfs_initialize_perag(
 	mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
 	return 0;
 
+out_iunlink_mutex:
+	mutex_destroy(&pag->pagi_unlinked_lock);
 out_hash_destroy:
 	xfs_buf_hash_destroy(pag);
 out_free_pag:
@@ -253,6 +259,7 @@  xfs_initialize_perag(
 		if (!pag)
 			break;
 		xfs_buf_hash_destroy(pag);
+		xfs_iunlink_destroy(pag);
 		mutex_destroy(&pag->pagi_unlinked_lock);
 		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		kmem_free(pag);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 0fcc6b6a4f67..27a433e93d32 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -392,6 +392,7 @@  typedef struct xfs_perag {
 	/* unlinked inodes */
 	struct mutex		pagi_unlinked_lock;
 	uint32_t		pagi_unlinked_count;
+	struct rhashtable	pagi_unlinked_hash;
 } xfs_perag_t;
 
 static inline struct xfs_ag_resv *