[9/9] xfs: cache unlinked pointers in an rhashtable
diff mbox series

Message ID 154947213625.22369.7330296004413590907.stgit@magnolia
State New
Headers show
Series
  • xfs: incore unlinked list
Related show

Commit Message

Darrick J. Wong Feb. 6, 2019, 4:55 p.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Use a rhashtable to cache the unlinked list incore.  This should speed
up unlinked processing considerably when there are a lot of inodes on
the unlinked list because iunlink_remove no longer has to traverse an
entire bucket list to find which inode points to the one being removed.

The incore list structure records "X.next_unlinked = Y" relations, with
the rhashtable using Y to index the records.  This makes finding the
inode X that points to a inode Y very quick.  If our cache fails to find
anything we can always fall back on the old method.

FWIW this drastically reduces the amount of time it takes to remove
inodes from the unlinked list.  I wrote a program to open a lot of
O_TMPFILE files and then close them in the same order, which takes
a very long time if we have to traverse the unlinked lists.  With the
ptach, I see:

+ /d/t/tmpfile/tmpfile
Opened 193531 files in 6.33s.
Closed 193531 files in 5.86s

real    0m12.192s
user    0m0.064s
sys     0m11.619s
+ cd /
+ umount /mnt

real    0m0.050s
user    0m0.004s
sys     0m0.030s

And without the patch:

+ /d/t/tmpfile/tmpfile
Opened 193588 files in 6.35s.
Closed 193588 files in 751.61s

real    12m38.853s
user    0m0.084s
sys     12m34.470s
+ cd /
+ umount /mnt

real    0m0.086s
user    0m0.000s
sys     0m0.060s

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_errortag.h |    4 -
 fs/xfs/xfs_error.c           |    3 
 fs/xfs/xfs_inode.c           |  269 +++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_inode.h           |    3 
 fs/xfs/xfs_mount.c           |    5 +
 fs/xfs/xfs_mount.h           |    7 +
 fs/xfs/xfs_trace.h           |    1 
 7 files changed, 285 insertions(+), 7 deletions(-)

Comments

Brian Foster Feb. 7, 2019, 2:31 p.m. UTC | #1
On Wed, Feb 06, 2019 at 08:55:36AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Use a rhashtable to cache the unlinked list incore.  This should speed
> up unlinked processing considerably when there are a lot of inodes on
> the unlinked list because iunlink_remove no longer has to traverse an
> entire bucket list to find which inode points to the one being removed.
> 
> The incore list structure records "X.next_unlinked = Y" relations, with
> the rhashtable using Y to index the records.  This makes finding the
> inode X that points to a inode Y very quick.  If our cache fails to find
> anything we can always fall back on the old method.
> 
> FWIW this drastically reduces the amount of time it takes to remove
> inodes from the unlinked list.  I wrote a program to open a lot of
> O_TMPFILE files and then close them in the same order, which takes
> a very long time if we have to traverse the unlinked lists.  With the
> ptach, I see:
> 
...
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

A few minor comments below. With those addressed, this otherwise looks
pretty good to me:

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/libxfs/xfs_errortag.h |    4 -
>  fs/xfs/xfs_error.c           |    3 
>  fs/xfs/xfs_inode.c           |  269 +++++++++++++++++++++++++++++++++++++++++-
>  fs/xfs/xfs_inode.h           |    3 
>  fs/xfs/xfs_mount.c           |    5 +
>  fs/xfs/xfs_mount.h           |    7 +
>  fs/xfs/xfs_trace.h           |    1 
>  7 files changed, 285 insertions(+), 7 deletions(-)
> 
> 
...
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 53a9c8c26d18..fdfcb3a9bac7 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1906,6 +1906,194 @@ xfs_inactive(
>  	xfs_qm_dqdetach(ip);
>  }
>  
...
> +/*
> + * Remember that @prev_agino.next_unlinked = @this_agino.  Fail loudly if there
> + * already was an entry, but absorb any other runtime errors.
> + */
> +int
> +xfs_iunlink_add_backref(
> +	struct xfs_perag	*pag,
> +	xfs_agino_t		prev_agino,
> +	xfs_agino_t		this_agino)
> +{
> +	struct xfs_iunlink	*iu;
> +	int			error;
> +
> +	if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
> +		return 0;
> +
> +	iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
> +	iu->iu_agino = prev_agino;
> +	iu->iu_next_unlinked = this_agino;
> +
> +	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
> +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> +	if (error == 0 || error == -EEXIST)
> +		return error;
> +	return 0;

The return val looks funny at a glance: return -EEXIST or otherwise
return 0 for success and all other errors (also the error == 0 looks
superfluous). I see the comment above describes what this does, but it
doesn't explain why. I'd move the comment to above the if check and
explain why it's there (i.e., "Absorb all runtime errors besides -EEXIST
because fallback blah blah. We care about -EEXIST because ...").

Also I assume we need to free the iu object on insert failure,
regardless of whether we ultimately return the error.

> +}
> +
> +/*
> + * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
> + * If @next_unlinked is NULLAGINO, we drop the backref and exit.  If there
> + * wasn't any such entry then we don't bother.
> + */
> +int
> +xfs_iunlink_change_backref(
> +	struct xfs_perag	*pag,
> +	xfs_agino_t		agino,
> +	xfs_agino_t		next_unlinked)
> +{
> +	struct xfs_iunlink	*iu;
> +	int			error;
> +
> +	/* Look up the old entry; if there wasn't one then exit. */
> +	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
> +			xfs_iunlink_hash_params);
> +	if (!iu)
> +		return 0;
> +
> +	/*
> +	 * Remove the entry.  This shouldn't ever return an error, but if we
> +	 * couldn't remove the old entry we don't want to add it again to the
> +	 * hash table, and if the entry disappeared on us then someone's
> +	 * violated the locking rules and we need to fail loudly.
> +	 */
> +	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
> +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> +	if (error)
> +		return error;

What's interesting about remove failure is that if we otherwise found an
iu for this inode, removing the inode from the unlinked list leaves a
stale/incorrect iu around in the in-core table. So it's one thing for
the in-core table to be a valid subset of the on-disk structures, but
another thing entirely to have incoherence between the two. It might be
worth pointing out that it's critical to return an error here so we
don't actually remove the inode (whereas the re-add below is less
strict).

> +
> +	/* If there is no new next entry just free our item and return. */
> +	if (next_unlinked == NULLAGINO) {
> +		kmem_free(iu);
> +		return 0;
> +	}
> +
> +	/*
> +	 * Update the hash table to the new entry, ignoring operational
> +	 * errors.
> +	 */
> +	iu->iu_next_unlinked = next_unlinked;
> +	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
> +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> +	if (error == 0 || error == -EEXIST)
> +		return error;
> +	return 0;

Similar error == 0 thing and potential memory leak here.

> +}
> +
...
> @@ -2141,6 +2341,27 @@ xfs_iunlink_map_prev(
>  
>  	ASSERT(head_agino != target_agino);
>  
> +	/* See if our backref cache can find it faster. */
> +	*agino = xfs_iunlink_lookup_backref(pag, target_agino);
> +	if (*agino != NULLAGINO) {
> +		error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
> +		if (error)
> +			return error;
> +
> +		if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
> +			return 0;
> +
> +		/*
> +		 * If we get here the cache contents were corrupt, so drop the
> +		 * buffer and fall back to walking the bucket list.
> +		 */
> +		xfs_trans_brelse(tp, *bpp);
> +		*bpp = NULL;

I like the logic to ensure we don't screw around with an inode that
doesn't actually point to our target, but I do wonder whether we should
do a little more than silently ignore the fact we found an incoherent
backref. Even just a WARN_ON[_ONCE]() or something here to help us
detect this case during testing might be sufficient..?

Brian

> +	}
> +
> +	trace_xfs_iunlink_map_prev_fallback(mp, agno);
> +
> +	/* Otherwise, walk the entire bucket until we find it. */
>  	next_agino = head_agino;
>  	while (next_agino != target_agino) {
>  		xfs_agino_t	unlinked_agino;
> @@ -2186,6 +2407,7 @@ xfs_iunlink_remove(
>  	struct xfs_buf		*agibp;
>  	struct xfs_buf		*last_ibp = NULL;
>  	struct xfs_dinode	*last_dip = NULL;
> +	struct xfs_perag	*pag = NULL;
>  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
>  	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
>  	xfs_agino_t		next_agino;
> @@ -2221,27 +2443,62 @@ xfs_iunlink_remove(
>  	if (error)
>  		return error;
>  
> +	/*
> +	 * If there was a backref pointing from the next inode back to this
> +	 * one, remove it because we've removed this inode from the list.
> +	 *
> +	 * Later, if this inode was in the middle of the list we'll update
> +	 * this inode's backref to point from the next inode.
> +	 */
> +	if (next_agino != NULLAGINO) {
> +		pag = xfs_perag_get(mp, agno);
> +		error = xfs_iunlink_change_backref(pag, next_agino,
> +				NULLAGINO);
> +		if (error)
> +			goto out;
> +	}
> +
>  	if (head_agino == agino) {
>  		/* Point the head of the list to the next unlinked inode. */
>  		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
>  				next_agino);
>  		if (error)
> -			return error;
> +			goto out;
>  	} else {
>  		struct xfs_imap	imap;
>  		xfs_agino_t	prev_agino;
>  
> +		if (!pag)
> +			pag = xfs_perag_get(mp, agno);
> +
>  		/* We need to search the list for the inode being freed. */
>  		error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
> -				&prev_agino, &imap, &last_dip, &last_ibp);
> +				&prev_agino, &imap, &last_dip, &last_ibp,
> +				pag);
>  		if (error)
> -			return error;
> +			goto out;
>  
>  		/* Point the previous inode on the list to the next inode. */
>  		xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
>  				last_dip, &imap, next_agino);
> +
> +		/*
> +		 * Now we deal with the backref for this inode.  If this inode
> +		 * pointed at a real inode, change the backref that pointed to
> +		 * us to point to our old next.  If this inode was the end of
> +		 * the list, delete the backref that pointed to us.  Note that
> +		 * change_backref takes care of deleting the backref if
> +		 * next_agino is NULLAGINO.
> +		 */
> +		error = xfs_iunlink_change_backref(pag, agino, next_agino);
> +		if (error)
> +			goto out;
>  	}
> -	return 0;
> +
> +out:
> +	if (pag)
> +		xfs_perag_put(pag);
> +	return error;
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index be2014520155..e62074a5257c 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -500,4 +500,7 @@ extern struct kmem_zone	*xfs_inode_zone;
>  
>  bool xfs_inode_verify_forks(struct xfs_inode *ip);
>  
> +int xfs_iunlink_init(struct xfs_perag *pag);
> +void xfs_iunlink_destroy(struct xfs_perag *pag);
> +
>  #endif	/* __XFS_INODE_H__ */
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index b4d8c318be3c..fd63b0b1307c 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -149,6 +149,7 @@ xfs_free_perag(
>  		spin_unlock(&mp->m_perag_lock);
>  		ASSERT(pag);
>  		ASSERT(atomic_read(&pag->pag_ref) == 0);
> +		xfs_iunlink_destroy(pag);
>  		xfs_buf_hash_destroy(pag);
>  		mutex_destroy(&pag->pag_ici_reclaim_lock);
>  		call_rcu(&pag->rcu_head, __xfs_free_perag);
> @@ -227,6 +228,9 @@ xfs_initialize_perag(
>  		/* first new pag is fully initialized */
>  		if (first_initialised == NULLAGNUMBER)
>  			first_initialised = index;
> +		error = xfs_iunlink_init(pag);
> +		if (error)
> +			goto out_hash_destroy;
>  	}
>  
>  	index = xfs_set_inode_alloc(mp, agcount);
> @@ -249,6 +253,7 @@ xfs_initialize_perag(
>  		if (!pag)
>  			break;
>  		xfs_buf_hash_destroy(pag);
> +		xfs_iunlink_destroy(pag);
>  		mutex_destroy(&pag->pag_ici_reclaim_lock);
>  		kmem_free(pag);
>  	}
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 7daafe064af8..a33f45077867 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -396,6 +396,13 @@ typedef struct xfs_perag {
>  
>  	/* reference count */
>  	uint8_t			pagf_refcount_level;
> +
> +	/*
> +	 * Unlinked inode information.  This incore information reflects
> +	 * data stored in the AGI, so callers must hold the AGI buffer lock
> +	 * or have some other means to control concurrency.
> +	 */
> +	struct rhashtable	pagi_unlinked_hash;
>  } xfs_perag_t;
>  
>  static inline struct xfs_ag_resv *
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index a6e384a642b1..c83ce022a355 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3447,6 +3447,7 @@ DEFINE_EVENT(xfs_ag_inode_class, name, \
>  	TP_ARGS(ip))
>  DEFINE_AGINODE_EVENT(xfs_iunlink);
>  DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
> +DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
>  
>  #endif /* _TRACE_XFS_H */
>  
>
Darrick J. Wong Feb. 7, 2019, 4:33 p.m. UTC | #2
On Thu, Feb 07, 2019 at 09:31:15AM -0500, Brian Foster wrote:
> On Wed, Feb 06, 2019 at 08:55:36AM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Use a rhashtable to cache the unlinked list incore.  This should speed
> > up unlinked processing considerably when there are a lot of inodes on
> > the unlinked list because iunlink_remove no longer has to traverse an
> > entire bucket list to find which inode points to the one being removed.
> > 
> > The incore list structure records "X.next_unlinked = Y" relations, with
> > the rhashtable using Y to index the records.  This makes finding the
> > inode X that points to a inode Y very quick.  If our cache fails to find
> > anything we can always fall back on the old method.
> > 
> > FWIW this drastically reduces the amount of time it takes to remove
> > inodes from the unlinked list.  I wrote a program to open a lot of
> > O_TMPFILE files and then close them in the same order, which takes
> > a very long time if we have to traverse the unlinked lists.  With the
> > ptach, I see:
> > 
> ...
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> 
> A few minor comments below. With those addressed, this otherwise looks
> pretty good to me:
> 
> Reviewed-by: Brian Foster <bfoster@redhat.com>
> 
> >  fs/xfs/libxfs/xfs_errortag.h |    4 -
> >  fs/xfs/xfs_error.c           |    3 
> >  fs/xfs/xfs_inode.c           |  269 +++++++++++++++++++++++++++++++++++++++++-
> >  fs/xfs/xfs_inode.h           |    3 
> >  fs/xfs/xfs_mount.c           |    5 +
> >  fs/xfs/xfs_mount.h           |    7 +
> >  fs/xfs/xfs_trace.h           |    1 
> >  7 files changed, 285 insertions(+), 7 deletions(-)
> > 
> > 
> ...
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 53a9c8c26d18..fdfcb3a9bac7 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -1906,6 +1906,194 @@ xfs_inactive(
> >  	xfs_qm_dqdetach(ip);
> >  }
> >  
> ...
> > +/*
> > + * Remember that @prev_agino.next_unlinked = @this_agino.  Fail loudly if there
> > + * already was an entry, but absorb any other runtime errors.
> > + */
> > +int
> > +xfs_iunlink_add_backref(
> > +	struct xfs_perag	*pag,
> > +	xfs_agino_t		prev_agino,
> > +	xfs_agino_t		this_agino)
> > +{
> > +	struct xfs_iunlink	*iu;
> > +	int			error;
> > +
> > +	if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
> > +		return 0;
> > +
> > +	iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
> > +	iu->iu_agino = prev_agino;
> > +	iu->iu_next_unlinked = this_agino;
> > +
> > +	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
> > +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> > +	if (error == 0 || error == -EEXIST)
> > +		return error;
> > +	return 0;
> 
> The return val looks funny at a glance: return -EEXIST or otherwise
> return 0 for success and all other errors (also the error == 0 looks
> superfluous). I see the comment above describes what this does, but it
> doesn't explain why. I'd move the comment to above the if check and
> explain why it's there (i.e., "Absorb all runtime errors besides -EEXIST
> because fallback blah blah. We care about -EEXIST because ...").

Will do.

> Also I assume we need to free the iu object on insert failure,
> regardless of whether we ultimately return the error.

Oops, yes, good catch!

> > +}
> > +
> > +/*
> > + * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
> > + * If @next_unlinked is NULLAGINO, we drop the backref and exit.  If there
> > + * wasn't any such entry then we don't bother.
> > + */
> > +int
> > +xfs_iunlink_change_backref(
> > +	struct xfs_perag	*pag,
> > +	xfs_agino_t		agino,
> > +	xfs_agino_t		next_unlinked)
> > +{
> > +	struct xfs_iunlink	*iu;
> > +	int			error;
> > +
> > +	/* Look up the old entry; if there wasn't one then exit. */
> > +	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
> > +			xfs_iunlink_hash_params);
> > +	if (!iu)
> > +		return 0;
> > +
> > +	/*
> > +	 * Remove the entry.  This shouldn't ever return an error, but if we
> > +	 * couldn't remove the old entry we don't want to add it again to the
> > +	 * hash table, and if the entry disappeared on us then someone's
> > +	 * violated the locking rules and we need to fail loudly.
> > +	 */
> > +	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
> > +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> > +	if (error)
> > +		return error;
> 
> What's interesting about remove failure is that if we otherwise found an
> iu for this inode, removing the inode from the unlinked list leaves a
> stale/incorrect iu around in the in-core table. So it's one thing for
> the in-core table to be a valid subset of the on-disk structures, but
> another thing entirely to have incoherence between the two. It might be
> worth pointing out that it's critical to return an error here so we
> don't actually remove the inode (whereas the re-add below is less
> strict).

Ok.

> > +
> > +	/* If there is no new next entry just free our item and return. */
> > +	if (next_unlinked == NULLAGINO) {
> > +		kmem_free(iu);
> > +		return 0;
> > +	}
> > +
> > +	/*
> > +	 * Update the hash table to the new entry, ignoring operational
> > +	 * errors.
> > +	 */
> > +	iu->iu_next_unlinked = next_unlinked;
> > +	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
> > +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> > +	if (error == 0 || error == -EEXIST)
> > +		return error;
> > +	return 0;
> 
> Similar error == 0 thing and potential memory leak here.

Refactored into a common helper to capture all the logic and
documentation.

> > +}
> > +
> ...
> > @@ -2141,6 +2341,27 @@ xfs_iunlink_map_prev(
> >  
> >  	ASSERT(head_agino != target_agino);
> >  
> > +	/* See if our backref cache can find it faster. */
> > +	*agino = xfs_iunlink_lookup_backref(pag, target_agino);
> > +	if (*agino != NULLAGINO) {
> > +		error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
> > +		if (error)
> > +			return error;
> > +
> > +		if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
> > +			return 0;
> > +
> > +		/*
> > +		 * If we get here the cache contents were corrupt, so drop the
> > +		 * buffer and fall back to walking the bucket list.
> > +		 */
> > +		xfs_trans_brelse(tp, *bpp);
> > +		*bpp = NULL;
> 
> I like the logic to ensure we don't screw around with an inode that
> doesn't actually point to our target, but I do wonder whether we should
> do a little more than silently ignore the fact we found an incoherent
> backref. Even just a WARN_ON[_ONCE]() or something here to help us
> detect this case during testing might be sufficient..?

Yeah, that's a good idea.  The cache can miss, but it shouldn't ever be
corrupt.

--D

> 
> Brian
> 
> > +	}
> > +
> > +	trace_xfs_iunlink_map_prev_fallback(mp, agno);
> > +
> > +	/* Otherwise, walk the entire bucket until we find it. */
> >  	next_agino = head_agino;
> >  	while (next_agino != target_agino) {
> >  		xfs_agino_t	unlinked_agino;
> > @@ -2186,6 +2407,7 @@ xfs_iunlink_remove(
> >  	struct xfs_buf		*agibp;
> >  	struct xfs_buf		*last_ibp = NULL;
> >  	struct xfs_dinode	*last_dip = NULL;
> > +	struct xfs_perag	*pag = NULL;
> >  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
> >  	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> >  	xfs_agino_t		next_agino;
> > @@ -2221,27 +2443,62 @@ xfs_iunlink_remove(
> >  	if (error)
> >  		return error;
> >  
> > +	/*
> > +	 * If there was a backref pointing from the next inode back to this
> > +	 * one, remove it because we've removed this inode from the list.
> > +	 *
> > +	 * Later, if this inode was in the middle of the list we'll update
> > +	 * this inode's backref to point from the next inode.
> > +	 */
> > +	if (next_agino != NULLAGINO) {
> > +		pag = xfs_perag_get(mp, agno);
> > +		error = xfs_iunlink_change_backref(pag, next_agino,
> > +				NULLAGINO);
> > +		if (error)
> > +			goto out;
> > +	}
> > +
> >  	if (head_agino == agino) {
> >  		/* Point the head of the list to the next unlinked inode. */
> >  		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
> >  				next_agino);
> >  		if (error)
> > -			return error;
> > +			goto out;
> >  	} else {
> >  		struct xfs_imap	imap;
> >  		xfs_agino_t	prev_agino;
> >  
> > +		if (!pag)
> > +			pag = xfs_perag_get(mp, agno);
> > +
> >  		/* We need to search the list for the inode being freed. */
> >  		error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
> > -				&prev_agino, &imap, &last_dip, &last_ibp);
> > +				&prev_agino, &imap, &last_dip, &last_ibp,
> > +				pag);
> >  		if (error)
> > -			return error;
> > +			goto out;
> >  
> >  		/* Point the previous inode on the list to the next inode. */
> >  		xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
> >  				last_dip, &imap, next_agino);
> > +
> > +		/*
> > +		 * Now we deal with the backref for this inode.  If this inode
> > +		 * pointed at a real inode, change the backref that pointed to
> > +		 * us to point to our old next.  If this inode was the end of
> > +		 * the list, delete the backref that pointed to us.  Note that
> > +		 * change_backref takes care of deleting the backref if
> > +		 * next_agino is NULLAGINO.
> > +		 */
> > +		error = xfs_iunlink_change_backref(pag, agino, next_agino);
> > +		if (error)
> > +			goto out;
> >  	}
> > -	return 0;
> > +
> > +out:
> > +	if (pag)
> > +		xfs_perag_put(pag);
> > +	return error;
> >  }
> >  
> >  /*
> > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > index be2014520155..e62074a5257c 100644
> > --- a/fs/xfs/xfs_inode.h
> > +++ b/fs/xfs/xfs_inode.h
> > @@ -500,4 +500,7 @@ extern struct kmem_zone	*xfs_inode_zone;
> >  
> >  bool xfs_inode_verify_forks(struct xfs_inode *ip);
> >  
> > +int xfs_iunlink_init(struct xfs_perag *pag);
> > +void xfs_iunlink_destroy(struct xfs_perag *pag);
> > +
> >  #endif	/* __XFS_INODE_H__ */
> > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > index b4d8c318be3c..fd63b0b1307c 100644
> > --- a/fs/xfs/xfs_mount.c
> > +++ b/fs/xfs/xfs_mount.c
> > @@ -149,6 +149,7 @@ xfs_free_perag(
> >  		spin_unlock(&mp->m_perag_lock);
> >  		ASSERT(pag);
> >  		ASSERT(atomic_read(&pag->pag_ref) == 0);
> > +		xfs_iunlink_destroy(pag);
> >  		xfs_buf_hash_destroy(pag);
> >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> >  		call_rcu(&pag->rcu_head, __xfs_free_perag);
> > @@ -227,6 +228,9 @@ xfs_initialize_perag(
> >  		/* first new pag is fully initialized */
> >  		if (first_initialised == NULLAGNUMBER)
> >  			first_initialised = index;
> > +		error = xfs_iunlink_init(pag);
> > +		if (error)
> > +			goto out_hash_destroy;
> >  	}
> >  
> >  	index = xfs_set_inode_alloc(mp, agcount);
> > @@ -249,6 +253,7 @@ xfs_initialize_perag(
> >  		if (!pag)
> >  			break;
> >  		xfs_buf_hash_destroy(pag);
> > +		xfs_iunlink_destroy(pag);
> >  		mutex_destroy(&pag->pag_ici_reclaim_lock);
> >  		kmem_free(pag);
> >  	}
> > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > index 7daafe064af8..a33f45077867 100644
> > --- a/fs/xfs/xfs_mount.h
> > +++ b/fs/xfs/xfs_mount.h
> > @@ -396,6 +396,13 @@ typedef struct xfs_perag {
> >  
> >  	/* reference count */
> >  	uint8_t			pagf_refcount_level;
> > +
> > +	/*
> > +	 * Unlinked inode information.  This incore information reflects
> > +	 * data stored in the AGI, so callers must hold the AGI buffer lock
> > +	 * or have some other means to control concurrency.
> > +	 */
> > +	struct rhashtable	pagi_unlinked_hash;
> >  } xfs_perag_t;
> >  
> >  static inline struct xfs_ag_resv *
> > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > index a6e384a642b1..c83ce022a355 100644
> > --- a/fs/xfs/xfs_trace.h
> > +++ b/fs/xfs/xfs_trace.h
> > @@ -3447,6 +3447,7 @@ DEFINE_EVENT(xfs_ag_inode_class, name, \
> >  	TP_ARGS(ip))
> >  DEFINE_AGINODE_EVENT(xfs_iunlink);
> >  DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
> > +DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
> >  
> >  #endif /* _TRACE_XFS_H */
> >  
> >
Christoph Hellwig Feb. 8, 2019, 8 a.m. UTC | #3
> > +	if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
> > +		return 0;
> > +
> > +	iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
> > +	iu->iu_agino = prev_agino;
> > +	iu->iu_next_unlinked = this_agino;
> > +
> > +	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
> > +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> > +	if (error == 0 || error == -EEXIST)
> > +		return error;
> > +	return 0;
> 
> The return val looks funny at a glance: return -EEXIST or otherwise
> return 0 for success and all other errors (also the error == 0 looks
> superfluous). I see the comment above describes what this does, but it
> doesn't explain why. I'd move the comment to above the if check and
> explain why it's there (i.e., "Absorb all runtime errors besides -EEXIST
> because fallback blah blah. We care about -EEXIST because ...").
> 
> Also I assume we need to free the iu object on insert failure,
> regardless of whether we ultimately return the error.

But is this really the right thing to do?  Everything but ENOMEM
would be a bug in the rhashtable implementation, or the XFS use,
wouldn't it?

So why not drop the return value entirely and do a:

	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
			&iu->iu_rhash_head, xfs_iunlink_hash_params);
	ASSERT(error == 0 || error == -ENOMEM);

> > +	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
> > +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> > +	if (error)
> > +		return error;
> 
> What's interesting about remove failure is that if we otherwise found an
> iu for this inode, removing the inode from the unlinked list leaves a
> stale/incorrect iu around in the in-core table. So it's one thing for
> the in-core table to be a valid subset of the on-disk structures, but
> another thing entirely to have incoherence between the two. It might be
> worth pointing out that it's critical to return an error here so we
> don't actually remove the inode (whereas the re-add below is less
> strict).

Again, remove failure here sounds like a programming bug - so ASSERT
and or forced shutdown here.
Brian Foster Feb. 8, 2019, 12:06 p.m. UTC | #4
On Fri, Feb 08, 2019 at 12:00:16AM -0800, Christoph Hellwig wrote:
> > > +	if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
> > > +		return 0;
> > > +
> > > +	iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
> > > +	iu->iu_agino = prev_agino;
> > > +	iu->iu_next_unlinked = this_agino;
> > > +
> > > +	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
> > > +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> > > +	if (error == 0 || error == -EEXIST)
> > > +		return error;
> > > +	return 0;
> > 
> > The return val looks funny at a glance: return -EEXIST or otherwise
> > return 0 for success and all other errors (also the error == 0 looks
> > superfluous). I see the comment above describes what this does, but it
> > doesn't explain why. I'd move the comment to above the if check and
> > explain why it's there (i.e., "Absorb all runtime errors besides -EEXIST
> > because fallback blah blah. We care about -EEXIST because ...").
> > 
> > Also I assume we need to free the iu object on insert failure,
> > regardless of whether we ultimately return the error.
> 
> But is this really the right thing to do?  Everything but ENOMEM
> would be a bug in the rhashtable implementation, or the XFS use,
> wouldn't it?
> 

I think anything beyond -ENOMEM would imply a bug somewhere, yes.

> So why not drop the return value entirely and do a:
> 
> 	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
> 			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> 	ASSERT(error == 0 || error == -ENOMEM);
> 

It's not clear to me whether you're suggesting we return 0, error or
nothing at all here. The assert otherwise seems fine to me as I don't
think we'd ever expect anything outside of success or -ENOMEM.

That said, I don't see any reason to ever leak an iu if we know it
didn't make it into the table. I could probably go either way on whether
we wanted to allow the filesystem to continue or not on unexpected
insert errors. The original comment was just that we probably shouldn't
explode on "expected" errors like -ENOSPC.

> > > +	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
> > > +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> > > +	if (error)
> > > +		return error;
> > 
> > What's interesting about remove failure is that if we otherwise found an
> > iu for this inode, removing the inode from the unlinked list leaves a
> > stale/incorrect iu around in the in-core table. So it's one thing for
> > the in-core table to be a valid subset of the on-disk structures, but
> > another thing entirely to have incoherence between the two. It might be
> > worth pointing out that it's critical to return an error here so we
> > don't actually remove the inode (whereas the re-add below is less
> > strict).
> 
> Again, remove failure here sounds like a programming bug - so ASSERT
> and or forced shutdown here.

Remove failure already unconditionally returns the error (which in the
inactive path translates to a shutdown). My feedback above was just a
suggestion to explain why in a comment because the error handling is
intentionally different between table insert/remove/lookup.

Brian
Darrick J. Wong Feb. 8, 2019, 4:54 p.m. UTC | #5
On Fri, Feb 08, 2019 at 07:06:24AM -0500, Brian Foster wrote:
> On Fri, Feb 08, 2019 at 12:00:16AM -0800, Christoph Hellwig wrote:
> > > > +	if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
> > > > +		return 0;
> > > > +
> > > > +	iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
> > > > +	iu->iu_agino = prev_agino;
> > > > +	iu->iu_next_unlinked = this_agino;
> > > > +
> > > > +	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
> > > > +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> > > > +	if (error == 0 || error == -EEXIST)
> > > > +		return error;
> > > > +	return 0;
> > > 
> > > The return val looks funny at a glance: return -EEXIST or otherwise
> > > return 0 for success and all other errors (also the error == 0 looks
> > > superfluous). I see the comment above describes what this does, but it
> > > doesn't explain why. I'd move the comment to above the if check and
> > > explain why it's there (i.e., "Absorb all runtime errors besides -EEXIST
> > > because fallback blah blah. We care about -EEXIST because ...").
> > > 
> > > Also I assume we need to free the iu object on insert failure,
> > > regardless of whether we ultimately return the error.
> > 
> > But is this really the right thing to do?  Everything but ENOMEM
> > would be a bug in the rhashtable implementation, or the XFS use,
> > wouldn't it?
> > 
> 
> I think anything beyond -ENOMEM would imply a bug somewhere, yes.

AFAICT the only other error we can receive from rhashtable is -E2BIG.
That only happens if we somehow end up with more than 2^31 inodes (or
leave a logic bomb for ourselves by setting max_size, I guess...)

> > So why not drop the return value entirely and do a:
> > 
> > 	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
> > 			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> > 	ASSERT(error == 0 || error == -ENOMEM);

But if someone hits -EEXIST on a non-debug filesystem we'll never know
that our code was actually buggy, whereas returning it shuts down the
filesystem and we'll start getting letters.

> It's not clear to me whether you're suggesting we return 0, error or
> nothing at all here. The assert otherwise seems fine to me as I don't
> think we'd ever expect anything outside of success or -ENOMEM.

As I mentioned above, we could theoretically receive -E2BIG due to a
programming bug, or at some point the rhashtable code could start
returning new errors it hasn't before, or we could be misreading the
code too. :)

> That said, I don't see any reason to ever leak an iu if we know it
> didn't make it into the table.

Agreed.

> I could probably go either way on whether we wanted to allow the
> filesystem to continue or not on unexpected insert errors. The
> original comment was just that we probably shouldn't explode on
> "expected" errors like -ENOSPC.

Since we /do/ have a fallback to handle cache misses, I think it's fine
to ignore hashtable insertion failures, though I'll put in a ASSERT if
we see any error other than ENOMEM.

> 
> > > > +	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
> > > > +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> > > > +	if (error)
> > > > +		return error;
> > > 
> > > What's interesting about remove failure is that if we otherwise found an
> > > iu for this inode, removing the inode from the unlinked list leaves a
> > > stale/incorrect iu around in the in-core table. So it's one thing for
> > > the in-core table to be a valid subset of the on-disk structures, but
> > > another thing entirely to have incoherence between the two. It might be
> > > worth pointing out that it's critical to return an error here so we
> > > don't actually remove the inode (whereas the re-add below is less
> > > strict).
> > 
> > Again, remove failure here sounds like a programming bug - so ASSERT
> > and or forced shutdown here.
> 
> Remove failure already unconditionally returns the error (which in the
> inactive path translates to a shutdown). My feedback above was just a
> suggestion to explain why in a comment because the error handling is
> intentionally different between table insert/remove/lookup.

I assume you're ok with the comment that got added in the followup patch
I sent to this thread?

--D

> Brian
Christoph Hellwig Feb. 11, 2019, 8:08 a.m. UTC | #6
On Fri, Feb 08, 2019 at 07:06:24AM -0500, Brian Foster wrote:
> It's not clear to me whether you're suggesting we return 0, error or
> nothing at all here. The assert otherwise seems fine to me as I don't
> think we'd ever expect anything outside of success or -ENOMEM.

I'm arguing that we should return nothing.

> That said, I don't see any reason to ever leak an iu if we know it
> didn't make it into the table. I could probably go either way on whether
> we wanted to allow the filesystem to continue or not on unexpected
> insert errors. The original comment was just that we probably shouldn't
> explode on "expected" errors like -ENOSPC.

Well, IFF the only error case that should happen is either ENOMEM or
E2BIG we don't have an allocation in that case.  Everything else is
a programming bug where we should assert / shut the file system down,
in which case the tiny leak is the least of our problems.
Brian Foster Feb. 11, 2019, 12:21 p.m. UTC | #7
On Fri, Feb 08, 2019 at 08:54:14AM -0800, Darrick J. Wong wrote:
> On Fri, Feb 08, 2019 at 07:06:24AM -0500, Brian Foster wrote:
> > On Fri, Feb 08, 2019 at 12:00:16AM -0800, Christoph Hellwig wrote:
> > > > > +	if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
> > > > > +		return 0;
> > > > > +
> > > > > +	iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
> > > > > +	iu->iu_agino = prev_agino;
> > > > > +	iu->iu_next_unlinked = this_agino;
> > > > > +
> > > > > +	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
> > > > > +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> > > > > +	if (error == 0 || error == -EEXIST)
> > > > > +		return error;
> > > > > +	return 0;
> > > > 
> > > > The return val looks funny at a glance: return -EEXIST or otherwise
> > > > return 0 for success and all other errors (also the error == 0 looks
> > > > superfluous). I see the comment above describes what this does, but it
> > > > doesn't explain why. I'd move the comment to above the if check and
> > > > explain why it's there (i.e., "Absorb all runtime errors besides -EEXIST
> > > > because fallback blah blah. We care about -EEXIST because ...").
> > > > 
> > > > Also I assume we need to free the iu object on insert failure,
> > > > regardless of whether we ultimately return the error.
> > > 
> > > But is this really the right thing to do?  Everything but ENOMEM
> > > would be a bug in the rhashtable implementation, or the XFS use,
> > > wouldn't it?
> > > 
> > 
> > I think anything beyond -ENOMEM would imply a bug somewhere, yes.
> 
> AFAICT the only other error we can receive from rhashtable is -E2BIG.
> That only happens if we somehow end up with more than 2^31 inodes (or
> leave a logic bomb for ourselves by setting max_size, I guess...)
> 
> > > So why not drop the return value entirely and do a:
> > > 
> > > 	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
> > > 			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> > > 	ASSERT(error == 0 || error == -ENOMEM);
> 
> But if someone hits -EEXIST on a non-debug filesystem we'll never know
> that our code was actually buggy, whereas returning it shuts down the
> filesystem and we'll start getting letters.
> 
> > It's not clear to me whether you're suggesting we return 0, error or
> > nothing at all here. The assert otherwise seems fine to me as I don't
> > think we'd ever expect anything outside of success or -ENOMEM.
> 
> As I mentioned above, we could theoretically receive -E2BIG due to a
> programming bug, or at some point the rhashtable code could start
> returning new errors it hasn't before, or we could be misreading the
> code too. :)
> 

Good point. Despite all our reasoning about what errors we might or
might not expect, I think this comes down to the fact that the API
returns an error. It's probably not appropriate to ignore it entirely,
even with a fallback mechanism in place that allows us to not fail the
higher level operation.

> > That said, I don't see any reason to ever leak an iu if we know it
> > didn't make it into the table.
> 
> Agreed.
> 
> > I could probably go either way on whether we wanted to allow the
> > filesystem to continue or not on unexpected insert errors. The
> > original comment was just that we probably shouldn't explode on
> > "expected" errors like -ENOSPC.
> 
> Since we /do/ have a fallback to handle cache misses, I think it's fine
> to ignore hashtable insertion failures, though I'll put in a ASSERT if
> we see any error other than ENOMEM.
> 

Sounds good.

> > 
> > > > > +	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
> > > > > +			&iu->iu_rhash_head, xfs_iunlink_hash_params);
> > > > > +	if (error)
> > > > > +		return error;
> > > > 
> > > > What's interesting about remove failure is that if we otherwise found an
> > > > iu for this inode, removing the inode from the unlinked list leaves a
> > > > stale/incorrect iu around in the in-core table. So it's one thing for
> > > > the in-core table to be a valid subset of the on-disk structures, but
> > > > another thing entirely to have incoherence between the two. It might be
> > > > worth pointing out that it's critical to return an error here so we
> > > > don't actually remove the inode (whereas the re-add below is less
> > > > strict).
> > > 
> > > Again, remove failure here sounds like a programming bug - so ASSERT
> > > and or forced shutdown here.
> > 
> > Remove failure already unconditionally returns the error (which in the
> > inactive path translates to a shutdown). My feedback above was just a
> > suggestion to explain why in a comment because the error handling is
> > intentionally different between table insert/remove/lookup.
> 
> I assume you're ok with the comment that got added in the followup patch
> I sent to this thread?
> 

Yep, thanks.

Brian

> --D
> 
> > Brian
Brian Foster Feb. 11, 2019, 12:21 p.m. UTC | #8
On Mon, Feb 11, 2019 at 12:08:31AM -0800, Christoph Hellwig wrote:
> On Fri, Feb 08, 2019 at 07:06:24AM -0500, Brian Foster wrote:
> > It's not clear to me whether you're suggesting we return 0, error or
> > nothing at all here. The assert otherwise seems fine to me as I don't
> > think we'd ever expect anything outside of success or -ENOMEM.
> 
> I'm arguing that we should return nothing.
> 
> > That said, I don't see any reason to ever leak an iu if we know it
> > didn't make it into the table. I could probably go either way on whether
> > we wanted to allow the filesystem to continue or not on unexpected
> > insert errors. The original comment was just that we probably shouldn't
> > explode on "expected" errors like -ENOSPC.
> 
> Well, IFF the only error case that should happen is either ENOMEM or
> E2BIG we don't have an allocation in that case.  Everything else is
> a programming bug where we should assert / shut the file system down,
> in which case the tiny leak is the least of our problems.

The caller allocated the object we're trying to insert. From the earlier
patch in this thread:

+       iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
+       iu->iu_agino = prev_agino;
+       iu->iu_next_unlinked = this_agino;
+
+       error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+                       &iu->iu_rhash_head, xfs_iunlink_hash_params);

Brian

Patch
diff mbox series

diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 66077a105cbb..79e6c4fb1d8a 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -54,7 +54,8 @@ 
 #define XFS_ERRTAG_BUF_LRU_REF				31
 #define XFS_ERRTAG_FORCE_SCRUB_REPAIR			32
 #define XFS_ERRTAG_FORCE_SUMMARY_RECALC			33
-#define XFS_ERRTAG_MAX					34
+#define XFS_ERRTAG_IUNLINK_FALLBACK			34
+#define XFS_ERRTAG_MAX					35
 
 /*
  * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -93,5 +94,6 @@ 
 #define XFS_RANDOM_BUF_LRU_REF				2
 #define XFS_RANDOM_FORCE_SCRUB_REPAIR			1
 #define XFS_RANDOM_FORCE_SUMMARY_RECALC			1
+#define XFS_RANDOM_IUNLINK_FALLBACK			(XFS_RANDOM_DEFAULT/10)
 
 #endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 9866f542e77b..cc77c11bd0d4 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -51,6 +51,7 @@  static unsigned int xfs_errortag_random_default[] = {
 	XFS_RANDOM_BUF_LRU_REF,
 	XFS_RANDOM_FORCE_SCRUB_REPAIR,
 	XFS_RANDOM_FORCE_SUMMARY_RECALC,
+	XFS_RANDOM_IUNLINK_FALLBACK,
 };
 
 struct xfs_errortag_attr {
@@ -159,6 +160,7 @@  XFS_ERRORTAG_ATTR_RW(log_item_pin,	XFS_ERRTAG_LOG_ITEM_PIN);
 XFS_ERRORTAG_ATTR_RW(buf_lru_ref,	XFS_ERRTAG_BUF_LRU_REF);
 XFS_ERRORTAG_ATTR_RW(force_repair,	XFS_ERRTAG_FORCE_SCRUB_REPAIR);
 XFS_ERRORTAG_ATTR_RW(bad_summary,	XFS_ERRTAG_FORCE_SUMMARY_RECALC);
+XFS_ERRORTAG_ATTR_RW(iunlink_fallback,	XFS_ERRTAG_IUNLINK_FALLBACK);
 
 static struct attribute *xfs_errortag_attrs[] = {
 	XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -195,6 +197,7 @@  static struct attribute *xfs_errortag_attrs[] = {
 	XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
 	XFS_ERRORTAG_ATTR_LIST(force_repair),
 	XFS_ERRORTAG_ATTR_LIST(bad_summary),
+	XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
 	NULL,
 };
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 53a9c8c26d18..fdfcb3a9bac7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1906,6 +1906,194 @@  xfs_inactive(
 	xfs_qm_dqdetach(ip);
 }
 
+/*
+ * In-Core Unlinked List Lookups
+ * =============================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory.  Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain.  This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * What if we modelled the unlinked list as a collection of records capturing
+ * "X.next_unlinked = Y" relations?  If we indexed those records on Y, we'd
+ * have a fast way to look up unlinked list predecessors, which avoids the
+ * slow list walk.  That's exactly what we do here (in-core) with a per-AG
+ * rhashtable.
+ *
+ * Because this is a backref cache, we ignore operational failures since the
+ * iunlink code can fall back to the slow bucket walk.  The only errors that
+ * should bubble out are for obviously incorrect situations.
+ *
+ * All users of the backref cache MUST hold the AGI buffer lock to serialize
+ * access or have otherwise provided for concurrency control.
+ */
+
+/* Capture a "X.next_unlinked = Y" relationship. */
+struct xfs_iunlink {
+	struct rhash_head	iu_rhash_head;
+	xfs_agino_t		iu_agino;		/* X */
+	xfs_agino_t		iu_next_unlinked;	/* Y */
+};
+
+/* Unlinked list predecessor lookup hashtable construction */
+static int
+xfs_iunlink_obj_cmpfn(
+	struct rhashtable_compare_arg	*arg,
+	const void			*obj)
+{
+	const xfs_agino_t		*key = arg->key;
+	const struct xfs_iunlink	*iu = obj;
+
+	if (iu->iu_next_unlinked != *key)
+		return 1;
+	return 0;
+}
+
+static const struct rhashtable_params xfs_iunlink_hash_params = {
+	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
+	.key_len		= sizeof(xfs_agino_t),
+	.key_offset		= offsetof(struct xfs_iunlink,
+					   iu_next_unlinked),
+	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
+	.automatic_shrinking	= true,
+	.obj_cmpfn		= xfs_iunlink_obj_cmpfn,
+};
+
+/*
+ * Return X, where X.next_unlinked == @agino.  Returns NULLAGINO if no such
+ * relation is found.
+ */
+xfs_agino_t
+xfs_iunlink_lookup_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		agino)
+{
+	struct xfs_iunlink	*iu;
+
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+			xfs_iunlink_hash_params);
+	return iu ? iu->iu_agino : NULLAGINO;
+}
+
+/*
+ * Remember that @prev_agino.next_unlinked = @this_agino.  Fail loudly if there
+ * already was an entry, but absorb any other runtime errors.
+ */
+int
+xfs_iunlink_add_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		prev_agino,
+	xfs_agino_t		this_agino)
+{
+	struct xfs_iunlink	*iu;
+	int			error;
+
+	if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
+		return 0;
+
+	iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
+	iu->iu_agino = prev_agino;
+	iu->iu_next_unlinked = this_agino;
+
+	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+	if (error == 0 || error == -EEXIST)
+		return error;
+	return 0;
+}
+
+/*
+ * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
+ * If @next_unlinked is NULLAGINO, we drop the backref and exit.  If there
+ * wasn't any such entry then we don't bother.
+ */
+int
+xfs_iunlink_change_backref(
+	struct xfs_perag	*pag,
+	xfs_agino_t		agino,
+	xfs_agino_t		next_unlinked)
+{
+	struct xfs_iunlink	*iu;
+	int			error;
+
+	/* Look up the old entry; if there wasn't one then exit. */
+	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+			xfs_iunlink_hash_params);
+	if (!iu)
+		return 0;
+
+	/*
+	 * Remove the entry.  This shouldn't ever return an error, but if we
+	 * couldn't remove the old entry we don't want to add it again to the
+	 * hash table, and if the entry disappeared on us then someone's
+	 * violated the locking rules and we need to fail loudly.
+	 */
+	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+	if (error)
+		return error;
+
+	/* If there is no new next entry just free our item and return. */
+	if (next_unlinked == NULLAGINO) {
+		kmem_free(iu);
+		return 0;
+	}
+
+	/*
+	 * Update the hash table to the new entry, ignoring operational
+	 * errors.
+	 */
+	iu->iu_next_unlinked = next_unlinked;
+	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+			&iu->iu_rhash_head, xfs_iunlink_hash_params);
+	if (error == 0 || error == -EEXIST)
+		return error;
+	return 0;
+}
+
+/* Set up the in-core predecessor structures. */
+int
+xfs_iunlink_init(
+	struct xfs_perag	*pag)
+{
+	return rhashtable_init(&pag->pagi_unlinked_hash,
+			&xfs_iunlink_hash_params);
+}
+
+/* Free the in-core predecessor structures. */
+static void
+xfs_iunlink_free_item(
+	void			*ptr,
+	void			*arg)
+{
+	struct xfs_iunlink	*iu = ptr;
+	bool			*freed_anything = arg;
+
+	*freed_anything = true;
+	kmem_free(iu);
+}
+
+void
+xfs_iunlink_destroy(
+	struct xfs_perag	*pag)
+{
+	bool			freed_anything = false;
+
+	rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
+			xfs_iunlink_free_item, &freed_anything);
+
+	ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+}
+
 /*
  * Point the AGI unlinked bucket at an inode and log the results.  The caller
  * is responsible for validating the old value.
@@ -2066,7 +2254,8 @@  xfs_iunlink(
 		return -EFSCORRUPTED;
 
 	if (next_agino != NULLAGINO) {
-		xfs_agino_t	old_agino;
+		struct xfs_perag	*pag;
+		xfs_agino_t		old_agino;
 
 		/*
 		 * There is already another inode in the bucket, so point this
@@ -2077,6 +2266,16 @@  xfs_iunlink(
 		if (error)
 			return error;
 		ASSERT(old_agino == NULLAGINO);
+
+		/*
+		 * agino has been unlinked, add a backref from the next inode
+		 * back to agino.
+		 */
+		pag = xfs_perag_get(mp, agno);
+		error = xfs_iunlink_add_backref(pag, agino, next_agino);
+		xfs_perag_put(pag);
+		if (error)
+			return error;
 	}
 
 	/* Point the head of the list to point to this inode. */
@@ -2133,7 +2332,8 @@  xfs_iunlink_map_prev(
 	xfs_agino_t		*agino,
 	struct xfs_imap		*imap,
 	struct xfs_dinode	**dipp,
-	struct xfs_buf		**bpp)
+	struct xfs_buf		**bpp,
+	struct xfs_perag	*pag)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	xfs_agino_t		next_agino;
@@ -2141,6 +2341,27 @@  xfs_iunlink_map_prev(
 
 	ASSERT(head_agino != target_agino);
 
+	/* See if our backref cache can find it faster. */
+	*agino = xfs_iunlink_lookup_backref(pag, target_agino);
+	if (*agino != NULLAGINO) {
+		error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
+		if (error)
+			return error;
+
+		if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
+			return 0;
+
+		/*
+		 * If we get here the cache contents were corrupt, so drop the
+		 * buffer and fall back to walking the bucket list.
+		 */
+		xfs_trans_brelse(tp, *bpp);
+		*bpp = NULL;
+	}
+
+	trace_xfs_iunlink_map_prev_fallback(mp, agno);
+
+	/* Otherwise, walk the entire bucket until we find it. */
 	next_agino = head_agino;
 	while (next_agino != target_agino) {
 		xfs_agino_t	unlinked_agino;
@@ -2186,6 +2407,7 @@  xfs_iunlink_remove(
 	struct xfs_buf		*agibp;
 	struct xfs_buf		*last_ibp = NULL;
 	struct xfs_dinode	*last_dip = NULL;
+	struct xfs_perag	*pag = NULL;
 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 	xfs_agino_t		next_agino;
@@ -2221,27 +2443,62 @@  xfs_iunlink_remove(
 	if (error)
 		return error;
 
+	/*
+	 * If there was a backref pointing from the next inode back to this
+	 * one, remove it because we've removed this inode from the list.
+	 *
+	 * Later, if this inode was in the middle of the list we'll update
+	 * this inode's backref to point from the next inode.
+	 */
+	if (next_agino != NULLAGINO) {
+		pag = xfs_perag_get(mp, agno);
+		error = xfs_iunlink_change_backref(pag, next_agino,
+				NULLAGINO);
+		if (error)
+			goto out;
+	}
+
 	if (head_agino == agino) {
 		/* Point the head of the list to the next unlinked inode. */
 		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
 				next_agino);
 		if (error)
-			return error;
+			goto out;
 	} else {
 		struct xfs_imap	imap;
 		xfs_agino_t	prev_agino;
 
+		if (!pag)
+			pag = xfs_perag_get(mp, agno);
+
 		/* We need to search the list for the inode being freed. */
 		error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
-				&prev_agino, &imap, &last_dip, &last_ibp);
+				&prev_agino, &imap, &last_dip, &last_ibp,
+				pag);
 		if (error)
-			return error;
+			goto out;
 
 		/* Point the previous inode on the list to the next inode. */
 		xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
 				last_dip, &imap, next_agino);
+
+		/*
+		 * Now we deal with the backref for this inode.  If this inode
+		 * pointed at a real inode, change the backref that pointed to
+		 * us to point to our old next.  If this inode was the end of
+		 * the list, delete the backref that pointed to us.  Note that
+		 * change_backref takes care of deleting the backref if
+		 * next_agino is NULLAGINO.
+		 */
+		error = xfs_iunlink_change_backref(pag, agino, next_agino);
+		if (error)
+			goto out;
 	}
-	return 0;
+
+out:
+	if (pag)
+		xfs_perag_put(pag);
+	return error;
 }
 
 /*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index be2014520155..e62074a5257c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -500,4 +500,7 @@  extern struct kmem_zone	*xfs_inode_zone;
 
 bool xfs_inode_verify_forks(struct xfs_inode *ip);
 
+int xfs_iunlink_init(struct xfs_perag *pag);
+void xfs_iunlink_destroy(struct xfs_perag *pag);
+
 #endif	/* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b4d8c318be3c..fd63b0b1307c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -149,6 +149,7 @@  xfs_free_perag(
 		spin_unlock(&mp->m_perag_lock);
 		ASSERT(pag);
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
+		xfs_iunlink_destroy(pag);
 		xfs_buf_hash_destroy(pag);
 		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -227,6 +228,9 @@  xfs_initialize_perag(
 		/* first new pag is fully initialized */
 		if (first_initialised == NULLAGNUMBER)
 			first_initialised = index;
+		error = xfs_iunlink_init(pag);
+		if (error)
+			goto out_hash_destroy;
 	}
 
 	index = xfs_set_inode_alloc(mp, agcount);
@@ -249,6 +253,7 @@  xfs_initialize_perag(
 		if (!pag)
 			break;
 		xfs_buf_hash_destroy(pag);
+		xfs_iunlink_destroy(pag);
 		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		kmem_free(pag);
 	}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7daafe064af8..a33f45077867 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -396,6 +396,13 @@  typedef struct xfs_perag {
 
 	/* reference count */
 	uint8_t			pagf_refcount_level;
+
+	/*
+	 * Unlinked inode information.  This incore information reflects
+	 * data stored in the AGI, so callers must hold the AGI buffer lock
+	 * or have some other means to control concurrency.
+	 */
+	struct rhashtable	pagi_unlinked_hash;
 } xfs_perag_t;
 
 static inline struct xfs_ag_resv *
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a6e384a642b1..c83ce022a355 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3447,6 +3447,7 @@  DEFINE_EVENT(xfs_ag_inode_class, name, \
 	TP_ARGS(ip))
 DEFINE_AGINODE_EVENT(xfs_iunlink);
 DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
+DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
 
 #endif /* _TRACE_XFS_H */