
[06/13] xfs: replace iunlink backref lookups with list lookups

Message ID 20200812092556.2567285-7-david@fromorbit.com (mailing list archive)
State Accepted
Series xfs: in memory inode unlink log items

Commit Message

Dave Chinner Aug. 12, 2020, 9:25 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Now that we have an in-memory linked list of all the inodes on the
unlinked list, use it to look up the inodes we need to modify when
adding to or removing from the list.

This means we no longer use the backref cache to maintain the
previous inode lookups, so all of that infrastructure can now be
removed.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_error.c |   2 -
 fs/xfs/xfs_inode.c | 327 ++++++++-------------------------------------
 fs/xfs/xfs_inode.h |   3 -
 fs/xfs/xfs_mount.c |   5 -
 fs/xfs/xfs_trace.h |   1 -
 5 files changed, 54 insertions(+), 284 deletions(-)
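
For orientation, the heart of the change is that finding the previous inode
on an unlinked list no longer goes through a per-AG rhashtable of
"X.next_unlinked = Y" backrefs; it is just one step backwards in an
in-memory doubly linked list. A minimal sketch of that lookup, assuming the
i_unlink list_head and the per-AG pag_ici_unlink_list that this series
introduces (the helper name below is illustrative only, not part of the
patch):

/*
 * Illustrative sketch: return the AG inode number of the inode that
 * precedes @ip on the per-AG unlinked list, or NULLAGINO if @ip is at
 * the head and the AGI bucket points at it directly.
 */
static xfs_agino_t
example_iunlink_prev_agino(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = pag->pag_mount;

	/* At the head of the list, the AGI bucket is the "previous" pointer. */
	if (ip == list_first_entry(&pag->pag_ici_unlink_list,
				   struct xfs_inode, i_unlink))
		return NULLAGINO;

	/* Otherwise the predecessor is simply the previous list entry. */
	return XFS_INO_TO_AGINO(mp, list_prev_entry(ip, i_unlink)->i_ino);
}

This mirrors the split xfs_iunlink_remove_inode() makes in the hunks below:
the head case updates the AGI bucket, and every other case uses
list_prev_entry() to find the on-disk inode whose di_next_unlinked needs
rewriting.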

Comments

Darrick J. Wong Aug. 19, 2020, 12:13 a.m. UTC | #1
On Wed, Aug 12, 2020 at 07:25:49PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Now that we have an in-memory linked list of all the inodes on the
> unlinked list, use it to look up the inodes we need to modify when
> adding to or removing from the list.
> 
> This means we no longer use the backref cache to maintain the
> previous inode lookups, so all of that infrastructure can now be
> removed.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

....

> @@ -2354,70 +2186,24 @@ xfs_iunlink_map_prev(
>  	xfs_agnumber_t		agno,
>  	xfs_agino_t		head_agino,
>  	xfs_agino_t		target_agino,
> -	xfs_agino_t		*agino,
> +	xfs_agino_t		agino,
>  	struct xfs_imap		*imap,
>  	struct xfs_dinode	**dipp,
>  	struct xfs_buf		**bpp,
>  	struct xfs_perag	*pag)
>  {
> -	struct xfs_mount	*mp = tp->t_mountp;
> -	xfs_agino_t		next_agino;
>  	int			error;
>  
>  	ASSERT(head_agino != target_agino);
>  	*bpp = NULL;
>  
> -	/* See if our backref cache can find it faster. */
> -	*agino = xfs_iunlink_lookup_backref(pag, target_agino);
> -	if (*agino != NULLAGINO) {
> -		error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
> -		if (error)
> -			return error;
> -
> -		if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
> -			return 0;
> -
> -		/*
> -		 * If we get here the cache contents were corrupt, so drop the
> -		 * buffer and fall back to walking the bucket list.
> -		 */
> -		xfs_trans_brelse(tp, *bpp);
> -		*bpp = NULL;
> -		WARN_ON_ONCE(1);
> -	}
> -
> -	trace_xfs_iunlink_map_prev_fallback(mp, agno);
> -
> -	/* Otherwise, walk the entire bucket until we find it. */
> -	next_agino = head_agino;
> -	while (next_agino != target_agino) {
> -		xfs_agino_t	unlinked_agino;
> -
> -		if (*bpp)
> -			xfs_trans_brelse(tp, *bpp);
> -
> -		*agino = next_agino;
> -		error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
> -				bpp);
> -		if (error)
> -			return error;
> -
> -		unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
> -		/*
> -		 * Make sure this pointer is valid and isn't an obvious
> -		 * infinite loop.
> -		 */
> -		if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
> -		    next_agino == unlinked_agino) {
> -			XFS_CORRUPTION_ERROR(__func__,
> -					XFS_ERRLEVEL_LOW, mp,
> -					*dipp, sizeof(**dipp));
> -			error = -EFSCORRUPTED;
> -			return error;
> -		}
> -		next_agino = unlinked_agino;
> -	}
> +	ASSERT(agino != NULLAGINO);
> +	error = xfs_iunlink_map_ino(tp, agno, agino, imap, dipp, bpp);
> +	if (error)
> +		return error;
>  
> +	if (be32_to_cpu((*dipp)->di_next_unlinked) != target_agino)
> +		return -EFSCORRUPTED;

Why drop the corruption report here?

--D

Dave Chinner Aug. 19, 2020, 12:52 a.m. UTC | #2
On Tue, Aug 18, 2020 at 05:13:22PM -0700, Darrick J. Wong wrote:
> On Wed, Aug 12, 2020 at 07:25:49PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Now that we have an in-memory linked list of all the inodes on the
> > unlinked list, use it to look up the inodes we need to modify when
> > adding to or removing from the list.
> > 
> > This means we no longer use the backref cache to maintain the
> > previous inode lookups, so all of that infrastructure can now be
> > removed.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>

....

> > @@ -2354,70 +2186,24 @@ xfs_iunlink_map_prev(
> >  	xfs_agnumber_t		agno,
> >  	xfs_agino_t		head_agino,
> >  	xfs_agino_t		target_agino,
> > -	xfs_agino_t		*agino,
> > +	xfs_agino_t		agino,
> >  	struct xfs_imap		*imap,
> >  	struct xfs_dinode	**dipp,
> >  	struct xfs_buf		**bpp,
> >  	struct xfs_perag	*pag)
> >  {
> > -	struct xfs_mount	*mp = tp->t_mountp;
> > -	xfs_agino_t		next_agino;
> >  	int			error;
> >  
> >  	ASSERT(head_agino != target_agino);
> >  	*bpp = NULL;
> >  
> > -	/* See if our backref cache can find it faster. */
> > -	*agino = xfs_iunlink_lookup_backref(pag, target_agino);
> > -	if (*agino != NULLAGINO) {
> > -		error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
> > -		if (error)
> > -			return error;
> > -
> > -		if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
> > -			return 0;
> > -
> > -		/*
> > -		 * If we get here the cache contents were corrupt, so drop the
> > -		 * buffer and fall back to walking the bucket list.
> > -		 */
> > -		xfs_trans_brelse(tp, *bpp);
> > -		*bpp = NULL;
> > -		WARN_ON_ONCE(1);
> > -	}
> > -
> > -	trace_xfs_iunlink_map_prev_fallback(mp, agno);
> > -
> > -	/* Otherwise, walk the entire bucket until we find it. */
> > -	next_agino = head_agino;
> > -	while (next_agino != target_agino) {
> > -		xfs_agino_t	unlinked_agino;
> > -
> > -		if (*bpp)
> > -			xfs_trans_brelse(tp, *bpp);
> > -
> > -		*agino = next_agino;
> > -		error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
> > -				bpp);
> > -		if (error)
> > -			return error;
> > -
> > -		unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
> > -		/*
> > -		 * Make sure this pointer is valid and isn't an obvious
> > -		 * infinite loop.
> > -		 */
> > -		if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
> > -		    next_agino == unlinked_agino) {
> > -			XFS_CORRUPTION_ERROR(__func__,
> > -					XFS_ERRLEVEL_LOW, mp,
> > -					*dipp, sizeof(**dipp));
> > -			error = -EFSCORRUPTED;
> > -			return error;
> > -		}
> > -		next_agino = unlinked_agino;
> > -	}
> > +	ASSERT(agino != NULLAGINO);
> > +	error = xfs_iunlink_map_ino(tp, agno, agino, imap, dipp, bpp);
> > +	if (error)
> > +		return error;
> >  
> > +	if (be32_to_cpu((*dipp)->di_next_unlinked) != target_agino)
> > +		return -EFSCORRUPTED;
> 
> Why drop the corruption report here?

For simplicity of refactoring. The corruption report comes back later
in the series when this check is recombined with another similar
check; see xfs_iunlink_log_inode() at the end of the series....

Cheers,

Dave.
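
To make the answer above concrete: the recombined check is not part of this
patch, but reinstating the report at this return would look roughly like the
following, reusing the same XFS_CORRUPTION_ERROR call that the removed
bucket walk issued. This is a sketch only, not the code that lands later in
the series.

	/*
	 * Sketch only: report the corruption before failing, as the
	 * removed bucket walk did, instead of returning -EFSCORRUPTED
	 * silently.
	 */
	if (be32_to_cpu((*dipp)->di_next_unlinked) != target_agino) {
		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
				tp->t_mountp, *dipp, sizeof(**dipp));
		return -EFSCORRUPTED;
	}
	return 0;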

Patch

diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 7f6e20899473..829a89418830 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -162,7 +162,6 @@  XFS_ERRORTAG_ATTR_RW(log_item_pin,	XFS_ERRTAG_LOG_ITEM_PIN);
 XFS_ERRORTAG_ATTR_RW(buf_lru_ref,	XFS_ERRTAG_BUF_LRU_REF);
 XFS_ERRORTAG_ATTR_RW(force_repair,	XFS_ERRTAG_FORCE_SCRUB_REPAIR);
 XFS_ERRORTAG_ATTR_RW(bad_summary,	XFS_ERRTAG_FORCE_SUMMARY_RECALC);
-XFS_ERRORTAG_ATTR_RW(iunlink_fallback,	XFS_ERRTAG_IUNLINK_FALLBACK);
 XFS_ERRORTAG_ATTR_RW(buf_ioerror,	XFS_ERRTAG_BUF_IOERROR);
 
 static struct attribute *xfs_errortag_attrs[] = {
@@ -200,7 +199,6 @@  static struct attribute *xfs_errortag_attrs[] = {
 	XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
 	XFS_ERRORTAG_ATTR_LIST(force_repair),
 	XFS_ERRORTAG_ATTR_LIST(bad_summary),
-	XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
 	XFS_ERRORTAG_ATTR_LIST(buf_ioerror),
 	NULL,
 };
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index dcf80ac51267..2c930de99561 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1893,196 +1893,29 @@  xfs_inactive(
  * because we must walk that list to find the inode that points to the inode
  * being removed from the unlinked hash bucket list.
  *
- * What if we modelled the unlinked list as a collection of records capturing
- * "X.next_unlinked = Y" relations?  If we indexed those records on Y, we'd
- * have a fast way to look up unlinked list predecessors, which avoids the
- * slow list walk.  That's exactly what we do here (in-core) with a per-AG
- * rhashtable.
+ * However, inodes that are on the unlinked list are also guaranteed to be in
+ * memory as they are loaded and then pinned in memory by whatever holds
+ * references to the inode to perform the unlink. Same goes for the O_TMPFILE
+ * usage of the unlinked list - those files are pinned in memory by an open file
+ * descriptor. Hence the inodes on the list are pinned in memory until they are
+ * removed from the list.
  *
- * Because this is a backref cache, we ignore operational failures since the
- * iunlink code can fall back to the slow bucket walk.  The only errors that
- * should bubble out are for obviously incorrect situations.
+ * That means we can simply use an in-memory double linked list to track inodes
+ * on the unlinked list. As we've removed the scalability problem resulting from
+ * removal on a single linked list requiring traversal, we also no longer use
+ * the on-disk hash to keep traversals short. We just use a single list on disk
+ * now, and track the previous inode in the list in memory.
  *
- * All users of the backref cache MUST hold the AGI buffer lock to serialize
- * access or have otherwise provided for concurrency control.
- */
-
-/* Capture a "X.next_unlinked = Y" relationship. */
-struct xfs_iunlink {
-	struct rhash_head	iu_rhash_head;
-	xfs_agino_t		iu_agino;		/* X */
-	xfs_agino_t		iu_next_unlinked;	/* Y */
-};
-
-/* Unlinked list predecessor lookup hashtable construction */
-static int
-xfs_iunlink_obj_cmpfn(
-	struct rhashtable_compare_arg	*arg,
-	const void			*obj)
-{
-	const xfs_agino_t		*key = arg->key;
-	const struct xfs_iunlink	*iu = obj;
-
-	if (iu->iu_next_unlinked != *key)
-		return 1;
-	return 0;
-}
-
-static const struct rhashtable_params xfs_iunlink_hash_params = {
-	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
-	.key_len		= sizeof(xfs_agino_t),
-	.key_offset		= offsetof(struct xfs_iunlink,
-					   iu_next_unlinked),
-	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
-	.automatic_shrinking	= true,
-	.obj_cmpfn		= xfs_iunlink_obj_cmpfn,
-};
-
-/*
- * Return X, where X.next_unlinked == @agino.  Returns NULLAGINO if no such
- * relation is found.
+ * To provide the guarantee that inodes are always on this in memory list, log
+ * recovery does what is necessary to populate the list sufficient to perform
+ * removal from the head of the list correctly. As such, we can now always rely
+ * on the in-memory list and if it differs from what we find on disk then we
+ * have a memory corruption problem or a software bug and so mismatches are now
+ * considered EFSCORRUPTION errors and are not recoverable.
+ *
+ * All users of the unlinked list MUST hold the AGI buffer lock to serialize
+ * access to the list.
  */
-static xfs_agino_t
-xfs_iunlink_lookup_backref(
-	struct xfs_perag	*pag,
-	xfs_agino_t		agino)
-{
-	struct xfs_iunlink	*iu;
-
-	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
-			xfs_iunlink_hash_params);
-	return iu ? iu->iu_agino : NULLAGINO;
-}
-
-/*
- * Take ownership of an iunlink cache entry and insert it into the hash table.
- * If successful, the entry will be owned by the cache; if not, it is freed.
- * Either way, the caller does not own @iu after this call.
- */
-static int
-xfs_iunlink_insert_backref(
-	struct xfs_perag	*pag,
-	struct xfs_iunlink	*iu)
-{
-	int			error;
-
-	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
-			&iu->iu_rhash_head, xfs_iunlink_hash_params);
-	/*
-	 * Fail loudly if there already was an entry because that's a sign of
-	 * corruption of in-memory data.  Also fail loudly if we see an error
-	 * code we didn't anticipate from the rhashtable code.  Currently we
-	 * only anticipate ENOMEM.
-	 */
-	if (error) {
-		WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
-		kmem_free(iu);
-	}
-	/*
-	 * Absorb any runtime errors that aren't a result of corruption because
-	 * this is a cache and we can always fall back to bucket list scanning.
-	 */
-	if (error != 0 && error != -EEXIST)
-		error = 0;
-	return error;
-}
-
-/* Remember that @prev_agino.next_unlinked = @this_agino. */
-static int
-xfs_iunlink_add_backref(
-	struct xfs_perag	*pag,
-	xfs_agino_t		prev_agino,
-	xfs_agino_t		this_agino)
-{
-	struct xfs_iunlink	*iu;
-
-	if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
-		return 0;
-
-	iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
-	iu->iu_agino = prev_agino;
-	iu->iu_next_unlinked = this_agino;
-
-	return xfs_iunlink_insert_backref(pag, iu);
-}
-
-/*
- * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
- * If @next_unlinked is NULLAGINO, we drop the backref and exit.  If there
- * wasn't any such entry then we don't bother.
- */
-static int
-xfs_iunlink_change_backref(
-	struct xfs_perag	*pag,
-	xfs_agino_t		agino,
-	xfs_agino_t		next_unlinked)
-{
-	struct xfs_iunlink	*iu;
-	int			error;
-
-	/* Look up the old entry; if there wasn't one then exit. */
-	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
-			xfs_iunlink_hash_params);
-	if (!iu)
-		return 0;
-
-	/*
-	 * Remove the entry.  This shouldn't ever return an error, but if we
-	 * couldn't remove the old entry we don't want to add it again to the
-	 * hash table, and if the entry disappeared on us then someone's
-	 * violated the locking rules and we need to fail loudly.  Either way
-	 * we cannot remove the inode because internal state is or would have
-	 * been corrupt.
-	 */
-	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
-			&iu->iu_rhash_head, xfs_iunlink_hash_params);
-	if (error)
-		return error;
-
-	/* If there is no new next entry just free our item and return. */
-	if (next_unlinked == NULLAGINO) {
-		kmem_free(iu);
-		return 0;
-	}
-
-	/* Update the entry and re-add it to the hash table. */
-	iu->iu_next_unlinked = next_unlinked;
-	return xfs_iunlink_insert_backref(pag, iu);
-}
-
-/* Set up the in-core predecessor structures. */
-int
-xfs_iunlink_init(
-	struct xfs_perag	*pag)
-{
-	return rhashtable_init(&pag->pagi_unlinked_hash,
-			&xfs_iunlink_hash_params);
-}
-
-/* Free the in-core predecessor structures. */
-static void
-xfs_iunlink_free_item(
-	void			*ptr,
-	void			*arg)
-{
-	struct xfs_iunlink	*iu = ptr;
-	bool			*freed_anything = arg;
-
-	*freed_anything = true;
-	kmem_free(iu);
-}
-
-void
-xfs_iunlink_destroy(
-	struct xfs_perag	*pag)
-{
-	bool			freed_anything = false;
-
-	rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
-			xfs_iunlink_free_item, &freed_anything);
-
-	ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
-}
 
 /*
  * Point the AGI unlinked bucket at an inode and log the results.  The caller
@@ -2221,6 +2054,7 @@  xfs_iunlink_insert_inode(
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_agi		*agi;
+	struct xfs_inode	*nip;
 	xfs_agino_t		next_agino;
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
@@ -2242,9 +2076,13 @@  xfs_iunlink_insert_inode(
 		return -EFSCORRUPTED;
 	}
 
-	if (next_agino != NULLAGINO) {
+	nip = list_first_entry_or_null(&agibp->b_pag->pag_ici_unlink_list,
+					struct xfs_inode, i_unlink);
+	if (nip) {
 		xfs_agino_t		old_agino;
 
+		ASSERT(next_agino == XFS_INO_TO_AGINO(mp, nip->i_ino));
+
 		/*
 		 * There is already another inode in the bucket, so point this
 		 * inode to the current head of the list.
@@ -2254,14 +2092,8 @@  xfs_iunlink_insert_inode(
 		if (error)
 			return error;
 		ASSERT(old_agino == NULLAGINO);
-
-		/*
-		 * agino has been unlinked, add a backref from the next inode
-		 * back to agino.
-		 */
-		error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino);
-		if (error)
-			return error;
+	} else {
+		ASSERT(next_agino == NULLAGINO);
 	}
 
 	/* Point the head of the list to point to this inode. */
@@ -2354,70 +2186,24 @@  xfs_iunlink_map_prev(
 	xfs_agnumber_t		agno,
 	xfs_agino_t		head_agino,
 	xfs_agino_t		target_agino,
-	xfs_agino_t		*agino,
+	xfs_agino_t		agino,
 	struct xfs_imap		*imap,
 	struct xfs_dinode	**dipp,
 	struct xfs_buf		**bpp,
 	struct xfs_perag	*pag)
 {
-	struct xfs_mount	*mp = tp->t_mountp;
-	xfs_agino_t		next_agino;
 	int			error;
 
 	ASSERT(head_agino != target_agino);
 	*bpp = NULL;
 
-	/* See if our backref cache can find it faster. */
-	*agino = xfs_iunlink_lookup_backref(pag, target_agino);
-	if (*agino != NULLAGINO) {
-		error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
-		if (error)
-			return error;
-
-		if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
-			return 0;
-
-		/*
-		 * If we get here the cache contents were corrupt, so drop the
-		 * buffer and fall back to walking the bucket list.
-		 */
-		xfs_trans_brelse(tp, *bpp);
-		*bpp = NULL;
-		WARN_ON_ONCE(1);
-	}
-
-	trace_xfs_iunlink_map_prev_fallback(mp, agno);
-
-	/* Otherwise, walk the entire bucket until we find it. */
-	next_agino = head_agino;
-	while (next_agino != target_agino) {
-		xfs_agino_t	unlinked_agino;
-
-		if (*bpp)
-			xfs_trans_brelse(tp, *bpp);
-
-		*agino = next_agino;
-		error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
-				bpp);
-		if (error)
-			return error;
-
-		unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
-		/*
-		 * Make sure this pointer is valid and isn't an obvious
-		 * infinite loop.
-		 */
-		if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
-		    next_agino == unlinked_agino) {
-			XFS_CORRUPTION_ERROR(__func__,
-					XFS_ERRLEVEL_LOW, mp,
-					*dipp, sizeof(**dipp));
-			error = -EFSCORRUPTED;
-			return error;
-		}
-		next_agino = unlinked_agino;
-	}
+	ASSERT(agino != NULLAGINO);
+	error = xfs_iunlink_map_ino(tp, agno, agino, imap, dipp, bpp);
+	if (error)
+		return error;
 
+	if (be32_to_cpu((*dipp)->di_next_unlinked) != target_agino)
+		return -EFSCORRUPTED;
 	return 0;
 }
 
@@ -2461,27 +2247,31 @@  xfs_iunlink_remove_inode(
 	if (error)
 		return error;
 
-	/*
-	 * If there was a backref pointing from the next inode back to this
-	 * one, remove it because we've removed this inode from the list.
-	 *
-	 * Later, if this inode was in the middle of the list we'll update
-	 * this inode's backref to point from the next inode.
-	 */
-	if (next_agino != NULLAGINO) {
-		error = xfs_iunlink_change_backref(agibp->b_pag, next_agino,
-				NULLAGINO);
-		if (error)
-			return error;
+#ifdef DEBUG
+	{
+	struct xfs_inode *nip = list_next_entry(ip, i_unlink);
+	if (nip)
+		ASSERT(next_agino == XFS_INO_TO_AGINO(mp, nip->i_ino));
+	else
+		ASSERT(next_agino == NULLAGINO);
 	}
+#endif
+
+	if (ip != list_first_entry(&agibp->b_pag->pag_ici_unlink_list,
+					struct xfs_inode, i_unlink)) {
 
-	if (head_agino != agino) {
+		struct xfs_inode *pip;
 		struct xfs_imap	imap;
 		xfs_agino_t	prev_agino;
 
+		ASSERT(head_agino != agino);
+
+		pip = list_prev_entry(ip, i_unlink);
+		prev_agino = XFS_INO_TO_AGINO(mp, pip->i_ino);
+
 		/* We need to search the list for the inode being freed. */
 		error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
-				&prev_agino, &imap, &last_dip, &last_ibp,
+				prev_agino, &imap, &last_dip, &last_ibp,
 				agibp->b_pag);
 		if (error)
 			return error;
@@ -2490,16 +2280,7 @@  xfs_iunlink_remove_inode(
 		xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
 				last_dip, &imap, next_agino);
 
-		/*
-		 * Now we deal with the backref for this inode.  If this inode
-		 * pointed at a real inode, change the backref that pointed to
-		 * us to point to our old next.  If this inode was the end of
-		 * the list, delete the backref that pointed to us.  Note that
-		 * change_backref takes care of deleting the backref if
-		 * next_agino is NULLAGINO.
-		 */
-		return xfs_iunlink_change_backref(agibp->b_pag, agino,
-				next_agino);
+		return 0;
 	}
 
 	/* Point the head of the list to the next unlinked inode. */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 73f36908a1ce..7f8fbb7c8594 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -464,9 +464,6 @@  extern struct kmem_zone	*xfs_inode_zone;
 /* The default CoW extent size hint. */
 #define XFS_DEFAULT_COWEXTSZ_HINT 32
 
-int xfs_iunlink_init(struct xfs_perag *pag);
-void xfs_iunlink_destroy(struct xfs_perag *pag);
-
 void xfs_end_io(struct work_struct *work);
 
 int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 2def15297a5f..f28c969af272 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -146,7 +146,6 @@  xfs_free_perag(
 		spin_unlock(&mp->m_perag_lock);
 		ASSERT(pag);
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
-		xfs_iunlink_destroy(pag);
 		xfs_buf_hash_destroy(pag);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
 	}
@@ -224,9 +223,6 @@  xfs_initialize_perag(
 		/* first new pag is fully initialized */
 		if (first_initialised == NULLAGNUMBER)
 			first_initialised = index;
-		error = xfs_iunlink_init(pag);
-		if (error)
-			goto out_hash_destroy;
 		spin_lock_init(&pag->pag_state_lock);
 	}
 
@@ -249,7 +245,6 @@  xfs_initialize_perag(
 		if (!pag)
 			break;
 		xfs_buf_hash_destroy(pag);
-		xfs_iunlink_destroy(pag);
 		kmem_free(pag);
 	}
 	return error;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index abb1d859f226..acddc60f6d88 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3514,7 +3514,6 @@  DEFINE_EVENT(xfs_ag_inode_class, name, \
 	TP_ARGS(ip))
 DEFINE_AGINODE_EVENT(xfs_iunlink);
 DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
-DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
 
 DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
 	TP_PROTO(struct xfs_mount *mp, unsigned int flags),