diff mbox

[16/25] xfs: scrub inodes

Message ID 150706335008.19351.13005092237576865356.stgit@magnolia (mailing list archive)
State Superseded
Headers show

Commit Message

Darrick J. Wong Oct. 3, 2017, 8:42 p.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Scrub the fields within an inode.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/Makefile        |    1 
 fs/xfs/libxfs/xfs_fs.h |    3 
 fs/xfs/scrub/common.c  |   65 +++++++++
 fs/xfs/scrub/common.h  |    5 +
 fs/xfs/scrub/inode.c   |  353 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/scrub.c   |   17 ++
 fs/xfs/scrub/scrub.h   |    2 
 7 files changed, 443 insertions(+), 3 deletions(-)
 create mode 100644 fs/xfs/scrub/inode.c



--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Dave Chinner Oct. 5, 2017, 4:04 a.m. UTC | #1
On Tue, Oct 03, 2017 at 01:42:30PM -0700, Darrick J. Wong wrote:
> @@ -559,3 +563,64 @@ xfs_scrub_setup_ag_btree(
>  
>  	return xfs_scrub_ag_init(sc, sc->sm->sm_agno, &sc->sa);
>  }
> +
> +/*
> + * Given an inode and the scrub control structure, grab either the
> + * inode referenced in the control structure or the inode passed in.
> + * The inode is not locked.
> + */
> +int
> +xfs_scrub_get_inode(
> +	struct xfs_scrub_context	*sc,
> +	struct xfs_inode		*ip_in)
> +{
> +	struct xfs_mount		*mp = sc->mp;
> +	struct xfs_inode		*ips = NULL;

*ip?

> +	int				error;
> +
> +	if (sc->sm->sm_agno || (sc->sm->sm_gen && !sc->sm->sm_ino))
> +		return -EINVAL;

What's this checking?

> +
> +	/* We want to scan the inode we already had opened. */
> +	if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
> +		sc->ip = ip_in;
> +		return 0;
> +	}
> +
> +	/* Look up the inode, see if the generation number matches. */
> +	if (xfs_internal_inum(mp, sc->sm->sm_ino))
> +		return -ENOENT;

maybe xfs_internal_inum should be moved to the same place as all the
inode/agbno/bno verification functions....

> +	error = xfs_iget(mp, NULL, sc->sm->sm_ino, XFS_IGET_UNTRUSTED,
> +			0, &ips);

I think we also want XFS_IGET_DONTCACHE here, so we don't trash the
inode cache with inodes that we use once for scrub and never touch
again.

> +	if (error == -ENOENT || error == -EINVAL) {
> +		/* inode doesn't exist... */
> +		return -ENOENT;
> +	} else if (error) {
> +		trace_xfs_scrub_op_error(sc,
> +				XFS_INO_TO_AGNO(mp, sc->sm->sm_ino),
> +				XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
> +				error, __return_address);
> +		return error;
> +	}
> +	if (VFS_I(ips)->i_generation != sc->sm->sm_gen) {
> +		iput(VFS_I(ips));
> +		return -ENOENT;
> +	}
> +
> +	sc->ip = ips;
> +	return 0;
> +}
> +
> +/* Push everything out of the log onto disk. */
> +int
> +xfs_scrub_checkpoint_log(
> +	struct xfs_mount	*mp)
> +{
> +	int			error;
> +
> +	error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
> +	if (error)
> +		return error;
> +	xfs_ail_push_all_sync(mp->m_ail);
> +	return 0;
> +}

Oooo, that's a nasty thing to do on busy systems with large dirty
logs. I hope this is a "last resort" kind of thing....

> +/* Set us up with an inode. */

What state are we trying to get the inode into here? We grab all the
various locks, but we can still have data changing via mmap pages
that are already faulted in and delalloc extents in the incore
extent list that aren't reflected on disk...

A comment explaining what we expect here would be nice.

> +int
> +xfs_scrub_setup_inode(
> +	struct xfs_scrub_context	*sc,
> +	struct xfs_inode		*ip)
> +{
> +	struct xfs_mount		*mp = sc->mp;
> +	int				error;
> +
> +	/*
> +	 * Try to get the inode.  If the verifiers fail, we try again
> +	 * in raw mode.
> +	 */
> +	error = xfs_scrub_get_inode(sc, ip);
> +	switch (error) {
> +	case 0:
> +		break;
> +	case -EFSCORRUPTED:
> +	case -EFSBADCRC:
> +		return xfs_scrub_checkpoint_log(mp);
> +	default:
> +		return error;
> +	}
> +
> +	/* Got the inode, lock it and we're ready to go. */
> +	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> +	xfs_ilock(sc->ip, sc->ilock_flags);
> +	error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
> +	if (error)
> +		goto out_unlock;
> +	sc->ilock_flags |= XFS_ILOCK_EXCL;
> +	xfs_ilock(sc->ip, XFS_ILOCK_EXCL);

Should the inode be joined to the transaction so that cancelling the
transaction unlocks the inode? Then the need for the ilock_flags
variable goes away....

> +
> +	return error;
> +out_unlock:
> +	xfs_iunlock(sc->ip, sc->ilock_flags);
> +	if (sc->ip != ip)
> +		iput(VFS_I(sc->ip));
> +	sc->ip = NULL;
> +	return error;
> +}
> +
> +/* Inode core */
> +
> +/* Scrub an inode. */
> +int
> +xfs_scrub_inode(
> +	struct xfs_scrub_context	*sc)
> +{
> +	struct xfs_imap			imap;
> +	struct xfs_dinode		di;
> +	struct xfs_mount		*mp = sc->mp;
> +	struct xfs_buf			*bp = NULL;
> +	struct xfs_dinode		*dip;
> +	xfs_ino_t			ino;
> +	size_t				fork_recs;
> +	unsigned long long		isize;
> +	uint64_t			flags2;
> +	uint32_t			nextents;
> +	uint32_t			extsize;
> +	uint32_t			cowextsize;
> +	uint16_t			flags;
> +	uint16_t			mode;
> +	bool				has_shared;
> +	int				error = 0;
> +
> +	/* Did we get the in-core inode, or are we doing this manually? */
> +	if (sc->ip) {
> +		ino = sc->ip->i_ino;
> +		xfs_inode_to_disk(sc->ip, &di, 0);
> +		dip = &di;
> +	} else {
> +		/* Map & read inode. */
> +		ino = sc->sm->sm_ino;
> +		error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED);
> +		if (error == -EINVAL) {
> +			/*
> +			 * Inode could have gotten deleted out from under us;
> +			 * just forget about it.
> +			 */
> +			error = -ENOENT;
> +			goto out;
> +		}
> +		if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino),
> +				XFS_INO_TO_AGBNO(mp, ino), &error))
> +			goto out;
> +
> +		error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
> +				imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp,
> +				NULL);
> +		if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino),
> +				XFS_INO_TO_AGBNO(mp, ino), &error))
> +			goto out;
> +
> +		/* Is this really an inode? */
> +		bp->b_ops = &xfs_inode_buf_ops;
> +		dip = xfs_buf_offset(bp, imap.im_boffset);
> +		if (!xfs_dinode_verify(mp, ino, dip) ||
> +		    !xfs_dinode_good_version(mp, dip->di_version)) {
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +			goto out;
> +		}
> +
> +		/* ...and is it the one we asked for? */
> +		if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) {
> +			error = -ENOENT;
> +			goto out;
> +		}
> +	}

Can we factor the manual mapping into a separate function? Just
makes it a bit cleaner and gets rid of a bunch of local variables
from this function that are just used to map and read the inode.

ANd reading on and getting ahead of the code, could we split it
further into

	<setup dip>

	xfs_scrub_dinode(sc, ino, dip, bp);

	<do live incore inode stuff>

> +
> +	flags = be16_to_cpu(dip->di_flags);
> +	if (dip->di_version >= 3)
> +		flags2 = be64_to_cpu(dip->di_flags2);
> +	else
> +		flags2 = 0;
> +
> +	/* di_mode */
> +	mode = be16_to_cpu(dip->di_mode);
> +	if (mode & ~(S_IALLUGO | S_IFMT))
> +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +
> +	/* v1/v2 fields */
> +	switch (dip->di_version) {
> +	case 1:
> +		if (dip->di_nlink != 0)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +
> +		if (dip->di_mode == 0 && sc->ip)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +
> +		if (dip->di_projid_lo != 0 || dip->di_projid_hi != 0)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +		break;

We don't really support v1 inode format anymore - we convert it to
version 2 automatically in xfs_inode_from_disk() so the in-memory
inode is always v2 or v3, never v1. And when we write it back out,
we write it as a v2 inode, never as a v1 inode.

Hence I'm not sure whether we should be worrying about scrubbing
such inodes - they are going to be in an ever shrinking minority
of filesystems. At minimum, they should always return "preen".

> +	case 2:
> +	case 3:
> +		if (dip->di_onlink != 0)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +
> +		if (dip->di_mode == 0 && sc->ip)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +
> +		if (dip->di_projid_hi != 0 &&
> +		    !xfs_sb_version_hasprojid32bit(&mp->m_sb))
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +		break;
> +	default:
> +		ASSERT(0);

If we don't understand the version, it's corrupt.

> +		break;
> +	}
> +
> +	/*
> +	 * di_uid/di_gid -- -1 isn't invalid, but there's no way that
> +	 * userspace could have created that.
> +	 */
> +	if (dip->di_uid == cpu_to_be32(-1U) ||
> +	    dip->di_gid == cpu_to_be32(-1U))
> +		xfs_scrub_ino_set_warning(sc, bp);
> +
> +	/* di_format */
> +	switch (dip->di_format) {
> +	case XFS_DINODE_FMT_DEV:
> +		if (!S_ISCHR(mode) && !S_ISBLK(mode) &&
> +		    !S_ISFIFO(mode) && !S_ISSOCK(mode))
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +		break;
> +	case XFS_DINODE_FMT_LOCAL:
> +		if (!S_ISDIR(mode) && !S_ISLNK(mode))
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +		break;
> +	case XFS_DINODE_FMT_EXTENTS:
> +		if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode))
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +		break;
> +	case XFS_DINODE_FMT_BTREE:
> +		if (!S_ISREG(mode) && !S_ISDIR(mode))
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +		break;
> +	case XFS_DINODE_FMT_UUID:
> +	default:
> +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +		break;
> +	}
> +
> +	/* di_size */
> +	isize = be64_to_cpu(dip->di_size);
> +	if (isize & (1ULL << 63))
> +		xfs_scrub_ino_set_corrupt(sc, ino, bp);

Should we be checking against the on disk format size, or the
mounted filesystem maximum size (i.e. mp->m_super->s_maxbytes)?
32 or 64 bit systems are going to have different maximum valid file
sizes..

Directories have a maximum bound size, too - the data space, leaf
space and freespace space, each of which are 32GB in size, IIRC.

And symlinks have a different maximum size, too.

> +	if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0)
> +		xfs_scrub_ino_set_corrupt(sc, ino, bp);

> +
> +	/* di_nblocks */
> +	if (flags2 & XFS_DIFLAG2_REFLINK) {
> +		; /* nblocks can exceed dblocks */
> +	} else if (flags & XFS_DIFLAG_REALTIME) {
> +		if (be64_to_cpu(dip->di_nblocks) >=
> +		    mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);

That doesn't seem right. the file can be on either the data or the
rt device, so the maximum file blocks is the size of one device or
the other, not both combined.

> +	} else {
> +		if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +	}
> +
> +	/* di_extsize */
> +	extsize = be32_to_cpu(dip->di_extsize);
> +	if (flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) {
> +		if (extsize <= 0 || extsize > MAXEXTLEN)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +
> +		if (!(flags & XFS_DIFLAG_REALTIME) &&
> +		    extsize > mp->m_sb.sb_agblocks / 2)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +	} else {
> +		if (extsize != 0)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +	}

There more to validating the extentsize hints than this. From
xfs_ioctl_setattr_check_extsize():

/*
 * extent size hint validation is somewhat cumbersome. Rules are:
 *
 * 1. extent size hint is only valid for directories and regular files
 * 2. FS_XFLAG_EXTSIZE is only valid for regular files
 * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories.
 * 4. can only be changed on regular files if no extents are allocated
 * 5. can be changed on directories at any time
 * 6. extsize hint of 0 turns off hints, clears inode flags.
 * 7. Extent size must be a multiple of the appropriate block size.
 * 8. for non-realtime files, the extent size hint must be limited
 *    to half the AG size to avoid alignment extending the extent beyond the
 *    limits of the AG.
 */

Maybe there's some commonality between these two functions...

> +
> +	/* di_flags */
> +	if ((flags & XFS_DIFLAG_IMMUTABLE) && (flags & XFS_DIFLAG_APPEND))
> +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +	if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME))
> +		xfs_scrub_ino_set_corrupt(sc, ino, bp);

What about project id flags?

Also, there are flags that are allowed only on regular files, and
there are flags that are only allowed on directories. Those should
probably also be checked for preening.

> +
> +	/* di_nextents */
> +	nextents = be32_to_cpu(dip->di_nextents);
> +	fork_recs =  XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
> +	switch (dip->di_format) {
> +	case XFS_DINODE_FMT_EXTENTS:
> +		if (nextents > fork_recs)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +		break;
> +	case XFS_DINODE_FMT_BTREE:
> +		if (nextents <= fork_recs)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +		break;
> +	case XFS_DINODE_FMT_LOCAL:
> +	case XFS_DINODE_FMT_DEV:
> +	case XFS_DINODE_FMT_UUID:
> +	default:
> +		if (nextents != 0)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +		break;
> +	}
> +
> +	/* di_anextents */
> +	nextents = be16_to_cpu(dip->di_anextents);
> +	fork_recs =  XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
> +	switch (dip->di_aformat) {
> +	case XFS_DINODE_FMT_EXTENTS:
> +		if (nextents > fork_recs)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +		break;
> +	case XFS_DINODE_FMT_BTREE:
> +		if (nextents <= fork_recs)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +		break;
> +	case XFS_DINODE_FMT_LOCAL:
> +	case XFS_DINODE_FMT_DEV:
> +	case XFS_DINODE_FMT_UUID:
> +	default:
> +		if (nextents != 0)
> +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +		break;
> +	}

Don't we need a check here first to see whether an attribute fork
exists or not?

> +
> +	/* di_forkoff */
> +	if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
> +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +	if (dip->di_anextents != 0 && dip->di_forkoff == 0)
> +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> +
> +	/* di_aformat */
> +	if (dip->di_aformat != XFS_DINODE_FMT_LOCAL &&
> +	    dip->di_aformat != XFS_DINODE_FMT_EXTENTS &&
> +	    dip->di_aformat != XFS_DINODE_FMT_BTREE)
> +		xfs_scrub_ino_set_corrupt(sc, ino, bp);

Shouldn't this come before we use dip->di_aformat in a switch
statement? Hmmm - aren't we missing the same checks for the data
fork?

Cheers,

Dave.
Darrick J. Wong Oct. 5, 2017, 5:22 a.m. UTC | #2
On Thu, Oct 05, 2017 at 03:04:52PM +1100, Dave Chinner wrote:
> On Tue, Oct 03, 2017 at 01:42:30PM -0700, Darrick J. Wong wrote:
> > @@ -559,3 +563,64 @@ xfs_scrub_setup_ag_btree(
> >  
> >  	return xfs_scrub_ag_init(sc, sc->sm->sm_agno, &sc->sa);
> >  }
> > +
> > +/*
> > + * Given an inode and the scrub control structure, grab either the
> > + * inode referenced in the control structure or the inode passed in.
> > + * The inode is not locked.
> > + */
> > +int
> > +xfs_scrub_get_inode(
> > +	struct xfs_scrub_context	*sc,
> > +	struct xfs_inode		*ip_in)
> > +{
> > +	struct xfs_mount		*mp = sc->mp;
> > +	struct xfs_inode		*ips = NULL;
> 
> *ip?
> 
> > +	int				error;
> > +
> > +	if (sc->sm->sm_agno || (sc->sm->sm_gen && !sc->sm->sm_ino))
> > +		return -EINVAL;
> 
> What's this checking?

/*
 * Jump out if userspace fed us an AG number or a inode generation
 * without an inode number.  Both indicate that userspace hasn't got a
 * clue.
 */

> > +
> > +	/* We want to scan the inode we already had opened. */
> > +	if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
> > +		sc->ip = ip_in;
> > +		return 0;
> > +	}
> > +
> > +	/* Look up the inode, see if the generation number matches. */
> > +	if (xfs_internal_inum(mp, sc->sm->sm_ino))
> > +		return -ENOENT;
> 
> maybe xfs_internal_inum should be moved to the same place as all the
> inode/agbno/bno verification functions....

Yes.

> > +	error = xfs_iget(mp, NULL, sc->sm->sm_ino, XFS_IGET_UNTRUSTED,
> > +			0, &ips);
> 
> I think we also want XFS_IGET_DONTCACHE here, so we don't trash the
> inode cache with inodes that we use once for scrub and never touch
> again.

I thought about adding this, but if we let the inodes fall out of the
cache now then we'll just have to load them back in for the bmap checks,
right?

> > +	if (error == -ENOENT || error == -EINVAL) {
> > +		/* inode doesn't exist... */
> > +		return -ENOENT;
> > +	} else if (error) {
> > +		trace_xfs_scrub_op_error(sc,
> > +				XFS_INO_TO_AGNO(mp, sc->sm->sm_ino),
> > +				XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
> > +				error, __return_address);
> > +		return error;
> > +	}
> > +	if (VFS_I(ips)->i_generation != sc->sm->sm_gen) {
> > +		iput(VFS_I(ips));
> > +		return -ENOENT;
> > +	}
> > +
> > +	sc->ip = ips;
> > +	return 0;
> > +}
> > +
> > +/* Push everything out of the log onto disk. */
> > +int
> > +xfs_scrub_checkpoint_log(
> > +	struct xfs_mount	*mp)
> > +{
> > +	int			error;
> > +
> > +	error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
> > +	if (error)
> > +		return error;
> > +	xfs_ail_push_all_sync(mp->m_ail);
> > +	return 0;
> > +}
> 
> Oooo, that's a nasty thing to do on busy systems with large dirty
> logs. I hope this is a "last resort" kind of thing....

It is; we only do this if the inobt says there's an inode there and the
inode verifiers fail.

> > +/* Set us up with an inode. */
> 
> What state are we trying to get the inode into here? We grab all the
> various locks, but we can still have data changing via mmap pages
> that are already faulted in and delalloc extents in the incore
> extent list that aren't reflected on disk...
> 
> A comment explaining what we expect here would be nice.

/* 
 * Grab total control of the inode metadata.  It doesn't matter here if
 * the file data is still changing, we just want exclusive access to the
 * metadata.
 */

> > +int
> > +xfs_scrub_setup_inode(
> > +	struct xfs_scrub_context	*sc,
> > +	struct xfs_inode		*ip)
> > +{
> > +	struct xfs_mount		*mp = sc->mp;
> > +	int				error;
> > +
> > +	/*
> > +	 * Try to get the inode.  If the verifiers fail, we try again
> > +	 * in raw mode.
> > +	 */
> > +	error = xfs_scrub_get_inode(sc, ip);
> > +	switch (error) {
> > +	case 0:
> > +		break;
> > +	case -EFSCORRUPTED:
> > +	case -EFSBADCRC:
> > +		return xfs_scrub_checkpoint_log(mp);
> > +	default:
> > +		return error;
> > +	}
> > +
> > +	/* Got the inode, lock it and we're ready to go. */
> > +	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> > +	xfs_ilock(sc->ip, sc->ilock_flags);
> > +	error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
> > +	if (error)
> > +		goto out_unlock;
> > +	sc->ilock_flags |= XFS_ILOCK_EXCL;
> > +	xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
> 
> Should the inode be joined to the transaction so that cancelling the
> transaction unlocks the inode? Then the need for the ilock_flags
> variable goes away....

This is the confluence of two semi-icky things: first, some of the
scrubbers (particularly the dir and parent pointer scrubbers) will need
to drop the ILOCK for short periods of time; later on, repair will want
to keep the inode locked across all the repair transactions, so it makes
more sense to control the lock and unlock directly.

> > +
> > +	return error;
> > +out_unlock:
> > +	xfs_iunlock(sc->ip, sc->ilock_flags);
> > +	if (sc->ip != ip)
> > +		iput(VFS_I(sc->ip));
> > +	sc->ip = NULL;
> > +	return error;
> > +}
> > +
> > +/* Inode core */
> > +
> > +/* Scrub an inode. */
> > +int
> > +xfs_scrub_inode(
> > +	struct xfs_scrub_context	*sc)
> > +{
> > +	struct xfs_imap			imap;
> > +	struct xfs_dinode		di;
> > +	struct xfs_mount		*mp = sc->mp;
> > +	struct xfs_buf			*bp = NULL;
> > +	struct xfs_dinode		*dip;
> > +	xfs_ino_t			ino;
> > +	size_t				fork_recs;
> > +	unsigned long long		isize;
> > +	uint64_t			flags2;
> > +	uint32_t			nextents;
> > +	uint32_t			extsize;
> > +	uint32_t			cowextsize;
> > +	uint16_t			flags;
> > +	uint16_t			mode;
> > +	bool				has_shared;
> > +	int				error = 0;
> > +
> > +	/* Did we get the in-core inode, or are we doing this manually? */
> > +	if (sc->ip) {
> > +		ino = sc->ip->i_ino;
> > +		xfs_inode_to_disk(sc->ip, &di, 0);
> > +		dip = &di;
> > +	} else {
> > +		/* Map & read inode. */
> > +		ino = sc->sm->sm_ino;
> > +		error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED);
> > +		if (error == -EINVAL) {
> > +			/*
> > +			 * Inode could have gotten deleted out from under us;
> > +			 * just forget about it.
> > +			 */
> > +			error = -ENOENT;
> > +			goto out;
> > +		}
> > +		if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino),
> > +				XFS_INO_TO_AGBNO(mp, ino), &error))
> > +			goto out;
> > +
> > +		error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
> > +				imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp,
> > +				NULL);
> > +		if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino),
> > +				XFS_INO_TO_AGBNO(mp, ino), &error))
> > +			goto out;
> > +
> > +		/* Is this really an inode? */
> > +		bp->b_ops = &xfs_inode_buf_ops;
> > +		dip = xfs_buf_offset(bp, imap.im_boffset);
> > +		if (!xfs_dinode_verify(mp, ino, dip) ||
> > +		    !xfs_dinode_good_version(mp, dip->di_version)) {
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +			goto out;
> > +		}
> > +
> > +		/* ...and is it the one we asked for? */
> > +		if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) {
> > +			error = -ENOENT;
> > +			goto out;
> > +		}
> > +	}
> 
> Can we factor the manual mapping into a separate function? Just
> makes it a bit cleaner and gets rid of a bunch of local variables
> from this function that are just used to map and read the inode.
> 
> ANd reading on and getting ahead of the code, could we split it
> further into
> 
> 	<setup dip>
> 
> 	xfs_scrub_dinode(sc, ino, dip, bp);
> 
> 	<do live incore inode stuff>

Yes, good plan.

> > +
> > +	flags = be16_to_cpu(dip->di_flags);
> > +	if (dip->di_version >= 3)
> > +		flags2 = be64_to_cpu(dip->di_flags2);
> > +	else
> > +		flags2 = 0;
> > +
> > +	/* di_mode */
> > +	mode = be16_to_cpu(dip->di_mode);
> > +	if (mode & ~(S_IALLUGO | S_IFMT))
> > +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +
> > +	/* v1/v2 fields */
> > +	switch (dip->di_version) {
> > +	case 1:
> > +		if (dip->di_nlink != 0)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +
> > +		if (dip->di_mode == 0 && sc->ip)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +
> > +		if (dip->di_projid_lo != 0 || dip->di_projid_hi != 0)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +		break;
> 
> We don't really support v1 inode format anymore - we convert it to
> version 2 automatically in xfs_inode_from_disk() so the in-memory
> inode is always v2 or v3, never v1. And when we write it back out,
> we write it as a v2 inode, never as a v1 inode.
> 
> Hence I'm not sure whether we should be worrying about scrubbing
> such inodes - they are going to be in an ever shrinking minority
> of filesystems. At minimum, they should always return "preen".

Ok.  I figured that we might end up at "v1 => preen" but decided to play
this straight until we got to review.

> > +	case 2:
> > +	case 3:
> > +		if (dip->di_onlink != 0)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +
> > +		if (dip->di_mode == 0 && sc->ip)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +
> > +		if (dip->di_projid_hi != 0 &&
> > +		    !xfs_sb_version_hasprojid32bit(&mp->m_sb))
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +		break;
> > +	default:
> > +		ASSERT(0);
> 
> If we don't understand the version, it's corrupt.

Yikes an assert.  Will replace that stat!

> > +		break;
> > +	}
> > +
> > +	/*
> > +	 * di_uid/di_gid -- -1 isn't invalid, but there's no way that
> > +	 * userspace could have created that.
> > +	 */
> > +	if (dip->di_uid == cpu_to_be32(-1U) ||
> > +	    dip->di_gid == cpu_to_be32(-1U))
> > +		xfs_scrub_ino_set_warning(sc, bp);
> > +
> > +	/* di_format */
> > +	switch (dip->di_format) {
> > +	case XFS_DINODE_FMT_DEV:
> > +		if (!S_ISCHR(mode) && !S_ISBLK(mode) &&
> > +		    !S_ISFIFO(mode) && !S_ISSOCK(mode))
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +		break;
> > +	case XFS_DINODE_FMT_LOCAL:
> > +		if (!S_ISDIR(mode) && !S_ISLNK(mode))
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +		break;
> > +	case XFS_DINODE_FMT_EXTENTS:
> > +		if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode))
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +		break;
> > +	case XFS_DINODE_FMT_BTREE:
> > +		if (!S_ISREG(mode) && !S_ISDIR(mode))
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +		break;
> > +	case XFS_DINODE_FMT_UUID:
> > +	default:
> > +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +		break;
> > +	}
> > +
> > +	/* di_size */
> > +	isize = be64_to_cpu(dip->di_size);
> > +	if (isize & (1ULL << 63))
> > +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> 
> Should we be checking against the on disk format size, or the
> mounted filesystem maximum size (i.e. mp->m_super->s_maxbytes)?
> 32 or 64 bit systems are going to have different maximum valid file
> sizes..

It's perfectly valid to 'truncate -s $((2 ** 60) foofile' so the only
thing we can really check for here is that the upper bit isn't set
(because the VFS does not check, but barfs on, files with that large of
a size).

> Directories have a maximum bound size, too - the data space, leaf
> space and freespace space, each of which are 32GB in size, IIRC.
> 
> And symlinks have a different maximum size, too.

Fair enough, I'll expand the i_size checks, though ISTR the verifiers
now check that for us.

> > +	if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0)
> > +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> 
> > +
> > +	/* di_nblocks */
> > +	if (flags2 & XFS_DIFLAG2_REFLINK) {
> > +		; /* nblocks can exceed dblocks */
> > +	} else if (flags & XFS_DIFLAG_REALTIME) {
> > +		if (be64_to_cpu(dip->di_nblocks) >=
> > +		    mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> 
> That doesn't seem right. the file can be on either the data or the
> rt device, so the maximum file blocks is the size of one device or
> the other, not both combined.

di_nblocks is the sum of (data blocks + bmbt blocks + attr blocks),
right?  So in theory if you had a rt file with 1000 data blocks, 10 bmbt
blocks to map the data blocks, and 100 attr blocks then di_nblocks has
to be 1110.

> > +	} else {
> > +		if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +	}
> > +
> > +	/* di_extsize */
> > +	extsize = be32_to_cpu(dip->di_extsize);
> > +	if (flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) {
> > +		if (extsize <= 0 || extsize > MAXEXTLEN)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +
> > +		if (!(flags & XFS_DIFLAG_REALTIME) &&
> > +		    extsize > mp->m_sb.sb_agblocks / 2)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +	} else {
> > +		if (extsize != 0)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +	}
> 
> There more to validating the extentsize hints than this. From
> xfs_ioctl_setattr_check_extsize():
> 
> /*
>  * extent size hint validation is somewhat cumbersome. Rules are:
>  *
>  * 1. extent size hint is only valid for directories and regular files
>  * 2. FS_XFLAG_EXTSIZE is only valid for regular files
>  * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories.
>  * 4. can only be changed on regular files if no extents are allocated
>  * 5. can be changed on directories at any time
>  * 6. extsize hint of 0 turns off hints, clears inode flags.
>  * 7. Extent size must be a multiple of the appropriate block size.
>  * 8. for non-realtime files, the extent size hint must be limited
>  *    to half the AG size to avoid alignment extending the extent beyond the
>  *    limits of the AG.
>  */
> 
> Maybe there's some commonality between these two functions...

Definitely, will refactor both.

> > +
> > +	/* di_flags */
> > +	if ((flags & XFS_DIFLAG_IMMUTABLE) && (flags & XFS_DIFLAG_APPEND))
> > +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +	if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME))
> > +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> 
> What about project id flags?
> 
> Also, there are flags that are allowed only on regular files, and
> there are flags that are only allowed on directories. Those should
> probably also be checked for preening.

<nod> Some of these are checked by the dinode verifier, but there needs
to be a comment (or more comment) explaining that.

> > +
> > +	/* di_nextents */
> > +	nextents = be32_to_cpu(dip->di_nextents);
> > +	fork_recs =  XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
> > +	switch (dip->di_format) {
> > +	case XFS_DINODE_FMT_EXTENTS:
> > +		if (nextents > fork_recs)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +		break;
> > +	case XFS_DINODE_FMT_BTREE:
> > +		if (nextents <= fork_recs)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +		break;
> > +	case XFS_DINODE_FMT_LOCAL:
> > +	case XFS_DINODE_FMT_DEV:
> > +	case XFS_DINODE_FMT_UUID:
> > +	default:
> > +		if (nextents != 0)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +		break;
> > +	}
> > +
> > +	/* di_anextents */
> > +	nextents = be16_to_cpu(dip->di_anextents);
> > +	fork_recs =  XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
> > +	switch (dip->di_aformat) {
> > +	case XFS_DINODE_FMT_EXTENTS:
> > +		if (nextents > fork_recs)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +		break;
> > +	case XFS_DINODE_FMT_BTREE:
> > +		if (nextents <= fork_recs)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +		break;
> > +	case XFS_DINODE_FMT_LOCAL:
> > +	case XFS_DINODE_FMT_DEV:
> > +	case XFS_DINODE_FMT_UUID:
> > +	default:
> > +		if (nextents != 0)
> > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +		break;
> > +	}
> 
> Don't we need a check here first to see whether an attribute fork
> exists or not?

Do you mean the xfs_inode_fork, or something else?

XFS_DFORK_ASIZE returns zero if !XFS_DFORK_Q which in turn is based on
di_forkoff so we're really only checking that di_aformat makes sense
given the number of extents and the size of the attr fork area.

We're not actually checking anything in the attr fork; that's a
different scrubber.

> > +
> > +	/* di_forkoff */
> > +	if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
> > +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +	if (dip->di_anextents != 0 && dip->di_forkoff == 0)
> > +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > +
> > +	/* di_aformat */
> > +	if (dip->di_aformat != XFS_DINODE_FMT_LOCAL &&
> > +	    dip->di_aformat != XFS_DINODE_FMT_EXTENTS &&
> > +	    dip->di_aformat != XFS_DINODE_FMT_BTREE)
> > +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> 
> Shouldn't this come before we use dip->di_aformat in a switch
> statement?

Yes.  Good catch.

> Hmmm - aren't we missing the same checks for the data fork?

I believe you'll find them further up in the function.

--D

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Oct. 5, 2017, 7:13 a.m. UTC | #3
On Wed, Oct 04, 2017 at 10:22:19PM -0700, Darrick J. Wong wrote:
> On Thu, Oct 05, 2017 at 03:04:52PM +1100, Dave Chinner wrote:
> > On Tue, Oct 03, 2017 at 01:42:30PM -0700, Darrick J. Wong wrote:
> > > +	error = xfs_iget(mp, NULL, sc->sm->sm_ino, XFS_IGET_UNTRUSTED,
> > > +			0, &ips);
> > 
> > I think we also want XFS_IGET_DONTCACHE here, so we don't trash the
> > inode cache with inodes that we use once for scrub and never touch
> > again.
> 
> I thought about adding this, but if we let the inodes fall out of the
> cache now then we'll just have to load them back in for the bmap checks,
> right?

Well, I'm looking at ensuring that we don't blow out the memory
side of things. We've still got the inode buffer in the buffer
cache, so I don't see why we should double cache these things
and then leave both cached copied hanging around after we've
finished with them. Leave the buffer around because we do a fair few
checks with it, but don't use excessive icache memory and trash the
working set if we can avoid it...

> > > +xfs_scrub_checkpoint_log(
> > > +	struct xfs_mount	*mp)
> > > +{
> > > +	int			error;
> > > +
> > > +	error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
> > > +	if (error)
> > > +		return error;
> > > +	xfs_ail_push_all_sync(mp->m_ail);
> > > +	return 0;
> > > +}
> > 
> > Oooo, that's a nasty thing to do on busy systems with large dirty
> > logs. I hope this is a "last resort" kind of thing....
> 
> It is; we only do this if the inobt says there's an inode there and the
> inode verifiers fail.

Ok, so why would pushing the log and the AIL make the verifier then
succeed? how likely is this to occur on a busy system?

> > > +/* Set us up with an inode. */
> > 
> > What state are we trying to get the inode into here? We grab all the
> > various locks, but we can still have data changing via mmap pages
> > that are already faulted in and delalloc extents in the incore
> > extent list that aren't reflected on disk...
> > 
> > A comment explaining what we expect here would be nice.
> 
> /* 
>  * Grab total control of the inode metadata.  It doesn't matter here if
>  * the file data is still changing, we just want exclusive access to the
>  * metadata.
>  */

*nod*

> > > +	/* Got the inode, lock it and we're ready to go. */
> > > +	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> > > +	xfs_ilock(sc->ip, sc->ilock_flags);
> > > +	error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
> > > +	if (error)
> > > +		goto out_unlock;
> > > +	sc->ilock_flags |= XFS_ILOCK_EXCL;
> > > +	xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
> > 
> > Should the inode be joined to the transaction so that cancelling the
> > transaction unlocks the inode? Then the need for the ilock_flags
> > variable goes away....
> 
> This is the confluence of two semi-icky things: first, some of the
> scrubbers (particularly the dir and parent pointer scrubbers) will need
> to drop the ILOCK for short periods of time; later on, repair will want
> to keep the inode locked across all the repair transactions, so it makes
> more sense to control the lock and unlock directly.

Ok, I'll pass on this for now, see how the rest of the code falls
out.

> > > +	/* di_size */
> > > +	isize = be64_to_cpu(dip->di_size);
> > > +	if (isize & (1ULL << 63))
> > > +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > 
> > Should we be checking against the on disk format size, or the
> > mounted filesystem maximum size (i.e. mp->m_super->s_maxbytes)?
> > 32 or 64 bit systems are going to have different maximum valid file
> > sizes..
> 
> It's perfectly valid to 'truncate -s $((2 ** 60) foofile' so the only

Ugh. We can't do IO past 16TB on 32 bit systems, so I'm kinda
surprised truncate doesn't have the same s_maxbytes restriction...

> thing we can really check for here is that the upper bit isn't set
> (because the VFS does not check, but barfs on, files with that large of
> a size).

xfs_max_file_offset() sets the max file offset to 2^63 - 1, so it
looks like the lack of checking in truncate is the problem here,
not the IO path.

> > Directories have a maximum bound size, too - the data space, leaf
> > space and freespace space, each of which are 32GB in size, IIRC.
> > 
> > And symlinks have a different maximum size, too.
> 
> Fair enough, I'll expand the i_size checks, though ISTR the verifiers
> now check that for us.

If they do, then just drop a comment in there to say what is checked
by the verifier.

> > > +	if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0)
> > > +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > 
> > > +
> > > +	/* di_nblocks */
> > > +	if (flags2 & XFS_DIFLAG2_REFLINK) {
> > > +		; /* nblocks can exceed dblocks */
> > > +	} else if (flags & XFS_DIFLAG_REALTIME) {
> > > +		if (be64_to_cpu(dip->di_nblocks) >=
> > > +		    mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks)
> > > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > 
> > That doesn't seem right. the file can be on either the data or the
> > rt device, so the maximum file blocks is the size of one device or
> > the other, not both combined.
> 
> di_nblocks is the sum of (data blocks + bmbt blocks + attr blocks),
> right?

Yeah, forgot it was more than just data extents.

> So in theory if you had a rt file with 1000 data blocks, 10 bmbt
> blocks to map the data blocks, and 100 attr blocks then di_nblocks has
> to be 1110.

Yup, but the additional metadata on the data device is not going to
be anywhere near the size of the data device.

/me shrugs

I can't think of an easy way to get a maximum block count, so I
guess that'll have to do...

> > > +		if (nextents > fork_recs)
> > > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > +		break;
> > > +	case XFS_DINODE_FMT_BTREE:
> > > +		if (nextents <= fork_recs)
> > > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > +		break;
> > > +	case XFS_DINODE_FMT_LOCAL:
> > > +	case XFS_DINODE_FMT_DEV:
> > > +	case XFS_DINODE_FMT_UUID:
> > > +	default:
> > > +		if (nextents != 0)
> > > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > +		break;
> > > +	}
> > > +
> > > +	/* di_anextents */
> > > +	nextents = be16_to_cpu(dip->di_anextents);
> > > +	fork_recs =  XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
> > > +	switch (dip->di_aformat) {
> > > +	case XFS_DINODE_FMT_EXTENTS:
> > > +		if (nextents > fork_recs)
> > > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > +		break;
> > > +	case XFS_DINODE_FMT_BTREE:
> > > +		if (nextents <= fork_recs)
> > > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > +		break;
> > > +	case XFS_DINODE_FMT_LOCAL:
> > > +	case XFS_DINODE_FMT_DEV:
> > > +	case XFS_DINODE_FMT_UUID:
> > > +	default:
> > > +		if (nextents != 0)
> > > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > +		break;
> > > +	}
> > 
> > Don't we need a check here first to see whether an attribute fork
> > exists or not?
> 
> Do you mean the xfs_inode_fork, or something else?

SOmething else. :P

> XFS_DFORK_ASIZE returns zero if !XFS_DFORK_Q which in turn is based on
> di_forkoff so we're really only checking that di_aformat makes sense
> given the number of extents and the size of the attr fork area.

Right, but if XFS_DFORK_ASIZE == 0, the dip->di_aformat *must* be
XFS_DINODE_FMT_EXTENTS. That's the only valid configuration when
there is no attribute fork present.

If there is an attribute fork present, then it can be XFS_DINODE_FMT_LOCAL,
EXTENT or BTREE, and then the extent count needs checking.
XFS_DINODE_FMT_DEV and XFS_DINODE_FMT_UUID are both invalid for the
attribute fork.

Cheers,

Dave.
Darrick J. Wong Oct. 5, 2017, 7:56 p.m. UTC | #4
On Thu, Oct 05, 2017 at 06:13:39PM +1100, Dave Chinner wrote:
> On Wed, Oct 04, 2017 at 10:22:19PM -0700, Darrick J. Wong wrote:
> > On Thu, Oct 05, 2017 at 03:04:52PM +1100, Dave Chinner wrote:
> > > On Tue, Oct 03, 2017 at 01:42:30PM -0700, Darrick J. Wong wrote:
> > > > +	error = xfs_iget(mp, NULL, sc->sm->sm_ino, XFS_IGET_UNTRUSTED,
> > > > +			0, &ips);
> > > 
> > > I think we also want XFS_IGET_DONTCACHE here, so we don't trash the
> > > inode cache with inodes that we use once for scrub and never touch
> > > again.
> > 
> > I thought about adding this, but if we let the inodes fall out of the
> > cache now then we'll just have to load them back in for the bmap checks,
> > right?
> 
> Well, I'm looking at ensuring that we don't blow out the memory
> side of things. We've still got the inode buffer in the buffer
> cache, so I don't see why we should double cache these things
> and then leave both cached copied hanging around after we've
> finished with them. Leave the buffer around because we do a fair few
> checks with it, but don't use excessive icache memory and trash the
> working set if we can avoid it...

Sure, I'll give it a try and see what the performance impacts are.
(I /have/ been looking into OOM problems on artificially memory-constrained VMs.)

> > > > +xfs_scrub_checkpoint_log(
> > > > +	struct xfs_mount	*mp)
> > > > +{
> > > > +	int			error;
> > > > +
> > > > +	error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
> > > > +	if (error)
> > > > +		return error;
> > > > +	xfs_ail_push_all_sync(mp->m_ail);
> > > > +	return 0;
> > > > +}
> > > 
> > > Oooo, that's a nasty thing to do on busy systems with large dirty
> > > logs. I hope this is a "last resort" kind of thing....
> > 
> > It is; we only do this if the inobt says there's an inode there and the
> > inode verifiers fail.
> 
> Ok, so why would pushing the log and the AIL make the verifier then
> succeed? how likely is this to occur on a busy system?

Hmm, looking through my notes the original reason for shoving on the log
is to force any dirty inode items to get checkpointed back to disk so
that we could read the raw buffers.

I think there's still a desire to be able to checkpoint the log in order
to resolve discrepancies via raw disk buffer reads ... but at this point
the only code that does this is:

(a) the code that checks inobt freemask against the on-disk inodes if
the inode isn't in the cache (we can't rely on the inobt to look up
uncached inodes in order to check inobt fields)

(b) AGF/AGI repair will want to ->verify_read the alleged btree root
blocks.  If the root blocks are new and have never been checkpointed,
they'll have a crc/lsn of zero, which causes verifier error.

(Granted this should never happen except when we're artificially forcing
repair to rebuild otherwise ok metadata as part of testing.)

(c) bnobt repair will want to be able to flush the log to empty out the
busy extent list because it works by freeing all the blocks not listed
by the rmap and the busy extent list can't handle overlapping entries.

Though now that I look at it, this function belongs in the ialloc patch,
and the if (force_log) xfs_scrub_checkpoint_log() call has somehow
migrated out to one of the repair patches.  Ugh.

> > > > +/* Set us up with an inode. */
> > > 
> > > What state are we trying to get the inode into here? We grab all the
> > > various locks, but we can still have data changing via mmap pages
> > > that are already faulted in and delalloc extents in the incore
> > > extent list that aren't reflected on disk...
> > > 
> > > A comment explaining what we expect here would be nice.
> > 
> > /* 
> >  * Grab total control of the inode metadata.  It doesn't matter here if
> >  * the file data is still changing, we just want exclusive access to the
> >  * metadata.
> >  */
> 
> *nod*
> 
> > > > +	/* Got the inode, lock it and we're ready to go. */
> > > > +	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> > > > +	xfs_ilock(sc->ip, sc->ilock_flags);
> > > > +	error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
> > > > +	if (error)
> > > > +		goto out_unlock;
> > > > +	sc->ilock_flags |= XFS_ILOCK_EXCL;
> > > > +	xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
> > > 
> > > Should the inode be joined to the transaction so that cancelling the
> > > transaction unlocks the inode? Then the need for the ilock_flags
> > > variable goes away....
> > 
> > This is the confluence of two semi-icky things: first, some of the
> > scrubbers (particularly the dir and parent pointer scrubbers) will need
> > to drop the ILOCK for short periods of time; later on, repair will want
> > to keep the inode locked across all the repair transactions, so it makes
> > more sense to control the lock and unlock directly.
> 
> Ok, I'll pass on this for now, see how the rest of the code falls
> out.
> 
> > > > +	/* di_size */
> > > > +	isize = be64_to_cpu(dip->di_size);
> > > > +	if (isize & (1ULL << 63))
> > > > +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > 
> > > Should we be checking against the on disk format size, or the
> > > mounted filesystem maximum size (i.e. mp->m_super->s_maxbytes)?
> > > 32 or 64 bit systems are going to have different maximum valid file
> > > sizes..
> > 
> > It's perfectly valid to 'truncate -s $((2 ** 60) foofile' so the only
> 
> Ugh. We can't do IO past 16TB on 32 bit systems, so I'm kinda
> surprised truncate doesn't have the same s_maxbytes restriction...
>
> > thing we can really check for here is that the upper bit isn't set
> > (because the VFS does not check, but barfs on, files with that large of
> > a size).
> 
> xfs_max_file_offset() sets the max file offset to 2^63 - 1, so it
> looks like the lack of checking in truncate is the problem here,
> not the IO path.

Imagine that you format an xfs on a x64 system, create a 20TB file, and
scrub/repair say it's ok.  Then you mount the same fs on your 2014-era
smartphone^W^Warm32 system.  Yes, the smartphone can't handle such a big
file, but is this fs corruption?  I don't think this qualifies for
OFLAG_CORRUPT.

Come to think of it, this is one of the scenarios for which
OFLAG_WARNING was intended -- not technically a violation of the disk
spec, but needs review anyway.

Ok, I've convinced myself. :)

	/*
	 * Warn if the running kernel can't handle the kinds of offsets
	 * needed to deal with the file size.  In other words, if the
	 * pagecache can't cache all the blocks in this file due to
	 * overly large offsets, flag the inode for admin review.
	 */
	if (isize >= mp->m_super->s_maxbytes)
		xfs_scrub_ino_set_warning(sc, bp);

> > > Directories have a maximum bound size, too - the data space, leaf
> > > space and freespace space, each of which are 32GB in size, IIRC.
> > > 
> > > And symlinks have a different maximum size, too.
> > 
> > Fair enough, I'll expand the i_size checks, though ISTR the verifiers
> > now check that for us.
> 
> If they do, then just drop a comment in there to say what is checked
> by the verifier.

Hmm, the dinode verifier checks for the upper bit and for zero-length
symlinks and directories.

Will add the symlink/directory isize checks.

> > > > +	if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0)
> > > > +		xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > 
> > > > +
> > > > +	/* di_nblocks */
> > > > +	if (flags2 & XFS_DIFLAG2_REFLINK) {
> > > > +		; /* nblocks can exceed dblocks */
> > > > +	} else if (flags & XFS_DIFLAG_REALTIME) {
> > > > +		if (be64_to_cpu(dip->di_nblocks) >=
> > > > +		    mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks)
> > > > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > 
> > > That doesn't seem right. the file can be on either the data or the
> > > rt device, so the maximum file blocks is the size of one device or
> > > the other, not both combined.
> > 
> > di_nblocks is the sum of (data blocks + bmbt blocks + attr blocks),
> > right?
> 
> Yeah, forgot it was more than just data extents.
> 
> > So in theory if you had a rt file with 1000 data blocks, 10 bmbt
> > blocks to map the data blocks, and 100 attr blocks then di_nblocks has
> > to be 1110.
> 
> Yup, but the additional metadata on the data device is not going to
> be anywhere near the size of the data device.
> 
> /me shrugs
> 
> I can't think of an easy way to get a maximum block count, so I
> guess that'll have to do...

Yeah, I couldn't come up with a more precise check that doesn't involve
checking the data forks directly (which comes up later).

> > > > +		if (nextents > fork_recs)
> > > > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > > +		break;
> > > > +	case XFS_DINODE_FMT_BTREE:
> > > > +		if (nextents <= fork_recs)
> > > > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > > +		break;
> > > > +	case XFS_DINODE_FMT_LOCAL:
> > > > +	case XFS_DINODE_FMT_DEV:
> > > > +	case XFS_DINODE_FMT_UUID:
> > > > +	default:
> > > > +		if (nextents != 0)
> > > > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > > +		break;
> > > > +	}
> > > > +
> > > > +	/* di_anextents */
> > > > +	nextents = be16_to_cpu(dip->di_anextents);
> > > > +	fork_recs =  XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
> > > > +	switch (dip->di_aformat) {
> > > > +	case XFS_DINODE_FMT_EXTENTS:
> > > > +		if (nextents > fork_recs)
> > > > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > > +		break;
> > > > +	case XFS_DINODE_FMT_BTREE:
> > > > +		if (nextents <= fork_recs)
> > > > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > > +		break;
> > > > +	case XFS_DINODE_FMT_LOCAL:
> > > > +	case XFS_DINODE_FMT_DEV:
> > > > +	case XFS_DINODE_FMT_UUID:
> > > > +	default:
> > > > +		if (nextents != 0)
> > > > +			xfs_scrub_ino_set_corrupt(sc, ino, bp);
> > > > +		break;
> > > > +	}
> > > 
> > > Don't we need a check here first to see whether an attribute fork
> > > exists or not?
> > 
> > Do you mean the xfs_inode_fork, or something else?
> 
> SOmething else. :P
> 
> > XFS_DFORK_ASIZE returns zero if !XFS_DFORK_Q which in turn is based on
> > di_forkoff so we're really only checking that di_aformat makes sense
> > given the number of extents and the size of the attr fork area.
> 
> Right, but if XFS_DFORK_ASIZE == 0, the dip->di_aformat *must* be
> XFS_DINODE_FMT_EXTENTS. That's the only valid configuration when
> there is no attribute fork present.
> 
> If there is an attribute fork present, then it can be XFS_DINODE_FMT_LOCAL,
> EXTENT or BTREE, and then the extent count needs checking.
> XFS_DINODE_FMT_DEV and XFS_DINODE_FMT_UUID are both invalid for the
> attribute fork.

Ok.  Will rearrange this to check di_forkoff and di_aformat before
bothering with the di_anextents checks.

--D

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index a7c5752..28e14b7 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -151,6 +151,7 @@  xfs-y				+= $(addprefix scrub/, \
 				   btree.o \
 				   common.o \
 				   ialloc.o \
+				   inode.o \
 				   refcount.o \
 				   rmap.o \
 				   scrub.o \
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index b3f992c..f8463e0 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -494,9 +494,10 @@  struct xfs_scrub_metadata {
 #define XFS_SCRUB_TYPE_FINOBT	8	/* free inode btree */
 #define XFS_SCRUB_TYPE_RMAPBT	9	/* reverse mapping btree */
 #define XFS_SCRUB_TYPE_REFCNTBT	10	/* reference count btree */
+#define XFS_SCRUB_TYPE_INODE	11	/* inode record */
 
 /* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR	11
+#define XFS_SCRUB_TYPE_NR	12
 
 /* i: Repair this metadata. */
 #define XFS_SCRUB_IFLAG_REPAIR		(1 << 0)
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 4f8d103..415dec6 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -30,6 +30,8 @@ 
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_itable.h"
 #include "xfs_alloc.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_bmap.h"
@@ -40,6 +42,8 @@ 
 #include "xfs_refcount_btree.h"
 #include "xfs_rmap.h"
 #include "xfs_rmap_btree.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -559,3 +563,64 @@  xfs_scrub_setup_ag_btree(
 
 	return xfs_scrub_ag_init(sc, sc->sm->sm_agno, &sc->sa);
 }
+
+/*
+ * Given an inode and the scrub control structure, grab either the
+ * inode referenced in the control structure or the inode passed in.
+ * The inode is not locked.
+ */
+int
+xfs_scrub_get_inode(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip_in)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_inode		*ips = NULL;
+	int				error;
+
+	if (sc->sm->sm_agno || (sc->sm->sm_gen && !sc->sm->sm_ino))
+		return -EINVAL;
+
+	/* We want to scan the inode we already had opened. */
+	if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
+		sc->ip = ip_in;
+		return 0;
+	}
+
+	/* Look up the inode, see if the generation number matches. */
+	if (xfs_internal_inum(mp, sc->sm->sm_ino))
+		return -ENOENT;
+	error = xfs_iget(mp, NULL, sc->sm->sm_ino, XFS_IGET_UNTRUSTED,
+			0, &ips);
+	if (error == -ENOENT || error == -EINVAL) {
+		/* inode doesn't exist... */
+		return -ENOENT;
+	} else if (error) {
+		trace_xfs_scrub_op_error(sc,
+				XFS_INO_TO_AGNO(mp, sc->sm->sm_ino),
+				XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
+				error, __return_address);
+		return error;
+	}
+	if (VFS_I(ips)->i_generation != sc->sm->sm_gen) {
+		iput(VFS_I(ips));
+		return -ENOENT;
+	}
+
+	sc->ip = ips;
+	return 0;
+}
+
+/* Push everything out of the log onto disk. */
+int
+xfs_scrub_checkpoint_log(
+	struct xfs_mount	*mp)
+{
+	int			error;
+
+	error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+	if (error)
+		return error;
+	xfs_ail_push_all_sync(mp->m_ail);
+	return 0;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 6325f03..c8143e8 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -74,6 +74,8 @@  void xfs_scrub_fblock_set_warning(struct xfs_scrub_context *sc, int whichfork,
 
 void xfs_scrub_set_incomplete(struct xfs_scrub_context *sc);
 
+int xfs_scrub_checkpoint_log(struct xfs_mount *mp);
+
 /* Setup functions */
 int xfs_scrub_setup_fs(struct xfs_scrub_context *sc, struct xfs_inode *ip);
 int xfs_scrub_setup_ag_header(struct xfs_scrub_context *sc,
@@ -86,6 +88,8 @@  int xfs_scrub_setup_ag_rmapbt(struct xfs_scrub_context *sc,
 			      struct xfs_inode *ip);
 int xfs_scrub_setup_ag_refcountbt(struct xfs_scrub_context *sc,
 				  struct xfs_inode *ip);
+int xfs_scrub_setup_inode(struct xfs_scrub_context *sc,
+			  struct xfs_inode *ip);
 
 
 void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa);
@@ -106,5 +110,6 @@  int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc,
 
 int xfs_scrub_setup_ag_btree(struct xfs_scrub_context *sc,
 			     struct xfs_inode *ip, bool force_log);
+int xfs_scrub_get_inode(struct xfs_scrub_context *sc, struct xfs_inode *ip_in);
 
 #endif	/* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
new file mode 100644
index 0000000..88dd3fc
--- /dev/null
+++ b/fs/xfs/scrub/inode.c
@@ -0,0 +1,353 @@ 
+/*
+ * Copyright (C) 2017 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_ialloc.h"
+#include "xfs_reflink.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Set us up with an inode. */
+int
+xfs_scrub_setup_inode(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip)
+{
+	struct xfs_mount		*mp = sc->mp;
+	int				error;
+
+	/*
+	 * Try to get the inode.  If the verifiers fail, we try again
+	 * in raw mode.
+	 */
+	error = xfs_scrub_get_inode(sc, ip);
+	switch (error) {
+	case 0:
+		break;
+	case -EFSCORRUPTED:
+	case -EFSBADCRC:
+		return xfs_scrub_checkpoint_log(mp);
+	default:
+		return error;
+	}
+
+	/* Got the inode, lock it and we're ready to go. */
+	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+	xfs_ilock(sc->ip, sc->ilock_flags);
+	error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+	if (error)
+		goto out_unlock;
+	sc->ilock_flags |= XFS_ILOCK_EXCL;
+	xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+
+	return error;
+out_unlock:
+	xfs_iunlock(sc->ip, sc->ilock_flags);
+	if (sc->ip != ip)
+		iput(VFS_I(sc->ip));
+	sc->ip = NULL;
+	return error;
+}
+
+/* Inode core */
+
+/* Scrub an inode. */
+int
+xfs_scrub_inode(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_imap			imap;
+	struct xfs_dinode		di;
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_buf			*bp = NULL;
+	struct xfs_dinode		*dip;
+	xfs_ino_t			ino;
+	size_t				fork_recs;
+	unsigned long long		isize;
+	uint64_t			flags2;
+	uint32_t			nextents;
+	uint32_t			extsize;
+	uint32_t			cowextsize;
+	uint16_t			flags;
+	uint16_t			mode;
+	bool				has_shared;
+	int				error = 0;
+
+	/* Did we get the in-core inode, or are we doing this manually? */
+	if (sc->ip) {
+		ino = sc->ip->i_ino;
+		xfs_inode_to_disk(sc->ip, &di, 0);
+		dip = &di;
+	} else {
+		/* Map & read inode. */
+		ino = sc->sm->sm_ino;
+		error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED);
+		if (error == -EINVAL) {
+			/*
+			 * Inode could have gotten deleted out from under us;
+			 * just forget about it.
+			 */
+			error = -ENOENT;
+			goto out;
+		}
+		if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino),
+				XFS_INO_TO_AGBNO(mp, ino), &error))
+			goto out;
+
+		error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+				imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp,
+				NULL);
+		if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino),
+				XFS_INO_TO_AGBNO(mp, ino), &error))
+			goto out;
+
+		/* Is this really an inode? */
+		bp->b_ops = &xfs_inode_buf_ops;
+		dip = xfs_buf_offset(bp, imap.im_boffset);
+		if (!xfs_dinode_verify(mp, ino, dip) ||
+		    !xfs_dinode_good_version(mp, dip->di_version)) {
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+			goto out;
+		}
+
+		/* ...and is it the one we asked for? */
+		if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) {
+			error = -ENOENT;
+			goto out;
+		}
+	}
+
+	flags = be16_to_cpu(dip->di_flags);
+	if (dip->di_version >= 3)
+		flags2 = be64_to_cpu(dip->di_flags2);
+	else
+		flags2 = 0;
+
+	/* di_mode */
+	mode = be16_to_cpu(dip->di_mode);
+	if (mode & ~(S_IALLUGO | S_IFMT))
+		xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+	/* v1/v2 fields */
+	switch (dip->di_version) {
+	case 1:
+		if (dip->di_nlink != 0)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+		if (dip->di_mode == 0 && sc->ip)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+		if (dip->di_projid_lo != 0 || dip->di_projid_hi != 0)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		break;
+	case 2:
+	case 3:
+		if (dip->di_onlink != 0)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+		if (dip->di_mode == 0 && sc->ip)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+		if (dip->di_projid_hi != 0 &&
+		    !xfs_sb_version_hasprojid32bit(&mp->m_sb))
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	/*
+	 * di_uid/di_gid -- -1 isn't invalid, but there's no way that
+	 * userspace could have created that.
+	 */
+	if (dip->di_uid == cpu_to_be32(-1U) ||
+	    dip->di_gid == cpu_to_be32(-1U))
+		xfs_scrub_ino_set_warning(sc, bp);
+
+	/* di_format */
+	switch (dip->di_format) {
+	case XFS_DINODE_FMT_DEV:
+		if (!S_ISCHR(mode) && !S_ISBLK(mode) &&
+		    !S_ISFIFO(mode) && !S_ISSOCK(mode))
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+		if (!S_ISDIR(mode) && !S_ISLNK(mode))
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode))
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		if (!S_ISREG(mode) && !S_ISDIR(mode))
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		break;
+	case XFS_DINODE_FMT_UUID:
+	default:
+		xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		break;
+	}
+
+	/* di_size */
+	isize = be64_to_cpu(dip->di_size);
+	if (isize & (1ULL << 63))
+		xfs_scrub_ino_set_corrupt(sc, ino, bp);
+	if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0)
+		xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+	/* di_nblocks */
+	if (flags2 & XFS_DIFLAG2_REFLINK) {
+		; /* nblocks can exceed dblocks */
+	} else if (flags & XFS_DIFLAG_REALTIME) {
+		if (be64_to_cpu(dip->di_nblocks) >=
+		    mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+	} else {
+		if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+	}
+
+	/* di_extsize */
+	extsize = be32_to_cpu(dip->di_extsize);
+	if (flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) {
+		if (extsize <= 0 || extsize > MAXEXTLEN)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+		if (!(flags & XFS_DIFLAG_REALTIME) &&
+		    extsize > mp->m_sb.sb_agblocks / 2)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+	} else {
+		if (extsize != 0)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+	}
+
+	/* di_flags */
+	if ((flags & XFS_DIFLAG_IMMUTABLE) && (flags & XFS_DIFLAG_APPEND))
+		xfs_scrub_ino_set_corrupt(sc, ino, bp);
+	if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME))
+		xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+	/* di_nextents */
+	nextents = be32_to_cpu(dip->di_nextents);
+	fork_recs =  XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
+	switch (dip->di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		if (nextents > fork_recs)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		if (nextents <= fork_recs)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_DEV:
+	case XFS_DINODE_FMT_UUID:
+	default:
+		if (nextents != 0)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		break;
+	}
+
+	/* di_anextents */
+	nextents = be16_to_cpu(dip->di_anextents);
+	fork_recs =  XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
+	switch (dip->di_aformat) {
+	case XFS_DINODE_FMT_EXTENTS:
+		if (nextents > fork_recs)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		if (nextents <= fork_recs)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_DEV:
+	case XFS_DINODE_FMT_UUID:
+	default:
+		if (nextents != 0)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		break;
+	}
+
+	/* di_forkoff */
+	if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
+		xfs_scrub_ino_set_corrupt(sc, ino, bp);
+	if (dip->di_anextents != 0 && dip->di_forkoff == 0)
+		xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+	/* di_aformat */
+	if (dip->di_aformat != XFS_DINODE_FMT_LOCAL &&
+	    dip->di_aformat != XFS_DINODE_FMT_EXTENTS &&
+	    dip->di_aformat != XFS_DINODE_FMT_BTREE)
+		xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+	/* di_cowextsize */
+	if (flags2 & XFS_DIFLAG2_COWEXTSIZE) {
+		cowextsize = be32_to_cpu(dip->di_cowextsize);
+
+		if (!xfs_sb_version_hasreflink(&mp->m_sb))
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		if (cowextsize <= 0 || cowextsize > MAXEXTLEN)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+		if (cowextsize > mp->m_sb.sb_agblocks / 2)
+			xfs_scrub_ino_set_corrupt(sc, ino, bp);
+	}
+
+	/* Now let's do the things that require a live inode. */
+	if (!sc->ip)
+		goto out;
+
+	/*
+	 * Does this inode have the reflink flag set but no shared extents?
+	 * Set the preening flag if this is the case.
+	 */
+	if (xfs_is_reflink_inode(sc->ip)) {
+		error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
+				&has_shared);
+		if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino),
+				XFS_INO_TO_AGBNO(mp, ino), &error))
+			goto out;
+		if (!has_shared)
+			xfs_scrub_ino_set_preen(sc, bp);
+	}
+
+out:
+	if (bp)
+		xfs_trans_brelse(sc->tp, bp);
+	return error;
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 7b44364..f014ef0 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -30,6 +30,8 @@ 
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_itable.h"
 #include "xfs_alloc.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_bmap.h"
@@ -136,6 +138,7 @@  xfs_scrub_probe(
 STATIC int
 xfs_scrub_teardown(
 	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip_in,
 	int				error)
 {
 	xfs_scrub_ag_free(sc, &sc->sa);
@@ -143,6 +146,12 @@  xfs_scrub_teardown(
 		xfs_trans_cancel(sc->tp);
 		sc->tp = NULL;
 	}
+	if (sc->ip) {
+		xfs_iunlock(sc->ip, sc->ilock_flags);
+		if (sc->ip != ip_in)
+			iput(VFS_I(sc->ip));
+		sc->ip = NULL;
+	}
 	return error;
 }
 
@@ -196,6 +205,10 @@  static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
 		.scrub	= xfs_scrub_refcountbt,
 		.has	= xfs_sb_version_hasreflink,
 	},
+	{ /* inode record */
+		.setup	= xfs_scrub_setup_inode,
+		.scrub	= xfs_scrub_inode,
+	},
 };
 
 /* This isn't a stable feature, warn once per day. */
@@ -281,7 +294,7 @@  xfs_scrub_metadata(
 		 * Tear down everything we hold, then set up again with
 		 * preparation for worst-case scenarios.
 		 */
-		error = xfs_scrub_teardown(&sc, 0);
+		error = xfs_scrub_teardown(&sc, ip, 0);
 		if (error)
 			goto out;
 		try_harder = true;
@@ -294,7 +307,7 @@  xfs_scrub_metadata(
 		xfs_alert_ratelimited(mp, "Corruption detected during scrub.");
 
 out_teardown:
-	error = xfs_scrub_teardown(&sc, error);
+	error = xfs_scrub_teardown(&sc, ip, error);
 out:
 	trace_xfs_scrub_done(ip, sm, error);
 	return error;
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 1c80bf5..ec635d4 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -59,6 +59,7 @@  struct xfs_scrub_context {
 	const struct xfs_scrub_meta_ops	*ops;
 	struct xfs_trans		*tp;
 	struct xfs_inode		*ip;
+	uint				ilock_flags;
 	bool				try_harder;
 
 	/* State tracking for single-AG operations. */
@@ -77,5 +78,6 @@  int xfs_scrub_inobt(struct xfs_scrub_context *sc);
 int xfs_scrub_finobt(struct xfs_scrub_context *sc);
 int xfs_scrub_rmapbt(struct xfs_scrub_context *sc);
 int xfs_scrub_refcountbt(struct xfs_scrub_context *sc);
+int xfs_scrub_inode(struct xfs_scrub_context *sc);
 
 #endif	/* __XFS_SCRUB_SCRUB_H__ */