diff mbox series

[3/6] xfs: repair inode records

Message ID 169049626483.922543.14635359971498732607.stgit@frogsfrogsfrogs (mailing list archive)
State Superseded, archived
Headers show
Series xfs: online repair of inodes and forks | expand

Commit Message

Darrick J. Wong July 27, 2023, 10:32 p.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

If an inode is so badly damaged that it cannot be loaded into the cache,
fix the ondisk metadata and try again.  If there /is/ a cached inode,
fix any problems and apply any optimizations that can be solved incore.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile             |    1 
 fs/xfs/libxfs/xfs_format.h  |    3 
 fs/xfs/scrub/alloc.c        |    2 
 fs/xfs/scrub/inode.c        |   10 +
 fs/xfs/scrub/inode_repair.c |  763 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.c       |   32 ++
 fs/xfs/scrub/repair.h       |   19 +
 fs/xfs/scrub/scrub.c        |    2 
 fs/xfs/scrub/trace.h        |  129 +++++++
 9 files changed, 958 insertions(+), 3 deletions(-)
 create mode 100644 fs/xfs/scrub/inode_repair.c

Comments

Dave Chinner Aug. 9, 2023, 8:42 a.m. UTC | #1
On Thu, Jul 27, 2023 at 03:32:53PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> If an inode is so badly damaged that it cannot be loaded into the cache,
> fix the ondisk metadata and try again.  If there /is/ a cached inode,
> fix any problems and apply any optimizations that can be solved incore.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
.....
> diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
> new file mode 100644
> index 0000000000000..952832e9fd029
> --- /dev/null
> +++ b/fs/xfs/scrub/inode_repair.c
> @@ -0,0 +1,763 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
> + * Author: Darrick J. Wong <djwong@kernel.org>
> + */
> +#include "xfs.h"
> +#include "xfs_fs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_defer.h"
> +#include "xfs_btree.h"
> +#include "xfs_bit.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans.h"
> +#include "xfs_sb.h"
> +#include "xfs_inode.h"
> +#include "xfs_icache.h"
> +#include "xfs_inode_buf.h"
> +#include "xfs_inode_fork.h"
> +#include "xfs_ialloc.h"
> +#include "xfs_da_format.h"
> +#include "xfs_reflink.h"
> +#include "xfs_rmap.h"
> +#include "xfs_bmap.h"
> +#include "xfs_bmap_util.h"
> +#include "xfs_dir2.h"
> +#include "xfs_dir2_priv.h"
> +#include "xfs_quota_defs.h"
> +#include "xfs_quota.h"
> +#include "xfs_ag.h"
> +#include "scrub/xfs_scrub.h"
> +#include "scrub/scrub.h"
> +#include "scrub/common.h"
> +#include "scrub/btree.h"
> +#include "scrub/trace.h"
> +#include "scrub/repair.h"
> +
> +/*
> + * Inode Repair
> + *
> + * Roughly speaking, inode problems can be classified based on whether or not
> + * they trip the dinode verifiers.  If those trip, then we won't be able to
> + * _iget ourselves the inode.
> + *
> + * Therefore, the xrep_dinode_* functions fix anything that will cause the
> + * inode buffer verifier or the dinode verifier.  The xrep_inode_* functions
> + * fix things on live incore inodes.
> + */

I'd like to see some of the decisions made documented here. Stuff
like:

- "unknown di_mode converts inode to a regular file only root can
  read" needs to be clearly documented because that "regular file"
  that results might not actually contain user data....
- what we do with setuid/setgid on repaired inodes
- things we just trash and leave to other parts of repair to clean
  up stuff we leak or trash...


> +/* Fix any conflicting flags that the verifiers complain about. */
> +STATIC void
> +xrep_dinode_flags(
> +	struct xfs_scrub	*sc,
> +	struct xfs_dinode	*dip)
> +{
> +	struct xfs_mount	*mp = sc->mp;
> +	uint64_t		flags2;
> +	uint16_t		mode;
> +	uint16_t		flags;
> +
> +	trace_xrep_dinode_flags(sc, dip);
> +
> +	mode = be16_to_cpu(dip->di_mode);
> +	flags = be16_to_cpu(dip->di_flags);
> +	flags2 = be64_to_cpu(dip->di_flags2);
> +
> +	if (xfs_has_reflink(mp) && S_ISREG(mode))
> +		flags2 |= XFS_DIFLAG2_REFLINK;
> +	else
> +		flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
> +	if (flags & XFS_DIFLAG_REALTIME)
> +		flags2 &= ~XFS_DIFLAG2_REFLINK;
> +	if (flags2 & XFS_DIFLAG2_REFLINK)
> +		flags2 &= ~XFS_DIFLAG2_DAX;

IIRC, reflink and DAX co-exist just fine now....

> +	if (!xfs_has_bigtime(mp))
> +		flags2 &= ~XFS_DIFLAG2_BIGTIME;
> +	if (!xfs_has_large_extent_counts(mp))
> +		flags2 &= ~XFS_DIFLAG2_NREXT64;
> +	if (flags2 & XFS_DIFLAG2_NREXT64)
> +		dip->di_nrext64_pad = 0;
> +	else if (dip->di_version >= 3)
> +		dip->di_v3_pad = 0;
> +	dip->di_flags = cpu_to_be16(flags);
> +	dip->di_flags2 = cpu_to_be64(flags2);
> +}
> +
> +/*
> + * Blow out symlink; now it points to the current dir.  We don't have to worry
> + * about incore state because this inode is failing the verifiers.
> + */
> +STATIC void
> +xrep_dinode_zap_symlink(
> +	struct xfs_scrub	*sc,
> +	struct xfs_dinode	*dip)
> +{
> +	char			*p;
> +
> +	trace_xrep_dinode_zap_symlink(sc, dip);
> +
> +	dip->di_format = XFS_DINODE_FMT_LOCAL;
> +	dip->di_size = cpu_to_be64(1);
> +	p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
> +	*p = '.';

What if this was in extent form? Didn't we just leak an extent?

> +}
> +
> +/*
> + * Blow out dir, make it point to the root.  In the future repair will
> + * reconstruct this directory for us.  Note that there's no in-core directory
> + * inode because the sf verifier tripped, so we don't have to worry about the
> + * dentry cache.
> + */
> +STATIC void
> +xrep_dinode_zap_dir(
> +	struct xfs_scrub	*sc,
> +	struct xfs_dinode	*dip)
> +{
> +	struct xfs_mount	*mp = sc->mp;
> +	struct xfs_dir2_sf_hdr	*sfp;
> +	int			i8count;
> +
> +	trace_xrep_dinode_zap_dir(sc, dip);
> +
> +	dip->di_format = XFS_DINODE_FMT_LOCAL;
> +	i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM;
> +	sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
> +	sfp->count = 0;
> +	sfp->i8count = i8count;
> +	xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino);
> +	dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count));
> +}

Same here?

> +
> +/* Make sure we don't have a garbage file size. */
> +STATIC void
> +xrep_dinode_size(
> +	struct xfs_scrub	*sc,
> +	struct xfs_dinode	*dip)
> +{
> +	uint64_t		size;
> +	uint16_t		mode;
> +
> +	trace_xrep_dinode_size(sc, dip);
> +
> +	mode = be16_to_cpu(dip->di_mode);
> +	size = be64_to_cpu(dip->di_size);
> +	switch (mode & S_IFMT) {
> +	case S_IFIFO:
> +	case S_IFCHR:
> +	case S_IFBLK:
> +	case S_IFSOCK:
> +		/* di_size can't be nonzero for special files */
> +		dip->di_size = 0;
> +		break;
> +	case S_IFREG:
> +		/* Regular files can't be larger than 2^63-1 bytes. */
> +		dip->di_size = cpu_to_be64(size & ~(1ULL << 63));
> +		break;
> +	case S_IFLNK:
> +		/*
> +		 * Truncate ridiculously oversized symlinks.  If the size is
> +		 * zero, reset it to point to the current directory.  Both of
> +		 * these conditions trigger dinode verifier errors, so there
> +		 * is no in-core state to reset.
> +		 */
> +		if (size > XFS_SYMLINK_MAXLEN)
> +			dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN);
> +		else if (size == 0)
> +			xrep_dinode_zap_symlink(sc, dip);
> +		break;
> +	case S_IFDIR:
> +		/*
> +		 * Directories can't have a size larger than 32G.  If the size
> +		 * is zero, reset it to an empty directory.  Both of these
> +		 * conditions trigger dinode verifier errors, so there is no
> +		 * in-core state to reset.
> +		 */
> +		if (size > XFS_DIR2_SPACE_SIZE)
> +			dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE);
> +		else if (size == 0)
> +			xrep_dinode_zap_dir(sc, dip);
> +		break;
> +	}
> +}
> +
> +/* Fix extent size hints. */
> +STATIC void
> +xrep_dinode_extsize_hints(
> +	struct xfs_scrub	*sc,
> +	struct xfs_dinode	*dip)
> +{
> +	struct xfs_mount	*mp = sc->mp;
> +	uint64_t		flags2;
> +	uint16_t		flags;
> +	uint16_t		mode;
> +	xfs_failaddr_t		fa;
> +
> +	trace_xrep_dinode_extsize_hints(sc, dip);
> +
> +	mode = be16_to_cpu(dip->di_mode);
> +	flags = be16_to_cpu(dip->di_flags);
> +	flags2 = be64_to_cpu(dip->di_flags2);
> +
> +	fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
> +			mode, flags);
> +	if (fa) {
> +		dip->di_extsize = 0;
> +		dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE |
> +					      XFS_DIFLAG_EXTSZINHERIT);
> +	}
> +
> +	if (dip->di_version < 3)
> +		return;
> +
> +	fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
> +			mode, flags, flags2);
> +	if (fa) {
> +		dip->di_cowextsize = 0;
> +		dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
> +	}
> +}
> +
> +/* Inode didn't pass verifiers, so fix the raw buffer and retry iget. */
> +STATIC int
> +xrep_dinode_core(
> +	struct xrep_inode	*ri)
> +{
> +	struct xfs_scrub	*sc = ri->sc;
> +	struct xfs_buf		*bp;
> +	struct xfs_dinode	*dip;
> +	xfs_ino_t		ino = sc->sm->sm_ino;
> +	int			error;
> +
> +	/* Read the inode cluster buffer. */
> +	error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
> +			ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
> +			NULL);
> +	if (error)
> +		return error;
> +
> +	/* Make sure we can pass the inode buffer verifier. */
> +	xrep_dinode_buf(sc, bp);
> +	bp->b_ops = &xfs_inode_buf_ops;

Hmmmmm. Don't we at least need to check this looks like an inode
cluster buffer first?

....
> +
> +/* Check for invalid uid/gid/prid. */
> +STATIC void
> +xrep_inode_ids(
> +	struct xfs_scrub	*sc)
> +{
> +	trace_xrep_inode_ids(sc);
> +
> +	if (i_uid_read(VFS_I(sc->ip)) == -1U) {
> +		i_uid_write(VFS_I(sc->ip), 0);
> +		VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
> +		if (XFS_IS_UQUOTA_ON(sc->mp))
> +			xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
> +	}
> +
> +	if (i_gid_read(VFS_I(sc->ip)) == -1U) {
> +		i_gid_write(VFS_I(sc->ip), 0);
> +		VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
> +		if (XFS_IS_GQUOTA_ON(sc->mp))
> +			xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
> +	}

If we are repairing an inode that has setuid or setgid, I think we
should just strip those permissions regardless of whether the
uid/gid are valid. I think it's better to be cautious here rather
than leave setuid on a file that we reconstructed but have no real
way of knowing that data in the file is untainted.

> +
> +	if (sc->ip->i_projid == -1U) {
> +		sc->ip->i_projid = 0;
> +		if (XFS_IS_PQUOTA_ON(sc->mp))
> +			xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
> +	}
> +}
> +
> +static inline void
> +xrep_clamp_nsec(
> +	struct timespec64	*ts)
> +{
> +	ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC);
> +}
> +
> +/* Nanosecond counters can't have more than 1 billion. */
> +STATIC void
> +xrep_inode_timestamps(
> +	struct xfs_inode	*ip)
> +{
> +	xrep_clamp_nsec(&VFS_I(ip)->i_atime);
> +	xrep_clamp_nsec(&VFS_I(ip)->i_mtime);
> +	xrep_clamp_nsec(&VFS_I(ip)->i_ctime);
> +	xrep_clamp_nsec(&ip->i_crtime);
> +}

Should we be clamping the entire timestamp within the valid
filesystem timestamp range here?

> +
> +/* Fix inode flags that don't make sense together. */
> +STATIC void
> +xrep_inode_flags(
> +	struct xfs_scrub	*sc)
> +{
> +	uint16_t		mode;
> +
> +	trace_xrep_inode_flags(sc);
....
> +	/* No mixing reflink and DAX yet. */
> +	if (sc->ip->i_diflags2 & XFS_DIFLAG2_REFLINK)
> +		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;

This can go, too...

.....
> @@ -750,6 +750,38 @@ xrep_ino_dqattach(
>  }
>  #endif /* CONFIG_XFS_QUOTA */
>  
> +/*
> + * Ensure that the inode being repaired is ready to handle a certain number of
> + * extents, or return EFSCORRUPTED.  Caller must hold the ILOCK of the inode
> + * being repaired and have joined it to the scrub transaction.
> + */
> +int
> +xrep_ino_ensure_extent_count(
> +	struct xfs_scrub	*sc,
> +	int			whichfork,
> +	xfs_extnum_t		nextents)
> +{
> +	xfs_extnum_t		max_extents;
> +	bool			large_extcount;
> +
> +	large_extcount = xfs_inode_has_large_extent_counts(sc->ip);
> +	max_extents = xfs_iext_max_nextents(large_extcount, whichfork);
> +	if (nextents <= max_extents)
> +		return 0;
> +	if (large_extcount)
> +		return -EFSCORRUPTED;
> +	if (!xfs_has_large_extent_counts(sc->mp))
> +		return -EFSCORRUPTED;

This logic took me a bit of peering at to work out. large_extcount says
whether the inode has the large extcount flag set, which is
different to whether the superblock has the large extcount flag set.

Can you change large_extcount to inode_has_nrext64 or something like
that just so it's really clear that there are two different flags
being checked here?

> diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> index ac8f0200b2963..e239b432d19e8 100644
> --- a/fs/xfs/scrub/repair.h
> +++ b/fs/xfs/scrub/repair.h
> @@ -28,6 +28,16 @@ bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
>  		enum xfs_ag_resv_type type);
>  xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc);
>  
> +static inline int
> +xrep_trans_commit(
> +	struct xfs_scrub	*sc)
> +{
> +	int			error = xfs_trans_commit(sc->tp);
> +
> +	sc->tp = NULL;
> +	return error;
> +}

That's .... interesting formatting. I'd be happy with using standard
linux format for this:

static inline int xrep_trans_commit(struct xfs_scrub *sc)
{
	int error = xfs_trans_commit(sc->tp);

	sc->tp = NULL;
	return error;
}

But that's just personal preference....

-Dave.
Darrick J. Wong Aug. 10, 2023, 12:43 a.m. UTC | #2
On Wed, Aug 09, 2023 at 06:42:58PM +1000, Dave Chinner wrote:
> On Thu, Jul 27, 2023 at 03:32:53PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > If an inode is so badly damaged that it cannot be loaded into the cache,
> > fix the ondisk metadata and try again.  If there /is/ a cached inode,
> > fix any problems and apply any optimizations that can be solved incore.
> > 
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> .....
> > diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
> > new file mode 100644
> > index 0000000000000..952832e9fd029
> > --- /dev/null
> > +++ b/fs/xfs/scrub/inode_repair.c
> > @@ -0,0 +1,763 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +/*
> > + * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
> > + * Author: Darrick J. Wong <djwong@kernel.org>
> > + */
> > +#include "xfs.h"
> > +#include "xfs_fs.h"
> > +#include "xfs_shared.h"
> > +#include "xfs_format.h"
> > +#include "xfs_trans_resv.h"
> > +#include "xfs_mount.h"
> > +#include "xfs_defer.h"
> > +#include "xfs_btree.h"
> > +#include "xfs_bit.h"
> > +#include "xfs_log_format.h"
> > +#include "xfs_trans.h"
> > +#include "xfs_sb.h"
> > +#include "xfs_inode.h"
> > +#include "xfs_icache.h"
> > +#include "xfs_inode_buf.h"
> > +#include "xfs_inode_fork.h"
> > +#include "xfs_ialloc.h"
> > +#include "xfs_da_format.h"
> > +#include "xfs_reflink.h"
> > +#include "xfs_rmap.h"
> > +#include "xfs_bmap.h"
> > +#include "xfs_bmap_util.h"
> > +#include "xfs_dir2.h"
> > +#include "xfs_dir2_priv.h"
> > +#include "xfs_quota_defs.h"
> > +#include "xfs_quota.h"
> > +#include "xfs_ag.h"
> > +#include "scrub/xfs_scrub.h"
> > +#include "scrub/scrub.h"
> > +#include "scrub/common.h"
> > +#include "scrub/btree.h"
> > +#include "scrub/trace.h"
> > +#include "scrub/repair.h"
> > +
> > +/*
> > + * Inode Repair
> > + *
> > + * Roughly speaking, inode problems can be classified based on whether or not
> > + * they trip the dinode verifiers.  If those trip, then we won't be able to
> > + * _iget ourselves the inode.
> > + *
> > + * Therefore, the xrep_dinode_* functions fix anything that will cause the
> > + * inode buffer verifier or the dinode verifier.  The xrep_inode_* functions
> > + * fix things on live incore inodes.
> > + */
> 
> I'd like to see some of the decisions made documented here. Stuff
> like:
> 
> - "unknown di_mode converts inode to a regular file only root can
>   read" needs to be clearly documented because that "regular file"
>   that results might not actually contain user data....
> - what we do with setuid/setgid on repaired inodes
> - things we just trash and leave to other parts of repair to clean
>   up stuff we leak or trash...

Ok.

 * Therefore, the xrep_dinode_* functions fix anything that will cause
 * the inode buffer verifier or the dinode verifier to fail.  The xrep_inode_*
 * functions fix things on live incore inodes.  The repair functions in
 * here can make decisions with security and usability implications in
 * order to revive a file:
 *
 * - Files with zero di_mode or a garbage di_mode are converted to a
 * file that only root can read.  If the immediate data fork area or
 * block 0 of the data fork look like a directory, the file type will be
 * set to a directory.  If the immediate data fork area has no nulls, it
 * will be turned into a symbolic link.  Otherwise, it is turned into a
 * regular file.  This file may not actually contain user data, if the
 * file was not previously a regular file.  Setuid and setgid bits are
 * cleared.
 *
 * - Zero-size directories can be truncated to look empty.  It is
 * necessary to run the bmapbtd and directory repair functions to fully
 * rebuild the directory.
 *
 * - Zero-size symbolic link targets can be truncated to '.'.  It is
 * necessary to run the bmapbtd and symlink repair functions to salvage
 * the symlink.
 *
 * - Invalid extent size hints will be removed.
 *
 * - Quotacheck will be scheduled if we repaired an inode that was so
 * badly damaged that the ondisk inode had to be rebuilt.
 *
 * - Invalid user, group, or project IDs (aka -1U) will be reset to
 * zero.  Setuid and setgid bits are cleared.

The next patch will add to that:

 * - Data and attr forks are reset to extents format with zero extents
 * if the fork data is inconsistent.  It is necessary to run the bmapbtd
 * or bmapbta repair functions to recover the space mapping.
 *
 * - ACLs will not be recovered if the attr fork is zapped or the
 * extended attribute structure itself requires salvaging.
 *
 * - If the attr fork is zapped, the user and group ids are reset to
 * root and the setuid and setgid bits are removed.

How does that sit with you?

> 
> > +/* Fix any conflicting flags that the verifiers complain about. */
> > +STATIC void
> > +xrep_dinode_flags(
> > +	struct xfs_scrub	*sc,
> > +	struct xfs_dinode	*dip)
> > +{
> > +	struct xfs_mount	*mp = sc->mp;
> > +	uint64_t		flags2;
> > +	uint16_t		mode;
> > +	uint16_t		flags;
> > +
> > +	trace_xrep_dinode_flags(sc, dip);
> > +
> > +	mode = be16_to_cpu(dip->di_mode);
> > +	flags = be16_to_cpu(dip->di_flags);
> > +	flags2 = be64_to_cpu(dip->di_flags2);
> > +
> > +	if (xfs_has_reflink(mp) && S_ISREG(mode))
> > +		flags2 |= XFS_DIFLAG2_REFLINK;
> > +	else
> > +		flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
> > +	if (flags & XFS_DIFLAG_REALTIME)
> > +		flags2 &= ~XFS_DIFLAG2_REFLINK;
> > +	if (flags2 & XFS_DIFLAG2_REFLINK)
> > +		flags2 &= ~XFS_DIFLAG2_DAX;
> 
> IIRC, reflink and DAX co-exist just fine now....

Yep.  Fixed.

> > +	if (!xfs_has_bigtime(mp))
> > +		flags2 &= ~XFS_DIFLAG2_BIGTIME;
> > +	if (!xfs_has_large_extent_counts(mp))
> > +		flags2 &= ~XFS_DIFLAG2_NREXT64;
> > +	if (flags2 & XFS_DIFLAG2_NREXT64)
> > +		dip->di_nrext64_pad = 0;
> > +	else if (dip->di_version >= 3)
> > +		dip->di_v3_pad = 0;
> > +	dip->di_flags = cpu_to_be16(flags);
> > +	dip->di_flags2 = cpu_to_be64(flags2);
> > +}
> > +
> > +/*
> > + * Blow out symlink; now it points to the current dir.  We don't have to worry
> > + * about incore state because this inode is failing the verifiers.
> > + */
> > +STATIC void
> > +xrep_dinode_zap_symlink(
> > +	struct xfs_scrub	*sc,
> > +	struct xfs_dinode	*dip)
> > +{
> > +	char			*p;
> > +
> > +	trace_xrep_dinode_zap_symlink(sc, dip);
> > +
> > +	dip->di_format = XFS_DINODE_FMT_LOCAL;
> > +	dip->di_size = cpu_to_be64(1);
> > +	p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
> > +	*p = '.';
> 
> What if this was in extent form? Didn't we just leak an extent?

Yeah.  I'll add that to the giant comment.

"Zero-size symbolic link targets can be truncated to '.'.  It is
necessary to run the bmapbtd and symlink repair functions to salvage the
symlink."

The next few patches will add the ability to zap the data and attr forks
if either of them looks bad.  After that, the caller (or really xfs_scrub)
will have to call several more scrubbers to completely fix the file:

bmapbtd -> symlink/directory

bmapbta -> attr -> parent ptr

So this is a common post-requirement for the inode repair code.  It's a
bit racy, and arguably the kernel could auto-invoke those repair
functions instead of requiring userspace to call back, but that's
something for another conversation. :)

(e.g. "Is it ok if this one repair function could potentially take a
very long time to finish, and won't tell userspace what it's up to?")

((The scrub vectorization in online fsck part 3 might actually be a
reasonable way for xfs_scrub to get the kernel to do everything all at
once.))

> > +}
> > +
> > +/*
> > + * Blow out dir, make it point to the root.  In the future repair will
> > + * reconstruct this directory for us.  Note that there's no in-core directory
> > + * inode because the sf verifier tripped, so we don't have to worry about the
> > + * dentry cache.
> > + */
> > +STATIC void
> > +xrep_dinode_zap_dir(
> > +	struct xfs_scrub	*sc,
> > +	struct xfs_dinode	*dip)
> > +{
> > +	struct xfs_mount	*mp = sc->mp;
> > +	struct xfs_dir2_sf_hdr	*sfp;
> > +	int			i8count;
> > +
> > +	trace_xrep_dinode_zap_dir(sc, dip);
> > +
> > +	dip->di_format = XFS_DINODE_FMT_LOCAL;
> > +	i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM;
> > +	sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
> > +	sfp->count = 0;
> > +	sfp->i8count = i8count;
> > +	xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino);
> > +	dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count));
> > +}
> 
> Same here?

Same as above.

> > +
> > +/* Make sure we don't have a garbage file size. */
> > +STATIC void
> > +xrep_dinode_size(
> > +	struct xfs_scrub	*sc,
> > +	struct xfs_dinode	*dip)
> > +{
> > +	uint64_t		size;
> > +	uint16_t		mode;
> > +
> > +	trace_xrep_dinode_size(sc, dip);
> > +
> > +	mode = be16_to_cpu(dip->di_mode);
> > +	size = be64_to_cpu(dip->di_size);
> > +	switch (mode & S_IFMT) {
> > +	case S_IFIFO:
> > +	case S_IFCHR:
> > +	case S_IFBLK:
> > +	case S_IFSOCK:
> > +		/* di_size can't be nonzero for special files */
> > +		dip->di_size = 0;
> > +		break;
> > +	case S_IFREG:
> > +		/* Regular files can't be larger than 2^63-1 bytes. */
> > +		dip->di_size = cpu_to_be64(size & ~(1ULL << 63));
> > +		break;
> > +	case S_IFLNK:
> > +		/*
> > +		 * Truncate ridiculously oversized symlinks.  If the size is
> > +		 * zero, reset it to point to the current directory.  Both of
> > +		 * these conditions trigger dinode verifier errors, so there
> > +		 * is no in-core state to reset.
> > +		 */
> > +		if (size > XFS_SYMLINK_MAXLEN)
> > +			dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN);
> > +		else if (size == 0)
> > +			xrep_dinode_zap_symlink(sc, dip);
> > +		break;
> > +	case S_IFDIR:
> > +		/*
> > +		 * Directories can't have a size larger than 32G.  If the size
> > +		 * is zero, reset it to an empty directory.  Both of these
> > +		 * conditions trigger dinode verifier errors, so there is no
> > +		 * in-core state to reset.
> > +		 */
> > +		if (size > XFS_DIR2_SPACE_SIZE)
> > +			dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE);
> > +		else if (size == 0)
> > +			xrep_dinode_zap_dir(sc, dip);
> > +		break;
> > +	}
> > +}
> > +
> > +/* Fix extent size hints. */
> > +STATIC void
> > +xrep_dinode_extsize_hints(
> > +	struct xfs_scrub	*sc,
> > +	struct xfs_dinode	*dip)
> > +{
> > +	struct xfs_mount	*mp = sc->mp;
> > +	uint64_t		flags2;
> > +	uint16_t		flags;
> > +	uint16_t		mode;
> > +	xfs_failaddr_t		fa;
> > +
> > +	trace_xrep_dinode_extsize_hints(sc, dip);
> > +
> > +	mode = be16_to_cpu(dip->di_mode);
> > +	flags = be16_to_cpu(dip->di_flags);
> > +	flags2 = be64_to_cpu(dip->di_flags2);
> > +
> > +	fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
> > +			mode, flags);
> > +	if (fa) {
> > +		dip->di_extsize = 0;
> > +		dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE |
> > +					      XFS_DIFLAG_EXTSZINHERIT);
> > +	}
> > +
> > +	if (dip->di_version < 3)
> > +		return;
> > +
> > +	fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
> > +			mode, flags, flags2);
> > +	if (fa) {
> > +		dip->di_cowextsize = 0;
> > +		dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
> > +	}
> > +}
> > +
> > +/* Inode didn't pass verifiers, so fix the raw buffer and retry iget. */
> > +STATIC int
> > +xrep_dinode_core(
> > +	struct xrep_inode	*ri)
> > +{
> > +	struct xfs_scrub	*sc = ri->sc;
> > +	struct xfs_buf		*bp;
> > +	struct xfs_dinode	*dip;
> > +	xfs_ino_t		ino = sc->sm->sm_ino;
> > +	int			error;
> > +
> > +	/* Read the inode cluster buffer. */
> > +	error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
> > +			ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
> > +			NULL);
> > +	if (error)
> > +		return error;
> > +
> > +	/* Make sure we can pass the inode buffer verifier. */
> > +	xrep_dinode_buf(sc, bp);
> > +	bp->b_ops = &xfs_inode_buf_ops;
> 
> Hmmmmm. Don't we at least need to check this looks like an inode
> cluster buffer first?

Check it how?  The cluster buffer could be completely trashed due to
crosslinking with a regular file, or bad storage devices, or whatnot.
xrep_dinode_buf will rewrite the whole buffer to get it to the point
where it'll pass the buffer verifier.

> ....
> > +
> > +/* Check for invalid uid/gid/prid. */
> > +STATIC void
> > +xrep_inode_ids(
> > +	struct xfs_scrub	*sc)
> > +{
> > +	trace_xrep_inode_ids(sc);
> > +
> > +	if (i_uid_read(VFS_I(sc->ip)) == -1U) {
> > +		i_uid_write(VFS_I(sc->ip), 0);
> > +		VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
> > +		if (XFS_IS_UQUOTA_ON(sc->mp))
> > +			xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
> > +	}
> > +
> > +	if (i_gid_read(VFS_I(sc->ip)) == -1U) {
> > +		i_gid_write(VFS_I(sc->ip), 0);
> > +		VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
> > +		if (XFS_IS_GQUOTA_ON(sc->mp))
> > +			xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
> > +	}
> 
> > If we are repairing an inode that has setuid or setgid, I think we
> > should just strip those permissions regardless of whether the
> > uid/gid are valid. I think it's better to be cautious here rather
> > than leave setuid on a file that we reconstructed but have no real
> > way of knowing that data in the file is untainted.

Ok, changed.

> > +
> > +	if (sc->ip->i_projid == -1U) {
> > +		sc->ip->i_projid = 0;
> > +		if (XFS_IS_PQUOTA_ON(sc->mp))
> > +			xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
> > +	}
> > +}
> > +
> > +static inline void
> > +xrep_clamp_nsec(
> > +	struct timespec64	*ts)
> > +{
> > +	ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC);
> > +}
> > +
> > +/* Nanosecond counters can't have more than 1 billion. */
> > +STATIC void
> > +xrep_inode_timestamps(
> > +	struct xfs_inode	*ip)
> > +{
> > +	xrep_clamp_nsec(&VFS_I(ip)->i_atime);
> > +	xrep_clamp_nsec(&VFS_I(ip)->i_mtime);
> > +	xrep_clamp_nsec(&VFS_I(ip)->i_ctime);
> > +	xrep_clamp_nsec(&ip->i_crtime);
> > +}
> 
> Should we be clamping the entire timestamp within the valid
> filesystem timestamp range here?

Yes.


static inline void
xrep_clamp_timestamp(
	struct xfs_inode	*ip,
	struct timespec64	*ts)
{
	ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC);
	*ts = timestamp_truncate(*ts, VFS_I(ip));
}

> > +
> > +/* Fix inode flags that don't make sense together. */
> > +STATIC void
> > +xrep_inode_flags(
> > +	struct xfs_scrub	*sc)
> > +{
> > +	uint16_t		mode;
> > +
> > +	trace_xrep_inode_flags(sc);
> ....
> > +	/* No mixing reflink and DAX yet. */
> > +	if (sc->ip->i_diflags2 & XFS_DIFLAG2_REFLINK)
> > +		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
> 
> This can go, too...

Fixed.

> .....
> > @@ -750,6 +750,38 @@ xrep_ino_dqattach(
> >  }
> >  #endif /* CONFIG_XFS_QUOTA */
> >  
> > +/*
> > + * Ensure that the inode being repaired is ready to handle a certain number of
> > + * extents, or return EFSCORRUPTED.  Caller must hold the ILOCK of the inode
> > + * being repaired and have joined it to the scrub transaction.
> > + */
> > +int
> > +xrep_ino_ensure_extent_count(
> > +	struct xfs_scrub	*sc,
> > +	int			whichfork,
> > +	xfs_extnum_t		nextents)
> > +{
> > +	xfs_extnum_t		max_extents;
> > +	bool			large_extcount;
> > +
> > +	large_extcount = xfs_inode_has_large_extent_counts(sc->ip);
> > +	max_extents = xfs_iext_max_nextents(large_extcount, whichfork);
> > +	if (nextents <= max_extents)
> > +		return 0;
> > +	if (large_extcount)
> > +		return -EFSCORRUPTED;
> > +	if (!xfs_has_large_extent_counts(sc->mp))
> > +		return -EFSCORRUPTED;
> 
> > This logic took me a bit of peering at to work out. large_extcount says
> > whether the inode has the large extcount flag set, which is
> > different to whether the superblock has the large extcount flag set.
> > 
> > Can you change large_extcount to inode_has_nrext64 or something like
> > that just so it's really clear that there are two different flags
> > being checked here?

Yup, done.

> > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> > index ac8f0200b2963..e239b432d19e8 100644
> > --- a/fs/xfs/scrub/repair.h
> > +++ b/fs/xfs/scrub/repair.h
> > @@ -28,6 +28,16 @@ bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
> >  		enum xfs_ag_resv_type type);
> >  xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc);
> >  
> > +static inline int
> > +xrep_trans_commit(
> > +	struct xfs_scrub	*sc)
> > +{
> > +	int			error = xfs_trans_commit(sc->tp);
> > +
> > +	sc->tp = NULL;
> > +	return error;
> > +}
> 
> That's .... interesting formatting. I'd be happy with using standard
> linux format for this:
> 
> static inline int xrep_trans_commit(struct xfs_scrub *sc)
> {
> 	int error = xfs_trans_commit(sc->tp);
> 
> 	sc->tp = NULL;
> 	return error;
> }
> 
> But that's just personal preference....

Yeah, that's ok with me.

--D

> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
diff mbox series

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index a6f708dc56cc2..0d86d75422f60 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -183,6 +183,7 @@  xfs-y				+= $(addprefix scrub/, \
 				   agheader_repair.o \
 				   alloc_repair.o \
 				   ialloc_repair.o \
+				   inode_repair.o \
 				   newbt.o \
 				   reap.o \
 				   refcount_repair.o \
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 371dc07233e05..5ba2dae7aa2f8 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -996,7 +996,8 @@  enum xfs_dinode_fmt {
 #define XFS_DFORK_APTR(dip)	\
 	(XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
 #define XFS_DFORK_PTR(dip,w)	\
-	((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
+	((void *)((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : \
+					 XFS_DFORK_APTR(dip)))
 
 #define XFS_DFORK_FORMAT(dip,w) \
 	((w) == XFS_DATA_FORK ? \
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 267c169a21ca9..964089e24ca6d 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -9,6 +9,8 @@ 
 #include "xfs_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
 #include "xfs_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_rmap.h"
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 6b6d912c710eb..fcd6e9df618f8 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -24,6 +24,7 @@ 
 #include "scrub/common.h"
 #include "scrub/btree.h"
 #include "scrub/trace.h"
+#include "scrub/repair.h"
 
 /* Prepare the attached inode for scrubbing. */
 static inline int
@@ -184,8 +185,11 @@  xchk_setup_inode(
 	 * saying the inode is allocated and the icache being unable to load
 	 * the inode until we can flag the corruption in xchk_inode.  The
 	 * scrub function has to note the corruption, since we're not really
-	 * supposed to do that from the setup function.
+	 * supposed to do that from the setup function.  Save the mapping to
+	 * make repairs to the ondisk inode buffer.
 	 */
+	if (xchk_could_repair(sc))
+		xrep_setup_inode(sc, &imap);
 	return 0;
 
 out_cancel:
@@ -341,6 +345,10 @@  xchk_inode_flags2(
 	if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp))
 		goto bad;
 
+	/* no large extent counts without the filesystem feature */
+	if ((flags2 & XFS_DIFLAG2_NREXT64) && !xfs_has_large_extent_counts(mp))
+		goto bad;
+
 	return;
 bad:
 	xchk_ino_set_corrupt(sc, ino);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
new file mode 100644
index 0000000000000..952832e9fd029
--- /dev/null
+++ b/fs/xfs/scrub/inode_repair.c
@@ -0,0 +1,763 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_ialloc.h"
+#include "xfs_da_format.h"
+#include "xfs_reflink.h"
+#include "xfs_rmap.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_quota_defs.h"
+#include "xfs_quota.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/*
+ * Inode Repair
+ *
+ * Roughly speaking, inode problems can be classified based on whether or not
+ * they trip the dinode verifiers.  If those trip, then we won't be able to
+ * _iget ourselves the inode.
+ *
+ * Therefore, the xrep_dinode_* functions fix anything that would trip the
+ * inode buffer verifier or the dinode verifier.  The xrep_inode_* functions
+ * fix things on live incore inodes.
+ */
+
+/*
+ * All the information we need to repair the ondisk inode if we can't iget the
+ * incore inode.  We don't allocate this buffer unless we're going to perform
+ * a repair to the ondisk inode cluster buffer.
+ */
+struct xrep_inode {
+	/* Inode mapping that we saved from the initial lookup attempt. */
+	struct xfs_imap		imap;
+
+	struct xfs_scrub	*sc;	/* scrub context, set at setup time */
+};
+
+/* Stash the inode mapping so that repair can find the ondisk inode later. */
+int
+xrep_setup_inode(
+	struct xfs_scrub	*sc,
+	struct xfs_imap		*imap)
+{
+	struct xrep_inode	*ri;
+
+	/*
+	 * Scrub hands repair exactly one thing: where the ondisk inode lives,
+	 * for the case where iget fails.  Everything else in struct
+	 * xrep_inode is working state for coaxing the ondisk inode into a
+	 * shape that iget accepts, so when no mapping is supplied there is
+	 * nothing to allocate.
+	 */
+	if (imap == NULL)
+		return 0;
+
+	ri = kzalloc(sizeof(*ri), XCHK_GFP_FLAGS);
+	sc->buf = ri;
+	if (ri == NULL)
+		return -ENOMEM;
+
+	ri->imap = *imap;
+	ri->sc = sc;
+	return 0;
+}
+
+/* Make sure this inode cluster buffer can pass the inode buffer verifier. */
+STATIC void
+xrep_dinode_buf(
+	struct xfs_scrub	*sc,
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_trans	*tp = sc->tp;
+	struct xfs_perag	*pag;
+	struct xfs_dinode	*dip;
+	xfs_agnumber_t		agno;
+	xfs_agino_t		agino;
+	int			ioff;
+	int			i;
+	int			ni;
+	bool			crc_ok;
+	bool			magic_ok;
+	bool			unlinked_ok;
+
+	ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+	agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
+	pag = xfs_perag_get(mp, agno);
+	for (i = 0; i < ni; i++) {
+		ioff = i << mp->m_sb.sb_inodelog;
+		dip = xfs_buf_offset(bp, ioff);
+		agino = be32_to_cpu(dip->di_next_unlinked);
+
+		unlinked_ok = magic_ok = crc_ok = false;
+
+		if (xfs_verify_agino_or_null(pag, agino))
+			unlinked_ok = true;
+
+		if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+		    xfs_dinode_good_version(mp, dip->di_version))
+			magic_ok = true;
+
+		if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+				XFS_DINODE_CRC_OFF))
+			crc_ok = true;
+		/* All three checks passed: leave this inode record alone. */
+		if (magic_ok && unlinked_ok && crc_ok)
+			continue;
+		/* Overwrite whatever failed, then recompute the CRC and log it. */
+		if (!magic_ok) {
+			dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+			dip->di_version = 3;
+		}
+		if (!unlinked_ok)
+			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
+		xfs_dinode_calc_crc(mp, dip);
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+		xfs_trans_log_buf(tp, bp, ioff, ioff + sizeof(*dip) - 1);
+	}
+	xfs_perag_put(pag);
+}
+
+/* Reinitialize things that never change in an inode. */
+STATIC void
+xrep_dinode_header(
+	struct xfs_scrub	*sc,
+	struct xfs_dinode	*dip)
+{
+	trace_xrep_dinode_header(sc, dip);
+	/* Inode number and generation come from sc->sm; uuid from the sb. */
+	dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+	if (!xfs_dinode_good_version(sc->mp, dip->di_version))
+		dip->di_version = 3;
+	dip->di_ino = cpu_to_be64(sc->sm->sm_ino);
+	uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid);
+	dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
+}
+
+/* Turn di_mode into /something/ recognizable. */
+STATIC void
+xrep_dinode_mode(
+	struct xfs_scrub	*sc,
+	struct xfs_dinode	*dip)
+{
+	uint16_t		mode;
+
+	trace_xrep_dinode_mode(sc, dip);
+
+	mode = be16_to_cpu(dip->di_mode);
+	if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
+		return;
+
+	/* Unknown type: become a regular file, no mode bits, uid/gid 0. */
+	mode = S_IFREG;
+	dip->di_mode = cpu_to_be16(mode);
+	dip->di_uid = 0;
+	dip->di_gid = 0;
+}
+
+/* Fix any conflicting flags that the verifiers complain about. */
+STATIC void
+xrep_dinode_flags(
+	struct xfs_scrub	*sc,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_mount	*mp = sc->mp;
+	uint64_t		flags2;
+	uint16_t		mode;
+	uint16_t		flags;
+
+	trace_xrep_dinode_flags(sc, dip);
+
+	mode = be16_to_cpu(dip->di_mode);
+	flags = be16_to_cpu(dip->di_flags);
+	flags2 = be64_to_cpu(dip->di_flags2);
+	/* Regular files on reflink filesystems get the reflink flag set. */
+	if (xfs_has_reflink(mp) && S_ISREG(mode))
+		flags2 |= XFS_DIFLAG2_REFLINK;
+	else
+		flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
+	if (flags & XFS_DIFLAG_REALTIME)
+		flags2 &= ~XFS_DIFLAG2_REFLINK;
+	if (flags2 & XFS_DIFLAG2_REFLINK)
+		flags2 &= ~XFS_DIFLAG2_DAX;
+	if (!xfs_has_bigtime(mp))
+		flags2 &= ~XFS_DIFLAG2_BIGTIME;
+	if (!xfs_has_large_extent_counts(mp))
+		flags2 &= ~XFS_DIFLAG2_NREXT64;
+	if (flags2 & XFS_DIFLAG2_NREXT64)
+		dip->di_nrext64_pad = 0;
+	else if (dip->di_version >= 3)
+		dip->di_v3_pad = 0;
+	dip->di_flags = cpu_to_be16(flags);
+	dip->di_flags2 = cpu_to_be64(flags2);
+}
+
+/*
+ * Blow out symlink; now it points to the current dir.  We don't have to worry
+ * about incore state because this inode is failing the verifiers.
+ */
+STATIC void
+xrep_dinode_zap_symlink(
+	struct xfs_scrub	*sc,
+	struct xfs_dinode	*dip)
+{
+	char			*p;
+
+	trace_xrep_dinode_zap_symlink(sc, dip);
+	/* The new target is the one-byte string "." in a local data fork. */
+	dip->di_format = XFS_DINODE_FMT_LOCAL;
+	dip->di_size = cpu_to_be64(1);
+	p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+	*p = '.';
+}
+
+/*
+ * Blow out dir, make it point to the root.  In the future repair will
+ * reconstruct this directory for us.  Note that there's no in-core directory
+ * inode because the sf verifier tripped, so we don't have to worry about the
+ * dentry cache.
+ */
+STATIC void
+xrep_dinode_zap_dir(
+	struct xfs_scrub	*sc,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_dir2_sf_hdr	*sfp;
+	int			i8count;
+
+	trace_xrep_dinode_zap_dir(sc, dip);
+	/* Write a shortform header with no entries and sb_rootino as parent. */
+	dip->di_format = XFS_DINODE_FMT_LOCAL;
+	i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM;
+	sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+	sfp->count = 0;
+	sfp->i8count = i8count;
+	xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino);
+	dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count));
+}
+
+/* Make sure we don't have a garbage file size. */
+STATIC void
+xrep_dinode_size(
+	struct xfs_scrub	*sc,
+	struct xfs_dinode	*dip)
+{
+	uint64_t		size;
+	uint16_t		mode;
+
+	trace_xrep_dinode_size(sc, dip);
+	/* The file type bits of di_mode select which size rule applies. */
+	mode = be16_to_cpu(dip->di_mode);
+	size = be64_to_cpu(dip->di_size);
+	switch (mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFSOCK:
+		/* di_size can't be nonzero for special files */
+		dip->di_size = 0;
+		break;
+	case S_IFREG:
+		/* Regular files can't be larger than 2^63-1 bytes. */
+		dip->di_size = cpu_to_be64(size & ~(1ULL << 63));
+		break;
+	case S_IFLNK:
+		/*
+		 * Truncate ridiculously oversized symlinks.  If the size is
+		 * zero, reset it to point to the current directory.  Both of
+		 * these conditions trigger dinode verifier errors, so there
+		 * is no in-core state to reset.
+		 */
+		if (size > XFS_SYMLINK_MAXLEN)
+			dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN);
+		else if (size == 0)
+			xrep_dinode_zap_symlink(sc, dip);
+		break;
+	case S_IFDIR:
+		/*
+		 * Directories can't have a size larger than 32G.  If the size
+		 * is zero, reset it to an empty directory.  Both of these
+		 * conditions trigger dinode verifier errors, so there is no
+		 * in-core state to reset.
+		 */
+		if (size > XFS_DIR2_SPACE_SIZE)
+			dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE);
+		else if (size == 0)
+			xrep_dinode_zap_dir(sc, dip);
+		break;
+	}
+}
+
+/* Fix extent size hints. */
+STATIC void
+xrep_dinode_extsize_hints(
+	struct xfs_scrub	*sc,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_mount	*mp = sc->mp;
+	uint64_t		flags2;
+	uint16_t		flags;
+	uint16_t		mode;
+	xfs_failaddr_t		fa;
+
+	trace_xrep_dinode_extsize_hints(sc, dip);
+
+	mode = be16_to_cpu(dip->di_mode);
+	flags = be16_to_cpu(dip->di_flags);
+	flags2 = be64_to_cpu(dip->di_flags2);
+	/* A hint that fails validation is zeroed and its flags are cleared. */
+	fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
+			mode, flags);
+	if (fa) {
+		dip->di_extsize = 0;
+		dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE |
+					      XFS_DIFLAG_EXTSZINHERIT);
+	}
+	/* The CoW extent size hint is only examined on v3 and later inodes. */
+	if (dip->di_version < 3)
+		return;
+
+	fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
+			mode, flags, flags2);
+	if (fa) {
+		dip->di_cowextsize = 0;
+		dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
+	}
+}
+
+/* Inode didn't pass verifiers, so fix the raw buffer and retry iget. */
+STATIC int
+xrep_dinode_core(
+	struct xrep_inode	*ri)
+{
+	struct xfs_scrub	*sc = ri->sc;
+	struct xfs_buf		*bp;
+	struct xfs_dinode	*dip;
+	xfs_ino_t		ino = sc->sm->sm_ino;
+	int			error;
+
+	/* Read the inode cluster buffer. */
+	error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
+			ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
+			NULL);
+	if (error)
+		return error;
+
+	/* Make sure we can pass the inode buffer verifier. */
+	xrep_dinode_buf(sc, bp);
+	bp->b_ops = &xfs_inode_buf_ops;
+
+	/* Fix everything the verifier will complain about. */
+	dip = xfs_buf_offset(bp, ri->imap.im_boffset);
+	xrep_dinode_header(sc, dip);
+	xrep_dinode_mode(sc, dip);
+	xrep_dinode_flags(sc, dip);
+	xrep_dinode_size(sc, dip);
+	xrep_dinode_extsize_hints(sc, dip);
+
+	/* Write out the inode. */
+	trace_xrep_dinode_fixed(sc, dip);
+	xfs_dinode_calc_crc(sc->mp, dip);
+	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF);
+	xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset,
+			ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1);
+
+	/*
+	 * Now that we've finished rewriting anything in the ondisk metadata
+	 * that would prevent iget from giving us an incore inode, commit the
+	 * inode cluster buffer updates and drop the AGI buffer that we've been
+	 * holding since scrub setup.
+	 */
+	error = xrep_trans_commit(sc);
+	if (error)
+		return error;
+
+	/* Try again to load the inode. */
+	error = xchk_iget(sc, ino, &sc->ip);
+	if (error)
+		return error;
+	/* IOLOCK, a fresh transaction, dquot attach, and finally ILOCK. */
+	xchk_ilock(sc, XFS_IOLOCK_EXCL);
+	error = xchk_trans_alloc(sc, 0);
+	if (error)
+		return error;
+
+	error = xrep_ino_dqattach(sc);
+	if (error)
+		return error;
+
+	xchk_ilock(sc, XFS_ILOCK_EXCL);
+	return 0;
+}
+
+/* Repair the ondisk inode core, then force quotacheck for enabled quotas. */
+STATIC int
+xrep_dinode_problems(
+	struct xrep_inode	*ri)
+{
+	struct xfs_scrub	*sc = ri->sc;
+	int			error;
+
+	error = xrep_dinode_core(ri);
+	if (error)
+		return error;
+
+	/* We had to fix a totally busted inode, schedule quotacheck. */
+	if (XFS_IS_UQUOTA_ON(sc->mp))
+		xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
+	if (XFS_IS_GQUOTA_ON(sc->mp))
+		xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
+	if (XFS_IS_PQUOTA_ON(sc->mp))
+		xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
+
+	return 0;
+}
+
+/*
+ * Fix problems that the verifiers don't care about.  In general these are
+ * errors that don't cause problems elsewhere in the kernel that we can easily
+ * detect, so we don't check them all that rigorously.
+ */
+
+/* Reset block/extent counters from fork mappings; -EFSCORRUPTED if absurd. */
+STATIC int
+xrep_inode_blockcounts(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_ifork	*ifp;
+	xfs_filblks_t		count;
+	xfs_filblks_t		acount;
+	xfs_extnum_t		nextents;
+	int			error;
+
+	trace_xrep_inode_blockcounts(sc);
+
+	/* Set data fork counters from the data fork mappings. */
+	error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
+			&nextents, &count);
+	if (error)
+		return error;
+	if (xfs_has_reflink(sc->mp)) {
+		; /* data fork blockcount can exceed physical storage */
+	} else if (XFS_IS_REALTIME_INODE(sc->ip)) {
+		if (count >= sc->mp->m_sb.sb_rblocks)
+			return -EFSCORRUPTED;
+	} else {
+		if (count >= sc->mp->m_sb.sb_dblocks)
+			return -EFSCORRUPTED;
+	}
+	error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents);
+	if (error)
+		return error;
+	sc->ip->i_df.if_nextents = nextents;
+
+	/* Attr fork: bound its own block count (acount), not the data fork's. */
+	ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+	if (ifp) {
+		error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
+				&nextents, &acount);
+		if (error)
+			return error;
+		if (acount >= sc->mp->m_sb.sb_dblocks)
+			return -EFSCORRUPTED;
+		error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK,
+				nextents);
+		if (error)
+			return error;
+		ifp->if_nextents = nextents;
+	} else {
+		acount = 0;
+	}
+
+	sc->ip->i_nblocks = count + acount;
+	return 0;
+}
+
+/* Reset uid/gid/prid values of -1U to zero and schedule quotacheck. */
+STATIC void
+xrep_inode_ids(
+	struct xfs_scrub	*sc)
+{
+	trace_xrep_inode_ids(sc);
+	/* Resetting the uid or gid to 0 also strips setuid and setgid. */
+	if (i_uid_read(VFS_I(sc->ip)) == -1U) {
+		i_uid_write(VFS_I(sc->ip), 0);
+		VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
+		if (XFS_IS_UQUOTA_ON(sc->mp))
+			xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
+	}
+
+	if (i_gid_read(VFS_I(sc->ip)) == -1U) {
+		i_gid_write(VFS_I(sc->ip), 0);
+		VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
+		if (XFS_IS_GQUOTA_ON(sc->mp))
+			xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
+	}
+
+	if (sc->ip->i_projid == -1U) {
+		sc->ip->i_projid = 0;
+		if (XFS_IS_PQUOTA_ON(sc->mp))
+			xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
+	}
+}
+
+static inline void
+xrep_clamp_nsec(struct timespec64 *ts)
+{
+	/* Valid tv_nsec values are 0 .. NSEC_PER_SEC - 1 inclusive. */
+	ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC - 1);
+}
+
+/* Clamp the nanosecond field of atime, mtime, ctime, and crtime. */
+STATIC void
+xrep_inode_timestamps(
+	struct xfs_inode	*ip)
+{
+	xrep_clamp_nsec(&VFS_I(ip)->i_atime);
+	xrep_clamp_nsec(&VFS_I(ip)->i_mtime);
+	xrep_clamp_nsec(&VFS_I(ip)->i_ctime);
+	xrep_clamp_nsec(&ip->i_crtime);
+}
+
+/* Fix inode flags that don't make sense together. */
+STATIC void
+xrep_inode_flags(
+	struct xfs_scrub	*sc)
+{
+	uint16_t		mode;
+
+	trace_xrep_inode_flags(sc);
+
+	mode = VFS_I(sc->ip)->i_mode;
+
+	/* Drop junk bits: keep only the flags that XFS_DIFLAG_ANY covers. */
+	if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY)
+		sc->ip->i_diflags &= XFS_DIFLAG_ANY;
+
+	/* NEWRTBM only applies to realtime bitmaps */
+	if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino)
+		sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM;
+	else
+		sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM;
+
+	/* These only make sense for directories. */
+	if (!S_ISDIR(mode))
+		sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT |
+					  XFS_DIFLAG_EXTSZINHERIT |
+					  XFS_DIFLAG_PROJINHERIT |
+					  XFS_DIFLAG_NOSYMLINKS);
+
+	/* These only make sense for files. */
+	if (!S_ISREG(mode))
+		sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME |
+					  XFS_DIFLAG_EXTSIZE);
+
+	/* These only make sense for non-rt files. */
+	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
+		sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM;
+
+	/* Immutable and append only?  Drop the append. */
+	if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) &&
+	    (sc->ip->i_diflags & XFS_DIFLAG_APPEND))
+		sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND;
+
+	/* Drop junk bits: keep only the flags that XFS_DIFLAG2_ANY covers. */
+	if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY)
+		sc->ip->i_diflags2 &= XFS_DIFLAG2_ANY;
+
+	/* No reflink flag unless we support it and it's a file. */
+	if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode))
+		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+
+	/* DAX only applies to files and dirs. */
+	if (!(S_ISREG(mode) || S_ISDIR(mode)))
+		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
+
+	/* No reflink files on the realtime device. */
+	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
+		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+
+	/* No mixing reflink and DAX yet. */
+	if (sc->ip->i_diflags2 & XFS_DIFLAG2_REFLINK)
+		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
+}
+
+/*
+ * Fix size problems with block/node format directories.  If we fail to find
+ * the extent list, just bail out and let the bmapbtd repair functions clean
+ * up that mess.
+ */
+STATIC void
+xrep_inode_blockdir_size(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_iext_cursor	icur;
+	struct xfs_bmbt_irec	got;
+	struct xfs_ifork	*ifp;
+	xfs_fileoff_t		off;
+	int			error;
+
+	trace_xrep_inode_blockdir_size(sc);
+
+	/* Find the last block before 32G; this is the dir size. */
+	error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK);
+	if (error)
+		return;
+
+	ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+	off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE);
+	if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) {
+		/* zero-extents directory? */
+		return;
+	}
+	/* Size is the end of that mapping, capped at XFS_DIR2_SPACE_SIZE. */
+	off = got.br_startoff + got.br_blockcount;
+	sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE,
+			XFS_FSB_TO_B(sc->mp, off));
+}
+
+/* Set a shortform directory's size to its data fork's if_bytes. */
+STATIC void
+xrep_inode_sfdir_size(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_ifork	*ifp;
+
+	trace_xrep_inode_sfdir_size(sc);
+
+	ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+	sc->ip->i_disk_size = ifp->if_bytes;
+}
+
+/*
+ * Fix any irregularities in an inode's size now that we can iterate extent
+ * maps and access other regular inode data.
+ */
+STATIC void
+xrep_inode_size(
+	struct xfs_scrub	*sc)
+{
+	trace_xrep_inode_size(sc);
+
+	/*
+	 * We only fix directory sizes here (local, extents, or btree format).
+	 * Files can be any size and sizes for the other inode special types
+	 * are fixed by xrep_dinode_size.
+	 */
+	if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
+		return;
+	switch (sc->ip->i_df.if_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
+		xrep_inode_blockdir_size(sc);
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+		xrep_inode_sfdir_size(sc);
+		break;
+	}
+}
+
+/* Fix extent size hint problems. */
+STATIC void
+xrep_inode_extsize(
+	struct xfs_scrub	*sc)
+{
+	/* With RTINHERIT, an inherited hint must be a sb_rextsize multiple. */
+	if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+	    (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+	    sc->ip->i_extsize % sc->mp->m_sb.sb_rextsize > 0) {
+		sc->ip->i_extsize = 0;
+		sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT;
+	}
+}
+
+/* Fix any irregularities in an inode that the verifiers don't catch. */
+STATIC int
+xrep_inode_problems(
+	struct xfs_scrub	*sc)
+{
+	int			error;
+
+	error = xrep_inode_blockcounts(sc);
+	if (error)
+		return error;
+	xrep_inode_timestamps(sc->ip);
+	xrep_inode_flags(sc);
+	xrep_inode_ids(sc);
+	xrep_inode_size(sc);
+	xrep_inode_extsize(sc);
+	/* Log the updated inode core and roll the transaction. */
+	trace_xrep_inode_fixed(sc);
+	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+	return xrep_roll_trans(sc);
+}
+
+/* Repair an inode record: ondisk first if iget failed, then incore. */
+int
+xrep_inode(
+	struct xfs_scrub	*sc)
+{
+	int			error = 0;
+
+	/*
+	 * No inode?  That means we failed the _iget verifiers.  Repair all
+	 * the things that the inode verifiers care about, then retry _iget.
+	 */
+	if (!sc->ip) {
+		struct xrep_inode	*ri = sc->buf;
+
+		ASSERT(ri != NULL);
+
+		error = xrep_dinode_problems(ri);
+		if (error)
+			return error;
+
+		/* By this point we had better have a working incore inode. */
+		if (!sc->ip)
+			return -EFSCORRUPTED;
+	}
+	/* Attach the inode to the scrub transaction before changing it. */
+	xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+	/* If we found corruption of any kind, try to fix it. */
+	if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) ||
+	    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) {
+		error = xrep_inode_problems(sc);
+		if (error)
+			return error;
+	}
+
+	/* See if we can clear the reflink flag. */
+	if (xfs_is_reflink_inode(sc->ip))
+		return xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
+
+	return 0;
+}
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index a7fd91e774fe0..e2b90cabed6dd 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -750,6 +750,38 @@  xrep_ino_dqattach(
 }
 #endif /* CONFIG_XFS_QUOTA */
 
+/*
+ * Ensure that the inode being repaired is ready to handle a certain number of
+ * extents, or return EFSCORRUPTED.  Caller must hold the ILOCK of the inode
+ * being repaired and have joined it to the scrub transaction.
+ */
+int
+xrep_ino_ensure_extent_count(
+	struct xfs_scrub	*sc,
+	int			whichfork,
+	xfs_extnum_t		nextents)
+{
+	xfs_extnum_t		max_extents;
+	bool			large_extcount;
+
+	large_extcount = xfs_inode_has_large_extent_counts(sc->ip);
+	max_extents = xfs_iext_max_nextents(large_extcount, whichfork);
+	if (nextents <= max_extents)
+		return 0;
+	if (large_extcount)
+		return -EFSCORRUPTED;
+	if (!xfs_has_large_extent_counts(sc->mp))
+		return -EFSCORRUPTED;
+	/* See whether switching this inode to large counts would be enough. */
+	max_extents = xfs_iext_max_nextents(true, whichfork);
+	if (nextents > max_extents)
+		return -EFSCORRUPTED;
+
+	sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+	return 0;
+}
+
 /* Initialize all the btree cursors for an AG repair. */
 void
 xrep_ag_btcur_init(
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index ac8f0200b2963..e239b432d19e8 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -28,6 +28,16 @@  bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
 		enum xfs_ag_resv_type type);
 xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc);
 
+static inline int
+xrep_trans_commit(
+	struct xfs_scrub	*sc)
+{
+	int			error = xfs_trans_commit(sc->tp);
+	/* Clear sc->tp whether or not the commit succeeded. */
+	sc->tp = NULL;
+	return error;
+}
+
 struct xbitmap;
 struct xagb_bitmap;
 
@@ -59,11 +69,16 @@  int xrep_ino_dqattach(struct xfs_scrub *sc);
 # define xrep_ino_dqattach(sc)			(0)
 #endif /* CONFIG_XFS_QUOTA */
 
+int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork,
+		xfs_extnum_t nextents);
 int xrep_reset_perag_resv(struct xfs_scrub *sc);
 
 /* Repair setup functions */
 int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
 
+struct xfs_imap;
+int xrep_setup_inode(struct xfs_scrub *sc, struct xfs_imap *imap);
+
 void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
 
 /* Metadata revalidators */
@@ -81,6 +96,7 @@  int xrep_agi(struct xfs_scrub *sc);
 int xrep_allocbt(struct xfs_scrub *sc);
 int xrep_iallocbt(struct xfs_scrub *sc);
 int xrep_refcountbt(struct xfs_scrub *sc);
+int xrep_inode(struct xfs_scrub *sc);
 
 int xrep_reinit_pagf(struct xfs_scrub *sc);
 int xrep_reinit_pagi(struct xfs_scrub *sc);
@@ -126,6 +142,8 @@  xrep_setup_nothing(
 }
 #define xrep_setup_ag_allocbt		xrep_setup_nothing
 
+#define xrep_setup_inode(sc, imap)	((void)0)
+
 #define xrep_revalidate_allocbt		(NULL)
 #define xrep_revalidate_iallocbt	(NULL)
 
@@ -137,6 +155,7 @@  xrep_setup_nothing(
 #define xrep_allocbt			xrep_notsupported
 #define xrep_iallocbt			xrep_notsupported
 #define xrep_refcountbt			xrep_notsupported
+#define xrep_inode			xrep_notsupported
 
 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 71aee7e3dd43a..e502174202fba 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -282,7 +282,7 @@  static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.type	= ST_INODE,
 		.setup	= xchk_setup_inode,
 		.scrub	= xchk_inode,
-		.repair	= xrep_notsupported,
+		.repair	= xrep_inode,
 	},
 	[XFS_SCRUB_TYPE_BMBTD] = {	/* inode data fork */
 		.type	= ST_INODE,
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 358c7ddbf14e2..fa14adb88c63b 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -1366,6 +1366,135 @@  DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks);
 DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks);
 DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block);
 
+/* Trace selected ondisk inode fields, converted to CPU byte order. */
+DECLARE_EVENT_CLASS(xrep_dinode_class,
+	TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip),
+	TP_ARGS(sc, dip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(uint16_t, mode)
+		__field(uint8_t, version)
+		__field(uint8_t, format)
+		__field(uint32_t, uid)
+		__field(uint32_t, gid)
+		__field(uint64_t, size)
+		__field(uint64_t, nblocks)
+		__field(uint32_t, extsize)
+		__field(uint32_t, nextents)
+		__field(uint16_t, anextents)
+		__field(uint8_t, forkoff)
+		__field(uint8_t, aformat)
+		__field(uint16_t, flags)
+		__field(uint32_t, gen)
+		__field(uint64_t, flags2)
+		__field(uint32_t, cowextsize)
+	),
+	TP_fast_assign(
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->ino = sc->sm->sm_ino;
+		__entry->mode = be16_to_cpu(dip->di_mode);
+		__entry->version = dip->di_version;
+		__entry->format = dip->di_format;
+		__entry->uid = be32_to_cpu(dip->di_uid);
+		__entry->gid = be32_to_cpu(dip->di_gid);
+		__entry->size = be64_to_cpu(dip->di_size);
+		__entry->nblocks = be64_to_cpu(dip->di_nblocks);
+		__entry->extsize = be32_to_cpu(dip->di_extsize);
+		__entry->nextents = be32_to_cpu(dip->di_nextents);
+		__entry->anextents = be16_to_cpu(dip->di_anextents);
+		__entry->forkoff = dip->di_forkoff;
+		__entry->aformat = dip->di_aformat;
+		__entry->flags = be16_to_cpu(dip->di_flags);
+		__entry->gen = be32_to_cpu(dip->di_gen);
+		__entry->flags2 = be64_to_cpu(dip->di_flags2);
+		__entry->cowextsize = be32_to_cpu(dip->di_cowextsize);
+	),
+	TP_printk("dev %d:%d ino 0x%llx mode 0x%x version %u format %u uid %u gid %u disize 0x%llx nblocks 0x%llx extsize %u nextents %u anextents %u forkoff 0x%x aformat %u flags 0x%x gen 0x%x flags2 0x%llx cowextsize %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino, __entry->mode,
+		  __entry->version,
+		  __entry->format,
+		  __entry->uid,
+		  __entry->gid,
+		  __entry->size,
+		  __entry->nblocks,
+		  __entry->extsize,
+		  __entry->nextents,
+		  __entry->anextents,
+		  __entry->forkoff,
+		  __entry->aformat,
+		  __entry->flags,
+		  __entry->gen,
+		  __entry->flags2,
+		  __entry->cowextsize)
+)
+
+#define DEFINE_REPAIR_DINODE_EVENT(name) \
+DEFINE_EVENT(xrep_dinode_class, name, \
+	TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), \
+	TP_ARGS(sc, dip))
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_header);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_mode);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_flags);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_size);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_extsize_hints);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_symlink);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dir);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_fixed);
+
+/* Trace selected fields of the incore inode attached to the scrub context. */
+DECLARE_EVENT_CLASS(xrep_inode_class,
+	TP_PROTO(struct xfs_scrub *sc),
+	TP_ARGS(sc),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsize_t, size)
+		__field(xfs_rfsblock_t, nblocks)
+		__field(uint16_t, flags)
+		__field(uint64_t, flags2)
+		__field(uint32_t, nextents)
+		__field(uint8_t, format)
+		__field(uint32_t, anextents)
+		__field(uint8_t, aformat)
+	),
+	TP_fast_assign(
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->ino = sc->sm->sm_ino;
+		__entry->size = sc->ip->i_disk_size;
+		__entry->nblocks = sc->ip->i_nblocks;
+		__entry->flags = sc->ip->i_diflags;
+		__entry->flags2 = sc->ip->i_diflags2;
+		__entry->nextents = sc->ip->i_df.if_nextents;
+		__entry->format = sc->ip->i_df.if_format;
+		__entry->anextents = sc->ip->i_af.if_nextents;
+		__entry->aformat = sc->ip->i_af.if_format;
+	),
+	TP_printk("dev %d:%d ino 0x%llx disize 0x%llx nblocks 0x%llx flags 0x%x flags2 0x%llx nextents %u format %u anextents %u aformat %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino, __entry->size,
+		  __entry->nblocks,
+		  __entry->flags,
+		  __entry->flags2,
+		  __entry->nextents,
+		  __entry->format,
+		  __entry->anextents,
+		  __entry->aformat)
+)
+
+#define DEFINE_REPAIR_INODE_EVENT(name) \
+DEFINE_EVENT(xrep_inode_class, name, \
+	TP_PROTO(struct xfs_scrub *sc), \
+	TP_ARGS(sc))
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockcounts);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_ids);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_flags);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockdir_size);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_sfdir_size);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_size);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_fixed);
+
 #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
 
 #endif /* _TRACE_XFS_SCRUB_TRACE_H */