diff mbox series

[1/8] xfs: track metadata health status

Message ID 155494713235.1090518.11696420703305243139.stgit@magnolia (mailing list archive)
State Superseded
Headers show
Series xfs: online health tracking support | expand

Commit Message

Darrick J. Wong April 11, 2019, 1:45 a.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Add the necessary in-core metadata fields to keep track of which parts
of the filesystem have been observed and which parts were observed to be
unhealthy, and print a warning at unmount time if we have unfixed
problems.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/Makefile            |    1 
 fs/xfs/libxfs/xfs_health.h |  175 ++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_health.c        |  192 ++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_icache.c        |    8 ++
 fs/xfs/xfs_inode.h         |    8 ++
 fs/xfs/xfs_mount.c         |    1 
 fs/xfs/xfs_mount.h         |   23 +++++
 fs/xfs/xfs_trace.h         |   73 +++++++++++++++++
 8 files changed, 481 insertions(+)
 create mode 100644 fs/xfs/libxfs/xfs_health.h
 create mode 100644 fs/xfs/xfs_health.c

Comments

Brian Foster April 11, 2019, 12:29 p.m. UTC | #1
On Wed, Apr 10, 2019 at 06:45:32PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Add the necessary in-core metadata fields to keep track of which parts
> of the filesystem have been observed and which parts were observed to be
> unhealthy, and print a warning at unmount time if we have unfixed
> problems.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/Makefile            |    1 
>  fs/xfs/libxfs/xfs_health.h |  175 ++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_health.c        |  192 ++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_icache.c        |    8 ++
>  fs/xfs/xfs_inode.h         |    8 ++
>  fs/xfs/xfs_mount.c         |    1 
>  fs/xfs/xfs_mount.h         |   23 +++++
>  fs/xfs/xfs_trace.h         |   73 +++++++++++++++++
>  8 files changed, 481 insertions(+)
>  create mode 100644 fs/xfs/libxfs/xfs_health.h
>  create mode 100644 fs/xfs/xfs_health.c
> 
> 
...
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index e70e7db29026..885decab4735 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -73,6 +73,8 @@ xfs_inode_alloc(
>  	INIT_WORK(&ip->i_iodone_work, xfs_end_io);
>  	INIT_LIST_HEAD(&ip->i_iodone_list);
>  	spin_lock_init(&ip->i_iodone_lock);
> +	ip->i_sick = 0;
> +	ip->i_checked = 0;
>  
>  	return ip;
>  }
> @@ -133,6 +135,8 @@ xfs_inode_free(
>  	spin_lock(&ip->i_flags_lock);
>  	ip->i_flags = XFS_IRECLAIM;
>  	ip->i_ino = 0;
> +	ip->i_sick = 0;
> +	ip->i_checked = 0;
>  	spin_unlock(&ip->i_flags_lock);
>  

FWIW, I'm not totally clear on what the i_checked mask is for yet. That
aside, is it necessary to reset these fields in the free/reclaim paths?
I wonder if it's sufficient to zero them on alloc and the cache hit path
just below..?

Otherwise looks fine:

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  	__xfs_inode_free(ip);
> @@ -449,6 +453,8 @@ xfs_iget_cache_hit(
>  		ip->i_flags |= XFS_INEW;
>  		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
>  		inode->i_state = I_NEW;
> +		ip->i_sick = 0;
> +		ip->i_checked = 0;
>  
>  		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
>  		init_rwsem(&inode->i_rwsem);
> @@ -1177,6 +1183,8 @@ xfs_reclaim_inode(
>  	spin_lock(&ip->i_flags_lock);
>  	ip->i_flags = XFS_IRECLAIM;
>  	ip->i_ino = 0;
> +	ip->i_sick = 0;
> +	ip->i_checked = 0;
>  	spin_unlock(&ip->i_flags_lock);
>  
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 88239c2dd824..494e47ef42cb 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -45,6 +45,14 @@ typedef struct xfs_inode {
>  	mrlock_t		i_lock;		/* inode lock */
>  	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
>  	atomic_t		i_pincount;	/* inode pin count */
> +
> +	/*
> +	 * Bitsets of inode metadata that have been checked and/or are sick.
> +	 * Callers must hold i_flags_lock before accessing this field.
> +	 */
> +	uint16_t		i_checked;
> +	uint16_t		i_sick;
> +
>  	spinlock_t		i_flags_lock;	/* inode i_flags lock */
>  	/* Miscellaneous state. */
>  	unsigned long		i_flags;	/* see defined flags below */
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index fd63b0b1307c..6581381c12be 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -231,6 +231,7 @@ xfs_initialize_perag(
>  		error = xfs_iunlink_init(pag);
>  		if (error)
>  			goto out_hash_destroy;
> +		spin_lock_init(&pag->pag_state_lock);
>  	}
>  
>  	index = xfs_set_inode_alloc(mp, agcount);
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 110f927cf943..cf7facc36a5f 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -60,6 +60,20 @@ struct xfs_error_cfg {
>  typedef struct xfs_mount {
>  	struct super_block	*m_super;
>  	xfs_tid_t		m_tid;		/* next unused tid for fs */
> +
> +	/*
> +	 * Bitsets of per-fs metadata that have been checked and/or are sick.
> +	 * Callers must hold m_sb_lock to access these two fields.
> +	 */
> +	uint8_t			m_fs_checked;
> +	uint8_t			m_fs_sick;
> +	/*
> +	 * Bitsets of rt metadata that have been checked and/or are sick.
> +	 * Callers must hold m_sb_lock to access this field.
> +	 */
> +	uint8_t			m_rt_checked;
> +	uint8_t			m_rt_sick;
> +
>  	struct xfs_ail		*m_ail;		/* fs active log item list */
>  
>  	struct xfs_sb		m_sb;		/* copy of fs superblock */
> @@ -369,6 +383,15 @@ typedef struct xfs_perag {
>  	xfs_agino_t	pagl_pagino;
>  	xfs_agino_t	pagl_leftrec;
>  	xfs_agino_t	pagl_rightrec;
> +
> +	/*
> +	 * Bitsets of per-ag metadata that have been checked and/or are sick.
> +	 * Callers should hold pag_state_lock before accessing this field.
> +	 */
> +	uint16_t	pag_checked;
> +	uint16_t	pag_sick;
> +	spinlock_t	pag_state_lock;
> +
>  	spinlock_t	pagb_lock;	/* lock for pagb_tree */
>  	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
>  	unsigned int	pagb_gen;	/* generation count for pagb_tree */
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 47fb07d86efd..f079841c7af6 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3440,6 +3440,79 @@ DEFINE_AGINODE_EVENT(xfs_iunlink);
>  DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
>  DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
>  
> +DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
> +	TP_PROTO(struct xfs_mount *mp, unsigned int flags),
> +	TP_ARGS(mp, flags),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(unsigned int, flags)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = mp->m_super->s_dev;
> +		__entry->flags = flags;
> +	),
> +	TP_printk("dev %d:%d flags 0x%x",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->flags)
> +);
> +#define DEFINE_FS_CORRUPT_EVENT(name)	\
> +DEFINE_EVENT(xfs_fs_corrupt_class, name,	\
> +	TP_PROTO(struct xfs_mount *mp, unsigned int flags), \
> +	TP_ARGS(mp, flags))
> +DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
> +DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
> +DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
> +DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
> +
> +DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,
> +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags),
> +	TP_ARGS(mp, agno, flags),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_agnumber_t, agno)
> +		__field(unsigned int, flags)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = mp->m_super->s_dev;
> +		__entry->agno = agno;
> +		__entry->flags = flags;
> +	),
> +	TP_printk("dev %d:%d agno %u flags 0x%x",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->agno, __entry->flags)
> +);
> +#define DEFINE_AG_CORRUPT_EVENT(name)	\
> +DEFINE_EVENT(xfs_ag_corrupt_class, name,	\
> +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
> +		 unsigned int flags), \
> +	TP_ARGS(mp, agno, flags))
> +DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
> +DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
> +
> +DECLARE_EVENT_CLASS(xfs_inode_corrupt_class,
> +	TP_PROTO(struct xfs_inode *ip, unsigned int flags),
> +	TP_ARGS(ip, flags),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_ino_t, ino)
> +		__field(unsigned int, flags)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = ip->i_mount->m_super->s_dev;
> +		__entry->ino = ip->i_ino;
> +		__entry->flags = flags;
> +	),
> +	TP_printk("dev %d:%d ino 0x%llx flags 0x%x",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->ino, __entry->flags)
> +);
> +#define DEFINE_INODE_CORRUPT_EVENT(name)	\
> +DEFINE_EVENT(xfs_inode_corrupt_class, name,	\
> +	TP_PROTO(struct xfs_inode *ip, unsigned int flags), \
> +	TP_ARGS(ip, flags))
> +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
> +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
> +
>  #endif /* _TRACE_XFS_H */
>  
>  #undef TRACE_INCLUDE_PATH
>
Darrick J. Wong April 11, 2019, 3:18 p.m. UTC | #2
On Thu, Apr 11, 2019 at 08:29:04AM -0400, Brian Foster wrote:
> On Wed, Apr 10, 2019 at 06:45:32PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Add the necessary in-core metadata fields to keep track of which parts
> > of the filesystem have been observed and which parts were observed to be
> > unhealthy, and print a warning at unmount time if we have unfixed
> > problems.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/Makefile            |    1 
> >  fs/xfs/libxfs/xfs_health.h |  175 ++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/xfs_health.c        |  192 ++++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/xfs_icache.c        |    8 ++
> >  fs/xfs/xfs_inode.h         |    8 ++
> >  fs/xfs/xfs_mount.c         |    1 
> >  fs/xfs/xfs_mount.h         |   23 +++++
> >  fs/xfs/xfs_trace.h         |   73 +++++++++++++++++
> >  8 files changed, 481 insertions(+)
> >  create mode 100644 fs/xfs/libxfs/xfs_health.h
> >  create mode 100644 fs/xfs/xfs_health.c
> > 
> > 
> ...
> > diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> > index e70e7db29026..885decab4735 100644
> > --- a/fs/xfs/xfs_icache.c
> > +++ b/fs/xfs/xfs_icache.c
> > @@ -73,6 +73,8 @@ xfs_inode_alloc(
> >  	INIT_WORK(&ip->i_iodone_work, xfs_end_io);
> >  	INIT_LIST_HEAD(&ip->i_iodone_list);
> >  	spin_lock_init(&ip->i_iodone_lock);
> > +	ip->i_sick = 0;
> > +	ip->i_checked = 0;
> >  
> >  	return ip;
> >  }
> > @@ -133,6 +135,8 @@ xfs_inode_free(
> >  	spin_lock(&ip->i_flags_lock);
> >  	ip->i_flags = XFS_IRECLAIM;
> >  	ip->i_ino = 0;
> > +	ip->i_sick = 0;
> > +	ip->i_checked = 0;
> >  	spin_unlock(&ip->i_flags_lock);
> >  
> 
> FWIW, I'm not totally clear on what the i_checked mask is for yet.

Bleh, I forgot to update the introductory comment. :(

/*
 * <introductory stuff that's in xfs_health.h now>
 *
 * Each health tracking group uses a pair of fields for reporting.  The
 * "checked" field tell us if a given piece of metadata has ever been examined,
 * and the "sick" field tells us if that piece was found to need repairs.
 * Therefore we can conclude that for a given mask:
 *
 *  - checked && sick  => metadata needs repair
 *  - checked && !sick => metadata is ok
 *  - !checked         => has not been examined since mount
 */

In any case, I worked out the need for this new checked field when I was
writing the manual pages describing how all this worked:

https://djwong.org/docs/man/ioctl_xfs_fsop_geometry.2.html
https://djwong.org/docs/man/ioctl_xfs_ag_geometry.2.html
https://djwong.org/docs/man/ioctl_xfs_fsbulkstat.2.html

(See the part "The fields sick and checked indicate...")

@checked is a mask of all the metadata types that scrub has looked at,
whether or not the metadata was any good.  @sick is the mask of all the
metadata that scrub thought was bad, so we now can report to userspace
if something's good, bad, or unchecked.

> That aside, is it necessary to reset these fields in the free/reclaim
> paths?  I wonder if it's sufficient to zero them on alloc and the
> cache hit path just below..?

I think it's not strictly needed, but once we've broken the association
between a (struct xfs_inode *) buffer and a particular inode number, we
ought to zero out the health data just in case that buffer resurfaces
during the rcu grace period.

--D

> Otherwise looks fine:
> 
> Reviewed-by: Brian Foster <bfoster@redhat.com>
> 
> >  	__xfs_inode_free(ip);
> > @@ -449,6 +453,8 @@ xfs_iget_cache_hit(
> >  		ip->i_flags |= XFS_INEW;
> >  		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
> >  		inode->i_state = I_NEW;
> > +		ip->i_sick = 0;
> > +		ip->i_checked = 0;
> >  
> >  		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
> >  		init_rwsem(&inode->i_rwsem);
> > @@ -1177,6 +1183,8 @@ xfs_reclaim_inode(
> >  	spin_lock(&ip->i_flags_lock);
> >  	ip->i_flags = XFS_IRECLAIM;
> >  	ip->i_ino = 0;
> > +	ip->i_sick = 0;
> > +	ip->i_checked = 0;
> >  	spin_unlock(&ip->i_flags_lock);
> >  
> >  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > index 88239c2dd824..494e47ef42cb 100644
> > --- a/fs/xfs/xfs_inode.h
> > +++ b/fs/xfs/xfs_inode.h
> > @@ -45,6 +45,14 @@ typedef struct xfs_inode {
> >  	mrlock_t		i_lock;		/* inode lock */
> >  	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
> >  	atomic_t		i_pincount;	/* inode pin count */
> > +
> > +	/*
> > +	 * Bitsets of inode metadata that have been checked and/or are sick.
> > +	 * Callers must hold i_flags_lock before accessing this field.
> > +	 */
> > +	uint16_t		i_checked;
> > +	uint16_t		i_sick;
> > +
> >  	spinlock_t		i_flags_lock;	/* inode i_flags lock */
> >  	/* Miscellaneous state. */
> >  	unsigned long		i_flags;	/* see defined flags below */
> > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > index fd63b0b1307c..6581381c12be 100644
> > --- a/fs/xfs/xfs_mount.c
> > +++ b/fs/xfs/xfs_mount.c
> > @@ -231,6 +231,7 @@ xfs_initialize_perag(
> >  		error = xfs_iunlink_init(pag);
> >  		if (error)
> >  			goto out_hash_destroy;
> > +		spin_lock_init(&pag->pag_state_lock);
> >  	}
> >  
> >  	index = xfs_set_inode_alloc(mp, agcount);
> > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > index 110f927cf943..cf7facc36a5f 100644
> > --- a/fs/xfs/xfs_mount.h
> > +++ b/fs/xfs/xfs_mount.h
> > @@ -60,6 +60,20 @@ struct xfs_error_cfg {
> >  typedef struct xfs_mount {
> >  	struct super_block	*m_super;
> >  	xfs_tid_t		m_tid;		/* next unused tid for fs */
> > +
> > +	/*
> > +	 * Bitsets of per-fs metadata that have been checked and/or are sick.
> > +	 * Callers must hold m_sb_lock to access these two fields.
> > +	 */
> > +	uint8_t			m_fs_checked;
> > +	uint8_t			m_fs_sick;
> > +	/*
> > +	 * Bitsets of rt metadata that have been checked and/or are sick.
> > +	 * Callers must hold m_sb_lock to access this field.
> > +	 */
> > +	uint8_t			m_rt_checked;
> > +	uint8_t			m_rt_sick;
> > +
> >  	struct xfs_ail		*m_ail;		/* fs active log item list */
> >  
> >  	struct xfs_sb		m_sb;		/* copy of fs superblock */
> > @@ -369,6 +383,15 @@ typedef struct xfs_perag {
> >  	xfs_agino_t	pagl_pagino;
> >  	xfs_agino_t	pagl_leftrec;
> >  	xfs_agino_t	pagl_rightrec;
> > +
> > +	/*
> > +	 * Bitsets of per-ag metadata that have been checked and/or are sick.
> > +	 * Callers should hold pag_state_lock before accessing this field.
> > +	 */
> > +	uint16_t	pag_checked;
> > +	uint16_t	pag_sick;
> > +	spinlock_t	pag_state_lock;
> > +
> >  	spinlock_t	pagb_lock;	/* lock for pagb_tree */
> >  	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
> >  	unsigned int	pagb_gen;	/* generation count for pagb_tree */
> > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > index 47fb07d86efd..f079841c7af6 100644
> > --- a/fs/xfs/xfs_trace.h
> > +++ b/fs/xfs/xfs_trace.h
> > @@ -3440,6 +3440,79 @@ DEFINE_AGINODE_EVENT(xfs_iunlink);
> >  DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
> >  DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
> >  
> > +DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
> > +	TP_PROTO(struct xfs_mount *mp, unsigned int flags),
> > +	TP_ARGS(mp, flags),
> > +	TP_STRUCT__entry(
> > +		__field(dev_t, dev)
> > +		__field(unsigned int, flags)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->dev = mp->m_super->s_dev;
> > +		__entry->flags = flags;
> > +	),
> > +	TP_printk("dev %d:%d flags 0x%x",
> > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > +		  __entry->flags)
> > +);
> > +#define DEFINE_FS_CORRUPT_EVENT(name)	\
> > +DEFINE_EVENT(xfs_fs_corrupt_class, name,	\
> > +	TP_PROTO(struct xfs_mount *mp, unsigned int flags), \
> > +	TP_ARGS(mp, flags))
> > +DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
> > +DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
> > +DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
> > +DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
> > +
> > +DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,
> > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags),
> > +	TP_ARGS(mp, agno, flags),
> > +	TP_STRUCT__entry(
> > +		__field(dev_t, dev)
> > +		__field(xfs_agnumber_t, agno)
> > +		__field(unsigned int, flags)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->dev = mp->m_super->s_dev;
> > +		__entry->agno = agno;
> > +		__entry->flags = flags;
> > +	),
> > +	TP_printk("dev %d:%d agno %u flags 0x%x",
> > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > +		  __entry->agno, __entry->flags)
> > +);
> > +#define DEFINE_AG_CORRUPT_EVENT(name)	\
> > +DEFINE_EVENT(xfs_ag_corrupt_class, name,	\
> > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
> > +		 unsigned int flags), \
> > +	TP_ARGS(mp, agno, flags))
> > +DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
> > +DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
> > +
> > +DECLARE_EVENT_CLASS(xfs_inode_corrupt_class,
> > +	TP_PROTO(struct xfs_inode *ip, unsigned int flags),
> > +	TP_ARGS(ip, flags),
> > +	TP_STRUCT__entry(
> > +		__field(dev_t, dev)
> > +		__field(xfs_ino_t, ino)
> > +		__field(unsigned int, flags)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->dev = ip->i_mount->m_super->s_dev;
> > +		__entry->ino = ip->i_ino;
> > +		__entry->flags = flags;
> > +	),
> > +	TP_printk("dev %d:%d ino 0x%llx flags 0x%x",
> > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > +		  __entry->ino, __entry->flags)
> > +);
> > +#define DEFINE_INODE_CORRUPT_EVENT(name)	\
> > +DEFINE_EVENT(xfs_inode_corrupt_class, name,	\
> > +	TP_PROTO(struct xfs_inode *ip, unsigned int flags), \
> > +	TP_ARGS(ip, flags))
> > +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
> > +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
> > +
> >  #endif /* _TRACE_XFS_H */
> >  
> >  #undef TRACE_INCLUDE_PATH
> >
Brian Foster April 11, 2019, 4:05 p.m. UTC | #3
On Thu, Apr 11, 2019 at 08:18:45AM -0700, Darrick J. Wong wrote:
> On Thu, Apr 11, 2019 at 08:29:04AM -0400, Brian Foster wrote:
> > On Wed, Apr 10, 2019 at 06:45:32PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Add the necessary in-core metadata fields to keep track of which parts
> > > of the filesystem have been observed and which parts were observed to be
> > > unhealthy, and print a warning at unmount time if we have unfixed
> > > problems.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > >  fs/xfs/Makefile            |    1 
> > >  fs/xfs/libxfs/xfs_health.h |  175 ++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/xfs_health.c        |  192 ++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/xfs_icache.c        |    8 ++
> > >  fs/xfs/xfs_inode.h         |    8 ++
> > >  fs/xfs/xfs_mount.c         |    1 
> > >  fs/xfs/xfs_mount.h         |   23 +++++
> > >  fs/xfs/xfs_trace.h         |   73 +++++++++++++++++
> > >  8 files changed, 481 insertions(+)
> > >  create mode 100644 fs/xfs/libxfs/xfs_health.h
> > >  create mode 100644 fs/xfs/xfs_health.c
> > > 
> > > 
> > ...
> > > diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> > > index e70e7db29026..885decab4735 100644
> > > --- a/fs/xfs/xfs_icache.c
> > > +++ b/fs/xfs/xfs_icache.c
> > > @@ -73,6 +73,8 @@ xfs_inode_alloc(
> > >  	INIT_WORK(&ip->i_iodone_work, xfs_end_io);
> > >  	INIT_LIST_HEAD(&ip->i_iodone_list);
> > >  	spin_lock_init(&ip->i_iodone_lock);
> > > +	ip->i_sick = 0;
> > > +	ip->i_checked = 0;
> > >  
> > >  	return ip;
> > >  }
> > > @@ -133,6 +135,8 @@ xfs_inode_free(
> > >  	spin_lock(&ip->i_flags_lock);
> > >  	ip->i_flags = XFS_IRECLAIM;
> > >  	ip->i_ino = 0;
> > > +	ip->i_sick = 0;
> > > +	ip->i_checked = 0;
> > >  	spin_unlock(&ip->i_flags_lock);
> > >  
> > 
> > FWIW, I'm not totally clear on what the i_checked mask is for yet.
> 
> Bleh, I forgot to update the introductory comment. :(
> 
> /*
>  * <introductory stuff that's in xfs_health.h now>
>  *
>  * Each health tracking group uses a pair of fields for reporting.  The
>  * "checked" field tell us if a given piece of metadata has ever been examined,
>  * and the "sick" field tells us if that piece was found to need repairs.
>  * Therefore we can conclude that for a given mask:
>  *
>  *  - checked && sick  => metadata needs repair
>  *  - checked && !sick => metadata is ok
>  *  - !checked         => has not been examined since mount
>  */
> 
> In any case, I worked out the need for this new checked field when I was
> writing the manual pages describing how all this worked:
> 
> https://djwong.org/docs/man/ioctl_xfs_fsop_geometry.2.html
> https://djwong.org/docs/man/ioctl_xfs_ag_geometry.2.html
> https://djwong.org/docs/man/ioctl_xfs_fsbulkstat.2.html
> 
> (See the part "The fields sick and checked indicate...")
> 
> @checked is a mask of all the metadata types that scrub has looked at,
> whether or not the metadata was any good.  @sick is the mask of all the
> metadata that scrub thought was bad, so we now can report to userspace
> if something's good, bad, or unchecked.
> 

Ok, thanks.

> > That aside, is it necessary to reset these fields in the free/reclaim
> > paths?  I wonder if it's sufficient to zero them on alloc and the
> > cache hit path just below..?
> 
> I think it's not strictly needed, but once we've broken the association
> between a (struct xfs_inode *) buffer and a particular inode number, we
> ought to zero out the health data just in case that buffer resurfaces
> during the rcu grace period.
> 

I thought freeing the inode was imminent at that point. We set
XFS_IRECLAIM then call into the RCU mechanism to free the memory. If
lookup finds the inode, we retry on XFS_IRECLAIM or attempt to reuse on
XFS_IRECLAIMABLE (which is covered by the fields being reset in
iget_cache_hit()).

Brian

> --D
> 
> > Otherwise looks fine:
> > 
> > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > 
> > >  	__xfs_inode_free(ip);
> > > @@ -449,6 +453,8 @@ xfs_iget_cache_hit(
> > >  		ip->i_flags |= XFS_INEW;
> > >  		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
> > >  		inode->i_state = I_NEW;
> > > +		ip->i_sick = 0;
> > > +		ip->i_checked = 0;
> > >  
> > >  		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
> > >  		init_rwsem(&inode->i_rwsem);
> > > @@ -1177,6 +1183,8 @@ xfs_reclaim_inode(
> > >  	spin_lock(&ip->i_flags_lock);
> > >  	ip->i_flags = XFS_IRECLAIM;
> > >  	ip->i_ino = 0;
> > > +	ip->i_sick = 0;
> > > +	ip->i_checked = 0;
> > >  	spin_unlock(&ip->i_flags_lock);
> > >  
> > >  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > > index 88239c2dd824..494e47ef42cb 100644
> > > --- a/fs/xfs/xfs_inode.h
> > > +++ b/fs/xfs/xfs_inode.h
> > > @@ -45,6 +45,14 @@ typedef struct xfs_inode {
> > >  	mrlock_t		i_lock;		/* inode lock */
> > >  	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
> > >  	atomic_t		i_pincount;	/* inode pin count */
> > > +
> > > +	/*
> > > +	 * Bitsets of inode metadata that have been checked and/or are sick.
> > > +	 * Callers must hold i_flags_lock before accessing this field.
> > > +	 */
> > > +	uint16_t		i_checked;
> > > +	uint16_t		i_sick;
> > > +
> > >  	spinlock_t		i_flags_lock;	/* inode i_flags lock */
> > >  	/* Miscellaneous state. */
> > >  	unsigned long		i_flags;	/* see defined flags below */
> > > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > > index fd63b0b1307c..6581381c12be 100644
> > > --- a/fs/xfs/xfs_mount.c
> > > +++ b/fs/xfs/xfs_mount.c
> > > @@ -231,6 +231,7 @@ xfs_initialize_perag(
> > >  		error = xfs_iunlink_init(pag);
> > >  		if (error)
> > >  			goto out_hash_destroy;
> > > +		spin_lock_init(&pag->pag_state_lock);
> > >  	}
> > >  
> > >  	index = xfs_set_inode_alloc(mp, agcount);
> > > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > > index 110f927cf943..cf7facc36a5f 100644
> > > --- a/fs/xfs/xfs_mount.h
> > > +++ b/fs/xfs/xfs_mount.h
> > > @@ -60,6 +60,20 @@ struct xfs_error_cfg {
> > >  typedef struct xfs_mount {
> > >  	struct super_block	*m_super;
> > >  	xfs_tid_t		m_tid;		/* next unused tid for fs */
> > > +
> > > +	/*
> > > +	 * Bitsets of per-fs metadata that have been checked and/or are sick.
> > > +	 * Callers must hold m_sb_lock to access these two fields.
> > > +	 */
> > > +	uint8_t			m_fs_checked;
> > > +	uint8_t			m_fs_sick;
> > > +	/*
> > > +	 * Bitsets of rt metadata that have been checked and/or are sick.
> > > +	 * Callers must hold m_sb_lock to access this field.
> > > +	 */
> > > +	uint8_t			m_rt_checked;
> > > +	uint8_t			m_rt_sick;
> > > +
> > >  	struct xfs_ail		*m_ail;		/* fs active log item list */
> > >  
> > >  	struct xfs_sb		m_sb;		/* copy of fs superblock */
> > > @@ -369,6 +383,15 @@ typedef struct xfs_perag {
> > >  	xfs_agino_t	pagl_pagino;
> > >  	xfs_agino_t	pagl_leftrec;
> > >  	xfs_agino_t	pagl_rightrec;
> > > +
> > > +	/*
> > > +	 * Bitsets of per-ag metadata that have been checked and/or are sick.
> > > +	 * Callers should hold pag_state_lock before accessing this field.
> > > +	 */
> > > +	uint16_t	pag_checked;
> > > +	uint16_t	pag_sick;
> > > +	spinlock_t	pag_state_lock;
> > > +
> > >  	spinlock_t	pagb_lock;	/* lock for pagb_tree */
> > >  	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
> > >  	unsigned int	pagb_gen;	/* generation count for pagb_tree */
> > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > > index 47fb07d86efd..f079841c7af6 100644
> > > --- a/fs/xfs/xfs_trace.h
> > > +++ b/fs/xfs/xfs_trace.h
> > > @@ -3440,6 +3440,79 @@ DEFINE_AGINODE_EVENT(xfs_iunlink);
> > >  DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
> > >  DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
> > >  
> > > +DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
> > > +	TP_PROTO(struct xfs_mount *mp, unsigned int flags),
> > > +	TP_ARGS(mp, flags),
> > > +	TP_STRUCT__entry(
> > > +		__field(dev_t, dev)
> > > +		__field(unsigned int, flags)
> > > +	),
> > > +	TP_fast_assign(
> > > +		__entry->dev = mp->m_super->s_dev;
> > > +		__entry->flags = flags;
> > > +	),
> > > +	TP_printk("dev %d:%d flags 0x%x",
> > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > +		  __entry->flags)
> > > +);
> > > +#define DEFINE_FS_CORRUPT_EVENT(name)	\
> > > +DEFINE_EVENT(xfs_fs_corrupt_class, name,	\
> > > +	TP_PROTO(struct xfs_mount *mp, unsigned int flags), \
> > > +	TP_ARGS(mp, flags))
> > > +DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
> > > +DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
> > > +DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
> > > +DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
> > > +
> > > +DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,
> > > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags),
> > > +	TP_ARGS(mp, agno, flags),
> > > +	TP_STRUCT__entry(
> > > +		__field(dev_t, dev)
> > > +		__field(xfs_agnumber_t, agno)
> > > +		__field(unsigned int, flags)
> > > +	),
> > > +	TP_fast_assign(
> > > +		__entry->dev = mp->m_super->s_dev;
> > > +		__entry->agno = agno;
> > > +		__entry->flags = flags;
> > > +	),
> > > +	TP_printk("dev %d:%d agno %u flags 0x%x",
> > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > +		  __entry->agno, __entry->flags)
> > > +);
> > > +#define DEFINE_AG_CORRUPT_EVENT(name)	\
> > > +DEFINE_EVENT(xfs_ag_corrupt_class, name,	\
> > > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
> > > +		 unsigned int flags), \
> > > +	TP_ARGS(mp, agno, flags))
> > > +DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
> > > +DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
> > > +
> > > +DECLARE_EVENT_CLASS(xfs_inode_corrupt_class,
> > > +	TP_PROTO(struct xfs_inode *ip, unsigned int flags),
> > > +	TP_ARGS(ip, flags),
> > > +	TP_STRUCT__entry(
> > > +		__field(dev_t, dev)
> > > +		__field(xfs_ino_t, ino)
> > > +		__field(unsigned int, flags)
> > > +	),
> > > +	TP_fast_assign(
> > > +		__entry->dev = ip->i_mount->m_super->s_dev;
> > > +		__entry->ino = ip->i_ino;
> > > +		__entry->flags = flags;
> > > +	),
> > > +	TP_printk("dev %d:%d ino 0x%llx flags 0x%x",
> > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > +		  __entry->ino, __entry->flags)
> > > +);
> > > +#define DEFINE_INODE_CORRUPT_EVENT(name)	\
> > > +DEFINE_EVENT(xfs_inode_corrupt_class, name,	\
> > > +	TP_PROTO(struct xfs_inode *ip, unsigned int flags), \
> > > +	TP_ARGS(ip, flags))
> > > +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
> > > +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
> > > +
> > >  #endif /* _TRACE_XFS_H */
> > >  
> > >  #undef TRACE_INCLUDE_PATH
> > >
Darrick J. Wong April 11, 2019, 6:31 p.m. UTC | #4
On Thu, Apr 11, 2019 at 12:05:30PM -0400, Brian Foster wrote:
> On Thu, Apr 11, 2019 at 08:18:45AM -0700, Darrick J. Wong wrote:
> > On Thu, Apr 11, 2019 at 08:29:04AM -0400, Brian Foster wrote:
> > > On Wed, Apr 10, 2019 at 06:45:32PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Add the necessary in-core metadata fields to keep track of which parts
> > > > of the filesystem have been observed and which parts were observed to be
> > > > unhealthy, and print a warning at unmount time if we have unfixed
> > > > problems.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > >  fs/xfs/Makefile            |    1 
> > > >  fs/xfs/libxfs/xfs_health.h |  175 ++++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/xfs_health.c        |  192 ++++++++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/xfs_icache.c        |    8 ++
> > > >  fs/xfs/xfs_inode.h         |    8 ++
> > > >  fs/xfs/xfs_mount.c         |    1 
> > > >  fs/xfs/xfs_mount.h         |   23 +++++
> > > >  fs/xfs/xfs_trace.h         |   73 +++++++++++++++++
> > > >  8 files changed, 481 insertions(+)
> > > >  create mode 100644 fs/xfs/libxfs/xfs_health.h
> > > >  create mode 100644 fs/xfs/xfs_health.c
> > > > 
> > > > 
> > > ...
> > > > diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> > > > index e70e7db29026..885decab4735 100644
> > > > --- a/fs/xfs/xfs_icache.c
> > > > +++ b/fs/xfs/xfs_icache.c
> > > > @@ -73,6 +73,8 @@ xfs_inode_alloc(
> > > >  	INIT_WORK(&ip->i_iodone_work, xfs_end_io);
> > > >  	INIT_LIST_HEAD(&ip->i_iodone_list);
> > > >  	spin_lock_init(&ip->i_iodone_lock);
> > > > +	ip->i_sick = 0;
> > > > +	ip->i_checked = 0;
> > > >  
> > > >  	return ip;
> > > >  }
> > > > @@ -133,6 +135,8 @@ xfs_inode_free(
> > > >  	spin_lock(&ip->i_flags_lock);
> > > >  	ip->i_flags = XFS_IRECLAIM;
> > > >  	ip->i_ino = 0;
> > > > +	ip->i_sick = 0;
> > > > +	ip->i_checked = 0;
> > > >  	spin_unlock(&ip->i_flags_lock);
> > > >  
> > > 
> > > FWIW, I'm not totally clear on what the i_checked mask is for yet.
> > 
> > Bleh, I forgot to update the introductory comment. :(
> > 
> > /*
> >  * <introductory stuff that's in xfs_health.h now>
> >  *
> >  * Each health tracking group uses a pair of fields for reporting.  The
> >  * "checked" field tell us if a given piece of metadata has ever been examined,
> >  * and the "sick" field tells us if that piece was found to need repairs.
> >  * Therefore we can conclude that for a given mask:
> >  *
> >  *  - checked && sick  => metadata needs repair
> >  *  - checked && !sick => metadata is ok
> >  *  - !checked         => has not been examined since mount
> >  */
> > 
> > In any case, I worked out the need for this new checked field when I was
> > writing the manual pages describing how all this worked:
> > 
> > https://djwong.org/docs/man/ioctl_xfs_fsop_geometry.2.html
> > https://djwong.org/docs/man/ioctl_xfs_ag_geometry.2.html
> > https://djwong.org/docs/man/ioctl_xfs_fsbulkstat.2.html
> > 
> > (See the part "The fields sick and checked indicate...")
> > 
> > @checked is a mask of all the metadata types that scrub has looked at,
> > whether or not the metadata was any good.  @sick is the mask of all the
> > metadata that scrub thought was bad, so we now can report to userspace
> > if something's good, bad, or unchecked.
> > 
> 
> Ok, thanks.
> 
> > > That aside, is it necessary to reset these fields in the free/reclaim
> > > paths?  I wonder if it's sufficient to zero them on alloc and the
> > > cache hit path just below..?
> > 
> > I think it's not strictly needed, but once we've broken the association
> > between a (struct xfs_inode *) buffer and a particular inode number, we
> > ought to zero out the health data just in case that buffer resurfaces
> > during the rcu grace period.
> > 
> 
> I thought freeing the inode was imminent at that point. We set
> XFS_IRECLAIM then call into the RCU mechanism to free the memory. If
> lookup finds the inode, we retry on XFS_IRECLAIM or attempt to reuse on
> XFS_IRECLAIMABLE (which is covered by the fields being reset in
> iget_cache_hit()).

<nod> The i_ino change effectively prevents anyone else from seeing
stale sick/checked contents, so I might as well drop this for v3.

--D

> Brian
> 
> > --D
> > 
> > > Otherwise looks fine:
> > > 
> > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > 
> > > >  	__xfs_inode_free(ip);
> > > > @@ -449,6 +453,8 @@ xfs_iget_cache_hit(
> > > >  		ip->i_flags |= XFS_INEW;
> > > >  		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
> > > >  		inode->i_state = I_NEW;
> > > > +		ip->i_sick = 0;
> > > > +		ip->i_checked = 0;
> > > >  
> > > >  		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
> > > >  		init_rwsem(&inode->i_rwsem);
> > > > @@ -1177,6 +1183,8 @@ xfs_reclaim_inode(
> > > >  	spin_lock(&ip->i_flags_lock);
> > > >  	ip->i_flags = XFS_IRECLAIM;
> > > >  	ip->i_ino = 0;
> > > > +	ip->i_sick = 0;
> > > > +	ip->i_checked = 0;
> > > >  	spin_unlock(&ip->i_flags_lock);
> > > >  
> > > >  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > > > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > > > index 88239c2dd824..494e47ef42cb 100644
> > > > --- a/fs/xfs/xfs_inode.h
> > > > +++ b/fs/xfs/xfs_inode.h
> > > > @@ -45,6 +45,14 @@ typedef struct xfs_inode {
> > > >  	mrlock_t		i_lock;		/* inode lock */
> > > >  	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
> > > >  	atomic_t		i_pincount;	/* inode pin count */
> > > > +
> > > > +	/*
> > > > +	 * Bitsets of inode metadata that have been checked and/or are sick.
> > > > +	 * Callers must hold i_flags_lock before accessing this field.
> > > > +	 */
> > > > +	uint16_t		i_checked;
> > > > +	uint16_t		i_sick;
> > > > +
> > > >  	spinlock_t		i_flags_lock;	/* inode i_flags lock */
> > > >  	/* Miscellaneous state. */
> > > >  	unsigned long		i_flags;	/* see defined flags below */
> > > > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > > > index fd63b0b1307c..6581381c12be 100644
> > > > --- a/fs/xfs/xfs_mount.c
> > > > +++ b/fs/xfs/xfs_mount.c
> > > > @@ -231,6 +231,7 @@ xfs_initialize_perag(
> > > >  		error = xfs_iunlink_init(pag);
> > > >  		if (error)
> > > >  			goto out_hash_destroy;
> > > > +		spin_lock_init(&pag->pag_state_lock);
> > > >  	}
> > > >  
> > > >  	index = xfs_set_inode_alloc(mp, agcount);
> > > > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > > > index 110f927cf943..cf7facc36a5f 100644
> > > > --- a/fs/xfs/xfs_mount.h
> > > > +++ b/fs/xfs/xfs_mount.h
> > > > @@ -60,6 +60,20 @@ struct xfs_error_cfg {
> > > >  typedef struct xfs_mount {
> > > >  	struct super_block	*m_super;
> > > >  	xfs_tid_t		m_tid;		/* next unused tid for fs */
> > > > +
> > > > +	/*
> > > > +	 * Bitsets of per-fs metadata that have been checked and/or are sick.
> > > > +	 * Callers must hold m_sb_lock to access these two fields.
> > > > +	 */
> > > > +	uint8_t			m_fs_checked;
> > > > +	uint8_t			m_fs_sick;
> > > > +	/*
> > > > +	 * Bitsets of rt metadata that have been checked and/or are sick.
> > > > +	 * Callers must hold m_sb_lock to access this field.
> > > > +	 */
> > > > +	uint8_t			m_rt_checked;
> > > > +	uint8_t			m_rt_sick;
> > > > +
> > > >  	struct xfs_ail		*m_ail;		/* fs active log item list */
> > > >  
> > > >  	struct xfs_sb		m_sb;		/* copy of fs superblock */
> > > > @@ -369,6 +383,15 @@ typedef struct xfs_perag {
> > > >  	xfs_agino_t	pagl_pagino;
> > > >  	xfs_agino_t	pagl_leftrec;
> > > >  	xfs_agino_t	pagl_rightrec;
> > > > +
> > > > +	/*
> > > > +	 * Bitsets of per-ag metadata that have been checked and/or are sick.
> > > > +	 * Callers should hold pag_state_lock before accessing this field.
> > > > +	 */
> > > > +	uint16_t	pag_checked;
> > > > +	uint16_t	pag_sick;
> > > > +	spinlock_t	pag_state_lock;
> > > > +
> > > >  	spinlock_t	pagb_lock;	/* lock for pagb_tree */
> > > >  	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
> > > >  	unsigned int	pagb_gen;	/* generation count for pagb_tree */
> > > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > > > index 47fb07d86efd..f079841c7af6 100644
> > > > --- a/fs/xfs/xfs_trace.h
> > > > +++ b/fs/xfs/xfs_trace.h
> > > > @@ -3440,6 +3440,79 @@ DEFINE_AGINODE_EVENT(xfs_iunlink);
> > > >  DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
> > > >  DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
> > > >  
> > > > +DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
> > > > +	TP_PROTO(struct xfs_mount *mp, unsigned int flags),
> > > > +	TP_ARGS(mp, flags),
> > > > +	TP_STRUCT__entry(
> > > > +		__field(dev_t, dev)
> > > > +		__field(unsigned int, flags)
> > > > +	),
> > > > +	TP_fast_assign(
> > > > +		__entry->dev = mp->m_super->s_dev;
> > > > +		__entry->flags = flags;
> > > > +	),
> > > > +	TP_printk("dev %d:%d flags 0x%x",
> > > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > > +		  __entry->flags)
> > > > +);
> > > > +#define DEFINE_FS_CORRUPT_EVENT(name)	\
> > > > +DEFINE_EVENT(xfs_fs_corrupt_class, name,	\
> > > > +	TP_PROTO(struct xfs_mount *mp, unsigned int flags), \
> > > > +	TP_ARGS(mp, flags))
> > > > +DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
> > > > +DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
> > > > +DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
> > > > +DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
> > > > +
> > > > +DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,
> > > > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags),
> > > > +	TP_ARGS(mp, agno, flags),
> > > > +	TP_STRUCT__entry(
> > > > +		__field(dev_t, dev)
> > > > +		__field(xfs_agnumber_t, agno)
> > > > +		__field(unsigned int, flags)
> > > > +	),
> > > > +	TP_fast_assign(
> > > > +		__entry->dev = mp->m_super->s_dev;
> > > > +		__entry->agno = agno;
> > > > +		__entry->flags = flags;
> > > > +	),
> > > > +	TP_printk("dev %d:%d agno %u flags 0x%x",
> > > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > > +		  __entry->agno, __entry->flags)
> > > > +);
> > > > +#define DEFINE_AG_CORRUPT_EVENT(name)	\
> > > > +DEFINE_EVENT(xfs_ag_corrupt_class, name,	\
> > > > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
> > > > +		 unsigned int flags), \
> > > > +	TP_ARGS(mp, agno, flags))
> > > > +DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
> > > > +DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
> > > > +
> > > > +DECLARE_EVENT_CLASS(xfs_inode_corrupt_class,
> > > > +	TP_PROTO(struct xfs_inode *ip, unsigned int flags),
> > > > +	TP_ARGS(ip, flags),
> > > > +	TP_STRUCT__entry(
> > > > +		__field(dev_t, dev)
> > > > +		__field(xfs_ino_t, ino)
> > > > +		__field(unsigned int, flags)
> > > > +	),
> > > > +	TP_fast_assign(
> > > > +		__entry->dev = ip->i_mount->m_super->s_dev;
> > > > +		__entry->ino = ip->i_ino;
> > > > +		__entry->flags = flags;
> > > > +	),
> > > > +	TP_printk("dev %d:%d ino 0x%llx flags 0x%x",
> > > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > > +		  __entry->ino, __entry->flags)
> > > > +);
> > > > +#define DEFINE_INODE_CORRUPT_EVENT(name)	\
> > > > +DEFINE_EVENT(xfs_inode_corrupt_class, name,	\
> > > > +	TP_PROTO(struct xfs_inode *ip, unsigned int flags), \
> > > > +	TP_ARGS(ip, flags))
> > > > +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
> > > > +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
> > > > +
> > > >  #endif /* _TRACE_XFS_H */
> > > >  
> > > >  #undef TRACE_INCLUDE_PATH
> > > >
diff mbox series

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7f96bdadc372..786379c143f4 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -73,6 +73,7 @@  xfs-y				+= xfs_aops.o \
 				   xfs_fsmap.o \
 				   xfs_fsops.o \
 				   xfs_globals.o \
+				   xfs_health.o \
 				   xfs_icache.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
new file mode 100644
index 000000000000..30762a5d4862
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -0,0 +1,175 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_HEALTH_H__
+#define __XFS_HEALTH_H__
+
+/*
+ * In-Core Filesystem Health Assessments
+ * =====================================
+ *
+ * We'd like to be able to summarize the current health status of the
+ * filesystem so that the administrator knows when it's necessary to schedule
+ * some downtime for repairs.  Until then, we would also like to avoid abrupt
+ * shutdowns due to corrupt metadata.
+ *
+ * The online scrub feature evaluates the health of all filesystem metadata.
+ * When scrub detects corruption in a piece of metadata it will set the
+ * corresponding sickness flag, and repair will clear it if successful.
+ *
+ * If problems remain at unmount time, we can also request manual intervention
+ * by logging a notice to run xfs_repair.
+ */
+
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_inode;
+
+/* Observable health issues for metadata spanning the entire filesystem. */
+#define XFS_SICK_FS_COUNTERS	(1 << 0)  /* summary counters */
+#define XFS_SICK_FS_UQUOTA	(1 << 1)  /* user quota */
+#define XFS_SICK_FS_GQUOTA	(1 << 2)  /* group quota */
+#define XFS_SICK_FS_PQUOTA	(1 << 3)  /* project quota */
+
+/* Observable health issues for realtime volume metadata. */
+#define XFS_SICK_RT_BITMAP	(1 << 0)  /* realtime bitmap */
+#define XFS_SICK_RT_SUMMARY	(1 << 1)  /* realtime summary */
+
+/* Observable health issues for AG metadata. */
+#define XFS_SICK_AG_SB		(1 << 0)  /* superblock */
+#define XFS_SICK_AG_AGF		(1 << 1)  /* AGF header */
+#define XFS_SICK_AG_AGFL	(1 << 2)  /* AGFL header */
+#define XFS_SICK_AG_AGI		(1 << 3)  /* AGI header */
+#define XFS_SICK_AG_BNOBT	(1 << 4)  /* free space by block */
+#define XFS_SICK_AG_CNTBT	(1 << 5)  /* free space by length */
+#define XFS_SICK_AG_INOBT	(1 << 6)  /* inode index */
+#define XFS_SICK_AG_FINOBT	(1 << 7)  /* free inode index */
+#define XFS_SICK_AG_RMAPBT	(1 << 8)  /* reverse mappings */
+#define XFS_SICK_AG_REFCNTBT	(1 << 9)  /* reference counts */
+
+/* Observable health issues for inode metadata. */
+#define XFS_SICK_INO_CORE	(1 << 0)  /* inode core */
+#define XFS_SICK_INO_BMBTD	(1 << 1)  /* data fork */
+#define XFS_SICK_INO_BMBTA	(1 << 2)  /* attr fork */
+#define XFS_SICK_INO_BMBTC	(1 << 3)  /* cow fork */
+#define XFS_SICK_INO_DIR	(1 << 4)  /* directory */
+#define XFS_SICK_INO_XATTR	(1 << 5)  /* extended attributes */
+#define XFS_SICK_INO_SYMLINK	(1 << 6)  /* symbolic link remote target */
+#define XFS_SICK_INO_PARENT	(1 << 7)  /* parent pointers */
+
+/* Primary evidence of health problems in a given group. */
+#define XFS_SICK_FS_PRIMARY	(XFS_SICK_FS_COUNTERS | \
+				 XFS_SICK_FS_UQUOTA | \
+				 XFS_SICK_FS_GQUOTA | \
+				 XFS_SICK_FS_PQUOTA)
+
+#define XFS_SICK_RT_PRIMARY	(XFS_SICK_RT_BITMAP | \
+				 XFS_SICK_RT_SUMMARY)
+
+#define XFS_SICK_AG_PRIMARY	(XFS_SICK_AG_SB | \
+				 XFS_SICK_AG_AGF | \
+				 XFS_SICK_AG_AGFL | \
+				 XFS_SICK_AG_AGI | \
+				 XFS_SICK_AG_BNOBT | \
+				 XFS_SICK_AG_CNTBT | \
+				 XFS_SICK_AG_INOBT | \
+				 XFS_SICK_AG_FINOBT | \
+				 XFS_SICK_AG_RMAPBT | \
+				 XFS_SICK_AG_REFCNTBT)
+
+#define XFS_SICK_INO_PRIMARY	(XFS_SICK_INO_CORE | \
+				 XFS_SICK_INO_BMBTD | \
+				 XFS_SICK_INO_BMBTA | \
+				 XFS_SICK_INO_BMBTC | \
+				 XFS_SICK_INO_DIR | \
+				 XFS_SICK_INO_XATTR | \
+				 XFS_SICK_INO_SYMLINK | \
+				 XFS_SICK_INO_PARENT)
+
+/* These functions must be provided by the xfs implementation. */
+
+void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
+		unsigned int *checked);
+
+void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
+		unsigned int *checked);
+
+void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick,
+		unsigned int *checked);
+
+void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick,
+		unsigned int *checked);
+
+/* Now some helpers. */
+
+static inline bool
+xfs_fs_has_sickness(struct xfs_mount *mp, unsigned int mask)
+{
+	unsigned int	sick, checked;
+
+	xfs_fs_measure_sickness(mp, &sick, &checked);
+	return sick & mask;
+}
+
+static inline bool
+xfs_rt_has_sickness(struct xfs_mount *mp, unsigned int mask)
+{
+	unsigned int	sick, checked;
+
+	xfs_rt_measure_sickness(mp, &sick, &checked);
+	return sick & mask;
+}
+
+static inline bool
+xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask)
+{
+	unsigned int	sick, checked;
+
+	xfs_ag_measure_sickness(pag, &sick, &checked);
+	return sick & mask;
+}
+
+static inline bool
+xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask)
+{
+	unsigned int	sick, checked;
+
+	xfs_inode_measure_sickness(ip, &sick, &checked);
+	return sick & mask;
+}
+
+static inline bool
+xfs_fs_is_healthy(struct xfs_mount *mp)
+{
+	return !xfs_fs_has_sickness(mp, -1U);
+}
+
+static inline bool
+xfs_rt_is_healthy(struct xfs_mount *mp)
+{
+	return !xfs_rt_has_sickness(mp, -1U);
+}
+
+static inline bool
+xfs_ag_is_healthy(struct xfs_perag *pag)
+{
+	return !xfs_ag_has_sickness(pag, -1U);
+}
+
+static inline bool
+xfs_inode_is_healthy(struct xfs_inode *ip)
+{
+	return !xfs_inode_has_sickness(ip, -1U);
+}
+
+#endif	/* __XFS_HEALTH_H__ */
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
new file mode 100644
index 000000000000..941f33037e2f
--- /dev/null
+++ b/fs/xfs/xfs_health.c
@@ -0,0 +1,192 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+#include "xfs_health.h"
+
+/* Mark unhealthy per-fs metadata. */
+void
+xfs_fs_mark_sick(
+	struct xfs_mount	*mp,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+	trace_xfs_fs_mark_sick(mp, mask);
+
+	spin_lock(&mp->m_sb_lock);
+	mp->m_fs_sick |= mask;
+	mp->m_fs_checked |= mask;
+	spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark a per-fs metadata healed. */
+void
+xfs_fs_mark_healthy(
+	struct xfs_mount	*mp,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+	trace_xfs_fs_mark_healthy(mp, mask);
+
+	spin_lock(&mp->m_sb_lock);
+	mp->m_fs_sick &= ~mask;
+	mp->m_fs_checked |= mask;
+	spin_unlock(&mp->m_sb_lock);
+}
+
+/* Sample which per-fs metadata are unhealthy. */
+void
+xfs_fs_measure_sickness(
+	struct xfs_mount	*mp,
+	unsigned int		*sick,
+	unsigned int		*checked)
+{
+	spin_lock(&mp->m_sb_lock);
+	*sick = mp->m_fs_sick;
+	*checked = mp->m_fs_checked;
+	spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark unhealthy realtime metadata. */
+void
+xfs_rt_mark_sick(
+	struct xfs_mount	*mp,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+	trace_xfs_rt_mark_sick(mp, mask);
+
+	spin_lock(&mp->m_sb_lock);
+	mp->m_rt_sick |= mask;
+	mp->m_rt_checked |= mask;
+	spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark a realtime metadata healed. */
+void
+xfs_rt_mark_healthy(
+	struct xfs_mount	*mp,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+	trace_xfs_rt_mark_healthy(mp, mask);
+
+	spin_lock(&mp->m_sb_lock);
+	mp->m_rt_sick &= ~mask;
+	mp->m_rt_checked |= mask;
+	spin_unlock(&mp->m_sb_lock);
+}
+
+/* Sample which realtime metadata are unhealthy. */
+void
+xfs_rt_measure_sickness(
+	struct xfs_mount	*mp,
+	unsigned int		*sick,
+	unsigned int		*checked)
+{
+	spin_lock(&mp->m_sb_lock);
+	*sick = mp->m_rt_sick;
+	*checked = mp->m_rt_checked;
+	spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark unhealthy per-ag metadata. */
+void
+xfs_ag_mark_sick(
+	struct xfs_perag	*pag,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+	trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask);
+
+	spin_lock(&pag->pag_state_lock);
+	pag->pag_sick |= mask;
+	pag->pag_checked |= mask;
+	spin_unlock(&pag->pag_state_lock);
+}
+
+/* Mark per-ag metadata ok. */
+void
+xfs_ag_mark_healthy(
+	struct xfs_perag	*pag,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+	trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask);
+
+	spin_lock(&pag->pag_state_lock);
+	pag->pag_sick &= ~mask;
+	pag->pag_checked |= mask;
+	spin_unlock(&pag->pag_state_lock);
+}
+
+/* Sample which per-ag metadata are unhealthy. */
+void
+xfs_ag_measure_sickness(
+	struct xfs_perag	*pag,
+	unsigned int		*sick,
+	unsigned int		*checked)
+{
+	spin_lock(&pag->pag_state_lock);
+	*sick = pag->pag_sick;
+	*checked = pag->pag_checked;
+	spin_unlock(&pag->pag_state_lock);
+}
+
+/* Mark the unhealthy parts of an inode. */
+void
+xfs_inode_mark_sick(
+	struct xfs_inode	*ip,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+	trace_xfs_inode_mark_sick(ip, mask);
+
+	spin_lock(&ip->i_flags_lock);
+	ip->i_sick |= mask;
+	ip->i_checked |= mask;
+	spin_unlock(&ip->i_flags_lock);
+}
+
+/* Mark parts of an inode healed. */
+void
+xfs_inode_mark_healthy(
+	struct xfs_inode	*ip,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+	trace_xfs_inode_mark_healthy(ip, mask);
+
+	spin_lock(&ip->i_flags_lock);
+	ip->i_sick &= ~mask;
+	ip->i_checked |= mask;
+	spin_unlock(&ip->i_flags_lock);
+}
+
+/* Sample which parts of an inode are unhealthy. */
+void
+xfs_inode_measure_sickness(
+	struct xfs_inode	*ip,
+	unsigned int		*sick,
+	unsigned int		*checked)
+{
+	spin_lock(&ip->i_flags_lock);
+	*sick = ip->i_sick;
+	*checked = ip->i_checked;
+	spin_unlock(&ip->i_flags_lock);
+}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index e70e7db29026..885decab4735 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -73,6 +73,8 @@  xfs_inode_alloc(
 	INIT_WORK(&ip->i_iodone_work, xfs_end_io);
 	INIT_LIST_HEAD(&ip->i_iodone_list);
 	spin_lock_init(&ip->i_iodone_lock);
+	ip->i_sick = 0;
+	ip->i_checked = 0;
 
 	return ip;
 }
@@ -133,6 +135,8 @@  xfs_inode_free(
 	spin_lock(&ip->i_flags_lock);
 	ip->i_flags = XFS_IRECLAIM;
 	ip->i_ino = 0;
+	ip->i_sick = 0;
+	ip->i_checked = 0;
 	spin_unlock(&ip->i_flags_lock);
 
 	__xfs_inode_free(ip);
@@ -449,6 +453,8 @@  xfs_iget_cache_hit(
 		ip->i_flags |= XFS_INEW;
 		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
 		inode->i_state = I_NEW;
+		ip->i_sick = 0;
+		ip->i_checked = 0;
 
 		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
 		init_rwsem(&inode->i_rwsem);
@@ -1177,6 +1183,8 @@  xfs_reclaim_inode(
 	spin_lock(&ip->i_flags_lock);
 	ip->i_flags = XFS_IRECLAIM;
 	ip->i_ino = 0;
+	ip->i_sick = 0;
+	ip->i_checked = 0;
 	spin_unlock(&ip->i_flags_lock);
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 88239c2dd824..494e47ef42cb 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -45,6 +45,14 @@  typedef struct xfs_inode {
 	mrlock_t		i_lock;		/* inode lock */
 	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
 	atomic_t		i_pincount;	/* inode pin count */
+
+	/*
+	 * Bitsets of inode metadata that have been checked and/or are sick.
+	 * Callers must hold i_flags_lock before accessing this field.
+	 */
+	uint16_t		i_checked;
+	uint16_t		i_sick;
+
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
 	/* Miscellaneous state. */
 	unsigned long		i_flags;	/* see defined flags below */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index fd63b0b1307c..6581381c12be 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -231,6 +231,7 @@  xfs_initialize_perag(
 		error = xfs_iunlink_init(pag);
 		if (error)
 			goto out_hash_destroy;
+		spin_lock_init(&pag->pag_state_lock);
 	}
 
 	index = xfs_set_inode_alloc(mp, agcount);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 110f927cf943..cf7facc36a5f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -60,6 +60,20 @@  struct xfs_error_cfg {
 typedef struct xfs_mount {
 	struct super_block	*m_super;
 	xfs_tid_t		m_tid;		/* next unused tid for fs */
+
+	/*
+	 * Bitsets of per-fs metadata that have been checked and/or are sick.
+	 * Callers must hold m_sb_lock to access these two fields.
+	 */
+	uint8_t			m_fs_checked;
+	uint8_t			m_fs_sick;
+	/*
+	 * Bitsets of rt metadata that have been checked and/or are sick.
+	 * Callers must hold m_sb_lock to access this field.
+	 */
+	uint8_t			m_rt_checked;
+	uint8_t			m_rt_sick;
+
 	struct xfs_ail		*m_ail;		/* fs active log item list */
 
 	struct xfs_sb		m_sb;		/* copy of fs superblock */
@@ -369,6 +383,15 @@  typedef struct xfs_perag {
 	xfs_agino_t	pagl_pagino;
 	xfs_agino_t	pagl_leftrec;
 	xfs_agino_t	pagl_rightrec;
+
+	/*
+	 * Bitsets of per-ag metadata that have been checked and/or are sick.
+	 * Callers should hold pag_state_lock before accessing this field.
+	 */
+	uint16_t	pag_checked;
+	uint16_t	pag_sick;
+	spinlock_t	pag_state_lock;
+
 	spinlock_t	pagb_lock;	/* lock for pagb_tree */
 	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
 	unsigned int	pagb_gen;	/* generation count for pagb_tree */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 47fb07d86efd..f079841c7af6 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3440,6 +3440,79 @@  DEFINE_AGINODE_EVENT(xfs_iunlink);
 DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
 DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
 
+DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
+	TP_PROTO(struct xfs_mount *mp, unsigned int flags),
+	TP_ARGS(mp, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d flags 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->flags)
+);
+#define DEFINE_FS_CORRUPT_EVENT(name)	\
+DEFINE_EVENT(xfs_fs_corrupt_class, name,	\
+	TP_PROTO(struct xfs_mount *mp, unsigned int flags), \
+	TP_ARGS(mp, flags))
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
+
+DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags),
+	TP_ARGS(mp, agno, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d agno %u flags 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno, __entry->flags)
+);
+#define DEFINE_AG_CORRUPT_EVENT(name)	\
+DEFINE_EVENT(xfs_ag_corrupt_class, name,	\
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 unsigned int flags), \
+	TP_ARGS(mp, agno, flags))
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
+
+DECLARE_EVENT_CLASS(xfs_inode_corrupt_class,
+	TP_PROTO(struct xfs_inode *ip, unsigned int flags),
+	TP_ARGS(ip, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = ip->i_mount->m_super->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx flags 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino, __entry->flags)
+);
+#define DEFINE_INODE_CORRUPT_EVENT(name)	\
+DEFINE_EVENT(xfs_inode_corrupt_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, unsigned int flags), \
+	TP_ARGS(ip, flags))
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH