
[05/22] xfs: scrub in-memory metadata buffers

Message ID 150061194082.14732.17289613506653968866.stgit@magnolia (mailing list archive)
State New, archived

Commit Message

Darrick J. Wong July 21, 2017, 4:39 a.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Call the verifier function for all in-memory metadata buffers, looking
for memory corruption either due to bad memory or coding bugs.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/Makefile         |    1 
 fs/xfs/libxfs/xfs_fs.h  |    3 +
 fs/xfs/scrub/common.c   |    4 +
 fs/xfs/scrub/common.h   |    2 +
 fs/xfs/scrub/metabufs.c |  177 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_trace.h      |    3 +
 6 files changed, 188 insertions(+), 2 deletions(-)
 create mode 100644 fs/xfs/scrub/metabufs.c




Comments

Allison Henderson July 23, 2017, 4:48 p.m. UTC | #1
On 7/20/2017 9:39 PM, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> Call the verifier function for all in-memory metadata buffers, looking
> for memory corruption either due to bad memory or coding bugs.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/Makefile         |    1
>  fs/xfs/libxfs/xfs_fs.h  |    3 +
>  fs/xfs/scrub/common.c   |    4 +
>  fs/xfs/scrub/common.h   |    2 +
>  fs/xfs/scrub/metabufs.c |  177 +++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_trace.h      |    3 +
>  6 files changed, 188 insertions(+), 2 deletions(-)
>  create mode 100644 fs/xfs/scrub/metabufs.c
>
>
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 4e04da9..67cf4ac 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -142,5 +142,6 @@ ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
>  xfs-y				+= $(addprefix scrub/, \
>  				   btree.o \
>  				   common.o \
> +				   metabufs.o \
>  				   )
>  endif
> diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
> index aeccc99..9fb3c65 100644
> --- a/fs/xfs/libxfs/xfs_fs.h
> +++ b/fs/xfs/libxfs/xfs_fs.h
> @@ -482,7 +482,8 @@ struct xfs_scrub_metadata {
>   * Metadata types and flags for scrub operation.
>   */
>  #define XFS_SCRUB_TYPE_TEST	0	/* dummy to test ioctl */
> -#define XFS_SCRUB_TYPE_MAX	0
> +#define XFS_SCRUB_TYPE_METABUFS	1	/* in-core metadata buffers */
> +#define XFS_SCRUB_TYPE_MAX	1
>
>  /* i: repair this metadata */
>  #define XFS_SCRUB_FLAG_REPAIR		(1 << 0)
> diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
> index 331aa14..e06131f 100644
> --- a/fs/xfs/scrub/common.c
> +++ b/fs/xfs/scrub/common.c
> @@ -610,6 +610,10 @@ static const struct xfs_scrub_meta_fns meta_scrub_fns[] = {
>  		.setup	= xfs_scrub_setup_fs,
>  		.scrub	= xfs_scrub_dummy,
>  	},
> +	{ /* in-memory metadata buffers */
> +		.setup	= xfs_scrub_setup_metabufs,
> +		.scrub	= xfs_scrub_metabufs,
> +	},
>  };
>
>  /* Dispatch metadata scrubbing. */
> diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
> index 15baccb..5f0818c 100644
> --- a/fs/xfs/scrub/common.h
> +++ b/fs/xfs/scrub/common.h
> @@ -198,12 +198,14 @@ int xfs_scrub_ag_btcur_init(struct xfs_scrub_context *sc,
>
>  #define SETUP_FN(name) int name(struct xfs_scrub_context *sc, struct xfs_inode *ip)
>  SETUP_FN(xfs_scrub_setup_fs);
> +SETUP_FN(xfs_scrub_setup_metabufs);
>  #undef SETUP_FN
>
>  /* Metadata scrubbers */
>
>  #define SCRUB_FN(name) int name(struct xfs_scrub_context *sc)
>  SCRUB_FN(xfs_scrub_dummy);
> +SCRUB_FN(xfs_scrub_metabufs);
>  #undef SCRUB_FN
>
>  #endif	/* __XFS_REPAIR_COMMON_H__ */
> diff --git a/fs/xfs/scrub/metabufs.c b/fs/xfs/scrub/metabufs.c
> new file mode 100644
> index 0000000..63faaa6
> --- /dev/null
> +++ b/fs/xfs/scrub/metabufs.c
> @@ -0,0 +1,177 @@
> +/*
> + * Copyright (C) 2017 Oracle.  All Rights Reserved.
> + *
> + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version 2
> + * of the License, or (at your option) any later version.
> + *
> + * This program is distributed in the hope that it would be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write the Free Software Foundation,
> + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
> + */
> +#include "xfs.h"
> +#include "xfs_fs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_defer.h"
> +#include "xfs_btree.h"
> +#include "xfs_bit.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans.h"
> +#include "xfs_trace.h"
> +#include "xfs_sb.h"
> +#include "scrub/common.h"
> +
> +/* We only iterate buffers one by one, so we don't need any setup. */
> +int
> +xfs_scrub_setup_metabufs(
> +	struct xfs_scrub_context	*sc,
> +	struct xfs_inode		*ip)
> +{
> +	return 0;
> +}
> +
> +#define XFS_SCRUB_METABUFS_TOO_MANY_RETRIES	10
> +struct xfs_scrub_metabufs_info {
> +	struct xfs_scrub_context	*sc;
> +	unsigned int			retries;
> +};
> +
> +/* In-memory buffer corruption. */
> +
> +#define XFS_SCRUB_BUF_OP_ERROR_GOTO(label) \
> +	XFS_SCRUB_OP_ERROR_GOTO(smi->sc, \
> +			xfs_daddr_to_agno(smi->sc->mp, bp->b_bn), \
> +			xfs_daddr_to_agbno(smi->sc->mp, bp->b_bn), "buf", \
> +			&error, label)
> +STATIC int
> +xfs_scrub_metabufs_scrub_buf(
> +	struct xfs_scrub_metabufs_info	*smi,
> +	struct xfs_buf			*bp)
> +{
> +	int				olderror;
> +	int				error = 0;
> +
> +	/*
> +	 * We hold the rcu lock during the rhashtable walk, so we can't risk
> +	 * having the log forced due to a stale buffer by xfs_buf_lock.
> +	 */
> +	if (bp->b_flags & XBF_STALE)
> +		return 0;
> +
> +	atomic_inc(&bp->b_hold);
> +	if (!xfs_buf_trylock(bp)) {
> +		if (smi->retries > XFS_SCRUB_METABUFS_TOO_MANY_RETRIES) {
> +			/* We've retried too many times, do what we can. */
> +			XFS_SCRUB_INCOMPLETE(smi->sc, "metabufs", true);
> +			error = 0;
> +		} else {
> +			/* Restart the metabuf scrub from the start. */
> +			smi->retries++;
> +			error = -EAGAIN;
> +		}
> +		goto out_dec;
> +	}
> +
> +	/* Skip this buffer if it's stale, unread, or has no verifiers. */
> +	if ((bp->b_flags & XBF_STALE) ||
> +	    !(bp->b_flags & XBF_DONE) ||
> +	    !bp->b_ops)
> +		goto out_unlock;
> +
> +	/*
> +	 * Run the verifiers to see if the in-memory buffer is bitrotting or
> +	 * otherwise corrupt.  If the buffer doesn't have a log item then
> +	 * it's clean, so call the read verifier.  However, if the buffer
> +	 * has a log item, it is probably dirty.  Checksums will be written
> +	 * when the buffer is about to go out to disk, so call the write
> +	 * verifier to check the structure.
> +	 */
> +	olderror = bp->b_error;
> +	if (bp->b_fspriv)
> +		bp->b_ops->verify_write(bp);
> +	else
> +		bp->b_ops->verify_read(bp);
> +	error = bp->b_error;
> +	bp->b_error = olderror;
> +
> +	/* Mark any corruption errors we might find. */
> +	XFS_SCRUB_BUF_OP_ERROR_GOTO(out_unlock);
> +
> +out_unlock:
> +	xfs_buf_unlock(bp);
> +out_dec:
> +	atomic_dec(&bp->b_hold);
> +	return error;
> +}
> +#undef XFS_SCRUB_BUF_OP_ERROR_GOTO
> +
> +/* Walk the buffer rhashtable and dispatch buffer checking. */
> +STATIC int
> +xfs_scrub_metabufs_walk_rhash(
> +	struct xfs_scrub_metabufs_info	*smi,
> +	struct rhashtable_iter		*iter)
> +{
> +	struct xfs_buf			*bp;
> +	int				error = 0;
> +
> +	do {
> +		if (xfs_scrub_should_terminate(&error))
> +			break;
> +
> +		bp = rhashtable_walk_next(iter);
> +		if (IS_ERR(bp))
> +			return PTR_ERR(bp);
> +		else if (bp == NULL)
> +			return 0;
> +
> +		error = xfs_scrub_metabufs_scrub_buf(smi, bp);
> +	} while (error != 0);
> +
> +	return error;
> +}
> +
> +/* Try to walk the buffers in this AG in order to scrub them. */
> +int
> +xfs_scrub_metabufs(
> +	struct xfs_scrub_context	*sc)
> +{
> +	struct xfs_scrub_metabufs_info	smi;
> +	struct rhashtable_iter		iter;
> +	struct xfs_perag		*pag;
> +	int				error;
> +
> +	smi.sc = sc;
> +	smi.retries = 0;
> +	pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
> +	rhashtable_walk_enter(&pag->pag_buf_hash, &iter);
> +
> +	while (1) {
> +		if (xfs_scrub_should_terminate(&error))
> +			break;
> +
> +		error = rhashtable_walk_start(&iter);
> +		if (!error) {
> +			error = xfs_scrub_metabufs_walk_rhash(&smi, &iter);
> +			rhashtable_walk_stop(&iter);
> +		}
> +
> +		if (error != -EAGAIN)
> +			break;
> +		cond_resched();
> +	}
I suppose it's unlikely that we end up looping too many times, but do
you think we should have a maximum number of tries just in case?

Rest of the patch looks good.
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>

> +
> +	rhashtable_walk_exit(&iter);
> +	xfs_perag_put(pag);
> +	return error;
> +}
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index d4de29b..036e65c 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3312,7 +3312,8 @@ DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping);
>
>  /* scrub */
>  #define XFS_SCRUB_TYPE_DESC \
> -	{ XFS_SCRUB_TYPE_TEST,		"dummy" }
> +	{ XFS_SCRUB_TYPE_TEST,		"dummy" }, \
> +	{ XFS_SCRUB_TYPE_METABUFS,	"metabufs" }
>  DECLARE_EVENT_CLASS(xfs_scrub_class,
>  	TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
>  		 int error),
>
Dave Chinner July 24, 2017, 1:43 a.m. UTC | #2
On Thu, Jul 20, 2017 at 09:39:00PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Call the verifier function for all in-memory metadata buffers, looking
> for memory corruption either due to bad memory or coding bugs.

How does this fit into the bigger picture? We can't do an exhaustive
search of the in memory buffer cache, because access is racy w.r.t.
the life cycle of in memory buffers.

Also, if we are doing a full scrub, we're going to hit and then
check the cached in-memory buffers anyway, so I'm missing the
context that explains why this code is necessary.

>  #endif	/* __XFS_REPAIR_COMMON_H__ */
> diff --git a/fs/xfs/scrub/metabufs.c b/fs/xfs/scrub/metabufs.c
> new file mode 100644
> index 0000000..63faaa6
> --- /dev/null
> +++ b/fs/xfs/scrub/metabufs.c
> @@ -0,0 +1,177 @@
> +/*
> + * Copyright (C) 2017 Oracle.  All Rights Reserved.
> + *
> + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version 2
> + * of the License, or (at your option) any later version.
> + *
> + * This program is distributed in the hope that it would be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write the Free Software Foundation,
> + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
> + */
> +#include "xfs.h"
> +#include "xfs_fs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_defer.h"
> +#include "xfs_btree.h"
> +#include "xfs_bit.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans.h"
> +#include "xfs_trace.h"
> +#include "xfs_sb.h"
> +#include "scrub/common.h"
> +
> +/* We only iterate buffers one by one, so we don't need any setup. */
> +int
> +xfs_scrub_setup_metabufs(
> +	struct xfs_scrub_context	*sc,
> +	struct xfs_inode		*ip)
> +{
> +	return 0;
> +}
> +
> +#define XFS_SCRUB_METABUFS_TOO_MANY_RETRIES	10
> +struct xfs_scrub_metabufs_info {
> +	struct xfs_scrub_context	*sc;
> +	unsigned int			retries;
> +};

So do we get 10 retries per buffer, or 10 retries across an entire
scan?

> +/* In-memory buffer corruption. */
> +
> +#define XFS_SCRUB_BUF_OP_ERROR_GOTO(label) \
> +	XFS_SCRUB_OP_ERROR_GOTO(smi->sc, \
> +			xfs_daddr_to_agno(smi->sc->mp, bp->b_bn), \
> +			xfs_daddr_to_agbno(smi->sc->mp, bp->b_bn), "buf", \
> +			&error, label)

Nested macros - yuck!

> +STATIC int
> +xfs_scrub_metabufs_scrub_buf(
> +	struct xfs_scrub_metabufs_info	*smi,
> +	struct xfs_buf			*bp)
> +{
> +	int				olderror;
> +	int				error = 0;
> +
> +	/*
> +	 * We hold the rcu lock during the rhashtable walk, so we can't risk
> +	 * having the log forced due to a stale buffer by xfs_buf_lock.
> +	 */
> +	if (bp->b_flags & XBF_STALE)
> +		return 0;
> +
> +	atomic_inc(&bp->b_hold);

This looks wrong. I think it can race with reclaim because we don't
hold the pag->pag_buf_lock. i.e.  xfs_buf_rele() does this:

	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);

to prevent lookups - which are done under the pag->pag_buf_lock -
from finding the buffer while it has a zero hold count and may be
removed from the cache and freed.
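
To make that concrete: a race-free way to pin a buffer found outside
the normal lookup path would have to mirror what the lookup side does
under pag->pag_buf_lock.  A minimal sketch (the helper name is made
up, and the zero-hold check is illustrative rather than the exact
_xfs_buf_find() logic):

static bool
xfs_scrub_metabufs_try_hold(
	struct xfs_perag	*pag,
	struct xfs_buf		*bp)
{
	bool			held = false;

	spin_lock(&pag->pag_buf_lock);
	/*
	 * A zero hold count means xfs_buf_rele() has already committed
	 * to tearing this buffer down, so leave it alone.
	 */
	if (atomic_read(&bp->b_hold)) {
		atomic_inc(&bp->b_hold);
		held = true;
	}
	spin_unlock(&pag->pag_buf_lock);
	return held;
}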

Further, if we are going to iterate the cache, I'd much prefer the
iteration code to be in fs/xfs/xfs_buf.c - nothing outside that file
should be touching core buffer cache structures....

FWIW, the LRU walks already handle this problem by the fact the LRU
owns a hold count on the buffer. So it may be better to do this via
a LRU walk rather than a hashtable walk....

[snip]

> +	olderror = bp->b_error;
> +	if (bp->b_fspriv)
> +		bp->b_ops->verify_write(bp);

Should we be recalculating the CRC on buffers we aren't about to 
be writing to disk? Should we be verifying a buffer that has a
non-zero error value on it?

> +	else
> +		bp->b_ops->verify_read(bp);
> +	error = bp->b_error;
> +	bp->b_error = olderror;
> +
> +	/* Mark any corruption errors we might find. */
> +	XFS_SCRUB_BUF_OP_ERROR_GOTO(out_unlock);

Ah, what? Why does this need a goto? And why doesn't it report the
error that was found? (bloody macros!).

> +out_unlock:
> +	xfs_buf_unlock(bp);
> +out_dec:
> +	atomic_dec(&bp->b_hold);
> +	return error;
> +}
> +#undef XFS_SCRUB_BUF_OP_ERROR_GOTO

Oh, that's the macro defined above the function. Which I paid little
attention to other than it called another macro. Now I realise that
it (ab)uses local variables without them being passed into the
macro. Yup, another reason we need to get rid of the macros from
this code....

> +	struct xfs_scrub_metabufs_info	*smi,
> +	struct rhashtable_iter		*iter)
> +{
> +	struct xfs_buf			*bp;
> +	int				error = 0;
> +
> +	do {
> +		if (xfs_scrub_should_terminate(&error))
> +			break;
> +
> +		bp = rhashtable_walk_next(iter);
> +		if (IS_ERR(bp))
> +			return PTR_ERR(bp);
> +		else if (bp == NULL)
> +			return 0;
> +
> +		error = xfs_scrub_metabufs_scrub_buf(smi, bp);
> +	} while (error != 0);
> +
> +	return error;
> +}
> +
> +/* Try to walk the buffers in this AG in order to scrub them. */
> +int
> +xfs_scrub_metabufs(

Ah, please put an "_ag_" in this so it's clear it's only scrubbing a
single AG. This is hidden deep inside the scrub context, so it took
me a little bit of back tracking to understand that this wasn't a
global scan....

Cheers,

Dave.
Darrick J. Wong July 24, 2017, 10:36 p.m. UTC | #3
On Mon, Jul 24, 2017 at 11:43:27AM +1000, Dave Chinner wrote:
> On Thu, Jul 20, 2017 at 09:39:00PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Call the verifier function for all in-memory metadata buffers, looking
> > for memory corruption either due to bad memory or coding bugs.
> 
> How does this fit into the bigger picture? We can't do an exhaustive
> search of the in memory buffer cache, because access is racy w.r.t.
> the life cycle of in memory buffers.
> 
> Also, if we are doing a full scrub, we're going to hit and then
> check the cached in-memory buffers anyway, so I'm missing the
> context that explains why this code is necessary.

Before we start scanning the filesystem (which could lead to clean
buffers being pushed out of memory and later reread), we want to check
the buffers that have been sitting around in memory to see if they've
mutated since the last time the verifiers ran.

> >  #endif	/* __XFS_REPAIR_COMMON_H__ */
> > diff --git a/fs/xfs/scrub/metabufs.c b/fs/xfs/scrub/metabufs.c
> > new file mode 100644
> > index 0000000..63faaa6
> > --- /dev/null
> > +++ b/fs/xfs/scrub/metabufs.c
> > @@ -0,0 +1,177 @@
> > +/*
> > + * Copyright (C) 2017 Oracle.  All Rights Reserved.
> > + *
> > + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public License
> > + * as published by the Free Software Foundation; either version 2
> > + * of the License, or (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it would be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, write the Free Software Foundation,
> > + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
> > + */
> > +#include "xfs.h"
> > +#include "xfs_fs.h"
> > +#include "xfs_shared.h"
> > +#include "xfs_format.h"
> > +#include "xfs_trans_resv.h"
> > +#include "xfs_mount.h"
> > +#include "xfs_defer.h"
> > +#include "xfs_btree.h"
> > +#include "xfs_bit.h"
> > +#include "xfs_log_format.h"
> > +#include "xfs_trans.h"
> > +#include "xfs_trace.h"
> > +#include "xfs_sb.h"
> > +#include "scrub/common.h"
> > +
> > +/* We only iterate buffers one by one, so we don't need any setup. */
> > +int
> > +xfs_scrub_setup_metabufs(
> > +	struct xfs_scrub_context	*sc,
> > +	struct xfs_inode		*ip)
> > +{
> > +	return 0;
> > +}
> > +
> > +#define XFS_SCRUB_METABUFS_TOO_MANY_RETRIES	10
> > +struct xfs_scrub_metabufs_info {
> > +	struct xfs_scrub_context	*sc;
> > +	unsigned int			retries;
> > +};
> 
> So do we get 10 retries per buffer, or 10 retries across an entire
> scan?

Ten per scan.

> > +/* In-memory buffer corruption. */
> > +
> > +#define XFS_SCRUB_BUF_OP_ERROR_GOTO(label) \
> > +	XFS_SCRUB_OP_ERROR_GOTO(smi->sc, \
> > +			xfs_daddr_to_agno(smi->sc->mp, bp->b_bn), \
> > +			xfs_daddr_to_agbno(smi->sc->mp, bp->b_bn), "buf", \
> > +			&error, label)
> 
> Nested macros - yuck!
> 
> > +STATIC int
> > +xfs_scrub_metabufs_scrub_buf(
> > +	struct xfs_scrub_metabufs_info	*smi,
> > +	struct xfs_buf			*bp)
> > +{
> > +	int				olderror;
> > +	int				error = 0;
> > +
> > +	/*
> > +	 * We hold the rcu lock during the rhashtable walk, so we can't risk
> > +	 * having the log forced due to a stale buffer by xfs_buf_lock.
> > +	 */
> > +	if (bp->b_flags & XBF_STALE)
> > +		return 0;
> > +
> > +	atomic_inc(&bp->b_hold);
> 
> This looks wrong. I think it can race with reclaim because we don't
> hold the pag->pag_buf_lock. i.e.  xfs_buf_rele() does this:
> 
> 	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
> 
> to prevent lookups - which are done under the pag->pag_buf_lock -
> from finding the buffer while it has a zero hold count and may be
> removed from the cache and freed.

I could be misunderstanding rhashtable here -- as I understand it, the
rhashtable_walk_start function calls rcu_read_lock and doesn't release
it until we call rhashtable_walk_stop.  The rhashtable lookup, insert,
and remove functions each call rcu_read_lock before fiddling with the
hashtable internals.  I /think/ this means that even if the scrubber
gets ahold of a buffer with zero b_hold that's being xfs_buf_rele'd,
that concurrent xfs_buf_rele will be waiting for rcu_read_lock, and
therefore the buffer cannot be freed until the walk stops.

> Further, if we are going to iterate the cache, I'd much prefer the
> iteration code to be in fs/xfs/xfs_buf.c - nothing outside that file
> should be touching core buffer cache structures....
> 
> FWIW, the LRU walks already handle this problem by the fact the LRU
> owns a hold count on the buffer. So it may be better to do this via
> a LRU walk rather than a hashtable walk....

Yeah, once we clear up the above I'll move it to xfs_buf.c.

> [snip]
> 
> > +	olderror = bp->b_error;
> > +	if (bp->b_fspriv)
> > +		bp->b_ops->verify_write(bp);
> 
> Should we be recalculating the CRC on buffers we aren't about to 
> be writing to disk?

I don't think it causes any harm to recalculate the CRC early.  If the
buffer is dirty and corrupt we can't fix it, and writeout will flag it
and shut down the fs, so it likely doesn't matter anyway.

> Should we be verifying a buffer that has a non-zero error value on it?

No.

> > +	else
> > +		bp->b_ops->verify_read(bp);
> > +	error = bp->b_error;
> > +	bp->b_error = olderror;
> > +
> > +	/* Mark any corruption errors we might find. */
> > +	XFS_SCRUB_BUF_OP_ERROR_GOTO(out_unlock);
> 
> Ah, what? Why does this need a goto? And why doesn't it report the
> error that was found? (bloody macros!).
> 
> > +out_unlock:
> > +	xfs_buf_unlock(bp);
> > +out_dec:
> > +	atomic_dec(&bp->b_hold);
> > +	return error;
> > +}
> > +#undef XFS_SCRUB_BUF_OP_ERROR_GOTO
> 
> Oh, that's the macro defined above the function. Which I paid little
> attention to other than it called another macro. Now I realise that
> it (ab)uses local variables without them being passed into the
> macro. Yup, another reason we need to get rid of the macros from
> this code....

Ok.

> > +	struct xfs_scrub_metabufs_info	*smi,
> > +	struct rhashtable_iter		*iter)
> > +{
> > +	struct xfs_buf			*bp;
> > +	int				error = 0;
> > +
> > +	do {
> > +		if (xfs_scrub_should_terminate(&error))
> > +			break;
> > +
> > +		bp = rhashtable_walk_next(iter);
> > +		if (IS_ERR(bp))
> > +			return PTR_ERR(bp);
> > +		else if (bp == NULL)
> > +			return 0;
> > +
> > +		error = xfs_scrub_metabufs_scrub_buf(smi, bp);
> > +	} while (error != 0);
> > +
> > +	return error;
> > +}
> > +
> > +/* Try to walk the buffers in this AG in order to scrub them. */
> > +int
> > +xfs_scrub_metabufs(
> 
> Ah, please put an "_ag_" in this so it's clear it's only scrubbing a
> single AG. This is hidden deep inside the scrub context, so it took
> me a little bit of back tracking to understand that this wasn't a
> global scan....

Ok.

--D

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner July 24, 2017, 11:38 p.m. UTC | #4
On Mon, Jul 24, 2017 at 03:36:54PM -0700, Darrick J. Wong wrote:
> On Mon, Jul 24, 2017 at 11:43:27AM +1000, Dave Chinner wrote:
> > On Thu, Jul 20, 2017 at 09:39:00PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Call the verifier function for all in-memory metadata buffers, looking
> > > for memory corruption either due to bad memory or coding bugs.
> > 
> > How does this fit into the bigger picture? We can't do an exhaustive
> > search of the in memory buffer cache, because access is racy w.r.t.
> > the life cycle of in memory buffers.
> > 
> > Also, if we are doing a full scrub, we're going to hit and then
> > check the cached in-memory buffers anyway, so I'm missing the
> > context that explains why this code is necessary.
> 
> Before we start scanning the filesystem (which could lead to clean
> buffers being pushed out of memory and later reread), we want to check
> the buffers that have been sitting around in memory to see if they've
> mutated since the last time the verifiers ran.

I'm not sure we need a special cache walk to do this.

My thinking is that if the buffers get pushed out of memory, the
verifier will be run at that time, so we don't need to run the
verifier before a scrub to avoid problems here.

Further, if we read the buffer as part of the scrub and it's found
in cache, then if the scrub finds a corruption we'll know it
happened between the last verifier invocation and the scrub.

If the buffer is not in cache and scrub reads the metadata from
disk, then the verifier should fire on read if the item is corrupt
coming off disk. If the verifier doesn't find corruption in this
case but scrub does, then we've got to think about whether the
verifier has sufficient coverage.

> > > +#define XFS_SCRUB_METABUFS_TOO_MANY_RETRIES	10
> > > +struct xfs_scrub_metabufs_info {
> > > +	struct xfs_scrub_context	*sc;
> > > +	unsigned int			retries;
> > > +};
> > 
> > So do we get 10 retries per buffer, or 10 retries across an entire
> > scan?
> 
> Ten per scan.

That will prevent large/active filesystems from being scanned
completely, I think.

> > > +STATIC int
> > > +xfs_scrub_metabufs_scrub_buf(
> > > +	struct xfs_scrub_metabufs_info	*smi,
> > > +	struct xfs_buf			*bp)
> > > +{
> > > +	int				olderror;
> > > +	int				error = 0;
> > > +
> > > +	/*
> > > +	 * We hold the rcu lock during the rhashtable walk, so we can't risk
> > > +	 * having the log forced due to a stale buffer by xfs_buf_lock.
> > > +	 */
> > > +	if (bp->b_flags & XBF_STALE)
> > > +		return 0;
> > > +
> > > +	atomic_inc(&bp->b_hold);
> > 
> > This looks wrong. I think it can race with reclaim because we don't
> > hold the pag->pag_buf_lock. i.e.  xfs_buf_rele() does this:
> > 
> > 	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
> > 
> > to prevent lookups - which are done under the pag->pag_buf_lock -
> > from finding the buffer while it has a zero hold count and may be
> > removed from the cache and freed.
> 
> I could be misunderstanding rhashtable here -- as I understand it, the
> rhashtable_walk_start function calls rcu_read_lock and doesn't release
> it until we call rhashtable_walk_stop.  The rhashtable lookup, insert,
> and remove functions each call rcu_read_lock before fiddling with the
> hashtable internals.  I /think/ this means that even if the scrubber
> gets ahold of a buffer with zero b_hold that's being xfs_buf_rele'd,
> that concurrent xfs_buf_rele will be waiting for rcu_read_lock, and
> therefore the buffer cannot be freed until the walk stops.

No, we're not using RCU to protect object life cycles, and RCU also
doesn't protect the rhashtable walks for xfs_bufs because the
xfs_bufs are not freed by a RCU callback at the end of the grace
period.
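
For that RCU argument to hold, freeing an xfs_buf would have to be
deferred behind a grace period, which it isn't.  Purely as a
hypothetical sketch (struct xfs_buf has no rcu_head member, so b_rcu
below is made up), RCU-safe freeing would look something like:

/* Hypothetical: would require adding a struct rcu_head b_rcu to xfs_buf. */
static void
xfs_buf_free_rcu(
	struct rcu_head		*head)
{
	struct xfs_buf		*bp = container_of(head, struct xfs_buf, b_rcu);

	kmem_zone_free(xfs_buf_zone, bp);
}

/*
 * ...and the final kmem_zone_free() in xfs_buf_free() would become
 * call_rcu(&bp->b_rcu, xfs_buf_free_rcu), so that a walker under
 * rcu_read_lock() could never see freed memory.
 */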

FYI, when we moved to the rhashtable code, Lucas Stach also provided
an RCU lookup patch, which I didn't merge.  It turns out that the RCU
freeing of xfs_bufs has more overhead than the potential CPU usage
saved by removing lock contention in lookups:

https://www.spinics.net/lists/linux-xfs/msg02186.html

IOWs, the per-ag rhashtable has low enough contention
characteristics that the infrastructure overhead of lockless lookups
results in a net performance loss, and so the cache index
insert/lookup/remove code is still protected by
pag->pag_buf_lock....

The LRU walk doesn't need the pag->pag_buf_lock because all the
objects on the LRU already have a reference. Hence it can be walked
without affecting lookup/insert/remove performance of the cache...
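
For example, a scrub pass could ride on the buftarg LRU.  This is a
sketch only, modelled on xfs_buftarg_isolate(): the callback name and
the per-AG filter are assumptions, and error handling and lock hold
times are glossed over.

static enum lru_status
xfs_scrub_metabufs_lru_scan(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)
{
	struct xfs_scrub_metabufs_info	*smi = arg;
	struct xfs_buf			*bp;

	bp = container_of(item, struct xfs_buf, b_lru);

	/* The LRU owns a hold count, so bp can't be freed under us. */
	if (xfs_daddr_to_agno(smi->sc->mp, bp->b_bn) != smi->sc->sm->sm_agno)
		return LRU_SKIP;

	xfs_scrub_metabufs_scrub_buf(smi, bp);
	return LRU_SKIP;
}

/*
 * The caller would then walk the whole buftarg, e.g.:
 *	list_lru_walk(&sc->mp->m_ddev_targp->bt_lru,
 *		      xfs_scrub_metabufs_lru_scan, &smi, LONG_MAX);
 */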

> > [snip]
> > 
> > > +	olderror = bp->b_error;
> > > +	if (bp->b_fspriv)
> > > +		bp->b_ops->verify_write(bp);
> > 
> > Should we be recalculating the CRC on buffers we aren't about to 
> > be writing to disk?
> 
> I don't think it causes any harm to recalculate the CRC early.  If the
> buffer is dirty and corrupt we can't fix it, and writeout will flag it
> and shut down the fs, so it likely doesn't matter anyway.

ok.

Cheers,

Dave.
Darrick J. Wong July 25, 2017, 12:14 a.m. UTC | #5
On Tue, Jul 25, 2017 at 09:38:13AM +1000, Dave Chinner wrote:
> On Mon, Jul 24, 2017 at 03:36:54PM -0700, Darrick J. Wong wrote:
> > On Mon, Jul 24, 2017 at 11:43:27AM +1000, Dave Chinner wrote:
> > > On Thu, Jul 20, 2017 at 09:39:00PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Call the verifier function for all in-memory metadata buffers, looking
> > > > for memory corruption either due to bad memory or coding bugs.
> > > 
> > > How does this fit into the bigger picture? We can't do an exhaustive
> > > search of the in memory buffer cache, because access is racy w.r.t.
> > > the life cycle of in memory buffers.
> > > 
> > > Also, if we are doing a full scrub, we're going to hit and then
> > > check the cached in-memory buffers anyway, so I'm missing the
> > > context that explains why this code is necessary.
> > 
> > Before we start scanning the filesystem (which could lead to clean
> > buffers being pushed out of memory and later reread), we want to check
> > the buffers that have been sitting around in memory to see if they've
> > mutated since the last time the verifiers ran.
> 
> I'm not sure we need a special cache walk to do this.
> 
> My thinking is that if the buffers get pushed out of memory, the
> verifier will be run at that time, so we don't need to run the
> verifier before a scrub to avoid problems here.

Agreed.

> Further, if we read the buffer as part of the scrub and it's found
> in cache, then if the scrub finds a corruption we'll know it
> happened between the last verifier invocation and the scrub.

Hm.  Prior to the introduction of the metabufs scanner a few weeks ago, 
I had thought it sufficient to assume that memory won't get corrupt, so
as long as the read verifier ran at /some/ point in the past we didn't
need to recheck now.

What if we scrap the metabufs scanner and adapt the read verifier
function pointer to allow scrub to bypass the crc check and return the
_THIS_IP_ from any failing structural test?  Then scrubbers can call the
read verifier directly and extract failure info directly.

> If the buffer is not in cache and scrub reads the metadata from
> disk, then the verifier should fire on read if the item is corrupt
> coming off disk. If the verifier doesn't find corruption in this
> case but scrub does, then we've got to think about whether the
> verifier has sufficient coverage.

Scrub has more comprehensive checks (or it will when xref comes along)
so this is likely to happen, fyi.

> > > > +#define XFS_SCRUB_METABUFS_TOO_MANY_RETRIES	10
> > > > +struct xfs_scrub_metabufs_info {
> > > > +	struct xfs_scrub_context	*sc;
> > > > +	unsigned int			retries;
> > > > +};
> > > 
> > > So do we get 10 retries per buffer, or 10 retries across an entire
> > > scan?
> > 
> > Ten per scan.
> 
> That will prevent large/active filesystems from being scanned
> completely, I think.

Perhaps.  Though at this point I think I'll just drop this patch entirely
in favor of re-calling the verifiers....

--D

> > > > +STATIC int
> > > > +xfs_scrub_metabufs_scrub_buf(
> > > > +	struct xfs_scrub_metabufs_info	*smi,
> > > > +	struct xfs_buf			*bp)
> > > > +{
> > > > +	int				olderror;
> > > > +	int				error = 0;
> > > > +
> > > > +	/*
> > > > +	 * We hold the rcu lock during the rhashtable walk, so we can't risk
> > > > +	 * having the log forced due to a stale buffer by xfs_buf_lock.
> > > > +	 */
> > > > +	if (bp->b_flags & XBF_STALE)
> > > > +		return 0;
> > > > +
> > > > +	atomic_inc(&bp->b_hold);
> > > 
> > > This looks wrong. I think it can race with reclaim because we don't
> > > hold the pag->pag_buf_lock. i.e.  xfs_buf_rele() does this:
> > > 
> > > 	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
> > > 
> > > to prevent lookups - which are done under the pag->pag_buf_lock -
> > > from finding the buffer while it has a zero hold count and may be
> > > removed from the cache and freed.
> > 
> > I could be misunderstanding rhashtable here -- as I understand it, the
> > rhashtable_walk_start function calls rcu_read_lock and doesn't release
> > it until we call rhashtable_walk_stop.  The rhashtable lookup, insert,
> > and remove functions each call rcu_read_lock before fiddling with the
> > hashtable internals.  I /think/ this means that even if the scrubber
> > gets ahold of a buffer with zero b_hold that's being xfs_buf_rele'd,
> > that concurrent xfs_buf_rele will be waiting for rcu_read_lock, and
> > therefore the buffer cannot be freed until the walk stops.
> 
> No, we're not using RCU to protect object life cycles, and RCU also
> doesn't protect the rhashtable walks for xfs_bufs because the
> xfs_bufs are not freed by a RCU callback at the end of the grace
> period.
> 
> FYI, when we moved to the rhashtable code, Lucas Stach also provided
> an RCU lookup patch, which I didn't merge.  It turns out that the RCU
> freeing of xfs_bufs has more overhead than the potential CPU usage
> saved by removing lock contention in lookups:
> 
> https://www.spinics.net/lists/linux-xfs/msg02186.html
> 
> IOWs, the per-ag rhashtable has low enough contention
> characteristics that the infrastructure overhead of lockless lookups
> results in a net performance loss, and so the cache index
> insert/lookup/remove code is still protected by
> pag->pag_buf_lock....
> 
> The LRU walk doesn't need the pag->pag_buf_lock because all the
> objects on the LRU already have a reference. Hence it can be walked
> without affecting lookup/insert/remove performance of the cache...
> 
> > > [snip]
> > > 
> > > > +	olderror = bp->b_error;
> > > > +	if (bp->b_fspriv)
> > > > +		bp->b_ops->verify_write(bp);
> > > 
> > > Should we be recalculating the CRC on buffers we aren't about to 
> > > be writing to disk?
> > 
> > I don't think it causes any harm to recalculate the CRC early.  If the
> > buffer is dirty and corrupt we can't fix it, and writeout will flag it
> > and shut down the fs, so it likely doesn't matter anyway.
> 
> ok.
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner July 25, 2017, 3:32 a.m. UTC | #6
On Mon, Jul 24, 2017 at 05:14:33PM -0700, Darrick J. Wong wrote:
> On Tue, Jul 25, 2017 at 09:38:13AM +1000, Dave Chinner wrote:
> > On Mon, Jul 24, 2017 at 03:36:54PM -0700, Darrick J. Wong wrote:
> > > On Mon, Jul 24, 2017 at 11:43:27AM +1000, Dave Chinner wrote:
> > > > On Thu, Jul 20, 2017 at 09:39:00PM -0700, Darrick J. Wong wrote:
> > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > 
> > > > > Call the verifier function for all in-memory metadata buffers, looking
> > > > > for memory corruption either due to bad memory or coding bugs.
> > > > 
> > > > How does this fit into the bigger picture? We can't do an exhaustive
> > > > search of the in memory buffer cache, because access is racy w.r.t.
> > > > the life cycle of in memory buffers.
> > > > 
> > > > Also, if we are doing a full scrub, we're going to hit and then
> > > > check the cached in-memory buffers anyway, so I'm missing the
> > > > context that explains why this code is necessary.
> > > 
> > > Before we start scanning the filesystem (which could lead to clean
> > > buffers being pushed out of memory and later reread), we want to check
> > > the buffers that have been sitting around in memory to see if they've
> > > mutated since the last time the verifiers ran.
> > 
> > I'm not sure we need a special cache walk to do this.
> > 
> > My thinking is that if the buffers get pushed out of memory, the
> > verifier will be run at that time, so we don't need to run the
> > verifier before a scrub to avoid problems here.
> 
> Agreed.
> 
> > Further, if we read the buffer as part of the scrub and it's found
> > in cache, then if the scrub finds a corruption we'll know it
> > happened between the last verifier invocation and the scrub.
> 
> Hm.  Prior to the introduction of the metabufs scanner a few weeks ago, 
> I had thought it sufficient to assume that memory won't get corrupt, so
> as long as the read verifier ran at /some/ point in the past we didn't
> need to recheck now.
> 
> What if we scrap the metabufs scanner and adapt the read verifier
> function pointer to allow scrub to bypass the crc check and return the
> _THIS_IP_ from any failing structural test?  Then scrubbers can call the
> read verifier directly and extract failure info directly.

Yeah, that would work - rather than adapting the .read_verify op
we currently have, maybe a new op .read_verify_nocrc could be added?
THat would mostly just be a different wrapper around the existing
verify functions that are shared between the read and write
verifiers...

> > If the buffer is not in cache and scrub reads the metadata from
> > disk, then the verifier should fire on read if the item is corrupt
> > coming off disk. If the verifier doesn't find corruption in this
> > case but scrub does, then we've got to think about whether the
> > verifier has sufficient coverage.
> 
> Scrub has more comprehensive checks (or it will when xref comes along)
> so this is likely to happen, fyi.

Yup, I expect it will. :) I also expect this to point out where the
verifiers can be improved, because I'm sure we haven't caught all
the "obviously wrong" cases in the verifiers yet...

Cheers,

Dave.
Darrick J. Wong July 25, 2017, 5:27 a.m. UTC | #7
On Tue, Jul 25, 2017 at 01:32:03PM +1000, Dave Chinner wrote:
> On Mon, Jul 24, 2017 at 05:14:33PM -0700, Darrick J. Wong wrote:
> > On Tue, Jul 25, 2017 at 09:38:13AM +1000, Dave Chinner wrote:
> > > On Mon, Jul 24, 2017 at 03:36:54PM -0700, Darrick J. Wong wrote:
> > > > On Mon, Jul 24, 2017 at 11:43:27AM +1000, Dave Chinner wrote:
> > > > > On Thu, Jul 20, 2017 at 09:39:00PM -0700, Darrick J. Wong wrote:
> > > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > 
> > > > > > Call the verifier function for all in-memory metadata buffers, looking
> > > > > > for memory corruption either due to bad memory or coding bugs.
> > > > > 
> > > > > How does this fit into the bigger picture? We can't do an exhaustive
> > > > > search of the in memory buffer cache, because access is racy w.r.t.
> > > > > the life cycle of in memory buffers.
> > > > > 
> > > > > Also, if we are doing a full scrub, we're going to hit and then
> > > > > check the cached in-memory buffers anyway, so I'm missing the
> > > > > context that explains why this code is necessary.
> > > > 
> > > > Before we start scanning the filesystem (which could lead to clean
> > > > buffers being pushed out of memory and later reread), we want to check
> > > > the buffers that have been sitting around in memory to see if they've
> > > > mutated since the last time the verifiers ran.
> > > 
> > > I'm not sure we need a special cache walk to do this.
> > > 
> > > My thinking is that if the buffers get pushed out of memory, the
> > > verifier will be run at that time, so we don't need to run the
> > > verifier before a scrub to avoid problems here.
> > 
> > Agreed.
> > 
> > > Further, if we read the buffer as part of the scrub and it's found
> > > in cache, then if the scrub finds a corruption we'll know it
> > > happened between the last verifier invocation and the scrub.
> > 
> > Hm.  Prior to the introduction of the metabufs scanner a few weeks ago, 
> > I had thought it sufficient to assume that memory won't get corrupt, so
> > as long as the read verifier ran at /some/ point in the past we didn't
> > need to recheck now.
> > 
> > What if we scrap the metabufs scanner and adapt the read verifier
> > function pointer to allow scrub to bypass the crc check and return the
> > _THIS_IP_ from any failing structural test?  Then scrubbers can call the
> > read verifier directly and extract failure info directly.
> 
> Yeah, that would work - rather than adapting the .read_verify op
> we currently have, maybe a new op .read_verify_nocrc could be added?
> That would mostly just be a different wrapper around the existing
> verify functions that are shared between the read and write
> verifiers...

I was going to call them ->verify_structure() or something like that.
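
Roughly what I have in mind, as a sketch only (nothing written yet;
the return convention would be the _THIS_IP_ of the failing check,
cast to a pointer, or NULL if the structure passes):

struct xfs_buf_ops {
	char	*name;
	void	(*verify_read)(struct xfs_buf *);
	void	(*verify_write)(struct xfs_buf *);
	/*
	 * New: structure-only check, no CRC verification.  Returns the
	 * _THIS_IP_ of the failing test or NULL if the buffer is clean.
	 */
	void	*(*verify_structure)(struct xfs_buf *);
};

Scrub would then call bp->b_ops->verify_structure(bp) directly and
report the returned failure address instead of decoding b_error.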

> > > If the buffer is not in cache and scrub reads the metadata from
> > > disk, then the verifier should fire on read if the item is corrupt
> > > coming off disk. If the verifier doesn't find corruption in this
> > > case but scrub does, then we've got to think about whether the
> > > verifier has sufficient coverage.
> > 
> > Scrub has more comprehensive checks (or it will when xref comes along)
> > so this is likely to happen, fyi.
> 
> Yup, I expect it will. :) I also expect this to point out where the
> verifiers can be improved, because I'm sure we haven't caught all
> the "obviously wrong" cases in the verifiers yet...

Well yes. :)  I have been running the "dangerous_fuzzers dangerous_scrub
dangerous_repair" tests again, as they fuzz everything to see what scrub
complains about vs. what repair actually tries to fix.

Current wtf list:

- do we need to check all the padding areas? does repair?
- problems with repair not resetting di_nlink when it moves stuff to l+f?
  (x/380)
- repair doesn't correct remote symlink blocks
- scrub doesn't barf on free block problems?? (x/396)
- repair doesn't notice attr remote value block problems? (x/404)
- XFS_WANT_CORRUPTED_ are verifiers, but should not trigger asserts?
- xfs_repair segfaults when freetag is too big

(Actually, I already sent a fix a couple of days ago for that last one,
but nobody has reviewed it.)

--D

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 4e04da9..67cf4ac 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -142,5 +142,6 @@  ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
 xfs-y				+= $(addprefix scrub/, \
 				   btree.o \
 				   common.o \
+				   metabufs.o \
 				   )
 endif
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index aeccc99..9fb3c65 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -482,7 +482,8 @@  struct xfs_scrub_metadata {
  * Metadata types and flags for scrub operation.
  */
 #define XFS_SCRUB_TYPE_TEST	0	/* dummy to test ioctl */
-#define XFS_SCRUB_TYPE_MAX	0
+#define XFS_SCRUB_TYPE_METABUFS	1	/* in-core metadata buffers */
+#define XFS_SCRUB_TYPE_MAX	1
 
 /* i: repair this metadata */
 #define XFS_SCRUB_FLAG_REPAIR		(1 << 0)
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 331aa14..e06131f 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -610,6 +610,10 @@  static const struct xfs_scrub_meta_fns meta_scrub_fns[] = {
 		.setup	= xfs_scrub_setup_fs,
 		.scrub	= xfs_scrub_dummy,
 	},
+	{ /* in-memory metadata buffers */
+		.setup	= xfs_scrub_setup_metabufs,
+		.scrub	= xfs_scrub_metabufs,
+	},
 };
 
 /* Dispatch metadata scrubbing. */
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 15baccb..5f0818c 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -198,12 +198,14 @@  int xfs_scrub_ag_btcur_init(struct xfs_scrub_context *sc,
 
 #define SETUP_FN(name) int name(struct xfs_scrub_context *sc, struct xfs_inode *ip)
 SETUP_FN(xfs_scrub_setup_fs);
+SETUP_FN(xfs_scrub_setup_metabufs);
 #undef SETUP_FN
 
 /* Metadata scrubbers */
 
 #define SCRUB_FN(name) int name(struct xfs_scrub_context *sc)
 SCRUB_FN(xfs_scrub_dummy);
+SCRUB_FN(xfs_scrub_metabufs);
 #undef SCRUB_FN
 
 #endif	/* __XFS_REPAIR_COMMON_H__ */
diff --git a/fs/xfs/scrub/metabufs.c b/fs/xfs/scrub/metabufs.c
new file mode 100644
index 0000000..63faaa6
--- /dev/null
+++ b/fs/xfs/scrub/metabufs.c
@@ -0,0 +1,177 @@ 
+/*
+ * Copyright (C) 2017 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+#include "xfs_sb.h"
+#include "scrub/common.h"
+
+/* We only iterate buffers one by one, so we don't need any setup. */
+int
+xfs_scrub_setup_metabufs(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip)
+{
+	return 0;
+}
+
+#define XFS_SCRUB_METABUFS_TOO_MANY_RETRIES	10
+struct xfs_scrub_metabufs_info {
+	struct xfs_scrub_context	*sc;
+	unsigned int			retries;
+};
+
+/* In-memory buffer corruption. */
+
+#define XFS_SCRUB_BUF_OP_ERROR_GOTO(label) \
+	XFS_SCRUB_OP_ERROR_GOTO(smi->sc, \
+			xfs_daddr_to_agno(smi->sc->mp, bp->b_bn), \
+			xfs_daddr_to_agbno(smi->sc->mp, bp->b_bn), "buf", \
+			&error, label)
+STATIC int
+xfs_scrub_metabufs_scrub_buf(
+	struct xfs_scrub_metabufs_info	*smi,
+	struct xfs_buf			*bp)
+{
+	int				olderror;
+	int				error = 0;
+
+	/*
+	 * We hold the rcu lock during the rhashtable walk, so we can't risk
+	 * having the log forced due to a stale buffer by xfs_buf_lock.
+	 */
+	if (bp->b_flags & XBF_STALE)
+		return 0;
+
+	atomic_inc(&bp->b_hold);
+	if (!xfs_buf_trylock(bp)) {
+		if (smi->retries > XFS_SCRUB_METABUFS_TOO_MANY_RETRIES) {
+			/* We've retried too many times, do what we can. */
+			XFS_SCRUB_INCOMPLETE(smi->sc, "metabufs", true);
+			error = 0;
+		} else {
+			/* Restart the metabuf scrub from the start. */
+			smi->retries++;
+			error = -EAGAIN;
+		}
+		goto out_dec;
+	}
+
+	/* Skip this buffer if it's stale, unread, or has no verifiers. */
+	if ((bp->b_flags & XBF_STALE) ||
+	    !(bp->b_flags & XBF_DONE) ||
+	    !bp->b_ops)
+		goto out_unlock;
+
+	/*
+	 * Run the verifiers to see if the in-memory buffer is bitrotting or
+	 * otherwise corrupt.  If the buffer doesn't have a log item then
+	 * it's clean, so call the read verifier.  However, if the buffer
+	 * has a log item, it is probably dirty.  Checksums will be written
+	 * when the buffer is about to go out to disk, so call the write
+	 * verifier to check the structure.
+	 */
+	olderror = bp->b_error;
+	if (bp->b_fspriv)
+		bp->b_ops->verify_write(bp);
+	else
+		bp->b_ops->verify_read(bp);
+	error = bp->b_error;
+	bp->b_error = olderror;
+
+	/* Mark any corruption errors we might find. */
+	XFS_SCRUB_BUF_OP_ERROR_GOTO(out_unlock);
+
+out_unlock:
+	xfs_buf_unlock(bp);
+out_dec:
+	atomic_dec(&bp->b_hold);
+	return error;
+}
+#undef XFS_SCRUB_BUF_OP_ERROR_GOTO
+
+/* Walk the buffer rhashtable and dispatch buffer checking. */
+STATIC int
+xfs_scrub_metabufs_walk_rhash(
+	struct xfs_scrub_metabufs_info	*smi,
+	struct rhashtable_iter		*iter)
+{
+	struct xfs_buf			*bp;
+	int				error = 0;
+
+	do {
+		if (xfs_scrub_should_terminate(&error))
+			break;
+
+		bp = rhashtable_walk_next(iter);
+		if (IS_ERR(bp))
+			return PTR_ERR(bp);
+		else if (bp == NULL)
+			return 0;
+
+		error = xfs_scrub_metabufs_scrub_buf(smi, bp);
+	} while (error != 0);
+
+	return error;
+}
+
+/* Try to walk the buffers in this AG in order to scrub them. */
+int
+xfs_scrub_metabufs(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_scrub_metabufs_info	smi;
+	struct rhashtable_iter		iter;
+	struct xfs_perag		*pag;
+	int				error;
+
+	smi.sc = sc;
+	smi.retries = 0;
+	pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
+	rhashtable_walk_enter(&pag->pag_buf_hash, &iter);
+
+	while (1) {
+		if (xfs_scrub_should_terminate(&error))
+			break;
+
+		error = rhashtable_walk_start(&iter);
+		if (!error) {
+			error = xfs_scrub_metabufs_walk_rhash(&smi, &iter);
+			rhashtable_walk_stop(&iter);
+		}
+
+		if (error != -EAGAIN)
+			break;
+		cond_resched();
+	}
+
+	rhashtable_walk_exit(&iter);
+	xfs_perag_put(pag);
+	return error;
+}
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index d4de29b..036e65c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3312,7 +3312,8 @@  DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping);
 
 /* scrub */
 #define XFS_SCRUB_TYPE_DESC \
-	{ XFS_SCRUB_TYPE_TEST,		"dummy" }
+	{ XFS_SCRUB_TYPE_TEST,		"dummy" }, \
+	{ XFS_SCRUB_TYPE_METABUFS,	"metabufs" }
 DECLARE_EVENT_CLASS(xfs_scrub_class,
 	TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
 		 int error),