diff mbox

[RFC] xfs_db: sanitize geometry on load

Message ID 20170110194249.GK14038@birch.djwong.org (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Darrick J. Wong Jan. 10, 2017, 7:42 p.m. UTC
xfs_db doesn't check the filesystem geometry when it's mounting, which
means that garbage agcount values can cause OOMs when we try to allocate
all the per-AG incore metadata.  If we see geometry that looks
suspicious, try to derive the actual AG geometry to avoid crashing the
system.  This should help with xfs/1301 fuzzing.

Also fix up xfs_repair to use the min/max dblocks macros.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 db/init.c   |   91 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
 repair/sb.c |    7 +----
 2 files changed, 83 insertions(+), 15 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Eric Sandeen Jan. 10, 2017, 8:52 p.m. UTC | #1
On 1/10/17 1:42 PM, Darrick J. Wong wrote:
> xfs_db doesn't check the filesystem geometry when it's mounting, which
> means that garbage agcount values can cause OOMs when we try to allocate
> all the per-AG incore metadata.  If we see geometry that looks
> suspicious, try to derive the actual AG geometry to avoid crashing the
> system.  This should help with xfs/1301 fuzzing.
> 
> Also fix up xfs_repair to use the min/max dblocks macros.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  db/init.c   |   91 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
>  repair/sb.c |    7 +----

The repair change is an unrelated no-op cleanup, yes?  Worth doing,
but in a second patch.

Ok, so now we do a whole bunch of tests and behind-your-back
fixups, but regardless of the problem we always issue the same
warning:

xfs_db: device fsfile AG geometry is insane.  Using agcount=XX

... even if agcount wasn't the problem:

xfs_db> write -c blocksize 32768
Allowing write of corrupted data and bad CRC
blocksize = 32768
xfs_db> quit
[sandeen@sandeen xfsprogs]$ db/xfs_db -x fsfile 
xfs_db: device fsfile AG geometry is insane.  Using agcount=4.

That's confusing.

The whole point of xfs_db is to debug what is /on disk/ - 
the above doesn't make it clear that it's not changed the on-disk
value.  And it doesn't make it clear what was wrong, or what got changed.

But backing up - the /problem/ here is the OOM, right?  That happens
for a very large agcount.  We shouldn't be doing any more than that;
I don't really agree that your sanitize function should be fixing up
stuff behind the user's back.  If it's broken, it's broken; let the
xfs_db wielding expert sort that out as long as xfs_db an be used to
do so.

But if the OOM means xfs_db can't start, then yeah, that's a problem.

What about sanitizing only the agcount, and saying something like:

xfs_db: device fsfile AG count is insane.  Initializing only first $X

(where $X is based on your heuristics)

-Eric

>  2 files changed, 83 insertions(+), 15 deletions(-)
> 
> diff --git a/db/init.c b/db/init.c
> index ec1e274..7e66d14 100644
> --- a/db/init.c
> +++ b/db/init.c
> @@ -51,13 +51,90 @@ usage(void)
>  	exit(1);
>  }
>  
> +/* Try to load an AG's superblock, no verifiers. */
> +static bool
> +load_sb(
> +	struct xfs_mount	*mp,
> +	xfs_agnumber_t		agno,
> +	struct xfs_sb		*sbp)
> +{
> +	struct xfs_buf		*bp;
> +
> +	bp = libxfs_readbuf(mp->m_ddev_targp,
> +			    XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
> +			    1 << (XFS_MAX_SECTORSIZE_LOG - BBSHIFT), 0, NULL);
> +
> +	if (!bp || bp->b_error)
> +		return false;
> +
> +	/* copy SB from buffer to in-core, converting architecture as we go */
> +	libxfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
> +	libxfs_putbuf(bp);
> +	libxfs_purgebuf(bp);
> +
> +	return true;
> +}
> +
> +/* If the geometry doesn't look sane, try to figure out the real geometry. */
> +static void
> +sanitize_geometry(
> +	struct xfs_mount	*mp,
> +	struct xfs_sb		*sbp)
> +{
> +	struct xfs_sb		sb;
> +	xfs_agblock_t		agblocks;
> +
> +	/* If the geometry looks ok, we're done. */
> +	if (sbp->sb_blocklog >= XFS_MIN_BLOCKSIZE_LOG &&
> +	    sbp->sb_blocklog <= XFS_MAX_BLOCKSIZE_LOG &&
> +	    sbp->sb_blocksize == (1 << sbp->sb_blocklog) &&
> +	    sbp->sb_dblocks * sbp->sb_blocksize <= x.dsize * x.dbsize &&
> +	    sbp->sb_dblocks <= XFS_MAX_DBLOCKS(sbp) &&
> +	    sbp->sb_dblocks >= XFS_MIN_DBLOCKS(sbp))
> +		return;
> +
> +	/* Check blocklog and blocksize */
> +	if (sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
> +	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG)
> +		sbp->sb_blocklog = libxfs_log2_roundup(sbp->sb_blocksize);
> +	if (sbp->sb_blocksize != (1 << sbp->sb_blocklog))
> +		sbp->sb_blocksize = (1 << sbp->sb_blocksize);
> +
> +	/* Clamp dblocks to the size of the device. */
> +	if (sbp->sb_dblocks > x.dsize * x.dbsize / sbp->sb_blocksize)
> +		sbp->sb_dblocks = x.dsize * x.dbsize / sbp->sb_blocksize;
> +
> +	/* See if agblocks helps us find a superblock. */
> +	mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
> +	if (load_sb(mp, 1, &sb) && sb.sb_magicnum == XFS_SB_MAGIC) {
> +		sbp->sb_agcount = sbp->sb_dblocks / sbp->sb_agblocks;
> +		goto out;
> +	}
> +
> +	/* See if agcount helps us find a superblock. */
> +	agblocks = sbp->sb_agblocks;
> +	sbp->sb_agblocks = sbp->sb_dblocks / sbp->sb_agcount;
> +	if (sbp->sb_agblocks != 0 &&
> +	    load_sb(mp, 1, &sb) &&
> +	    sb.sb_magicnum == XFS_SB_MAGIC) {
> +		goto out;
> +	}
> +
> +	/* Both are nuts, assume 1 AG. */
> +	sbp->sb_agblocks = agblocks;
> +	sbp->sb_agcount = 1;
> +out:
> +	fprintf(stderr,
> +		_("%s: device %s AG geometry is insane.  Using agcount=%u.\n"),
> +		progname, fsdevice, sbp->sb_agcount);
> +}
> +
>  void
>  init(
>  	int		argc,
>  	char		**argv)
>  {
>  	struct xfs_sb	*sbp;
> -	struct xfs_buf	*bp;
>  	int		c;
>  
>  	setlocale(LC_ALL, "");
> @@ -124,20 +201,12 @@ init(
>  	 */
>  	memset(&xmount, 0, sizeof(struct xfs_mount));
>  	libxfs_buftarg_init(&xmount, x.ddev, x.logdev, x.rtdev);
> -	bp = libxfs_readbuf(xmount.m_ddev_targp, XFS_SB_DADDR,
> -			    1 << (XFS_MAX_SECTORSIZE_LOG - BBSHIFT), 0, NULL);
> -
> -	if (!bp || bp->b_error) {
> +	if (!load_sb(&xmount, 0, &xmount.m_sb)) {
>  		fprintf(stderr, _("%s: %s is invalid (cannot read first 512 "
>  			"bytes)\n"), progname, fsdevice);
>  		exit(1);
>  	}
>  
> -	/* copy SB from buffer to in-core, converting architecture as we go */
> -	libxfs_sb_from_disk(&xmount.m_sb, XFS_BUF_TO_SBP(bp));
> -	libxfs_putbuf(bp);
> -	libxfs_purgebuf(bp);
> -
>  	sbp = &xmount.m_sb;
>  	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
>  		fprintf(stderr, _("%s: %s is not a valid XFS filesystem (unexpected SB magic number 0x%08x)\n"),
> @@ -148,6 +217,8 @@ init(
>  		}
>  	}
>  
> +	sanitize_geometry(&xmount, sbp);
> +
>  	mp = libxfs_mount(&xmount, sbp, x.ddev, x.logdev, x.rtdev,
>  			  LIBXFS_MOUNT_DEBUGGER);
>  	if (!mp) {
> diff --git a/repair/sb.c b/repair/sb.c
> index 1b352e8..e108613 100644
> --- a/repair/sb.c
> +++ b/repair/sb.c
> @@ -398,11 +398,8 @@ verify_sb(char *sb_buf, xfs_sb_t *sb, int is_primary_sb)
>  	/* sanity check ag count, size fields against data size field */
>  
>  	if (sb->sb_dblocks == 0 ||
> -		sb->sb_dblocks >
> -			((__uint64_t)sb->sb_agcount * sb->sb_agblocks) ||
> -		sb->sb_dblocks <
> -			((__uint64_t)(sb->sb_agcount - 1) * sb->sb_agblocks
> -			+ XFS_MIN_AG_BLOCKS))
> +		sb->sb_dblocks > XFS_MAX_DBLOCKS(sb) ||
> +		sb->sb_dblocks < XFS_MIN_DBLOCKS(sb))
>  		return(XR_BAD_FS_SIZE_DATA);
>  
>  	if (sb->sb_agblklog != (__uint8_t)libxfs_log2_roundup(sb->sb_agblocks))
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Jan. 10, 2017, 9:27 p.m. UTC | #2
On Tue, Jan 10, 2017 at 02:52:00PM -0600, Eric Sandeen wrote:
> 
> On 1/10/17 1:42 PM, Darrick J. Wong wrote:
> > xfs_db doesn't check the filesystem geometry when it's mounting, which
> > means that garbage agcount values can cause OOMs when we try to allocate
> > all the per-AG incore metadata.  If we see geometry that looks
> > suspicious, try to derive the actual AG geometry to avoid crashing the
> > system.  This should help with xfs/1301 fuzzing.
> > 
> > Also fix up xfs_repair to use the min/max dblocks macros.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  db/init.c   |   91 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
> >  repair/sb.c |    7 +----
> 
> The repair change is an unrelated no-op cleanup, yes?  Worth doing,
> but in a second patch.

<nod> Was going to do that for the real patch, but for RFC I just
slapped them together.

> Ok, so now we do a whole bunch of tests and behind-your-back
> fixups, but regardless of the problem we always issue the same
> warning:
> 
> xfs_db: device fsfile AG geometry is insane.  Using agcount=XX
> 
> ... even if agcount wasn't the problem:
> 
> xfs_db> write -c blocksize 32768
> Allowing write of corrupted data and bad CRC
> blocksize = 32768
> xfs_db> quit
> [sandeen@sandeen xfsprogs]$ db/xfs_db -x fsfile 
> xfs_db: device fsfile AG geometry is insane.  Using agcount=4.
> 
> That's confusing.

Yeah, it is.

> The whole point of xfs_db is to debug what is /on disk/ - 
> the above doesn't make it clear that it's not changed the on-disk
> value.  And it doesn't make it clear what was wrong, or what got changed.
> 
> But backing up - the /problem/ here is the OOM, right?  That happens
> for a very large agcount.  We shouldn't be doing any more than that;
> I don't really agree that your sanitize function should be fixing up
> stuff behind the user's back.  If it's broken, it's broken; let the
> xfs_db wielding expert sort that out as long as xfs_db an be used to
> do so.
> 
> But if the OOM means xfs_db can't start, then yeah, that's a problem.
> 
> What about sanitizing only the agcount, and saying something like:
> 
> xfs_db: device fsfile AG count is insane.  Initializing only first $X
> 
> (where $X is based on your heuristics)

I feel like "Initializing" could be misinterpreted as formatting the
disk.  How about:

xfs_db: device fsfile AG count is insane.  Reading only the first $X AGs.

?

--D

> 
> -Eric
> 
> >  2 files changed, 83 insertions(+), 15 deletions(-)
> > 
> > diff --git a/db/init.c b/db/init.c
> > index ec1e274..7e66d14 100644
> > --- a/db/init.c
> > +++ b/db/init.c
> > @@ -51,13 +51,90 @@ usage(void)
> >  	exit(1);
> >  }
> >  
> > +/* Try to load an AG's superblock, no verifiers. */
> > +static bool
> > +load_sb(
> > +	struct xfs_mount	*mp,
> > +	xfs_agnumber_t		agno,
> > +	struct xfs_sb		*sbp)
> > +{
> > +	struct xfs_buf		*bp;
> > +
> > +	bp = libxfs_readbuf(mp->m_ddev_targp,
> > +			    XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
> > +			    1 << (XFS_MAX_SECTORSIZE_LOG - BBSHIFT), 0, NULL);
> > +
> > +	if (!bp || bp->b_error)
> > +		return false;
> > +
> > +	/* copy SB from buffer to in-core, converting architecture as we go */
> > +	libxfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
> > +	libxfs_putbuf(bp);
> > +	libxfs_purgebuf(bp);
> > +
> > +	return true;
> > +}
> > +
> > +/* If the geometry doesn't look sane, try to figure out the real geometry. */
> > +static void
> > +sanitize_geometry(
> > +	struct xfs_mount	*mp,
> > +	struct xfs_sb		*sbp)
> > +{
> > +	struct xfs_sb		sb;
> > +	xfs_agblock_t		agblocks;
> > +
> > +	/* If the geometry looks ok, we're done. */
> > +	if (sbp->sb_blocklog >= XFS_MIN_BLOCKSIZE_LOG &&
> > +	    sbp->sb_blocklog <= XFS_MAX_BLOCKSIZE_LOG &&
> > +	    sbp->sb_blocksize == (1 << sbp->sb_blocklog) &&
> > +	    sbp->sb_dblocks * sbp->sb_blocksize <= x.dsize * x.dbsize &&
> > +	    sbp->sb_dblocks <= XFS_MAX_DBLOCKS(sbp) &&
> > +	    sbp->sb_dblocks >= XFS_MIN_DBLOCKS(sbp))
> > +		return;
> > +
> > +	/* Check blocklog and blocksize */
> > +	if (sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
> > +	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG)
> > +		sbp->sb_blocklog = libxfs_log2_roundup(sbp->sb_blocksize);
> > +	if (sbp->sb_blocksize != (1 << sbp->sb_blocklog))
> > +		sbp->sb_blocksize = (1 << sbp->sb_blocksize);
> > +
> > +	/* Clamp dblocks to the size of the device. */
> > +	if (sbp->sb_dblocks > x.dsize * x.dbsize / sbp->sb_blocksize)
> > +		sbp->sb_dblocks = x.dsize * x.dbsize / sbp->sb_blocksize;
> > +
> > +	/* See if agblocks helps us find a superblock. */
> > +	mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
> > +	if (load_sb(mp, 1, &sb) && sb.sb_magicnum == XFS_SB_MAGIC) {
> > +		sbp->sb_agcount = sbp->sb_dblocks / sbp->sb_agblocks;
> > +		goto out;
> > +	}
> > +
> > +	/* See if agcount helps us find a superblock. */
> > +	agblocks = sbp->sb_agblocks;
> > +	sbp->sb_agblocks = sbp->sb_dblocks / sbp->sb_agcount;
> > +	if (sbp->sb_agblocks != 0 &&
> > +	    load_sb(mp, 1, &sb) &&
> > +	    sb.sb_magicnum == XFS_SB_MAGIC) {
> > +		goto out;
> > +	}
> > +
> > +	/* Both are nuts, assume 1 AG. */
> > +	sbp->sb_agblocks = agblocks;
> > +	sbp->sb_agcount = 1;
> > +out:
> > +	fprintf(stderr,
> > +		_("%s: device %s AG geometry is insane.  Using agcount=%u.\n"),
> > +		progname, fsdevice, sbp->sb_agcount);
> > +}
> > +
> >  void
> >  init(
> >  	int		argc,
> >  	char		**argv)
> >  {
> >  	struct xfs_sb	*sbp;
> > -	struct xfs_buf	*bp;
> >  	int		c;
> >  
> >  	setlocale(LC_ALL, "");
> > @@ -124,20 +201,12 @@ init(
> >  	 */
> >  	memset(&xmount, 0, sizeof(struct xfs_mount));
> >  	libxfs_buftarg_init(&xmount, x.ddev, x.logdev, x.rtdev);
> > -	bp = libxfs_readbuf(xmount.m_ddev_targp, XFS_SB_DADDR,
> > -			    1 << (XFS_MAX_SECTORSIZE_LOG - BBSHIFT), 0, NULL);
> > -
> > -	if (!bp || bp->b_error) {
> > +	if (!load_sb(&xmount, 0, &xmount.m_sb)) {
> >  		fprintf(stderr, _("%s: %s is invalid (cannot read first 512 "
> >  			"bytes)\n"), progname, fsdevice);
> >  		exit(1);
> >  	}
> >  
> > -	/* copy SB from buffer to in-core, converting architecture as we go */
> > -	libxfs_sb_from_disk(&xmount.m_sb, XFS_BUF_TO_SBP(bp));
> > -	libxfs_putbuf(bp);
> > -	libxfs_purgebuf(bp);
> > -
> >  	sbp = &xmount.m_sb;
> >  	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
> >  		fprintf(stderr, _("%s: %s is not a valid XFS filesystem (unexpected SB magic number 0x%08x)\n"),
> > @@ -148,6 +217,8 @@ init(
> >  		}
> >  	}
> >  
> > +	sanitize_geometry(&xmount, sbp);
> > +
> >  	mp = libxfs_mount(&xmount, sbp, x.ddev, x.logdev, x.rtdev,
> >  			  LIBXFS_MOUNT_DEBUGGER);
> >  	if (!mp) {
> > diff --git a/repair/sb.c b/repair/sb.c
> > index 1b352e8..e108613 100644
> > --- a/repair/sb.c
> > +++ b/repair/sb.c
> > @@ -398,11 +398,8 @@ verify_sb(char *sb_buf, xfs_sb_t *sb, int is_primary_sb)
> >  	/* sanity check ag count, size fields against data size field */
> >  
> >  	if (sb->sb_dblocks == 0 ||
> > -		sb->sb_dblocks >
> > -			((__uint64_t)sb->sb_agcount * sb->sb_agblocks) ||
> > -		sb->sb_dblocks <
> > -			((__uint64_t)(sb->sb_agcount - 1) * sb->sb_agblocks
> > -			+ XFS_MIN_AG_BLOCKS))
> > +		sb->sb_dblocks > XFS_MAX_DBLOCKS(sb) ||
> > +		sb->sb_dblocks < XFS_MIN_DBLOCKS(sb))
> >  		return(XR_BAD_FS_SIZE_DATA);
> >  
> >  	if (sb->sb_agblklog != (__uint8_t)libxfs_log2_roundup(sb->sb_agblocks))
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Sandeen Jan. 10, 2017, 9:34 p.m. UTC | #3
On 1/10/17 3:27 PM, Darrick J. Wong wrote:
>> What about sanitizing only the agcount, and saying something like:
>>
>> xfs_db: device fsfile AG count is insane.  Initializing only first $X
>>
>> (where $X is based on your heuristics)
> I feel like "Initializing" could be misinterpreted as formatting the
> disk.  How about:
> 
> xfs_db: device fsfile AG count is insane.  Reading only the first $X AGs.

Sure.  (I'm not sure what the implications are - it's more than just
"Oh, we didn't read it" right?  It'll have implications for use if
more AGs really exist?)

-Eric

> ?
> 
> --D
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Jan. 10, 2017, 10:29 p.m. UTC | #4
On Tue, Jan 10, 2017 at 03:34:05PM -0600, Eric Sandeen wrote:
> On 1/10/17 3:27 PM, Darrick J. Wong wrote:
> >> What about sanitizing only the agcount, and saying something like:
> >>
> >> xfs_db: device fsfile AG count is insane.  Initializing only first $X
> >>
> >> (where $X is based on your heuristics)
> > I feel like "Initializing" could be misinterpreted as formatting the
> > disk.  How about:
> > 
> > xfs_db: device fsfile AG count is insane.  Reading only the first $X AGs.
> 
> Sure.  (I'm not sure what the implications are - it's more than just
> "Oh, we didn't read it" right?  It'll have implications for use if
> more AGs really exist?)

I don't think there are.  AFAICT we don't ever write the m_sb values
back to disk; the only commands capable of writing (blocktrash, write,
fuzz) deal with buffers directly.  There's a side effect that things
like metadump will only dump the first $X AGs worth of metadata, but
right now it'll simply OOM and crash.

--D

> 
> -Eric
> 
> > ?
> > 
> > --D
> > 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Jan. 11, 2017, 8:34 a.m. UTC | #5
On Tue, Jan 10, 2017 at 9:42 PM, Darrick J. Wong
<darrick.wong@oracle.com> wrote:
> xfs_db doesn't check the filesystem geometry when it's mounting, which
> means that garbage agcount values can cause OOMs when we try to allocate
> all the per-AG incore metadata.  If we see geometry that looks
> suspicious, try to derive the actual AG geometry to avoid crashing the
> system.  This should help with xfs/1301 fuzzing.
>
> Also fix up xfs_repair to use the min/max dblocks macros.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>

Test machine is back to health with this patch, but some test are failing due
to the new error messages.
I guess its no surprise to you.


xfs/1300 5s ... 5s
xfs/1301         - output mismatch (see
/home/amir/src/xfstests-dev/results//xfs/1301.out.bad)
    --- tests/xfs/1301.out      2017-01-08 15:35:07.647897359 +0200
    +++ /home/amir/src/xfstests-dev/results//xfs/1301.out.bad
2017-01-11 09:58:10.981678272 +0200
    @@ -1,4 +1,61 @@
     QA output created by 1301
     Format and populate
     Fuzz superblock
    +xfs_db: device /dev/mapper/storage-scratch AG geometry is insane.
Using agcount=4.
    +SB sanity check failed
    +Metadata corruption detected at xfs_sb block 0x0/0x200
    +xfs_db: device /dev/mapper/storage-scratch AG geometry is insane.
Using agcount=4.
    ...
    (Run 'diff -u tests/xfs/1301.out
/home/amir/src/xfstests-dev/results//xfs/1301.out.bad'  to see the
entire diff)
_check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
inconsistent (c) (see
/home/amir/src/xfstests-dev/results//xfs/1301.full)
_check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
inconsistent (r) (see
/home/amir/src/xfstests-dev/results//xfs/1301.full)
xfs/1302         - output mismatch (see
/home/amir/src/xfstests-dev/results//xfs/1302.out.bad)
    --- tests/xfs/1302.out      2017-01-08 15:35:07.647897359 +0200
    +++ /home/amir/src/xfstests-dev/results//xfs/1302.out.bad
2017-01-11 10:05:16.710031113 +0200
    @@ -1,4 +1,26 @@
     QA output created by 1302
     Format and populate
     Fuzz AGF
    +Metadata corruption detected at xfs_agf block 0x1/0x200
    +xfs_db: cannot init perag data (117). Continuing anyway.
    +Metadata corruption detected at xfs_agf block 0x1/0x200
    +xfs_db: cannot init perag data (117). Continuing anyway.
    ...
    (Run 'diff -u tests/xfs/1302.out
/home/amir/src/xfstests-dev/results//xfs/1302.out.bad'  to see the
entire diff)
_check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
inconsistent (c) (see
/home/amir/src/xfstests-dev/results//xfs/1302.full)
xfs/1303 132s ... 130s
_check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
inconsistent (c) (see
/home/amir/src/xfstests-dev/results//xfs/1303.full)
_check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
inconsistent (r) (see
/home/amir/src/xfstests-dev/results//xfs/1303.full)
xfs/1304         - output mismatch (see
/home/amir/src/xfstests-dev/results//xfs/1304.out.bad)
    --- tests/xfs/1304.out      2017-01-08 15:35:07.647897359 +0200
    +++ /home/amir/src/xfstests-dev/results//xfs/1304.out.bad
2017-01-11 10:12:26.506167776 +0200
    @@ -1,4 +1,12 @@
     QA output created by 1304
     Format and populate
     Fuzz AGI
    +Metadata corruption detected at xfs_agi block 0x2/0x200
    +xfs_db: cannot init perag data (117). Continuing anyway.
    +Metadata corruption detected at xfs_agi block 0x2/0x200
    +xfs_db: cannot init perag data (117). Continuing anyway.
    ...
    (Run 'diff -u tests/xfs/1304.out
/home/amir/src/xfstests-dev/results//xfs/1304.out.bad'  to see the
entire diff)
_check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
inconsistent (c) (see
/home/amir/src/xfstests-dev/results//xfs/1304.full)
_check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
inconsistent (r) (see
/home/amir/src/xfstests-dev/results//xfs/1304.full)
xfs/1305 224s ... 218s
_check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
inconsistent (c) (see
/home/amir/src/xfstests-dev/results//xfs/1305.full)
_check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
inconsistent (r) (see
/home/amir/src/xfstests-dev/results//xfs/1305.full)
xfs/1306 239s ... 234s
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Jan. 11, 2017, 10:01 a.m. UTC | #6
On Wed, Jan 11, 2017 at 10:34 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> On Tue, Jan 10, 2017 at 9:42 PM, Darrick J. Wong
> <darrick.wong@oracle.com> wrote:
>> xfs_db doesn't check the filesystem geometry when it's mounting, which
>> means that garbage agcount values can cause OOMs when we try to allocate
>> all the per-AG incore metadata.  If we see geometry that looks
>> suspicious, try to derive the actual AG geometry to avoid crashing the
>> system.  This should help with xfs/1301 fuzzing.
>>
>> Also fix up xfs_repair to use the min/max dblocks macros.
>>
>> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
>
> Test machine is back to health with this patch, but some test are failing due
> to the new error messages.
> I guess its no surprise to you.
>
>
> xfs/1300 5s ... 5s
> xfs/1301         - output mismatch (see
> /home/amir/src/xfstests-dev/results//xfs/1301.out.bad)
>     --- tests/xfs/1301.out      2017-01-08 15:35:07.647897359 +0200
>     +++ /home/amir/src/xfstests-dev/results//xfs/1301.out.bad
> 2017-01-11 09:58:10.981678272 +0200
>     @@ -1,4 +1,61 @@
>      QA output created by 1301
>      Format and populate
>      Fuzz superblock
>     +xfs_db: device /dev/mapper/storage-scratch AG geometry is insane.
> Using agcount=4.
>     +SB sanity check failed
>     +Metadata corruption detected at xfs_sb block 0x0/0x200
>     +xfs_db: device /dev/mapper/storage-scratch AG geometry is insane.
> Using agcount=4.
>     ...
>     (Run 'diff -u tests/xfs/1301.out
> /home/amir/src/xfstests-dev/results//xfs/1301.out.bad'  to see the
> entire diff)
> _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> inconsistent (c) (see
> /home/amir/src/xfstests-dev/results//xfs/1301.full)
> _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> inconsistent (r) (see
> /home/amir/src/xfstests-dev/results//xfs/1301.full)
> xfs/1302         - output mismatch (see
> /home/amir/src/xfstests-dev/results//xfs/1302.out.bad)
>     --- tests/xfs/1302.out      2017-01-08 15:35:07.647897359 +0200
>     +++ /home/amir/src/xfstests-dev/results//xfs/1302.out.bad
> 2017-01-11 10:05:16.710031113 +0200
>     @@ -1,4 +1,26 @@
>      QA output created by 1302
>      Format and populate
>      Fuzz AGF
>     +Metadata corruption detected at xfs_agf block 0x1/0x200
>     +xfs_db: cannot init perag data (117). Continuing anyway.
>     +Metadata corruption detected at xfs_agf block 0x1/0x200
>     +xfs_db: cannot init perag data (117). Continuing anyway.
>     ...
>     (Run 'diff -u tests/xfs/1302.out
> /home/amir/src/xfstests-dev/results//xfs/1302.out.bad'  to see the
> entire diff)
> _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> inconsistent (c) (see
> /home/amir/src/xfstests-dev/results//xfs/1302.full)
> xfs/1303 132s ... 130s
> _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> inconsistent (c) (see
> /home/amir/src/xfstests-dev/results//xfs/1303.full)
> _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> inconsistent (r) (see
> /home/amir/src/xfstests-dev/results//xfs/1303.full)
> xfs/1304         - output mismatch (see
> /home/amir/src/xfstests-dev/results//xfs/1304.out.bad)
>     --- tests/xfs/1304.out      2017-01-08 15:35:07.647897359 +0200
>     +++ /home/amir/src/xfstests-dev/results//xfs/1304.out.bad
> 2017-01-11 10:12:26.506167776 +0200
>     @@ -1,4 +1,12 @@
>      QA output created by 1304
>      Format and populate
>      Fuzz AGI
>     +Metadata corruption detected at xfs_agi block 0x2/0x200
>     +xfs_db: cannot init perag data (117). Continuing anyway.
>     +Metadata corruption detected at xfs_agi block 0x2/0x200
>     +xfs_db: cannot init perag data (117). Continuing anyway.
>     ...
>     (Run 'diff -u tests/xfs/1304.out
> /home/amir/src/xfstests-dev/results//xfs/1304.out.bad'  to see the
> entire diff)
> _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> inconsistent (c) (see
> /home/amir/src/xfstests-dev/results//xfs/1304.full)
> _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> inconsistent (r) (see
> /home/amir/src/xfstests-dev/results//xfs/1304.full)
> xfs/1305 224s ... 218s
> _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> inconsistent (c) (see
> /home/amir/src/xfstests-dev/results//xfs/1305.full)
> _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> inconsistent (r) (see
> /home/amir/src/xfstests-dev/results//xfs/1305.full)
> xfs/1306 239s ... 234s

Now I am hitting these xfs_db crashes during xfs/1316, which are apparently not
related to OOM killer. I have seen them last run as well but dmesg is quiet now.

xfs/1316        *** Error in `/usr/sbin/xfs_db': free(): invalid
pointer: 0x00007f9dbf036b78 ***
======= Backtrace: =========
/lib/x86_64-linux-gnu/libc.so.6(+0x77725)[0x7f9dbecea725]
/lib/x86_64-linux-gnu/libc.so.6(+0x7ff4a)[0x7f9dbecf2f4a]
/lib/x86_64-linux-gnu/libc.so.6(cfree+0x4c)[0x7f9dbecf6abc]
/usr/sbin/xfs_db[0x414961]
/usr/sbin/xfs_db[0x4154de]
/usr/sbin/xfs_db[0x420d38]
/usr/sbin/xfs_db[0x420926]
/usr/sbin/xfs_db[0x405125]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0)[0x7f9dbec93830]
/usr/sbin/xfs_db[0x405179]
======= Memory map: ========
00400000-0049e000 r-xp 00000000 08:01 15995842
  /usr/sbin/xfs_db
0069d000-0069e000 r--p 0009d000 08:01 15995842
  /usr/sbin/xfs_db
0069e000-006a1000 rw-p 0009e000 08:01 15995842
  /usr/sbin/xfs_db
006a1000-006b0000 rw-p 00000000 00:00 0
0119e000-011e0000 rw-p 00000000 00:00 0                                  [heap]
7f9db8000000-7f9db8021000 rw-p 00000000 00:00 0
7f9db8021000-7f9dbc000000 ---p 00000000 00:00 0
7f9dbea5d000-7f9dbea73000 r-xp 00000000 08:01 6033829
  /lib/x86_64-linux-gnu/libgcc_s.so.1
7f9dbea73000-7f9dbec72000 ---p 00016000 08:01 6033829
  /lib/x86_64-linux-gnu/libgcc_s.so.1
7f9dbec72000-7f9dbec73000 rw-p 00015000 08:01 6033829
  /lib/x86_64-linux-gnu/libgcc_s.so.1
7f9dbec73000-7f9dbee33000 r-xp 00000000 08:01 6033791
  /lib/x86_64-linux-gnu/libc-2.23.so
7f9dbee33000-7f9dbf032000 ---p 001c0000 08:01 6033791
  /lib/x86_64-linux-gnu/libc-2.23.so
7f9dbf032000-7f9dbf036000 r--p 001bf000 08:01 6033791
  /lib/x86_64-linux-gnu/libc-2.23.so
7f9dbf036000-7f9dbf038000 rw-p 001c3000 08:01 6033791
  /lib/x86_64-linux-gnu/libc-2.23.so
7f9dbf038000-7f9dbf03c000 rw-p 00000000 00:00 0
7f9dbf03c000-7f9dbf054000 r-xp 00000000 08:01 6033937
  /lib/x86_64-linux-gnu/libpthread-2.23.so
7f9dbf054000-7f9dbf253000 ---p 00018000 08:01 6033937
  /lib/x86_64-linux-gnu/libpthread-2.23.so
7f9dbf253000-7f9dbf254000 r--p 00017000 08:01 6033937
  /lib/x86_64-linux-gnu/libpthread-2.23.so
7f9dbf254000-7f9dbf255000 rw-p 00018000 08:01 6033937
  /lib/x86_64-linux-gnu/libpthread-2.23.so
7f9dbf255000-7f9dbf259000 rw-p 00000000 00:00 0
7f9dbf259000-7f9dbf25d000 r-xp 00000000 08:01 6033975
  /lib/x86_64-linux-gnu/libuuid.so.1.3.0
7f9dbf25d000-7f9dbf45c000 ---p 00004000 08:01 6033975
  /lib/x86_64-linux-gnu/libuuid.so.1.3.0
7f9dbf45c000-7f9dbf45d000 r--p 00003000 08:01 6033975
  /lib/x86_64-linux-gnu/libuuid.so.1.3.0
7f9dbf45d000-7f9dbf45e000 rw-p 00004000 08:01 6033975
  /lib/x86_64-linux-gnu/libuuid.so.1.3.0
7f9dbf45e000-7f9dbf484000 r-xp 00000000 08:01 6033763
  /lib/x86_64-linux-gnu/ld-2.23.so
7f9dbf667000-7f9dbf66b000 rw-p 00000000 00:00 0
7f9dbf680000-7f9dbf683000 rw-p 00000000 00:00 0
7f9dbf683000-7f9dbf684000 r--p 00025000 08:01 6033763
  /lib/x86_64-linux-gnu/ld-2.23.so
7f9dbf684000-7f9dbf685000 rw-p 00026000 08:01 6033763
  /lib/x86_64-linux-gnu/ld-2.23.so
7f9dbf685000-7f9dbf686000 rw-p 00000000 00:00 0
7ffdde2cb000-7ffdde2ed000 rw-p 00000000 00:00 0                          [stack]
7ffdde366000-7ffdde368000 r--p 00000000 00:00 0                          [vvar]
7ffdde368000-7ffdde36a000 r-xp 00000000 00:00 0                          [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0
  [vsyscall]
*** Error in `/usr/sbin/xfs_db': free(): invalid pointer: 0x00007f202b108b78 ***

...

 - output mismatch (see /home/amir/src/xfstests-dev/results//xfs/1316.out.bad)
    --- tests/xfs/1316.out      2017-01-08 15:35:07.647897359 +0200
    +++ /home/amir/src/xfstests-dev/results//xfs/1316.out.bad
2017-01-11 11:56:06.156948852 +0200
    @@ -2,4 +2,20 @@
     Format and populate
     Find bmbt block
     Fuzz bmbt
    +./common/xfs: line 157: 19209 Aborted                 (core
dumped) $XFS_DB_PROG "$@" $(_scratch_xfs_db_options)
    +./common/xfs: line 157: 19219 Aborted                 (core
dumped) $XFS_DB_PROG "$@" $(_scratch_xfs_db_options)
    +./common/xfs: line 157: 19256 Aborted                 (core
dumped) $XFS_DB_PROG "$@" $(_scratch_xfs_db_options)
    +./common/xfs: line 157: 19264 Aborted                 (core
dumped) $XFS_DB_PROG "$@" $(_scratch_xfs_db_options)
    ...
    (Run 'diff -u tests/xfs/1316.out
/home/amir/src/xfstests-dev/results//xfs/1316.out.bad'  to see the
entire diff)
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Sandeen Jan. 11, 2017, 2:09 p.m. UTC | #7
On 1/11/17 4:01 AM, Amir Goldstein wrote:
> Now I am hitting these xfs_db crashes during xfs/1316, which are apparently not
> related to OOM killer. I have seen them last run as well but dmesg is quiet now.

Please install debug bits and get a proper coredump, or try valgrind,
to get a real backtrace here so we can see what's going on...

Thanks,
-Eric
 
> xfs/1316        *** Error in `/usr/sbin/xfs_db': free(): invalid
> pointer: 0x00007f9dbf036b78 ***
> ======= Backtrace: =========
> /lib/x86_64-linux-gnu/libc.so.6(+0x77725)[0x7f9dbecea725]
> /lib/x86_64-linux-gnu/libc.so.6(+0x7ff4a)[0x7f9dbecf2f4a]
> /lib/x86_64-linux-gnu/libc.so.6(cfree+0x4c)[0x7f9dbecf6abc]
> /usr/sbin/xfs_db[0x414961]
> /usr/sbin/xfs_db[0x4154de]
> /usr/sbin/xfs_db[0x420d38]
> /usr/sbin/xfs_db[0x420926]
> /usr/sbin/xfs_db[0x405125]
> /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0)[0x7f9dbec93830]
> /usr/sbin/xfs_db[0x405179]
> ======= Memory map: ========
> 00400000-0049e000 r-xp 00000000 08:01 15995842
>   /usr/sbin/xfs_db
> 0069d000-0069e000 r--p 0009d000 08:01 15995842
>   /usr/sbin/xfs_db
> 0069e000-006a1000 rw-p 0009e000 08:01 15995842
>   /usr/sbin/xfs_db
> 006a1000-006b0000 rw-p 00000000 00:00 0
> 0119e000-011e0000 rw-p 00000000 00:00 0                                  [heap]
> 7f9db8000000-7f9db8021000 rw-p 00000000 00:00 0
> 7f9db8021000-7f9dbc000000 ---p 00000000 00:00 0
> 7f9dbea5d000-7f9dbea73000 r-xp 00000000 08:01 6033829
>   /lib/x86_64-linux-gnu/libgcc_s.so.1
> 7f9dbea73000-7f9dbec72000 ---p 00016000 08:01 6033829
>   /lib/x86_64-linux-gnu/libgcc_s.so.1
> 7f9dbec72000-7f9dbec73000 rw-p 00015000 08:01 6033829
>   /lib/x86_64-linux-gnu/libgcc_s.so.1
> 7f9dbec73000-7f9dbee33000 r-xp 00000000 08:01 6033791
>   /lib/x86_64-linux-gnu/libc-2.23.so
> 7f9dbee33000-7f9dbf032000 ---p 001c0000 08:01 6033791
>   /lib/x86_64-linux-gnu/libc-2.23.so
> 7f9dbf032000-7f9dbf036000 r--p 001bf000 08:01 6033791
>   /lib/x86_64-linux-gnu/libc-2.23.so
> 7f9dbf036000-7f9dbf038000 rw-p 001c3000 08:01 6033791
>   /lib/x86_64-linux-gnu/libc-2.23.so
> 7f9dbf038000-7f9dbf03c000 rw-p 00000000 00:00 0
> 7f9dbf03c000-7f9dbf054000 r-xp 00000000 08:01 6033937
>   /lib/x86_64-linux-gnu/libpthread-2.23.so
> 7f9dbf054000-7f9dbf253000 ---p 00018000 08:01 6033937
>   /lib/x86_64-linux-gnu/libpthread-2.23.so
> 7f9dbf253000-7f9dbf254000 r--p 00017000 08:01 6033937
>   /lib/x86_64-linux-gnu/libpthread-2.23.so
> 7f9dbf254000-7f9dbf255000 rw-p 00018000 08:01 6033937
>   /lib/x86_64-linux-gnu/libpthread-2.23.so
> 7f9dbf255000-7f9dbf259000 rw-p 00000000 00:00 0
> 7f9dbf259000-7f9dbf25d000 r-xp 00000000 08:01 6033975
>   /lib/x86_64-linux-gnu/libuuid.so.1.3.0
> 7f9dbf25d000-7f9dbf45c000 ---p 00004000 08:01 6033975
>   /lib/x86_64-linux-gnu/libuuid.so.1.3.0
> 7f9dbf45c000-7f9dbf45d000 r--p 00003000 08:01 6033975
>   /lib/x86_64-linux-gnu/libuuid.so.1.3.0
> 7f9dbf45d000-7f9dbf45e000 rw-p 00004000 08:01 6033975
>   /lib/x86_64-linux-gnu/libuuid.so.1.3.0
> 7f9dbf45e000-7f9dbf484000 r-xp 00000000 08:01 6033763
>   /lib/x86_64-linux-gnu/ld-2.23.so
> 7f9dbf667000-7f9dbf66b000 rw-p 00000000 00:00 0
> 7f9dbf680000-7f9dbf683000 rw-p 00000000 00:00 0
> 7f9dbf683000-7f9dbf684000 r--p 00025000 08:01 6033763
>   /lib/x86_64-linux-gnu/ld-2.23.so
> 7f9dbf684000-7f9dbf685000 rw-p 00026000 08:01 6033763
>   /lib/x86_64-linux-gnu/ld-2.23.so
> 7f9dbf685000-7f9dbf686000 rw-p 00000000 00:00 0
> 7ffdde2cb000-7ffdde2ed000 rw-p 00000000 00:00 0                          [stack]
> 7ffdde366000-7ffdde368000 r--p 00000000 00:00 0                          [vvar]
> 7ffdde368000-7ffdde36a000 r-xp 00000000 00:00 0                          [vdso]
> ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0
>   [vsyscall]
> *** Error in `/usr/sbin/xfs_db': free(): invalid pointer: 0x00007f202b108b78 ***
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Jan. 11, 2017, 2:28 p.m. UTC | #8
On Wed, Jan 11, 2017 at 4:09 PM, Eric Sandeen <sandeen@sandeen.net> wrote:
> On 1/11/17 4:01 AM, Amir Goldstein wrote:
>> Now I am hitting these xfs_db crashes during xfs/1316, which are apparently not
>> related to OOM killer. I have seen them last run as well but dmesg is quiet now.
>
> Please install debug bits and get a proper coredump, or try valgrind,
> to get a real backtrace here so we can see what's going on...
>

Since this is the second run where I am getting the same core dumps
(and they happen many times during xfs/1316 I doubt that this is not
reproducible on any system.

Anyway, scrub tests are still running and will be running for a while,
so I can re-run 1316 tomorrow to get the coredump.
Is there a standard configure/make way to install the xfsprogs debug binaries?
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Sandeen Jan. 11, 2017, 2:51 p.m. UTC | #9
On 1/11/17 8:28 AM, Amir Goldstein wrote:
> Is there a standard configure/make way to install the xfsprogs debug binaries?

I almost always go through packaging, even on my test systems, and
then install the -debuginfo rpms.

I /think/ a make install will keep the (default, AFAIK) debug binaries
intact too.

For obtaining glibc debug info, it'd be distro-specific ...

-Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Jan. 11, 2017, 5:32 p.m. UTC | #10
On Wed, Jan 11, 2017 at 12:01:22PM +0200, Amir Goldstein wrote:
> On Wed, Jan 11, 2017 at 10:34 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> > On Tue, Jan 10, 2017 at 9:42 PM, Darrick J. Wong
> > <darrick.wong@oracle.com> wrote:
> >> xfs_db doesn't check the filesystem geometry when it's mounting, which
> >> means that garbage agcount values can cause OOMs when we try to allocate
> >> all the per-AG incore metadata.  If we see geometry that looks
> >> suspicious, try to derive the actual AG geometry to avoid crashing the
> >> system.  This should help with xfs/1301 fuzzing.
> >>
> >> Also fix up xfs_repair to use the min/max dblocks macros.
> >>
> >> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> >
> > Test machine is back to health with this patch, but some test are failing due
> > to the new error messages.
> > I guess its no surprise to you.
> >
> >
> > xfs/1300 5s ... 5s
> > xfs/1301         - output mismatch (see
> > /home/amir/src/xfstests-dev/results//xfs/1301.out.bad)
> >     --- tests/xfs/1301.out      2017-01-08 15:35:07.647897359 +0200
> >     +++ /home/amir/src/xfstests-dev/results//xfs/1301.out.bad
> > 2017-01-11 09:58:10.981678272 +0200
> >     @@ -1,4 +1,61 @@
> >      QA output created by 1301
> >      Format and populate
> >      Fuzz superblock
> >     +xfs_db: device /dev/mapper/storage-scratch AG geometry is insane.
> > Using agcount=4.
> >     +SB sanity check failed
> >     +Metadata corruption detected at xfs_sb block 0x0/0x200
> >     +xfs_db: device /dev/mapper/storage-scratch AG geometry is insane.
> > Using agcount=4.
> >     ...
> >     (Run 'diff -u tests/xfs/1301.out
> > /home/amir/src/xfstests-dev/results//xfs/1301.out.bad'  to see the

Uh.... this is odd, all that stuff should go into 1301.full.

> > entire diff)
> > _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> > inconsistent (c) (see
> > /home/amir/src/xfstests-dev/results//xfs/1301.full)
> > _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> > inconsistent (r) (see
> > /home/amir/src/xfstests-dev/results//xfs/1301.full)
> > xfs/1302         - output mismatch (see
> > /home/amir/src/xfstests-dev/results//xfs/1302.out.bad)
> >     --- tests/xfs/1302.out      2017-01-08 15:35:07.647897359 +0200
> >     +++ /home/amir/src/xfstests-dev/results//xfs/1302.out.bad
> > 2017-01-11 10:05:16.710031113 +0200
> >     @@ -1,4 +1,26 @@
> >      QA output created by 1302
> >      Format and populate
> >      Fuzz AGF
> >     +Metadata corruption detected at xfs_agf block 0x1/0x200
> >     +xfs_db: cannot init perag data (117). Continuing anyway.
> >     +Metadata corruption detected at xfs_agf block 0x1/0x200
> >     +xfs_db: cannot init perag data (117). Continuing anyway.

I just ran 1302, all the output goes into 1302.full.

Now I wonder what's different with your setup than mine?

> >     ...
> >     (Run 'diff -u tests/xfs/1302.out
> > /home/amir/src/xfstests-dev/results//xfs/1302.out.bad'  to see the
> > entire diff)
> > _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> > inconsistent (c) (see
> > /home/amir/src/xfstests-dev/results//xfs/1302.full)
> > xfs/1303 132s ... 130s
> > _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> > inconsistent (c) (see
> > /home/amir/src/xfstests-dev/results//xfs/1303.full)
> > _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> > inconsistent (r) (see
> > /home/amir/src/xfstests-dev/results//xfs/1303.full)
> > xfs/1304         - output mismatch (see
> > /home/amir/src/xfstests-dev/results//xfs/1304.out.bad)
> >     --- tests/xfs/1304.out      2017-01-08 15:35:07.647897359 +0200
> >     +++ /home/amir/src/xfstests-dev/results//xfs/1304.out.bad
> > 2017-01-11 10:12:26.506167776 +0200
> >     @@ -1,4 +1,12 @@
> >      QA output created by 1304
> >      Format and populate
> >      Fuzz AGI
> >     +Metadata corruption detected at xfs_agi block 0x2/0x200
> >     +xfs_db: cannot init perag data (117). Continuing anyway.
> >     +Metadata corruption detected at xfs_agi block 0x2/0x200
> >     +xfs_db: cannot init perag data (117). Continuing anyway.
> >     ...
> >     (Run 'diff -u tests/xfs/1304.out
> > /home/amir/src/xfstests-dev/results//xfs/1304.out.bad'  to see the
> > entire diff)
> > _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> > inconsistent (c) (see
> > /home/amir/src/xfstests-dev/results//xfs/1304.full)
> > _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> > inconsistent (r) (see
> > /home/amir/src/xfstests-dev/results//xfs/1304.full)
> > xfs/1305 224s ... 218s
> > _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> > inconsistent (c) (see
> > /home/amir/src/xfstests-dev/results//xfs/1305.full)
> > _check_xfs_filesystem: filesystem on /dev/mapper/storage-scratch is
> > inconsistent (r) (see
> > /home/amir/src/xfstests-dev/results//xfs/1305.full)
> > xfs/1306 239s ... 234s
> 
> Now I am hitting these xfs_db crashes during xfs/1316, which are apparently not
> related to OOM killer. I have seen them last run as well but dmesg is quiet now.
> 
> xfs/1316        *** Error in `/usr/sbin/xfs_db': free(): invalid
> pointer: 0x00007f9dbf036b78 ***
> ======= Backtrace: =========
> /lib/x86_64-linux-gnu/libc.so.6(+0x77725)[0x7f9dbecea725]
> /lib/x86_64-linux-gnu/libc.so.6(+0x7ff4a)[0x7f9dbecf2f4a]
> /lib/x86_64-linux-gnu/libc.so.6(cfree+0x4c)[0x7f9dbecf6abc]
> /usr/sbin/xfs_db[0x414961]
> /usr/sbin/xfs_db[0x4154de]
> /usr/sbin/xfs_db[0x420d38]
> /usr/sbin/xfs_db[0x420926]
> /usr/sbin/xfs_db[0x405125]
> /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0)[0x7f9dbec93830]
> /usr/sbin/xfs_db[0x405179]

Ok well I definitely don't see /this/ happening.  I gather you built
xfsprogs with the insane geometry patch; if so, against what git commit?
And, did the binary get installed as /usr/sbin/xfs_db, or is this just
the system xfs_db?

In any case, all that extra output is supposed to end up in
$seqres.full, not on stdout.  At most you should see admonishments about
scrub or repair failing to detect/fix things; those messages look like:

"offline repair failed (4) with $field = $fuzzverb"

--D

> ======= Memory map: ========
> 00400000-0049e000 r-xp 00000000 08:01 15995842
>   /usr/sbin/xfs_db
> 0069d000-0069e000 r--p 0009d000 08:01 15995842
>   /usr/sbin/xfs_db
> 0069e000-006a1000 rw-p 0009e000 08:01 15995842
>   /usr/sbin/xfs_db
> 006a1000-006b0000 rw-p 00000000 00:00 0
> 0119e000-011e0000 rw-p 00000000 00:00 0                                  [heap]
> 7f9db8000000-7f9db8021000 rw-p 00000000 00:00 0
> 7f9db8021000-7f9dbc000000 ---p 00000000 00:00 0
> 7f9dbea5d000-7f9dbea73000 r-xp 00000000 08:01 6033829
>   /lib/x86_64-linux-gnu/libgcc_s.so.1
> 7f9dbea73000-7f9dbec72000 ---p 00016000 08:01 6033829
>   /lib/x86_64-linux-gnu/libgcc_s.so.1
> 7f9dbec72000-7f9dbec73000 rw-p 00015000 08:01 6033829
>   /lib/x86_64-linux-gnu/libgcc_s.so.1
> 7f9dbec73000-7f9dbee33000 r-xp 00000000 08:01 6033791
>   /lib/x86_64-linux-gnu/libc-2.23.so
> 7f9dbee33000-7f9dbf032000 ---p 001c0000 08:01 6033791
>   /lib/x86_64-linux-gnu/libc-2.23.so
> 7f9dbf032000-7f9dbf036000 r--p 001bf000 08:01 6033791
>   /lib/x86_64-linux-gnu/libc-2.23.so
> 7f9dbf036000-7f9dbf038000 rw-p 001c3000 08:01 6033791
>   /lib/x86_64-linux-gnu/libc-2.23.so
> 7f9dbf038000-7f9dbf03c000 rw-p 00000000 00:00 0
> 7f9dbf03c000-7f9dbf054000 r-xp 00000000 08:01 6033937
>   /lib/x86_64-linux-gnu/libpthread-2.23.so
> 7f9dbf054000-7f9dbf253000 ---p 00018000 08:01 6033937
>   /lib/x86_64-linux-gnu/libpthread-2.23.so
> 7f9dbf253000-7f9dbf254000 r--p 00017000 08:01 6033937
>   /lib/x86_64-linux-gnu/libpthread-2.23.so
> 7f9dbf254000-7f9dbf255000 rw-p 00018000 08:01 6033937
>   /lib/x86_64-linux-gnu/libpthread-2.23.so
> 7f9dbf255000-7f9dbf259000 rw-p 00000000 00:00 0
> 7f9dbf259000-7f9dbf25d000 r-xp 00000000 08:01 6033975
>   /lib/x86_64-linux-gnu/libuuid.so.1.3.0
> 7f9dbf25d000-7f9dbf45c000 ---p 00004000 08:01 6033975
>   /lib/x86_64-linux-gnu/libuuid.so.1.3.0
> 7f9dbf45c000-7f9dbf45d000 r--p 00003000 08:01 6033975
>   /lib/x86_64-linux-gnu/libuuid.so.1.3.0
> 7f9dbf45d000-7f9dbf45e000 rw-p 00004000 08:01 6033975
>   /lib/x86_64-linux-gnu/libuuid.so.1.3.0
> 7f9dbf45e000-7f9dbf484000 r-xp 00000000 08:01 6033763
>   /lib/x86_64-linux-gnu/ld-2.23.so
> 7f9dbf667000-7f9dbf66b000 rw-p 00000000 00:00 0
> 7f9dbf680000-7f9dbf683000 rw-p 00000000 00:00 0
> 7f9dbf683000-7f9dbf684000 r--p 00025000 08:01 6033763
>   /lib/x86_64-linux-gnu/ld-2.23.so
> 7f9dbf684000-7f9dbf685000 rw-p 00026000 08:01 6033763
>   /lib/x86_64-linux-gnu/ld-2.23.so
> 7f9dbf685000-7f9dbf686000 rw-p 00000000 00:00 0
> 7ffdde2cb000-7ffdde2ed000 rw-p 00000000 00:00 0                          [stack]
> 7ffdde366000-7ffdde368000 r--p 00000000 00:00 0                          [vvar]
> 7ffdde368000-7ffdde36a000 r-xp 00000000 00:00 0                          [vdso]
> ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0
>   [vsyscall]
> *** Error in `/usr/sbin/xfs_db': free(): invalid pointer: 0x00007f202b108b78 ***
> 
> ...
> 
>  - output mismatch (see /home/amir/src/xfstests-dev/results//xfs/1316.out.bad)
>     --- tests/xfs/1316.out      2017-01-08 15:35:07.647897359 +0200
>     +++ /home/amir/src/xfstests-dev/results//xfs/1316.out.bad
> 2017-01-11 11:56:06.156948852 +0200
>     @@ -2,4 +2,20 @@
>      Format and populate
>      Find bmbt block
>      Fuzz bmbt
>     +./common/xfs: line 157: 19209 Aborted                 (core
> dumped) $XFS_DB_PROG "$@" $(_scratch_xfs_db_options)
>     +./common/xfs: line 157: 19219 Aborted                 (core
> dumped) $XFS_DB_PROG "$@" $(_scratch_xfs_db_options)
>     +./common/xfs: line 157: 19256 Aborted                 (core
> dumped) $XFS_DB_PROG "$@" $(_scratch_xfs_db_options)
>     +./common/xfs: line 157: 19264 Aborted                 (core
> dumped) $XFS_DB_PROG "$@" $(_scratch_xfs_db_options)
>     ...
>     (Run 'diff -u tests/xfs/1316.out
> /home/amir/src/xfstests-dev/results//xfs/1316.out.bad'  to see the
> entire diff)
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Jan. 11, 2017, 6:04 p.m. UTC | #11
On Wed, Jan 11, 2017 at 7:32 PM, Darrick J. Wong
<darrick.wong@oracle.com> wrote:
> On Wed, Jan 11, 2017 at 12:01:22PM +0200, Amir Goldstein wrote:
...
> Ok well I definitely don't see /this/ happening.  I gather you built
> xfsprogs with the insane geometry patch; if so, against what git commit?

against djwon-devel from your tree:
75581a8 xfs_db: sanitize geometry on load
2efc292 xfs_scrub: create a script to scrub all xfs filesystems

xfstests head is:
1377e1e xfs: fuzz every field of every structure

> And, did the binary get installed as /usr/sbin/xfs_db, or is this just
> the system xfs_db?

/usr/sbin/xfs_db

I ran "make realclean; make; sudo make install;"

My system is ubuntu 16.4, in case it matters for shell or anything.

>
> In any case, all that extra output is supposed to end up in
> $seqres.full, not on stdout.  At most you should see admonishments about
> scrub or repair failing to detect/fix things; those messages look like:
>
> "offline repair failed (4) with $field = $fuzzverb"
>

Strange. Anyway it's late now and run has not completed yet.
Tomorrow I'll narrow down on 1301 and 1316 to see what's what.

Hopefully, we can get another test sample from Eryu by then,
to see if I'm the only one seeing these errors.

Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Jan. 12, 2017, 8:01 a.m. UTC | #12
On Wed, Jan 11, 2017 at 7:32 PM, Darrick J. Wong
<darrick.wong@oracle.com> wrote:
> On Wed, Jan 11, 2017 at 12:01:22PM +0200, Amir Goldstein wrote:
>> On Wed, Jan 11, 2017 at 10:34 AM, Amir Goldstein <amir73il@gmail.com> wrote:
>> > On Tue, Jan 10, 2017 at 9:42 PM, Darrick J. Wong
>> > <darrick.wong@oracle.com> wrote:
>> >> xfs_db doesn't check the filesystem geometry when it's mounting, which
>> >> means that garbage agcount values can cause OOMs when we try to allocate
>> >> all the per-AG incore metadata.  If we see geometry that looks
>> >> suspicious, try to derive the actual AG geometry to avoid crashing the
>> >> system.  This should help with xfs/1301 fuzzing.
>> >>
>> >> Also fix up xfs_repair to use the min/max dblocks macros.
>> >>
>> >> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
>> >
>> > Test machine is back to health with this patch, but some test are failing due
>> > to the new error messages.
>> > I guess its no surprise to you.
>> >
>> >
>> > xfs/1300 5s ... 5s
>> > xfs/1301         - output mismatch (see
>> > /home/amir/src/xfstests-dev/results//xfs/1301.out.bad)
>> >     --- tests/xfs/1301.out      2017-01-08 15:35:07.647897359 +0200
>> >     +++ /home/amir/src/xfstests-dev/results//xfs/1301.out.bad
>> > 2017-01-11 09:58:10.981678272 +0200
>> >     @@ -1,4 +1,61 @@
>> >      QA output created by 1301
>> >      Format and populate
>> >      Fuzz superblock
>> >     +xfs_db: device /dev/mapper/storage-scratch AG geometry is insane.
>> > Using agcount=4.
>> >     +SB sanity check failed
>> >     +Metadata corruption detected at xfs_sb block 0x0/0x200
>> >     +xfs_db: device /dev/mapper/storage-scratch AG geometry is insane.
>> > Using agcount=4.
>> >     ...
>> >     (Run 'diff -u tests/xfs/1301.out
>> > /home/amir/src/xfstests-dev/results//xfs/1301.out.bad'  to see the
>
> Uh.... this is odd, all that stuff should go into 1301.full.
>

Darrick,

I see that you went down the problematic path of redirecting input to
interactive xfs_db shell.

I say problematic, because I went down that path with xfs_io when
I wrote test overlay/016, because xfs_io -c "open foo" was broken:
http://www.spinics.net/lists/fstests/msg04526.html

What I found out is that on some systems, the "xfs_io>" echo appears
in output and on some systems it does not. It is not related to xfs_io
itself, but to something else in the environment, presumably, the shell
or the terminal?
I was running the same test with same version of xfs_io on
debian Jessie inside kvm-xfstests and on Ubuntu 16.04 (ssh+tmux)
and got different results.

Can you rearrange all the fuzzy helpers to use xfs_db -c instead of
redirecting stdin to xfs_db?

Regarding xfs/1301, all output from xfs_db should go to 1301.full, so
the existence of the "xfs_db>" prompt is probably not important, but
perhaps there are other factors in play which cause stderr from xfs_db
to get to 1301.out.bad on my system.
I must say that I looked to see where in xfs/1301 or in common/fuzzy
you redirect stderr to stdout and didn't find it.
Please enlighten me.

Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Jan. 13, 2017, 1:06 a.m. UTC | #13
On Thu, Jan 12, 2017 at 10:01:54AM +0200, Amir Goldstein wrote:
> On Wed, Jan 11, 2017 at 7:32 PM, Darrick J. Wong
> <darrick.wong@oracle.com> wrote:
> > On Wed, Jan 11, 2017 at 12:01:22PM +0200, Amir Goldstein wrote:
> >> On Wed, Jan 11, 2017 at 10:34 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> >> > On Tue, Jan 10, 2017 at 9:42 PM, Darrick J. Wong
> >> > <darrick.wong@oracle.com> wrote:
> >> >> xfs_db doesn't check the filesystem geometry when it's mounting, which
> >> >> means that garbage agcount values can cause OOMs when we try to allocate
> >> >> all the per-AG incore metadata.  If we see geometry that looks
> >> >> suspicious, try to derive the actual AG geometry to avoid crashing the
> >> >> system.  This should help with xfs/1301 fuzzing.
> >> >>
> >> >> Also fix up xfs_repair to use the min/max dblocks macros.
> >> >>
> >> >> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> >> >
> >> > Test machine is back to health with this patch, but some test are failing due
> >> > to the new error messages.
> >> > I guess its no surprise to you.
> >> >
> >> >
> >> > xfs/1300 5s ... 5s
> >> > xfs/1301         - output mismatch (see
> >> > /home/amir/src/xfstests-dev/results//xfs/1301.out.bad)
> >> >     --- tests/xfs/1301.out      2017-01-08 15:35:07.647897359 +0200
> >> >     +++ /home/amir/src/xfstests-dev/results//xfs/1301.out.bad
> >> > 2017-01-11 09:58:10.981678272 +0200
> >> >     @@ -1,4 +1,61 @@
> >> >      QA output created by 1301
> >> >      Format and populate
> >> >      Fuzz superblock
> >> >     +xfs_db: device /dev/mapper/storage-scratch AG geometry is insane.
> >> > Using agcount=4.
> >> >     +SB sanity check failed
> >> >     +Metadata corruption detected at xfs_sb block 0x0/0x200
> >> >     +xfs_db: device /dev/mapper/storage-scratch AG geometry is insane.
> >> > Using agcount=4.
> >> >     ...
> >> >     (Run 'diff -u tests/xfs/1301.out
> >> > /home/amir/src/xfstests-dev/results//xfs/1301.out.bad'  to see the
> >
> > Uh.... this is odd, all that stuff should go into 1301.full.
> >
> 
> Darrick,
> 
> I see that you went down the problematic path of redirecting input to
> interactive xfs_db shell.
> 
> I say problematic, because I went down that path with xfs_io when
> I wrote test overlay/016, because xfs_io -c "open foo" was broken:
> http://www.spinics.net/lists/fstests/msg04526.html
>
> What I found out is that on some systems, the "xfs_io>" echo appears
> in output and on some systems it does not. It is not related to xfs_io
> itself, but to something else in the environment, presumably, the shell
> or the terminal?
> I was running the same test with same version of xfs_io on
> debian Jessie inside kvm-xfstests and on Ubuntu 16.04 (ssh+tmux)
> and got different results.

I get the feeling this could be related to readline -- on my systems I
build with libreadline6-dev installed and configure picks it up.  ISTR
if configure doesn't find a libreadline xfs_db'll do things a little
differently wrt printing the prompt... and that might be why you see
stray "xfs_db>" and I don't.  Or maybe you're linking against a
different version, or... ugh, what a mess.

> Can you rearrange all the fuzzy helpers to use xfs_db -c instead of
> redirecting stdin to xfs_db?

I changed the tests to dump commands to a file and run:

_scratch_xfs_db -c "source $seqres.cmd"

but then discovered that the source command is just plain broken when
invoked via -c, so I fixed that.  Then I noticed that you changed
common/fuzzy to use a bash array and sent the patch to me, and now I'm
wondering if that's a better solution than making tempfiles everywhere.

> Regarding xfs/1301, all output from xfs_db should go to 1301.full, so
> the existence of the "xfs_db>" prompt is probably not important, but
> perhaps there are other factors in play which cause stderr from xfs_db
> to get to 1301.out.bad on my system.
> I must say that I looked to see where in xfs/1301 or in common/fuzzy
> you redirect stderr to stdout and didn't find it.

/me is confused; there should be a bunch of places where stderr gets
redirected to stdout.

$ grep '2>' common/fuzzy -n
32:     find "${SCRATCH_MNT}/" -type f 2> /dev/null | head -n "${nr}" | while read f; do
52:     ls -laR "${SCRATCH_MNT}/test.1/" >/dev/null 2>&1
55:     (find "${SCRATCH_MNT}/test.1/" -type f -size -1048576k -print0 | xargs -0 cat) >/dev/null 2>&1
180:    newval="$(_scratch_xfs_get_metadata_field "${key}" "$@" 2> /dev/null)"
191:    while _scratch_unmount 2>/dev/null; do sleep 0.2; done
227:    _scratch_mount 2>&1
231:            _scratch_scrub -n -a 1 -e continue 2>&1
239:                    _scratch_scrub -y 2>&1
253:            _repair_scratch_fs 2>&1
261:        _scratch_xfs_repair -n 2>&1
268:    _scratch_mount 2>&1
271:            _scratch_scrub -e continue 2>&1
278:            _scratch_fuzz_modify 100 2>&1
286:    _scratch_xfs_repair -n 2>&1
298:    _scratch_mkfs_xfs >/dev/null 2>&1

Sorry this has been such a mess. :/

--D

> Please enlighten me.
> 
> Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Jan. 13, 2017, 7:02 a.m. UTC | #14
On Thu, Jan 12, 2017 at 05:06:14PM -0800, Darrick J. Wong wrote:
> On Thu, Jan 12, 2017 at 10:01:54AM +0200, Amir Goldstein wrote:
> > On Wed, Jan 11, 2017 at 7:32 PM, Darrick J. Wong
> > <darrick.wong@oracle.com> wrote:
> > > On Wed, Jan 11, 2017 at 12:01:22PM +0200, Amir Goldstein wrote:
> > >> On Wed, Jan 11, 2017 at 10:34 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> > >> > On Tue, Jan 10, 2017 at 9:42 PM, Darrick J. Wong
> > >> > <darrick.wong@oracle.com> wrote:
> > >> >> xfs_db doesn't check the filesystem geometry when it's mounting, which
> > >> >> means that garbage agcount values can cause OOMs when we try to allocate
> > >> >> all the per-AG incore metadata.  If we see geometry that looks
> > >> >> suspicious, try to derive the actual AG geometry to avoid crashing the
> > >> >> system.  This should help with xfs/1301 fuzzing.
> > >> >>
> > >> >> Also fix up xfs_repair to use the min/max dblocks macros.
> > >> >>
> > >> >> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > >> >
> > >> > Test machine is back to health with this patch, but some test are failing due
> > >> > to the new error messages.
> > >> > I guess its no surprise to you.
> > >> >
> > >> >
> > >> > xfs/1300 5s ... 5s
> > >> > xfs/1301         - output mismatch (see
> > >> > /home/amir/src/xfstests-dev/results//xfs/1301.out.bad)
> > >> >     --- tests/xfs/1301.out      2017-01-08 15:35:07.647897359 +0200
> > >> >     +++ /home/amir/src/xfstests-dev/results//xfs/1301.out.bad
> > >> > 2017-01-11 09:58:10.981678272 +0200
> > >> >     @@ -1,4 +1,61 @@
> > >> >      QA output created by 1301
> > >> >      Format and populate
> > >> >      Fuzz superblock
> > >> >     +xfs_db: device /dev/mapper/storage-scratch AG geometry is insane.
> > >> > Using agcount=4.
> > >> >     +SB sanity check failed
> > >> >     +Metadata corruption detected at xfs_sb block 0x0/0x200
> > >> >     +xfs_db: device /dev/mapper/storage-scratch AG geometry is insane.
> > >> > Using agcount=4.

Pah, what a dunce I am!  I think I figured out the cause of this.  The
inner loop of _scratch_xfs_fuzz_metadata is created by calling xfs_db on
the scratch filesystem to enumerate the available fuzz verbs.  This is
bad because sb 0 could be trashed (it certainly is in 1301) and crash
the debugger... and it's also unnecessary since the verb list doesn't
change.

I'll change the double loop to precompute the field and verb list and
use the precomputed value, saving us a potentially fraught call to
xfs_db for every field.

This also fixes the problem wherein xfs_repair fails to fix some field
in the superblock and the whole test suddenly stops working because the
scratch fs is toast.

--D

> > >> >     ...
> > >> >     (Run 'diff -u tests/xfs/1301.out
> > >> > /home/amir/src/xfstests-dev/results//xfs/1301.out.bad'  to see the
> > >
> > > Uh.... this is odd, all that stuff should go into 1301.full.
> > >
> > 
> > Darrick,
> > 
> > I see that you went down the problematic path of redirecting input to
> > interactive xfs_db shell.
> > 
> > I say problematic, because I went down that path with xfs_io when
> > I wrote test overlay/016, because xfs_io -c "open foo" was broken:
> > http://www.spinics.net/lists/fstests/msg04526.html
> >
> > What I found out is that on some systems, the "xfs_io>" echo appears
> > in output and on some systems it does not. It is not related to xfs_io
> > itself, but to something else in the environment, presumably, the shell
> > or the terminal?
> > I was running the same test with same version of xfs_io on
> > debian Jessie inside kvm-xfstests and on Ubuntu 16.04 (ssh+tmux)
> > and got different results.
> 
> I get the feeling this could be related to readline -- on my systems I
> build with libreadline6-dev installed and configure picks it up.  ISTR
> if configure doesn't find a libreadline xfs_db'll do things a little
> differently wrt printing the prompt... and that might be why you see
> stray "xfs_db>" and I don't.  Or maybe you're linking against a
> different version, or... ugh, what a mess.
> 
> > Can you rearrange all the fuzzy helpers to use xfs_db -c instead of
> > redirecting stdin to xfs_db?
> 
> I changed the tests to dump commands to a file and run:
> 
> _scratch_xfs_db -c "source $seqres.cmd"
> 
> but then discovered that the source command is just plain broken when
> invoked via -c, so I fixed that.  Then I noticed that you changed
> common/fuzzy to use a bash array and sent the patch to me, and now I'm
> wondering if that's a better solution than making tempfiles everywhere.
> 
> > Regarding xfs/1301, all output from xfs_db should go to 1301.full, so
> > the existence of the "xfs_db>" prompt is probably not important, but
> > perhaps there are other factors in play which cause stderr from xfs_db
> > to get to 1301.out.bad on my system.
> > I must say that I looked to see where in xfs/1301 or in common/fuzzy
> > you redirect stderr to stdout and didn't find it.
> 
> /me is confused; there should be a bunch of places where stderr gets
> redirected to stdout.
> 
> $ grep '2>' common/fuzzy -n
> 32:     find "${SCRATCH_MNT}/" -type f 2> /dev/null | head -n "${nr}" | while read f; do
> 52:     ls -laR "${SCRATCH_MNT}/test.1/" >/dev/null 2>&1
> 55:     (find "${SCRATCH_MNT}/test.1/" -type f -size -1048576k -print0 | xargs -0 cat) >/dev/null 2>&1
> 180:    newval="$(_scratch_xfs_get_metadata_field "${key}" "$@" 2> /dev/null)"
> 191:    while _scratch_unmount 2>/dev/null; do sleep 0.2; done
> 227:    _scratch_mount 2>&1
> 231:            _scratch_scrub -n -a 1 -e continue 2>&1
> 239:                    _scratch_scrub -y 2>&1
> 253:            _repair_scratch_fs 2>&1
> 261:        _scratch_xfs_repair -n 2>&1
> 268:    _scratch_mount 2>&1
> 271:            _scratch_scrub -e continue 2>&1
> 278:            _scratch_fuzz_modify 100 2>&1
> 286:    _scratch_xfs_repair -n 2>&1
> 298:    _scratch_mkfs_xfs >/dev/null 2>&1
> 
> Sorry this has been such a mess. :/
> 
> --D
> 
> > Please enlighten me.
> > 
> > Amir.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Jan. 13, 2017, 7:07 a.m. UTC | #15
On Fri, Jan 13, 2017 at 9:02 AM, Darrick J. Wong
<darrick.wong@oracle.com> wrote:
> On Thu, Jan 12, 2017 at 05:06:14PM -0800, Darrick J. Wong wrote:
>> On Thu, Jan 12, 2017 at 10:01:54AM +0200, Amir Goldstein wrote:
>> > On Wed, Jan 11, 2017 at 7:32 PM, Darrick J. Wong
>> > <darrick.wong@oracle.com> wrote:
>> > > On Wed, Jan 11, 2017 at 12:01:22PM +0200, Amir Goldstein wrote:
>> > >> On Wed, Jan 11, 2017 at 10:34 AM, Amir Goldstein <amir73il@gmail.com> wrote:
>> > >> > On Tue, Jan 10, 2017 at 9:42 PM, Darrick J. Wong
>> > >> > <darrick.wong@oracle.com> wrote:
>> > >> >> xfs_db doesn't check the filesystem geometry when it's mounting, which
>> > >> >> means that garbage agcount values can cause OOMs when we try to allocate
>> > >> >> all the per-AG incore metadata.  If we see geometry that looks
>> > >> >> suspicious, try to derive the actual AG geometry to avoid crashing the
>> > >> >> system.  This should help with xfs/1301 fuzzing.
>> > >> >>
>> > >> >> Also fix up xfs_repair to use the min/max dblocks macros.
>> > >> >>
>> > >> >> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
>> > >> >
>> > >> > Test machine is back to health with this patch, but some test are failing due
>> > >> > to the new error messages.
>> > >> > I guess its no surprise to you.
>> > >> >
>> > >> >
>> > >> > xfs/1300 5s ... 5s
>> > >> > xfs/1301         - output mismatch (see
>> > >> > /home/amir/src/xfstests-dev/results//xfs/1301.out.bad)
>> > >> >     --- tests/xfs/1301.out      2017-01-08 15:35:07.647897359 +0200
>> > >> >     +++ /home/amir/src/xfstests-dev/results//xfs/1301.out.bad
>> > >> > 2017-01-11 09:58:10.981678272 +0200
>> > >> >     @@ -1,4 +1,61 @@
>> > >> >      QA output created by 1301
>> > >> >      Format and populate
>> > >> >      Fuzz superblock
>> > >> >     +xfs_db: device /dev/mapper/storage-scratch AG geometry is insane.
>> > >> > Using agcount=4.
>> > >> >     +SB sanity check failed
>> > >> >     +Metadata corruption detected at xfs_sb block 0x0/0x200
>> > >> >     +xfs_db: device /dev/mapper/storage-scratch AG geometry is insane.
>> > >> > Using agcount=4.
>
> Pah, what a dunce I am!  I think I figured out the cause of this.  The
> inner loop of _scratch_xfs_fuzz_metadata is created by calling xfs_db on
> the scratch filesystem to enumerate the available fuzz verbs.  This is
> bad because sb 0 could be trashed (it certainly is in 1301) and crash
> the debugger... and it's also unnecessary since the verb list doesn't
> change.
>
> I'll change the double loop to precompute the field and verb list and
> use the precomputed value, saving us a potentially fraught call to
> xfs_db for every field.
>
> This also fixes the problem wherein xfs_repair fails to fix some field
> in the superblock and the whole test suddenly stops working because the
> scratch fs is toast.
>

Nice :)
This was also the case I was referring to where strerr is not redirected to
stdout, which was causing the errors above to get to 1301.out.bad.
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/db/init.c b/db/init.c
index ec1e274..7e66d14 100644
--- a/db/init.c
+++ b/db/init.c
@@ -51,13 +51,90 @@  usage(void)
 	exit(1);
 }
 
+/* Try to load an AG's superblock, no verifiers. */
+static bool
+load_sb(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	struct xfs_sb		*sbp)
+{
+	struct xfs_buf		*bp;
+
+	bp = libxfs_readbuf(mp->m_ddev_targp,
+			    XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
+			    1 << (XFS_MAX_SECTORSIZE_LOG - BBSHIFT), 0, NULL);
+
+	if (!bp || bp->b_error)
+		return false;
+
+	/* copy SB from buffer to in-core, converting architecture as we go */
+	libxfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+	libxfs_putbuf(bp);
+	libxfs_purgebuf(bp);
+
+	return true;
+}
+
+/* If the geometry doesn't look sane, try to figure out the real geometry. */
+static void
+sanitize_geometry(
+	struct xfs_mount	*mp,
+	struct xfs_sb		*sbp)
+{
+	struct xfs_sb		sb;
+	xfs_agblock_t		agblocks;
+
+	/* If the geometry looks ok, we're done. */
+	if (sbp->sb_blocklog >= XFS_MIN_BLOCKSIZE_LOG &&
+	    sbp->sb_blocklog <= XFS_MAX_BLOCKSIZE_LOG &&
+	    sbp->sb_blocksize == (1 << sbp->sb_blocklog) &&
+	    sbp->sb_dblocks * sbp->sb_blocksize <= x.dsize * x.dbsize &&
+	    sbp->sb_dblocks <= XFS_MAX_DBLOCKS(sbp) &&
+	    sbp->sb_dblocks >= XFS_MIN_DBLOCKS(sbp))
+		return;
+
+	/* Check blocklog and blocksize */
+	if (sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
+	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG)
+		sbp->sb_blocklog = libxfs_log2_roundup(sbp->sb_blocksize);
+	if (sbp->sb_blocksize != (1 << sbp->sb_blocklog))
+		sbp->sb_blocksize = (1 << sbp->sb_blocksize);
+
+	/* Clamp dblocks to the size of the device. */
+	if (sbp->sb_dblocks > x.dsize * x.dbsize / sbp->sb_blocksize)
+		sbp->sb_dblocks = x.dsize * x.dbsize / sbp->sb_blocksize;
+
+	/* See if agblocks helps us find a superblock. */
+	mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
+	if (load_sb(mp, 1, &sb) && sb.sb_magicnum == XFS_SB_MAGIC) {
+		sbp->sb_agcount = sbp->sb_dblocks / sbp->sb_agblocks;
+		goto out;
+	}
+
+	/* See if agcount helps us find a superblock. */
+	agblocks = sbp->sb_agblocks;
+	sbp->sb_agblocks = sbp->sb_dblocks / sbp->sb_agcount;
+	if (sbp->sb_agblocks != 0 &&
+	    load_sb(mp, 1, &sb) &&
+	    sb.sb_magicnum == XFS_SB_MAGIC) {
+		goto out;
+	}
+
+	/* Both are nuts, assume 1 AG. */
+	sbp->sb_agblocks = agblocks;
+	sbp->sb_agcount = 1;
+out:
+	fprintf(stderr,
+		_("%s: device %s AG geometry is insane.  Using agcount=%u.\n"),
+		progname, fsdevice, sbp->sb_agcount);
+}
+
 void
 init(
 	int		argc,
 	char		**argv)
 {
 	struct xfs_sb	*sbp;
-	struct xfs_buf	*bp;
 	int		c;
 
 	setlocale(LC_ALL, "");
@@ -124,20 +201,12 @@  init(
 	 */
 	memset(&xmount, 0, sizeof(struct xfs_mount));
 	libxfs_buftarg_init(&xmount, x.ddev, x.logdev, x.rtdev);
-	bp = libxfs_readbuf(xmount.m_ddev_targp, XFS_SB_DADDR,
-			    1 << (XFS_MAX_SECTORSIZE_LOG - BBSHIFT), 0, NULL);
-
-	if (!bp || bp->b_error) {
+	if (!load_sb(&xmount, 0, &xmount.m_sb)) {
 		fprintf(stderr, _("%s: %s is invalid (cannot read first 512 "
 			"bytes)\n"), progname, fsdevice);
 		exit(1);
 	}
 
-	/* copy SB from buffer to in-core, converting architecture as we go */
-	libxfs_sb_from_disk(&xmount.m_sb, XFS_BUF_TO_SBP(bp));
-	libxfs_putbuf(bp);
-	libxfs_purgebuf(bp);
-
 	sbp = &xmount.m_sb;
 	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
 		fprintf(stderr, _("%s: %s is not a valid XFS filesystem (unexpected SB magic number 0x%08x)\n"),
@@ -148,6 +217,8 @@  init(
 		}
 	}
 
+	sanitize_geometry(&xmount, sbp);
+
 	mp = libxfs_mount(&xmount, sbp, x.ddev, x.logdev, x.rtdev,
 			  LIBXFS_MOUNT_DEBUGGER);
 	if (!mp) {
diff --git a/repair/sb.c b/repair/sb.c
index 1b352e8..e108613 100644
--- a/repair/sb.c
+++ b/repair/sb.c
@@ -398,11 +398,8 @@  verify_sb(char *sb_buf, xfs_sb_t *sb, int is_primary_sb)
 	/* sanity check ag count, size fields against data size field */
 
 	if (sb->sb_dblocks == 0 ||
-		sb->sb_dblocks >
-			((__uint64_t)sb->sb_agcount * sb->sb_agblocks) ||
-		sb->sb_dblocks <
-			((__uint64_t)(sb->sb_agcount - 1) * sb->sb_agblocks
-			+ XFS_MIN_AG_BLOCKS))
+		sb->sb_dblocks > XFS_MAX_DBLOCKS(sb) ||
+		sb->sb_dblocks < XFS_MIN_DBLOCKS(sb))
 		return(XR_BAD_FS_SIZE_DATA);
 
 	if (sb->sb_agblklog != (__uint8_t)libxfs_log2_roundup(sb->sb_agblocks))