diff mbox series

[RFC] xfs: don't commit sunit/swidth updates to disk if that would cause repair failures

Message ID 20191202173538.GD7335@magnolia (mailing list archive)
State Superseded
Headers show
Series [RFC] xfs: don't commit sunit/swidth updates to disk if that would cause repair failures | expand

Commit Message

Darrick J. Wong Dec. 2, 2019, 5:35 p.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Alex Lyakas reported[1] that mounting an xfs filesystem with new sunit
and swidth values could cause xfs_repair to fail loudly.  The problem
here is that repair calculates the where mkfs should have allocated the
root inode, based on the superblock geometry.  The allocation decisions
depend on sunit, which means that we really can't go updating sunit if
it would lead to a subsequent repair failure on an otherwise correct
filesystem.

Port the computation code from xfs_repair and teach mount to avoid the
ondisk update if it would cause problems for repair.  We allow the mount
to proceed (and new allocations will reflect this new geometry) because
we've never screened this kind of thing before.

[1] https://lore.kernel.org/linux-xfs/20191125130744.GA44777@bfoster/T/#m00f9594b511e076e2fcdd489d78bc30216d72a7d

Reported-by: Alex Lyakas <alex@zadara.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_ialloc.c |   88 ++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_ialloc.h |    2 +
 fs/xfs/xfs_ioctl.c         |    3 ++
 fs/xfs/xfs_ioctl32.c       |    4 ++
 fs/xfs/xfs_mount.c         |   47 +++++++++++++++++-------
 5 files changed, 130 insertions(+), 14 deletions(-)

Comments

Dave Chinner Dec. 2, 2019, 9:21 p.m. UTC | #1
On Mon, Dec 02, 2019 at 09:35:38AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Alex Lyakas reported[1] that mounting an xfs filesystem with new sunit
> and swidth values could cause xfs_repair to fail loudly.  The problem
> here is that repair calculates the where mkfs should have allocated the
> root inode, based on the superblock geometry.  The allocation decisions
> depend on sunit, which means that we really can't go updating sunit if
> it would lead to a subsequent repair failure on an otherwise correct
> filesystem.
> 
> Port the computation code from xfs_repair and teach mount to avoid the
> ondisk update if it would cause problems for repair.  We allow the mount
> to proceed (and new allocations will reflect this new geometry) because
> we've never screened this kind of thing before.
> 
> [1] https://lore.kernel.org/linux-xfs/20191125130744.GA44777@bfoster/T/#m00f9594b511e076e2fcdd489d78bc30216d72a7d
....
> +/*
> + * Compute the first and last inodes numbers of the inode chunk that was
> + * preallocated for the root directory.
> + */
> +void
> +xfs_ialloc_find_prealloc(
> +	struct xfs_mount	*mp,
> +	xfs_agino_t		*first_agino,
> +	xfs_agino_t		*last_agino)
> +{
> +	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
> +	xfs_agblock_t		first_bno;
> +
> +	/*
> +	 * Pre-calculate the geometry of ag 0. We know what it looks like
> +	 * because we know what mkfs does: 2 allocation btree roots (by block
> +	 * and by size), the inode allocation btree root, the free inode
> +	 * allocation btree root (if enabled) and some number of blocks to
> +	 * prefill the agfl.
> +	 *
> +	 * Because the current shape of the btrees may differ from the current
> +	 * shape, we open code the mkfs freelist block count here. mkfs creates
> +	 * single level trees, so the calculation is pertty straight forward for

pretty.

> +	 * the trees that use the AGFL.
> +	 */
> +
> +	/* free space by block btree root comes after the ag headers */
> +	first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
> +
> +	/* free space by length btree root */
> +	first_bno += 1;
> +
> +	/* inode btree root */
> +	first_bno += 1;
> +
> +	/* agfl */
> +	first_bno += (2 * min(2U, mp->m_ag_maxlevels)) + 1;

min_t(xfs_agblock_t, 2, mp->m_ag_maxlevels) ?

> +
> +	if (xfs_sb_version_hasfinobt(&mp->m_sb))
> +		first_bno++;
> +
> +	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
> +		first_bno += min(2U, mp->m_rmap_maxlevels); /* agfl blocks */

same.

> +		first_bno++;
> +	}
> +
> +	if (xfs_sb_version_hasreflink(&mp->m_sb))
> +		first_bno++;
> +
> +	/*
> +	 * If the log is allocated in the first allocation group we need to
> +	 * add the number of blocks used by the log to the above calculation.
> +	 *
> +	 * This can happens with filesystems that only have a single
> +	 * allocation group, or very odd geometries created by old mkfs
> +	 * versions on very small filesystems.
> +	 */
> +	if (mp->m_sb.sb_logstart &&
> +	    XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) {
> +
> +		/*
> +		 * XXX(hch): verify that sb_logstart makes sense?
> +		 */
> +		 first_bno += mp->m_sb.sb_logblocks;
> +	}
> +
> +	/*
> +	 * ditto the location of the first inode chunks in the fs ('/')
> +	 */
> +	if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0) {
> +		*first_agino = XFS_AGB_TO_AGINO(mp,
> +				roundup(first_bno, mp->m_sb.sb_unit));
> +	} else if (xfs_sb_version_hasalign(&mp->m_sb) &&
> +		   mp->m_sb.sb_inoalignmt > 1)  {
> +		*first_agino = XFS_AGB_TO_AGINO(mp,
> +				roundup(first_bno, mp->m_sb.sb_inoalignmt));
> +	} else  {
> +		*first_agino = XFS_AGB_TO_AGINO(mp, first_bno);
> +	}
> +
> +	ASSERT(igeo->ialloc_blks > 0);
> +
> +	if (igeo->ialloc_blks > 1)
> +		*last_agino = *first_agino + XFS_INODES_PER_CHUNK;
> +	else
> +		*last_agino = XFS_AGB_TO_AGINO(mp, first_bno + 1);

Isn't last_agino of the first inode of the next chunk? i.e. this is
an off-by-one...

> +}
> diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
> index 323592d563d5..9d9fe7b488b8 100644
> --- a/fs/xfs/libxfs/xfs_ialloc.h
> +++ b/fs/xfs/libxfs/xfs_ialloc.h
> @@ -152,5 +152,7 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
>  
>  int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
>  void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
> +void xfs_ialloc_find_prealloc(struct xfs_mount *mp, xfs_agino_t *first_agino,
> +		xfs_agino_t *last_agino);
>  
>  #endif	/* __XFS_IALLOC_H__ */
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index 7b35d62ede9f..d830a9e13817 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -891,6 +891,9 @@ xfs_ioc_fsgeometry(
>  
>  	xfs_fs_geometry(&mp->m_sb, &fsgeo, struct_version);
>  
> +	fsgeo.sunit = mp->m_sb.sb_unit;
> +	fsgeo.swidth = mp->m_sb.sb_width;

Why?

> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index fca65109cf24..0323a89256c7 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -368,6 +368,11 @@ xfs_update_alignment(xfs_mount_t *mp)
>  	xfs_sb_t	*sbp = &(mp->m_sb);
>  
>  	if (mp->m_dalign) {
> +		uint32_t	old_su;
> +		uint32_t	old_sw;
> +		xfs_agino_t	first;
> +		xfs_agino_t	last;
> +
>  		/*
>  		 * If stripe unit and stripe width are not multiples
>  		 * of the fs blocksize turn off alignment.
> @@ -398,24 +403,38 @@ xfs_update_alignment(xfs_mount_t *mp)
>  			}
>  		}
>  
> -		/*
> -		 * Update superblock with new values
> -		 * and log changes
> -		 */
> -		if (xfs_sb_version_hasdalign(sbp)) {
> -			if (sbp->sb_unit != mp->m_dalign) {
> -				sbp->sb_unit = mp->m_dalign;
> -				mp->m_update_sb = true;
> -			}
> -			if (sbp->sb_width != mp->m_swidth) {
> -				sbp->sb_width = mp->m_swidth;
> -				mp->m_update_sb = true;
> -			}
> -		} else {
> +		/* Update superblock with new values and log changes. */
> +		if (!xfs_sb_version_hasdalign(sbp)) {
>  			xfs_warn(mp,
>  	"cannot change alignment: superblock does not support data alignment");
>  			return -EINVAL;
>  		}
> +
> +		if (sbp->sb_unit == mp->m_dalign &&
> +		    sbp->sb_width == mp->m_swidth)
> +			return 0;
> +
> +		old_su = sbp->sb_unit;
> +		old_sw = sbp->sb_width;
> +		sbp->sb_unit = mp->m_dalign;
> +		sbp->sb_width = mp->m_swidth;
> +		xfs_ialloc_find_prealloc(mp, &first, &last);

We just chuck last away? why calculate it then? And why not just
pass mp->m_dalign/mp->m_swidth into the function rather than setting
them in the sb and then having to undo the change? i.e.

		rootino = xfs_ialloc_calc_rootino(mp, mp->m_dalign, mp->m_swidth);
		if (sbp->sb_rootino != rootino) {
			.....
		}
> +
> +		/*
> +		 * If the sunit/swidth change would move the precomputed root
> +		 * inode value, we must reject the ondisk change because repair
> +		 * will stumble over that.  However, we allow the mount to
> +		 * proceed because we never rejected this combination before.
> +		 */
> +		if (sbp->sb_rootino != XFS_AGINO_TO_INO(mp, 0, first)) {
> +			sbp->sb_unit = old_su;
> +			sbp->sb_width = old_sw;
> +			xfs_warn(mp,
> +	"cannot change alignment: would require moving root inode");

"cannot change stripe alignment: ..." ?

Should this also return EINVAL, as per above when the DALIGN sb
feature bit is not set?

Cheers,

Dave.
Darrick J. Wong Dec. 3, 2019, 2:30 a.m. UTC | #2
On Tue, Dec 03, 2019 at 08:21:40AM +1100, Dave Chinner wrote:
> On Mon, Dec 02, 2019 at 09:35:38AM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Alex Lyakas reported[1] that mounting an xfs filesystem with new sunit
> > and swidth values could cause xfs_repair to fail loudly.  The problem
> > here is that repair calculates the where mkfs should have allocated the
> > root inode, based on the superblock geometry.  The allocation decisions
> > depend on sunit, which means that we really can't go updating sunit if
> > it would lead to a subsequent repair failure on an otherwise correct
> > filesystem.
> > 
> > Port the computation code from xfs_repair and teach mount to avoid the
> > ondisk update if it would cause problems for repair.  We allow the mount
> > to proceed (and new allocations will reflect this new geometry) because
> > we've never screened this kind of thing before.
> > 
> > [1] https://lore.kernel.org/linux-xfs/20191125130744.GA44777@bfoster/T/#m00f9594b511e076e2fcdd489d78bc30216d72a7d
> ....
> > +/*
> > + * Compute the first and last inodes numbers of the inode chunk that was
> > + * preallocated for the root directory.
> > + */
> > +void
> > +xfs_ialloc_find_prealloc(
> > +	struct xfs_mount	*mp,
> > +	xfs_agino_t		*first_agino,
> > +	xfs_agino_t		*last_agino)
> > +{
> > +	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
> > +	xfs_agblock_t		first_bno;
> > +
> > +	/*
> > +	 * Pre-calculate the geometry of ag 0. We know what it looks like
> > +	 * because we know what mkfs does: 2 allocation btree roots (by block
> > +	 * and by size), the inode allocation btree root, the free inode
> > +	 * allocation btree root (if enabled) and some number of blocks to
> > +	 * prefill the agfl.
> > +	 *
> > +	 * Because the current shape of the btrees may differ from the current
> > +	 * shape, we open code the mkfs freelist block count here. mkfs creates
> > +	 * single level trees, so the calculation is pertty straight forward for
> 
> pretty.
> 
> > +	 * the trees that use the AGFL.
> > +	 */
> > +
> > +	/* free space by block btree root comes after the ag headers */
> > +	first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
> > +
> > +	/* free space by length btree root */
> > +	first_bno += 1;
> > +
> > +	/* inode btree root */
> > +	first_bno += 1;
> > +
> > +	/* agfl */
> > +	first_bno += (2 * min(2U, mp->m_ag_maxlevels)) + 1;
> 
> min_t(xfs_agblock_t, 2, mp->m_ag_maxlevels) ?
> 
> > +
> > +	if (xfs_sb_version_hasfinobt(&mp->m_sb))
> > +		first_bno++;
> > +
> > +	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
> > +		first_bno += min(2U, mp->m_rmap_maxlevels); /* agfl blocks */
> 
> same.

Fixed all three.

> > +		first_bno++;
> > +	}
> > +
> > +	if (xfs_sb_version_hasreflink(&mp->m_sb))
> > +		first_bno++;
> > +
> > +	/*
> > +	 * If the log is allocated in the first allocation group we need to
> > +	 * add the number of blocks used by the log to the above calculation.
> > +	 *
> > +	 * This can happens with filesystems that only have a single
> > +	 * allocation group, or very odd geometries created by old mkfs
> > +	 * versions on very small filesystems.
> > +	 */
> > +	if (mp->m_sb.sb_logstart &&
> > +	    XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) {
> > +
> > +		/*
> > +		 * XXX(hch): verify that sb_logstart makes sense?
> > +		 */
> > +		 first_bno += mp->m_sb.sb_logblocks;
> > +	}
> > +
> > +	/*
> > +	 * ditto the location of the first inode chunks in the fs ('/')
> > +	 */
> > +	if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0) {
> > +		*first_agino = XFS_AGB_TO_AGINO(mp,
> > +				roundup(first_bno, mp->m_sb.sb_unit));
> > +	} else if (xfs_sb_version_hasalign(&mp->m_sb) &&
> > +		   mp->m_sb.sb_inoalignmt > 1)  {
> > +		*first_agino = XFS_AGB_TO_AGINO(mp,
> > +				roundup(first_bno, mp->m_sb.sb_inoalignmt));
> > +	} else  {
> > +		*first_agino = XFS_AGB_TO_AGINO(mp, first_bno);
> > +	}
> > +
> > +	ASSERT(igeo->ialloc_blks > 0);
> > +
> > +	if (igeo->ialloc_blks > 1)
> > +		*last_agino = *first_agino + XFS_INODES_PER_CHUNK;
> > +	else
> > +		*last_agino = XFS_AGB_TO_AGINO(mp, first_bno + 1);
> 
> Isn't last_agino of the first inode of the next chunk? i.e. this is
> an off-by-one...

Yep.  It's also an off-by-one error in repair...

> > +}
> > diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
> > index 323592d563d5..9d9fe7b488b8 100644
> > --- a/fs/xfs/libxfs/xfs_ialloc.h
> > +++ b/fs/xfs/libxfs/xfs_ialloc.h
> > @@ -152,5 +152,7 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
> >  
> >  int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
> >  void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
> > +void xfs_ialloc_find_prealloc(struct xfs_mount *mp, xfs_agino_t *first_agino,
> > +		xfs_agino_t *last_agino);
> >  
> >  #endif	/* __XFS_IALLOC_H__ */
> > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> > index 7b35d62ede9f..d830a9e13817 100644
> > --- a/fs/xfs/xfs_ioctl.c
> > +++ b/fs/xfs/xfs_ioctl.c
> > @@ -891,6 +891,9 @@ xfs_ioc_fsgeometry(
> >  
> >  	xfs_fs_geometry(&mp->m_sb, &fsgeo, struct_version);
> >  
> > +	fsgeo.sunit = mp->m_sb.sb_unit;
> > +	fsgeo.swidth = mp->m_sb.sb_width;
> 
> Why?

This was in keeping with Alex' suggestion to use the sunit values incore
even if we don't update the superblock.

> > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > index fca65109cf24..0323a89256c7 100644
> > --- a/fs/xfs/xfs_mount.c
> > +++ b/fs/xfs/xfs_mount.c
> > @@ -368,6 +368,11 @@ xfs_update_alignment(xfs_mount_t *mp)
> >  	xfs_sb_t	*sbp = &(mp->m_sb);
> >  
> >  	if (mp->m_dalign) {
> > +		uint32_t	old_su;
> > +		uint32_t	old_sw;
> > +		xfs_agino_t	first;
> > +		xfs_agino_t	last;
> > +
> >  		/*
> >  		 * If stripe unit and stripe width are not multiples
> >  		 * of the fs blocksize turn off alignment.
> > @@ -398,24 +403,38 @@ xfs_update_alignment(xfs_mount_t *mp)
> >  			}
> >  		}
> >  
> > -		/*
> > -		 * Update superblock with new values
> > -		 * and log changes
> > -		 */
> > -		if (xfs_sb_version_hasdalign(sbp)) {
> > -			if (sbp->sb_unit != mp->m_dalign) {
> > -				sbp->sb_unit = mp->m_dalign;
> > -				mp->m_update_sb = true;
> > -			}
> > -			if (sbp->sb_width != mp->m_swidth) {
> > -				sbp->sb_width = mp->m_swidth;
> > -				mp->m_update_sb = true;
> > -			}
> > -		} else {
> > +		/* Update superblock with new values and log changes. */
> > +		if (!xfs_sb_version_hasdalign(sbp)) {
> >  			xfs_warn(mp,
> >  	"cannot change alignment: superblock does not support data alignment");
> >  			return -EINVAL;
> >  		}
> > +
> > +		if (sbp->sb_unit == mp->m_dalign &&
> > +		    sbp->sb_width == mp->m_swidth)
> > +			return 0;
> > +
> > +		old_su = sbp->sb_unit;
> > +		old_sw = sbp->sb_width;
> > +		sbp->sb_unit = mp->m_dalign;
> > +		sbp->sb_width = mp->m_swidth;
> > +		xfs_ialloc_find_prealloc(mp, &first, &last);
> 
> We just chuck last away? why calculate it then?

Hmmm.  Repair uses it to silence the "inode chunk claims used block"
error if an inobt record points to something owned by XR_E_INUSE_FS* if
the inode points to something in that first chunk.  Not sure /why/ it
does that; it seems to have done that since the creation of the git
repo.

Frankly, I'm not convinced that's the right behavior; the root inode
chunk should never collide with something else, period.

> And why not just
> pass mp->m_dalign/mp->m_swidth into the function rather than setting
> them in the sb and then having to undo the change? i.e.
> 
> 		rootino = xfs_ialloc_calc_rootino(mp, mp->m_dalign, mp->m_swidth);

<shrug> The whole point was to create a function that computes where the
first allocated inode chunk should be from an existing mountpoint and
superblock, maybe the caller should make a copy, update the parameters,
and then pass the copy into this function?

> 		if (sbp->sb_rootino != rootino) {
> 			.....
> 		}
> > +
> > +		/*
> > +		 * If the sunit/swidth change would move the precomputed root
> > +		 * inode value, we must reject the ondisk change because repair
> > +		 * will stumble over that.  However, we allow the mount to
> > +		 * proceed because we never rejected this combination before.
> > +		 */
> > +		if (sbp->sb_rootino != XFS_AGINO_TO_INO(mp, 0, first)) {
> > +			sbp->sb_unit = old_su;
> > +			sbp->sb_width = old_sw;
> > +			xfs_warn(mp,
> > +	"cannot change alignment: would require moving root inode");
> 
> "cannot change stripe alignment: ..." ?

Ok.

> Should this also return EINVAL, as per above when the DALIGN sb
> feature bit is not set?

I dunno.  We've never rejected these mount options before, which makes
me a little hesitant to break everybody's scripts, even if it /is/
improper behavior that leads to repair failure.  We /do/ have the option
that Alex suggested of modifying the incore values to change the
allocator behavior without committing them to the superblock, which is
what this patch does.

OTOH the manual pages say that you're not supposed to do this, which
might be a strong enough reason to start banning it.

Thoughts?

--D

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner Dec. 3, 2019, 9:21 p.m. UTC | #3
On Mon, Dec 02, 2019 at 06:30:41PM -0800, Darrick J. Wong wrote:
> On Tue, Dec 03, 2019 at 08:21:40AM +1100, Dave Chinner wrote:
> > On Mon, Dec 02, 2019 at 09:35:38AM -0800, Darrick J. Wong wrote:
> > > diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
> > > index 323592d563d5..9d9fe7b488b8 100644
> > > --- a/fs/xfs/libxfs/xfs_ialloc.h
> > > +++ b/fs/xfs/libxfs/xfs_ialloc.h
> > > @@ -152,5 +152,7 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
> > >  
> > >  int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
> > >  void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
> > > +void xfs_ialloc_find_prealloc(struct xfs_mount *mp, xfs_agino_t *first_agino,
> > > +		xfs_agino_t *last_agino);
> > >  
> > >  #endif	/* __XFS_IALLOC_H__ */
> > > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> > > index 7b35d62ede9f..d830a9e13817 100644
> > > --- a/fs/xfs/xfs_ioctl.c
> > > +++ b/fs/xfs/xfs_ioctl.c
> > > @@ -891,6 +891,9 @@ xfs_ioc_fsgeometry(
> > >  
> > >  	xfs_fs_geometry(&mp->m_sb, &fsgeo, struct_version);
> > >  
> > > +	fsgeo.sunit = mp->m_sb.sb_unit;
> > > +	fsgeo.swidth = mp->m_sb.sb_width;
> > 
> > Why?
> 
> This was in keeping with Alex' suggestion to use the sunit values incore
> even if we don't update the superblock.

Not sure about that. If we are getting the geometry for the purposes
of working out where something is on disk (e.g. the root inode :),
then we need what is in the superblock, not what is in memory...

> > > +		if (sbp->sb_unit == mp->m_dalign &&
> > > +		    sbp->sb_width == mp->m_swidth)
> > > +			return 0;
> > > +
> > > +		old_su = sbp->sb_unit;
> > > +		old_sw = sbp->sb_width;
> > > +		sbp->sb_unit = mp->m_dalign;
> > > +		sbp->sb_width = mp->m_swidth;
> > > +		xfs_ialloc_find_prealloc(mp, &first, &last);
> > 
> > We just chuck last away? why calculate it then?
> 
> Hmmm.  Repair uses it to silence the "inode chunk claims used block"
> error if an inobt record points to something owned by XR_E_INUSE_FS* if
> the inode points to something in that first chunk.  Not sure /why/ it
> does that; it seems to have done that since the creation of the git
> repo.

Hysterical raisins that have long since decomposed, I'm guessing....

> Frankly, I'm not convinced that's the right behavior; the root inode
> chunk should never collide with something else, period.

*nod*

I suspect the way repair uses the last_prealloc_ino can go away,
especially as the inode number calculated is not correct in the
first place...

> > And why not just
> > pass mp->m_dalign/mp->m_swidth into the function rather than setting
> > them in the sb and then having to undo the change? i.e.
> > 
> > 		rootino = xfs_ialloc_calc_rootino(mp, mp->m_dalign, mp->m_swidth);
> 
> <shrug> The whole point was to create a function that computes where the
> first allocated inode chunk should be from an existing mountpoint and
> superblock, maybe the caller should make a copy, update the parameters,
> and then pass the copy into this function?

That's a whole lot of cruft that we can avoid just by passing in
our specific stripe alignment.

What we need to kow is whether a specific stripe geometry will
result in the root inode location changing, and so I'm of the
opinion we should just write a function that calculates the location
based on the supplied geometry and the caller can do whatever checks
it needs to with the inode number returned.

That provides what both repair and the kernel mount validation
requires...

> > Should this also return EINVAL, as per above when the DALIGN sb
> > feature bit is not set?
> 
> I dunno.  We've never rejected these mount options before, which makes
> me a little hesitant to break everybody's scripts, even if it /is/
> improper behavior that leads to repair failure.  We /do/ have the option
> that Alex suggested of modifying the incore values to change the
> allocator behavior without committing them to the superblock, which is
> what this patch does.
> 
> OTOH the manual pages say that you're not supposed to do this, which
> might be a strong enough reason to start banning it.
> 
> Thoughts?

On second thoughts, knowing that many users have put sunit/swidth in
their fstab, we probably shouldn't make it return an error as that
may make their systems unbootable.

Cheers,

Dave.
Darrick J. Wong Dec. 4, 2019, 4:51 p.m. UTC | #4
On Wed, Dec 04, 2019 at 08:21:36AM +1100, Dave Chinner wrote:
> On Mon, Dec 02, 2019 at 06:30:41PM -0800, Darrick J. Wong wrote:
> > On Tue, Dec 03, 2019 at 08:21:40AM +1100, Dave Chinner wrote:
> > > On Mon, Dec 02, 2019 at 09:35:38AM -0800, Darrick J. Wong wrote:
> > > > diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
> > > > index 323592d563d5..9d9fe7b488b8 100644
> > > > --- a/fs/xfs/libxfs/xfs_ialloc.h
> > > > +++ b/fs/xfs/libxfs/xfs_ialloc.h
> > > > @@ -152,5 +152,7 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
> > > >  
> > > >  int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
> > > >  void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
> > > > +void xfs_ialloc_find_prealloc(struct xfs_mount *mp, xfs_agino_t *first_agino,
> > > > +		xfs_agino_t *last_agino);
> > > >  
> > > >  #endif	/* __XFS_IALLOC_H__ */
> > > > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> > > > index 7b35d62ede9f..d830a9e13817 100644
> > > > --- a/fs/xfs/xfs_ioctl.c
> > > > +++ b/fs/xfs/xfs_ioctl.c
> > > > @@ -891,6 +891,9 @@ xfs_ioc_fsgeometry(
> > > >  
> > > >  	xfs_fs_geometry(&mp->m_sb, &fsgeo, struct_version);
> > > >  
> > > > +	fsgeo.sunit = mp->m_sb.sb_unit;
> > > > +	fsgeo.swidth = mp->m_sb.sb_width;
> > > 
> > > Why?
> > 
> > This was in keeping with Alex' suggestion to use the sunit values incore
> > even if we don't update the superblock.
> 
> Not sure about that. If we are getting the geometry for the purposes
> of working out where something is on disk (e.g. the root inode :),
> then we need what is in the superblock, not what is in memory...
> 
> > > > +		if (sbp->sb_unit == mp->m_dalign &&
> > > > +		    sbp->sb_width == mp->m_swidth)
> > > > +			return 0;
> > > > +
> > > > +		old_su = sbp->sb_unit;
> > > > +		old_sw = sbp->sb_width;
> > > > +		sbp->sb_unit = mp->m_dalign;
> > > > +		sbp->sb_width = mp->m_swidth;
> > > > +		xfs_ialloc_find_prealloc(mp, &first, &last);
> > > 
> > > We just chuck last away? why calculate it then?
> > 
> > Hmmm.  Repair uses it to silence the "inode chunk claims used block"
> > error if an inobt record points to something owned by XR_E_INUSE_FS* if
> > the inode points to something in that first chunk.  Not sure /why/ it
> > does that; it seems to have done that since the creation of the git
> > repo.
> 
> Hysterical raisins that have long since decomposed, I'm guessing....

<nod> I'll nuke it then.

> > Frankly, I'm not convinced that's the right behavior; the root inode
> > chunk should never collide with something else, period.
> 
> *nod*
> 
> I suspect the way repair uses the last_prealloc_ino can go away,
> especially as the inode number calculated is not correct in the
> first place...
> 
> > > And why not just
> > > pass mp->m_dalign/mp->m_swidth into the function rather than setting
> > > them in the sb and then having to undo the change? i.e.
> > > 
> > > 		rootino = xfs_ialloc_calc_rootino(mp, mp->m_dalign, mp->m_swidth);
> > 
> > <shrug> The whole point was to create a function that computes where the
> > first allocated inode chunk should be from an existing mountpoint and
> > superblock, maybe the caller should make a copy, update the parameters,
> > and then pass the copy into this function?
> 
> That's a whole lot of cruft that we can avoid just by passing in
> our specific stripe alignment.
> 
> What we need to kow is whether a specific stripe geometry will
> result in the root inode location changing, and so I'm of the
> opinion we should just write a function that calculates the location
> based on the supplied geometry and the caller can do whatever checks
> it needs to with the inode number returned.
> 
> That provides what both repair and the kernel mount validation
> requires...

Done.

> 
> > > Should this also return EINVAL, as per above when the DALIGN sb
> > > feature bit is not set?
> > 
> > I dunno.  We've never rejected these mount options before, which makes
> > me a little hesitant to break everybody's scripts, even if it /is/
> > improper behavior that leads to repair failure.  We /do/ have the option
> > that Alex suggested of modifying the incore values to change the
> > allocator behavior without committing them to the superblock, which is
> > what this patch does.
> > 
> > OTOH the manual pages say that you're not supposed to do this, which
> > might be a strong enough reason to start banning it.
> > 
> > Thoughts?
> 
> On second thoughts, knowing that many users have put sunit/swidth in
> their fstab, we probably shouldn't make it return an error as that
> may make their systems unbootable.

For now I'll add an XXX comment about how the next time we add a new
incompat feature we should make it start returning EINVAL if that
feature is enabled.

--D

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 988cde7744e6..51fbf0cb3255 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2909,3 +2909,91 @@  xfs_ialloc_setup_geometry(
 	else
 		igeo->ialloc_align = 0;
 }
+
+/*
+ * Compute the first and last inodes numbers of the inode chunk that was
+ * preallocated for the root directory.
+ */
+void
+xfs_ialloc_find_prealloc(
+	struct xfs_mount	*mp,
+	xfs_agino_t		*first_agino,
+	xfs_agino_t		*last_agino)
+{
+	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
+	xfs_agblock_t		first_bno;
+
+	/*
+	 * Pre-calculate the geometry of ag 0. We know what it looks like
+	 * because we know what mkfs does: 2 allocation btree roots (by block
+	 * and by size), the inode allocation btree root, the free inode
+	 * allocation btree root (if enabled) and some number of blocks to
+	 * prefill the agfl.
+	 *
+	 * Because the current shape of the btrees may differ from the current
+	 * shape, we open code the mkfs freelist block count here. mkfs creates
+	 * single level trees, so the calculation is pertty straight forward for
+	 * the trees that use the AGFL.
+	 */
+
+	/* free space by block btree root comes after the ag headers */
+	first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
+
+	/* free space by length btree root */
+	first_bno += 1;
+
+	/* inode btree root */
+	first_bno += 1;
+
+	/* agfl */
+	first_bno += (2 * min(2U, mp->m_ag_maxlevels)) + 1;
+
+	if (xfs_sb_version_hasfinobt(&mp->m_sb))
+		first_bno++;
+
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+		first_bno += min(2U, mp->m_rmap_maxlevels); /* agfl blocks */
+		first_bno++;
+	}
+
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		first_bno++;
+
+	/*
+	 * If the log is allocated in the first allocation group we need to
+	 * add the number of blocks used by the log to the above calculation.
+	 *
+	 * This can happens with filesystems that only have a single
+	 * allocation group, or very odd geometries created by old mkfs
+	 * versions on very small filesystems.
+	 */
+	if (mp->m_sb.sb_logstart &&
+	    XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) {
+
+		/*
+		 * XXX(hch): verify that sb_logstart makes sense?
+		 */
+		 first_bno += mp->m_sb.sb_logblocks;
+	}
+
+	/*
+	 * ditto the location of the first inode chunks in the fs ('/')
+	 */
+	if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0) {
+		*first_agino = XFS_AGB_TO_AGINO(mp,
+				roundup(first_bno, mp->m_sb.sb_unit));
+	} else if (xfs_sb_version_hasalign(&mp->m_sb) &&
+		   mp->m_sb.sb_inoalignmt > 1)  {
+		*first_agino = XFS_AGB_TO_AGINO(mp,
+				roundup(first_bno, mp->m_sb.sb_inoalignmt));
+	} else  {
+		*first_agino = XFS_AGB_TO_AGINO(mp, first_bno);
+	}
+
+	ASSERT(igeo->ialloc_blks > 0);
+
+	if (igeo->ialloc_blks > 1)
+		*last_agino = *first_agino + XFS_INODES_PER_CHUNK;
+	else
+		*last_agino = XFS_AGB_TO_AGINO(mp, first_bno + 1);
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 323592d563d5..9d9fe7b488b8 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -152,5 +152,7 @@  int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
 
 int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
 void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
+void xfs_ialloc_find_prealloc(struct xfs_mount *mp, xfs_agino_t *first_agino,
+		xfs_agino_t *last_agino);
 
 #endif	/* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 7b35d62ede9f..d830a9e13817 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -891,6 +891,9 @@  xfs_ioc_fsgeometry(
 
 	xfs_fs_geometry(&mp->m_sb, &fsgeo, struct_version);
 
+	fsgeo.sunit = mp->m_sb.sb_unit;
+	fsgeo.swidth = mp->m_sb.sb_width;
+
 	if (struct_version <= 3)
 		len = sizeof(struct xfs_fsop_geom_v1);
 	else if (struct_version == 4)
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index c4c4f09113d3..7e9009d26628 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -51,6 +51,10 @@  xfs_compat_ioc_fsgeometry_v1(
 	struct xfs_fsop_geom	  fsgeo;
 
 	xfs_fs_geometry(&mp->m_sb, &fsgeo, 3);
+
+	fsgeo.sunit = mp->m_sb.sb_unit;
+	fsgeo.swidth = mp->m_sb.sb_width;
+
 	/* The 32-bit variant simply has some padding at the end */
 	if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
 		return -EFAULT;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index fca65109cf24..0323a89256c7 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -368,6 +368,11 @@  xfs_update_alignment(xfs_mount_t *mp)
 	xfs_sb_t	*sbp = &(mp->m_sb);
 
 	if (mp->m_dalign) {
+		uint32_t	old_su;
+		uint32_t	old_sw;
+		xfs_agino_t	first;
+		xfs_agino_t	last;
+
 		/*
 		 * If stripe unit and stripe width are not multiples
 		 * of the fs blocksize turn off alignment.
@@ -398,24 +403,38 @@  xfs_update_alignment(xfs_mount_t *mp)
 			}
 		}
 
-		/*
-		 * Update superblock with new values
-		 * and log changes
-		 */
-		if (xfs_sb_version_hasdalign(sbp)) {
-			if (sbp->sb_unit != mp->m_dalign) {
-				sbp->sb_unit = mp->m_dalign;
-				mp->m_update_sb = true;
-			}
-			if (sbp->sb_width != mp->m_swidth) {
-				sbp->sb_width = mp->m_swidth;
-				mp->m_update_sb = true;
-			}
-		} else {
+		/* Update superblock with new values and log changes. */
+		if (!xfs_sb_version_hasdalign(sbp)) {
 			xfs_warn(mp,
 	"cannot change alignment: superblock does not support data alignment");
 			return -EINVAL;
 		}
+
+		if (sbp->sb_unit == mp->m_dalign &&
+		    sbp->sb_width == mp->m_swidth)
+			return 0;
+
+		old_su = sbp->sb_unit;
+		old_sw = sbp->sb_width;
+		sbp->sb_unit = mp->m_dalign;
+		sbp->sb_width = mp->m_swidth;
+		xfs_ialloc_find_prealloc(mp, &first, &last);
+
+		/*
+		 * If the sunit/swidth change would move the precomputed root
+		 * inode value, we must reject the ondisk change because repair
+		 * will stumble over that.  However, we allow the mount to
+		 * proceed because we never rejected this combination before.
+		 */
+		if (sbp->sb_rootino != XFS_AGINO_TO_INO(mp, 0, first)) {
+			sbp->sb_unit = old_su;
+			sbp->sb_width = old_sw;
+			xfs_warn(mp,
+	"cannot change alignment: would require moving root inode");
+			return 0;
+		}
+
+		mp->m_update_sb = true;
 	} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
 		    xfs_sb_version_hasdalign(&mp->m_sb)) {
 			mp->m_dalign = sbp->sb_unit;