[4/5] xfs: fix inode_cluster_size rounding mayhem
diff mbox series

Message ID 155960228579.1194435.18215658650812508577.stgit@magnolia
State New
Headers show
Series
  • xfs: refactor inode geometry
Related show

Commit Message

Darrick J. Wong June 3, 2019, 10:51 p.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

inode_cluster_size is supposed to represent the size (in bytes) of an
inode cluster buffer.  We avoid having to handle multiple clusters per
filesystem block on filesystems with large blocks by openly rounding
this value up to 1 FSB when necessary.  However, we never reset
inode_cluster_size to reflect this new rounded value, which adds to the
potential for mistakes in calculating geometries.

Fix this by setting inode_cluster_size to reflect the rounded-up size if
needed, and special-case the few places in the sparse inodes code where
we actually need the smaller value to validate on-disk metadata.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_format.h     |    8 ++++++--
 fs/xfs/libxfs/xfs_ialloc.c     |   19 +++++++++++++++++++
 fs/xfs/libxfs/xfs_trans_resv.c |    6 ++----
 fs/xfs/xfs_log_recover.c       |    3 +--
 fs/xfs/xfs_mount.c             |    4 ++--
 5 files changed, 30 insertions(+), 10 deletions(-)

Comments

Dave Chinner June 4, 2019, 12:25 a.m. UTC | #1
On Mon, Jun 03, 2019 at 03:51:25PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> inode_cluster_size is supposed to represent the size (in bytes) of an
> inode cluster buffer.  We avoid having to handle multiple clusters per
> filesystem block on filesystems with large blocks by openly rounding
> this value up to 1 FSB when necessary.  However, we never reset
> inode_cluster_size to reflect this new rounded value, which adds to the
> potential for mistakes in calculating geometries.
> 
> Fix this by setting inode_cluster_size to reflect the rounded-up size if
> needed, and special-case the few places in the sparse inodes code where
> we actually need the smaller value to validate on-disk metadata.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>

Looks good - really helps simplify what some of the code does.

A few minor things below.

> ---
>  fs/xfs/libxfs/xfs_format.h     |    8 ++++++--
>  fs/xfs/libxfs/xfs_ialloc.c     |   19 +++++++++++++++++++
>  fs/xfs/libxfs/xfs_trans_resv.c |    6 ++----
>  fs/xfs/xfs_log_recover.c       |    3 +--
>  fs/xfs/xfs_mount.c             |    4 ++--
>  5 files changed, 30 insertions(+), 10 deletions(-)
> 
> 
> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> index 0e831f04725c..1d3e1e66baf5 100644
> --- a/fs/xfs/libxfs/xfs_format.h
> +++ b/fs/xfs/libxfs/xfs_format.h
> @@ -1698,11 +1698,15 @@ struct xfs_ino_geometry {
>  	/* Maximum inode count in this filesystem. */
>  	uint64_t	maxicount;
>  
> +	/* Actual inode cluster buffer size, in bytes. */
> +	unsigned int	inode_cluster_size;
> +
>  	/*
>  	 * Desired inode cluster buffer size, in bytes.  This value is not
> -	 * rounded up to at least one filesystem block.
> +	 * rounded up to at least one filesystem block, which is necessary for
> +	 * validating sb_spino_align.
>  	 */
> -	unsigned int	inode_cluster_size;
> +	unsigned int	inode_cluster_size_raw;

Can you mention in the comment that this should never be used
outside of validating sb_spino_align and that inode_cluster_size is
the value that should be used by all runtime code?

>  	/* Inode cluster sizes, adjusted to be at least 1 fsb. */
>  	unsigned int	inodes_per_cluster;
> diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
> index cff202d0ee4a..976860673b6c 100644
> --- a/fs/xfs/libxfs/xfs_ialloc.c
> +++ b/fs/xfs/libxfs/xfs_ialloc.c
> @@ -2810,6 +2810,16 @@ xfs_ialloc_setup_geometry(
>  		igeo->maxicount = 0;
>  	}
>  
> +	/*
> +	 * Compute the desired size of an inode cluster buffer size, which
> +	 * starts at 8K and (on v5 filesystems) scales up with larger inode
> +	 * sizes.
> +	 *
> +	 * Preserve the desired inode cluster size because the sparse inodes
> +	 * feature uses that desired size (not the actual size) to compute the
> +	 * sparse inode alignment.  The mount code validates this value, so we
> +	 * cannot change the behavior.
> +	 */
>  	igeo->inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
>  	if (xfs_sb_version_hascrc(&mp->m_sb)) {
>  		int	new_size = igeo->inode_cluster_size;
> @@ -2818,12 +2828,21 @@ xfs_ialloc_setup_geometry(
>  		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
>  			igeo->inode_cluster_size = new_size;
>  	}
> +	igeo->inode_cluster_size_raw = igeo->inode_cluster_size;

I think I'd prefer to see the calculation use
igeo->inode_cluster_size_raw, and then that gets used to calculate
igeo->blocks_per_cluster, then igeo->inode_cluster_size is
calculated from blocks_per_cluster like you've done below. That way
there is an obvious logic flow to the variable derivation. i.e.
"this is how we calculate the raw cluster size and this is how we
calculate the actual runtime cluster size"...

> +
> +	/*
> +	 * Compute the inode cluster buffer size ratios.  We avoid needing to
> +	 * handle multiple clusters per filesystem block by rounding up
> +	 * blocks_per_cluster to 1 if necessary.  Set the inode cluster size
> +	 * to the actual value.
> +	 */
>  	igeo->blocks_per_cluster = xfs_icluster_size_fsb(mp);
>  	igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp,
>  			igeo->blocks_per_cluster);
>  	igeo->cluster_align = xfs_ialloc_cluster_alignment(mp);
>  	igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp,
>  			igeo->cluster_align);
> +	igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);

I'd put this immediately after we calculate blocks_per_cluster...

Cheers,

Dave.
Darrick J. Wong June 4, 2019, 1:08 a.m. UTC | #2
On Tue, Jun 04, 2019 at 10:25:07AM +1000, Dave Chinner wrote:
> On Mon, Jun 03, 2019 at 03:51:25PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > inode_cluster_size is supposed to represent the size (in bytes) of an
> > inode cluster buffer.  We avoid having to handle multiple clusters per
> > filesystem block on filesystems with large blocks by openly rounding
> > this value up to 1 FSB when necessary.  However, we never reset
> > inode_cluster_size to reflect this new rounded value, which adds to the
> > potential for mistakes in calculating geometries.
> > 
> > Fix this by setting inode_cluster_size to reflect the rounded-up size if
> > needed, and special-case the few places in the sparse inodes code where
> > we actually need the smaller value to validate on-disk metadata.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Looks good - really helps simplify what some of the code does.
> 
> A few minor things below.
> 
> > ---
> >  fs/xfs/libxfs/xfs_format.h     |    8 ++++++--
> >  fs/xfs/libxfs/xfs_ialloc.c     |   19 +++++++++++++++++++
> >  fs/xfs/libxfs/xfs_trans_resv.c |    6 ++----
> >  fs/xfs/xfs_log_recover.c       |    3 +--
> >  fs/xfs/xfs_mount.c             |    4 ++--
> >  5 files changed, 30 insertions(+), 10 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> > index 0e831f04725c..1d3e1e66baf5 100644
> > --- a/fs/xfs/libxfs/xfs_format.h
> > +++ b/fs/xfs/libxfs/xfs_format.h
> > @@ -1698,11 +1698,15 @@ struct xfs_ino_geometry {
> >  	/* Maximum inode count in this filesystem. */
> >  	uint64_t	maxicount;
> >  
> > +	/* Actual inode cluster buffer size, in bytes. */
> > +	unsigned int	inode_cluster_size;
> > +
> >  	/*
> >  	 * Desired inode cluster buffer size, in bytes.  This value is not
> > -	 * rounded up to at least one filesystem block.
> > +	 * rounded up to at least one filesystem block, which is necessary for
> > +	 * validating sb_spino_align.
> >  	 */
> > -	unsigned int	inode_cluster_size;
> > +	unsigned int	inode_cluster_size_raw;
> 
> Can you mention in the comment that this should never be used
> outside of validating sb_spino_align and that inode_cluster_size is
> the value that should be used by all runtime code?

Ok.

> >  	/* Inode cluster sizes, adjusted to be at least 1 fsb. */
> >  	unsigned int	inodes_per_cluster;
> > diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
> > index cff202d0ee4a..976860673b6c 100644
> > --- a/fs/xfs/libxfs/xfs_ialloc.c
> > +++ b/fs/xfs/libxfs/xfs_ialloc.c
> > @@ -2810,6 +2810,16 @@ xfs_ialloc_setup_geometry(
> >  		igeo->maxicount = 0;
> >  	}
> >  
> > +	/*
> > +	 * Compute the desired size of an inode cluster buffer size, which
> > +	 * starts at 8K and (on v5 filesystems) scales up with larger inode
> > +	 * sizes.
> > +	 *
> > +	 * Preserve the desired inode cluster size because the sparse inodes
> > +	 * feature uses that desired size (not the actual size) to compute the
> > +	 * sparse inode alignment.  The mount code validates this value, so we
> > +	 * cannot change the behavior.
> > +	 */
> >  	igeo->inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
> >  	if (xfs_sb_version_hascrc(&mp->m_sb)) {
> >  		int	new_size = igeo->inode_cluster_size;
> > @@ -2818,12 +2828,21 @@ xfs_ialloc_setup_geometry(
> >  		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
> >  			igeo->inode_cluster_size = new_size;
> >  	}
> > +	igeo->inode_cluster_size_raw = igeo->inode_cluster_size;
> 
> I think I'd prefer to see the calculation use
> igeo->inode_cluster_size_raw, and then that gets used to calculate
> igeo->blocks_per_cluster, then igeo->inode_cluster_size is
> calculated from blocks_per_cluster like you've done below. That way
> there is an obvious logic flow to the variable derivation. i.e.
> "this is how we calculate the raw cluster size and this is how we
> calculate the actual runtime cluster size"...

Makes sense.

> > +
> > +	/*
> > +	 * Compute the inode cluster buffer size ratios.  We avoid needing to
> > +	 * handle multiple clusters per filesystem block by rounding up
> > +	 * blocks_per_cluster to 1 if necessary.  Set the inode cluster size
> > +	 * to the actual value.
> > +	 */
> >  	igeo->blocks_per_cluster = xfs_icluster_size_fsb(mp);
> >  	igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp,
> >  			igeo->blocks_per_cluster);
> >  	igeo->cluster_align = xfs_ialloc_cluster_alignment(mp);
> >  	igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp,
> >  			igeo->cluster_align);
> > +	igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
> 
> I'd put this immediately after we calculate blocks_per_cluster...

Ok.  Weirdly I just did this to fix up merge conflicts in the patch
stack before I even saw this comment....

--D

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

Patch
diff mbox series

diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 0e831f04725c..1d3e1e66baf5 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1698,11 +1698,15 @@  struct xfs_ino_geometry {
 	/* Maximum inode count in this filesystem. */
 	uint64_t	maxicount;
 
+	/* Actual inode cluster buffer size, in bytes. */
+	unsigned int	inode_cluster_size;
+
 	/*
 	 * Desired inode cluster buffer size, in bytes.  This value is not
-	 * rounded up to at least one filesystem block.
+	 * rounded up to at least one filesystem block, which is necessary for
+	 * validating sb_spino_align.
 	 */
-	unsigned int	inode_cluster_size;
+	unsigned int	inode_cluster_size_raw;
 
 	/* Inode cluster sizes, adjusted to be at least 1 fsb. */
 	unsigned int	inodes_per_cluster;
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index cff202d0ee4a..976860673b6c 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2810,6 +2810,16 @@  xfs_ialloc_setup_geometry(
 		igeo->maxicount = 0;
 	}
 
+	/*
+	 * Compute the desired size of an inode cluster buffer size, which
+	 * starts at 8K and (on v5 filesystems) scales up with larger inode
+	 * sizes.
+	 *
+	 * Preserve the desired inode cluster size because the sparse inodes
+	 * feature uses that desired size (not the actual size) to compute the
+	 * sparse inode alignment.  The mount code validates this value, so we
+	 * cannot change the behavior.
+	 */
 	igeo->inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		int	new_size = igeo->inode_cluster_size;
@@ -2818,12 +2828,21 @@  xfs_ialloc_setup_geometry(
 		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
 			igeo->inode_cluster_size = new_size;
 	}
+	igeo->inode_cluster_size_raw = igeo->inode_cluster_size;
+
+	/*
+	 * Compute the inode cluster buffer size ratios.  We avoid needing to
+	 * handle multiple clusters per filesystem block by rounding up
+	 * blocks_per_cluster to 1 if necessary.  Set the inode cluster size
+	 * to the actual value.
+	 */
 	igeo->blocks_per_cluster = xfs_icluster_size_fsb(mp);
 	igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp,
 			igeo->blocks_per_cluster);
 	igeo->cluster_align = xfs_ialloc_cluster_alignment(mp);
 	igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp,
 			igeo->cluster_align);
+	igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
 
 	/* Set whether we're using inode alignment. */
 	igeo->inoalign_mask = igeo->cluster_align - 1;
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 2663dd7975a5..9d1326d14af9 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -308,8 +308,7 @@  xfs_calc_iunlink_remove_reservation(
 	struct xfs_mount        *mp)
 {
 	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-	       2 * max_t(uint, XFS_FSB_TO_B(mp, 1),
-			 M_IGEO(mp)->inode_cluster_size);
+	       2 * M_IGEO(mp)->inode_cluster_size;
 }
 
 /*
@@ -347,8 +346,7 @@  STATIC uint
 xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
 {
 	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-			max_t(uint, XFS_FSB_TO_B(mp, 1),
-			      M_IGEO(mp)->inode_cluster_size);
+			M_IGEO(mp)->inode_cluster_size;
 }
 
 /*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1557304f3d68..f7c062df29bf 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2893,8 +2893,7 @@  xlog_recover_buffer_pass2(
 	 */
 	if (XFS_DINODE_MAGIC ==
 	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
-	    (BBTOB(bp->b_io_length) != max(log->l_mp->m_sb.sb_blocksize,
-			M_IGEO(log->l_mp)->inode_cluster_size))) {
+	    (BBTOB(bp->b_io_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
 		xfs_buf_stale(bp);
 		error = xfs_bwrite(bp);
 	} else {
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 81d6535b24b4..544fa469aca4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -746,11 +746,11 @@  xfs_mountfs(
 	 */
 	if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
 	    mp->m_sb.sb_spino_align !=
-			XFS_B_TO_FSBT(mp, igeo->inode_cluster_size)) {
+			XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) {
 		xfs_warn(mp,
 	"Sparse inode block alignment (%u) must match cluster size (%llu).",
 			 mp->m_sb.sb_spino_align,
-			 XFS_B_TO_FSBT(mp, igeo->inode_cluster_size));
+			 XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw));
 		error = -EINVAL;
 		goto out_remove_uuid;
 	}