diff mbox series

[v2,07/13] xfs: Introduce FORCEALIGN inode flag

Message ID 20240705162450.3481169-8-john.g.garry@oracle.com (mailing list archive)
State New
Headers show
Series forcealign for xfs | expand

Commit Message

John Garry July 5, 2024, 4:24 p.m. UTC
From: "Darrick J. Wong" <djwong@kernel.org>

Add a new inode flag to require that all file data extent mappings must
be aligned (both the file offset range and the allocated space itself)
to the extent size hint.  Having a separate COW extent size hint is no
longer allowed.

The goal here is to enable sysadmins and users to mandate that all space
mappings in a file must have a startoff/blockcount that are aligned to
(say) a 2MB alignment and that the startblock/blockcount will follow the
same alignment.

Allocated space will be aligned to start of the AG, and not necessarily
aligned with disk blocks. The upcoming atomic writes feature will rely and
forcealign and will also require allocated space will also be aligned to
disk blocks.

Currently RT is not be supported for forcealign, so reject a mount under
that condition. In future, it should be possible to support forcealign for
RT. In this case, the extent size hint will need to be aligned with
rtextsize, so add inode verification for that now.

reflink link will not be supported for forcealign. This is because we have
the limitation of pageache writeback not knowing how to writeback an
entire allocation unut, so reject a mount with relink.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Co-developed-by: John Garry <john.g.garry@oracle.com>
[jpg: many changes from orig, including forcealign inode verification
 rework, disallow RT and forcealign mount, ioctl setattr rework,
 disallow reflink a forcealign inode]
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/xfs/libxfs/xfs_format.h    |  6 +++-
 fs/xfs/libxfs/xfs_inode_buf.c | 55 +++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_inode_buf.h |  3 ++
 fs/xfs/libxfs/xfs_sb.c        |  2 ++
 fs/xfs/xfs_inode.c            | 13 +++++++++
 fs/xfs/xfs_inode.h            | 20 ++++++++++++-
 fs/xfs/xfs_ioctl.c            | 51 ++++++++++++++++++++++++++++++--
 fs/xfs/xfs_mount.h            |  2 ++
 fs/xfs/xfs_reflink.c          |  5 ++--
 fs/xfs/xfs_reflink.h          | 10 -------
 fs/xfs/xfs_super.c            | 11 +++++++
 include/uapi/linux/fs.h       |  2 ++
 12 files changed, 164 insertions(+), 16 deletions(-)

Comments

Darrick J. Wong July 11, 2024, 2:59 a.m. UTC | #1
On Fri, Jul 05, 2024 at 04:24:44PM +0000, John Garry wrote:
> From: "Darrick J. Wong" <djwong@kernel.org>
> 
> Add a new inode flag to require that all file data extent mappings must
> be aligned (both the file offset range and the allocated space itself)
> to the extent size hint.  Having a separate COW extent size hint is no
> longer allowed.
> 
> The goal here is to enable sysadmins and users to mandate that all space
> mappings in a file must have a startoff/blockcount that are aligned to
> (say) a 2MB alignment and that the startblock/blockcount will follow the
> same alignment.
> 
> Allocated space will be aligned to start of the AG, and not necessarily
> aligned with disk blocks. The upcoming atomic writes feature will rely and
> forcealign and will also require allocated space will also be aligned to
> disk blocks.
> 
> Currently RT is not be supported for forcealign, so reject a mount under
> that condition. In future, it should be possible to support forcealign for
> RT. In this case, the extent size hint will need to be aligned with
> rtextsize, so add inode verification for that now.
> 
> reflink link will not be supported for forcealign. This is because we have
> the limitation of pageache writeback not knowing how to writeback an
> entire allocation unut, so reject a mount with relink.
> 
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> Co-developed-by: John Garry <john.g.garry@oracle.com>
> [jpg: many changes from orig, including forcealign inode verification
>  rework, disallow RT and forcealign mount, ioctl setattr rework,
>  disallow reflink a forcealign inode]
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/libxfs/xfs_format.h    |  6 +++-
>  fs/xfs/libxfs/xfs_inode_buf.c | 55 +++++++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_inode_buf.h |  3 ++
>  fs/xfs/libxfs/xfs_sb.c        |  2 ++
>  fs/xfs/xfs_inode.c            | 13 +++++++++
>  fs/xfs/xfs_inode.h            | 20 ++++++++++++-
>  fs/xfs/xfs_ioctl.c            | 51 ++++++++++++++++++++++++++++++--
>  fs/xfs/xfs_mount.h            |  2 ++
>  fs/xfs/xfs_reflink.c          |  5 ++--
>  fs/xfs/xfs_reflink.h          | 10 -------
>  fs/xfs/xfs_super.c            | 11 +++++++
>  include/uapi/linux/fs.h       |  2 ++
>  12 files changed, 164 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> index 61f51becff4f..b48cd75d34a6 100644
> --- a/fs/xfs/libxfs/xfs_format.h
> +++ b/fs/xfs/libxfs/xfs_format.h
> @@ -353,6 +353,7 @@ xfs_sb_has_compat_feature(
>  #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)		/* reverse map btree */
>  #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
>  #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3)		/* inobt block counts */
> +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30)	/* aligned file data extents */
>  #define XFS_SB_FEAT_RO_COMPAT_ALL \
>  		(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
>  		 XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
> @@ -1094,16 +1095,19 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
>  #define XFS_DIFLAG2_COWEXTSIZE_BIT   2  /* copy on write extent size hint */
>  #define XFS_DIFLAG2_BIGTIME_BIT	3	/* big timestamps */
>  #define XFS_DIFLAG2_NREXT64_BIT 4	/* large extent counters */
> +/* data extent mappings for regular files must be aligned to extent size hint */
> +#define XFS_DIFLAG2_FORCEALIGN_BIT 5
>  
>  #define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT)
>  #define XFS_DIFLAG2_REFLINK     (1 << XFS_DIFLAG2_REFLINK_BIT)
>  #define XFS_DIFLAG2_COWEXTSIZE  (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
>  #define XFS_DIFLAG2_BIGTIME	(1 << XFS_DIFLAG2_BIGTIME_BIT)
>  #define XFS_DIFLAG2_NREXT64	(1 << XFS_DIFLAG2_NREXT64_BIT)
> +#define XFS_DIFLAG2_FORCEALIGN	(1 << XFS_DIFLAG2_FORCEALIGN_BIT)
>  
>  #define XFS_DIFLAG2_ANY \
>  	(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
> -	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
> +	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN)
>  
>  static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
>  {
> diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
> index 513b50da6215..5c61a1d1bb2b 100644
> --- a/fs/xfs/libxfs/xfs_inode_buf.c
> +++ b/fs/xfs/libxfs/xfs_inode_buf.c
> @@ -657,6 +657,15 @@ xfs_dinode_verify(
>  	    !xfs_has_bigtime(mp))
>  		return __this_address;
>  
> +	if (flags2 & XFS_DIFLAG2_FORCEALIGN) {
> +		fa = xfs_inode_validate_forcealign(mp,
> +				be32_to_cpu(dip->di_extsize),
> +				be32_to_cpu(dip->di_cowextsize),
> +				mode, flags, flags2);
> +		if (fa)
> +			return fa;
> +	}
> +
>  	return NULL;
>  }
>  
> @@ -824,3 +833,49 @@ xfs_inode_validate_cowextsize(
>  
>  	return NULL;
>  }
> +
> +/* Validate the forcealign inode flag */
> +xfs_failaddr_t
> +xfs_inode_validate_forcealign(
> +	struct xfs_mount	*mp,
> +	uint32_t		extsize,
> +	uint32_t		cowextsize,
> +	uint16_t		mode,
> +	uint16_t		flags,
> +	uint64_t		flags2)
> +{
> +	bool			rt =  flags & XFS_DIFLAG_REALTIME;
> +
> +	/* superblock rocompat feature flag */
> +	if (!xfs_has_forcealign(mp))
> +		return __this_address;
> +
> +	/* Only regular files and directories */
> +	if (!S_ISDIR(mode) && !S_ISREG(mode))
> +		return __this_address;
> +
> +	/* We require EXTSIZE or EXTSZINHERIT */
> +	if (!(flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)))
> +		return __this_address;
> +
> +	/* We require a non-zero extsize */
> +	if (!extsize)
> +		return __this_address;
> +
> +	/* Reflink'ed disallowed */
> +	if (flags2 & XFS_DIFLAG2_REFLINK)
> +		return __this_address;

Hmm.  If we don't support reflink + forcealign ATM, then shouldn't the
superblock verifier or xfs_fs_fill_super fail the mount so that old
kernels won't abruptly emit EFSCORRUPTED errors if a future kernel adds
support for forcealign'd cow and starts writing out files with both
iflags set?

That said, if the bs>ps patchset lands, then I think forcealign cow is
a simple matter of setting the min folio order to the forcealign size
and making sure that we always write out entire folios if any of the
blocks cached by the folio is shared.  Direct writes to forcealigned
shared files probably has to be aligned to the forcealign size or fall
back to buffered writes for cow.

--D

> +
> +	/* COW extsize disallowed */
> +	if (flags2 & XFS_DIFLAG2_COWEXTSIZE)
> +		return __this_address;
> +
> +	if (cowextsize)
> +		return __this_address;
> +
> +	/* extsize must be a multiple of sb_rextsize for RT */
> +	if (rt && mp->m_sb.sb_rextsize && extsize % mp->m_sb.sb_rextsize)
> +		return __this_address;
> +
> +	return NULL;
> +}
> diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
> index 585ed5a110af..b8b65287b037 100644
> --- a/fs/xfs/libxfs/xfs_inode_buf.h
> +++ b/fs/xfs/libxfs/xfs_inode_buf.h
> @@ -33,6 +33,9 @@ xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp,
>  xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp,
>  		uint32_t cowextsize, uint16_t mode, uint16_t flags,
>  		uint64_t flags2);
> +xfs_failaddr_t xfs_inode_validate_forcealign(struct xfs_mount *mp,
> +		uint32_t extsize, uint32_t cowextsize, uint16_t mode,
> +		uint16_t flags, uint64_t flags2);
>  
>  static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv)
>  {
> diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
> index 6b56f0f6d4c1..e56911553edd 100644
> --- a/fs/xfs/libxfs/xfs_sb.c
> +++ b/fs/xfs/libxfs/xfs_sb.c
> @@ -164,6 +164,8 @@ xfs_sb_version_to_features(
>  		features |= XFS_FEAT_REFLINK;
>  	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
>  		features |= XFS_FEAT_INOBTCNT;
> +	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN)
> +		features |= XFS_FEAT_FORCEALIGN;
>  	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
>  		features |= XFS_FEAT_FTYPE;
>  	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index a4e3cd8971fc..caffd4c75179 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -609,6 +609,8 @@ xfs_ip2xflags(
>  			flags |= FS_XFLAG_DAX;
>  		if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
>  			flags |= FS_XFLAG_COWEXTSIZE;
> +		if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN)
> +			flags |= FS_XFLAG_FORCEALIGN;
>  	}
>  
>  	if (xfs_inode_has_attr_fork(ip))
> @@ -738,6 +740,8 @@ xfs_inode_inherit_flags2(
>  	}
>  	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
>  		ip->i_diflags2 |= XFS_DIFLAG2_DAX;
> +	if (pip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN)
> +		ip->i_diflags2 |= XFS_DIFLAG2_FORCEALIGN;
>  
>  	/* Don't let invalid cowextsize hints propagate. */
>  	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
> @@ -746,6 +750,15 @@ xfs_inode_inherit_flags2(
>  		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
>  		ip->i_cowextsize = 0;
>  	}
> +
> +	if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN) {
> +		failaddr = xfs_inode_validate_forcealign(ip->i_mount,
> +				ip->i_extsize, ip->i_cowextsize,
> +				VFS_I(ip)->i_mode, ip->i_diflags,
> +				ip->i_diflags2);
> +		if (failaddr)
> +			ip->i_diflags2 &= ~XFS_DIFLAG2_FORCEALIGN;
> +	}
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 42f999c1106c..536e646dd055 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -301,6 +301,16 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip)
>  	return ip->i_cowfp && ip->i_cowfp->if_bytes;
>  }
>  
> +static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
> +{
> +	return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
> +}
> +
> +static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
> +{
> +	return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
> +}
> +
>  static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
>  {
>  	return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME;
> @@ -313,7 +323,15 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
>  
>  static inline bool xfs_inode_has_forcealign(struct xfs_inode *ip)
>  {
> -	return false;
> +	if (!(ip->i_diflags & XFS_DIFLAG_EXTSIZE))
> +		return false;
> +	if (ip->i_extsize <= 1)
> +		return false;
> +	if (xfs_is_cow_inode(ip))
> +		return false;
> +	if (ip->i_diflags & XFS_DIFLAG_REALTIME)
> +		return false;
> +	return ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN;
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index f0117188f302..771ef3954f4e 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -525,10 +525,48 @@ xfs_flags2diflags2(
>  		di_flags2 |= XFS_DIFLAG2_DAX;
>  	if (xflags & FS_XFLAG_COWEXTSIZE)
>  		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
> +	if (xflags & FS_XFLAG_FORCEALIGN)
> +		di_flags2 |= XFS_DIFLAG2_FORCEALIGN;
>  
>  	return di_flags2;
>  }
>  
> +/*
> + * Forcealign requires a non-zero extent size hint and a zero cow
> + * extent size hint.  Don't allow set for RT files yet.
> + */
> +static int
> +xfs_ioctl_setattr_forcealign(
> +	struct xfs_inode	*ip,
> +	struct fileattr		*fa)
> +{
> +	struct xfs_mount	*mp = ip->i_mount;
> +
> +	if (!xfs_has_forcealign(mp))
> +		return -EINVAL;
> +
> +	if (xfs_is_reflink_inode(ip))
> +		return -EINVAL;
> +
> +	if (!(fa->fsx_xflags & (FS_XFLAG_EXTSIZE |
> +				FS_XFLAG_EXTSZINHERIT)))
> +		return -EINVAL;
> +
> +	if (fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)
> +		return -EINVAL;
> +
> +	if (!fa->fsx_extsize)
> +		return -EINVAL;
> +
> +	if (fa->fsx_cowextsize)
> +		return -EINVAL;
> +
> +	if (fa->fsx_xflags & FS_XFLAG_REALTIME)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
>  static int
>  xfs_ioctl_setattr_xflags(
>  	struct xfs_trans	*tp,
> @@ -537,10 +575,13 @@ xfs_ioctl_setattr_xflags(
>  {
>  	struct xfs_mount	*mp = ip->i_mount;
>  	bool			rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME);
> +	bool			forcealign = fa->fsx_xflags & FS_XFLAG_FORCEALIGN;
>  	uint64_t		i_flags2;
> +	int			error;
>  
> -	if (rtflag != XFS_IS_REALTIME_INODE(ip)) {
> -		/* Can't change realtime flag if any extents are allocated. */
> +	/* Can't change RT or forcealign flags if any extents are allocated. */
> +	if (rtflag != XFS_IS_REALTIME_INODE(ip) ||
> +	    forcealign != xfs_inode_has_forcealign(ip)) {
>  		if (ip->i_df.if_nextents || ip->i_delayed_blks)
>  			return -EINVAL;
>  	}
> @@ -561,6 +602,12 @@ xfs_ioctl_setattr_xflags(
>  	if (i_flags2 && !xfs_has_v3inodes(mp))
>  		return -EINVAL;
>  
> +	if (forcealign) {
> +		error = xfs_ioctl_setattr_forcealign(ip, fa);
> +		if (error)
> +			return error;
> +	}
> +
>  	ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
>  	ip->i_diflags2 = i_flags2;
>  
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index d0567dfbc036..30228fea908d 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -299,6 +299,7 @@ typedef struct xfs_mount {
>  #define XFS_FEAT_NEEDSREPAIR	(1ULL << 25)	/* needs xfs_repair */
>  #define XFS_FEAT_NREXT64	(1ULL << 26)	/* large extent counters */
>  #define XFS_FEAT_EXCHANGE_RANGE	(1ULL << 27)	/* exchange range */
> +#define XFS_FEAT_FORCEALIGN	(1ULL << 28)	/* aligned file data extents */
>  
>  /* Mount features */
>  #define XFS_FEAT_NOATTR2	(1ULL << 48)	/* disable attr2 creation */
> @@ -385,6 +386,7 @@ __XFS_ADD_V4_FEAT(projid32, PROJID32)
>  __XFS_HAS_V4_FEAT(v3inodes, V3INODES)
>  __XFS_HAS_V4_FEAT(crc, CRC)
>  __XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
> +__XFS_HAS_FEAT(forcealign, FORCEALIGN)
>  
>  /*
>   * Mount features
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 265a2a418bc7..8da293e8bfa2 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -1467,8 +1467,9 @@ xfs_reflink_remap_prep(
>  
>  	/* Check file eligibility and prepare for block sharing. */
>  	ret = -EINVAL;
> -	/* Don't reflink realtime inodes */
> -	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
> +	/* Don't reflink realtime or forcealign inodes */
> +	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest) ||
> +	    xfs_inode_has_forcealign(src) || xfs_inode_has_forcealign(dest))
>  		goto out_unlock;
>  
>  	/* Don't share DAX file data with non-DAX file. */
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 65c5dfe17ecf..fb55e4ce49fa 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -6,16 +6,6 @@
>  #ifndef __XFS_REFLINK_H
>  #define __XFS_REFLINK_H 1
>  
> -static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
> -{
> -	return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
> -}
> -
> -static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
> -{
> -	return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
> -}
> -
>  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *irec, bool *shared);
>  int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 27e9f749c4c7..4f0c77f38de1 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1721,6 +1721,17 @@ xfs_fs_fill_super(
>  		mp->m_features &= ~XFS_FEAT_DISCARD;
>  	}
>  
> +	if (xfs_has_forcealign(mp)) {
> +		if (xfs_has_realtime(mp)) {
> +			xfs_alert(mp,
> +	"forcealign not compatible with realtime device!");
> +			error = -EINVAL;
> +			goto out_filestream_unmount;
> +		}
> +		xfs_warn(mp,
> +"EXPERIMENTAL forced data extent alignment feature in use. Use at your own risk!");
> +	}
> +
>  	if (xfs_has_reflink(mp)) {
>  		if (mp->m_sb.sb_rblocks) {
>  			xfs_alert(mp,
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 45e4e64fd664..8af304c0e29a 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -158,6 +158,8 @@ struct fsxattr {
>  #define FS_XFLAG_FILESTREAM	0x00004000	/* use filestream allocator */
>  #define FS_XFLAG_DAX		0x00008000	/* use DAX for IO */
>  #define FS_XFLAG_COWEXTSIZE	0x00010000	/* CoW extent size allocator hint */
> +/* data extent mappings for regular files must be aligned to extent size hint */
> +#define FS_XFLAG_FORCEALIGN	0x00020000
>  #define FS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
>  
>  /* the read-only stuff doesn't really belong here, but any other place is
> -- 
> 2.31.1
> 
>
Christoph Hellwig July 11, 2024, 3:59 a.m. UTC | #2
On Wed, Jul 10, 2024 at 07:59:58PM -0700, Darrick J. Wong wrote:
> Hmm.  If we don't support reflink + forcealign ATM, then shouldn't the
> superblock verifier or xfs_fs_fill_super fail the mount so that old
> kernels won't abruptly emit EFSCORRUPTED errors if a future kernel adds
> support for forcealign'd cow and starts writing out files with both
> iflags set?

Yes.

> That said, if the bs>ps patchset lands, then I think forcealign cow is
> a simple matter of setting the min folio order to the forcealign size
> and making sure that we always write out entire folios if any of the
> blocks cached by the folio is shared.  Direct writes to forcealigned
> shared files probably has to be aligned to the forcealign size or fall
> back to buffered writes for cow.

It has all the same problems as rtexsize > 1 + reflink, and suppoting
it will require raiding your patch stack.  Or better just wait until
we've got all that in now that we're actively working on it.
John Garry July 11, 2024, 7:17 a.m. UTC | #3
On 11/07/2024 03:59, Darrick J. Wong wrote:
> On Fri, Jul 05, 2024 at 04:24:44PM +0000, John Garry wrote:
>> From: "Darrick J. Wong" <djwong@kernel.org>
>>
>> Add a new inode flag to require that all file data extent mappings must
>> be aligned (both the file offset range and the allocated space itself)
>> to the extent size hint.  Having a separate COW extent size hint is no
>> longer allowed.
>>
>> The goal here is to enable sysadmins and users to mandate that all space
>> mappings in a file must have a startoff/blockcount that are aligned to
>> (say) a 2MB alignment and that the startblock/blockcount will follow the
>> same alignment.
>>
>> Allocated space will be aligned to start of the AG, and not necessarily
>> aligned with disk blocks. The upcoming atomic writes feature will rely and
>> forcealign and will also require allocated space will also be aligned to
>> disk blocks.
>>
>> Currently RT is not be supported for forcealign, so reject a mount under
>> that condition. In future, it should be possible to support forcealign for
>> RT. In this case, the extent size hint will need to be aligned with
>> rtextsize, so add inode verification for that now.
>>
>> reflink link will not be supported for forcealign. This is because we have
>> the limitation of pageache writeback not knowing how to writeback an
>> entire allocation unut, so reject a mount with relink.
>>
>> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
>> Co-developed-by: John Garry <john.g.garry@oracle.com>
>> [jpg: many changes from orig, including forcealign inode verification
>>   rework, disallow RT and forcealign mount, ioctl setattr rework,
>>   disallow reflink a forcealign inode]
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   fs/xfs/libxfs/xfs_format.h    |  6 +++-
>>   fs/xfs/libxfs/xfs_inode_buf.c | 55 +++++++++++++++++++++++++++++++++++
>>   fs/xfs/libxfs/xfs_inode_buf.h |  3 ++
>>   fs/xfs/libxfs/xfs_sb.c        |  2 ++
>>   fs/xfs/xfs_inode.c            | 13 +++++++++
>>   fs/xfs/xfs_inode.h            | 20 ++++++++++++-
>>   fs/xfs/xfs_ioctl.c            | 51 ++++++++++++++++++++++++++++++--
>>   fs/xfs/xfs_mount.h            |  2 ++
>>   fs/xfs/xfs_reflink.c          |  5 ++--
>>   fs/xfs/xfs_reflink.h          | 10 -------
>>   fs/xfs/xfs_super.c            | 11 +++++++
>>   include/uapi/linux/fs.h       |  2 ++
>>   12 files changed, 164 insertions(+), 16 deletions(-)
>>
>> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
>> index 61f51becff4f..b48cd75d34a6 100644
>> --- a/fs/xfs/libxfs/xfs_format.h
>> +++ b/fs/xfs/libxfs/xfs_format.h
>> @@ -353,6 +353,7 @@ xfs_sb_has_compat_feature(
>>   #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)		/* reverse map btree */
>>   #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
>>   #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3)		/* inobt block counts */
>> +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30)	/* aligned file data extents */
>>   #define XFS_SB_FEAT_RO_COMPAT_ALL \
>>   		(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
>>   		 XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
>> @@ -1094,16 +1095,19 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
>>   #define XFS_DIFLAG2_COWEXTSIZE_BIT   2  /* copy on write extent size hint */
>>   #define XFS_DIFLAG2_BIGTIME_BIT	3	/* big timestamps */
>>   #define XFS_DIFLAG2_NREXT64_BIT 4	/* large extent counters */
>> +/* data extent mappings for regular files must be aligned to extent size hint */
>> +#define XFS_DIFLAG2_FORCEALIGN_BIT 5
>>   
>>   #define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT)
>>   #define XFS_DIFLAG2_REFLINK     (1 << XFS_DIFLAG2_REFLINK_BIT)
>>   #define XFS_DIFLAG2_COWEXTSIZE  (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
>>   #define XFS_DIFLAG2_BIGTIME	(1 << XFS_DIFLAG2_BIGTIME_BIT)
>>   #define XFS_DIFLAG2_NREXT64	(1 << XFS_DIFLAG2_NREXT64_BIT)
>> +#define XFS_DIFLAG2_FORCEALIGN	(1 << XFS_DIFLAG2_FORCEALIGN_BIT)
>>   
>>   #define XFS_DIFLAG2_ANY \
>>   	(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
>> -	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
>> +	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN)
>>   
>>   static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
>>   {
>> diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
>> index 513b50da6215..5c61a1d1bb2b 100644
>> --- a/fs/xfs/libxfs/xfs_inode_buf.c
>> +++ b/fs/xfs/libxfs/xfs_inode_buf.c
>> @@ -657,6 +657,15 @@ xfs_dinode_verify(
>>   	    !xfs_has_bigtime(mp))
>>   		return __this_address;
>>   
>> +	if (flags2 & XFS_DIFLAG2_FORCEALIGN) {
>> +		fa = xfs_inode_validate_forcealign(mp,
>> +				be32_to_cpu(dip->di_extsize),
>> +				be32_to_cpu(dip->di_cowextsize),
>> +				mode, flags, flags2);
>> +		if (fa)
>> +			return fa;
>> +	}
>> +
>>   	return NULL;
>>   }
>>   
>> @@ -824,3 +833,49 @@ xfs_inode_validate_cowextsize(
>>   
>>   	return NULL;
>>   }
>> +
>> +/* Validate the forcealign inode flag */
>> +xfs_failaddr_t
>> +xfs_inode_validate_forcealign(
>> +	struct xfs_mount	*mp,
>> +	uint32_t		extsize,
>> +	uint32_t		cowextsize,
>> +	uint16_t		mode,
>> +	uint16_t		flags,
>> +	uint64_t		flags2)
>> +{
>> +	bool			rt =  flags & XFS_DIFLAG_REALTIME;
>> +
>> +	/* superblock rocompat feature flag */
>> +	if (!xfs_has_forcealign(mp))
>> +		return __this_address;
>> +
>> +	/* Only regular files and directories */
>> +	if (!S_ISDIR(mode) && !S_ISREG(mode))
>> +		return __this_address;
>> +
>> +	/* We require EXTSIZE or EXTSZINHERIT */
>> +	if (!(flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)))
>> +		return __this_address;
>> +
>> +	/* We require a non-zero extsize */
>> +	if (!extsize)
>> +		return __this_address;
>> +
>> +	/* Reflink'ed disallowed */
>> +	if (flags2 & XFS_DIFLAG2_REFLINK)
>> +		return __this_address;
> 
> Hmm.  If we don't support reflink + forcealign ATM, then shouldn't the
> superblock verifier or xfs_fs_fill_super fail the mount so that old
> kernels won't abruptly emit EFSCORRUPTED errors if a future kernel adds
> support for forcealign'd cow and starts writing out files with both
> iflags set?

Fine, I see that we do something similar now for rtdev.

However why even have the rt inode check, below, to disallow for reflink 
cp for rt inode (if we can't even mount with rt and reflink together)?


>>    * Mount features
>> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
>> index 265a2a418bc7..8da293e8bfa2 100644
>> --- a/fs/xfs/xfs_reflink.c
>> +++ b/fs/xfs/xfs_reflink.c
>> @@ -1467,8 +1467,9 @@ xfs_reflink_remap_prep(
>>   
>>   	/* Check file eligibility and prepare for block sharing. */
>>   	ret = -EINVAL;
>> -	/* Don't reflink realtime inodes */
>> -	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
>> +	/* Don't reflink realtime or forcealign inodes */
>> +	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest) ||
>> +	    xfs_inode_has_forcealign(src) || xfs_inode_has_forcealign(dest))
>>   		goto out_unlock;
>>   
>>   	/* Don't share DAX file data with non-DAX file. */
>> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
>> index 65c5dfe17ecf..fb55e4ce49fa 100644
>> --- a/fs/xfs/xfs_reflink.h
>> +++ b/fs/xfs/xfs_reflink.h
>> @@ -6,16 +6,6 @@
>>   #ifndef __XFS_REFLINK_H
>>   #define __XFS_REFLINK_H 1
>>
Dave Chinner July 11, 2024, 11:20 p.m. UTC | #4
On Wed, Jul 10, 2024 at 07:59:58PM -0700, Darrick J. Wong wrote:
> On Fri, Jul 05, 2024 at 04:24:44PM +0000, John Garry wrote:
> > +/* Validate the forcealign inode flag */
> > +xfs_failaddr_t
> > +xfs_inode_validate_forcealign(
> > +	struct xfs_mount	*mp,
> > +	uint32_t		extsize,
> > +	uint32_t		cowextsize,
> > +	uint16_t		mode,
> > +	uint16_t		flags,
> > +	uint64_t		flags2)
> > +{
> > +	bool			rt =  flags & XFS_DIFLAG_REALTIME;
> > +
> > +	/* superblock rocompat feature flag */
> > +	if (!xfs_has_forcealign(mp))
> > +		return __this_address;
> > +
> > +	/* Only regular files and directories */
> > +	if (!S_ISDIR(mode) && !S_ISREG(mode))
> > +		return __this_address;
> > +
> > +	/* We require EXTSIZE or EXTSZINHERIT */
> > +	if (!(flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)))
> > +		return __this_address;
> > +
> > +	/* We require a non-zero extsize */
> > +	if (!extsize)
> > +		return __this_address;
> > +
> > +	/* Reflink'ed disallowed */
> > +	if (flags2 & XFS_DIFLAG2_REFLINK)
> > +		return __this_address;
> 
> Hmm.  If we don't support reflink + forcealign ATM, then shouldn't the
> superblock verifier or xfs_fs_fill_super fail the mount so that old
> kernels won't abruptly emit EFSCORRUPTED errors if a future kernel adds
> support for forcealign'd cow and starts writing out files with both
> iflags set?

I don't think we should error out the mount because reflink and
forcealign are enabled - that's going to be the common configuration
for every user of forcealign, right? I also don't think we should
throw a corruption error if both flags are set, either.

We're making an initial *implementation choice* not to implement the
two features on the same inode at the same time. We are not making a
an on-disk format design decision that says "these two on-disk flags
are incompatible".

IOWs, if both are set on a current kernel, it's not corruption but a
more recent kernel that supports both flags has modified this inode.
Put simply, we have detected a ro-compat situation for this specific
inode.

Looking at it as a ro-compat situation rather then corruption,
what I would suggest we do is this:

1. Warn at mount that reflink+force align inodes will be treated
as ro-compat inodes. i.e. read-only.

2. prevent forcealign from being set if the shared extent flag is
set on the inode.

3. prevent shared extents from being created if the force align flag
is set (i.e. ->remap_file_range() and anything else that relies on
shared extents will fail on forcealign inodes).

4. if we read an inode with both set, we emit a warning and force
the inode to be read only so we don't screw up the force alignment
of the file (i.e. that inode operates in ro-compat mode.)

#1 is the mount time warning of potential ro-compat behaviour.

#2 and #3 prevent both from getting set on existing kernels.

#4 is the ro-compat behaviour that would occur from taking a
filesystem that ran on a newer kernel that supports force-align+COW.
This avoids corruption shutdowns and modifications that would screw
up the alignment of the shared and COW'd extents.

> That said, if the bs>ps patchset lands, then I think forcealign cow is
> a simple matter of setting the min folio order to the forcealign size
> and making sure that we always write out entire folios if any of the
> blocks cached by the folio is shared.  Direct writes to forcealigned
> shared files probably has to be aligned to the forcealign size or fall
> back to buffered writes for cow.

Right, I think all the pieces we will need are slowly falling into
place in the near future, so it doesn't seem right to me to actually
prevent filesystems with reflink and force-align both enabled right
now and then end up with a weird filesystem config needed to use
forcealign just for a couple of kernel releases...

-Dave.
Dave Chinner July 11, 2024, 11:33 p.m. UTC | #5
On Thu, Jul 11, 2024 at 08:17:26AM +0100, John Garry wrote:
> On 11/07/2024 03:59, Darrick J. Wong wrote:
> > On Fri, Jul 05, 2024 at 04:24:44PM +0000, John Garry wrote:
> > > +/* Validate the forcealign inode flag */
> > > +xfs_failaddr_t
> > > +xfs_inode_validate_forcealign(
> > > +	struct xfs_mount	*mp,
> > > +	uint32_t		extsize,
> > > +	uint32_t		cowextsize,
> > > +	uint16_t		mode,
> > > +	uint16_t		flags,
> > > +	uint64_t		flags2)
> > > +{
> > > +	bool			rt =  flags & XFS_DIFLAG_REALTIME;
> > > +
> > > +	/* superblock rocompat feature flag */
> > > +	if (!xfs_has_forcealign(mp))
> > > +		return __this_address;
> > > +
> > > +	/* Only regular files and directories */
> > > +	if (!S_ISDIR(mode) && !S_ISREG(mode))
> > > +		return __this_address;
> > > +
> > > +	/* We require EXTSIZE or EXTSZINHERIT */
> > > +	if (!(flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)))
> > > +		return __this_address;
> > > +
> > > +	/* We require a non-zero extsize */
> > > +	if (!extsize)
> > > +		return __this_address;
> > > +
> > > +	/* Reflink'ed disallowed */
> > > +	if (flags2 & XFS_DIFLAG2_REFLINK)
> > > +		return __this_address;
> > 
> > Hmm.  If we don't support reflink + forcealign ATM, then shouldn't the
> > superblock verifier or xfs_fs_fill_super fail the mount so that old
> > kernels won't abruptly emit EFSCORRUPTED errors if a future kernel adds
> > support for forcealign'd cow and starts writing out files with both
> > iflags set?
> 
> Fine, I see that we do something similar now for rtdev.
>
> However why even have the rt inode check, below, to disallow for reflink cp
> for rt inode (if we can't even mount with rt and reflink together)?

In theory we should be able to have reflink enabled on a filesystem
with an RT device right now - we just can't share extents on a rt
inode.  Extent sharing should till work just fine on non-rt files,
but the overall config is disallowed at mount time because we
haven't ever tested that configuration. I'm not sure that mkfs.xfs
even allows you to make a filesystem of that config....

That said, it's good practice for the ->remap_file_range()
implementation (and anything else using shared extents) to be
explicitly checking for and preventing extent sharing on RT inodes.
THose operations don't support that config, and so should catch any
attempt that is made to do so and error out. It doesn't matter that
we have mount time checks, the checks in the extent sharing code
explicitly document that it doesn't support RT inodes right now...

-Dave.
Christoph Hellwig July 12, 2024, 4:56 a.m. UTC | #6
On Fri, Jul 12, 2024 at 09:20:26AM +1000, Dave Chinner wrote:
> I don't think we should error out the mount because reflink and
> forcealign are enabled - that's going to be the common configuration
> for every user of forcealign, right? I also don't think we should
> throw a corruption error if both flags are set, either.
> 
> We're making an initial *implementation choice* not to implement the
> two features on the same inode at the same time. We are not making a
> an on-disk format design decision that says "these two on-disk flags
> are incompatible".

Oh, right forcealign is per-inode.  In that case we just need to
ensure it never happens.  Which honestly might be a bit confusing if
you can reflink for some files and not others, but that's a separate
discussion.
John Garry July 18, 2024, 8:53 a.m. UTC | #7
On 12/07/2024 00:20, Dave Chinner wrote:
>>> /* Reflink'ed disallowed */
>>> +	if (flags2 & XFS_DIFLAG2_REFLINK)
>>> +		return __this_address;
>> Hmm.  If we don't support reflink + forcealign ATM, then shouldn't the
>> superblock verifier or xfs_fs_fill_super fail the mount so that old
>> kernels won't abruptly emit EFSCORRUPTED errors if a future kernel adds
>> support for forcealign'd cow and starts writing out files with both
>> iflags set?
> I don't think we should error out the mount because reflink and
> forcealign are enabled - that's going to be the common configuration
> for every user of forcealign, right? I also don't think we should
> throw a corruption error if both flags are set, either.
> 
> We're making an initial*implementation choice*  not to implement the
> two features on the same inode at the same time. We are not making a
> an on-disk format design decision that says "these two on-disk flags
> are incompatible".
> 
> IOWs, if both are set on a current kernel, it's not corruption but a
> more recent kernel that supports both flags has modified this inode.
> Put simply, we have detected a ro-compat situation for this specific
> inode.
> 
> Looking at it as a ro-compat situation rather then corruption,
> what I would suggest we do is this:
> 
> 1. Warn at mount that reflink+force align inodes will be treated
> as ro-compat inodes. i.e. read-only.
> 
> 2. prevent forcealign from being set if the shared extent flag is
> set on the inode.
> 
> 3. prevent shared extents from being created if the force align flag
> is set (i.e. ->remap_file_range() and anything else that relies on
> shared extents will fail on forcealign inodes).
> 
> 4. if we read an inode with both set, we emit a warning and force
> the inode to be read only so we don't screw up the force alignment
> of the file (i.e. that inode operates in ro-compat mode.)
> 
> #1 is the mount time warning of potential ro-compat behaviour.
> 
> #2 and #3 prevent both from getting set on existing kernels.
> 
> #4 is the ro-compat behaviour that would occur from taking a
> filesystem that ran on a newer kernel that supports force-align+COW.
> This avoids corruption shutdowns and modifications that would screw
> up the alignment of the shared and COW'd extents.
> 

This seems fine for dealing with forcealign and reflink.

So what about forcealign and RT?

We want to support this config in future, but the current implementation 
will not support it.

In this v2 series, I just disallow a mount for forcealign and RT, 
similar to reflink and RT together.

Furthermore, I am also saying here that still forcealign and RT bits set 
is a valid inode on-disk format and we just have to enforce a 
sb_rextsize to extsize relationship:

xfs_inode_validate_forcealign(
	struct xfs_mount	*mp,
	uint32_t		extsize,
	uint32_t		cowextsize,
	uint16_t		mode,
	uint16_t		flags,
	uint64_t		flags2)
{
	bool			rt =  flags & XFS_DIFLAG_REALTIME;
...


	/* extsize must be a multiple of sb_rextsize for RT */
	if (rt && mp->m_sb.sb_rextsize && extsize % mp->m_sb.sb_rextsize)
		return __this_address;

	return NULL;
}

Is this all ok? Or should we follow similar solution to reflink + 
forcealign?
John Garry July 23, 2024, 10:11 a.m. UTC | #8
On 18/07/2024 09:53, John Garry wrote:
> On 12/07/2024 00:20, Dave Chinner wrote:
>>>> /* Reflink'ed disallowed */
>>>> +    if (flags2 & XFS_DIFLAG2_REFLINK)
>>>> +        return __this_address;
>>> Hmm.  If we don't support reflink + forcealign ATM, then shouldn't the
>>> superblock verifier or xfs_fs_fill_super fail the mount so that old
>>> kernels won't abruptly emit EFSCORRUPTED errors if a future kernel adds
>>> support for forcealign'd cow and starts writing out files with both
>>> iflags set?
>> I don't think we should error out the mount because reflink and
>> forcealign are enabled - that's going to be the common configuration
>> for every user of forcealign, right? I also don't think we should
>> throw a corruption error if both flags are set, either.
>>
>> We're making an initial*implementation choice*  not to implement the
>> two features on the same inode at the same time. We are not making a
>> an on-disk format design decision that says "these two on-disk flags
>> are incompatible".
>>
>> IOWs, if both are set on a current kernel, it's not corruption but a
>> more recent kernel that supports both flags has modified this inode.
>> Put simply, we have detected a ro-compat situation for this specific
>> inode.
>>
>> Looking at it as a ro-compat situation rather then corruption,
>> what I would suggest we do is this:
>>
>> 1. Warn at mount that reflink+force align inodes will be treated
>> as ro-compat inodes. i.e. read-only.

I am looking at something like this to implement read-only for those inodes:

diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 07f736c42460..444a44ccc11c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1132,6 +1132,17 @@ xfs_vn_tmpfile(
  	return finish_open_simple(file, err);
  }

+static int xfs_permission(struct mnt_idmap *d, struct inode *inode, int 
mask)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	if (mask & MAY_WRITE) {
+		if (xfs_is_reflink_inode(ip) && xfs_inode_has_forcealign(ip))
+			return -EACCES;
+	}
+	return generic_permission(d, inode, mask);
+}
+
  static const struct inode_operations xfs_inode_operations = {
  	.get_inode_acl		= xfs_get_acl,
  	.set_acl		= xfs_set_acl,
@@ -1142,6 +1153,7 @@ static const struct inode_operations 
xfs_inode_operations = {
  	.update_time		= xfs_vn_update_time,
  	.fileattr_get		= xfs_fileattr_get,
  	.fileattr_set		= xfs_fileattr_set,
+	.permission		= xfs_permission,
  };

Or how else could this be done? I guess that we have something else in 
the xfs code to implement the equivalent of this, but I did not find it.

>>
>> 2. prevent forcealign from being set if the shared extent flag is
>> set on the inode.

This is just XFS_DIFLAG2_REFLINK flag, right?

>>
>> 3. prevent shared extents from being created if the force align flag
>> is set (i.e. ->remap_file_range() and anything else that relies on
>> shared extents will fail on forcealign inodes).

In this series version I extend the RT check in xfs_reflink_remap_prep() 
to cover forcealign - is that good enough?

>>
>> 4. if we read an inode with both set, we emit a warning and force
>> the inode to be read only so we don't screw up the force alignment
>> of the file (i.e. that inode operates in ro-compat mode.)
>>
>> #1 is the mount time warning of potential ro-compat behaviour.
>>
>> #2 and #3 prevent both from getting set on existing kernels.
>>
>> #4 is the ro-compat behaviour that would occur from taking a
>> filesystem that ran on a newer kernel that supports force-align+COW.
>> This avoids corruption shutdowns and modifications that would screw
>> up the alignment of the shared and COW'd extents.
>>
> 
> This seems fine for dealing with forcealign and reflink.
> 
> So what about forcealign and RT?

Any opinion on this?

> 
> We want to support this config in future, but the current implementation 
> will not support it.
> 
> In this v2 series, I just disallow a mount for forcealign and RT, 
> similar to reflink and RT together.
> 
> Furthermore, I am also saying here that still forcealign and RT bits set 
> is a valid inode on-disk format and we just have to enforce a 
> sb_rextsize to extsize relationship:
> 
> xfs_inode_validate_forcealign(
>      struct xfs_mount    *mp,
>      uint32_t        extsize,
>      uint32_t        cowextsize,
>      uint16_t        mode,
>      uint16_t        flags,
>      uint64_t        flags2)
> {
>      bool            rt =  flags & XFS_DIFLAG_REALTIME;
> ...
> 
> 
>      /* extsize must be a multiple of sb_rextsize for RT */
>      if (rt && mp->m_sb.sb_rextsize && extsize % mp->m_sb.sb_rextsize)
>          return __this_address;

And this? If not, I'll just send v3 with this code as-is.

> 
>      return NULL;
> }
>
Christoph Hellwig July 23, 2024, 2:42 p.m. UTC | #9
On Tue, Jul 23, 2024 at 11:11:28AM +0100, John Garry wrote:
> I am looking at something like this to implement read-only for those inodes:

Yikes.  Treating individual inodes in a file systems as read-only
is about the most confusing and harmful behavior we could do.

Just treat it as any other rocompat feature please an mount the entire
file system read-only if not supported.

Or even better let this wait a little, and work with Darrick to work
on the rextsize > 1 reflіnk patches and just make the thing work.

>> So what about forcealign and RT?
>
> Any opinion on this?

What about forcealign and RT?
John Garry July 23, 2024, 3:01 p.m. UTC | #10
On 23/07/2024 15:42, Christoph Hellwig wrote:
> On Tue, Jul 23, 2024 at 11:11:28AM +0100, John Garry wrote:
>> I am looking at something like this to implement read-only for those inodes:
> 
> Yikes.  Treating individual inodes in a file systems as read-only
> is about the most confusing and harmful behavior we could do.

That was the suggestion which I was given earlier in this thread.

> 
> Just treat it as any other rocompat feature please an mount the entire
> file system read-only if not supported.
> 
> Or even better let this wait a little, and work with Darrick to work
> on the rextsize > 1 reflіnk patches and just make the thing work.

I'll let Darrick comment on this.

> 
>>> So what about forcealign and RT?
>>
>> Any opinion on this?
> 
> What about forcealign and RT?

In this series version I was mounting the whole FS as RO if 
XFS_FEAT_FORCEALIGN and XFS_FEAT_REFLINK was found in the SB. And so 
very different to how I was going to individual treat inodes which 
happen to be forcealign and reflink, above.

So I was asking guidance when whether that approach (for RT and 
forcealign) is sound.
Darrick J. Wong July 23, 2024, 10:26 p.m. UTC | #11
On Tue, Jul 23, 2024 at 04:01:41PM +0100, John Garry wrote:
> On 23/07/2024 15:42, Christoph Hellwig wrote:
> > On Tue, Jul 23, 2024 at 11:11:28AM +0100, John Garry wrote:
> > > I am looking at something like this to implement read-only for those inodes:
> > 
> > Yikes.  Treating individual inodes in a file systems as read-only
> > is about the most confusing and harmful behavior we could do.
> 
> That was the suggestion which I was given earlier in this thread.

Well, Christoph and I suggested failing the mount /earlier/ in this
thread. ;)

> > 
> > Just treat it as any other rocompat feature please an mount the entire
> > file system read-only if not supported.
> > 
> > Or even better let this wait a little, and work with Darrick to work
> > on the rextsize > 1 reflіnk patches and just make the thing work.
> 
> I'll let Darrick comment on this.

COW with alloc_unit > fsblock is not currently possible, whether it's
forcealign or rtreflink because COW must happen at allocation unit
granularity.  Pure overwrites don't need all these twists and turns.

1. For COW to work, each write/page_mkwrite must mark dirty every
fsblock in the entire alloc unit.  Those fsblocks could be cached by
multiple folios, which means (in iomap terms) dirtying each block in
potentially multiple iomap_folio_state structures, as well as their
folios.

2. Similarly, writeback must then be able to issue IO in quantities that
are aligned to allocation units.  IOWs, for every dirty region in the
file, we'd have to find the folios for a given allocation unit, mark
them all for writeback, and issue bios for however much we managed to
do.  If it's not possible to grab a folio, then the entire allocation
unit can't be written out, which implies that writeback can fail to
fully clean folios.

3. Alternately I suppose we could track the number of folios undergoing
writeback for each allocation unit, issue the writeback ios whenever
we're ready, and only remap the allocation unit when the number of
folios undergoing writeback for that allocation unit reaches zero.

If we could get the mapping_set_folio_order patch merged, then we could
at least get partial support for power-of-two alloc_unit > fsblock
configurations by setting the minimum folio order to log2(alloc_unit).
For atomic writes this is probably a hard requirement because we must be
able to submit one bio with one memory region.

For everyone else this sucks because cranking up the min folio order
reduces the flexibility that the page cache can have in finding cache
memory... but until someone figures out how to make the batching work,
there's not much progress to be made.

For non power-of-two alloc_unit we can't just crank up the min folio
order because there will always be misalignments somewhere; we need a
full writeback batching implementation that can handle multiple folios
per alloc unit and partial folio writeback.

djwong-dev implements 1.  It partially handles 2 by enlarging the wbc
range to be aligned to allocation units, but it doesn't guarantee that
all the folios actually got tagged for the batch.  It can't do 3, which
means that it's probably broken if you press it hard enough.

Alternately we could disallow non power-of-two everywhere, which would
make the accounting simpler but that's a regression against ye olde xfs
which supports non power-of-two allocation units.

rtreflink is nowhere near ready to go -- it's still in djwong-wtf behind
metadata directories, rtgroups, realtime rmap, and (probably) hch's
zns patches.

> > > > So what about forcealign and RT?
> > > 
> > > Any opinion on this?
> > 
> > What about forcealign and RT?
> 
> In this series version I was mounting the whole FS as RO if
> XFS_FEAT_FORCEALIGN and XFS_FEAT_REFLINK was found in the SB. And so very
> different to how I was going to individual treat inodes which happen to be
> forcealign and reflink, above.
> 
> So I was asking guidance when whether that approach (for RT and forcealign)
> is sound.

I reiterate: don't allow mounting of (forcealign && reflink) or
(forcealign && rtextsize > 1) filesystems, and then you and I can work
on figuring out the rest.

--D
Dave Chinner July 23, 2024, 11:38 p.m. UTC | #12
On Thu, Jul 18, 2024 at 09:53:14AM +0100, John Garry wrote:
> On 12/07/2024 00:20, Dave Chinner wrote:
> > > > /* Reflink'ed disallowed */
> > > > +	if (flags2 & XFS_DIFLAG2_REFLINK)
> > > > +		return __this_address;
> > > Hmm.  If we don't support reflink + forcealign ATM, then shouldn't the
> > > superblock verifier or xfs_fs_fill_super fail the mount so that old
> > > kernels won't abruptly emit EFSCORRUPTED errors if a future kernel adds
> > > support for forcealign'd cow and starts writing out files with both
> > > iflags set?
> > I don't think we should error out the mount because reflink and
> > forcealign are enabled - that's going to be the common configuration
> > for every user of forcealign, right? I also don't think we should
> > throw a corruption error if both flags are set, either.
> > 
> > We're making an initial*implementation choice*  not to implement the
> > two features on the same inode at the same time. We are not making a
> > an on-disk format design decision that says "these two on-disk flags
> > are incompatible".
> > 
> > IOWs, if both are set on a current kernel, it's not corruption but a
> > more recent kernel that supports both flags has modified this inode.
> > Put simply, we have detected a ro-compat situation for this specific
> > inode.
> > 
> > Looking at it as a ro-compat situation rather then corruption,
> > what I would suggest we do is this:
> > 
> > 1. Warn at mount that reflink+force align inodes will be treated
> > as ro-compat inodes. i.e. read-only.
> > 
> > 2. prevent forcealign from being set if the shared extent flag is
> > set on the inode.
> > 
> > 3. prevent shared extents from being created if the force align flag
> > is set (i.e. ->remap_file_range() and anything else that relies on
> > shared extents will fail on forcealign inodes).
> > 
> > 4. if we read an inode with both set, we emit a warning and force
> > the inode to be read only so we don't screw up the force alignment
> > of the file (i.e. that inode operates in ro-compat mode.)
> > 
> > #1 is the mount time warning of potential ro-compat behaviour.
> > 
> > #2 and #3 prevent both from getting set on existing kernels.
> > 
> > #4 is the ro-compat behaviour that would occur from taking a
> > filesystem that ran on a newer kernel that supports force-align+COW.
> > This avoids corruption shutdowns and modifications that would screw
> > up the alignment of the shared and COW'd extents.
> > 
> 
> This seems fine for dealing with forcealign and reflink.
> 
> So what about forcealign and RT?
> 
> We want to support this config in future, but the current implementation
> will not support it.

What's the problem with supporting it right from the start? We
already support forcealign for RT, just it's a global config 
under the "big rt alloc" moniker rather than a per-inode flag.

Like all on-disk format change based features,
forcealign should add the EXPERIMENTAL flag to the filesystem for a
couple of releases after merge, so there will be plenty of time to
test both data and rt dev functionality before removing the
EXPERIMENTAL flag from it.

So why not just enable the per-inode flag with RT right from the
start given that this functionality is supposed to work and be
globally supported by the rtdev right now? It seems like a whole lot
less work to just enable it for RT now than it is to disable it...

> In this v2 series, I just disallow a mount for forcealign and RT, similar to
> reflink and RT together.
> 
> Furthermore, I am also saying here that still forcealign and RT bits set is
> a valid inode on-disk format and we just have to enforce a sb_rextsize to
> extsize relationship:
> 
> xfs_inode_validate_forcealign(
> 	struct xfs_mount	*mp,
> 	uint32_t		extsize,
> 	uint32_t		cowextsize,
> 	uint16_t		mode,
> 	uint16_t		flags,
> 	uint64_t		flags2)
> {
> 	bool			rt =  flags & XFS_DIFLAG_REALTIME;
> ...
> 
> 
> 	/* extsize must be a multiple of sb_rextsize for RT */
> 	if (rt && mp->m_sb.sb_rextsize && extsize % mp->m_sb.sb_rextsize)
> 		return __this_address;
> 
> 	return NULL;
> }

I suspect the logic needs tweaking, but why not just do this right
from the start?

-Dave.
Darrick J. Wong July 24, 2024, 12:04 a.m. UTC | #13
On Wed, Jul 24, 2024 at 09:38:18AM +1000, Dave Chinner wrote:
> On Thu, Jul 18, 2024 at 09:53:14AM +0100, John Garry wrote:
> > On 12/07/2024 00:20, Dave Chinner wrote:
> > > > > /* Reflink'ed disallowed */
> > > > > +	if (flags2 & XFS_DIFLAG2_REFLINK)
> > > > > +		return __this_address;
> > > > Hmm.  If we don't support reflink + forcealign ATM, then shouldn't the
> > > > superblock verifier or xfs_fs_fill_super fail the mount so that old
> > > > kernels won't abruptly emit EFSCORRUPTED errors if a future kernel adds
> > > > support for forcealign'd cow and starts writing out files with both
> > > > iflags set?
> > > I don't think we should error out the mount because reflink and
> > > forcealign are enabled - that's going to be the common configuration
> > > for every user of forcealign, right? I also don't think we should
> > > throw a corruption error if both flags are set, either.
> > > 
> > > We're making an initial*implementation choice*  not to implement the
> > > two features on the same inode at the same time. We are not making a
> > > an on-disk format design decision that says "these two on-disk flags
> > > are incompatible".
> > > 
> > > IOWs, if both are set on a current kernel, it's not corruption but a
> > > more recent kernel that supports both flags has modified this inode.
> > > Put simply, we have detected a ro-compat situation for this specific
> > > inode.
> > > 
> > > Looking at it as a ro-compat situation rather then corruption,
> > > what I would suggest we do is this:
> > > 
> > > 1. Warn at mount that reflink+force align inodes will be treated
> > > as ro-compat inodes. i.e. read-only.
> > > 
> > > 2. prevent forcealign from being set if the shared extent flag is
> > > set on the inode.
> > > 
> > > 3. prevent shared extents from being created if the force align flag
> > > is set (i.e. ->remap_file_range() and anything else that relies on
> > > shared extents will fail on forcealign inodes).
> > > 
> > > 4. if we read an inode with both set, we emit a warning and force
> > > the inode to be read only so we don't screw up the force alignment
> > > of the file (i.e. that inode operates in ro-compat mode.)
> > > 
> > > #1 is the mount time warning of potential ro-compat behaviour.
> > > 
> > > #2 and #3 prevent both from getting set on existing kernels.
> > > 
> > > #4 is the ro-compat behaviour that would occur from taking a
> > > filesystem that ran on a newer kernel that supports force-align+COW.
> > > This avoids corruption shutdowns and modifications that would screw
> > > up the alignment of the shared and COW'd extents.
> > > 
> > 
> > This seems fine for dealing with forcealign and reflink.
> > 
> > So what about forcealign and RT?
> > 
> > We want to support this config in future, but the current implementation
> > will not support it.
> 
> What's the problem with supporting it right from the start? We
> already support forcealign for RT, just it's a global config 
> under the "big rt alloc" moniker rather than a per-inode flag.
> 
> Like all on-disk format change based features,
> forcealign should add the EXPERIMENTAL flag to the filesystem for a
> couple of releases after merge, so there will be plenty of time to
> test both data and rt dev functionality before removing the
> EXPERIMENTAL flag from it.
> 
> So why not just enable the per-inode flag with RT right from the
> start given that this functionality is supposed to work and be
> globally supported by the rtdev right now? It seems like a whole lot
> less work to just enable it for RT now than it is to disable it...

What needs to be done to the rt allocator, anyway?

I think it's mostly turning off the fallback to unaligned allocation,
just like what was done for the data device allocator, right?  And
possibly tweaking whatever this does:

	/*
	 * Only bother calculating a real prod factor if offset & length are
	 * perfectly aligned, otherwise it will just get us in trouble.
	 */
	div_u64_rem(ap->offset, align, &mod);
	if (mod || ap->length % align) {
		prod = 1;
	} else {
		prod = xfs_extlen_to_rtxlen(mp, align);
		if (prod > 1)
			xfs_rtalloc_align_minmax(&raminlen, &ralen, &prod);
	}


> > In this v2 series, I just disallow a mount for forcealign and RT, similar to
> > reflink and RT together.
> > 
> > Furthermore, I am also saying here that still forcealign and RT bits set is
> > a valid inode on-disk format and we just have to enforce a sb_rextsize to
> > extsize relationship:
> > 
> > xfs_inode_validate_forcealign(
> > 	struct xfs_mount	*mp,
> > 	uint32_t		extsize,
> > 	uint32_t		cowextsize,
> > 	uint16_t		mode,
> > 	uint16_t		flags,
> > 	uint64_t		flags2)
> > {
> > 	bool			rt =  flags & XFS_DIFLAG_REALTIME;
> > ...
> > 
> > 
> > 	/* extsize must be a multiple of sb_rextsize for RT */
> > 	if (rt && mp->m_sb.sb_rextsize && extsize % mp->m_sb.sb_rextsize)
> > 		return __this_address;
> > 
> > 	return NULL;
> > }
> 
> I suspect the logic needs tweaking, but why not just do this right
> from the start?

Do we even allow (i_extsize % mp->m_sb.sb_rextsize) != 0 for realtime
files?  I didn't think we did.

--D

> 
> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
John Garry July 24, 2024, 7:39 a.m. UTC | #14
On 24/07/2024 00:38, Dave Chinner wrote:
> What's the problem with supporting it right from the start? 

Simply because I wanted to focus on regular data volume support first.

> We
> already support forcealign for RT, just it's a global config
> under the "big rt alloc" moniker rather than a per-inode flag.
> 
> Like all on-disk format change based features,
> forcealign should add the EXPERIMENTAL flag to the filesystem for a
> couple of releases after merge, so there will be plenty of time to
> test both data and rt dev functionality before removing the
> EXPERIMENTAL flag from it.
> 
> So why not just enable the per-inode flag with RT right from the
> start given that this functionality is supposed to work and be
> globally supported by the rtdev right now? It seems like a whole lot
> less work to just enable it for RT now than it is to disable it...

I'll have a look... if it really is that easy, then - yes - we can have 
it from the start.
John Garry July 24, 2024, 6:50 p.m. UTC | #15
On 24/07/2024 01:04, Darrick J. Wong wrote:
>> So why not just enable the per-inode flag with RT right from the
>> start given that this functionality is supposed to work and be
>> globally supported by the rtdev right now? It seems like a whole lot
>> less work to just enable it for RT now than it is to disable it...
> What needs to be done to the rt allocator, anyway?
> 
> I think it's mostly turning off the fallback to unaligned allocation,
> just like what was done for the data device allocator, right?  And
> possibly tweaking whatever this does:
> 
> 	/*
> 	 * Only bother calculating a real prod factor if offset & length are
> 	 * perfectly aligned, otherwise it will just get us in trouble.
> 	 */
> 	div_u64_rem(ap->offset, align, &mod);
> 	if (mod || ap->length % align) {
> 		prod = 1;
> 	} else {
> 		prod = xfs_extlen_to_rtxlen(mp, align);
> 		if (prod > 1)
> 			xfs_rtalloc_align_minmax(&raminlen, &ralen, &prod);
> 	}
> 
> 

My initial impression is that calling xfs_bmap_rtalloc() -> 
xfs_rtpick_extent() for XFS_ALLOC_INITIAL_USER_DATA won't always give an 
aligned extent. However the rest of the allocator paths are giving 
extents aligned as requested - that is from limited testing.

And we would need to not take the xfs_bmap_rtalloc() retry fallback for 
-ENOSPC when align > rtextsize, but I have not hit that yet - maybe 
because xfs_trans_reserve() stops us getting to this point due to lack 
of free rtextents.
John Garry July 26, 2024, 2:14 p.m. UTC | #16
> 
>>>>> So what about forcealign and RT?
>>>> Any opinion on this?
>>> What about forcealign and RT?
>> In this series version I was mounting the whole FS as RO if
>> XFS_FEAT_FORCEALIGN and XFS_FEAT_REFLINK was found in the SB. And so very
>> different to how I was going to individual treat inodes which happen to be
>> forcealign and reflink, above.
>>
>> So I was asking guidance when whether that approach (for RT and forcealign)
>> is sound.
> I reiterate: don't allow mounting of (forcealign && reflink) or
> (forcealign && rtextsize > 1) filesystems, and then you and I can work
> on figuring out the rest.

I'm fine with that approach for forcealign && reflink (no mounting).

As for forcealign && rtextsize > 1 it seems to be working for me. That 
is with not too many changes, so maybe we can go with this support 
initially. Personally I'd rather not, as testing may be spread too thin. 
Anyway, I'll send the patches early next week and we can make the 
judgement then.
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 61f51becff4f..b48cd75d34a6 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -353,6 +353,7 @@  xfs_sb_has_compat_feature(
 #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)		/* reverse map btree */
 #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
 #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3)		/* inobt block counts */
+#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30)	/* aligned file data extents */
 #define XFS_SB_FEAT_RO_COMPAT_ALL \
 		(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
 		 XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
@@ -1094,16 +1095,19 @@  static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
 #define XFS_DIFLAG2_COWEXTSIZE_BIT   2  /* copy on write extent size hint */
 #define XFS_DIFLAG2_BIGTIME_BIT	3	/* big timestamps */
 #define XFS_DIFLAG2_NREXT64_BIT 4	/* large extent counters */
+/* data extent mappings for regular files must be aligned to extent size hint */
+#define XFS_DIFLAG2_FORCEALIGN_BIT 5
 
 #define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT)
 #define XFS_DIFLAG2_REFLINK     (1 << XFS_DIFLAG2_REFLINK_BIT)
 #define XFS_DIFLAG2_COWEXTSIZE  (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
 #define XFS_DIFLAG2_BIGTIME	(1 << XFS_DIFLAG2_BIGTIME_BIT)
 #define XFS_DIFLAG2_NREXT64	(1 << XFS_DIFLAG2_NREXT64_BIT)
+#define XFS_DIFLAG2_FORCEALIGN	(1 << XFS_DIFLAG2_FORCEALIGN_BIT)
 
 #define XFS_DIFLAG2_ANY \
 	(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
-	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
+	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN)
 
 static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
 {
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 513b50da6215..5c61a1d1bb2b 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -657,6 +657,15 @@  xfs_dinode_verify(
 	    !xfs_has_bigtime(mp))
 		return __this_address;
 
+	if (flags2 & XFS_DIFLAG2_FORCEALIGN) {
+		fa = xfs_inode_validate_forcealign(mp,
+				be32_to_cpu(dip->di_extsize),
+				be32_to_cpu(dip->di_cowextsize),
+				mode, flags, flags2);
+		if (fa)
+			return fa;
+	}
+
 	return NULL;
 }
 
@@ -824,3 +833,49 @@  xfs_inode_validate_cowextsize(
 
 	return NULL;
 }
+
+/* Validate the forcealign inode flag */
+xfs_failaddr_t
+xfs_inode_validate_forcealign(
+	struct xfs_mount	*mp,
+	uint32_t		extsize,
+	uint32_t		cowextsize,
+	uint16_t		mode,
+	uint16_t		flags,
+	uint64_t		flags2)
+{
+	bool			rt =  flags & XFS_DIFLAG_REALTIME;
+
+	/* superblock rocompat feature flag */
+	if (!xfs_has_forcealign(mp))
+		return __this_address;
+
+	/* Only regular files and directories */
+	if (!S_ISDIR(mode) && !S_ISREG(mode))
+		return __this_address;
+
+	/* We require EXTSIZE or EXTSZINHERIT */
+	if (!(flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)))
+		return __this_address;
+
+	/* We require a non-zero extsize */
+	if (!extsize)
+		return __this_address;
+
+	/* Reflink'ed disallowed */
+	if (flags2 & XFS_DIFLAG2_REFLINK)
+		return __this_address;
+
+	/* COW extsize disallowed */
+	if (flags2 & XFS_DIFLAG2_COWEXTSIZE)
+		return __this_address;
+
+	if (cowextsize)
+		return __this_address;
+
+	/* extsize must be a multiple of sb_rextsize for RT */
+	if (rt && mp->m_sb.sb_rextsize && extsize % mp->m_sb.sb_rextsize)
+		return __this_address;
+
+	return NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 585ed5a110af..b8b65287b037 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -33,6 +33,9 @@  xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp,
 xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp,
 		uint32_t cowextsize, uint16_t mode, uint16_t flags,
 		uint64_t flags2);
+xfs_failaddr_t xfs_inode_validate_forcealign(struct xfs_mount *mp,
+		uint32_t extsize, uint32_t cowextsize, uint16_t mode,
+		uint16_t flags, uint64_t flags2);
 
 static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv)
 {
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 6b56f0f6d4c1..e56911553edd 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -164,6 +164,8 @@  xfs_sb_version_to_features(
 		features |= XFS_FEAT_REFLINK;
 	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
 		features |= XFS_FEAT_INOBTCNT;
+	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN)
+		features |= XFS_FEAT_FORCEALIGN;
 	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
 		features |= XFS_FEAT_FTYPE;
 	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a4e3cd8971fc..caffd4c75179 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -609,6 +609,8 @@  xfs_ip2xflags(
 			flags |= FS_XFLAG_DAX;
 		if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
 			flags |= FS_XFLAG_COWEXTSIZE;
+		if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN)
+			flags |= FS_XFLAG_FORCEALIGN;
 	}
 
 	if (xfs_inode_has_attr_fork(ip))
@@ -738,6 +740,8 @@  xfs_inode_inherit_flags2(
 	}
 	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
 		ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+	if (pip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN)
+		ip->i_diflags2 |= XFS_DIFLAG2_FORCEALIGN;
 
 	/* Don't let invalid cowextsize hints propagate. */
 	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
@@ -746,6 +750,15 @@  xfs_inode_inherit_flags2(
 		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
 		ip->i_cowextsize = 0;
 	}
+
+	if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN) {
+		failaddr = xfs_inode_validate_forcealign(ip->i_mount,
+				ip->i_extsize, ip->i_cowextsize,
+				VFS_I(ip)->i_mode, ip->i_diflags,
+				ip->i_diflags2);
+		if (failaddr)
+			ip->i_diflags2 &= ~XFS_DIFLAG2_FORCEALIGN;
+	}
 }
 
 /*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 42f999c1106c..536e646dd055 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -301,6 +301,16 @@  static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip)
 	return ip->i_cowfp && ip->i_cowfp->if_bytes;
 }
 
+static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
+{
+	return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
+}
+
+static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+{
+	return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
+}
+
 static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
 {
 	return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME;
@@ -313,7 +323,15 @@  static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
 
 static inline bool xfs_inode_has_forcealign(struct xfs_inode *ip)
 {
-	return false;
+	if (!(ip->i_diflags & XFS_DIFLAG_EXTSIZE))
+		return false;
+	if (ip->i_extsize <= 1)
+		return false;
+	if (xfs_is_cow_inode(ip))
+		return false;
+	if (ip->i_diflags & XFS_DIFLAG_REALTIME)
+		return false;
+	return ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN;
 }
 
 /*
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index f0117188f302..771ef3954f4e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -525,10 +525,48 @@  xfs_flags2diflags2(
 		di_flags2 |= XFS_DIFLAG2_DAX;
 	if (xflags & FS_XFLAG_COWEXTSIZE)
 		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+	if (xflags & FS_XFLAG_FORCEALIGN)
+		di_flags2 |= XFS_DIFLAG2_FORCEALIGN;
 
 	return di_flags2;
 }
 
+/*
+ * Forcealign requires a non-zero extent size hint and a zero cow
+ * extent size hint.  Don't allow set for RT files yet.
+ */
+static int
+xfs_ioctl_setattr_forcealign(
+	struct xfs_inode	*ip,
+	struct fileattr		*fa)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (!xfs_has_forcealign(mp))
+		return -EINVAL;
+
+	if (xfs_is_reflink_inode(ip))
+		return -EINVAL;
+
+	if (!(fa->fsx_xflags & (FS_XFLAG_EXTSIZE |
+				FS_XFLAG_EXTSZINHERIT)))
+		return -EINVAL;
+
+	if (fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)
+		return -EINVAL;
+
+	if (!fa->fsx_extsize)
+		return -EINVAL;
+
+	if (fa->fsx_cowextsize)
+		return -EINVAL;
+
+	if (fa->fsx_xflags & FS_XFLAG_REALTIME)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int
 xfs_ioctl_setattr_xflags(
 	struct xfs_trans	*tp,
@@ -537,10 +575,13 @@  xfs_ioctl_setattr_xflags(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	bool			rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME);
+	bool			forcealign = fa->fsx_xflags & FS_XFLAG_FORCEALIGN;
 	uint64_t		i_flags2;
+	int			error;
 
-	if (rtflag != XFS_IS_REALTIME_INODE(ip)) {
-		/* Can't change realtime flag if any extents are allocated. */
+	/* Can't change RT or forcealign flags if any extents are allocated. */
+	if (rtflag != XFS_IS_REALTIME_INODE(ip) ||
+	    forcealign != xfs_inode_has_forcealign(ip)) {
 		if (ip->i_df.if_nextents || ip->i_delayed_blks)
 			return -EINVAL;
 	}
@@ -561,6 +602,12 @@  xfs_ioctl_setattr_xflags(
 	if (i_flags2 && !xfs_has_v3inodes(mp))
 		return -EINVAL;
 
+	if (forcealign) {
+		error = xfs_ioctl_setattr_forcealign(ip, fa);
+		if (error)
+			return error;
+	}
+
 	ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
 	ip->i_diflags2 = i_flags2;
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d0567dfbc036..30228fea908d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -299,6 +299,7 @@  typedef struct xfs_mount {
 #define XFS_FEAT_NEEDSREPAIR	(1ULL << 25)	/* needs xfs_repair */
 #define XFS_FEAT_NREXT64	(1ULL << 26)	/* large extent counters */
 #define XFS_FEAT_EXCHANGE_RANGE	(1ULL << 27)	/* exchange range */
+#define XFS_FEAT_FORCEALIGN	(1ULL << 28)	/* aligned file data extents */
 
 /* Mount features */
 #define XFS_FEAT_NOATTR2	(1ULL << 48)	/* disable attr2 creation */
@@ -385,6 +386,7 @@  __XFS_ADD_V4_FEAT(projid32, PROJID32)
 __XFS_HAS_V4_FEAT(v3inodes, V3INODES)
 __XFS_HAS_V4_FEAT(crc, CRC)
 __XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
+__XFS_HAS_FEAT(forcealign, FORCEALIGN)
 
 /*
  * Mount features
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 265a2a418bc7..8da293e8bfa2 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1467,8 +1467,9 @@  xfs_reflink_remap_prep(
 
 	/* Check file eligibility and prepare for block sharing. */
 	ret = -EINVAL;
-	/* Don't reflink realtime inodes */
-	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
+	/* Don't reflink realtime or forcealign inodes */
+	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest) ||
+	    xfs_inode_has_forcealign(src) || xfs_inode_has_forcealign(dest))
 		goto out_unlock;
 
 	/* Don't share DAX file data with non-DAX file. */
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 65c5dfe17ecf..fb55e4ce49fa 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -6,16 +6,6 @@ 
 #ifndef __XFS_REFLINK_H
 #define __XFS_REFLINK_H 1
 
-static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
-{
-	return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
-}
-
-static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
-{
-	return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
-}
-
 extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
 		struct xfs_bmbt_irec *irec, bool *shared);
 int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 27e9f749c4c7..4f0c77f38de1 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1721,6 +1721,17 @@  xfs_fs_fill_super(
 		mp->m_features &= ~XFS_FEAT_DISCARD;
 	}
 
+	if (xfs_has_forcealign(mp)) {
+		if (xfs_has_realtime(mp)) {
+			xfs_alert(mp,
+	"forcealign not compatible with realtime device!");
+			error = -EINVAL;
+			goto out_filestream_unmount;
+		}
+		xfs_warn(mp,
+"EXPERIMENTAL forced data extent alignment feature in use. Use at your own risk!");
+	}
+
 	if (xfs_has_reflink(mp)) {
 		if (mp->m_sb.sb_rblocks) {
 			xfs_alert(mp,
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 45e4e64fd664..8af304c0e29a 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -158,6 +158,8 @@  struct fsxattr {
 #define FS_XFLAG_FILESTREAM	0x00004000	/* use filestream allocator */
 #define FS_XFLAG_DAX		0x00008000	/* use DAX for IO */
 #define FS_XFLAG_COWEXTSIZE	0x00010000	/* CoW extent size allocator hint */
+/* data extent mappings for regular files must be aligned to extent size hint */
+#define FS_XFLAG_FORCEALIGN	0x00020000
 #define FS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
 
 /* the read-only stuff doesn't really belong here, but any other place is