diff mbox series

[v3,08/21] xfs: Introduce FORCEALIGN inode flag

Message ID 20240429174746.2132161-9-john.g.garry@oracle.com (mailing list archive)
State Superseded, archived
Headers show
Series block atomic writes for XFS | expand

Commit Message

John Garry April 29, 2024, 5:47 p.m. UTC
From: "Darrick J. Wong" <djwong@kernel.org>

Add a new inode flag to require that all file data extent mappings must
be aligned (both the file offset range and the allocated space itself)
to the extent size hint.  Having a separate COW extent size hint is no
longer allowed.

The goal here is to enable sysadmins and users to mandate that all space
mappings in a file must have a startoff/blockcount that are aligned to
(say) a 2MB alignment and that the startblock/blockcount will follow the
same alignment.

jpg: Enforce extsize is a power-of-2 and aligned with afgsize + stripe
     alignment for forcealign
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Co-developed-by: John Garry <john.g.garry@oracle.com>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/xfs/libxfs/xfs_format.h    |  6 ++++-
 fs/xfs/libxfs/xfs_inode_buf.c | 50 +++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_inode_buf.h |  3 +++
 fs/xfs/libxfs/xfs_sb.c        |  2 ++
 fs/xfs/xfs_inode.c            | 12 +++++++++
 fs/xfs/xfs_inode.h            |  2 +-
 fs/xfs/xfs_ioctl.c            | 34 +++++++++++++++++++++++-
 fs/xfs/xfs_mount.h            |  2 ++
 fs/xfs/xfs_super.c            |  4 +++
 include/uapi/linux/fs.h       |  2 ++
 10 files changed, 114 insertions(+), 3 deletions(-)

Comments

Dave Chinner April 30, 2024, 11:22 p.m. UTC | #1
On Mon, Apr 29, 2024 at 05:47:33PM +0000, John Garry wrote:
> From: "Darrick J. Wong" <djwong@kernel.org>
> 
> Add a new inode flag to require that all file data extent mappings must
> be aligned (both the file offset range and the allocated space itself)
> to the extent size hint.  Having a separate COW extent size hint is no
> longer allowed.
> 
> The goal here is to enable sysadmins and users to mandate that all space
> mappings in a file must have a startoff/blockcount that are aligned to
> (say) a 2MB alignment and that the startblock/blockcount will follow the
> same alignment.
> 
> jpg: Enforce extsize is a power-of-2 and aligned with afgsize + stripe
>      alignment for forcealign
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> Co-developed-by: John Garry <john.g.garry@oracle.com>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---

....

> @@ -783,3 +791,45 @@ xfs_inode_validate_cowextsize(
>  
>  	return NULL;
>  }
> +
> +/* Validate the forcealign inode flag */
> +xfs_failaddr_t
> +xfs_inode_validate_forcealign(
> +	struct xfs_mount	*mp,
> +	uint16_t		mode,

	umode_t			mode,

> +	uint16_t		flags,
> +	uint32_t		extsize,
> +	uint32_t		cowextsize)

extent sizes are xfs_extlen_t types.

> +{
> +	/* superblock rocompat feature flag */
> +	if (!xfs_has_forcealign(mp))
> +		return __this_address;
> +
> +	/* Only regular files and directories */
> +	if (!S_ISDIR(mode) && !S_ISREG(mode))
> +		return __this_address;
> +
> +	/* Doesn't apply to realtime files */
> +	if (flags & XFS_DIFLAG_REALTIME)
> +		return __this_address;

Why not? A rt device with an extsize of 1 fsb could make use of
forced alignment just like the data device to allow larger atomic
writes to be done. I mean, just because we haven't written the code
to do this yet doesn't mean it is an illegal on-disk format state.

> +	/* Requires a non-zero power-of-2 extent size hint */
> +	if (extsize == 0 || !is_power_of_2(extsize) ||
> +	    (mp->m_sb.sb_agblocks % extsize))
> +		return __this_address;

Please do these as indiviual checks with their own fail address.
That way we can tell which check failed from the console output.
Also, the agblocks check is already split out below, so it's being
checked twice...

Also, why does force-align require a power-of-2 extent size? Why
does it require the extent size to be an exact divisor of the AG
size? Aren't these atomic write alignment restrictions? i.e.
shouldn't these only be enforced when the atomic writes inode flag
is set?

> +	/* Requires agsize be a multiple of extsize */
> +	if (mp->m_sb.sb_agblocks % extsize)
> +		return __this_address;
> +
> +	/* Requires stripe unit+width (if set) be a multiple of extsize */
> +	if ((mp->m_dalign && (mp->m_dalign % extsize)) ||
> +	    (mp->m_swidth && (mp->m_swidth % extsize)))
> +		return __this_address;

Again, this is an atomic write constraint, isn't it?

> +	/* Requires no cow extent size hint */
> +	if (cowextsize != 0)
> +		return __this_address;

What if it's a reflinked file?

.....

> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index d0e2cec6210d..d1126509ceb9 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -1110,6 +1110,8 @@ xfs_flags2diflags2(
>  		di_flags2 |= XFS_DIFLAG2_DAX;
>  	if (xflags & FS_XFLAG_COWEXTSIZE)
>  		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
> +	if (xflags & FS_XFLAG_FORCEALIGN)
> +		di_flags2 |= XFS_DIFLAG2_FORCEALIGN;
>  
>  	return di_flags2;
>  }
> @@ -1146,6 +1148,22 @@ xfs_ioctl_setattr_xflags(
>  	if (i_flags2 && !xfs_has_v3inodes(mp))
>  		return -EINVAL;
>  
> +	/*
> +	 * Force-align requires a nonzero extent size hint and a zero cow
> +	 * extent size hint.  It doesn't apply to realtime files.
> +	 */
> +	if (fa->fsx_xflags & FS_XFLAG_FORCEALIGN) {
> +		if (!xfs_has_forcealign(mp))
> +			return -EINVAL;
> +		if (fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)
> +			return -EINVAL;
> +		if (!(fa->fsx_xflags & (FS_XFLAG_EXTSIZE |
> +					FS_XFLAG_EXTSZINHERIT)))
> +			return -EINVAL;
> +		if (fa->fsx_xflags & FS_XFLAG_REALTIME)
> +			return -EINVAL;
> +	}

What about if the file already has shared extents on it (i.e.
reflinked or deduped?)

Also, why is this getting checked here instead of in
xfs_ioctl_setattr_check_extsize()?


> @@ -1263,7 +1283,19 @@ xfs_ioctl_setattr_check_extsize(
>  	failaddr = xfs_inode_validate_extsize(ip->i_mount,
>  			XFS_B_TO_FSB(mp, fa->fsx_extsize),
>  			VFS_I(ip)->i_mode, new_diflags);
> -	return failaddr != NULL ? -EINVAL : 0;
> +	if (failaddr)
> +		return -EINVAL;
> +
> +	if (new_diflags2 & XFS_DIFLAG2_FORCEALIGN) {
> +		failaddr = xfs_inode_validate_forcealign(ip->i_mount,
> +				VFS_I(ip)->i_mode, new_diflags,
> +				XFS_B_TO_FSB(mp, fa->fsx_extsize),
> +				XFS_B_TO_FSB(mp, fa->fsx_cowextsize));
> +		if (failaddr)
> +			return -EINVAL;
> +	}

Oh, it's because you're trying to use on-disk format validation
routines for user API validation. That, IMO, is a bad idea because
the on-disk format and kernel/user APIs should not be tied
together as they have different constraints and error conditions.

That also explains why xfs_inode_validate_forcealign() doesn't just
get passed the inode to validate - it's because you want to pass
information from the user API to it. This results in sub-optimal
code for both on-disk format validation and user API validation.

Can you please separate these and put all the force align user API
validation checks in the one function?

-Dave.
John Garry May 1, 2024, 10:03 a.m. UTC | #2
>> +/* Validate the forcealign inode flag */
>> +xfs_failaddr_t
>> +xfs_inode_validate_forcealign(
>> +	struct xfs_mount	*mp,
>> +	uint16_t		mode,
> 
> 	umode_t			mode,

ok. BTW, other functions like xfs_inode_validate_extsize() use uint16_t

> 
>> +	uint16_t		flags,
>> +	uint32_t		extsize,
>> +	uint32_t		cowextsize)
> 
> extent sizes are xfs_extlen_t types.

ok

> 
>> +{
>> +	/* superblock rocompat feature flag */
>> +	if (!xfs_has_forcealign(mp))
>> +		return __this_address;
>> +
>> +	/* Only regular files and directories */
>> +	if (!S_ISDIR(mode) && !S_ISREG(mode))
>> +		return __this_address;
>> +
>> +	/* Doesn't apply to realtime files */
>> +	if (flags & XFS_DIFLAG_REALTIME)
>> +		return __this_address;
> 
> Why not? A rt device with an extsize of 1 fsb could make use of
> forced alignment just like the data device to allow larger atomic
> writes to be done. I mean, just because we haven't written the code
> to do this yet doesn't mean it is an illegal on-disk format state.

ok, so where is a better place to disallow forcealign for RT now (since 
we have not written the code to support it nor verified it)?

> 
>> +	/* Requires a non-zero power-of-2 extent size hint */
>> +	if (extsize == 0 || !is_power_of_2(extsize) ||
>> +	    (mp->m_sb.sb_agblocks % extsize))
>> +		return __this_address;
> 
> Please do these as indiviual checks with their own fail address.

ok

> That way we can tell which check failed from the console output.
> Also, the agblocks check is already split out below, so it's being
> checked twice...
> 
> Also, why does force-align require a power-of-2 extent size? Why
> does it require the extent size to be an exact divisor of the AG
> size? Aren't these atomic write alignment restrictions? i.e.
> shouldn't these only be enforced when the atomic writes inode flag
> is set?

With regards the power-of-2 restriction, I think that the code changes 
are going to become a lot more complex if we don't enforce this for 
forcealign.

For example, consider xfs_file_dio_write(), where we check for an 
unaligned write based on forcealign extent mask. It's much simpler to 
rely on a power-of-2 size. And same for iomap extent zeroing.

So then it can be asked, for what reason do we want to support 
unorthodox, non-power-of-2 sizes? Who would want this?

As for AG size, again I think that it is required to be aligned to the 
forcealign extsize. As I remember, when converting from an FSB to a DB, 
if the AG itself is not aligned to the forcealign extsize, then the DB 
will not be aligned to the forcealign extsize. More below...

> 
>> +	/* Requires agsize be a multiple of extsize */
>> +	if (mp->m_sb.sb_agblocks % extsize)
>> +		return __this_address;
>> +
>> +	/* Requires stripe unit+width (if set) be a multiple of extsize */
>> +	if ((mp->m_dalign && (mp->m_dalign % extsize)) ||
>> +	    (mp->m_swidth && (mp->m_swidth % extsize)))
>> +		return __this_address;
> 
> Again, this is an atomic write constraint, isn't it?

So why do we want forcealign? It is to only align extent FSBs? Or to 
align extents to DBs? I would have thought the latter. If so, it seems 
sensible to do this check also.

> 
>> +	/* Requires no cow extent size hint */
>> +	if (cowextsize != 0)
>> +		return __this_address;
> 
> What if it's a reflinked file?

Yeah, I think that we want to disallow that.

> 
> .....
> 
>> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
>> index d0e2cec6210d..d1126509ceb9 100644
>> --- a/fs/xfs/xfs_ioctl.c
>> +++ b/fs/xfs/xfs_ioctl.c
>> @@ -1110,6 +1110,8 @@ xfs_flags2diflags2(
>>   		di_flags2 |= XFS_DIFLAG2_DAX;
>>   	if (xflags & FS_XFLAG_COWEXTSIZE)
>>   		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
>> +	if (xflags & FS_XFLAG_FORCEALIGN)
>> +		di_flags2 |= XFS_DIFLAG2_FORCEALIGN;
>>   
>>   	return di_flags2;
>>   }
>> @@ -1146,6 +1148,22 @@ xfs_ioctl_setattr_xflags(
>>   	if (i_flags2 && !xfs_has_v3inodes(mp))
>>   		return -EINVAL;
>>   
>> +	/*
>> +	 * Force-align requires a nonzero extent size hint and a zero cow
>> +	 * extent size hint.  It doesn't apply to realtime files.
>> +	 */
>> +	if (fa->fsx_xflags & FS_XFLAG_FORCEALIGN) {
>> +		if (!xfs_has_forcealign(mp))
>> +			return -EINVAL;
>> +		if (fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)
>> +			return -EINVAL;
>> +		if (!(fa->fsx_xflags & (FS_XFLAG_EXTSIZE |
>> +					FS_XFLAG_EXTSZINHERIT)))
>> +			return -EINVAL;
>> +		if (fa->fsx_xflags & FS_XFLAG_REALTIME)
>> +			return -EINVAL;
>> +	}
> 
> What about if the file already has shared extents on it (i.e.
> reflinked or deduped?)

At the top of the function we have this check for RT:

	if (rtflag != XFS_IS_REALTIME_INODE(ip)) {
		/* Can't change realtime flag if any extents are allocated. */
		if (ip->i_df.if_nextents || ip->i_delayed_blks)
			return -EINVAL;
	}

Would expanding that check for forcealign also suffice? Indeed, later in 
this series I expanded this check to cover atomicwrites (when I really 
intended it for forcealign).

> 
> Also, why is this getting checked here instead of in
> xfs_ioctl_setattr_check_extsize()?
> 
> 
>> @@ -1263,7 +1283,19 @@ xfs_ioctl_setattr_check_extsize(
>>   	failaddr = xfs_inode_validate_extsize(ip->i_mount,
>>   			XFS_B_TO_FSB(mp, fa->fsx_extsize),
>>   			VFS_I(ip)->i_mode, new_diflags);
>> -	return failaddr != NULL ? -EINVAL : 0;
>> +	if (failaddr)
>> +		return -EINVAL;
>> +
>> +	if (new_diflags2 & XFS_DIFLAG2_FORCEALIGN) {
>> +		failaddr = xfs_inode_validate_forcealign(ip->i_mount,
>> +				VFS_I(ip)->i_mode, new_diflags,
>> +				XFS_B_TO_FSB(mp, fa->fsx_extsize),
>> +				XFS_B_TO_FSB(mp, fa->fsx_cowextsize));
>> +		if (failaddr)
>> +			return -EINVAL;
>> +	}
> 
> Oh, it's because you're trying to use on-disk format validation
> routines for user API validation. That, IMO, is a bad idea because
> the on-disk format and kernel/user APIs should not be tied
> together as they have different constraints and error conditions.
> 
> That also explains why xfs_inode_validate_forcealign() doesn't just
> get passed the inode to validate - it's because you want to pass
> information from the user API to it. This results in sub-optimal
> code for both on-disk format validation and user API validation.
> 
> Can you please separate these and put all the force align user API
> validation checks in the one function?
> 

ok, fine. But it would be good to have clarification on function of 
forcealign, above, i.e. does it always align extents to disk blocks?

Thanks,
John
Dave Chinner May 2, 2024, 12:50 a.m. UTC | #3
On Wed, May 01, 2024 at 11:03:06AM +0100, John Garry wrote:
> 
> > > +/* Validate the forcealign inode flag */
> > > +xfs_failaddr_t
> > > +xfs_inode_validate_forcealign(
> > > +	struct xfs_mount	*mp,
> > > +	uint16_t		mode,
> > 
> > 	umode_t			mode,
> 
> ok. BTW, other functions like xfs_inode_validate_extsize() use uint16_t
> 
> > 
> > > +	uint16_t		flags,
> > > +	uint32_t		extsize,
> > > +	uint32_t		cowextsize)
> > 
> > extent sizes are xfs_extlen_t types.
> 
> ok
> 
> > 
> > > +{
> > > +	/* superblock rocompat feature flag */
> > > +	if (!xfs_has_forcealign(mp))
> > > +		return __this_address;
> > > +
> > > +	/* Only regular files and directories */
> > > +	if (!S_ISDIR(mode) && !S_ISREG(mode))
> > > +		return __this_address;
> > > +
> > > +	/* Doesn't apply to realtime files */
> > > +	if (flags & XFS_DIFLAG_REALTIME)
> > > +		return __this_address;
> > 
> > Why not? A rt device with an extsize of 1 fsb could make use of
> > forced alignment just like the data device to allow larger atomic
> > writes to be done. I mean, just because we haven't written the code
> > to do this yet doesn't mean it is an illegal on-disk format state.
> 
> ok, so where is a better place to disallow forcealign for RT now (since we
> have not written the code to support it nor verified it)?

Just don't allow it to be set in the setattr ioctl if the inode is
RT. ANd don't let an inode be marked RT if forcealign is already
set.

> 
> > 
> > > +	/* Requires a non-zero power-of-2 extent size hint */
> > > +	if (extsize == 0 || !is_power_of_2(extsize) ||
> > > +	    (mp->m_sb.sb_agblocks % extsize))
> > > +		return __this_address;
> > 
> > Please do these as indiviual checks with their own fail address.
> 
> ok
> 
> > That way we can tell which check failed from the console output.
> > Also, the agblocks check is already split out below, so it's being
> > checked twice...
> > 
> > Also, why does force-align require a power-of-2 extent size? Why
> > does it require the extent size to be an exact divisor of the AG
> > size? Aren't these atomic write alignment restrictions? i.e.
> > shouldn't these only be enforced when the atomic writes inode flag
> > is set?
> 
> With regards the power-of-2 restriction, I think that the code changes are
> going to become a lot more complex if we don't enforce this for forcealign.
> 
> For example, consider xfs_file_dio_write(), where we check for an unaligned
> write based on forcealign extent mask. It's much simpler to rely on a
> power-of-2 size. And same for iomap extent zeroing.

But it's not more complex - we already do this non-power-of-2
alignment stuff for all the realtime code, so it's just a matter
of not blindly using bit masking in alignment checks.

> So then it can be asked, for what reason do we want to support unorthodox,
> non-power-of-2 sizes? Who would want this?

I'm constantly surprised by the way people use stuff like this
filesystem and storage alignment constraints are not arbitrarily
limited to power-of-2 sizes.

For example, code implementation is simple in RAID setups when you
use power-of-2 chunk sizes and stripe widths. But not all storage
hardware fits power-of-2 configs like 4+1, 4+2, 8+1, 8+2, etc. THis
is pretty common - 2.5" 2U drive trays have 24 drive bays. If you
want to give up 33% of the storage capacity just to use power-of-2
stripe widths then you would use 4x4+2 RAID6 luns. However, most
people don't want to waste that much money on redundancy. They are
much more likely to use 2x10+2 RAID6 luns or 1x21+2 with a hot spare
to maximise the data storage capacity.

If someone wants to force-align allocation to stripe widths on such
a RAID array config rather than trying to rely on the best effort
swalloc mount option, then they need non-power-of-2
alignments to be supported.

It's pretty much a no-brainer - the alignment code already handles
non-power-of-2 alignments, and it's not very much additional code to
ensure we can handle any alignment the user specified.

> As for AG size, again I think that it is required to be aligned to the
> forcealign extsize. As I remember, when converting from an FSB to a DB, if
> the AG itself is not aligned to the forcealign extsize, then the DB will not
> be aligned to the forcealign extsize. More below...
> 
> > 
> > > +	/* Requires agsize be a multiple of extsize */
> > > +	if (mp->m_sb.sb_agblocks % extsize)
> > > +		return __this_address;
> > > +
> > > +	/* Requires stripe unit+width (if set) be a multiple of extsize */
> > > +	if ((mp->m_dalign && (mp->m_dalign % extsize)) ||
> > > +	    (mp->m_swidth && (mp->m_swidth % extsize)))
> > > +		return __this_address;
> > 
> > Again, this is an atomic write constraint, isn't it?
> 
> So why do we want forcealign? It is to only align extent FSBs?

Yes. forced alignment is essentially just extent size guarantees.

This is part of what is needed for atomic writes, but atomic writes
also require specific physical storage alignment between the
filesystem and the device. The filesystem setup has to correctly
align AGs to the physical storage, and stuff like RAID
configurations need to be specifically compatible with the atomic
write capabilities of the underlying hardware.

None of these hardware iand storage stack alignment constraints have
any relevance to the filesystem forced alignment functionality. They
are completely indepedent. All the forced alignment does is
guarantees that allocation is aligned according the extent size hint
on the inode or it fails with ENOSPC.

> > > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> > > index d0e2cec6210d..d1126509ceb9 100644
> > > --- a/fs/xfs/xfs_ioctl.c
> > > +++ b/fs/xfs/xfs_ioctl.c
> > > @@ -1110,6 +1110,8 @@ xfs_flags2diflags2(
> > >   		di_flags2 |= XFS_DIFLAG2_DAX;
> > >   	if (xflags & FS_XFLAG_COWEXTSIZE)
> > >   		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
> > > +	if (xflags & FS_XFLAG_FORCEALIGN)
> > > +		di_flags2 |= XFS_DIFLAG2_FORCEALIGN;
> > >   	return di_flags2;
> > >   }
> > > @@ -1146,6 +1148,22 @@ xfs_ioctl_setattr_xflags(
> > >   	if (i_flags2 && !xfs_has_v3inodes(mp))
> > >   		return -EINVAL;
> > > +	/*
> > > +	 * Force-align requires a nonzero extent size hint and a zero cow
> > > +	 * extent size hint.  It doesn't apply to realtime files.
> > > +	 */
> > > +	if (fa->fsx_xflags & FS_XFLAG_FORCEALIGN) {
> > > +		if (!xfs_has_forcealign(mp))
> > > +			return -EINVAL;
> > > +		if (fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)
> > > +			return -EINVAL;
> > > +		if (!(fa->fsx_xflags & (FS_XFLAG_EXTSIZE |
> > > +					FS_XFLAG_EXTSZINHERIT)))
> > > +			return -EINVAL;
> > > +		if (fa->fsx_xflags & FS_XFLAG_REALTIME)
> > > +			return -EINVAL;
> > > +	}
> > 
> > What about if the file already has shared extents on it (i.e.
> > reflinked or deduped?)
> 
> At the top of the function we have this check for RT:
> 
> 	if (rtflag != XFS_IS_REALTIME_INODE(ip)) {
> 		/* Can't change realtime flag if any extents are allocated. */
> 		if (ip->i_df.if_nextents || ip->i_delayed_blks)
> 			return -EINVAL;
> 	}
> 
> Would expanding that check for forcealign also suffice? Indeed, later in
> this series I expanded this check to cover atomicwrites (when I really
> intended it for forcealign).

For the moment, yes.

> > > @@ -1263,7 +1283,19 @@ xfs_ioctl_setattr_check_extsize(
> > >   	failaddr = xfs_inode_validate_extsize(ip->i_mount,
> > >   			XFS_B_TO_FSB(mp, fa->fsx_extsize),
> > >   			VFS_I(ip)->i_mode, new_diflags);
> > > -	return failaddr != NULL ? -EINVAL : 0;
> > > +	if (failaddr)
> > > +		return -EINVAL;
> > > +
> > > +	if (new_diflags2 & XFS_DIFLAG2_FORCEALIGN) {
> > > +		failaddr = xfs_inode_validate_forcealign(ip->i_mount,
> > > +				VFS_I(ip)->i_mode, new_diflags,
> > > +				XFS_B_TO_FSB(mp, fa->fsx_extsize),
> > > +				XFS_B_TO_FSB(mp, fa->fsx_cowextsize));
> > > +		if (failaddr)
> > > +			return -EINVAL;
> > > +	}
> > 
> > Oh, it's because you're trying to use on-disk format validation
> > routines for user API validation. That, IMO, is a bad idea because
> > the on-disk format and kernel/user APIs should not be tied
> > together as they have different constraints and error conditions.
> > 
> > That also explains why xfs_inode_validate_forcealign() doesn't just
> > get passed the inode to validate - it's because you want to pass
> > information from the user API to it. This results in sub-optimal
> > code for both on-disk format validation and user API validation.
> > 
> > Can you please separate these and put all the force align user API
> > validation checks in the one function?
> > 
> 
> ok, fine. But it would be good to have clarification on function of
> forcealign, above, i.e. does it always align extents to disk blocks?

No, it doesn't. XFS has never done this - physical extent alignment
is always done relative to the start of the AG, not the underlying
disk geometry.

IOWs, forced alignement is not aligning to disk blocks at all - it
is aligning extents logically to file offset and physically to the
offset from the start of the allocation group.  Hence there are no
real constraints on forced alignment - we can do any sort of
alignment as long it is smaller than half the max size of a physical
extent.

For allocation to then be aligned to physical storage, we need mkfs
to physically align the start of each AG to the geometry of the
underlying storage. We already do this for filesystems with a stripe
unit defined, hence stripe aligned allocation is physically aligned
to the underlying storage.

However, if mkfs doesn't get the physical layout of AGs right, there
is nothing the mounted filesystem can do to guarantee extent
allocation is aligned to physical disk blocks regardless of whether
forced alignment is enabled or not...

-Dave.
John Garry May 2, 2024, 7:56 a.m. UTC | #4
On 02/05/2024 01:50, Dave Chinner wrote:
>> For example, consider xfs_file_dio_write(), where we check for an unaligned
>> write based on forcealign extent mask. It's much simpler to rely on a
>> power-of-2 size. And same for iomap extent zeroing.
> But it's not more complex - we already do this non-power-of-2
> alignment stuff for all the realtime code, so it's just a matter
> of not blindly using bit masking in alignment checks.
> 
>> So then it can be asked, for what reason do we want to support unorthodox,
>> non-power-of-2 sizes? Who would want this?
> I'm constantly surprised by the way people use stuff like this
> filesystem and storage alignment constraints are not arbitrarily
> limited to power-of-2 sizes.
> 
> For example, code implementation is simple in RAID setups when you
> use power-of-2 chunk sizes and stripe widths. But not all storage
> hardware fits power-of-2 configs like 4+1, 4+2, 8+1, 8+2, etc. THis
> is pretty common - 2.5" 2U drive trays have 24 drive bays. If you
> want to give up 33% of the storage capacity just to use power-of-2
> stripe widths then you would use 4x4+2 RAID6 luns. However, most
> people don't want to waste that much money on redundancy. They are
> much more likely to use 2x10+2 RAID6 luns or 1x21+2 with a hot spare
> to maximise the data storage capacity.

Thanks for sharing this info

> 
> If someone wants to force-align allocation to stripe widths on such
> a RAID array config rather than trying to rely on the best effort
> swalloc mount option, then they need non-power-of-2
> alignments to be supported.
> 
> It's pretty much a no-brainer - the alignment code already handles
> non-power-of-2 alignments, and it's not very much additional code to
> ensure we can handle any alignment the user specified.

ok, fine

> 
>> As for AG size, again I think that it is required to be aligned to the
>> forcealign extsize. As I remember, when converting from an FSB to a DB, if
>> the AG itself is not aligned to the forcealign extsize, then the DB will not
>> be aligned to the forcealign extsize. More below...
>>
>>>> +	/* Requires agsize be a multiple of extsize */
>>>> +	if (mp->m_sb.sb_agblocks % extsize)
>>>> +		return __this_address;
>>>> +
>>>> +	/* Requires stripe unit+width (if set) be a multiple of extsize */
>>>> +	if ((mp->m_dalign && (mp->m_dalign % extsize)) ||
>>>> +	    (mp->m_swidth && (mp->m_swidth % extsize)))
>>>> +		return __this_address;
>>> Again, this is an atomic write constraint, isn't it?
>> So why do we want forcealign? It is to only align extent FSBs?
> Yes. forced alignment is essentially just extent size guarantees.
> 
> This is part of what is needed for atomic writes, but atomic writes
> also require specific physical storage alignment between the
> filesystem and the device. The filesystem setup has to correctly
> align AGs to the physical storage, and stuff like RAID
> configurations need to be specifically compatible with the atomic
> write capabilities of the underlying hardware.
> 
> None of these hardware iand storage stack alignment constraints have
> any relevance to the filesystem forced alignment functionality. They
> are completely indepedent. All the forced alignment does is
> guarantees that allocation is aligned according the extent size hint
> on the inode or it fails with ENOSPC.

Fine, so only for atomic writes we just need to ensure FSBs are aligned 
to DBs.

And so it is the responsibility of mkfs to ensure AG size aligns to any 
forcealign extsize specified and also disk atomic write geometry.

For atomic write only, it is the responsibility of the kernel to check 
the forcealign extsize is compatible with any stripe alignment and AG size.

>>>
>>> Can you please separate these and put all the force align user API
>>> validation checks in the one function?
>>>
>> ok, fine. But it would be good to have clarification on function of
>> forcealign, above, i.e. does it always align extents to disk blocks?
> No, it doesn't. XFS has never done this - physical extent alignment
> is always done relative to the start of the AG, not the underlying
> disk geometry.
> 
> IOWs, forced alignement is not aligning to disk blocks at all - it
> is aligning extents logically to file offset and physically to the
> offset from the start of the allocation group.  Hence there are no
> real constraints on forced alignment - we can do any sort of
> alignment as long it is smaller than half the max size of a physical
> extent.
> 
> For allocation to then be aligned to physical storage, we need mkfs
> to physically align the start of each AG to the geometry of the
> underlying storage. We already do this for filesystems with a stripe
> unit defined, hence stripe aligned allocation is physically aligned
> to the underlying storage.

Sure

> 
> However, if mkfs doesn't get the physical layout of AGs right, there
> is nothing the mounted filesystem can do to guarantee extent
> allocation is aligned to physical disk blocks regardless of whether
> forced alignment is enabled or not...

ok, understood.

Thanks,
John
Long Li June 12, 2024, 2:10 a.m. UTC | #5
On Mon, Apr 29, 2024 at 05:47:33PM +0000, John Garry wrote:
> From: "Darrick J. Wong" <djwong@kernel.org>
> 
> Add a new inode flag to require that all file data extent mappings must
> be aligned (both the file offset range and the allocated space itself)
> to the extent size hint.  Having a separate COW extent size hint is no
> longer allowed.
> 
> The goal here is to enable sysadmins and users to mandate that all space
> mappings in a file must have a startoff/blockcount that are aligned to
> (say) a 2MB alignment and that the startblock/blockcount will follow the
> same alignment.
> 
> jpg: Enforce extsize is a power-of-2 and aligned with afgsize + stripe
>      alignment for forcealign
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> Co-developed-by: John Garry <john.g.garry@oracle.com>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/libxfs/xfs_format.h    |  6 ++++-
>  fs/xfs/libxfs/xfs_inode_buf.c | 50 +++++++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_inode_buf.h |  3 +++
>  fs/xfs/libxfs/xfs_sb.c        |  2 ++
>  fs/xfs/xfs_inode.c            | 12 +++++++++
>  fs/xfs/xfs_inode.h            |  2 +-
>  fs/xfs/xfs_ioctl.c            | 34 +++++++++++++++++++++++-
>  fs/xfs/xfs_mount.h            |  2 ++
>  fs/xfs/xfs_super.c            |  4 +++
>  include/uapi/linux/fs.h       |  2 ++
>  10 files changed, 114 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> index 2b2f9050fbfb..4dd295b047f8 100644
> --- a/fs/xfs/libxfs/xfs_format.h
> +++ b/fs/xfs/libxfs/xfs_format.h
> @@ -353,6 +353,7 @@ xfs_sb_has_compat_feature(
>  #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)		/* reverse map btree */
>  #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
>  #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3)		/* inobt block counts */
> +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30)	/* aligned file data extents */
 
Hi, John

You know I've been using and testing your atomic writes patch series recently,
and I'm particularly interested in the changes to the on-disk format. I noticed
that XFS_SB_FEAT_RO_COMPAT_FORCEALIGN uses bit 30 instead of bit 4, which would
be the next available bit in sequence.

I'm wondering if using bit 30 is just a temporary solution to avoid conflicts, 
and if the plan is to eventually use bits sequentially, for example, using bit 4?
I'm looking forward to your explanation.

Thanks,
Long Li
John Garry June 12, 2024, 6:55 a.m. UTC | #6
On 12/06/2024 03:10, Long Li wrote:
> On Mon, Apr 29, 2024 at 05:47:33PM +0000, John Garry wrote:
>> From: "Darrick J. Wong"<djwong@kernel.org>
>>
>> Add a new inode flag to require that all file data extent mappings must
>> be aligned (both the file offset range and the allocated space itself)
>> to the extent size hint.  Having a separate COW extent size hint is no
>> longer allowed.
>>
>> The goal here is to enable sysadmins and users to mandate that all space
>> mappings in a file must have a startoff/blockcount that are aligned to
>> (say) a 2MB alignment and that the startblock/blockcount will follow the
>> same alignment.
>>
>> jpg: Enforce extsize is a power-of-2 and aligned with afgsize + stripe
>>       alignment for forcealign
>> Signed-off-by: "Darrick J. Wong"<djwong@kernel.org>
>> Co-developed-by: John Garry<john.g.garry@oracle.com>
>> Signed-off-by: John Garry<john.g.garry@oracle.com>
>> ---
>>   fs/xfs/libxfs/xfs_format.h    |  6 ++++-
>>   fs/xfs/libxfs/xfs_inode_buf.c | 50 +++++++++++++++++++++++++++++++++++
>>   fs/xfs/libxfs/xfs_inode_buf.h |  3 +++
>>   fs/xfs/libxfs/xfs_sb.c        |  2 ++
>>   fs/xfs/xfs_inode.c            | 12 +++++++++
>>   fs/xfs/xfs_inode.h            |  2 +-
>>   fs/xfs/xfs_ioctl.c            | 34 +++++++++++++++++++++++-
>>   fs/xfs/xfs_mount.h            |  2 ++
>>   fs/xfs/xfs_super.c            |  4 +++
>>   include/uapi/linux/fs.h       |  2 ++
>>   10 files changed, 114 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
>> index 2b2f9050fbfb..4dd295b047f8 100644
>> --- a/fs/xfs/libxfs/xfs_format.h
>> +++ b/fs/xfs/libxfs/xfs_format.h
>> @@ -353,6 +353,7 @@ xfs_sb_has_compat_feature(
>>   #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)		/* reverse map btree */
>>   #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
>>   #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3)		/* inobt block counts */
>> +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30)	/* aligned file data extents */
>   
> Hi, John
> 
> You know I've been using and testing your atomic writes patch series recently,
> and I'm particularly interested in the changes to the on-disk format. I noticed
> that XFS_SB_FEAT_RO_COMPAT_FORCEALIGN uses bit 30 instead of bit 4, which would
> be the next available bit in sequence.
> 
> I'm wondering if using bit 30 is just a temporary solution to avoid conflicts,
> and if the plan is to eventually use bits sequentially, for example, using bit 4?
> I'm looking forward to your explanation.

I really don't know. I'm looking through the history and it has been 
like that this the start of my source control records.

Maybe it was a copy-and-paste error from XFS_FEAT_FORCEALIGN, whose 
value has changed since.

Anyway, I'll ask a bit more internally, and I'll look to change to (1 << 
4) if ok.

Thanks,
John
Darrick J. Wong June 12, 2024, 3:43 p.m. UTC | #7
On Wed, Jun 12, 2024 at 07:55:31AM +0100, John Garry wrote:
> On 12/06/2024 03:10, Long Li wrote:
> > On Mon, Apr 29, 2024 at 05:47:33PM +0000, John Garry wrote:
> > > From: "Darrick J. Wong"<djwong@kernel.org>
> > > 
> > > Add a new inode flag to require that all file data extent mappings must
> > > be aligned (both the file offset range and the allocated space itself)
> > > to the extent size hint.  Having a separate COW extent size hint is no
> > > longer allowed.
> > > 
> > > The goal here is to enable sysadmins and users to mandate that all space
> > > mappings in a file must have a startoff/blockcount that are aligned to
> > > (say) a 2MB alignment and that the startblock/blockcount will follow the
> > > same alignment.
> > > 
> > > jpg: Enforce extsize is a power-of-2 and aligned with afgsize + stripe
> > >       alignment for forcealign
> > > Signed-off-by: "Darrick J. Wong"<djwong@kernel.org>
> > > Co-developed-by: John Garry<john.g.garry@oracle.com>
> > > Signed-off-by: John Garry<john.g.garry@oracle.com>
> > > ---
> > >   fs/xfs/libxfs/xfs_format.h    |  6 ++++-
> > >   fs/xfs/libxfs/xfs_inode_buf.c | 50 +++++++++++++++++++++++++++++++++++
> > >   fs/xfs/libxfs/xfs_inode_buf.h |  3 +++
> > >   fs/xfs/libxfs/xfs_sb.c        |  2 ++
> > >   fs/xfs/xfs_inode.c            | 12 +++++++++
> > >   fs/xfs/xfs_inode.h            |  2 +-
> > >   fs/xfs/xfs_ioctl.c            | 34 +++++++++++++++++++++++-
> > >   fs/xfs/xfs_mount.h            |  2 ++
> > >   fs/xfs/xfs_super.c            |  4 +++
> > >   include/uapi/linux/fs.h       |  2 ++
> > >   10 files changed, 114 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> > > index 2b2f9050fbfb..4dd295b047f8 100644
> > > --- a/fs/xfs/libxfs/xfs_format.h
> > > +++ b/fs/xfs/libxfs/xfs_format.h
> > > @@ -353,6 +353,7 @@ xfs_sb_has_compat_feature(
> > >   #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)		/* reverse map btree */
> > >   #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
> > >   #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3)		/* inobt block counts */
> > > +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30)	/* aligned file data extents */
> > Hi, John
> > 
> > You know I've been using and testing your atomic writes patch series recently,
> > and I'm particularly interested in the changes to the on-disk format. I noticed
> > that XFS_SB_FEAT_RO_COMPAT_FORCEALIGN uses bit 30 instead of bit 4, which would
> > be the next available bit in sequence.
> > 
> > I'm wondering if using bit 30 is just a temporary solution to avoid conflicts,
> > and if the plan is to eventually use bits sequentially, for example, using bit 4?
> > I'm looking forward to your explanation.
> 
> I really don't know. I'm looking through the history and it has been like
> that this the start of my source control records.
> 
> Maybe it was a copy-and-paste error from XFS_FEAT_FORCEALIGN, whose value
> has changed since.
> 
> Anyway, I'll ask a bit more internally, and I'll look to change to (1 << 4)
> if ok.

I tend to use upper bits for ondisk features that are still under
development so that (a) there won't be collisions with other features
getting merged and (b) after the feature I'm working on gets merged, any
old fs images in my zoo will no longer mount.

--D

> Thanks,
> John
>
Long Li June 13, 2024, 2:04 a.m. UTC | #8
On Wed, Jun 12, 2024 at 08:43:42AM -0700, Darrick J. Wong wrote:
> On Wed, Jun 12, 2024 at 07:55:31AM +0100, John Garry wrote:
> > On 12/06/2024 03:10, Long Li wrote:
> > > On Mon, Apr 29, 2024 at 05:47:33PM +0000, John Garry wrote:
> > > > From: "Darrick J. Wong"<djwong@kernel.org>
> > > > 
> > > > Add a new inode flag to require that all file data extent mappings must
> > > > be aligned (both the file offset range and the allocated space itself)
> > > > to the extent size hint.  Having a separate COW extent size hint is no
> > > > longer allowed.
> > > > 
> > > > The goal here is to enable sysadmins and users to mandate that all space
> > > > mappings in a file must have a startoff/blockcount that are aligned to
> > > > (say) a 2MB alignment and that the startblock/blockcount will follow the
> > > > same alignment.
> > > > 
> > > > jpg: Enforce extsize is a power-of-2 and aligned with afgsize + stripe
> > > >       alignment for forcealign
> > > > Signed-off-by: "Darrick J. Wong"<djwong@kernel.org>
> > > > Co-developed-by: John Garry<john.g.garry@oracle.com>
> > > > Signed-off-by: John Garry<john.g.garry@oracle.com>
> > > > ---
> > > >   fs/xfs/libxfs/xfs_format.h    |  6 ++++-
> > > >   fs/xfs/libxfs/xfs_inode_buf.c | 50 +++++++++++++++++++++++++++++++++++
> > > >   fs/xfs/libxfs/xfs_inode_buf.h |  3 +++
> > > >   fs/xfs/libxfs/xfs_sb.c        |  2 ++
> > > >   fs/xfs/xfs_inode.c            | 12 +++++++++
> > > >   fs/xfs/xfs_inode.h            |  2 +-
> > > >   fs/xfs/xfs_ioctl.c            | 34 +++++++++++++++++++++++-
> > > >   fs/xfs/xfs_mount.h            |  2 ++
> > > >   fs/xfs/xfs_super.c            |  4 +++
> > > >   include/uapi/linux/fs.h       |  2 ++
> > > >   10 files changed, 114 insertions(+), 3 deletions(-)
> > > > 
> > > > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> > > > index 2b2f9050fbfb..4dd295b047f8 100644
> > > > --- a/fs/xfs/libxfs/xfs_format.h
> > > > +++ b/fs/xfs/libxfs/xfs_format.h
> > > > @@ -353,6 +353,7 @@ xfs_sb_has_compat_feature(
> > > >   #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)		/* reverse map btree */
> > > >   #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
> > > >   #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3)		/* inobt block counts */
> > > > +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30)	/* aligned file data extents */
> > > Hi, John
> > > 
> > > You know I've been using and testing your atomic writes patch series recently,
> > > and I'm particularly interested in the changes to the on-disk format. I noticed
> > > that XFS_SB_FEAT_RO_COMPAT_FORCEALIGN uses bit 30 instead of bit 4, which would
> > > be the next available bit in sequence.
> > > 
> > > I'm wondering if using bit 30 is just a temporary solution to avoid conflicts,
> > > and if the plan is to eventually use bits sequentially, for example, using bit 4?
> > > I'm looking forward to your explanation.
> > 
> > I really don't know. I'm looking through the history and it has been like
> > that this the start of my source control records.
> > 
> > Maybe it was a copy-and-paste error from XFS_FEAT_FORCEALIGN, whose value
> > has changed since.
> > 
> > Anyway, I'll ask a bit more internally, and I'll look to change to (1 << 4)
> > if ok.
> 
> I tend to use upper bits for ondisk features that are still under
> development so that (a) there won't be collisions with other features
> getting merged and (b) after the feature I'm working on gets merged, any
> old fs images in my zoo will no longer mount.
> 

I get it, thank you very much for your response.
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 2b2f9050fbfb..4dd295b047f8 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -353,6 +353,7 @@  xfs_sb_has_compat_feature(
 #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)		/* reverse map btree */
 #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
 #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3)		/* inobt block counts */
+#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30)	/* aligned file data extents */
 #define XFS_SB_FEAT_RO_COMPAT_ALL \
 		(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
 		 XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
@@ -1084,16 +1085,19 @@  static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
 #define XFS_DIFLAG2_COWEXTSIZE_BIT   2  /* copy on write extent size hint */
 #define XFS_DIFLAG2_BIGTIME_BIT	3	/* big timestamps */
 #define XFS_DIFLAG2_NREXT64_BIT 4	/* large extent counters */
+/* data extent mappings for regular files must be aligned to extent size hint */
+#define XFS_DIFLAG2_FORCEALIGN_BIT 5
 
 #define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT)
 #define XFS_DIFLAG2_REFLINK     (1 << XFS_DIFLAG2_REFLINK_BIT)
 #define XFS_DIFLAG2_COWEXTSIZE  (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
 #define XFS_DIFLAG2_BIGTIME	(1 << XFS_DIFLAG2_BIGTIME_BIT)
 #define XFS_DIFLAG2_NREXT64	(1 << XFS_DIFLAG2_NREXT64_BIT)
+#define XFS_DIFLAG2_FORCEALIGN	(1 << XFS_DIFLAG2_FORCEALIGN_BIT)
 
 #define XFS_DIFLAG2_ANY \
 	(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
-	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
+	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN)
 
 static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
 {
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index d0dcce462bf4..12f128f12824 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -616,6 +616,14 @@  xfs_dinode_verify(
 	    !xfs_has_bigtime(mp))
 		return __this_address;
 
+	if (flags2 & XFS_DIFLAG2_FORCEALIGN) {
+		fa = xfs_inode_validate_forcealign(mp, mode, flags,
+				be32_to_cpu(dip->di_extsize),
+				be32_to_cpu(dip->di_cowextsize));
+		if (fa)
+			return fa;
+	}
+
 	return NULL;
 }
 
@@ -783,3 +791,45 @@  xfs_inode_validate_cowextsize(
 
 	return NULL;
 }
+
+/* Validate the forcealign inode flag */
+xfs_failaddr_t
+xfs_inode_validate_forcealign(
+	struct xfs_mount	*mp,
+	uint16_t		mode,
+	uint16_t		flags,
+	uint32_t		extsize,
+	uint32_t		cowextsize)
+{
+	/* superblock rocompat feature flag */
+	if (!xfs_has_forcealign(mp))
+		return __this_address;
+
+	/* Only regular files and directories */
+	if (!S_ISDIR(mode) && !S_ISREG(mode))
+		return __this_address;
+
+	/* Doesn't apply to realtime files */
+	if (flags & XFS_DIFLAG_REALTIME)
+		return __this_address;
+
+	/* Requires a non-zero power-of-2 extent size hint */
+	if (extsize == 0 || !is_power_of_2(extsize) ||
+	    (mp->m_sb.sb_agblocks % extsize))
+		return __this_address;
+
+	/* Requires agsize be a multiple of extsize */
+	if (mp->m_sb.sb_agblocks % extsize)
+		return __this_address;
+
+	/* Requires stripe unit+width (if set) be a multiple of extsize */
+	if ((mp->m_dalign && (mp->m_dalign % extsize)) ||
+	    (mp->m_swidth && (mp->m_swidth % extsize)))
+		return __this_address;
+
+	/* Requires no cow extent size hint */
+	if (cowextsize != 0)
+		return __this_address;
+
+	return NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 585ed5a110af..50db17d22b68 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -33,6 +33,9 @@  xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp,
 xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp,
 		uint32_t cowextsize, uint16_t mode, uint16_t flags,
 		uint64_t flags2);
+xfs_failaddr_t xfs_inode_validate_forcealign(struct xfs_mount *mp,
+		uint16_t mode, uint16_t flags, uint32_t extsize,
+		uint32_t cowextsize);
 
 static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv)
 {
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index d991eec05436..e746c57c4cc4 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -163,6 +163,8 @@  xfs_sb_version_to_features(
 		features |= XFS_FEAT_REFLINK;
 	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
 		features |= XFS_FEAT_INOBTCNT;
+	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN)
+		features |= XFS_FEAT_FORCEALIGN;
 	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
 		features |= XFS_FEAT_FTYPE;
 	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ea48774f6b76..db5a0f66a121 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -607,6 +607,8 @@  xfs_ip2xflags(
 			flags |= FS_XFLAG_DAX;
 		if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
 			flags |= FS_XFLAG_COWEXTSIZE;
+		if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN)
+			flags |= FS_XFLAG_FORCEALIGN;
 	}
 
 	if (xfs_inode_has_attr_fork(ip))
@@ -736,6 +738,8 @@  xfs_inode_inherit_flags2(
 	}
 	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
 		ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+	if (pip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN)
+		ip->i_diflags2 |= XFS_DIFLAG2_FORCEALIGN;
 
 	/* Don't let invalid cowextsize hints propagate. */
 	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
@@ -744,6 +748,14 @@  xfs_inode_inherit_flags2(
 		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
 		ip->i_cowextsize = 0;
 	}
+
+	if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN) {
+		failaddr = xfs_inode_validate_forcealign(ip->i_mount,
+				VFS_I(ip)->i_mode, ip->i_diflags, ip->i_extsize,
+				ip->i_cowextsize);
+		if (failaddr)
+			ip->i_diflags2 &= ~XFS_DIFLAG2_FORCEALIGN;
+	}
 }
 
 /*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 67f10349a6ed..065028789473 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -313,7 +313,7 @@  static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
 
 static inline bool xfs_inode_has_forcealign(struct xfs_inode *ip)
 {
-	return false;
+	return ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN;
 }
 
 /*
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d0e2cec6210d..d1126509ceb9 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1110,6 +1110,8 @@  xfs_flags2diflags2(
 		di_flags2 |= XFS_DIFLAG2_DAX;
 	if (xflags & FS_XFLAG_COWEXTSIZE)
 		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+	if (xflags & FS_XFLAG_FORCEALIGN)
+		di_flags2 |= XFS_DIFLAG2_FORCEALIGN;
 
 	return di_flags2;
 }
@@ -1146,6 +1148,22 @@  xfs_ioctl_setattr_xflags(
 	if (i_flags2 && !xfs_has_v3inodes(mp))
 		return -EINVAL;
 
+	/*
+	 * Force-align requires a nonzero extent size hint and a zero cow
+	 * extent size hint.  It doesn't apply to realtime files.
+	 */
+	if (fa->fsx_xflags & FS_XFLAG_FORCEALIGN) {
+		if (!xfs_has_forcealign(mp))
+			return -EINVAL;
+		if (fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)
+			return -EINVAL;
+		if (!(fa->fsx_xflags & (FS_XFLAG_EXTSIZE |
+					FS_XFLAG_EXTSZINHERIT)))
+			return -EINVAL;
+		if (fa->fsx_xflags & FS_XFLAG_REALTIME)
+			return -EINVAL;
+	}
+
 	ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
 	ip->i_diflags2 = i_flags2;
 
@@ -1232,6 +1250,7 @@  xfs_ioctl_setattr_check_extsize(
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_failaddr_t		failaddr;
 	uint16_t		new_diflags;
+	uint16_t		new_diflags2;
 
 	if (!fa->fsx_valid)
 		return 0;
@@ -1244,6 +1263,7 @@  xfs_ioctl_setattr_check_extsize(
 		return -EINVAL;
 
 	new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
+	new_diflags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
 
 	/*
 	 * Inode verifiers do not check that the extent size hint is an integer
@@ -1263,7 +1283,19 @@  xfs_ioctl_setattr_check_extsize(
 	failaddr = xfs_inode_validate_extsize(ip->i_mount,
 			XFS_B_TO_FSB(mp, fa->fsx_extsize),
 			VFS_I(ip)->i_mode, new_diflags);
-	return failaddr != NULL ? -EINVAL : 0;
+	if (failaddr)
+		return -EINVAL;
+
+	if (new_diflags2 & XFS_DIFLAG2_FORCEALIGN) {
+		failaddr = xfs_inode_validate_forcealign(ip->i_mount,
+				VFS_I(ip)->i_mode, new_diflags,
+				XFS_B_TO_FSB(mp, fa->fsx_extsize),
+				XFS_B_TO_FSB(mp, fa->fsx_cowextsize));
+		if (failaddr)
+			return -EINVAL;
+	}
+
+	return 0;
 }
 
 static int
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e880aa48de68..a8266cf654c4 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -292,6 +292,7 @@  typedef struct xfs_mount {
 #define XFS_FEAT_BIGTIME	(1ULL << 24)	/* large timestamps */
 #define XFS_FEAT_NEEDSREPAIR	(1ULL << 25)	/* needs xfs_repair */
 #define XFS_FEAT_NREXT64	(1ULL << 26)	/* large extent counters */
+#define XFS_FEAT_FORCEALIGN	(1ULL << 27)	/* aligned file data extents */
 
 /* Mount features */
 #define XFS_FEAT_NOATTR2	(1ULL << 48)	/* disable attr2 creation */
@@ -355,6 +356,7 @@  __XFS_HAS_FEAT(inobtcounts, INOBTCNT)
 __XFS_HAS_FEAT(bigtime, BIGTIME)
 __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
 __XFS_HAS_FEAT(large_extent_counts, NREXT64)
+__XFS_HAS_FEAT(forcealign, FORCEALIGN)
 
 /*
  * Mount features
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c21f10ab0f5d..63d4312785ef 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1706,6 +1706,10 @@  xfs_fs_fill_super(
 		mp->m_features &= ~XFS_FEAT_DISCARD;
 	}
 
+	if (xfs_has_forcealign(mp))
+		xfs_warn(mp,
+"EXPERIMENTAL forced data extent alignment feature in use. Use at your own risk!");
+
 	if (xfs_has_reflink(mp)) {
 		if (mp->m_sb.sb_rblocks) {
 			xfs_alert(mp,
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 191a7e88a8ab..6a6bcb53594a 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -158,6 +158,8 @@  struct fsxattr {
 #define FS_XFLAG_FILESTREAM	0x00004000	/* use filestream allocator */
 #define FS_XFLAG_DAX		0x00008000	/* use DAX for IO */
 #define FS_XFLAG_COWEXTSIZE	0x00010000	/* CoW extent size allocator hint */
+/* data extent mappings for regular files must be aligned to extent size hint */
+#define FS_XFLAG_FORCEALIGN	0x00020000
 #define FS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
 
 /* the read-only stuff doesn't really belong here, but any other place is