Message ID | 20240801163057.3981192-8-john.g.garry@oracle.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | forcealign for xfs | expand |
On Thu, Aug 01, 2024 at 04:30:50PM +0000, John Garry wrote: > From: "Darrick J. Wong" <djwong@kernel.org> > > Add a new inode flag to require that all file data extent mappings must > be aligned (both the file offset range and the allocated space itself) > to the extent size hint. Having a separate COW extent size hint is no > longer allowed. > > The goal here is to enable sysadmins and users to mandate that all space > mappings in a file must have a startoff/blockcount that are aligned to > (say) a 2MB alignment and that the startblock/blockcount will follow the > same alignment. > > Allocated space will be aligned to start of the AG, and not necessarily > aligned with disk blocks. The upcoming atomic writes feature will rely and > forcealign and will also require allocated space will also be aligned to > disk blocks. > > reflink will not be supported for forcealign yet, so disallow a mount under > this condition. This is because we have the limitation of pageache > writeback not knowing how to writeback an entire allocation unut, so > reject a mount with relink. > > RT vol will not be supported for forcealign yet, so disallow a mount under > this condition. It will be possible to support RT vol and forcealign in > future. For this, the inode extsize must be a multiple of rtextsize - this > is enforced already in xfs_ioctl_setattr_check_extsize() and > xfs_inode_validate_extsize(). > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > Co-developed-by: John Garry <john.g.garry@oracle.com> > [jpg: many changes from orig, including forcealign inode verification > rework, ioctl setattr rework disallow reflink a forcealign inode, > disallow mount for forcealign + reflink or rt] > Signed-off-by: John Garry <john.g.garry@oracle.com> This patch looks ready to me but as I'm the original author I cannot add a RVB tag. Someone else needs to add that -- frankly, John is the best candidate because he grabbed my patch into his tree and actually modified it to do what he wants, which means he's the most familiar with it. --D > --- > fs/xfs/libxfs/xfs_format.h | 6 ++++- > fs/xfs/libxfs/xfs_inode_buf.c | 46 ++++++++++++++++++++++++++++++++++ > fs/xfs/libxfs/xfs_inode_buf.h | 3 +++ > fs/xfs/libxfs/xfs_inode_util.c | 14 +++++++++++ > fs/xfs/libxfs/xfs_sb.c | 2 ++ > fs/xfs/xfs_inode.h | 8 +++++- > fs/xfs/xfs_ioctl.c | 46 ++++++++++++++++++++++++++++++++-- > fs/xfs/xfs_mount.h | 2 ++ > fs/xfs/xfs_reflink.c | 5 ++-- > fs/xfs/xfs_super.c | 18 +++++++++++++ > include/uapi/linux/fs.h | 2 ++ > 11 files changed, 146 insertions(+), 6 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h > index e1bfee0c3b1a..95f5259c4255 100644 > --- a/fs/xfs/libxfs/xfs_format.h > +++ b/fs/xfs/libxfs/xfs_format.h > @@ -352,6 +352,7 @@ xfs_sb_has_compat_feature( > #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ > #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ > #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ > +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30) /* aligned file data extents */ > #define XFS_SB_FEAT_RO_COMPAT_ALL \ > (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ > XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ > @@ -1093,16 +1094,19 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) > #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ > #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ > #define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ > +/* data extent mappings for regular files must be aligned to extent size hint */ > +#define XFS_DIFLAG2_FORCEALIGN_BIT 5 > > #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) > #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) > #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) > #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) > #define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) > +#define XFS_DIFLAG2_FORCEALIGN (1 << XFS_DIFLAG2_FORCEALIGN_BIT) > > #define XFS_DIFLAG2_ANY \ > (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ > - XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) > + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN) > > static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) > { > diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c > index 513b50da6215..1c59891fa9e2 100644 > --- a/fs/xfs/libxfs/xfs_inode_buf.c > +++ b/fs/xfs/libxfs/xfs_inode_buf.c > @@ -657,6 +657,15 @@ xfs_dinode_verify( > !xfs_has_bigtime(mp)) > return __this_address; > > + if (flags2 & XFS_DIFLAG2_FORCEALIGN) { > + fa = xfs_inode_validate_forcealign(mp, > + be32_to_cpu(dip->di_extsize), > + be32_to_cpu(dip->di_cowextsize), > + mode, flags, flags2); > + if (fa) > + return fa; > + } > + > return NULL; > } > > @@ -824,3 +833,40 @@ xfs_inode_validate_cowextsize( > > return NULL; > } > + > +/* Validate the forcealign inode flag */ > +xfs_failaddr_t > +xfs_inode_validate_forcealign( > + struct xfs_mount *mp, > + uint32_t extsize, > + uint32_t cowextsize, > + uint16_t mode, > + uint16_t flags, > + uint64_t flags2) > +{ > + /* superblock rocompat feature flag */ > + if (!xfs_has_forcealign(mp)) > + return __this_address; > + > + /* Only regular files and directories */ > + if (!S_ISDIR(mode) && !S_ISREG(mode)) > + return __this_address; > + > + /* We require EXTSIZE or EXTSZINHERIT */ > + if (!(flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))) > + return __this_address; > + > + /* We require a non-zero extsize */ > + if (!extsize) > + return __this_address; > + > + /* COW extsize disallowed */ > + if (flags2 & XFS_DIFLAG2_COWEXTSIZE) > + return __this_address; > + > + /* cowextsize must be zero */ > + if (cowextsize) > + return __this_address; > + > + return NULL; > +} > diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h > index 585ed5a110af..b8b65287b037 100644 > --- a/fs/xfs/libxfs/xfs_inode_buf.h > +++ b/fs/xfs/libxfs/xfs_inode_buf.h > @@ -33,6 +33,9 @@ xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, > xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, > uint32_t cowextsize, uint16_t mode, uint16_t flags, > uint64_t flags2); > +xfs_failaddr_t xfs_inode_validate_forcealign(struct xfs_mount *mp, > + uint32_t extsize, uint32_t cowextsize, uint16_t mode, > + uint16_t flags, uint64_t flags2); > > static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv) > { > diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c > index 032333289113..b264939d8855 100644 > --- a/fs/xfs/libxfs/xfs_inode_util.c > +++ b/fs/xfs/libxfs/xfs_inode_util.c > @@ -80,6 +80,8 @@ xfs_flags2diflags2( > di_flags2 |= XFS_DIFLAG2_DAX; > if (xflags & FS_XFLAG_COWEXTSIZE) > di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; > + if (xflags & FS_XFLAG_FORCEALIGN) > + di_flags2 |= XFS_DIFLAG2_FORCEALIGN; > > return di_flags2; > } > @@ -126,6 +128,8 @@ xfs_ip2xflags( > flags |= FS_XFLAG_DAX; > if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) > flags |= FS_XFLAG_COWEXTSIZE; > + if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN) > + flags |= FS_XFLAG_FORCEALIGN; > } > > if (xfs_inode_has_attr_fork(ip)) > @@ -224,6 +228,8 @@ xfs_inode_inherit_flags2( > } > if (pip->i_diflags2 & XFS_DIFLAG2_DAX) > ip->i_diflags2 |= XFS_DIFLAG2_DAX; > + if (pip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN) > + ip->i_diflags2 |= XFS_DIFLAG2_FORCEALIGN; > > /* Don't let invalid cowextsize hints propagate. */ > failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize, > @@ -232,6 +238,14 @@ xfs_inode_inherit_flags2( > ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; > ip->i_cowextsize = 0; > } > + if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN) { > + failaddr = xfs_inode_validate_forcealign(ip->i_mount, > + ip->i_extsize, ip->i_cowextsize, > + VFS_I(ip)->i_mode, ip->i_diflags, > + ip->i_diflags2); > + if (failaddr) > + ip->i_diflags2 &= ~XFS_DIFLAG2_FORCEALIGN; > + } > } > > /* > diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c > index 6b56f0f6d4c1..e56911553edd 100644 > --- a/fs/xfs/libxfs/xfs_sb.c > +++ b/fs/xfs/libxfs/xfs_sb.c > @@ -164,6 +164,8 @@ xfs_sb_version_to_features( > features |= XFS_FEAT_REFLINK; > if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) > features |= XFS_FEAT_INOBTCNT; > + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN) > + features |= XFS_FEAT_FORCEALIGN; > if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) > features |= XFS_FEAT_FTYPE; > if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index bf0f4f8b9e64..3e7664ec4d6c 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -312,7 +312,13 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) > > static inline bool xfs_inode_has_forcealign(struct xfs_inode *ip) > { > - return false; > + if (!(ip->i_diflags & XFS_DIFLAG_EXTSIZE)) > + return false; > + if (ip->i_extsize <= 1) > + return false; > + if (xfs_is_cow_inode(ip)) > + return false; > + return ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN; > } > > /* > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c > index 4e933db75b12..7a6757a4d2bd 100644 > --- a/fs/xfs/xfs_ioctl.c > +++ b/fs/xfs/xfs_ioctl.c > @@ -469,6 +469,39 @@ xfs_fileattr_get( > return 0; > } > > +/* > + * Forcealign requires a non-zero extent size hint and a zero cow > + * extent size hint. > + */ > +static int > +xfs_ioctl_setattr_forcealign( > + struct xfs_inode *ip, > + struct fileattr *fa) > +{ > + struct xfs_mount *mp = ip->i_mount; > + > + if (!xfs_has_forcealign(mp)) > + return -EINVAL; > + > + if (xfs_is_reflink_inode(ip)) > + return -EINVAL; > + > + if (!(fa->fsx_xflags & (FS_XFLAG_EXTSIZE | > + FS_XFLAG_EXTSZINHERIT))) > + return -EINVAL; > + > + if (fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) > + return -EINVAL; > + > + if (!fa->fsx_extsize) > + return -EINVAL; > + > + if (fa->fsx_cowextsize) > + return -EINVAL; > + > + return 0; > +} > + > static int > xfs_ioctl_setattr_xflags( > struct xfs_trans *tp, > @@ -477,10 +510,13 @@ xfs_ioctl_setattr_xflags( > { > struct xfs_mount *mp = ip->i_mount; > bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); > + bool forcealign = fa->fsx_xflags & FS_XFLAG_FORCEALIGN; > uint64_t i_flags2; > + int error; > > - if (rtflag != XFS_IS_REALTIME_INODE(ip)) { > - /* Can't change realtime flag if any extents are allocated. */ > + /* Can't change RT or forcealign flags if any extents are allocated. */ > + if (rtflag != XFS_IS_REALTIME_INODE(ip) || > + forcealign != xfs_inode_has_forcealign(ip)) { > if (ip->i_df.if_nextents || ip->i_delayed_blks) > return -EINVAL; > } > @@ -501,6 +537,12 @@ xfs_ioctl_setattr_xflags( > if (i_flags2 && !xfs_has_v3inodes(mp)) > return -EINVAL; > > + if (forcealign) { > + error = xfs_ioctl_setattr_forcealign(ip, fa); > + if (error) > + return error; > + } > + > ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); > ip->i_diflags2 = i_flags2; > > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h > index d0567dfbc036..30228fea908d 100644 > --- a/fs/xfs/xfs_mount.h > +++ b/fs/xfs/xfs_mount.h > @@ -299,6 +299,7 @@ typedef struct xfs_mount { > #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ > #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ > #define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */ > +#define XFS_FEAT_FORCEALIGN (1ULL << 28) /* aligned file data extents */ > > /* Mount features */ > #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ > @@ -385,6 +386,7 @@ __XFS_ADD_V4_FEAT(projid32, PROJID32) > __XFS_HAS_V4_FEAT(v3inodes, V3INODES) > __XFS_HAS_V4_FEAT(crc, CRC) > __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) > +__XFS_HAS_FEAT(forcealign, FORCEALIGN) > > /* > * Mount features > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c > index 6fde6ec8092f..a836bfec7878 100644 > --- a/fs/xfs/xfs_reflink.c > +++ b/fs/xfs/xfs_reflink.c > @@ -1467,8 +1467,9 @@ xfs_reflink_remap_prep( > > /* Check file eligibility and prepare for block sharing. */ > ret = -EINVAL; > - /* Don't reflink realtime inodes */ > - if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) > + /* Don't reflink realtime or forcealign inodes */ > + if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest) || > + xfs_inode_has_forcealign(src) || xfs_inode_has_forcealign(dest)) > goto out_unlock; > > /* Don't share DAX file data with non-DAX file. */ > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > index 27e9f749c4c7..b52a01b50387 100644 > --- a/fs/xfs/xfs_super.c > +++ b/fs/xfs/xfs_super.c > @@ -1729,12 +1729,30 @@ xfs_fs_fill_super( > goto out_filestream_unmount; > } > > + if (xfs_has_forcealign(mp)) { > + xfs_alert(mp, > + "reflink not compatible with forcealign!"); > + error = -EINVAL; > + goto out_filestream_unmount; > + } > + > if (xfs_globals.always_cow) { > xfs_info(mp, "using DEBUG-only always_cow mode."); > mp->m_always_cow = true; > } > } > > + if (xfs_has_forcealign(mp)) { > + if (xfs_has_realtime(mp)) { > + xfs_alert(mp, > + "forcealign not supported for realtime device!"); > + error = -EINVAL; > + goto out_filestream_unmount; > + } > + xfs_warn(mp, > +"EXPERIMENTAL forced data extent alignment feature in use. Use at your own risk!"); > + } > + > if (xfs_has_rmapbt(mp) && mp->m_sb.sb_rblocks) { > xfs_alert(mp, > "reverse mapping btree not compatible with realtime device!"); > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h > index 753971770733..f55d650f904a 100644 > --- a/include/uapi/linux/fs.h > +++ b/include/uapi/linux/fs.h > @@ -158,6 +158,8 @@ struct fsxattr { > #define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ > #define FS_XFLAG_DAX 0x00008000 /* use DAX for IO */ > #define FS_XFLAG_COWEXTSIZE 0x00010000 /* CoW extent size allocator hint */ > +/* data extent mappings for regular files must be aligned to extent size hint */ > +#define FS_XFLAG_FORCEALIGN 0x00020000 > #define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ > > /* the read-only stuff doesn't really belong here, but any other place is > -- > 2.31.1 > >
On 06/08/2024 20:02, Darrick J. Wong wrote: > On Thu, Aug 01, 2024 at 04:30:50PM +0000, John Garry wrote: >> From: "Darrick J. Wong" <djwong@kernel.org> >> >> Add a new inode flag to require that all file data extent mappings must >> be aligned (both the file offset range and the allocated space itself) >> to the extent size hint. Having a separate COW extent size hint is no >> longer allowed. >> >> The goal here is to enable sysadmins and users to mandate that all space >> mappings in a file must have a startoff/blockcount that are aligned to >> (say) a 2MB alignment and that the startblock/blockcount will follow the >> same alignment. >> >> Allocated space will be aligned to start of the AG, and not necessarily >> aligned with disk blocks. The upcoming atomic writes feature will rely and >> forcealign and will also require allocated space will also be aligned to >> disk blocks. >> >> reflink will not be supported for forcealign yet, so disallow a mount under >> this condition. This is because we have the limitation of pageache >> writeback not knowing how to writeback an entire allocation unut, so >> reject a mount with relink. >> >> RT vol will not be supported for forcealign yet, so disallow a mount under >> this condition. It will be possible to support RT vol and forcealign in >> future. For this, the inode extsize must be a multiple of rtextsize - this >> is enforced already in xfs_ioctl_setattr_check_extsize() and >> xfs_inode_validate_extsize(). >> >> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> >> Co-developed-by: John Garry <john.g.garry@oracle.com> >> [jpg: many changes from orig, including forcealign inode verification >> rework, ioctl setattr rework disallow reflink a forcealign inode, >> disallow mount for forcealign + reflink or rt] >> Signed-off-by: John Garry <john.g.garry@oracle.com> > > This patch looks ready to me but as I'm the original author I cannot add > a RVB tag. Someone else needs to add that -- frankly, John is the best > candidate because he grabbed my patch into his tree and actually > modified it to do what he wants, which means he's the most familiar with > it. I thought my review would be implied since I noted how I appended it, above. Anyway, Reviewed-by: John Garry <john.g.garry@oracle.com> I am hoping that Dave and Christoph will give some formal ack/review when they get a chance. BTW, at what stage do we give XFS_SB_FEAT_RO_COMPAT_FORCEALIGN a more proper value? So far it has the experimental dev value of 1 << 30, below. Thanks! >> >> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h >> index e1bfee0c3b1a..95f5259c4255 100644 >> --- a/fs/xfs/libxfs/xfs_format.h >> +++ b/fs/xfs/libxfs/xfs_format.h >> @@ -352,6 +352,7 @@ xfs_sb_has_compat_feature( >> #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ >> #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ >> #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ >> +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30) /* aligned file data extents */ >> #define XFS_SB_FEAT_RO_COMPAT_ALL \ >> (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ >> XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
On Wed, Aug 07, 2024 at 12:42:32PM +0100, John Garry wrote: > On 06/08/2024 20:02, Darrick J. Wong wrote: > > On Thu, Aug 01, 2024 at 04:30:50PM +0000, John Garry wrote: > > > From: "Darrick J. Wong" <djwong@kernel.org> > > > > > > Add a new inode flag to require that all file data extent mappings must > > > be aligned (both the file offset range and the allocated space itself) > > > to the extent size hint. Having a separate COW extent size hint is no > > > longer allowed. > > > > > > The goal here is to enable sysadmins and users to mandate that all space > > > mappings in a file must have a startoff/blockcount that are aligned to > > > (say) a 2MB alignment and that the startblock/blockcount will follow the > > > same alignment. > > > > > > Allocated space will be aligned to start of the AG, and not necessarily > > > aligned with disk blocks. The upcoming atomic writes feature will rely and > > > forcealign and will also require allocated space will also be aligned to > > > disk blocks. > > > > > > reflink will not be supported for forcealign yet, so disallow a mount under > > > this condition. This is because we have the limitation of pageache > > > writeback not knowing how to writeback an entire allocation unut, so > > > reject a mount with relink. > > > > > > RT vol will not be supported for forcealign yet, so disallow a mount under > > > this condition. It will be possible to support RT vol and forcealign in > > > future. For this, the inode extsize must be a multiple of rtextsize - this > > > is enforced already in xfs_ioctl_setattr_check_extsize() and > > > xfs_inode_validate_extsize(). > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > Co-developed-by: John Garry <john.g.garry@oracle.com> > > > [jpg: many changes from orig, including forcealign inode verification > > > rework, ioctl setattr rework disallow reflink a forcealign inode, > > > disallow mount for forcealign + reflink or rt] > > > Signed-off-by: John Garry <john.g.garry@oracle.com> > > > > This patch looks ready to me but as I'm the original author I cannot add > > a RVB tag. Someone else needs to add that -- frankly, John is the best > > candidate because he grabbed my patch into his tree and actually > > modified it to do what he wants, which means he's the most familiar with > > it. > > I thought my review would be implied since I noted how I appended it, above. > > Anyway, > > Reviewed-by: John Garry <john.g.garry@oracle.com> > > I am hoping that Dave and Christoph will give some formal ack/review when > they get a chance. > > BTW, at what stage do we give XFS_SB_FEAT_RO_COMPAT_FORCEALIGN a more proper > value? So far it has the experimental dev value of 1 << 30, below. When you're ready for the release manager to merge it, change the value, resend the entire series for archival purposes, and send a pull request. --D > Thanks! > > > > > > > > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h > > > index e1bfee0c3b1a..95f5259c4255 100644 > > > --- a/fs/xfs/libxfs/xfs_format.h > > > +++ b/fs/xfs/libxfs/xfs_format.h > > > @@ -352,6 +352,7 @@ xfs_sb_has_compat_feature( > > > #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ > > > #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ > > > #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ > > > +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30) /* aligned file data extents */ > > > #define XFS_SB_FEAT_RO_COMPAT_ALL \ > > > (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ > > > XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ >
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index e1bfee0c3b1a..95f5259c4255 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -352,6 +352,7 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30) /* aligned file data extents */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ @@ -1093,16 +1094,19 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ #define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ +/* data extent mappings for regular files must be aligned to extent size hint */ +#define XFS_DIFLAG2_FORCEALIGN_BIT 5 #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) #define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) +#define XFS_DIFLAG2_FORCEALIGN (1 << XFS_DIFLAG2_FORCEALIGN_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 513b50da6215..1c59891fa9e2 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -657,6 +657,15 @@ xfs_dinode_verify( !xfs_has_bigtime(mp)) return __this_address; + if (flags2 & XFS_DIFLAG2_FORCEALIGN) { + fa = xfs_inode_validate_forcealign(mp, + be32_to_cpu(dip->di_extsize), + be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) + return fa; + } + return NULL; } @@ -824,3 +833,40 @@ xfs_inode_validate_cowextsize( return NULL; } + +/* Validate the forcealign inode flag */ +xfs_failaddr_t +xfs_inode_validate_forcealign( + struct xfs_mount *mp, + uint32_t extsize, + uint32_t cowextsize, + uint16_t mode, + uint16_t flags, + uint64_t flags2) +{ + /* superblock rocompat feature flag */ + if (!xfs_has_forcealign(mp)) + return __this_address; + + /* Only regular files and directories */ + if (!S_ISDIR(mode) && !S_ISREG(mode)) + return __this_address; + + /* We require EXTSIZE or EXTSZINHERIT */ + if (!(flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))) + return __this_address; + + /* We require a non-zero extsize */ + if (!extsize) + return __this_address; + + /* COW extsize disallowed */ + if (flags2 & XFS_DIFLAG2_COWEXTSIZE) + return __this_address; + + /* cowextsize must be zero */ + if (cowextsize) + return __this_address; + + return NULL; +} diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 585ed5a110af..b8b65287b037 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -33,6 +33,9 @@ xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, uint32_t cowextsize, uint16_t mode, uint16_t flags, uint64_t flags2); +xfs_failaddr_t xfs_inode_validate_forcealign(struct xfs_mount *mp, + uint32_t extsize, uint32_t cowextsize, uint16_t mode, + uint16_t flags, uint64_t flags2); static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv) { diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index 032333289113..b264939d8855 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -80,6 +80,8 @@ xfs_flags2diflags2( di_flags2 |= XFS_DIFLAG2_DAX; if (xflags & FS_XFLAG_COWEXTSIZE) di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + if (xflags & FS_XFLAG_FORCEALIGN) + di_flags2 |= XFS_DIFLAG2_FORCEALIGN; return di_flags2; } @@ -126,6 +128,8 @@ xfs_ip2xflags( flags |= FS_XFLAG_DAX; if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) flags |= FS_XFLAG_COWEXTSIZE; + if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN) + flags |= FS_XFLAG_FORCEALIGN; } if (xfs_inode_has_attr_fork(ip)) @@ -224,6 +228,8 @@ xfs_inode_inherit_flags2( } if (pip->i_diflags2 & XFS_DIFLAG2_DAX) ip->i_diflags2 |= XFS_DIFLAG2_DAX; + if (pip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN) + ip->i_diflags2 |= XFS_DIFLAG2_FORCEALIGN; /* Don't let invalid cowextsize hints propagate. */ failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize, @@ -232,6 +238,14 @@ xfs_inode_inherit_flags2( ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; ip->i_cowextsize = 0; } + if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN) { + failaddr = xfs_inode_validate_forcealign(ip->i_mount, + ip->i_extsize, ip->i_cowextsize, + VFS_I(ip)->i_mode, ip->i_diflags, + ip->i_diflags2); + if (failaddr) + ip->i_diflags2 &= ~XFS_DIFLAG2_FORCEALIGN; + } } /* diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 6b56f0f6d4c1..e56911553edd 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -164,6 +164,8 @@ xfs_sb_version_to_features( features |= XFS_FEAT_REFLINK; if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) features |= XFS_FEAT_INOBTCNT; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN) + features |= XFS_FEAT_FORCEALIGN; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) features |= XFS_FEAT_FTYPE; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index bf0f4f8b9e64..3e7664ec4d6c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -312,7 +312,13 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) static inline bool xfs_inode_has_forcealign(struct xfs_inode *ip) { - return false; + if (!(ip->i_diflags & XFS_DIFLAG_EXTSIZE)) + return false; + if (ip->i_extsize <= 1) + return false; + if (xfs_is_cow_inode(ip)) + return false; + return ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN; } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 4e933db75b12..7a6757a4d2bd 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -469,6 +469,39 @@ xfs_fileattr_get( return 0; } +/* + * Forcealign requires a non-zero extent size hint and a zero cow + * extent size hint. + */ +static int +xfs_ioctl_setattr_forcealign( + struct xfs_inode *ip, + struct fileattr *fa) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!xfs_has_forcealign(mp)) + return -EINVAL; + + if (xfs_is_reflink_inode(ip)) + return -EINVAL; + + if (!(fa->fsx_xflags & (FS_XFLAG_EXTSIZE | + FS_XFLAG_EXTSZINHERIT))) + return -EINVAL; + + if (fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) + return -EINVAL; + + if (!fa->fsx_extsize) + return -EINVAL; + + if (fa->fsx_cowextsize) + return -EINVAL; + + return 0; +} + static int xfs_ioctl_setattr_xflags( struct xfs_trans *tp, @@ -477,10 +510,13 @@ xfs_ioctl_setattr_xflags( { struct xfs_mount *mp = ip->i_mount; bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); + bool forcealign = fa->fsx_xflags & FS_XFLAG_FORCEALIGN; uint64_t i_flags2; + int error; - if (rtflag != XFS_IS_REALTIME_INODE(ip)) { - /* Can't change realtime flag if any extents are allocated. */ + /* Can't change RT or forcealign flags if any extents are allocated. */ + if (rtflag != XFS_IS_REALTIME_INODE(ip) || + forcealign != xfs_inode_has_forcealign(ip)) { if (ip->i_df.if_nextents || ip->i_delayed_blks) return -EINVAL; } @@ -501,6 +537,12 @@ xfs_ioctl_setattr_xflags( if (i_flags2 && !xfs_has_v3inodes(mp)) return -EINVAL; + if (forcealign) { + error = xfs_ioctl_setattr_forcealign(ip, fa); + if (error) + return error; + } + ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); ip->i_diflags2 = i_flags2; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index d0567dfbc036..30228fea908d 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -299,6 +299,7 @@ typedef struct xfs_mount { #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ #define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */ +#define XFS_FEAT_FORCEALIGN (1ULL << 28) /* aligned file data extents */ /* Mount features */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ @@ -385,6 +386,7 @@ __XFS_ADD_V4_FEAT(projid32, PROJID32) __XFS_HAS_V4_FEAT(v3inodes, V3INODES) __XFS_HAS_V4_FEAT(crc, CRC) __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) +__XFS_HAS_FEAT(forcealign, FORCEALIGN) /* * Mount features diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 6fde6ec8092f..a836bfec7878 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1467,8 +1467,9 @@ xfs_reflink_remap_prep( /* Check file eligibility and prepare for block sharing. */ ret = -EINVAL; - /* Don't reflink realtime inodes */ - if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) + /* Don't reflink realtime or forcealign inodes */ + if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest) || + xfs_inode_has_forcealign(src) || xfs_inode_has_forcealign(dest)) goto out_unlock; /* Don't share DAX file data with non-DAX file. */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 27e9f749c4c7..b52a01b50387 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1729,12 +1729,30 @@ xfs_fs_fill_super( goto out_filestream_unmount; } + if (xfs_has_forcealign(mp)) { + xfs_alert(mp, + "reflink not compatible with forcealign!"); + error = -EINVAL; + goto out_filestream_unmount; + } + if (xfs_globals.always_cow) { xfs_info(mp, "using DEBUG-only always_cow mode."); mp->m_always_cow = true; } } + if (xfs_has_forcealign(mp)) { + if (xfs_has_realtime(mp)) { + xfs_alert(mp, + "forcealign not supported for realtime device!"); + error = -EINVAL; + goto out_filestream_unmount; + } + xfs_warn(mp, +"EXPERIMENTAL forced data extent alignment feature in use. Use at your own risk!"); + } + if (xfs_has_rmapbt(mp) && mp->m_sb.sb_rblocks) { xfs_alert(mp, "reverse mapping btree not compatible with realtime device!"); diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 753971770733..f55d650f904a 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -158,6 +158,8 @@ struct fsxattr { #define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ #define FS_XFLAG_DAX 0x00008000 /* use DAX for IO */ #define FS_XFLAG_COWEXTSIZE 0x00010000 /* CoW extent size allocator hint */ +/* data extent mappings for regular files must be aligned to extent size hint */ +#define FS_XFLAG_FORCEALIGN 0x00020000 #define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ /* the read-only stuff doesn't really belong here, but any other place is