Message ID | 20240124142645.9334-4-john.g.garry@oracle.com (mailing list archive) |
---|---|
State | Deferred, archived |
Headers | show |
Series | block atomic writes for XFS | expand |
On Wed, Jan 24, 2024 at 02:26:42PM +0000, John Garry wrote: > Add initial support for FS_XFLAG_ATOMICWRITES in rtvol. > > Current kernel support for atomic writes is based on HW support (for atomic > writes). As such, it is required to ensure extent alignment with > atomic_write_unit_max so that an atomic write can result in a single > HW-compliant IO operation. > > rtvol already guarantees extent alignment, so initially add support there. > > Signed-off-by: John Garry <john.g.garry@oracle.com> > --- > fs/xfs/libxfs/xfs_format.h | 8 ++++++-- > fs/xfs/libxfs/xfs_sb.c | 2 ++ > fs/xfs/xfs_inode.c | 22 ++++++++++++++++++++++ > fs/xfs/xfs_inode.h | 7 +++++++ > fs/xfs/xfs_ioctl.c | 19 +++++++++++++++++-- > fs/xfs/xfs_mount.h | 2 ++ > fs/xfs/xfs_super.c | 4 ++++ > 7 files changed, 60 insertions(+), 4 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h > index 382ab1e71c0b..79fb0d4adeda 100644 > --- a/fs/xfs/libxfs/xfs_format.h > +++ b/fs/xfs/libxfs/xfs_format.h > @@ -353,11 +353,13 @@ xfs_sb_has_compat_feature( > #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ > #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ > #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ > +#define XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES (1 << 29) /* aligned file data extents */ I thought FORCEALIGN was going to signal aligned file data extent allocations being mandatory? This flag (AFAICT) simply marks the inode as something that gets FMODE_CAN_ATOMIC_WRITES, right? > #define XFS_SB_FEAT_RO_COMPAT_ALL \ > (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ > XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ > XFS_SB_FEAT_RO_COMPAT_REFLINK| \ > - XFS_SB_FEAT_RO_COMPAT_INOBTCNT) > + XFS_SB_FEAT_RO_COMPAT_INOBTCNT | \ > + XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) > #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL > static inline bool > xfs_sb_has_ro_compat_feature( > @@ -1085,16 +1087,18 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) > #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ > #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ > #define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ > +#define XFS_DIFLAG2_ATOMICWRITES_BIT 6 Needs a comment here ("files flagged for atomic writes"). Also not sure why you skipped bit 5, though I'm guessing it's because the forcealign series is/was using it? > #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) > #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) > #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) > #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) > #define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) > +#define XFS_DIFLAG2_ATOMICWRITES (1 << XFS_DIFLAG2_ATOMICWRITES_BIT) > > #define XFS_DIFLAG2_ANY \ > (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ > - XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) > + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_ATOMICWRITES) > > static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) > { > diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c > index 4a9e8588f4c9..28a98130a56d 100644 > --- a/fs/xfs/libxfs/xfs_sb.c > +++ b/fs/xfs/libxfs/xfs_sb.c > @@ -163,6 +163,8 @@ xfs_sb_version_to_features( > features |= XFS_FEAT_REFLINK; > if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) > features |= XFS_FEAT_INOBTCNT; > + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) > + features |= XFS_FEAT_ATOMICWRITES; > if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) > features |= XFS_FEAT_FTYPE; > if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index 1fd94958aa97..0b0f525fd043 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -65,6 +65,26 @@ xfs_get_extsz_hint( > return 0; > } > > +/* > + * helper function to extract extent size How does that differ from xfs_get_extsz_hint? > + */ > +xfs_extlen_t > +xfs_get_extsz( > + struct xfs_inode *ip) > +{ > + /* > + * No point in aligning allocations if we need to COW to actually > + * write to them. What does alwayscow have to do with untorn writes? > + */ > + if (xfs_is_always_cow_inode(ip)) > + return 0; > + > + if (XFS_IS_REALTIME_INODE(ip)) > + return ip->i_mount->m_sb.sb_rextsize; > + > + return 1; > +} Does this function exist to return the allocation unit for a given file? https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/commit/?h=djwong-wtf&id=b8ddcef3df8da02ed2c4aacbed1d811e60372006 > + > /* > * Helper function to extract CoW extent size hint from inode. > * Between the extent size hint and the CoW extent size hint, we > @@ -629,6 +649,8 @@ xfs_ip2xflags( > flags |= FS_XFLAG_DAX; > if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) > flags |= FS_XFLAG_COWEXTSIZE; > + if (ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES) > + flags |= FS_XFLAG_ATOMICWRITES; > } > > if (xfs_inode_has_attr_fork(ip)) > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index 97f63bacd4c2..0e0a21d9d30f 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -305,6 +305,11 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) > return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; > } > > +static inline bool xfs_inode_atomicwrites(struct xfs_inode *ip) I think this predicate wants a verb in its name, the rest of them have "is" or "has" somewhere: "xfs_inode_has_atomicwrites" > +{ > + return ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES; > +} > + > /* > * Return the buftarg used for data allocations on a given inode. > */ > @@ -542,7 +547,9 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, > struct xfs_inode *ip1, uint ip1_mode); > > xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); > +xfs_extlen_t xfs_get_extsz(struct xfs_inode *ip); > xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); > +xfs_extlen_t xfs_get_atomicwrites_size(struct xfs_inode *ip); > > int xfs_init_new_inode(struct mnt_idmap *idmap, struct xfs_trans *tp, > struct xfs_inode *pip, xfs_ino_t ino, umode_t mode, > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c > index f02b6e558af5..c380a3055be7 100644 > --- a/fs/xfs/xfs_ioctl.c > +++ b/fs/xfs/xfs_ioctl.c > @@ -1110,6 +1110,8 @@ xfs_flags2diflags2( > di_flags2 |= XFS_DIFLAG2_DAX; > if (xflags & FS_XFLAG_COWEXTSIZE) > di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; > + if (xflags & FS_XFLAG_ATOMICWRITES) > + di_flags2 |= XFS_DIFLAG2_ATOMICWRITES; > > return di_flags2; > } > @@ -1122,10 +1124,12 @@ xfs_ioctl_setattr_xflags( > { > struct xfs_mount *mp = ip->i_mount; > bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); > + bool atomic_writes = fa->fsx_xflags & FS_XFLAG_ATOMICWRITES; > uint64_t i_flags2; > > - if (rtflag != XFS_IS_REALTIME_INODE(ip)) { > - /* Can't change realtime flag if any extents are allocated. */ Please augment this comment ("Can't change realtime or atomicwrites flags if any extents are allocated") instead of deleting it. This is validation code, the requirements should be spelled out in English. > + > + if (rtflag != XFS_IS_REALTIME_INODE(ip) || > + atomic_writes != xfs_inode_atomicwrites(ip)) { > if (ip->i_df.if_nextents || ip->i_delayed_blks) > return -EINVAL; > } > @@ -1146,6 +1150,17 @@ xfs_ioctl_setattr_xflags( > if (i_flags2 && !xfs_has_v3inodes(mp)) > return -EINVAL; > > + if (atomic_writes) { > + if (!xfs_has_atomicwrites(mp)) > + return -EINVAL; > + > + if (!rtflag) > + return -EINVAL; > + > + if (!is_power_of_2(mp->m_sb.sb_rextsize)) > + return -EINVAL; Shouldn't we check sb_rextsize w.r.t. the actual block device queue limits here? I keep seeing similar validation logic open-coded throughout both atomic write patchsets: if (l < queue_atomic_write_unit_min_bytes()) /* fail */ if (l > queue_atomic_write_unit_max_bytes()) /* fail */ if (!is_power_of_2(l)) /* fail */ /* ok */ which really should be a common helper somewhere. /* * Don't set atomic write if the allocation unit doesn't * align with the device requirements. */ if (!bdev_validate_atomic_write(<target blockdev>, XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize)) return -EINVAL; Too bad we have to figure out the target blockdev and file allocation unit based on the ioctl in-params and can't use the xfs_inode helpers here. --D > + } > + > ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); > ip->i_diflags2 = i_flags2; > > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h > index 503fe3c7edbf..bcd591f52925 100644 > --- a/fs/xfs/xfs_mount.h > +++ b/fs/xfs/xfs_mount.h > @@ -289,6 +289,7 @@ typedef struct xfs_mount { > #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ > #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ > #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ > +#define XFS_FEAT_ATOMICWRITES (1ULL << 28) /* atomic writes support */ > > /* Mount features */ > #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ > @@ -352,6 +353,7 @@ __XFS_HAS_FEAT(inobtcounts, INOBTCNT) > __XFS_HAS_FEAT(bigtime, BIGTIME) > __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) > __XFS_HAS_FEAT(large_extent_counts, NREXT64) > +__XFS_HAS_FEAT(atomicwrites, ATOMICWRITES) > > /* > * Mount features > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > index aff20ddd4a9f..263404f683d6 100644 > --- a/fs/xfs/xfs_super.c > +++ b/fs/xfs/xfs_super.c > @@ -1696,6 +1696,10 @@ xfs_fs_fill_super( > mp->m_features &= ~XFS_FEAT_DISCARD; > } > > + if (xfs_has_atomicwrites(mp)) > + xfs_warn(mp, > +"EXPERIMENTAL atomic writes feature in use. Use at your own risk!"); > + > if (xfs_has_reflink(mp)) { > if (mp->m_sb.sb_rblocks) { > xfs_alert(mp, > -- > 2.31.1 > >
On Fri, Feb 02, 2024 at 09:52:25AM -0800, Darrick J. Wong wrote: > On Wed, Jan 24, 2024 at 02:26:42PM +0000, John Garry wrote: > > Add initial support for FS_XFLAG_ATOMICWRITES in rtvol. > > > > Current kernel support for atomic writes is based on HW support (for atomic > > writes). As such, it is required to ensure extent alignment with > > atomic_write_unit_max so that an atomic write can result in a single > > HW-compliant IO operation. > > > > rtvol already guarantees extent alignment, so initially add support there. > > > > Signed-off-by: John Garry <john.g.garry@oracle.com> > > --- > > fs/xfs/libxfs/xfs_format.h | 8 ++++++-- > > fs/xfs/libxfs/xfs_sb.c | 2 ++ > > fs/xfs/xfs_inode.c | 22 ++++++++++++++++++++++ > > fs/xfs/xfs_inode.h | 7 +++++++ > > fs/xfs/xfs_ioctl.c | 19 +++++++++++++++++-- > > fs/xfs/xfs_mount.h | 2 ++ > > fs/xfs/xfs_super.c | 4 ++++ > > 7 files changed, 60 insertions(+), 4 deletions(-) > > > > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h > > index 382ab1e71c0b..79fb0d4adeda 100644 > > --- a/fs/xfs/libxfs/xfs_format.h > > +++ b/fs/xfs/libxfs/xfs_format.h > > @@ -353,11 +353,13 @@ xfs_sb_has_compat_feature( > > #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ > > #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ > > #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ > > +#define XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES (1 << 29) /* aligned file data extents */ > > I thought FORCEALIGN was going to signal aligned file data extent > allocations being mandatory? > > This flag (AFAICT) simply marks the inode as something that gets > FMODE_CAN_ATOMIC_WRITES, right? > > > #define XFS_SB_FEAT_RO_COMPAT_ALL \ > > (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ > > XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ > > XFS_SB_FEAT_RO_COMPAT_REFLINK| \ > > - XFS_SB_FEAT_RO_COMPAT_INOBTCNT) > > + XFS_SB_FEAT_RO_COMPAT_INOBTCNT | \ > > + XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) > > #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL > > static inline bool > > xfs_sb_has_ro_compat_feature( > > @@ -1085,16 +1087,18 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) > > #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ > > #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ > > #define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ > > +#define XFS_DIFLAG2_ATOMICWRITES_BIT 6 > > Needs a comment here ("files flagged for atomic writes"). Also not sure > why you skipped bit 5, though I'm guessing it's because the forcealign > series is/was using it? > > > #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) > > #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) > > #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) > > #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) > > #define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) > > +#define XFS_DIFLAG2_ATOMICWRITES (1 << XFS_DIFLAG2_ATOMICWRITES_BIT) > > > > #define XFS_DIFLAG2_ANY \ > > (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ > > - XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) > > + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_ATOMICWRITES) > > > > static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) > > { > > diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c > > index 4a9e8588f4c9..28a98130a56d 100644 > > --- a/fs/xfs/libxfs/xfs_sb.c > > +++ b/fs/xfs/libxfs/xfs_sb.c > > @@ -163,6 +163,8 @@ xfs_sb_version_to_features( > > features |= XFS_FEAT_REFLINK; > > if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) > > features |= XFS_FEAT_INOBTCNT; > > + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) > > + features |= XFS_FEAT_ATOMICWRITES; > > if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) > > features |= XFS_FEAT_FTYPE; > > if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > > index 1fd94958aa97..0b0f525fd043 100644 > > --- a/fs/xfs/xfs_inode.c > > +++ b/fs/xfs/xfs_inode.c > > @@ -65,6 +65,26 @@ xfs_get_extsz_hint( > > return 0; > > } > > > > +/* > > + * helper function to extract extent size > > How does that differ from xfs_get_extsz_hint? > > > + */ > > +xfs_extlen_t > > +xfs_get_extsz( > > + struct xfs_inode *ip) > > +{ > > + /* > > + * No point in aligning allocations if we need to COW to actually > > + * write to them. > > What does alwayscow have to do with untorn writes? > > > + */ > > + if (xfs_is_always_cow_inode(ip)) > > + return 0; > > + > > + if (XFS_IS_REALTIME_INODE(ip)) > > + return ip->i_mount->m_sb.sb_rextsize; > > + > > + return 1; > > +} > > Does this function exist to return the allocation unit for a given file? > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/commit/?h=djwong-wtf&id=b8ddcef3df8da02ed2c4aacbed1d811e60372006 > > > + > > /* > > * Helper function to extract CoW extent size hint from inode. > > * Between the extent size hint and the CoW extent size hint, we > > @@ -629,6 +649,8 @@ xfs_ip2xflags( > > flags |= FS_XFLAG_DAX; > > if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) > > flags |= FS_XFLAG_COWEXTSIZE; > > + if (ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES) > > + flags |= FS_XFLAG_ATOMICWRITES; > > } > > > > if (xfs_inode_has_attr_fork(ip)) > > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > > index 97f63bacd4c2..0e0a21d9d30f 100644 > > --- a/fs/xfs/xfs_inode.h > > +++ b/fs/xfs/xfs_inode.h > > @@ -305,6 +305,11 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) > > return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; > > } > > > > +static inline bool xfs_inode_atomicwrites(struct xfs_inode *ip) > > I think this predicate wants a verb in its name, the rest of them have > "is" or "has" somewhere: > > "xfs_inode_has_atomicwrites" > > > +{ > > + return ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES; > > +} > > + > > /* > > * Return the buftarg used for data allocations on a given inode. > > */ > > @@ -542,7 +547,9 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, > > struct xfs_inode *ip1, uint ip1_mode); > > > > xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); > > +xfs_extlen_t xfs_get_extsz(struct xfs_inode *ip); > > xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); > > +xfs_extlen_t xfs_get_atomicwrites_size(struct xfs_inode *ip); > > > > int xfs_init_new_inode(struct mnt_idmap *idmap, struct xfs_trans *tp, > > struct xfs_inode *pip, xfs_ino_t ino, umode_t mode, > > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c > > index f02b6e558af5..c380a3055be7 100644 > > --- a/fs/xfs/xfs_ioctl.c > > +++ b/fs/xfs/xfs_ioctl.c > > @@ -1110,6 +1110,8 @@ xfs_flags2diflags2( > > di_flags2 |= XFS_DIFLAG2_DAX; > > if (xflags & FS_XFLAG_COWEXTSIZE) > > di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; > > + if (xflags & FS_XFLAG_ATOMICWRITES) > > + di_flags2 |= XFS_DIFLAG2_ATOMICWRITES; > > > > return di_flags2; > > } > > @@ -1122,10 +1124,12 @@ xfs_ioctl_setattr_xflags( > > { > > struct xfs_mount *mp = ip->i_mount; > > bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); > > + bool atomic_writes = fa->fsx_xflags & FS_XFLAG_ATOMICWRITES; > > uint64_t i_flags2; > > > > - if (rtflag != XFS_IS_REALTIME_INODE(ip)) { > > - /* Can't change realtime flag if any extents are allocated. */ > > Please augment this comment ("Can't change realtime or atomicwrites > flags if any extents are allocated") instead of deleting it. This is > validation code, the requirements should be spelled out in English. > > > + > > + if (rtflag != XFS_IS_REALTIME_INODE(ip) || > > + atomic_writes != xfs_inode_atomicwrites(ip)) { > > if (ip->i_df.if_nextents || ip->i_delayed_blks) > > return -EINVAL; > > } > > @@ -1146,6 +1150,17 @@ xfs_ioctl_setattr_xflags( > > if (i_flags2 && !xfs_has_v3inodes(mp)) > > return -EINVAL; > > > > + if (atomic_writes) { > > + if (!xfs_has_atomicwrites(mp)) > > + return -EINVAL; > > + > > + if (!rtflag) > > + return -EINVAL; > > + > > + if (!is_power_of_2(mp->m_sb.sb_rextsize)) > > + return -EINVAL; > > Shouldn't we check sb_rextsize w.r.t. the actual block device queue > limits here? I keep seeing similar validation logic open-coded > throughout both atomic write patchsets: > > if (l < queue_atomic_write_unit_min_bytes()) > /* fail */ > if (l > queue_atomic_write_unit_max_bytes()) > /* fail */ > if (!is_power_of_2(l)) > /* fail */ > /* ok */ > > which really should be a common helper somewhere. > > /* > * Don't set atomic write if the allocation unit doesn't > * align with the device requirements. > */ > if (!bdev_validate_atomic_write(<target blockdev>, > XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize)) > return -EINVAL; > > Too bad we have to figure out the target blockdev and file allocation > unit based on the ioctl in-params and can't use the xfs_inode helpers > here. > > --D Hey John, Darrick, I agree that we should have a common helper to validate block device limits. I tried to do so by exporting blkdev_atomic_write_valid() in the ext4 series [1]. There was also some discussion on moving this to VFS, where we can check against the len and off of the write and then we can make fs specific checks (eg if off,len align with rt extsize/bigalloc size) later in the fs layer. [1] https://lore.kernel.org/linux-ext4/b53609d0d4b97eb9355987ac5ec03d4e89293b43.1701339358.git.ojaswin@linux.ibm.com/
On 02/02/2024 17:52, Darrick J. Wong wrote: > On Wed, Jan 24, 2024 at 02:26:42PM +0000, John Garry wrote: >> Add initial support for FS_XFLAG_ATOMICWRITES in rtvol. >> >> Current kernel support for atomic writes is based on HW support (for atomic >> writes). As such, it is required to ensure extent alignment with >> atomic_write_unit_max so that an atomic write can result in a single >> HW-compliant IO operation. >> >> rtvol already guarantees extent alignment, so initially add support there. >> >> Signed-off-by: John Garry <john.g.garry@oracle.com> >> --- >> fs/xfs/libxfs/xfs_format.h | 8 ++++++-- >> fs/xfs/libxfs/xfs_sb.c | 2 ++ >> fs/xfs/xfs_inode.c | 22 ++++++++++++++++++++++ >> fs/xfs/xfs_inode.h | 7 +++++++ >> fs/xfs/xfs_ioctl.c | 19 +++++++++++++++++-- >> fs/xfs/xfs_mount.h | 2 ++ >> fs/xfs/xfs_super.c | 4 ++++ >> 7 files changed, 60 insertions(+), 4 deletions(-) >> >> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h >> index 382ab1e71c0b..79fb0d4adeda 100644 >> --- a/fs/xfs/libxfs/xfs_format.h >> +++ b/fs/xfs/libxfs/xfs_format.h >> @@ -353,11 +353,13 @@ xfs_sb_has_compat_feature( >> #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ >> #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ >> #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ >> +#define XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES (1 << 29) /* aligned file data extents */ > > I thought FORCEALIGN was going to signal aligned file data extent > allocations being mandatory? Right, I'll fix that comment > > This flag (AFAICT) simply marks the inode as something that gets > FMODE_CAN_ATOMIC_WRITES, right? Correct > >> #define XFS_SB_FEAT_RO_COMPAT_ALL \ >> (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ >> XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ >> XFS_SB_FEAT_RO_COMPAT_REFLINK| \ >> - XFS_SB_FEAT_RO_COMPAT_INOBTCNT) >> + XFS_SB_FEAT_RO_COMPAT_INOBTCNT | \ >> + XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) >> #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL >> static inline bool >> xfs_sb_has_ro_compat_feature( >> @@ -1085,16 +1087,18 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) >> #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ >> #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ >> #define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ >> +#define XFS_DIFLAG2_ATOMICWRITES_BIT 6 > > Needs a comment here ("files flagged for atomic writes"). ok > Also not sure > why you skipped bit 5, though I'm guessing it's because the forcealign > series is/was using it? Right, I'll fix that > >> #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) >> #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) >> #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) >> #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) >> #define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) >> +#define XFS_DIFLAG2_ATOMICWRITES (1 << XFS_DIFLAG2_ATOMICWRITES_BIT) >> >> #define XFS_DIFLAG2_ANY \ >> (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ >> - XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) >> + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_ATOMICWRITES) >> >> static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) >> { >> diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c >> index 4a9e8588f4c9..28a98130a56d 100644 >> --- a/fs/xfs/libxfs/xfs_sb.c >> +++ b/fs/xfs/libxfs/xfs_sb.c >> @@ -163,6 +163,8 @@ xfs_sb_version_to_features( >> features |= XFS_FEAT_REFLINK; >> if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) >> features |= XFS_FEAT_INOBTCNT; >> + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) >> + features |= XFS_FEAT_ATOMICWRITES; >> if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) >> features |= XFS_FEAT_FTYPE; >> if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) >> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c >> index 1fd94958aa97..0b0f525fd043 100644 >> --- a/fs/xfs/xfs_inode.c >> +++ b/fs/xfs/xfs_inode.c >> @@ -65,6 +65,26 @@ xfs_get_extsz_hint( >> return 0; >> } >> >> +/* >> + * helper function to extract extent size > > How does that differ from xfs_get_extsz_hint? The idea of this function is to return the guaranteed extent alignment, and not just the hint > >> + */ >> +xfs_extlen_t >> +xfs_get_extsz( >> + struct xfs_inode *ip) >> +{ >> + /* >> + * No point in aligning allocations if we need to COW to actually >> + * write to them. > > What does alwayscow have to do with untorn writes? Nothing at the moment, so I'll remove this. > >> + */ >> + if (xfs_is_always_cow_inode(ip)) >> + return 0; >> + >> + if (XFS_IS_REALTIME_INODE(ip)) >> + return ip->i_mount->m_sb.sb_rextsize; >> + >> + return 1; >> +} > > Does this function exist to return the allocation unit for a given file? > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/commit/?h=djwong-wtf&id=b8ddcef3df8da02ed2c4aacbed1d811e60372006 > Yes, something like xfs_inode_alloc_unitsize() there. What's the upstream status for that change? I see it mentioned in linux-xfs lore and seems to be part of a mega patchset. >> + >> /* >> * Helper function to extract CoW extent size hint from inode. >> * Between the extent size hint and the CoW extent size hint, we >> @@ -629,6 +649,8 @@ xfs_ip2xflags( >> flags |= FS_XFLAG_DAX; >> if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) >> flags |= FS_XFLAG_COWEXTSIZE; >> + if (ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES) >> + flags |= FS_XFLAG_ATOMICWRITES; >> } >> >> if (xfs_inode_has_attr_fork(ip)) >> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h >> index 97f63bacd4c2..0e0a21d9d30f 100644 >> --- a/fs/xfs/xfs_inode.h >> +++ b/fs/xfs/xfs_inode.h >> @@ -305,6 +305,11 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) >> return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; >> } >> >> +static inline bool xfs_inode_atomicwrites(struct xfs_inode *ip) > > I think this predicate wants a verb in its name, the rest of them have > "is" or "has" somewhere: > > "xfs_inode_has_atomicwrites" ok, fine. Note that I was copying xfs_inode_forcealign() in terms of naming. > >> +{ >> + return ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES; >> +} >> + >> /* >> * Return the buftarg used for data allocations on a given inode. >> */ >> @@ -542,7 +547,9 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, >> struct xfs_inode *ip1, uint ip1_mode); >> >> xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); >> +xfs_extlen_t xfs_get_extsz(struct xfs_inode *ip); >> xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); >> +xfs_extlen_t xfs_get_atomicwrites_size(struct xfs_inode *ip); >> >> int xfs_init_new_inode(struct mnt_idmap *idmap, struct xfs_trans *tp, >> struct xfs_inode *pip, xfs_ino_t ino, umode_t mode, >> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c >> index f02b6e558af5..c380a3055be7 100644 >> --- a/fs/xfs/xfs_ioctl.c >> +++ b/fs/xfs/xfs_ioctl.c >> @@ -1110,6 +1110,8 @@ xfs_flags2diflags2( >> di_flags2 |= XFS_DIFLAG2_DAX; >> if (xflags & FS_XFLAG_COWEXTSIZE) >> di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; >> + if (xflags & FS_XFLAG_ATOMICWRITES) >> + di_flags2 |= XFS_DIFLAG2_ATOMICWRITES; >> >> return di_flags2; >> } >> @@ -1122,10 +1124,12 @@ xfs_ioctl_setattr_xflags( >> { >> struct xfs_mount *mp = ip->i_mount; >> bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); >> + bool atomic_writes = fa->fsx_xflags & FS_XFLAG_ATOMICWRITES; >> uint64_t i_flags2; >> >> - if (rtflag != XFS_IS_REALTIME_INODE(ip)) { >> - /* Can't change realtime flag if any extents are allocated. */ > > Please augment this comment ("Can't change realtime or atomicwrites > flags if any extents are allocated") instead of deleting it. I wasn't supposed to delete that - will remedy. > This is > validation code, the requirements should be spelled out in English. > >> + >> + if (rtflag != XFS_IS_REALTIME_INODE(ip) || >> + atomic_writes != xfs_inode_atomicwrites(ip)) { >> if (ip->i_df.if_nextents || ip->i_delayed_blks) >> return -EINVAL; >> } >> @@ -1146,6 +1150,17 @@ xfs_ioctl_setattr_xflags( >> if (i_flags2 && !xfs_has_v3inodes(mp)) >> return -EINVAL; >> >> + if (atomic_writes) { >> + if (!xfs_has_atomicwrites(mp)) >> + return -EINVAL; >> + >> + if (!rtflag) >> + return -EINVAL; >> + >> + if (!is_power_of_2(mp->m_sb.sb_rextsize)) >> + return -EINVAL; > > Shouldn't we check sb_rextsize w.r.t. the actual block device queue > limits here? I keep seeing similar validation logic open-coded > throughout both atomic write patchsets: > > if (l < queue_atomic_write_unit_min_bytes()) > /* fail */ > if (l > queue_atomic_write_unit_max_bytes()) > /* fail */ > if (!is_power_of_2(l)) > /* fail */ > /* ok */ > > which really should be a common helper somewhere. I think that it is a reasonable comment about duplication the atomic writes checks for the bdev and iomap write paths - I can try to improve that. But the is_power_of_2(mp->m_sb.sb_rextsize) check is to ensure that the extent size is suitable for enabling atomic writes. I don't see a point in checking the bdev queue limits here. > > /* > * Don't set atomic write if the allocation unit doesn't > * align with the device requirements. > */ > if (!bdev_validate_atomic_write(<target blockdev>, > XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize)) > return -EINVAL; > > Too bad we have to figure out the target blockdev and file allocation > unit based on the ioctl in-params and can't use the xfs_inode helpers > here. I am not sure what bdev_validate_atomic_write() would even do. If sb_rextsize exceeded the bdev atomic write unit max, then we just cap reported atomic write unit max in statx to that which the bdev reports and vice-versa. And didn't we previously have a concern that it is possible to change the geometry of the device? If so, not much point in this check. Thanks, John
On Mon, Feb 05, 2024 at 12:51:07PM +0000, John Garry wrote: > On 02/02/2024 17:52, Darrick J. Wong wrote: > > On Wed, Jan 24, 2024 at 02:26:42PM +0000, John Garry wrote: > > > Add initial support for FS_XFLAG_ATOMICWRITES in rtvol. > > > > > > Current kernel support for atomic writes is based on HW support (for atomic > > > writes). As such, it is required to ensure extent alignment with > > > atomic_write_unit_max so that an atomic write can result in a single > > > HW-compliant IO operation. > > > > > > rtvol already guarantees extent alignment, so initially add support there. > > > > > > Signed-off-by: John Garry <john.g.garry@oracle.com> > > > --- > > > fs/xfs/libxfs/xfs_format.h | 8 ++++++-- > > > fs/xfs/libxfs/xfs_sb.c | 2 ++ > > > fs/xfs/xfs_inode.c | 22 ++++++++++++++++++++++ > > > fs/xfs/xfs_inode.h | 7 +++++++ > > > fs/xfs/xfs_ioctl.c | 19 +++++++++++++++++-- > > > fs/xfs/xfs_mount.h | 2 ++ > > > fs/xfs/xfs_super.c | 4 ++++ > > > 7 files changed, 60 insertions(+), 4 deletions(-) > > > > > > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h > > > index 382ab1e71c0b..79fb0d4adeda 100644 > > > --- a/fs/xfs/libxfs/xfs_format.h > > > +++ b/fs/xfs/libxfs/xfs_format.h > > > @@ -353,11 +353,13 @@ xfs_sb_has_compat_feature( > > > #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ > > > #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ > > > #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ > > > +#define XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES (1 << 29) /* aligned file data extents */ > > > > I thought FORCEALIGN was going to signal aligned file data extent > > allocations being mandatory? > > Right, I'll fix that comment > > > > > This flag (AFAICT) simply marks the inode as something that gets > > FMODE_CAN_ATOMIC_WRITES, right? > > Correct > > > > > > #define XFS_SB_FEAT_RO_COMPAT_ALL \ > > > (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ > > > XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ > > > XFS_SB_FEAT_RO_COMPAT_REFLINK| \ > > > - XFS_SB_FEAT_RO_COMPAT_INOBTCNT) > > > + XFS_SB_FEAT_RO_COMPAT_INOBTCNT | \ > > > + XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) > > > #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL > > > static inline bool > > > xfs_sb_has_ro_compat_feature( > > > @@ -1085,16 +1087,18 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) > > > #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ > > > #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ > > > #define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ > > > +#define XFS_DIFLAG2_ATOMICWRITES_BIT 6 > > > > Needs a comment here ("files flagged for atomic writes"). > > ok > > > Also not sure > > why you skipped bit 5, though I'm guessing it's because the forcealign > > series is/was using it? > > Right, I'll fix that > > > > > > #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) > > > #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) > > > #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) > > > #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) > > > #define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) > > > +#define XFS_DIFLAG2_ATOMICWRITES (1 << XFS_DIFLAG2_ATOMICWRITES_BIT) > > > #define XFS_DIFLAG2_ANY \ > > > (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ > > > - XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) > > > + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_ATOMICWRITES) > > > static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) > > > { > > > diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c > > > index 4a9e8588f4c9..28a98130a56d 100644 > > > --- a/fs/xfs/libxfs/xfs_sb.c > > > +++ b/fs/xfs/libxfs/xfs_sb.c > > > @@ -163,6 +163,8 @@ xfs_sb_version_to_features( > > > features |= XFS_FEAT_REFLINK; > > > if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) > > > features |= XFS_FEAT_INOBTCNT; > > > + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) > > > + features |= XFS_FEAT_ATOMICWRITES; > > > if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) > > > features |= XFS_FEAT_FTYPE; > > > if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) > > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > > > index 1fd94958aa97..0b0f525fd043 100644 > > > --- a/fs/xfs/xfs_inode.c > > > +++ b/fs/xfs/xfs_inode.c > > > @@ -65,6 +65,26 @@ xfs_get_extsz_hint( > > > return 0; > > > } > > > +/* > > > + * helper function to extract extent size > > > > How does that differ from xfs_get_extsz_hint? > > The idea of this function is to return the guaranteed extent alignment, and > not just the hint > > > > > > + */ > > > +xfs_extlen_t > > > +xfs_get_extsz( > > > + struct xfs_inode *ip) > > > +{ > > > + /* > > > + * No point in aligning allocations if we need to COW to actually > > > + * write to them. > > > > What does alwayscow have to do with untorn writes? > > Nothing at the moment, so I'll remove this. > > > > > > + */ > > > + if (xfs_is_always_cow_inode(ip)) > > > + return 0; > > > + > > > + if (XFS_IS_REALTIME_INODE(ip)) > > > + return ip->i_mount->m_sb.sb_rextsize; > > > + > > > + return 1; > > > +} > > > > Does this function exist to return the allocation unit for a given file? > > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/commit/?h=djwong-wtf&id=b8ddcef3df8da02ed2c4aacbed1d811e60372006 > > > > Yes, something like xfs_inode_alloc_unitsize() there. > > What's the upstream status for that change? I see it mentioned in linux-xfs > lore and seems to be part of a mega patchset. It's stuck in review along with the other ~1400 patches that I've been grumbling about in our staff meetings for years now. > > > + > > > /* > > > * Helper function to extract CoW extent size hint from inode. > > > * Between the extent size hint and the CoW extent size hint, we > > > @@ -629,6 +649,8 @@ xfs_ip2xflags( > > > flags |= FS_XFLAG_DAX; > > > if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) > > > flags |= FS_XFLAG_COWEXTSIZE; > > > + if (ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES) > > > + flags |= FS_XFLAG_ATOMICWRITES; > > > } > > > if (xfs_inode_has_attr_fork(ip)) > > > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > > > index 97f63bacd4c2..0e0a21d9d30f 100644 > > > --- a/fs/xfs/xfs_inode.h > > > +++ b/fs/xfs/xfs_inode.h > > > @@ -305,6 +305,11 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) > > > return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; > > > } > > > +static inline bool xfs_inode_atomicwrites(struct xfs_inode *ip) > > > > I think this predicate wants a verb in its name, the rest of them have > > "is" or "has" somewhere: > > > > "xfs_inode_has_atomicwrites" > > ok, fine. > > Note that I was copying xfs_inode_forcealign() in terms of naming. Yeah, I could rename that xfs_inode_forces_alignment() or something. Or just leave the condensed version where the verb and object are smashed together. xfs_inode_has_forcealign? Yeah. I'll go with that. > > > > > +{ > > > + return ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES; > > > +} > > > + > > > /* > > > * Return the buftarg used for data allocations on a given inode. > > > */ > > > @@ -542,7 +547,9 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, > > > struct xfs_inode *ip1, uint ip1_mode); > > > xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); > > > +xfs_extlen_t xfs_get_extsz(struct xfs_inode *ip); > > > xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); > > > +xfs_extlen_t xfs_get_atomicwrites_size(struct xfs_inode *ip); > > > int xfs_init_new_inode(struct mnt_idmap *idmap, struct xfs_trans *tp, > > > struct xfs_inode *pip, xfs_ino_t ino, umode_t mode, > > > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c > > > index f02b6e558af5..c380a3055be7 100644 > > > --- a/fs/xfs/xfs_ioctl.c > > > +++ b/fs/xfs/xfs_ioctl.c > > > @@ -1110,6 +1110,8 @@ xfs_flags2diflags2( > > > di_flags2 |= XFS_DIFLAG2_DAX; > > > if (xflags & FS_XFLAG_COWEXTSIZE) > > > di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; > > > + if (xflags & FS_XFLAG_ATOMICWRITES) > > > + di_flags2 |= XFS_DIFLAG2_ATOMICWRITES; > > > return di_flags2; > > > } > > > @@ -1122,10 +1124,12 @@ xfs_ioctl_setattr_xflags( > > > { > > > struct xfs_mount *mp = ip->i_mount; > > > bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); > > > + bool atomic_writes = fa->fsx_xflags & FS_XFLAG_ATOMICWRITES; > > > uint64_t i_flags2; > > > - if (rtflag != XFS_IS_REALTIME_INODE(ip)) { > > > - /* Can't change realtime flag if any extents are allocated. */ > > > > Please augment this comment ("Can't change realtime or atomicwrites > > flags if any extents are allocated") instead of deleting it. > > I wasn't supposed to delete that - will remedy. > > > This is > > validation code, the requirements should be spelled out in English. > > > > > + > > > + if (rtflag != XFS_IS_REALTIME_INODE(ip) || > > > + atomic_writes != xfs_inode_atomicwrites(ip)) { > > > if (ip->i_df.if_nextents || ip->i_delayed_blks) > > > return -EINVAL; > > > } > > > @@ -1146,6 +1150,17 @@ xfs_ioctl_setattr_xflags( > > > if (i_flags2 && !xfs_has_v3inodes(mp)) > > > return -EINVAL; > > > + if (atomic_writes) { > > > + if (!xfs_has_atomicwrites(mp)) > > > + return -EINVAL; > > > + > > > + if (!rtflag) > > > + return -EINVAL; > > > + > > > + if (!is_power_of_2(mp->m_sb.sb_rextsize)) > > > + return -EINVAL; > > > > Shouldn't we check sb_rextsize w.r.t. the actual block device queue > > limits here? I keep seeing similar validation logic open-coded > > throughout both atomic write patchsets: > > > > if (l < queue_atomic_write_unit_min_bytes()) > > /* fail */ > > if (l > queue_atomic_write_unit_max_bytes()) > > /* fail */ > > if (!is_power_of_2(l)) > > /* fail */ > > /* ok */ > > > > which really should be a common helper somewhere. > > I think that it is a reasonable comment about duplication the atomic writes > checks for the bdev and iomap write paths - I can try to improve that. > > But the is_power_of_2(mp->m_sb.sb_rextsize) check is to ensure that the > extent size is suitable for enabling atomic writes. I don't see a point in > checking the bdev queue limits here. Ok, skip the queue limits then. > > > > /* > > * Don't set atomic write if the allocation unit doesn't > > * align with the device requirements. > > */ > > if (!bdev_validate_atomic_write(<target blockdev>, > > XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize)) > > return -EINVAL; > > > > Too bad we have to figure out the target blockdev and file allocation > > unit based on the ioctl in-params and can't use the xfs_inode helpers > > here. > > I am not sure what bdev_validate_atomic_write() would even do. If > sb_rextsize exceeded the bdev atomic write unit max, then we just cap > reported atomic write unit max in statx to that which the bdev reports and > vice-versa. > > And didn't we previously have a concern that it is possible to change the > geometry of the device? The thing is, I don't want this logic: if (!is_power_of_2(mp->m_sb.sb_rextsize)) /* fail */ to be open-coded inside xfs. I'd rather have a standard bdev_* helper that every filesystem can call, so we don't end up with more generic code copy-pasted all over the codebase. The awkward part (for me) is the naming, since filesystems usually don't have to check with the block layer about their units of space allocation. /* * Ensure that a file space allocation unit is congruent with the atomic * write unit capabilities of supported block devices. */ static inline bool bdev_validate_atomic_write_allocunit(unsigned au) { return is_power_of_2(au); } if (!bdev_validate_atomic_write_allocunit(mp->m-sb.sb_rextsize)) return -EINVAL; > If so, not much point in this check. Yes, that is a disadvantage of me reading patchsets in reverse order. ;) --D > Thanks, > John > >
On 13/02/2024 17:22, Darrick J. Wong wrote: >> I am not sure what bdev_validate_atomic_write() would even do. If >> sb_rextsize exceeded the bdev atomic write unit max, then we just cap >> reported atomic write unit max in statx to that which the bdev reports and >> vice-versa. >> >> And didn't we previously have a concern that it is possible to change the >> geometry of the device? > The thing is, I don't want this logic: > > if (!is_power_of_2(mp->m_sb.sb_rextsize)) > /* fail */ This is really specific to XFS. Let's see where all this alignment stuff goes before trying to unify all these checks. > > to be open-coded inside xfs. I'd rather have a standard bdev_* helper > that every filesystem can call, so we don't end up with more generic > code copy-pasted all over the codebase. > > The awkward part (for me) is the naming, since filesystems usually don't > have to check with the block layer about their units of space allocation. > > /* > * Ensure that a file space allocation unit is congruent with the atomic > * write unit capabilities of supported block devices. > */ > static inline bool bdev_validate_atomic_write_allocunit(unsigned au) > { > return is_power_of_2(au); > } > > if (!bdev_validate_atomic_write_allocunit(mp->m-sb.sb_rextsize)) > return -EINVAL; As above, I can try to unify, but this alignment stuff is a bit up in the air at the moment. Thanks, John
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 382ab1e71c0b..79fb0d4adeda 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -353,11 +353,13 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ +#define XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES (1 << 29) /* aligned file data extents */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ XFS_SB_FEAT_RO_COMPAT_REFLINK| \ - XFS_SB_FEAT_RO_COMPAT_INOBTCNT) + XFS_SB_FEAT_RO_COMPAT_INOBTCNT | \ + XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -1085,16 +1087,18 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ #define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ +#define XFS_DIFLAG2_ATOMICWRITES_BIT 6 #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) #define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) +#define XFS_DIFLAG2_ATOMICWRITES (1 << XFS_DIFLAG2_ATOMICWRITES_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_ATOMICWRITES) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 4a9e8588f4c9..28a98130a56d 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -163,6 +163,8 @@ xfs_sb_version_to_features( features |= XFS_FEAT_REFLINK; if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) features |= XFS_FEAT_INOBTCNT; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) + features |= XFS_FEAT_ATOMICWRITES; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) features |= XFS_FEAT_FTYPE; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1fd94958aa97..0b0f525fd043 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -65,6 +65,26 @@ xfs_get_extsz_hint( return 0; } +/* + * helper function to extract extent size + */ +xfs_extlen_t +xfs_get_extsz( + struct xfs_inode *ip) +{ + /* + * No point in aligning allocations if we need to COW to actually + * write to them. + */ + if (xfs_is_always_cow_inode(ip)) + return 0; + + if (XFS_IS_REALTIME_INODE(ip)) + return ip->i_mount->m_sb.sb_rextsize; + + return 1; +} + /* * Helper function to extract CoW extent size hint from inode. * Between the extent size hint and the CoW extent size hint, we @@ -629,6 +649,8 @@ xfs_ip2xflags( flags |= FS_XFLAG_DAX; if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) flags |= FS_XFLAG_COWEXTSIZE; + if (ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES) + flags |= FS_XFLAG_ATOMICWRITES; } if (xfs_inode_has_attr_fork(ip)) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 97f63bacd4c2..0e0a21d9d30f 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -305,6 +305,11 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; } +static inline bool xfs_inode_atomicwrites(struct xfs_inode *ip) +{ + return ip->i_diflags2 & XFS_DIFLAG2_ATOMICWRITES; +} + /* * Return the buftarg used for data allocations on a given inode. */ @@ -542,7 +547,9 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); +xfs_extlen_t xfs_get_extsz(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); +xfs_extlen_t xfs_get_atomicwrites_size(struct xfs_inode *ip); int xfs_init_new_inode(struct mnt_idmap *idmap, struct xfs_trans *tp, struct xfs_inode *pip, xfs_ino_t ino, umode_t mode, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index f02b6e558af5..c380a3055be7 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1110,6 +1110,8 @@ xfs_flags2diflags2( di_flags2 |= XFS_DIFLAG2_DAX; if (xflags & FS_XFLAG_COWEXTSIZE) di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + if (xflags & FS_XFLAG_ATOMICWRITES) + di_flags2 |= XFS_DIFLAG2_ATOMICWRITES; return di_flags2; } @@ -1122,10 +1124,12 @@ xfs_ioctl_setattr_xflags( { struct xfs_mount *mp = ip->i_mount; bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); + bool atomic_writes = fa->fsx_xflags & FS_XFLAG_ATOMICWRITES; uint64_t i_flags2; - if (rtflag != XFS_IS_REALTIME_INODE(ip)) { - /* Can't change realtime flag if any extents are allocated. */ + + if (rtflag != XFS_IS_REALTIME_INODE(ip) || + atomic_writes != xfs_inode_atomicwrites(ip)) { if (ip->i_df.if_nextents || ip->i_delayed_blks) return -EINVAL; } @@ -1146,6 +1150,17 @@ xfs_ioctl_setattr_xflags( if (i_flags2 && !xfs_has_v3inodes(mp)) return -EINVAL; + if (atomic_writes) { + if (!xfs_has_atomicwrites(mp)) + return -EINVAL; + + if (!rtflag) + return -EINVAL; + + if (!is_power_of_2(mp->m_sb.sb_rextsize)) + return -EINVAL; + } + ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); ip->i_diflags2 = i_flags2; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 503fe3c7edbf..bcd591f52925 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -289,6 +289,7 @@ typedef struct xfs_mount { #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ +#define XFS_FEAT_ATOMICWRITES (1ULL << 28) /* atomic writes support */ /* Mount features */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ @@ -352,6 +353,7 @@ __XFS_HAS_FEAT(inobtcounts, INOBTCNT) __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) __XFS_HAS_FEAT(large_extent_counts, NREXT64) +__XFS_HAS_FEAT(atomicwrites, ATOMICWRITES) /* * Mount features diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index aff20ddd4a9f..263404f683d6 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1696,6 +1696,10 @@ xfs_fs_fill_super( mp->m_features &= ~XFS_FEAT_DISCARD; } + if (xfs_has_atomicwrites(mp)) + xfs_warn(mp, +"EXPERIMENTAL atomic writes feature in use. Use at your own risk!"); + if (xfs_has_reflink(mp)) { if (mp->m_sb.sb_rblocks) { xfs_alert(mp,
Add initial support for FS_XFLAG_ATOMICWRITES in rtvol. Current kernel support for atomic writes is based on HW support (for atomic writes). As such, it is required to ensure extent alignment with atomic_write_unit_max so that an atomic write can result in a single HW-compliant IO operation. rtvol already guarantees extent alignment, so initially add support there. Signed-off-by: John Garry <john.g.garry@oracle.com> --- fs/xfs/libxfs/xfs_format.h | 8 ++++++-- fs/xfs/libxfs/xfs_sb.c | 2 ++ fs/xfs/xfs_inode.c | 22 ++++++++++++++++++++++ fs/xfs/xfs_inode.h | 7 +++++++ fs/xfs/xfs_ioctl.c | 19 +++++++++++++++++-- fs/xfs/xfs_mount.h | 2 ++ fs/xfs/xfs_super.c | 4 ++++ 7 files changed, 60 insertions(+), 4 deletions(-)