Message ID | 20230503183821.1473305-3-john.g.garry@oracle.com (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
Series | block atomic writes | expand |
On Wed, May 03, 2023 at 06:38:07PM +0000, John Garry wrote: > From: Prasad Singamsetty <prasad.singamsetty@oracle.com> > > Extend statx system call to return additional info for atomic write support > support if the specified file is a block device. > > Add initial support for a block device. > > Signed-off-by: Prasad Singamsetty <prasad.singamsetty@oracle.com> > Signed-off-by: John Garry <john.g.garry@oracle.com> > --- > block/bdev.c | 21 +++++++++++++++++++++ > fs/stat.c | 10 ++++++++++ > include/linux/blkdev.h | 4 ++++ > include/linux/stat.h | 2 ++ > include/uapi/linux/stat.h | 7 ++++++- > 5 files changed, 43 insertions(+), 1 deletion(-) > > diff --git a/block/bdev.c b/block/bdev.c > index 1795c7d4b99e..6a5fd5abaadc 100644 > --- a/block/bdev.c > +++ b/block/bdev.c > @@ -1014,3 +1014,24 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) > > blkdev_put_no_open(bdev); > } > + > +/* > + * Handle statx for block devices to get properties of WRITE ATOMIC > + * feature support. > + */ > +void bdev_statx_atomic(struct inode *inode, struct kstat *stat) > +{ > + struct block_device *bdev; > + > + bdev = blkdev_get_no_open(inode->i_rdev); > + if (!bdev) > + return; > + > + stat->atomic_write_unit_min = queue_atomic_write_unit_min(bdev->bd_queue); > + stat->atomic_write_unit_max = queue_atomic_write_unit_max(bdev->bd_queue); > + stat->attributes |= STATX_ATTR_WRITE_ATOMIC; > + stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC; > + stat->result_mask |= STATX_WRITE_ATOMIC; > + > + blkdev_put_no_open(bdev); > +} > diff --git a/fs/stat.c b/fs/stat.c > index 7c238da22ef0..d20334a0e9ae 100644 > --- a/fs/stat.c > +++ b/fs/stat.c > @@ -256,6 +256,14 @@ static int vfs_statx(int dfd, struct filename *filename, int flags, > bdev_statx_dioalign(inode, stat); > } > > + /* Handle STATX_WRITE_ATOMIC for block devices */ > + if (request_mask & STATX_WRITE_ATOMIC) { > + struct inode *inode = d_backing_inode(path.dentry); > + > + if (S_ISBLK(inode->i_mode)) > + bdev_statx_atomic(inode, stat); > + } This duplicates STATX_DIOALIGN bdev handling. Really, the bdev attribute handling should be completely factored out of vfs_statx() - blockdevs are not the common fastpath for stat operations. Somthing like: /* * If this is a block device inode, override the filesystem * attributes with the block device specific parameters * that need to be obtained from the bdev backing inode. */ if (S_ISBLK(d_backing_inode(path.dentry)->i_mode)) bdev_statx(path.dentry, stat); And then all the overrides can go in the one function that doesn't need to repeatedly check S_ISBLK().... > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h > index 6b6f2992338c..19d33b2897b2 100644 > --- a/include/linux/blkdev.h > +++ b/include/linux/blkdev.h > @@ -1527,6 +1527,7 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); > int sync_blockdev_nowait(struct block_device *bdev); > void sync_bdevs(bool wait); > void bdev_statx_dioalign(struct inode *inode, struct kstat *stat); > +void bdev_statx_atomic(struct inode *inode, struct kstat *stat); > void printk_all_partitions(void); > #else > static inline void invalidate_bdev(struct block_device *bdev) > @@ -1546,6 +1547,9 @@ static inline void sync_bdevs(bool wait) > static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) > { > } > +static inline void bdev_statx_atomic(struct inode *inode, struct kstat *stat) > +{ > +} > static inline void printk_all_partitions(void) > { > } That also gets rid of the need for all these fine grained exports out of the bdev code for statx.... > diff --git a/include/linux/stat.h b/include/linux/stat.h > index 52150570d37a..dfa69ecfaacf 100644 > --- a/include/linux/stat.h > +++ b/include/linux/stat.h > @@ -53,6 +53,8 @@ struct kstat { > u32 dio_mem_align; > u32 dio_offset_align; > u64 change_cookie; > + u32 atomic_write_unit_max; > + u32 atomic_write_unit_min; > }; > > /* These definitions are internal to the kernel for now. Mainly used by nfsd. */ > diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h > index 7cab2c65d3d7..c99d7cac2aa6 100644 > --- a/include/uapi/linux/stat.h > +++ b/include/uapi/linux/stat.h > @@ -127,7 +127,10 @@ struct statx { > __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ > __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ > /* 0xa0 */ > - __u64 __spare3[12]; /* Spare space for future expansion */ > + __u32 stx_atomic_write_unit_max; > + __u32 stx_atomic_write_unit_min; > + /* 0xb0 */ > + __u64 __spare3[11]; /* Spare space for future expansion */ > /* 0x100 */ > }; No documentation on what units these are in. Is there a statx() man page update for this addition? Cheers, Dave.
On 03/05/2023 22:58, Dave Chinner wrote: Hi Dave, >> + /* Handle STATX_WRITE_ATOMIC for block devices */ >> + if (request_mask & STATX_WRITE_ATOMIC) { >> + struct inode *inode = d_backing_inode(path.dentry); >> + >> + if (S_ISBLK(inode->i_mode)) >> + bdev_statx_atomic(inode, stat); >> + } > This duplicates STATX_DIOALIGN bdev handling. > > Really, the bdev attribute handling should be completely factored > out of vfs_statx() - blockdevs are not the common fastpath for stat > operations. Somthing like: > > /* > * If this is a block device inode, override the filesystem > * attributes with the block device specific parameters > * that need to be obtained from the bdev backing inode. > */ > if (S_ISBLK(d_backing_inode(path.dentry)->i_mode)) > bdev_statx(path.dentry, stat); > > And then all the overrides can go in the one function that doesn't > need to repeatedly check S_ISBLK().... ok, that looks like a reasonable idea. We'll look to make that change. > > >> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h >> index 6b6f2992338c..19d33b2897b2 100644 >> --- a/include/linux/blkdev.h >> +++ b/include/linux/blkdev.h >> @@ -1527,6 +1527,7 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); >> int sync_blockdev_nowait(struct block_device *bdev); >> void sync_bdevs(bool wait); >> void bdev_statx_dioalign(struct inode *inode, struct kstat *stat); >> +void bdev_statx_atomic(struct inode *inode, struct kstat *stat); >> void printk_all_partitions(void); >> #else >> static inline void invalidate_bdev(struct block_device *bdev) >> @@ -1546,6 +1547,9 @@ static inline void sync_bdevs(bool wait) >> static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) >> { >> } >> +static inline void bdev_statx_atomic(struct inode *inode, struct kstat *stat) >> +{ >> +} >> static inline void printk_all_partitions(void) >> { >> } > That also gets rid of the need for all these fine grained exports > out of the bdev code for statx.... Sure > >> diff --git a/include/linux/stat.h b/include/linux/stat.h >> index 52150570d37a..dfa69ecfaacf 100644 >> --- a/include/linux/stat.h >> +++ b/include/linux/stat.h >> @@ -53,6 +53,8 @@ struct kstat { >> u32 dio_mem_align; >> u32 dio_offset_align; >> u64 change_cookie; >> + u32 atomic_write_unit_max; >> + u32 atomic_write_unit_min; >> }; >> >> /* These definitions are internal to the kernel for now. Mainly used by nfsd. */ >> diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h >> index 7cab2c65d3d7..c99d7cac2aa6 100644 >> --- a/include/uapi/linux/stat.h >> +++ b/include/uapi/linux/stat.h >> @@ -127,7 +127,10 @@ struct statx { >> __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ >> __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ >> /* 0xa0 */ >> - __u64 __spare3[12]; /* Spare space for future expansion */ >> + __u32 stx_atomic_write_unit_max; >> + __u32 stx_atomic_write_unit_min; >> + /* 0xb0 */ >> + __u64 __spare3[11]; /* Spare space for future expansion */ >> /* 0x100 */ >> }; > No documentation on what units these are in. It's in bytes, we're really just continuing the pattern of what we do for dio. I think that it would be reasonable to limit to u32. > Is there a statx() man > page update for this addition? No, not yet. Is it normally expected to provide a proposed man page update in parallel? Or somewhat later, when the kernel API change has some appreciable level of agreement? Thanks, John
On Thu, May 04, 2023 at 09:45:50AM +0100, John Garry wrote: > On 03/05/2023 22:58, Dave Chinner wrote: > > Is there a statx() man > > page update for this addition? > > No, not yet. Is it normally expected to provide a proposed man page update > in parallel? Or somewhat later, when the kernel API change has some > appreciable level of agreement? Normally we ask for man page updates to be presented at the same time, as the man page defines the user interface that is being implemented. In this case, we need updates for the pwritev2() man page to document RWF_ATOMIC semantics, and the statx() man page to document what the variables being exposed mean w.r.t. RWF_ATOMIC. The pwritev2() man page is probably the most important one right now - it needs to explain the guarantees that RWF_ATOMIC is supposed to provide w.r.t. data integrity, IO ordering, persistence, etc. Indeed, it will need to explain exactly how this "multi-atomic-unit mulit-bio non-atomic RWF_ATOMIC" IO thing can be used safely and reliably, especially w.r.t. IO ordering and persistence guarantees in the face of crashes and power failures. Not to mention documenting error conditions specific to RWF_ATOMIC... It's all well and good to have some implementation, but without actually defining and documenting the *guarantees* that RWF_ATOMIC provides userspace it is completely useless for application developers. And from the perspective of a reviewer, without the documentation stating what the infrastructure actually guarantees applications, we can't determine if the implementation being presented is fit for purpose.... Cheers, Dave.
On 04/05/2023 23:40, Dave Chinner wrote: Hi Dave, >> No, not yet. Is it normally expected to provide a proposed man page update >> in parallel? Or somewhat later, when the kernel API change has some >> appreciable level of agreement? > Normally we ask for man page updates to be presented at the same > time, as the man page defines the user interface that is being > implemented. In this case, we need updates for the pwritev2() man > page to document RWF_ATOMIC semantics, and the statx() man page to > document what the variables being exposed mean w.r.t. RWF_ATOMIC. > > The pwritev2() man page is probably the most important one right now > - it needs to explain the guarantees that RWF_ATOMIC is supposed to > provide w.r.t. data integrity, IO ordering, persistence, etc. > Indeed, it will need to explain exactly how this "multi-atomic-unit > mulit-bio non-atomic RWF_ATOMIC" IO thing can be used safely and > reliably, especially w.r.t. IO ordering and persistence guarantees > in the face of crashes and power failures. Not to mention > documenting error conditions specific to RWF_ATOMIC... > > It's all well and good to have some implementation, but without > actually defining and documenting the*guarantees* that RWF_ATOMIC > provides userspace it is completely useless for application > developers. And from the perspective of a reviewer, without the > documentation stating what the infrastructure actually guarantees > applications, we can't determine if the implementation being > presented is fit for purpose.... ok, understood. Obviously from any discussion so far there are many details which the user needs to know about how to use this interface and what to expect. We'll look to start working on those man page details now. Thanks, John
On Fri, May 05, 2023 at 09:01:58AM +0100, John Garry wrote: > On 04/05/2023 23:40, Dave Chinner wrote: > > Hi Dave, > > > > No, not yet. Is it normally expected to provide a proposed man page update > > > in parallel? Or somewhat later, when the kernel API change has some > > > appreciable level of agreement? > > Normally we ask for man page updates to be presented at the same > > time, as the man page defines the user interface that is being > > implemented. In this case, we need updates for the pwritev2() man > > page to document RWF_ATOMIC semantics, and the statx() man page to > > document what the variables being exposed mean w.r.t. RWF_ATOMIC. > > > > The pwritev2() man page is probably the most important one right now > > - it needs to explain the guarantees that RWF_ATOMIC is supposed to > > provide w.r.t. data integrity, IO ordering, persistence, etc. > > Indeed, it will need to explain exactly how this "multi-atomic-unit > > mulit-bio non-atomic RWF_ATOMIC" IO thing can be used safely and > > reliably, especially w.r.t. IO ordering and persistence guarantees > > in the face of crashes and power failures. Not to mention > > documenting error conditions specific to RWF_ATOMIC... > > > > It's all well and good to have some implementation, but without > > actually defining and documenting the*guarantees* that RWF_ATOMIC > > provides userspace it is completely useless for application > > developers. And from the perspective of a reviewer, without the > > documentation stating what the infrastructure actually guarantees > > applications, we can't determine if the implementation being > > presented is fit for purpose.... > > ok, understood. Obviously from any discussion so far there are many details > which the user needs to know about how to use this interface and what to > expect. > > We'll look to start working on those man page details now. Agreed. The manpage contents are what needs to get worked on at LSFMM where you'll have various block/fs/storage device people in the same room with which to discuss various issues and try to smooth out the misundertandings. (Also: I've decided to cancel my in-person attendance due to a sudden health issue. I'll still be in the room, just virtually now. :() --D > Thanks, > John
diff --git a/block/bdev.c b/block/bdev.c index 1795c7d4b99e..6a5fd5abaadc 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1014,3 +1014,24 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) blkdev_put_no_open(bdev); } + +/* + * Handle statx for block devices to get properties of WRITE ATOMIC + * feature support. + */ +void bdev_statx_atomic(struct inode *inode, struct kstat *stat) +{ + struct block_device *bdev; + + bdev = blkdev_get_no_open(inode->i_rdev); + if (!bdev) + return; + + stat->atomic_write_unit_min = queue_atomic_write_unit_min(bdev->bd_queue); + stat->atomic_write_unit_max = queue_atomic_write_unit_max(bdev->bd_queue); + stat->attributes |= STATX_ATTR_WRITE_ATOMIC; + stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC; + stat->result_mask |= STATX_WRITE_ATOMIC; + + blkdev_put_no_open(bdev); +} diff --git a/fs/stat.c b/fs/stat.c index 7c238da22ef0..d20334a0e9ae 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -256,6 +256,14 @@ static int vfs_statx(int dfd, struct filename *filename, int flags, bdev_statx_dioalign(inode, stat); } + /* Handle STATX_WRITE_ATOMIC for block devices */ + if (request_mask & STATX_WRITE_ATOMIC) { + struct inode *inode = d_backing_inode(path.dentry); + + if (S_ISBLK(inode->i_mode)) + bdev_statx_atomic(inode, stat); + } + path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -636,6 +644,8 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_mnt_id = stat->mnt_id; tmp.stx_dio_mem_align = stat->dio_mem_align; tmp.stx_dio_offset_align = stat->dio_offset_align; + tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min; + tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6b6f2992338c..19d33b2897b2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1527,6 +1527,7 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); void bdev_statx_dioalign(struct inode *inode, struct kstat *stat); +void bdev_statx_atomic(struct inode *inode, struct kstat *stat); void printk_all_partitions(void); #else static inline void invalidate_bdev(struct block_device *bdev) @@ -1546,6 +1547,9 @@ static inline void sync_bdevs(bool wait) static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) { } +static inline void bdev_statx_atomic(struct inode *inode, struct kstat *stat) +{ +} static inline void printk_all_partitions(void) { } diff --git a/include/linux/stat.h b/include/linux/stat.h index 52150570d37a..dfa69ecfaacf 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -53,6 +53,8 @@ struct kstat { u32 dio_mem_align; u32 dio_offset_align; u64 change_cookie; + u32 atomic_write_unit_max; + u32 atomic_write_unit_min; }; /* These definitions are internal to the kernel for now. Mainly used by nfsd. */ diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 7cab2c65d3d7..c99d7cac2aa6 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -127,7 +127,10 @@ struct statx { __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ /* 0xa0 */ - __u64 __spare3[12]; /* Spare space for future expansion */ + __u32 stx_atomic_write_unit_max; + __u32 stx_atomic_write_unit_min; + /* 0xb0 */ + __u64 __spare3[11]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -154,6 +157,7 @@ struct statx { #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ +#define STATX_WRITE_ATOMIC 0x00004000U /* Want/got atomic_write_* fields */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -189,6 +193,7 @@ struct statx { #define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ +#define STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */ #endif /* _UAPI_LINUX_STAT_H */