Message ID | 20240610104329.3555488-4-john.g.garry@oracle.com (mailing list archive) |
---|---|
State | Superseded, archived |
Delegated to: | Benjamin Marzinski |
Headers | show |
Series | block atomic writes | expand |
On Mon, Jun 10, 2024 at 10:43:22AM +0000, John Garry wrote: > From: Prasad Singamsetty <prasad.singamsetty@oracle.com> > > An atomic write is a write issued with torn-write protection, meaning > that for a power failure or any other hardware failure, all or none of the > data from the write will be stored, but never a mix of old and new data. > > Userspace may add flag RWF_ATOMIC to pwritev2() to indicate that the > write is to be issued with torn-write prevention, according to special > alignment and length rules. > > For any syscall interface utilizing struct iocb, add IOCB_ATOMIC for > iocb->ki_flags field to indicate the same. > > A call to statx will give the relevant atomic write info for a file: > - atomic_write_unit_min > - atomic_write_unit_max > - atomic_write_segments_max > > Both min and max values must be a power-of-2. > > Applications can avail of the atomic write feature by ensuring that the total > length of a write is a power-of-2 in size and also sized between > atomic_write_unit_min and atomic_write_unit_max, inclusive. Applications > must ensure that the write is at a naturally-aligned offset in the file > wrt the total write length. The value in atomic_write_segments_max > indicates the upper limit for IOV_ITER iovcnt. > > Add file mode flag FMODE_CAN_ATOMIC_WRITE, so files which do not have the > flag set will have RWF_ATOMIC rejected and not just ignored. > > Add a type argument to kiocb_set_rw_flags() to allow reads which have > RWF_ATOMIC set to be rejected. > > Helper function generic_atomic_write_valid() can be used by FSes to verify > compliant writes. There we check that the iov_iter type is ubuf, which > implies iovcnt==1 for pwritev2(), which is an initial restriction for > atomic_write_segments_max. Initially the only user will be bdev file > operations write handler. We will rely on the block BIO submission path to > ensure write sizes are compliant for the bdev, so we don't need to check > atomic write sizes yet. 
> > Signed-off-by: Prasad Singamsetty <prasad.singamsetty@oracle.com> > jpg: merge into single patch and much rewrite > Signed-off-by: John Garry <john.g.garry@oracle.com> Seems fine to me, though clearly others have had much stronger opinions in the past so: Acked-by: Darrick J. Wong <djwong@kernel.org> --D > --- > fs/aio.c | 8 ++++---- > fs/btrfs/ioctl.c | 2 +- > fs/read_write.c | 18 +++++++++++++++++- > include/linux/fs.h | 17 +++++++++++++++-- > include/uapi/linux/fs.h | 5 ++++- > io_uring/rw.c | 9 ++++----- > 6 files changed, 45 insertions(+), 14 deletions(-) > > diff --git a/fs/aio.c b/fs/aio.c > index 57c9f7c077e6..93ef59d358b3 100644 > --- a/fs/aio.c > +++ b/fs/aio.c > @@ -1516,7 +1516,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res) > iocb_put(iocb); > } > > -static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) > +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type) > { > int ret; > > @@ -1542,7 +1542,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) > } else > req->ki_ioprio = get_current_ioprio(); > > - ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); > + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type); > if (unlikely(ret)) > return ret; > > @@ -1594,7 +1594,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, > struct file *file; > int ret; > > - ret = aio_prep_rw(req, iocb); > + ret = aio_prep_rw(req, iocb, READ); > if (ret) > return ret; > file = req->ki_filp; > @@ -1621,7 +1621,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, > struct file *file; > int ret; > > - ret = aio_prep_rw(req, iocb); > + ret = aio_prep_rw(req, iocb, WRITE); > if (ret) > return ret; > file = req->ki_filp; > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c > index efd5d6e9589e..6ad524b894fc 100644 > --- a/fs/btrfs/ioctl.c > +++ b/fs/btrfs/ioctl.c > @@ -4627,7 +4627,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool > 
goto out_iov; > > init_sync_kiocb(&kiocb, file); > - ret = kiocb_set_rw_flags(&kiocb, 0); > + ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); > if (ret) > goto out_iov; > kiocb.ki_pos = pos; > diff --git a/fs/read_write.c b/fs/read_write.c > index ef6339391351..285b0f5a9a9c 100644 > --- a/fs/read_write.c > +++ b/fs/read_write.c > @@ -730,7 +730,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, > ssize_t ret; > > init_sync_kiocb(&kiocb, filp); > - ret = kiocb_set_rw_flags(&kiocb, flags); > + ret = kiocb_set_rw_flags(&kiocb, flags, type); > if (ret) > return ret; > kiocb.ki_pos = (ppos ? *ppos : 0); > @@ -1736,3 +1736,19 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) > > return 0; > } > + > +bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos) > +{ > + size_t len = iov_iter_count(iter); > + > + if (!iter_is_ubuf(iter)) > + return false; > + > + if (!is_power_of_2(len)) > + return false; > + > + if (!IS_ALIGNED(pos, len)) > + return false; > + > + return true; > +} > \ No newline at end of file > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 0283cf366c2a..e049414bef7d 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -125,8 +125,10 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, > #define FMODE_EXEC ((__force fmode_t)(1 << 5)) > /* File writes are restricted (block device specific) */ > #define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6)) > +/* File supports atomic writes */ > +#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7)) > > -/* FMODE_* bits 7 to 8 */ > +/* FMODE_* bit 8 */ > > /* 32bit hashes as llseek() offset (for directories) */ > #define FMODE_32BITHASH ((__force fmode_t)(1 << 9)) > @@ -317,6 +319,7 @@ struct readahead_control; > #define IOCB_SYNC (__force int) RWF_SYNC > #define IOCB_NOWAIT (__force int) RWF_NOWAIT > #define IOCB_APPEND (__force int) RWF_APPEND > +#define IOCB_ATOMIC (__force int) RWF_ATOMIC > > /* non-RWF 
related bits - start at 16 */ > #define IOCB_EVENTFD (1 << 16) > @@ -351,6 +354,7 @@ struct readahead_control; > { IOCB_SYNC, "SYNC" }, \ > { IOCB_NOWAIT, "NOWAIT" }, \ > { IOCB_APPEND, "APPEND" }, \ > + { IOCB_ATOMIC, "ATOMIC"}, \ > { IOCB_EVENTFD, "EVENTFD"}, \ > { IOCB_DIRECT, "DIRECT" }, \ > { IOCB_WRITE, "WRITE" }, \ > @@ -3403,7 +3407,8 @@ static inline int iocb_flags(struct file *file) > return res; > } > > -static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) > +static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, > + int rw_type) > { > int kiocb_flags = 0; > > @@ -3422,6 +3427,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) > return -EOPNOTSUPP; > kiocb_flags |= IOCB_NOIO; > } > + if (flags & RWF_ATOMIC) { > + if (rw_type != WRITE) > + return -EOPNOTSUPP; > + if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) > + return -EOPNOTSUPP; > + } > kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); > if (flags & RWF_SYNC) > kiocb_flags |= IOCB_DSYNC; > @@ -3613,4 +3624,6 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, > extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, > int advice); > > +bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos); > + > #endif /* _LINUX_FS_H */ > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h > index 45e4e64fd664..191a7e88a8ab 100644 > --- a/include/uapi/linux/fs.h > +++ b/include/uapi/linux/fs.h > @@ -329,9 +329,12 @@ typedef int __bitwise __kernel_rwf_t; > /* per-IO negation of O_APPEND */ > #define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) > > +/* Atomic Write */ > +#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) > + > /* mask of flags supported by the kernel */ > #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ > - RWF_APPEND | RWF_NOAPPEND) > + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) > > /* Pagemap ioctl */ > #define PAGEMAP_SCAN _IOWR('f', 16, struct 
pm_scan_arg) > diff --git a/io_uring/rw.c b/io_uring/rw.c > index 1a2128459cb4..c004d21e2f12 100644 > --- a/io_uring/rw.c > +++ b/io_uring/rw.c > @@ -772,7 +772,7 @@ static bool need_complete_io(struct io_kiocb *req) > S_ISBLK(file_inode(req->file)->i_mode); > } > > -static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) > +static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) > { > struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); > struct kiocb *kiocb = &rw->kiocb; > @@ -787,7 +787,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) > req->flags |= io_file_get_flags(file); > > kiocb->ki_flags = file->f_iocb_flags; > - ret = kiocb_set_rw_flags(kiocb, rw->flags); > + ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); > if (unlikely(ret)) > return ret; > kiocb->ki_flags |= IOCB_ALLOC_CACHE; > @@ -832,8 +832,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) > if (unlikely(ret < 0)) > return ret; > } > - > - ret = io_rw_init_file(req, FMODE_READ); > + ret = io_rw_init_file(req, FMODE_READ, READ); > if (unlikely(ret)) > return ret; > req->cqe.res = iov_iter_count(&io->iter); > @@ -1013,7 +1012,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) > ssize_t ret, ret2; > loff_t *ppos; > > - ret = io_rw_init_file(req, FMODE_WRITE); > + ret = io_rw_init_file(req, FMODE_WRITE, WRITE); > if (unlikely(ret)) > return ret; > req->cqe.res = iov_iter_count(&io->iter); > -- > 2.31.1 > >
diff --git a/fs/aio.c b/fs/aio.c index 57c9f7c077e6..93ef59d358b3 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1516,7 +1516,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res) iocb_put(iocb); } -static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type) { int ret; @@ -1542,7 +1542,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) } else req->ki_ioprio = get_current_ioprio(); - ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type); if (unlikely(ret)) return ret; @@ -1594,7 +1594,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, READ); if (ret) return ret; file = req->ki_filp; @@ -1621,7 +1621,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, WRITE); if (ret) return ret; file = req->ki_filp; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index efd5d6e9589e..6ad524b894fc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4627,7 +4627,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool goto out_iov; init_sync_kiocb(&kiocb, file); - ret = kiocb_set_rw_flags(&kiocb, 0); + ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); if (ret) goto out_iov; kiocb.ki_pos = pos; diff --git a/fs/read_write.c b/fs/read_write.c index ef6339391351..285b0f5a9a9c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -730,7 +730,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, ssize_t ret; init_sync_kiocb(&kiocb, filp); - ret = kiocb_set_rw_flags(&kiocb, flags); + ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? 
*ppos : 0); @@ -1736,3 +1736,19 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; } + +bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos) +{ + size_t len = iov_iter_count(iter); + + if (!iter_is_ubuf(iter)) + return false; + + if (!is_power_of_2(len)) + return false; + + if (!IS_ALIGNED(pos, len)) + return false; + + return true; +} \ No newline at end of file diff --git a/include/linux/fs.h b/include/linux/fs.h index 0283cf366c2a..e049414bef7d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -125,8 +125,10 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_EXEC ((__force fmode_t)(1 << 5)) /* File writes are restricted (block device specific) */ #define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6)) +/* File supports atomic writes */ +#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7)) -/* FMODE_* bits 7 to 8 */ +/* FMODE_* bit 8 */ /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)(1 << 9)) @@ -317,6 +319,7 @@ struct readahead_control; #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND +#define IOCB_ATOMIC (__force int) RWF_ATOMIC /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -351,6 +354,7 @@ struct readahead_control; { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ + { IOCB_ATOMIC, "ATOMIC"}, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -3403,7 +3407,8 @@ static inline int iocb_flags(struct file *file) return res; } -static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) +static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, + int rw_type) { int kiocb_flags = 0; @@ -3422,6 +3427,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return -EOPNOTSUPP; kiocb_flags |= IOCB_NOIO; } + if 
(flags & RWF_ATOMIC) { + if (rw_type != WRITE) + return -EOPNOTSUPP; + if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; @@ -3613,4 +3624,6 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos); + #endif /* _LINUX_FS_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 45e4e64fd664..191a7e88a8ab 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -329,9 +329,12 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO negation of O_APPEND */ #define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) +/* Atomic Write */ +#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND | RWF_NOAPPEND) + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) /* Pagemap ioctl */ #define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) diff --git a/io_uring/rw.c b/io_uring/rw.c index 1a2128459cb4..c004d21e2f12 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -772,7 +772,7 @@ static bool need_complete_io(struct io_kiocb *req) S_ISBLK(file_inode(req->file)->i_mode); } -static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) +static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; @@ -787,7 +787,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) req->flags |= io_file_get_flags(file); kiocb->ki_flags = file->f_iocb_flags; - ret = kiocb_set_rw_flags(kiocb, rw->flags); + ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; kiocb->ki_flags |= IOCB_ALLOC_CACHE; @@ -832,8 +832,7 @@ 
static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret < 0)) return ret; } - - ret = io_rw_init_file(req, FMODE_READ); + ret = io_rw_init_file(req, FMODE_READ, READ); if (unlikely(ret)) return ret; req->cqe.res = iov_iter_count(&io->iter); @@ -1013,7 +1012,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ssize_t ret, ret2; loff_t *ppos; - ret = io_rw_init_file(req, FMODE_WRITE); + ret = io_rw_init_file(req, FMODE_WRITE, WRITE); if (unlikely(ret)) return ret; req->cqe.res = iov_iter_count(&io->iter);