Message ID | 20240823162810.1668399-7-maharmstone@fb.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | btrfs: add io_uring for encoded reads | expand |
On 8/23/24 17:27, Mark Harmstone wrote: > Adds an io_uring interface for asynchronous encoded reads, using the > same interface as for the ioctl. To use this you would use an SQE opcode > of IORING_OP_URING_CMD, the cmd_op would be BTRFS_IOC_ENCODED_READ, and > addr would point to the userspace address of the > btrfs_ioctl_encoded_io_args struct. As with the ioctl, you need to have > CAP_SYS_ADMIN for this to work. > > Signed-off-by: Mark Harmstone <maharmstone@fb.com> > --- ... > diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h > index a5d786c6d7d4..62e5757d3ba2 100644 > --- a/fs/btrfs/btrfs_inode.h > +++ b/fs/btrfs/btrfs_inode.h > @@ -620,6 +620,8 @@ struct btrfs_encoded_read_private { > struct btrfs_ioctl_encoded_io_args args; > struct file *file; > void __user *copy_out; > + struct io_uring_cmd *cmd; > + unsigned int issue_flags; > }; > ... > +static inline struct btrfs_encoded_read_private *btrfs_uring_encoded_read_pdu( > + struct io_uring_cmd *cmd) > +{ > + return *(struct btrfs_encoded_read_private **)cmd->pdu; > +} > +static void btrfs_finish_uring_encoded_read(struct io_uring_cmd *cmd, > + unsigned int issue_flags) > +{ > + struct btrfs_encoded_read_private *priv; > + ssize_t ret; > + > + priv = btrfs_uring_encoded_read_pdu(cmd); > + ret = btrfs_encoded_read_finish(priv, -EIOCBQUEUED); > + > + io_uring_cmd_done(priv->cmd, ret, 0, priv->issue_flags); s/priv->issue_flags/issue_flags/ You can't use flags that you got somewhere before from a completely different execution context. > + > + kfree(priv); > +} > + > +static void btrfs_encoded_read_uring_endio(struct btrfs_bio *bbio) > +{ > + struct btrfs_encoded_read_private *priv = bbio->private; Might be cleaner if you combine it with btrfs_encoded_read_ioctl_endio btrfs_encoded_read_endio() { ... if (!atomic_dec_return(&priv->pending)) { if (priv->cmd) io_uring_cmd_complete_in_task(); else wake_up(); ... 
} > + > + if (bbio->bio.bi_status) { > + /* > + * The memory barrier implied by the atomic_dec_return() here > + * pairs with the memory barrier implied by the > + * atomic_dec_return() or io_wait_event() in > + * btrfs_encoded_read_regular_fill_pages() to ensure that this > + * write is observed before the load of status in > + * btrfs_encoded_read_regular_fill_pages(). > + */ > + WRITE_ONCE(priv->status, bbio->bio.bi_status); > + } > + if (!atomic_dec_return(&priv->pending)) { > + io_uring_cmd_complete_in_task(priv->cmd, > + btrfs_finish_uring_encoded_read); > + } > + bio_put(&bbio->bio); > +} > + > static void _btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, > u64 file_offset, u64 disk_bytenr, > u64 disk_io_size, > @@ -9106,11 +9148,16 @@ static void _btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, > struct btrfs_fs_info *fs_info = inode->root->fs_info; > unsigned long i = 0; > struct btrfs_bio *bbio; > + btrfs_bio_end_io_t cb; > > - init_waitqueue_head(&priv->wait); > + if (priv->cmd) { > + cb = btrfs_encoded_read_uring_endio; > + } else { > + init_waitqueue_head(&priv->wait); > + cb = btrfs_encoded_read_ioctl_endio; > + } > > - bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, > - btrfs_encoded_read_endio, priv); > + bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, cb, priv); > bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; > bbio->inode = inode; > > @@ -9122,7 +9169,7 @@ static void _btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, > btrfs_submit_bio(bbio, 0); > > bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, > - btrfs_encoded_read_endio, priv); > + cb, priv); > bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; > bbio->inode = inode; > continue; > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c > index c1886209933a..85a512a9ca64 100644 > --- a/fs/btrfs/ioctl.c > +++ b/fs/btrfs/ioctl.c ... 
> +static void btrfs_uring_encoded_read(struct io_uring_cmd *cmd, > + unsigned int issue_flags) > +{ > + ssize_t ret; > + void __user *argp = (void __user *)(uintptr_t)cmd->sqe->addr; u64_to_user_ptr() > + struct btrfs_encoded_read_private *priv; > + bool compat = issue_flags & IO_URING_F_COMPAT; > + > + priv = kmalloc(sizeof(*priv), GFP_KERNEL); > + if (!priv) { > + ret = -ENOMEM; > + goto out_uring; > + } > + > + ret = btrfs_prepare_encoded_read(priv, cmd->file, compat, argp); > + if (ret) > + goto out_finish; > + > + if (iov_iter_count(&priv->iter) == 0) { > + ret = 0; > + goto out_finish; > + } > + > + *(struct btrfs_encoded_read_private **)cmd->pdu = priv; > + priv->cmd = cmd; > + priv->issue_flags = issue_flags; You shouldn't be stashing issue_flags, it's almost always wrong. Looks like btrfs_finish_uring_encoded_read() is the only user, and with my comment above you can kill priv->issue_flags > + ret = btrfs_encoded_read(priv); > + > + if (ret == -EIOCBQUEUED && atomic_dec_return(&priv->pending)) > + return; Is gating on -EIOCBQUEUED just an optimisation? I.e. it's fine to swap checks if (atomic_dec_return(&priv->pending) && ret == -EIOCBQUEUED) but we know that unless it returned -EIOCBQUEUED nobody should've touched ->pending. > + > +out_finish: > + ret = btrfs_encoded_read_finish(priv, ret); > + kfree(priv); > + > +out_uring: > + io_uring_cmd_done(cmd, ret, 0, issue_flags); > +} > +
On 8/23/24 17:27, Mark Harmstone wrote: > Adds an io_uring interface for asynchronous encoded reads, using the > same interface as for the ioctl. To use this you would use an SQE opcode > of IORING_OP_URING_CMD, the cmd_op would be BTRFS_IOC_ENCODED_READ, and > addr would point to the userspace address of the > btrfs_ioctl_encoded_io_args struct. As with the ioctl, you need to have > CAP_SYS_ADMIN for this to work. > > Signed-off-by: Mark Harmstone <maharmstone@fb.com> > --- ... > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index 1bd4c74e8c51..e4458168c340 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -34,6 +34,7 @@ > #include <linux/iomap.h> > #include <asm/unaligned.h> > #include <linux/fsverity.h> > +#include <linux/io_uring/cmd.h> > #include "misc.h" > #include "ctree.h" > #include "disk-io.h" > @@ -9078,7 +9079,7 @@ static ssize_t btrfs_encoded_read_inline( > return ret; > } > > -static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) > +static void btrfs_encoded_read_ioctl_endio(struct btrfs_bio *bbio) > { > struct btrfs_encoded_read_private *priv = bbio->private; > > @@ -9098,6 +9099,47 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) > bio_put(&bbio->bio); > } > > +static inline struct btrfs_encoded_read_private *btrfs_uring_encoded_read_pdu( > + struct io_uring_cmd *cmd) > +{ > + return *(struct btrfs_encoded_read_private **)cmd->pdu; > +} > +static void btrfs_finish_uring_encoded_read(struct io_uring_cmd *cmd, > + unsigned int issue_flags) > +{ > + struct btrfs_encoded_read_private *priv; > + ssize_t ret; > + > + priv = btrfs_uring_encoded_read_pdu(cmd); > + ret = btrfs_encoded_read_finish(priv, -EIOCBQUEUED); tw callback -> btrfs_encoded_read_finish() -> copy_to_user() That's usually fine except cases when the task and/or io_uring are dying and the callback executes not from a user task context. 
Same problem as with fuse, we can pass a flag from io_uring into the callback telling btrfs that it should terminate the request and not rely on mm or any other task-related pieces. > + > + io_uring_cmd_done(priv->cmd, ret, 0, priv->issue_flags); > + > + kfree(priv); > +} > + ...
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a5d786c6d7d4..62e5757d3ba2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -620,6 +620,8 @@ struct btrfs_encoded_read_private { struct btrfs_ioctl_encoded_io_args args; struct file *file; void __user *copy_out; + struct io_uring_cmd *cmd; + unsigned int issue_flags; }; ssize_t btrfs_encoded_read(struct btrfs_encoded_read_private *priv); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9914419f3b7d..2db76422d126 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3769,6 +3769,7 @@ const struct file_operations btrfs_file_operations = { .compat_ioctl = btrfs_compat_ioctl, #endif .remap_file_range = btrfs_remap_file_range, + .uring_cmd = btrfs_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1bd4c74e8c51..e4458168c340 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -34,6 +34,7 @@ #include <linux/iomap.h> #include <asm/unaligned.h> #include <linux/fsverity.h> +#include <linux/io_uring/cmd.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" @@ -9078,7 +9079,7 @@ static ssize_t btrfs_encoded_read_inline( return ret; } -static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) +static void btrfs_encoded_read_ioctl_endio(struct btrfs_bio *bbio) { struct btrfs_encoded_read_private *priv = bbio->private; @@ -9098,6 +9099,47 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) bio_put(&bbio->bio); } +static inline struct btrfs_encoded_read_private *btrfs_uring_encoded_read_pdu( + struct io_uring_cmd *cmd) +{ + return *(struct btrfs_encoded_read_private **)cmd->pdu; +} +static void btrfs_finish_uring_encoded_read(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct btrfs_encoded_read_private *priv; + ssize_t ret; + + priv = btrfs_uring_encoded_read_pdu(cmd); + ret = btrfs_encoded_read_finish(priv, -EIOCBQUEUED); + + io_uring_cmd_done(priv->cmd, ret, 0, priv->issue_flags); 
+ + kfree(priv); +} + +static void btrfs_encoded_read_uring_endio(struct btrfs_bio *bbio) +{ + struct btrfs_encoded_read_private *priv = bbio->private; + + if (bbio->bio.bi_status) { + /* + * The memory barrier implied by the atomic_dec_return() here + * pairs with the memory barrier implied by the + * atomic_dec_return() or io_wait_event() in + * btrfs_encoded_read_regular_fill_pages() to ensure that this + * write is observed before the load of status in + * btrfs_encoded_read_regular_fill_pages(). + */ + WRITE_ONCE(priv->status, bbio->bio.bi_status); + } + if (!atomic_dec_return(&priv->pending)) { + io_uring_cmd_complete_in_task(priv->cmd, + btrfs_finish_uring_encoded_read); + } + bio_put(&bbio->bio); +} + static void _btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 disk_io_size, @@ -9106,11 +9148,16 @@ static void _btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long i = 0; struct btrfs_bio *bbio; + btrfs_bio_end_io_t cb; - init_waitqueue_head(&priv->wait); + if (priv->cmd) { + cb = btrfs_encoded_read_uring_endio; + } else { + init_waitqueue_head(&priv->wait); + cb = btrfs_encoded_read_ioctl_endio; + } - bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, priv); + bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, cb, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; @@ -9122,7 +9169,7 @@ static void _btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, btrfs_submit_bio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, priv); + cb, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; continue; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c1886209933a..85a512a9ca64 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -29,6 +29,7 @@ #include 
<linux/fileattr.h> #include <linux/fsverity.h> #include <linux/sched/xacct.h> +#include <linux/io_uring/cmd.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -4509,8 +4510,8 @@ static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool return ret; } -static ssize_t btrfs_encoded_read_finish(struct btrfs_encoded_read_private *priv, - ssize_t ret) +ssize_t btrfs_encoded_read_finish(struct btrfs_encoded_read_private *priv, + ssize_t ret) { size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); @@ -4725,6 +4726,60 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool return ret; } +static void btrfs_uring_encoded_read(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + ssize_t ret; + void __user *argp = (void __user *)(uintptr_t)cmd->sqe->addr; + struct btrfs_encoded_read_private *priv; + bool compat = issue_flags & IO_URING_F_COMPAT; + + priv = kmalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + goto out_uring; + } + + ret = btrfs_prepare_encoded_read(priv, cmd->file, compat, argp); + if (ret) + goto out_finish; + + if (iov_iter_count(&priv->iter) == 0) { + ret = 0; + goto out_finish; + } + + *(struct btrfs_encoded_read_private **)cmd->pdu = priv; + priv->cmd = cmd; + priv->issue_flags = issue_flags; + ret = btrfs_encoded_read(priv); + + if (ret == -EIOCBQUEUED && atomic_dec_return(&priv->pending)) + return; + +out_finish: + ret = btrfs_encoded_read_finish(priv, ret); + kfree(priv); + +out_uring: + io_uring_cmd_done(cmd, ret, 0, issue_flags); +} + +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + switch (cmd->cmd_op) { + case BTRFS_IOC_ENCODED_READ: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_READ_32: +#endif + btrfs_uring_encoded_read(cmd, issue_flags); + return -EIOCBQUEUED; + } + + io_uring_cmd_done(cmd, -EINVAL, 0, issue_flags); + return -EIOCBQUEUED; +} + long btrfs_ioctl(struct file 
*file, unsigned int cmd, unsigned long arg) { diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 19cd26b0244a..9d1522de79d3 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -22,5 +22,8 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(const u8 *uuid); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +ssize_t btrfs_encoded_read_finish(struct btrfs_encoded_read_private *priv, + ssize_t ret); #endif
Adds an io_uring interface for asynchronous encoded reads, using the same interface as for the ioctl. To use this you would use an SQE opcode of IORING_OP_URING_CMD, the cmd_op would be BTRFS_IOC_ENCODED_READ, and addr would point to the userspace address of the btrfs_ioctl_encoded_io_args struct. As with the ioctl, you need to have CAP_SYS_ADMIN for this to work. Signed-off-by: Mark Harmstone <maharmstone@fb.com> --- fs/btrfs/btrfs_inode.h | 2 ++ fs/btrfs/file.c | 1 + fs/btrfs/inode.c | 57 ++++++++++++++++++++++++++++++++++++---- fs/btrfs/ioctl.c | 59 ++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/ioctl.h | 3 +++ 5 files changed, 115 insertions(+), 7 deletions(-)