diff mbox series

[6/6] btrfs: add io_uring interface for encoded reads

Message ID 20240823162810.1668399-7-maharmstone@fb.com (mailing list archive)
State New
Headers show
Series btrfs: add io_uring for encoded reads | expand

Commit Message

Mark Harmstone Aug. 23, 2024, 4:27 p.m. UTC
Adds an io_uring interface for asynchronous encoded reads, using the
same interface as for the ioctl. To use this you would use an SQE opcode
of IORING_OP_URING_CMD, the cmd_op would be BTRFS_IOC_ENCODED_READ, and
addr would point to the userspace address of the
btrfs_ioctl_encoded_io_args struct. As with the ioctl, you need to have
CAP_SYS_ADMIN for this to work.

Signed-off-by: Mark Harmstone <maharmstone@fb.com>
---
 fs/btrfs/btrfs_inode.h |  2 ++
 fs/btrfs/file.c        |  1 +
 fs/btrfs/inode.c       | 57 ++++++++++++++++++++++++++++++++++++----
 fs/btrfs/ioctl.c       | 59 ++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/ioctl.h       |  3 +++
 5 files changed, 115 insertions(+), 7 deletions(-)

Comments

Pavel Begunkov Sept. 6, 2024, 2:41 p.m. UTC | #1
On 8/23/24 17:27, Mark Harmstone wrote:
> Adds an io_uring interface for asynchronous encoded reads, using the
> same interface as for the ioctl. To use this you would use an SQE opcode
> of IORING_OP_URING_CMD, the cmd_op would be BTRFS_IOC_ENCODED_READ, and
> addr would point to the userspace address of the
> btrfs_ioctl_encoded_io_args struct. As with the ioctl, you need to have
> CAP_SYS_ADMIN for this to work.
> 
> Signed-off-by: Mark Harmstone <maharmstone@fb.com>
> ---
...
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index a5d786c6d7d4..62e5757d3ba2 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -620,6 +620,8 @@ struct btrfs_encoded_read_private {
>   	struct btrfs_ioctl_encoded_io_args args;
>   	struct file *file;
>   	void __user *copy_out;
> +	struct io_uring_cmd *cmd;
> +	unsigned int issue_flags;
>   };
>   
...
> +static inline struct btrfs_encoded_read_private *btrfs_uring_encoded_read_pdu(
> +		struct io_uring_cmd *cmd)
> +{
> +	return *(struct btrfs_encoded_read_private **)cmd->pdu;
> +}
> +static void btrfs_finish_uring_encoded_read(struct io_uring_cmd *cmd,
> +					    unsigned int issue_flags)
> +{
> +	struct btrfs_encoded_read_private *priv;
> +	ssize_t ret;
> +
> +	priv = btrfs_uring_encoded_read_pdu(cmd);
> +	ret = btrfs_encoded_read_finish(priv, -EIOCBQUEUED);
> +
> +	io_uring_cmd_done(priv->cmd, ret, 0, priv->issue_flags);

s/priv->issue_flags/issue_flags/

You can't use flags that you got somewhere before from a completely
different execution context.

> +
> +	kfree(priv);
> +}
> +
> +static void btrfs_encoded_read_uring_endio(struct btrfs_bio *bbio)
> +{
> +	struct btrfs_encoded_read_private *priv = bbio->private;

Might be cleaner if you combine it with btrfs_encoded_read_ioctl_endio

btrfs_encoded_read_endio()
{
	...
	if (!atomic_dec_return(&priv->pending)) {
		if (priv->cmd)
			io_uring_cmd_complete_in_task();
		else
			wake_up();
	...
}



> +
> +	if (bbio->bio.bi_status) {
> +		/*
> +		 * The memory barrier implied by the atomic_dec_return() here
> +		 * pairs with the memory barrier implied by the
> +		 * atomic_dec_return() or io_wait_event() in
> +		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
> +		 * write is observed before the load of status in
> +		 * btrfs_encoded_read_regular_fill_pages().
> +		 */
> +		WRITE_ONCE(priv->status, bbio->bio.bi_status);
> +	}
> +	if (!atomic_dec_return(&priv->pending)) {
> +		io_uring_cmd_complete_in_task(priv->cmd,
> +					      btrfs_finish_uring_encoded_read);
> +	}
> +	bio_put(&bbio->bio);
> +}
> +
>   static void _btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
>   					u64 file_offset, u64 disk_bytenr,
>   					u64 disk_io_size,
> @@ -9106,11 +9148,16 @@ static void _btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
>   	struct btrfs_fs_info *fs_info = inode->root->fs_info;
>   	unsigned long i = 0;
>   	struct btrfs_bio *bbio;
> +	btrfs_bio_end_io_t cb;
>   
> -	init_waitqueue_head(&priv->wait);
> +	if (priv->cmd) {
> +		cb = btrfs_encoded_read_uring_endio;
> +	} else {
> +		init_waitqueue_head(&priv->wait);
> +		cb = btrfs_encoded_read_ioctl_endio;
> +	}
>   
> -	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
> -			       btrfs_encoded_read_endio, priv);
> +	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, cb, priv);
>   	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
>   	bbio->inode = inode;
>   
> @@ -9122,7 +9169,7 @@ static void _btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
>   			btrfs_submit_bio(bbio, 0);
>   
>   			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
> -					       btrfs_encoded_read_endio, priv);
> +					       cb, priv);
>   			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
>   			bbio->inode = inode;
>   			continue;
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index c1886209933a..85a512a9ca64 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
...
> +static void btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
> +				     unsigned int issue_flags)
> +{
> +	ssize_t ret;
> +	void __user *argp = (void __user *)(uintptr_t)cmd->sqe->addr;

u64_to_user_ptr()

> +	struct btrfs_encoded_read_private *priv;
> +	bool compat = issue_flags & IO_URING_F_COMPAT;
> +
> +	priv = kmalloc(sizeof(*priv), GFP_KERNEL);
> +	if (!priv) {
> +		ret = -ENOMEM;
> +		goto out_uring;
> +	}
> +
> +	ret = btrfs_prepare_encoded_read(priv, cmd->file, compat, argp);
> +	if (ret)
> +		goto out_finish;
> +
> +	if (iov_iter_count(&priv->iter) == 0) {
> +		ret = 0;
> +		goto out_finish;
> +	}
> +
> +	*(struct btrfs_encoded_read_private **)cmd->pdu = priv;
> +	priv->cmd = cmd;
> +	priv->issue_flags = issue_flags;

You shouldn't be stashing issue_flags, it almost always wrong.
Looks btrfs_finish_uring_encoded_read() is the only user, and
with my comment above you can kill priv->issue_flags

> +	ret = btrfs_encoded_read(priv);
> +
> +	if (ret == -EIOCBQUEUED && atomic_dec_return(&priv->pending))
> +		return;

Is gating on -EIOCBQUEUED just an optimisation? I.e. it's fine
to swap checks

if (atomic_dec_return(&priv->pending) && ret == -EIOCBQUEUED)

but we know that unless it returned -EIOCBQUEUED nobody
should've touched ->pending.

> +
> +out_finish:
> +	ret = btrfs_encoded_read_finish(priv, ret);
> +	kfree(priv);
> +
> +out_uring:
> +	io_uring_cmd_done(cmd, ret, 0, issue_flags);
> +}
> +
Pavel Begunkov Sept. 6, 2024, 3:33 p.m. UTC | #2
On 8/23/24 17:27, Mark Harmstone wrote:
> Adds an io_uring interface for asynchronous encoded reads, using the
> same interface as for the ioctl. To use this you would use an SQE opcode
> of IORING_OP_URING_CMD, the cmd_op would be BTRFS_IOC_ENCODED_READ, and
> addr would point to the userspace address of the
> btrfs_ioctl_encoded_io_args struct. As with the ioctl, you need to have
> CAP_SYS_ADMIN for this to work.
> 
> Signed-off-by: Mark Harmstone <maharmstone@fb.com>
> ---
...
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 1bd4c74e8c51..e4458168c340 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -34,6 +34,7 @@
>   #include <linux/iomap.h>
>   #include <asm/unaligned.h>
>   #include <linux/fsverity.h>
> +#include <linux/io_uring/cmd.h>
>   #include "misc.h"
>   #include "ctree.h"
>   #include "disk-io.h"
> @@ -9078,7 +9079,7 @@ static ssize_t btrfs_encoded_read_inline(
>   	return ret;
>   }
>   
> -static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
> +static void btrfs_encoded_read_ioctl_endio(struct btrfs_bio *bbio)
>   {
>   	struct btrfs_encoded_read_private *priv = bbio->private;
>   
> @@ -9098,6 +9099,47 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
>   	bio_put(&bbio->bio);
>   }
>   
> +static inline struct btrfs_encoded_read_private *btrfs_uring_encoded_read_pdu(
> +		struct io_uring_cmd *cmd)
> +{
> +	return *(struct btrfs_encoded_read_private **)cmd->pdu;
> +}
> +static void btrfs_finish_uring_encoded_read(struct io_uring_cmd *cmd,
> +					    unsigned int issue_flags)
> +{
> +	struct btrfs_encoded_read_private *priv;
> +	ssize_t ret;
> +
> +	priv = btrfs_uring_encoded_read_pdu(cmd);
> +	ret = btrfs_encoded_read_finish(priv, -EIOCBQUEUED);

tw callback -> btrfs_encoded_read_finish() -> copy_to_user()

That's usually fine except cases when the task and/or io_uring are dying
and the callback executes not from a user task context. Same problem as
with fuse, we can pass a flag from io_uring into the callback telling
btrfs that it should terminate the request and not rely on mm or any
other task related pieces.


> +
> +	io_uring_cmd_done(priv->cmd, ret, 0, priv->issue_flags);
> +
> +	kfree(priv);
> +}
> +

...
diff mbox series

Patch

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index a5d786c6d7d4..62e5757d3ba2 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -620,6 +620,8 @@  struct btrfs_encoded_read_private {
 	struct btrfs_ioctl_encoded_io_args args;
 	struct file *file;
 	void __user *copy_out;
+	struct io_uring_cmd *cmd;
+	unsigned int issue_flags;
 };
 
 ssize_t btrfs_encoded_read(struct btrfs_encoded_read_private *priv);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9914419f3b7d..2db76422d126 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3769,6 +3769,7 @@  const struct file_operations btrfs_file_operations = {
 	.compat_ioctl	= btrfs_compat_ioctl,
 #endif
 	.remap_file_range = btrfs_remap_file_range,
+	.uring_cmd	= btrfs_uring_cmd,
 	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
 };
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1bd4c74e8c51..e4458168c340 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -34,6 +34,7 @@ 
 #include <linux/iomap.h>
 #include <asm/unaligned.h>
 #include <linux/fsverity.h>
+#include <linux/io_uring/cmd.h>
 #include "misc.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -9078,7 +9079,7 @@  static ssize_t btrfs_encoded_read_inline(
 	return ret;
 }
 
-static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
+static void btrfs_encoded_read_ioctl_endio(struct btrfs_bio *bbio)
 {
 	struct btrfs_encoded_read_private *priv = bbio->private;
 
@@ -9098,6 +9099,47 @@  static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
 	bio_put(&bbio->bio);
 }
 
+static inline struct btrfs_encoded_read_private *btrfs_uring_encoded_read_pdu(
+		struct io_uring_cmd *cmd)
+{
+	return *(struct btrfs_encoded_read_private **)cmd->pdu;
+}
+static void btrfs_finish_uring_encoded_read(struct io_uring_cmd *cmd,
+					    unsigned int issue_flags)
+{
+	struct btrfs_encoded_read_private *priv;
+	ssize_t ret;
+
+	priv = btrfs_uring_encoded_read_pdu(cmd);
+	ret = btrfs_encoded_read_finish(priv, -EIOCBQUEUED);
+
+	io_uring_cmd_done(priv->cmd, ret, 0, priv->issue_flags);
+
+	kfree(priv);
+}
+
+static void btrfs_encoded_read_uring_endio(struct btrfs_bio *bbio)
+{
+	struct btrfs_encoded_read_private *priv = bbio->private;
+
+	if (bbio->bio.bi_status) {
+		/*
+		 * The memory barrier implied by the atomic_dec_return() here
+		 * pairs with the memory barrier implied by the
+		 * atomic_dec_return() or io_wait_event() in
+		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
+		 * write is observed before the load of status in
+		 * btrfs_encoded_read_regular_fill_pages().
+		 */
+		WRITE_ONCE(priv->status, bbio->bio.bi_status);
+	}
+	if (!atomic_dec_return(&priv->pending)) {
+		io_uring_cmd_complete_in_task(priv->cmd,
+					      btrfs_finish_uring_encoded_read);
+	}
+	bio_put(&bbio->bio);
+}
+
 static void _btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 					u64 file_offset, u64 disk_bytenr,
 					u64 disk_io_size,
@@ -9106,11 +9148,16 @@  static void _btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	unsigned long i = 0;
 	struct btrfs_bio *bbio;
+	btrfs_bio_end_io_t cb;
 
-	init_waitqueue_head(&priv->wait);
+	if (priv->cmd) {
+		cb = btrfs_encoded_read_uring_endio;
+	} else {
+		init_waitqueue_head(&priv->wait);
+		cb = btrfs_encoded_read_ioctl_endio;
+	}
 
-	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
-			       btrfs_encoded_read_endio, priv);
+	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, cb, priv);
 	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
 	bbio->inode = inode;
 
@@ -9122,7 +9169,7 @@  static void _btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 			btrfs_submit_bio(bbio, 0);
 
 			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
-					       btrfs_encoded_read_endio, priv);
+					       cb, priv);
 			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
 			bbio->inode = inode;
 			continue;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c1886209933a..85a512a9ca64 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -29,6 +29,7 @@ 
 #include <linux/fileattr.h>
 #include <linux/fsverity.h>
 #include <linux/sched/xacct.h>
+#include <linux/io_uring/cmd.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "export.h"
@@ -4509,8 +4510,8 @@  static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool
 	return ret;
 }
 
-static ssize_t btrfs_encoded_read_finish(struct btrfs_encoded_read_private *priv,
-					 ssize_t ret)
+ssize_t btrfs_encoded_read_finish(struct btrfs_encoded_read_private *priv,
+				  ssize_t ret)
 {
 	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
 					     flags);
@@ -4725,6 +4726,60 @@  static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
 	return ret;
 }
 
+static void btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
+				     unsigned int issue_flags)
+{
+	ssize_t ret;
+	void __user *argp = (void __user *)(uintptr_t)cmd->sqe->addr;
+	struct btrfs_encoded_read_private *priv;
+	bool compat = issue_flags & IO_URING_F_COMPAT;
+
+	priv = kmalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv) {
+		ret = -ENOMEM;
+		goto out_uring;
+	}
+
+	ret = btrfs_prepare_encoded_read(priv, cmd->file, compat, argp);
+	if (ret)
+		goto out_finish;
+
+	if (iov_iter_count(&priv->iter) == 0) {
+		ret = 0;
+		goto out_finish;
+	}
+
+	*(struct btrfs_encoded_read_private **)cmd->pdu = priv;
+	priv->cmd = cmd;
+	priv->issue_flags = issue_flags;
+	ret = btrfs_encoded_read(priv);
+
+	if (ret == -EIOCBQUEUED && atomic_dec_return(&priv->pending))
+		return;
+
+out_finish:
+	ret = btrfs_encoded_read_finish(priv, ret);
+	kfree(priv);
+
+out_uring:
+	io_uring_cmd_done(cmd, ret, 0, issue_flags);
+}
+
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+	switch (cmd->cmd_op) {
+	case BTRFS_IOC_ENCODED_READ:
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+	case BTRFS_IOC_ENCODED_READ_32:
+#endif
+		btrfs_uring_encoded_read(cmd, issue_flags);
+		return -EIOCBQUEUED;
+	}
+
+	io_uring_cmd_done(cmd, -EINVAL, 0, issue_flags);
+	return -EIOCBQUEUED;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 19cd26b0244a..9d1522de79d3 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,5 +22,8 @@  void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
 int __pure btrfs_is_empty_uuid(const u8 *uuid);
 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
 				     struct btrfs_ioctl_balance_args *bargs);
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
+ssize_t btrfs_encoded_read_finish(struct btrfs_encoded_read_private *priv,
+				  ssize_t ret);
 
 #endif