diff mbox series

[5/5] btrfs: add io_uring command for encoded reads

Message ID 20241014171838.304953-6-maharmstone@fb.com (mailing list archive)
State New
Headers show
Series btrfs: encoded reads via io_uring | expand

Commit Message

Mark Harmstone Oct. 14, 2024, 5:18 p.m. UTC
Adds an io_uring command for encoded reads, using the same interface as
the existing BTRFS_IOC_ENCODED_READ ioctl.

Signed-off-by: Mark Harmstone <maharmstone@fb.com>
---
 fs/btrfs/file.c  |   1 +
 fs/btrfs/ioctl.c | 283 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ioctl.h |   1 +
 3 files changed, 285 insertions(+)

Comments

David Sterba Oct. 21, 2024, 1:50 p.m. UTC | #1
On Mon, Oct 14, 2024 at 06:18:27PM +0100, Mark Harmstone wrote:
> Adds an io_uring command for encoded reads, using the same interface as

Add ...

> the existing BTRFS_IOC_ENCODED_READ ioctl.

This is probably a good summary in a changelog but the patch is quite
long so it feels like this should be described in a more detail how it's
done. Connecting two interfaces can be done in various ways, so at least
mention that it's a simple pass through, or if there are any
complications regardign locking, object lifetime and such.

> Signed-off-by: Mark Harmstone <maharmstone@fb.com>
> ---
>  fs/btrfs/file.c  |   1 +
>  fs/btrfs/ioctl.c | 283 +++++++++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/ioctl.h |   1 +
>  3 files changed, 285 insertions(+)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 2aeb8116549c..e33ca73fef8c 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -3774,6 +3774,7 @@ const struct file_operations btrfs_file_operations = {
>  	.compat_ioctl	= btrfs_compat_ioctl,
>  #endif
>  	.remap_file_range = btrfs_remap_file_range,
> +	.uring_cmd	= btrfs_uring_cmd,
>  	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
>  };
>  
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 8c9ff4898ab0..c0393575cf5e 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -29,6 +29,7 @@
>  #include <linux/fileattr.h>
>  #include <linux/fsverity.h>
>  #include <linux/sched/xacct.h>
> +#include <linux/io_uring/cmd.h>
>  #include "ctree.h"
>  #include "disk-io.h"
>  #include "export.h"
> @@ -4723,6 +4724,288 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
>  	return ret;
>  }
>  
> +struct btrfs_uring_priv {
> +	struct io_uring_cmd *cmd;
> +	struct page **pages;
> +	unsigned long nr_pages;
> +	struct kiocb iocb;
> +	struct iovec *iov;
> +	struct iov_iter iter;
> +	struct extent_state *cached_state;
> +	u64 count;
> +	bool compressed;

This leaves a 7 byte hole.

> +	u64 start;
> +	u64 lockend;
> +};

The whole structure should be documented and the members too if it's not
obvious what they are used for.

> +
> +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd,
> +				      unsigned int issue_flags)
> +{
> +	struct btrfs_uring_priv *priv = (struct btrfs_uring_priv *)*(uintptr_t*)cmd->pdu;
> +	struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
> +	struct extent_io_tree *io_tree = &inode->io_tree;
> +	unsigned long i;

Why is this long?

> +	u64 cur;
> +	size_t page_offset;
> +	ssize_t ret;
> +
> +	if (priv->compressed) {
> +		i = 0;
> +		page_offset = 0;
> +	} else {
> +		i = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
> +		page_offset = (priv->iocb.ki_pos - priv->start) & (PAGE_SIZE - 1);

Please don't open code page_offset()

> +	}
> +	cur = 0;
> +	while (cur < priv->count) {
> +		size_t bytes = min_t(size_t, priv->count - cur,
> +				     PAGE_SIZE - page_offset);
> +
> +		if (copy_page_to_iter(priv->pages[i], page_offset, bytes,
> +				      &priv->iter) != bytes) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +
> +		i++;
> +		cur += bytes;
> +		page_offset = 0;
> +	}
> +	ret = priv->count;
> +
> +out:
> +	unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
> +	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> +
> +	io_uring_cmd_done(cmd, ret, 0, issue_flags);
> +	add_rchar(current, ret);
> +
> +	for (unsigned long i = 0; i < priv->nr_pages; i++) {

Shadowing 'i' of the same type as is declared in the function. Maybe
don't call it just 'i' but index as it's used outside of a loop.

> +		__free_page(priv->pages[i]);
> +	}

Please drop the outer { } for a single statement block.

> +
> +	kfree(priv->pages);
> +	kfree(priv->iov);
> +	kfree(priv);
> +}
> +
> +static void btrfs_uring_read_extent_cb(void *ctx, int err)
> +{
> +	struct btrfs_uring_priv *priv = ctx;
> +
> +	*(uintptr_t*)priv->cmd->pdu = (uintptr_t)priv;

Isn't there a helper for that? Type casting should be done in justified
cases and as an exception.

> +	io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
> +}
> +
> +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
> +				   u64 start, u64 lockend,
> +				   struct extent_state *cached_state,
> +				   u64 disk_bytenr, u64 disk_io_size,
> +				   size_t count, bool compressed,
> +				   struct iovec *iov,
> +				   struct io_uring_cmd *cmd)
> +{
> +	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
> +	struct extent_io_tree *io_tree = &inode->io_tree;
> +	struct page **pages;
> +	struct btrfs_uring_priv *priv = NULL;
> +	unsigned long nr_pages;
> +	int ret;
> +
> +	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
> +	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
> +	if (!pages)
> +		return -ENOMEM;
> +	ret = btrfs_alloc_page_array(nr_pages, pages, 0);

The allocation sizes are derived from disk_io_size that comes from the
outside, potentially making large allocatoins. Or is there some inherent
limit on the maximu size?

> +	if (ret) {
> +		ret = -ENOMEM;
> +		goto fail;
> +	}
> +
> +	priv = kmalloc(sizeof(*priv), GFP_NOFS);
> +	if (!priv) {
> +		ret = -ENOMEM;
> +		goto fail;
> +	}
> +
> +	priv->iocb = *iocb;
> +	priv->iov = iov;
> +	priv->iter = *iter;
> +	priv->count = count;
> +	priv->cmd = cmd;
> +	priv->cached_state = cached_state;
> +	priv->compressed = compressed;
> +	priv->nr_pages = nr_pages;
> +	priv->pages = pages;
> +	priv->start = start;
> +	priv->lockend = lockend;
> +
> +	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
> +						    disk_io_size, pages,
> +						    btrfs_uring_read_extent_cb,
> +						    priv);
> +	if (ret)
> +		goto fail;
> +
> +	return -EIOCBQUEUED;
> +
> +fail:
> +	unlock_extent(io_tree, start, lockend, &cached_state);
> +	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> +	kfree(priv);

Does this leak pages and priv->pages?

> +	return ret;
> +}
> +
> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
> +				    unsigned int issue_flags)
> +{
> +	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
> +					     flags);
> +	size_t copy_end;
> +	struct btrfs_ioctl_encoded_io_args args = {0};
                                                = { 0 }
> +	int ret;
> +	u64 disk_bytenr, disk_io_size;
> +	struct file *file = cmd->file;
> +	struct btrfs_inode *inode = BTRFS_I(file->f_inode);
> +	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> +	struct extent_io_tree *io_tree = &inode->io_tree;
> +	struct iovec iovstack[UIO_FASTIOV];
> +	struct iovec *iov = iovstack;
> +	struct iov_iter iter;
> +	loff_t pos;
> +	struct kiocb kiocb;
> +	struct extent_state *cached_state = NULL;
> +	u64 start, lockend;

The stack consumption looks quite high.

> +
> +	if (!capable(CAP_SYS_ADMIN)) {
> +		ret = -EPERM;
> +		goto out_acct;
> +	}
> +
> +	if (issue_flags & IO_URING_F_COMPAT) {
> +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
> +		struct btrfs_ioctl_encoded_io_args_32 args32;
> +
> +		copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
> +				       flags);
> +		if (copy_from_user(&args32, (const void *)cmd->sqe->addr,
> +				   copy_end)) {
> +			ret = -EFAULT;
> +			goto out_acct;
> +		}
> +		args.iov = compat_ptr(args32.iov);
> +		args.iovcnt = args32.iovcnt;
> +		args.offset = args32.offset;
> +		args.flags = args32.flags;
> +#else
> +		return -ENOTTY;
> +#endif
> +	} else {
> +		copy_end = copy_end_kernel;
> +		if (copy_from_user(&args, (const void *)cmd->sqe->addr,
> +				   copy_end)) {
> +			ret = -EFAULT;
> +			goto out_acct;
> +		}
> +	}
> +
> +	if (args.flags != 0)
> +		return -EINVAL;
> +
> +	ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
> +			   &iov, &iter);
> +	if (ret < 0)
> +		goto out_acct;
> +
> +	if (iov_iter_count(&iter) == 0) {
> +		ret = 0;
> +		goto out_free;
> +	}
> +
> +	pos = args.offset;
> +	ret = rw_verify_area(READ, file, &pos, args.len);
> +	if (ret < 0)
> +		goto out_free;
> +
> +	init_sync_kiocb(&kiocb, file);
> +	kiocb.ki_pos = pos;
> +
> +	start = ALIGN_DOWN(pos, fs_info->sectorsize);
> +	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
> +
> +	ret = btrfs_encoded_read(&kiocb, &iter, &args,
> +				 issue_flags & IO_URING_F_NONBLOCK,
> +				 &cached_state, &disk_bytenr, &disk_io_size);
> +	if (ret < 0 && ret != -EIOCBQUEUED)
> +		goto out_free;
> +
> +	file_accessed(file);
> +
> +	if (copy_to_user((void*)(uintptr_t)cmd->sqe->addr + copy_end,
> +			 (char *)&args + copy_end_kernel,

So many type casts again

> +			 sizeof(args) - copy_end_kernel)) {
> +		if (ret == -EIOCBQUEUED) {
> +			unlock_extent(io_tree, start, lockend, &cached_state);
> +			btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> +		}
> +		ret = -EFAULT;
> +		goto out_free;
> +	}
> +
> +	if (ret == -EIOCBQUEUED) {
> +		u64 count;
> +
> +		/* If we've optimized things by storing the iovecs on the stack,
> +		 * undo this.  */

This is not proper comment formatting.

> +		if (!iov) {
> +			iov = kmalloc(sizeof(struct iovec) * args.iovcnt,
> +				      GFP_NOFS);
> +			if (!iov) {
> +				unlock_extent(io_tree, start, lockend,
> +					      &cached_state);
> +				btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> +				ret = -ENOMEM;
> +				goto out_acct;
> +			}
> +
> +			memcpy(iov, iovstack,
> +			       sizeof(struct iovec) * args.iovcnt);
> +		}
> +
> +		count = min_t(u64, iov_iter_count(&iter), disk_io_size);
> +
> +		ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
> +					      cached_state, disk_bytenr,
> +					      disk_io_size, count,
> +					      args.compression, iov, cmd);
> +
> +		goto out_acct;
> +	}
> +
> +out_free:
> +	kfree(iov);
> +
> +out_acct:
> +	if (ret > 0)
> +		add_rchar(current, ret);
> +	inc_syscr(current);
> +
> +	return ret;
> +}
> +
> +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
> +{
> +	switch (cmd->cmd_op) {
> +	case BTRFS_IOC_ENCODED_READ:
> +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
> +	case BTRFS_IOC_ENCODED_READ_32:
> +#endif
> +		return btrfs_uring_encoded_read(cmd, issue_flags);
> +	}
> +
> +	return -EINVAL;
> +}
> +
>  long btrfs_ioctl(struct file *file, unsigned int
>  		cmd, unsigned long arg)
>  {
Pavel Begunkov Oct. 21, 2024, 4:15 p.m. UTC | #2
On 10/21/24 14:50, David Sterba wrote:
> On Mon, Oct 14, 2024 at 06:18:27PM +0100, Mark Harmstone wrote:
>> Adds an io_uring command for encoded reads, using the same interface as
> 
> Add ...
> 
>> the existing BTRFS_IOC_ENCODED_READ ioctl.
> 
> This is probably a good summary in a changelog but the patch is quite
> long so it feels like this should be described in a more detail how it's
> done. Connecting two interfaces can be done in various ways, so at least
> mention that it's a simple pass through, or if there are any
> complications regardign locking, object lifetime and such.
> 
>> Signed-off-by: Mark Harmstone <maharmstone@fb.com>
...
>> +
>> +	kfree(priv->pages);
>> +	kfree(priv->iov);
>> +	kfree(priv);
>> +}
>> +
>> +static void btrfs_uring_read_extent_cb(void *ctx, int err)
>> +{
>> +	struct btrfs_uring_priv *priv = ctx;
>> +
>> +	*(uintptr_t*)priv->cmd->pdu = (uintptr_t)priv;
> 
> Isn't there a helper for that? Type casting should be done in justified
> cases and as an exception.

FWIW, I haven't taken a look yet, but for this one, please use
io_uring_cmd_to_pdu(cmd, type), it'll check the size for you.
And there in no need to cast it to uintptr, would be much nicer
to

struct btrfs_cmd {
	struct btrfs_uring_priv *priv;
};

struct btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct btrfs_cmd);
bc->priv = priv;

You get more type checking this way. You can also wrap around
io_uring_cmd_to_pdu() into a static inline helper, if that
looks better.

...>> +
>> +	start = ALIGN_DOWN(pos, fs_info->sectorsize);
>> +	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
>> +
>> +	ret = btrfs_encoded_read(&kiocb, &iter, &args,
>> +				 issue_flags & IO_URING_F_NONBLOCK,
>> +				 &cached_state, &disk_bytenr, &disk_io_size);
>> +	if (ret < 0 && ret != -EIOCBQUEUED)
>> +		goto out_free;
>> +
>> +	file_accessed(file);
>> +
>> +	if (copy_to_user((void*)(uintptr_t)cmd->sqe->addr + copy_end,
>> +			 (char *)&args + copy_end_kernel,

Be aware, SQE data is not stable, you should assume that the user
space can change it at any moment. It should be a READ_ONCE, and
likely you don't want it to be read twice, unless you handle it /
verify values / etc. I'd recommend to save it early in the callback
and stash somewhere, e.g. into struct btrfs_cmd I mentioned above.

> 
> So many type casts again
>
Mark Harmstone Oct. 21, 2024, 5:05 p.m. UTC | #3
Thanks David.

On 21/10/24 14:50, David Sterba wrote:
>> +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
>> +				   u64 start, u64 lockend,
>> +				   struct extent_state *cached_state,
>> +				   u64 disk_bytenr, u64 disk_io_size,
>> +				   size_t count, bool compressed,
>> +				   struct iovec *iov,
>> +				   struct io_uring_cmd *cmd)
>> +{
>> +	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
>> +	struct extent_io_tree *io_tree = &inode->io_tree;
>> +	struct page **pages;
>> +	struct btrfs_uring_priv *priv = NULL;
>> +	unsigned long nr_pages;
>> +	int ret;
>> +
>> +	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
>> +	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
>> +	if (!pages)
>> +		return -ENOMEM;
>> +	ret = btrfs_alloc_page_array(nr_pages, pages, 0);
> 
> The allocation sizes are derived from disk_io_size that comes from the
> outside, potentially making large allocatoins. Or is there some inherent
> limit on the maximu size?

Yes. It comes from btrfs_encoded_read, where it's limited to 
BTRFS_MAX_UNCOMPRESSED (i.e. 128KB).

>> +	if (ret) {
>> +		ret = -ENOMEM;
>> +		goto fail;
>> +	}
>> +
>> +	priv = kmalloc(sizeof(*priv), GFP_NOFS);
>> +	if (!priv) {
>> +		ret = -ENOMEM;
>> +		goto fail;
>> +	}
>> +
>> +	priv->iocb = *iocb;
>> +	priv->iov = iov;
>> +	priv->iter = *iter;
>> +	priv->count = count;
>> +	priv->cmd = cmd;
>> +	priv->cached_state = cached_state;
>> +	priv->compressed = compressed;
>> +	priv->nr_pages = nr_pages;
>> +	priv->pages = pages;
>> +	priv->start = start;
>> +	priv->lockend = lockend;
>> +
>> +	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
>> +						    disk_io_size, pages,
>> +						    btrfs_uring_read_extent_cb,
>> +						    priv);
>> +	if (ret)
>> +		goto fail;
>> +
>> +	return -EIOCBQUEUED;
>> +
>> +fail:
>> +	unlock_extent(io_tree, start, lockend, &cached_state);
>> +	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
>> +	kfree(priv);
> 
> Does this leak pages and priv->pages?

No, they get freed in btrfs_uring_read_finished.

>> +	return ret;
>> +}
>> +
>> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
>> +				    unsigned int issue_flags)
>> +{
>> +	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
>> +					     flags);
>> +	size_t copy_end;
>> +	struct btrfs_ioctl_encoded_io_args args = {0};
>                                                  = { 0 }
>> +	int ret;
>> +	u64 disk_bytenr, disk_io_size;
>> +	struct file *file = cmd->file;
>> +	struct btrfs_inode *inode = BTRFS_I(file->f_inode);
>> +	struct btrfs_fs_info *fs_info = inode->root->fs_info;
>> +	struct extent_io_tree *io_tree = &inode->io_tree;
>> +	struct iovec iovstack[UIO_FASTIOV];
>> +	struct iovec *iov = iovstack;
>> +	struct iov_iter iter;
>> +	loff_t pos;
>> +	struct kiocb kiocb;
>> +	struct extent_state *cached_state = NULL;
>> +	u64 start, lockend;
> 
> The stack consumption looks quite high.

696 bytes, compared to 672 in btrfs_ioctl_encoded_read. 
btrfs_ioctl_encoded write is pretty big too. Probably the easiest thing 
here would be to allocate btrfs_uring_priv early and pass that around, I 
think.

Do you have a recommendation for what the maximum stack size of a 
function should be?

Mark
David Sterba Oct. 21, 2024, 6:23 p.m. UTC | #4
On Mon, Oct 21, 2024 at 05:05:20PM +0000, Mark Harmstone wrote:
> >> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
> >> +				    unsigned int issue_flags)
> >> +{
> >> +	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
> >> +					     flags);
> >> +	size_t copy_end;
> >> +	struct btrfs_ioctl_encoded_io_args args = {0};
> >                                                  = { 0 }
> >> +	int ret;
> >> +	u64 disk_bytenr, disk_io_size;
> >> +	struct file *file = cmd->file;
> >> +	struct btrfs_inode *inode = BTRFS_I(file->f_inode);
> >> +	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> >> +	struct extent_io_tree *io_tree = &inode->io_tree;
> >> +	struct iovec iovstack[UIO_FASTIOV];
> >> +	struct iovec *iov = iovstack;
> >> +	struct iov_iter iter;
> >> +	loff_t pos;
> >> +	struct kiocb kiocb;
> >> +	struct extent_state *cached_state = NULL;
> >> +	u64 start, lockend;
> > 
> > The stack consumption looks quite high.
> 
> 696 bytes, compared to 672 in btrfs_ioctl_encoded_read. 
> btrfs_ioctl_encoded write is pretty big too. Probably the easiest thing 
> here would be to allocate btrfs_uring_priv early and pass that around, I 
> think.
> 
> Do you have a recommendation for what the maximum stack size of a 
> function should be?

It depends from where the function is called. For ioctl callbacks, like
btrfs_ioctl_encoded_read it's the first function using kernel stack
leaving enough for any deep IO stacks (DM/NFS/iSCSI/...). If something
similar applies to the io_uring callbacks then it's probably fine.

Using a separate off-stack structure works but it's a penalty as it
needs the allcation. The io_uring is meant for high performance so if
the on-stack allocation is safe then keep it like that.

I've checked on a release config the stack consumption and the encoded
ioctl functions are not the worst:

tree-log.c:btrfs_sync_log                       728 static
scrub.c:scrub_verify_one_metadata               552 dynamic,bounded
inode.c:print_data_reloc_error                  544 dynamic,bounded
uuid-tree.c:btrfs_uuid_scan_kthread             520 static
tree-checker.c:check_root_item                  504 static
file-item.c:btrfs_csum_one_bio                  496 static
inode.c:btrfs_start_delalloc_roots              488 static
scrub.c:scrub_raid56_parity_stripe              464 dynamic,bounded
disk-io.c:write_dev_supers                      464 static
ioctl.c:btrfs_ioctl_encoded_write               456 dynamic,bounded
ioctl.c:btrfs_ioctl_encoded_read                456 dynamic,bounded
Mark Harmstone Oct. 22, 2024, 9:12 a.m. UTC | #5
On 21/10/24 19:23, David Sterba wrote:
> > 
> On Mon, Oct 21, 2024 at 05:05:20PM +0000, Mark Harmstone wrote:
>>>> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
>>>> +				    unsigned int issue_flags)
>>>> +{
>>>> +	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
>>>> +					     flags);
>>>> +	size_t copy_end;
>>>> +	struct btrfs_ioctl_encoded_io_args args = {0};
>>>                                                   = { 0 }
>>>> +	int ret;
>>>> +	u64 disk_bytenr, disk_io_size;
>>>> +	struct file *file = cmd->file;
>>>> +	struct btrfs_inode *inode = BTRFS_I(file->f_inode);
>>>> +	struct btrfs_fs_info *fs_info = inode->root->fs_info;
>>>> +	struct extent_io_tree *io_tree = &inode->io_tree;
>>>> +	struct iovec iovstack[UIO_FASTIOV];
>>>> +	struct iovec *iov = iovstack;
>>>> +	struct iov_iter iter;
>>>> +	loff_t pos;
>>>> +	struct kiocb kiocb;
>>>> +	struct extent_state *cached_state = NULL;
>>>> +	u64 start, lockend;
>>>
>>> The stack consumption looks quite high.
>>
>> 696 bytes, compared to 672 in btrfs_ioctl_encoded_read.
>> btrfs_ioctl_encoded write is pretty big too. Probably the easiest thing
>> here would be to allocate btrfs_uring_priv early and pass that around, I
>> think.
>>
>> Do you have a recommendation for what the maximum stack size of a
>> function should be?
> 
> It depends from where the function is called. For ioctl callbacks, like
> btrfs_ioctl_encoded_read it's the first function using kernel stack
> leaving enough for any deep IO stacks (DM/NFS/iSCSI/...). If something
> similar applies to the io_uring callbacks then it's probably fine.

Thanks. Yes, the two should functions should be broadly equivalent.

> Using a separate off-stack structure works but it's a penalty as it
> needs the allcation. The io_uring is meant for high performance so if
> the on-stack allocation is safe then keep it like that.

Okay, I'll leave this bit as it is, then. I can revisit it if we start 
getting a spike of stack overflow crashes mentioning 
btrfs_uring_encoded_read.

> 
> I've checked on a release config the stack consumption and the encoded
> ioctl functions are not the worst:
> 
> tree-log.c:btrfs_sync_log                       728 static
> scrub.c:scrub_verify_one_metadata               552 dynamic,bounded
> inode.c:print_data_reloc_error                  544 dynamic,bounded
> uuid-tree.c:btrfs_uuid_scan_kthread             520 static
> tree-checker.c:check_root_item                  504 static
> file-item.c:btrfs_csum_one_bio                  496 static
> inode.c:btrfs_start_delalloc_roots              488 static
> scrub.c:scrub_raid56_parity_stripe              464 dynamic,bounded
> disk-io.c:write_dev_supers                      464 static
> ioctl.c:btrfs_ioctl_encoded_write               456 dynamic,bounded
> ioctl.c:btrfs_ioctl_encoded_read                456 dynamic,bounded
diff mbox series

Patch

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2aeb8116549c..e33ca73fef8c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3774,6 +3774,7 @@  const struct file_operations btrfs_file_operations = {
 	.compat_ioctl	= btrfs_compat_ioctl,
 #endif
 	.remap_file_range = btrfs_remap_file_range,
+	.uring_cmd	= btrfs_uring_cmd,
 	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
 };
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8c9ff4898ab0..c0393575cf5e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -29,6 +29,7 @@ 
 #include <linux/fileattr.h>
 #include <linux/fsverity.h>
 #include <linux/sched/xacct.h>
+#include <linux/io_uring/cmd.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "export.h"
@@ -4723,6 +4724,288 @@  static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
 	return ret;
 }
 
+struct btrfs_uring_priv {
+	struct io_uring_cmd *cmd;
+	struct page **pages;
+	unsigned long nr_pages;
+	struct kiocb iocb;
+	struct iovec *iov;
+	struct iov_iter iter;
+	struct extent_state *cached_state;
+	u64 count;
+	bool compressed;
+	u64 start;
+	u64 lockend;
+};
+
+static void btrfs_uring_read_finished(struct io_uring_cmd *cmd,
+				      unsigned int issue_flags)
+{
+	struct btrfs_uring_priv *priv = (struct btrfs_uring_priv *)*(uintptr_t*)cmd->pdu;
+	struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
+	struct extent_io_tree *io_tree = &inode->io_tree;
+	unsigned long i;
+	u64 cur;
+	size_t page_offset;
+	ssize_t ret;
+
+	if (priv->compressed) {
+		i = 0;
+		page_offset = 0;
+	} else {
+		i = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
+		page_offset = (priv->iocb.ki_pos - priv->start) & (PAGE_SIZE - 1);
+	}
+	cur = 0;
+	while (cur < priv->count) {
+		size_t bytes = min_t(size_t, priv->count - cur,
+				     PAGE_SIZE - page_offset);
+
+		if (copy_page_to_iter(priv->pages[i], page_offset, bytes,
+				      &priv->iter) != bytes) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		i++;
+		cur += bytes;
+		page_offset = 0;
+	}
+	ret = priv->count;
+
+out:
+	unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
+	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+
+	io_uring_cmd_done(cmd, ret, 0, issue_flags);
+	add_rchar(current, ret);
+
+	for (unsigned long i = 0; i < priv->nr_pages; i++) {
+		__free_page(priv->pages[i]);
+	}
+
+	kfree(priv->pages);
+	kfree(priv->iov);
+	kfree(priv);
+}
+
+static void btrfs_uring_read_extent_cb(void *ctx, int err)
+{
+	struct btrfs_uring_priv *priv = ctx;
+
+	*(uintptr_t*)priv->cmd->pdu = (uintptr_t)priv;
+	io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
+}
+
+static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
+				   u64 start, u64 lockend,
+				   struct extent_state *cached_state,
+				   u64 disk_bytenr, u64 disk_io_size,
+				   size_t count, bool compressed,
+				   struct iovec *iov,
+				   struct io_uring_cmd *cmd)
+{
+	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+	struct extent_io_tree *io_tree = &inode->io_tree;
+	struct page **pages;
+	struct btrfs_uring_priv *priv = NULL;
+	unsigned long nr_pages;
+	int ret;
+
+	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+	if (!pages)
+		return -ENOMEM;
+	ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+	if (ret) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	priv = kmalloc(sizeof(*priv), GFP_NOFS);
+	if (!priv) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	priv->iocb = *iocb;
+	priv->iov = iov;
+	priv->iter = *iter;
+	priv->count = count;
+	priv->cmd = cmd;
+	priv->cached_state = cached_state;
+	priv->compressed = compressed;
+	priv->nr_pages = nr_pages;
+	priv->pages = pages;
+	priv->start = start;
+	priv->lockend = lockend;
+
+	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
+						    disk_io_size, pages,
+						    btrfs_uring_read_extent_cb,
+						    priv);
+	if (ret)
+		goto fail;
+
+	return -EIOCBQUEUED;
+
+fail:
+	unlock_extent(io_tree, start, lockend, &cached_state);
+	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+	kfree(priv);
+	return ret;
+}
+
+static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd,
+				    unsigned int issue_flags)
+{
+	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
+					     flags);
+	size_t copy_end;
+	struct btrfs_ioctl_encoded_io_args args = {0};
+	int ret;
+	u64 disk_bytenr, disk_io_size;
+	struct file *file = cmd->file;
+	struct btrfs_inode *inode = BTRFS_I(file->f_inode);
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct extent_io_tree *io_tree = &inode->io_tree;
+	struct iovec iovstack[UIO_FASTIOV];
+	struct iovec *iov = iovstack;
+	struct iov_iter iter;
+	loff_t pos;
+	struct kiocb kiocb;
+	struct extent_state *cached_state = NULL;
+	u64 start, lockend;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto out_acct;
+	}
+
+	if (issue_flags & IO_URING_F_COMPAT) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+		struct btrfs_ioctl_encoded_io_args_32 args32;
+
+		copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
+				       flags);
+		if (copy_from_user(&args32, (const void *)cmd->sqe->addr,
+				   copy_end)) {
+			ret = -EFAULT;
+			goto out_acct;
+		}
+		args.iov = compat_ptr(args32.iov);
+		args.iovcnt = args32.iovcnt;
+		args.offset = args32.offset;
+		args.flags = args32.flags;
+#else
+		return -ENOTTY;
+#endif
+	} else {
+		copy_end = copy_end_kernel;
+		if (copy_from_user(&args, (const void *)cmd->sqe->addr,
+				   copy_end)) {
+			ret = -EFAULT;
+			goto out_acct;
+		}
+	}
+
+	if (args.flags != 0)
+		return -EINVAL;
+
+	ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+			   &iov, &iter);
+	if (ret < 0)
+		goto out_acct;
+
+	if (iov_iter_count(&iter) == 0) {
+		ret = 0;
+		goto out_free;
+	}
+
+	pos = args.offset;
+	ret = rw_verify_area(READ, file, &pos, args.len);
+	if (ret < 0)
+		goto out_free;
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = pos;
+
+	start = ALIGN_DOWN(pos, fs_info->sectorsize);
+	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+	ret = btrfs_encoded_read(&kiocb, &iter, &args,
+				 issue_flags & IO_URING_F_NONBLOCK,
+				 &cached_state, &disk_bytenr, &disk_io_size);
+	if (ret < 0 && ret != -EIOCBQUEUED)
+		goto out_free;
+
+	file_accessed(file);
+
+	if (copy_to_user((void*)(uintptr_t)cmd->sqe->addr + copy_end,
+			 (char *)&args + copy_end_kernel,
+			 sizeof(args) - copy_end_kernel)) {
+		if (ret == -EIOCBQUEUED) {
+			unlock_extent(io_tree, start, lockend, &cached_state);
+			btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+		}
+		ret = -EFAULT;
+		goto out_free;
+	}
+
+	if (ret == -EIOCBQUEUED) {
+		u64 count;
+
+		/* If we've optimized things by storing the iovecs on the stack,
+		 * undo this.  */
+		if (!iov) {
+			iov = kmalloc(sizeof(struct iovec) * args.iovcnt,
+				      GFP_NOFS);
+			if (!iov) {
+				unlock_extent(io_tree, start, lockend,
+					      &cached_state);
+				btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+				ret = -ENOMEM;
+				goto out_acct;
+			}
+
+			memcpy(iov, iovstack,
+			       sizeof(struct iovec) * args.iovcnt);
+		}
+
+		count = min_t(u64, iov_iter_count(&iter), disk_io_size);
+
+		ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
+					      cached_state, disk_bytenr,
+					      disk_io_size, count,
+					      args.compression, iov, cmd);
+
+		goto out_acct;
+	}
+
+out_free:
+	kfree(iov);
+
+out_acct:
+	if (ret > 0)
+		add_rchar(current, ret);
+	inc_syscr(current);
+
+	return ret;
+}
+
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+	switch (cmd->cmd_op) {
+	case BTRFS_IOC_ENCODED_READ:
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+	case BTRFS_IOC_ENCODED_READ_32:
+#endif
+		return btrfs_uring_encoded_read(cmd, issue_flags);
+	}
+
+	return -EINVAL;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 19cd26b0244a..288f4f5c4409 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,5 +22,6 @@  void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
 int __pure btrfs_is_empty_uuid(const u8 *uuid);
 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
 				     struct btrfs_ioctl_balance_args *bargs);
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
 
 #endif