diff mbox series

[5/6] btrfs: move wait out of btrfs_encoded_read

Message ID 20240823162810.1668399-6-maharmstone@fb.com (mailing list archive)
State New
Headers show
Series btrfs: add io_uring for encoded reads | expand

Commit Message

Mark Harmstone Aug. 23, 2024, 4:27 p.m. UTC
Makes it so that if btrfs_encoded_read has launched a bio, rather than
waiting for it to complete it leaves the extent and inode locked and returns
-EIOCBQUEUED. The caller is responsible for waiting on the bio, and
calling the completion code in the new btrfs_encoded_read_regular_end.

Signed-off-by: Mark Harmstone <maharmstone@fb.com>
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/inode.c       | 81 +++++++++++++++++++++++-------------------
 fs/btrfs/ioctl.c       |  8 +++++
 3 files changed, 54 insertions(+), 36 deletions(-)

Comments

Pavel Begunkov Sept. 6, 2024, 3:11 p.m. UTC | #1
On 8/23/24 17:27, Mark Harmstone wrote:
> Makes it so that if btrfs_encoded_read has launched a bio, rather than
> waiting for it to complete it leaves the extent and inode locked and returns
> -EIOCBQUEUED. The caller is responsible for waiting on the bio, and
> calling the completion code in the new btrfs_encoded_read_regular_end.
> 
> Signed-off-by: Mark Harmstone <maharmstone@fb.com>
> ---
>   fs/btrfs/btrfs_inode.h |  1 +
>   fs/btrfs/inode.c       | 81 +++++++++++++++++++++++-------------------
>   fs/btrfs/ioctl.c       |  8 +++++
>   3 files changed, 54 insertions(+), 36 deletions(-)
> 
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index f4d77c3bb544..a5d786c6d7d4 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -623,6 +623,7 @@ struct btrfs_encoded_read_private {
>   };
>   
>   ssize_t btrfs_encoded_read(struct btrfs_encoded_read_private *priv);
> +ssize_t btrfs_encoded_read_regular_end(struct btrfs_encoded_read_private *priv);
>   ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
>   			       const struct btrfs_ioctl_encoded_io_args *encoded);
>   
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 1e53977a4854..1bd4c74e8c51 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -8999,7 +8999,7 @@ static ssize_t btrfs_encoded_read_inline(
>   				struct extent_state **cached_state,
>   				u64 extent_start, size_t count,
>   				struct btrfs_ioctl_encoded_io_args *encoded,
> -				bool *unlocked)
> +				bool *need_unlock)
>   {
>   	struct btrfs_root *root = inode->root;
>   	struct btrfs_fs_info *fs_info = root->fs_info;
> @@ -9067,7 +9067,7 @@ static ssize_t btrfs_encoded_read_inline(
>   	btrfs_release_path(path);
>   	unlock_extent(io_tree, start, lockend, cached_state);
>   	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> -	*unlocked = true;
> +	*need_unlock = false;
>   
>   	ret = copy_to_iter(tmp, count, iter);
>   	if (ret != count)
> @@ -9155,42 +9155,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
>   	return blk_status_to_errno(READ_ONCE(priv.status));
>   }
>   
> -static ssize_t btrfs_encoded_read_regular(struct btrfs_encoded_read_private *priv,
> -					  u64 start, u64 lockend,
> -					  u64 disk_bytenr, u64 disk_io_size,
> -					  bool *unlocked)
> +ssize_t btrfs_encoded_read_regular_end(struct btrfs_encoded_read_private *priv)
> -	struct btrfs_inode *inode = BTRFS_I(file_inode(priv->file));
> -	struct extent_io_tree *io_tree = &inode->io_tree;
> +	u64 cur, start, lockend;
>   	unsigned long i;
> -	u64 cur;
>   	size_t page_offset;
> -	ssize_t ret;
> -
> -	priv->nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
> -	priv->pages = kcalloc(priv->nr_pages, sizeof(struct page *), GFP_NOFS);
> -	if (!priv->pages) {
> -		priv->nr_pages = 0;
> -		return -ENOMEM;
> -	}
> -	ret = btrfs_alloc_page_array(priv->nr_pages, priv->pages, false);
> -	if (ret)
> -		return -ENOMEM;
> +	struct btrfs_inode *inode = BTRFS_I(file_inode(priv->file));
> +	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> +	struct extent_io_tree *io_tree = &inode->io_tree;
> +	int ret;
> -	_btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
> -					       disk_io_size, priv);
> +	start = ALIGN_DOWN(priv->args.offset, fs_info->sectorsize);
> +	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
>   
> -	if (atomic_dec_return(&priv->pending))
> -		io_wait_event(priv->wait, !atomic_read(&priv->pending));
> +	unlock_extent(io_tree, start, lockend, &priv->cached_state);
> +	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);

AFAIS, btrfs_encoded_read_regular_end is here to be used asynchronously
in a later patch, specifically via bio callback -> io_uring tw callback,
and that io_uring tw callback might not get run in any reasonable amount
of time as it's controlled by the user space.

So, it takes the inode lock in one context (task executing the
request) and releases it in another, it's usually is not allowed.
And it also holds the locks potentially semi indefinitely (guaranteed
to be executed when io_uring and/or task die). There is also
unlock_extent() which I'd assume we don't want to hold locked
for too long.

It's in general a bad practice having is_locked variables, especially
when it's not on stack, even more so when it's implicit like

regular_read() {
	(ret == -EIOCBQUEUED)
		need_unlock = false; // delay it
}
btrfs_encoded_read_finish() {
	if (ret == -EIOCBQUEUED)
		unlock();
}

That just too easy to miss when you do anything with the code
afterwards.

I hope Josef and btrfs folks can comment what would be a way
here, does the inode lock has to be held for the entire
duration of IO?

>   
>   	ret = blk_status_to_errno(READ_ONCE(priv->status));
>   	if (ret)
>   		return ret;
>   
> -	unlock_extent(io_tree, start, lockend, &priv->cached_state);
> -	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
> -	*unlocked = true;
> -
>   	if (priv->args.compression) {
>   		i = 0;
>   		page_offset = 0;
> @@ -9215,6 +9199,30 @@ static ssize_t btrfs_encoded_read_regular(struct btrfs_encoded_read_private *pri
>   	return priv->count;
>   }
diff mbox series

Patch

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index f4d77c3bb544..a5d786c6d7d4 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -623,6 +623,7 @@  struct btrfs_encoded_read_private {
 };
 
 ssize_t btrfs_encoded_read(struct btrfs_encoded_read_private *priv);
+ssize_t btrfs_encoded_read_regular_end(struct btrfs_encoded_read_private *priv);
 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
 			       const struct btrfs_ioctl_encoded_io_args *encoded);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e53977a4854..1bd4c74e8c51 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8999,7 +8999,7 @@  static ssize_t btrfs_encoded_read_inline(
 				struct extent_state **cached_state,
 				u64 extent_start, size_t count,
 				struct btrfs_ioctl_encoded_io_args *encoded,
-				bool *unlocked)
+				bool *need_unlock)
 {
 	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -9067,7 +9067,7 @@  static ssize_t btrfs_encoded_read_inline(
 	btrfs_release_path(path);
 	unlock_extent(io_tree, start, lockend, cached_state);
 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
-	*unlocked = true;
+	*need_unlock = false;
 
 	ret = copy_to_iter(tmp, count, iter);
 	if (ret != count)
@@ -9155,42 +9155,26 @@  int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 	return blk_status_to_errno(READ_ONCE(priv.status));
 }
 
-static ssize_t btrfs_encoded_read_regular(struct btrfs_encoded_read_private *priv,
-					  u64 start, u64 lockend,
-					  u64 disk_bytenr, u64 disk_io_size,
-					  bool *unlocked)
+ssize_t btrfs_encoded_read_regular_end(struct btrfs_encoded_read_private *priv)
 {
-	struct btrfs_inode *inode = BTRFS_I(file_inode(priv->file));
-	struct extent_io_tree *io_tree = &inode->io_tree;
+	u64 cur, start, lockend;
 	unsigned long i;
-	u64 cur;
 	size_t page_offset;
-	ssize_t ret;
-
-	priv->nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
-	priv->pages = kcalloc(priv->nr_pages, sizeof(struct page *), GFP_NOFS);
-	if (!priv->pages) {
-		priv->nr_pages = 0;
-		return -ENOMEM;
-	}
-	ret = btrfs_alloc_page_array(priv->nr_pages, priv->pages, false);
-	if (ret)
-		return -ENOMEM;
+	struct btrfs_inode *inode = BTRFS_I(file_inode(priv->file));
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct extent_io_tree *io_tree = &inode->io_tree;
+	int ret;
 
-	_btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
-					       disk_io_size, priv);
+	start = ALIGN_DOWN(priv->args.offset, fs_info->sectorsize);
+	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
 
-	if (atomic_dec_return(&priv->pending))
-		io_wait_event(priv->wait, !atomic_read(&priv->pending));
+	unlock_extent(io_tree, start, lockend, &priv->cached_state);
+	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 
 	ret = blk_status_to_errno(READ_ONCE(priv->status));
 	if (ret)
 		return ret;
 
-	unlock_extent(io_tree, start, lockend, &priv->cached_state);
-	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
-	*unlocked = true;
-
 	if (priv->args.compression) {
 		i = 0;
 		page_offset = 0;
@@ -9215,6 +9199,30 @@  static ssize_t btrfs_encoded_read_regular(struct btrfs_encoded_read_private *pri
 	return priv->count;
 }
 
+static ssize_t btrfs_encoded_read_regular(struct btrfs_encoded_read_private *priv,
+					  u64 disk_bytenr, u64 disk_io_size)
+{
+	struct btrfs_inode *inode = BTRFS_I(file_inode(priv->file));
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	u64 start = ALIGN_DOWN(priv->args.offset, fs_info->sectorsize);
+	ssize_t ret;
+
+	priv->nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+	priv->pages = kcalloc(priv->nr_pages, sizeof(struct page *), GFP_NOFS);
+	if (!priv->pages) {
+		priv->nr_pages = 0;
+		return -ENOMEM;
+	}
+	ret = btrfs_alloc_page_array(priv->nr_pages, priv->pages, false);
+	if (ret)
+		return -ENOMEM;
+
+	_btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
+					       disk_io_size, priv);
+
+	return -EIOCBQUEUED;
+}
+
 ssize_t btrfs_encoded_read(struct btrfs_encoded_read_private *priv)
 {
 	struct btrfs_inode *inode = BTRFS_I(file_inode(priv->file));
@@ -9223,7 +9231,7 @@  ssize_t btrfs_encoded_read(struct btrfs_encoded_read_private *priv)
 	ssize_t ret;
 	u64 start, lockend, disk_bytenr, disk_io_size;
 	struct extent_map *em;
-	bool unlocked = false;
+	bool need_unlock = true;
 
 	priv->count = iov_iter_count(&priv->iter);
 
@@ -9278,7 +9286,7 @@  ssize_t btrfs_encoded_read(struct btrfs_encoded_read_private *priv)
 						&priv->iter, start,
 						lockend, &priv->cached_state,
 						extent_start, priv->count,
-						&priv->args, &unlocked);
+						&priv->args, &need_unlock);
 		goto out_em;
 	}
 
@@ -9333,23 +9341,24 @@  ssize_t btrfs_encoded_read(struct btrfs_encoded_read_private *priv)
 	if (disk_bytenr == EXTENT_MAP_HOLE) {
 		unlock_extent(io_tree, start, lockend, &priv->cached_state);
 		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
-		unlocked = true;
+		need_unlock = false;
 		ret = iov_iter_zero(priv->count, &priv->iter);
 		if (ret != priv->count)
 			ret = -EFAULT;
 	} else {
-		ret = btrfs_encoded_read_regular(priv, start, lockend,
-						 disk_bytenr, disk_io_size,
-						 &unlocked);
+		ret = btrfs_encoded_read_regular(priv, disk_bytenr, disk_io_size);
+
+		if (ret == -EIOCBQUEUED)
+			need_unlock = false;
 	}
 
 out_em:
 	free_extent_map(em);
 out_unlock_extent:
-	if (!unlocked)
+	if (need_unlock)
 		unlock_extent(io_tree, start, lockend, &priv->cached_state);
 out_unlock_inode:
-	if (!unlocked)
+	if (need_unlock)
 		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 	return ret;
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d2658508e055..c1886209933a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4516,6 +4516,9 @@  static ssize_t btrfs_encoded_read_finish(struct btrfs_encoded_read_private *priv
 					     flags);
 	unsigned long i;
 
+	if (ret == -EIOCBQUEUED)
+		ret = btrfs_encoded_read_regular_end(priv);
+
 	if (ret >= 0) {
 		fsnotify_access(priv->file);
 		if (copy_to_user(priv->copy_out,
@@ -4613,6 +4616,11 @@  static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
 
 	ret = btrfs_encoded_read(&priv);
 
+	if (ret == -EIOCBQUEUED) {
+		if (atomic_dec_return(&priv.pending))
+			io_wait_event(priv.wait, !atomic_read(&priv.pending));
+	}
+
 out:
 	return btrfs_encoded_read_finish(&priv, ret);
 }