diff mbox series

[v6,24/28] btrfs: enable relocation in HMZONED mode

Message ID 20191213040915.3502922-25-naohiro.aota@wdc.com (mailing list archive)
State New, archived
Headers show
Series btrfs: zoned block device support | expand

Commit Message

Naohiro Aota Dec. 13, 2019, 4:09 a.m. UTC
To serialize allocation and submit_bio, we introduced mutex around them. As
a result, preallocation must be completely disabled to avoid a deadlock.

Since current relocation process relies on preallocation to move file data
extents, it must be handled in another way. In HMZONED mode, we just
truncate the inode to the size that we wanted to pre-allocate. Then, we
flush dirty pages on the file before finishing relocation process.
run_delalloc_hmzoned() will handle all the allocation and submit IOs to
the underlying layers.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/relocation.c | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

Comments

Josef Bacik Dec. 17, 2019, 9:32 p.m. UTC | #1
On 12/12/19 11:09 PM, Naohiro Aota wrote:
> To serialize allocation and submit_bio, we introduced mutex around them. As
> a result, preallocation must be completely disabled to avoid a deadlock.
> 
> Since current relocation process relies on preallocation to move file data
> extents, it must be handled in another way. In HMZONED mode, we just
> truncate the inode to the size that we wanted to pre-allocate. Then, we
> flush dirty pages on the file before finishing relocation process.
> run_delalloc_hmzoned() will handle all the allocation and submit IOs to
> the underlying layers.
> 
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> ---
>   fs/btrfs/relocation.c | 39 +++++++++++++++++++++++++++++++++++++--
>   1 file changed, 37 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index d897a8e5e430..2d17b7566df4 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -3159,6 +3159,34 @@ int prealloc_file_extent_cluster(struct inode *inode,
>   	if (ret)
>   		goto out;
>   
> +	/*
> +	 * In HMZONED, we cannot preallocate the file region. Instead,
> +	 * we dirty and fiemap_write the region.
> +	 */
> +
> +	if (btrfs_fs_incompat(btrfs_sb(inode->i_sb), HMZONED)) {
> +		struct btrfs_root *root = BTRFS_I(inode)->root;
> +		struct btrfs_trans_handle *trans;
> +
> +		end = cluster->end - offset + 1;
> +		trans = btrfs_start_transaction(root, 1);
> +		if (IS_ERR(trans))
> +			return PTR_ERR(trans);
> +
> +		inode->i_ctime = current_time(inode);
> +		i_size_write(inode, end);
> +		btrfs_ordered_update_i_size(inode, end, NULL);
> +		ret = btrfs_update_inode(trans, root, inode);
> +		if (ret) {
> +			btrfs_abort_transaction(trans, ret);
> +			btrfs_end_transaction(trans);
> +			return ret;
> +		}
> +		ret = btrfs_end_transaction(trans);
> +
> +		goto out;
> +	}
> +

Why are we arbitrarily extending the i_size here?  If we don't need prealloc we 
don't need to jack up the i_size either.

>   	cur_offset = prealloc_start;
>   	while (nr < cluster->nr) {
>   		start = cluster->boundary[nr] - offset;
> @@ -3346,6 +3374,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
>   		btrfs_throttle(fs_info);
>   	}
>   	WARN_ON(nr != cluster->nr);
> +	if (btrfs_fs_incompat(fs_info, HMZONED) && !ret) {
> +		ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
> +		WARN_ON(ret);

Do not WARN_ON() when this could happen due to IO errors.  Thanks,

Josef
Naohiro Aota Dec. 18, 2019, 10:49 a.m. UTC | #2
On Tue, Dec 17, 2019 at 04:32:04PM -0500, Josef Bacik wrote:
>On 12/12/19 11:09 PM, Naohiro Aota wrote:
>>To serialize allocation and submit_bio, we introduced mutex around them. As
>>a result, preallocation must be completely disabled to avoid a deadlock.
>>
>>Since current relocation process relies on preallocation to move file data
>>extents, it must be handled in another way. In HMZONED mode, we just
>>truncate the inode to the size that we wanted to pre-allocate. Then, we
>>flush dirty pages on the file before finishing relocation process.
>>run_delalloc_hmzoned() will handle all the allocation and submit IOs to
>>the underlying layers.
>>
>>Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
>>---
>>  fs/btrfs/relocation.c | 39 +++++++++++++++++++++++++++++++++++++--
>>  1 file changed, 37 insertions(+), 2 deletions(-)
>>
>>diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
>>index d897a8e5e430..2d17b7566df4 100644
>>--- a/fs/btrfs/relocation.c
>>+++ b/fs/btrfs/relocation.c
>>@@ -3159,6 +3159,34 @@ int prealloc_file_extent_cluster(struct inode *inode,
>>  	if (ret)
>>  		goto out;
>>+	/*
>>+	 * In HMZONED, we cannot preallocate the file region. Instead,
>>+	 * we dirty and fiemap_write the region.
>>+	 */
>>+
>>+	if (btrfs_fs_incompat(btrfs_sb(inode->i_sb), HMZONED)) {
>>+		struct btrfs_root *root = BTRFS_I(inode)->root;
>>+		struct btrfs_trans_handle *trans;
>>+
>>+		end = cluster->end - offset + 1;
>>+		trans = btrfs_start_transaction(root, 1);
>>+		if (IS_ERR(trans))
>>+			return PTR_ERR(trans);
>>+
>>+		inode->i_ctime = current_time(inode);
>>+		i_size_write(inode, end);
>>+		btrfs_ordered_update_i_size(inode, end, NULL);
>>+		ret = btrfs_update_inode(trans, root, inode);
>>+		if (ret) {
>>+			btrfs_abort_transaction(trans, ret);
>>+			btrfs_end_transaction(trans);
>>+			return ret;
>>+		}
>>+		ret = btrfs_end_transaction(trans);
>>+
>>+		goto out;
>>+	}
>>+
>
>Why are we arbitrarily extending the i_size here?  If we don't need 
>prealloc we don't need to jack up the i_size either.

We need to extend i_size to read data from the relocating block
group. If not, btrfs_readpage() in relocate_file_extent_cluster()
always reads zero filled page because the read position is beyond the
file size.

>>  	cur_offset = prealloc_start;
>>  	while (nr < cluster->nr) {
>>  		start = cluster->boundary[nr] - offset;
>>@@ -3346,6 +3374,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
>>  		btrfs_throttle(fs_info);
>>  	}
>>  	WARN_ON(nr != cluster->nr);
>>+	if (btrfs_fs_incompat(fs_info, HMZONED) && !ret) {
>>+		ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
>>+		WARN_ON(ret);
>
>Do not WARN_ON() when this could happen due to IO errors.  Thanks,
>
>Josef

Sure. We can just drop it.
Josef Bacik Dec. 18, 2019, 3:01 p.m. UTC | #3
On 12/18/19 5:49 AM, Naohiro Aota wrote:
> On Tue, Dec 17, 2019 at 04:32:04PM -0500, Josef Bacik wrote:
>> On 12/12/19 11:09 PM, Naohiro Aota wrote:
>>> To serialize allocation and submit_bio, we introduced mutex around them. As
>>> a result, preallocation must be completely disabled to avoid a deadlock.
>>>
>>> Since current relocation process relies on preallocation to move file data
>>> extents, it must be handled in another way. In HMZONED mode, we just
>>> truncate the inode to the size that we wanted to pre-allocate. Then, we
>>> flush dirty pages on the file before finishing relocation process.
>>> run_delalloc_hmzoned() will handle all the allocation and submit IOs to
>>> the underlying layers.
>>>
>>> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
>>> ---
>>>  fs/btrfs/relocation.c | 39 +++++++++++++++++++++++++++++++++++++--
>>>  1 file changed, 37 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
>>> index d897a8e5e430..2d17b7566df4 100644
>>> --- a/fs/btrfs/relocation.c
>>> +++ b/fs/btrfs/relocation.c
>>> @@ -3159,6 +3159,34 @@ int prealloc_file_extent_cluster(struct inode *inode,
>>>      if (ret)
>>>          goto out;
>>> +    /*
>>> +     * In HMZONED, we cannot preallocate the file region. Instead,
>>> +     * we dirty and fiemap_write the region.
>>> +     */
>>> +
>>> +    if (btrfs_fs_incompat(btrfs_sb(inode->i_sb), HMZONED)) {
>>> +        struct btrfs_root *root = BTRFS_I(inode)->root;
>>> +        struct btrfs_trans_handle *trans;
>>> +
>>> +        end = cluster->end - offset + 1;
>>> +        trans = btrfs_start_transaction(root, 1);
>>> +        if (IS_ERR(trans))
>>> +            return PTR_ERR(trans);
>>> +
>>> +        inode->i_ctime = current_time(inode);
>>> +        i_size_write(inode, end);
>>> +        btrfs_ordered_update_i_size(inode, end, NULL);
>>> +        ret = btrfs_update_inode(trans, root, inode);
>>> +        if (ret) {
>>> +            btrfs_abort_transaction(trans, ret);
>>> +            btrfs_end_transaction(trans);
>>> +            return ret;
>>> +        }
>>> +        ret = btrfs_end_transaction(trans);
>>> +
>>> +        goto out;
>>> +    }
>>> +
>>
>> Why are we arbitrarily extending the i_size here?  If we don't need prealloc 
>> we don't need to jack up the i_size either.
> 
> We need to extend i_size to read data from the relocating block
> group. If not, btrfs_readpage() in relocate_file_extent_cluster()
> always reads zero filled page because the read position is beyond the
> file size.

Right but the finish_ordered_io stuff will do the btrfs_ordered_update_i_size() 
once the IO is complete.  So all you really need is the i_size_write and the 
btrfs_update_inode.  If this crashes you'll have an inode that has a i_size with 
no extents up to i_size.  This is fine for NO_HOLES but not fine for !NO_HOLES. 
Thanks,

Josef
diff mbox series

Patch

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d897a8e5e430..2d17b7566df4 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3159,6 +3159,34 @@  int prealloc_file_extent_cluster(struct inode *inode,
 	if (ret)
 		goto out;
 
+	/*
+	 * In HMZONED, we cannot preallocate the file region. Instead,
+	 * we dirty and fiemap_write the region.
+	 */
+
+	if (btrfs_fs_incompat(btrfs_sb(inode->i_sb), HMZONED)) {
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+		struct btrfs_trans_handle *trans;
+
+		end = cluster->end - offset + 1;
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+
+		inode->i_ctime = current_time(inode);
+		i_size_write(inode, end);
+		btrfs_ordered_update_i_size(inode, end, NULL);
+		ret = btrfs_update_inode(trans, root, inode);
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			btrfs_end_transaction(trans);
+			return ret;
+		}
+		ret = btrfs_end_transaction(trans);
+
+		goto out;
+	}
+
 	cur_offset = prealloc_start;
 	while (nr < cluster->nr) {
 		start = cluster->boundary[nr] - offset;
@@ -3346,6 +3374,10 @@  static int relocate_file_extent_cluster(struct inode *inode,
 		btrfs_throttle(fs_info);
 	}
 	WARN_ON(nr != cluster->nr);
+	if (btrfs_fs_incompat(fs_info, HMZONED) && !ret) {
+		ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+		WARN_ON(ret);
+	}
 out:
 	kfree(ra);
 	return ret;
@@ -4186,8 +4218,12 @@  static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_inode_item *item;
 	struct extent_buffer *leaf;
+	u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
 	int ret;
 
+	if (btrfs_fs_incompat(trans->fs_info, HMZONED))
+		flags &= ~BTRFS_INODE_PREALLOC;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -4202,8 +4238,7 @@  static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
 	btrfs_set_inode_generation(leaf, item, 1);
 	btrfs_set_inode_size(leaf, item, 0);
 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
-	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
-					  BTRFS_INODE_PREALLOC);
+	btrfs_set_inode_flags(leaf, item, flags);
 	btrfs_mark_buffer_dirty(leaf);
 out:
 	btrfs_free_path(path);