
[v7,04/13] btrfs: add support for inserting raid stripe extents

Message ID 94293952cdc120b46edf82672af874b0877e1e83.1677750131.git.johannes.thumshirn@wdc.com (mailing list archive)
State New, archived
Series btrfs: introduce RAID stripe tree

Commit Message

Johannes Thumshirn March 2, 2023, 9:45 a.m. UTC
Add support for inserting stripe extents into the raid stripe tree on
completion of every write that needs an extra logical-to-physical
translation when using RAID.

Inserting the stripe extents happens after the data I/O has completed. This
is done to a) support zone-append and b) rule out the possibility of a
RAID-write-hole.

This is done by creating in-memory ordered stripe extents, just like the
in-memory ordered extents, on I/O completion. The on-disk raid stripe
extents then get created once we run the delayed refs for the extent item
this stripe extent is tied to.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/Makefile           |   2 +-
 fs/btrfs/bio.c              |  29 +++++
 fs/btrfs/delayed-ref.c      |   6 +-
 fs/btrfs/delayed-ref.h      |   2 +
 fs/btrfs/extent-tree.c      |  60 +++++++++++
 fs/btrfs/inode.c            |  15 ++-
 fs/btrfs/raid-stripe-tree.c | 204 ++++++++++++++++++++++++++++++++++++
 fs/btrfs/raid-stripe-tree.h |  71 +++++++++++++
 fs/btrfs/volumes.c          |   4 +-
 fs/btrfs/volumes.h          |  13 +--
 fs/btrfs/zoned.c            |   3 +
 11 files changed, 397 insertions(+), 12 deletions(-)
 create mode 100644 fs/btrfs/raid-stripe-tree.c
 create mode 100644 fs/btrfs/raid-stripe-tree.h

Comments

Qu Wenruo March 2, 2023, 10:58 a.m. UTC | #1
On 2023/3/2 17:45, Johannes Thumshirn wrote:
> Add support for inserting stripe extents into the raid stripe tree on
> completion of every write that needs an extra logical-to-physical
> translation when using RAID.
> 
> Inserting the stripe extents happens after the data I/O has completed,
> this is done to a) support zone-append and b) rule out the possibility of
> a RAID-write-hole.
> 
> This is done by creating in-memory ordered stripe extents, just like the
> in memory ordered extents, on I/O completion and the on-disk raid stripe
> extents get created once we're running the delayed_refs for the extent
> item this stripe extent is tied to.
> 
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>   fs/btrfs/Makefile           |   2 +-
>   fs/btrfs/bio.c              |  29 +++++
>   fs/btrfs/delayed-ref.c      |   6 +-
>   fs/btrfs/delayed-ref.h      |   2 +
>   fs/btrfs/extent-tree.c      |  60 +++++++++++
>   fs/btrfs/inode.c            |  15 ++-
>   fs/btrfs/raid-stripe-tree.c | 204 ++++++++++++++++++++++++++++++++++++
>   fs/btrfs/raid-stripe-tree.h |  71 +++++++++++++
>   fs/btrfs/volumes.c          |   4 +-
>   fs/btrfs/volumes.h          |  13 +--
>   fs/btrfs/zoned.c            |   3 +
>   11 files changed, 397 insertions(+), 12 deletions(-)
>   create mode 100644 fs/btrfs/raid-stripe-tree.c
>   create mode 100644 fs/btrfs/raid-stripe-tree.h
> 
> diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
> index 90d53209755b..3bb869a84e54 100644
> --- a/fs/btrfs/Makefile
> +++ b/fs/btrfs/Makefile
> @@ -33,7 +33,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
>   	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
>   	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
>   	   subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
> -	   lru_cache.o
> +	   lru_cache.o raid-stripe-tree.o
>   
>   btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
>   btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
> index 726592868e9c..2b174865d347 100644
> --- a/fs/btrfs/bio.c
> +++ b/fs/btrfs/bio.c
> @@ -15,6 +15,7 @@
>   #include "rcu-string.h"
>   #include "zoned.h"
>   #include "file-item.h"
> +#include "raid-stripe-tree.h"
>   
>   static struct bio_set btrfs_bioset;
>   static struct bio_set btrfs_clone_bioset;
> @@ -348,6 +349,21 @@ static void btrfs_raid56_end_io(struct bio *bio)
>   	btrfs_put_bioc(bioc);
>   }
>   
> +static void btrfs_raid_stripe_update(struct work_struct *work)
> +{
> +	struct btrfs_bio *bbio =
> +		container_of(work, struct btrfs_bio, end_io_work);
> +	struct btrfs_io_stripe *stripe = bbio->bio.bi_private;
> +	struct btrfs_io_context *bioc = stripe->bioc;
> +	int ret;
> +
> +	ret = btrfs_add_ordered_stripe(bioc);
> +	if (ret)
> +		bbio->bio.bi_status = errno_to_blk_status(ret);
> +	btrfs_orig_bbio_end_io(bbio);
> +	btrfs_put_bioc(bioc);
> +}
> +
>   static void btrfs_orig_write_end_io(struct bio *bio)
>   {
>   	struct btrfs_io_stripe *stripe = bio->bi_private;
> @@ -370,6 +386,16 @@ static void btrfs_orig_write_end_io(struct bio *bio)
>   	else
>   		bio->bi_status = BLK_STS_OK;
>   
> +	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
> +		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
> +
> +	if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
> +		INIT_WORK(&bbio->end_io_work, btrfs_raid_stripe_update);
> +		queue_work(btrfs_end_io_wq(bioc->fs_info, bio),
> +			&bbio->end_io_work);

I still have the old question: what would happen if the delayed
workload happens after the ordered extent has finished?

Since we cannot ensure the order between this RST update workload and
finish_ordered_io(), there can be a window where we finish the ordered IO,
then the pages get released (by memory pressure), then a new read
happens on the range, and only then does our RST workload run.

In that case, we would have a read failure.


Thus I strongly recommend doing the RST tree update inside
finish_ordered_io().

This has several advantages:

- We don't need an in-memory structure as a gap stopper.
   Since reads are blocked while there is a running ordered extent,
   we don't need an in-memory RST mapping.

- finish_ordered_io() itself has all the proper context for tree
   updates, as that's the main location where we update the subvolume
   tree.

The main concern may be the bioc <-> ordered extent mapping, but IIRC 
for zoned mode one bioc is one ordered extent, thus this shouldn't be a 
super big deal?

Otherwise we may need something to track all the biocs belonging to the
ordered extent.

Thanks,
Qu
> +		return;
> +	}
> +
>   	btrfs_orig_bbio_end_io(bbio);
>   	btrfs_put_bioc(bioc);
>   }
> @@ -381,6 +407,8 @@ static void btrfs_clone_write_end_io(struct bio *bio)
>   	if (bio->bi_status) {
>   		atomic_inc(&stripe->bioc->error);
>   		btrfs_log_dev_io_error(bio, stripe->dev);
> +	} else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
> +		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
>   	}
>   
>   	/* Pass on control to the original bio this one was cloned from */
> @@ -440,6 +468,7 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
>   	bio->bi_private = &bioc->stripes[dev_nr];
>   	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
>   	bioc->stripes[dev_nr].bioc = bioc;
> +	bioc->size = bio->bi_iter.bi_size;
>   	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
>   }
>   
> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
> index 7660ac642c81..261f52ad8e12 100644
> --- a/fs/btrfs/delayed-ref.c
> +++ b/fs/btrfs/delayed-ref.c
> @@ -14,6 +14,7 @@
>   #include "space-info.h"
>   #include "tree-mod-log.h"
>   #include "fs.h"
> +#include "raid-stripe-tree.h"
>   
>   struct kmem_cache *btrfs_delayed_ref_head_cachep;
>   struct kmem_cache *btrfs_delayed_tree_ref_cachep;
> @@ -637,8 +638,11 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans,
>   	exist->ref_mod += mod;
>   
>   	/* remove existing tail if its ref_mod is zero */
> -	if (exist->ref_mod == 0)
> +	if (exist->ref_mod == 0) {
> +		btrfs_drop_ordered_stripe(trans->fs_info, exist->bytenr);
>   		drop_delayed_ref(root, href, exist);
> +	}
> +
>   	spin_unlock(&href->lock);
>   	return ret;
>   inserted:
> diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
> index 2eb34abf700f..5096c1a1ed3e 100644
> --- a/fs/btrfs/delayed-ref.h
> +++ b/fs/btrfs/delayed-ref.h
> @@ -51,6 +51,8 @@ struct btrfs_delayed_ref_node {
>   	/* is this node still in the rbtree? */
>   	unsigned int is_head:1;
>   	unsigned int in_tree:1;
> +	/* Do we need RAID stripe tree modifications? */
> +	unsigned int must_insert_stripe:1;
>   };
>   
>   struct btrfs_delayed_extent_op {
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 6b6c59e6805c..7441d784fe03 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -42,6 +42,7 @@
>   #include "file-item.h"
>   #include "orphan.h"
>   #include "tree-checker.h"
> +#include "raid-stripe-tree.h"
>   
>   #undef SCRAMBLE_DELAYED_REFS
>   
> @@ -1497,6 +1498,56 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
>   	return ret;
>   }
>   
> +static bool delayed_ref_needs_rst_update(struct btrfs_fs_info *fs_info,
> +					 struct btrfs_delayed_ref_head *head)
> +{
> +	struct extent_map *em;
> +	struct map_lookup *map;
> +	bool ret = false;
> +
> +	if (!btrfs_stripe_tree_root(fs_info))
> +		return ret;
> +
> +	em = btrfs_get_chunk_map(fs_info, head->bytenr, head->num_bytes);
> +	if (!em)
> +		return ret;
> +
> +	map = em->map_lookup;
> +
> +	if (btrfs_need_stripe_tree_update(fs_info, map->type))
> +		ret = true;
> +
> +	free_extent_map(em);
> +
> +	return ret;
> +}
> +
> +static int add_stripe_entry_for_delayed_ref(struct btrfs_trans_handle *trans,
> +					    struct btrfs_delayed_ref_node *node)
> +{
> +	struct btrfs_fs_info *fs_info = trans->fs_info;
> +	struct btrfs_ordered_stripe *stripe;
> +	int ret = 0;
> +
> +	stripe = btrfs_lookup_ordered_stripe(fs_info, node->bytenr);
> +	if (!stripe) {
> +		btrfs_err(fs_info,
> +			  "cannot get stripe extent for address %llu (%llu)",
> +			  node->bytenr, node->num_bytes);
> +		return -EINVAL;
> +	}
> +
> +	ASSERT(stripe->logical == node->bytenr);
> +
> +	ret = btrfs_insert_raid_extent(trans, stripe);
> +	/* once for us */
> +	btrfs_put_ordered_stripe(fs_info, stripe);
> +	/* once for the tree */
> +	btrfs_put_ordered_stripe(fs_info, stripe);
> +
> +	return ret;
> +}
> +
>   static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
>   				struct btrfs_delayed_ref_node *node,
>   				struct btrfs_delayed_extent_op *extent_op,
> @@ -1527,11 +1578,17 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
>   						 flags, ref->objectid,
>   						 ref->offset, &ins,
>   						 node->ref_mod);
> +		if (ret)
> +			return ret;
> +		if (node->must_insert_stripe)
> +			ret = add_stripe_entry_for_delayed_ref(trans, node);
>   	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
>   		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
>   					     ref->objectid, ref->offset,
>   					     node->ref_mod, extent_op);
>   	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
> +		if (node->must_insert_stripe)
> +			btrfs_drop_ordered_stripe(trans->fs_info, node->bytenr);
>   		ret = __btrfs_free_extent(trans, node, parent,
>   					  ref_root, ref->objectid,
>   					  ref->offset, node->ref_mod,
> @@ -1901,6 +1958,8 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
>   	struct btrfs_delayed_ref_root *delayed_refs;
>   	struct btrfs_delayed_extent_op *extent_op;
>   	struct btrfs_delayed_ref_node *ref;
> +	const bool need_rst_update =
> +		delayed_ref_needs_rst_update(fs_info, locked_ref);
>   	int must_insert_reserved = 0;
>   	int ret;
>   
> @@ -1951,6 +2010,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
>   		locked_ref->extent_op = NULL;
>   		spin_unlock(&locked_ref->lock);
>   
> +		ref->must_insert_stripe = need_rst_update;
>   		ret = run_one_delayed_ref(trans, ref, extent_op,
>   					  must_insert_reserved);
>   
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 8f07d59e8193..aaa1db90e58b 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -70,6 +70,7 @@
>   #include "verity.h"
>   #include "super.h"
>   #include "orphan.h"
> +#include "raid-stripe-tree.h"
>   
>   struct btrfs_iget_args {
>   	u64 ino;
> @@ -9495,12 +9496,17 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
>   	if (qgroup_released < 0)
>   		return ERR_PTR(qgroup_released);
>   
> +	ret = btrfs_insert_preallocated_raid_stripe(inode->root->fs_info,
> +						    start, len);
> +	if (ret)
> +		goto free_qgroup;
> +
>   	if (trans) {
>   		ret = insert_reserved_file_extent(trans, inode,
>   						  file_offset, &stack_fi,
>   						  true, qgroup_released);
>   		if (ret)
> -			goto free_qgroup;
> +			goto free_stripe_extent;
>   		return trans;
>   	}
>   
> @@ -9518,7 +9524,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
>   	path = btrfs_alloc_path();
>   	if (!path) {
>   		ret = -ENOMEM;
> -		goto free_qgroup;
> +		goto free_stripe_extent;
>   	}
>   
>   	ret = btrfs_replace_file_extents(inode, path, file_offset,
> @@ -9526,9 +9532,12 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
>   				     &trans);
>   	btrfs_free_path(path);
>   	if (ret)
> -		goto free_qgroup;
> +		goto free_stripe_extent;
>   	return trans;
>   
> +free_stripe_extent:
> +	btrfs_drop_ordered_stripe(inode->root->fs_info, start);
> +
>   free_qgroup:
>   	/*
>   	 * We have released qgroup data range at the beginning of the function,
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> new file mode 100644
> index 000000000000..9d3e7bffe6f8
> --- /dev/null
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -0,0 +1,204 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2022 Western Digital Corporation or its affiliates.
> + */
> +
> +#include <linux/btrfs_tree.h>
> +
> +#include "ctree.h"
> +#include "fs.h"
> +#include "accessors.h"
> +#include "transaction.h"
> +#include "disk-io.h"
> +#include "raid-stripe-tree.h"
> +#include "volumes.h"
> +#include "misc.h"
> +#include "disk-io.h"
> +#include "print-tree.h"
> +
> +static int ordered_stripe_cmp(const void *key, const struct rb_node *node)
> +{
> +	struct btrfs_ordered_stripe *stripe =
> +		rb_entry(node, struct btrfs_ordered_stripe, rb_node);
> +	const u64 *logical = key;
> +
> +	if (*logical < stripe->logical)
> +		return -1;
> +	if (*logical >= stripe->logical + stripe->num_bytes)
> +		return 1;
> +	return 0;
> +}
> +
> +static int ordered_stripe_less(struct rb_node *rba, const struct rb_node *rbb)
> +{
> +	struct btrfs_ordered_stripe *stripe =
> +		rb_entry(rba, struct btrfs_ordered_stripe, rb_node);
> +	return ordered_stripe_cmp(&stripe->logical, rbb);
> +}
> +
> +int btrfs_add_ordered_stripe(struct btrfs_io_context *bioc)
> +{
> +	struct btrfs_fs_info *fs_info = bioc->fs_info;
> +	struct btrfs_ordered_stripe *stripe;
> +	struct btrfs_io_stripe *tmp;
> +	u64 logical = bioc->logical;
> +	u64 length = bioc->size;
> +	struct rb_node *node;
> +	size_t size;
> +
> +	size = bioc->num_stripes * sizeof(struct btrfs_io_stripe);
> +	stripe = kzalloc(sizeof(struct btrfs_ordered_stripe), GFP_NOFS);
> +	if (!stripe)
> +		return -ENOMEM;
> +
> +	spin_lock_init(&stripe->lock);
> +	tmp = kmemdup(bioc->stripes, size, GFP_NOFS);
> +	if (!tmp) {
> +		kfree(stripe);
> +		return -ENOMEM;
> +	}
> +
> +	stripe->logical = logical;
> +	stripe->num_bytes = length;
> +	stripe->num_stripes = bioc->num_stripes;
> +	spin_lock(&stripe->lock);
> +	stripe->stripes = tmp;
> +	spin_unlock(&stripe->lock);
> +	refcount_set(&stripe->ref, 1);
> +
> +	write_lock(&fs_info->stripe_update_lock);
> +	node = rb_find_add(&stripe->rb_node, &fs_info->stripe_update_tree,
> +	       ordered_stripe_less);
> +	if (node) {
> +		struct btrfs_ordered_stripe *old =
> +			rb_entry(node, struct btrfs_ordered_stripe, rb_node);
> +
> +		btrfs_debug(fs_info, "logical: %llu, length: %llu already exists",
> +			  logical, length);
> +		ASSERT(logical == old->logical);
> +
> +		rb_replace_node(node, &stripe->rb_node,
> +				&fs_info->stripe_update_tree);
> +	}
> +	write_unlock(&fs_info->stripe_update_lock);
> +
> +	return 0;
> +}
> +
> +struct btrfs_ordered_stripe *btrfs_lookup_ordered_stripe(struct btrfs_fs_info *fs_info,
> +							 u64 logical)
> +{
> +	struct rb_root *root = &fs_info->stripe_update_tree;
> +	struct btrfs_ordered_stripe *stripe = NULL;
> +	struct rb_node *node;
> +
> +	read_lock(&fs_info->stripe_update_lock);
> +	node = rb_find(&logical, root, ordered_stripe_cmp);
> +	if (node) {
> +		stripe = rb_entry(node, struct btrfs_ordered_stripe, rb_node);
> +		refcount_inc(&stripe->ref);
> +	}
> +	read_unlock(&fs_info->stripe_update_lock);
> +
> +	return stripe;
> +}
> +
> +void btrfs_put_ordered_stripe(struct btrfs_fs_info *fs_info,
> +				 struct btrfs_ordered_stripe *stripe)
> +{
> +
> +	if (refcount_dec_and_test(&stripe->ref)) {
> +		struct rb_node *node;
> +
> +		write_lock(&fs_info->stripe_update_lock);
> +
> +		node = &stripe->rb_node;
> +		rb_erase(node, &fs_info->stripe_update_tree);
> +		RB_CLEAR_NODE(node);
> +
> +		spin_lock(&stripe->lock);
> +		kfree(stripe->stripes);
> +		spin_unlock(&stripe->lock);
> +		kfree(stripe);
> +		write_unlock(&fs_info->stripe_update_lock);
> +	}
> +}
> +
> +int btrfs_insert_preallocated_raid_stripe(struct btrfs_fs_info *fs_info,
> +					  u64 start, u64 len)
> +{
> +	struct btrfs_io_context *bioc = NULL;
> +	struct btrfs_ordered_stripe *stripe;
> +	u64 map_length = len;
> +	int ret;
> +
> +	if (!btrfs_stripe_tree_root(fs_info))
> +		return 0;
> +
> +	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, start, &map_length,
> +			      &bioc, 0);
> +	if (ret)
> +		return ret;
> +
> +	bioc->size = len;
> +
> +	stripe = btrfs_lookup_ordered_stripe(fs_info, start);
> +	if (!stripe) {
> +		ret = btrfs_add_ordered_stripe(bioc);
> +		if (ret)
> +			return ret;
> +	} else {
> +		spin_lock(&stripe->lock);
> +		memcpy(stripe->stripes, bioc->stripes,
> +		       bioc->num_stripes * sizeof(struct btrfs_io_stripe));
> +		spin_unlock(&stripe->lock);
> +		btrfs_put_ordered_stripe(fs_info, stripe);
> +	}
> +
> +	return 0;
> +}
> +
> +int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
> +			     struct btrfs_ordered_stripe *stripe)
> +{
> +	struct btrfs_fs_info *fs_info = trans->fs_info;
> +	struct btrfs_key stripe_key;
> +	struct btrfs_root *stripe_root = btrfs_stripe_tree_root(fs_info);
> +	struct btrfs_stripe_extent *stripe_extent;
> +	size_t item_size;
> +	int ret;
> +
> +	item_size = stripe->num_stripes * sizeof(struct btrfs_raid_stride);
> +
> +	stripe_extent = kzalloc(item_size, GFP_NOFS);
> +	if (!stripe_extent) {
> +		btrfs_abort_transaction(trans, -ENOMEM);
> +		btrfs_end_transaction(trans);
> +		return -ENOMEM;
> +	}
> +
> +	spin_lock(&stripe->lock);
> +	for (int i = 0; i < stripe->num_stripes; i++) {
> +		u64 devid = stripe->stripes[i].dev->devid;
> +		u64 physical = stripe->stripes[i].physical;
> +		struct btrfs_raid_stride *raid_stride =
> +						&stripe_extent->strides[i];
> +
> +		btrfs_set_stack_raid_stride_devid(raid_stride, devid);
> +		btrfs_set_stack_raid_stride_physical(raid_stride, physical);
> +	}
> +	spin_unlock(&stripe->lock);
> +
> +	stripe_key.objectid = stripe->logical;
> +	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
> +	stripe_key.offset = stripe->num_bytes;
> +
> +	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
> +				item_size);
> +	if (ret)
> +		btrfs_abort_transaction(trans, ret);
> +
> +	kfree(stripe_extent);
> +
> +	return ret;
> +}
> diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
> new file mode 100644
> index 000000000000..60d3f8489cc9
> --- /dev/null
> +++ b/fs/btrfs/raid-stripe-tree.h
> @@ -0,0 +1,71 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2022 Western Digital Corporation or its affiliates.
> + */
> +
> +#ifndef BTRFS_RAID_STRIPE_TREE_H
> +#define BTRFS_RAID_STRIPE_TREE_H
> +
> +#include "disk-io.h"
> +#include "messages.h"
> +
> +struct btrfs_io_context;
> +
> +struct btrfs_ordered_stripe {
> +	struct rb_node rb_node;
> +
> +	u64 logical;
> +	u64 num_bytes;
> +	int num_stripes;
> +	struct btrfs_io_stripe *stripes;
> +	spinlock_t lock;
> +	refcount_t ref;
> +};
> +
> +int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
> +			     struct btrfs_ordered_stripe *stripe);
> +int btrfs_insert_preallocated_raid_stripe(struct btrfs_fs_info *fs_info,
> +					  u64 start, u64 len);
> +struct btrfs_ordered_stripe *btrfs_lookup_ordered_stripe(
> +						 struct btrfs_fs_info *fs_info,
> +						 u64 logical);
> +int btrfs_add_ordered_stripe(struct btrfs_io_context *bioc);
> +void btrfs_put_ordered_stripe(struct btrfs_fs_info *fs_info,
> +					    struct btrfs_ordered_stripe *stripe);
> +
> +static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
> +						 u64 map_type)
> +{
> +	u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
> +	u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
> +
> +	if (!btrfs_stripe_tree_root(fs_info))
> +		return false;
> +
> +	if (type != BTRFS_BLOCK_GROUP_DATA)
> +		return false;
> +
> +	if (profile & BTRFS_BLOCK_GROUP_RAID1_MASK)
> +		return true;
> +
> +	return false;
> +}
> +
> +static inline void btrfs_drop_ordered_stripe(struct btrfs_fs_info *fs_info,
> +					     u64 logical)
> +{
> +	struct btrfs_ordered_stripe *stripe;
> +
> +	if (!btrfs_stripe_tree_root(fs_info))
> +		return;
> +
> +	stripe = btrfs_lookup_ordered_stripe(fs_info, logical);
> +	if (!stripe)
> +		return;
> +	ASSERT(refcount_read(&stripe->ref) == 2);
> +	/* once for us */
> +	btrfs_put_ordered_stripe(fs_info, stripe);
> +	/* once for the tree */
> +	btrfs_put_ordered_stripe(fs_info, stripe);
> +}
> +#endif
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 9d6775c7196f..fee611d1b01d 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -5879,6 +5879,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
>   }
>   
>   static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
> +						       u64 logical,
>   						       u16 total_stripes)
>   {
>   	struct btrfs_io_context *bioc;
> @@ -5898,6 +5899,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
>   	bioc->fs_info = fs_info;
>   	bioc->replace_stripe_src = -1;
>   	bioc->full_stripe_logical = (u64)-1;
> +	bioc->logical = logical;
>   
>   	return bioc;
>   }
> @@ -6493,7 +6495,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>   		goto out;
>   	}
>   
> -	bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
> +	bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
>   	if (!bioc) {
>   		ret = -ENOMEM;
>   		goto out;
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 650e131d079e..114c76c81eda 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -372,12 +372,10 @@ struct btrfs_fs_devices {
>   
>   struct btrfs_io_stripe {
>   	struct btrfs_device *dev;
> -	union {
> -		/* Block mapping */
> -		u64 physical;
> -		/* For the endio handler */
> -		struct btrfs_io_context *bioc;
> -	};
> +	/* Block mapping */
> +	u64 physical;
> +	/* For the endio handler */
> +	struct btrfs_io_context *bioc;
>   };
>   
>   struct btrfs_discard_stripe {
> @@ -410,6 +408,9 @@ struct btrfs_io_context {
>   	atomic_t error;
>   	u16 max_errors;
>   
> +	u64 logical;
> +	u64 size;
> +
>   	/*
>   	 * The total number of stripes, including the extra duplicated
>   	 * stripe for replace.
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index f95b2c94d619..7e6cfc7a2918 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -1692,6 +1692,9 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
>   	u64 chunk_start_phys;
>   	u64 logical;
>   
> +	/* Filesystems with a stripe tree have their own l2p mapping */
> +	ASSERT(!btrfs_stripe_tree_root(fs_info));
> +
>   	em = btrfs_get_chunk_map(fs_info, orig_logical, 1);
>   	if (IS_ERR(em))
>   		return;
Johannes Thumshirn March 2, 2023, 11:25 a.m. UTC | #2
On 02.03.23 11:59, Qu Wenruo wrote:
> 
> 
> On 2023/3/2 17:45, Johannes Thumshirn wrote:
>> Add support for inserting stripe extents into the raid stripe tree on
>> completion of every write that needs an extra logical-to-physical
>> translation when using RAID.
>>
>> Inserting the stripe extents happens after the data I/O has completed,
>> this is done to a) support zone-append and b) rule out the possibility of
>> a RAID-write-hole.
>>
>> This is done by creating in-memory ordered stripe extents, just like the
>> in memory ordered extents, on I/O completion and the on-disk raid stripe
>> extents get created once we're running the delayed_refs for the extent
>> item this stripe extent is tied to.
>>
>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>> ---
>>   fs/btrfs/Makefile           |   2 +-
>>   fs/btrfs/bio.c              |  29 +++++
>>   fs/btrfs/delayed-ref.c      |   6 +-
>>   fs/btrfs/delayed-ref.h      |   2 +
>>   fs/btrfs/extent-tree.c      |  60 +++++++++++
>>   fs/btrfs/inode.c            |  15 ++-
>>   fs/btrfs/raid-stripe-tree.c | 204 ++++++++++++++++++++++++++++++++++++
>>   fs/btrfs/raid-stripe-tree.h |  71 +++++++++++++
>>   fs/btrfs/volumes.c          |   4 +-
>>   fs/btrfs/volumes.h          |  13 +--
>>   fs/btrfs/zoned.c            |   3 +
>>   11 files changed, 397 insertions(+), 12 deletions(-)
>>   create mode 100644 fs/btrfs/raid-stripe-tree.c
>>   create mode 100644 fs/btrfs/raid-stripe-tree.h
>>
>> diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
>> index 90d53209755b..3bb869a84e54 100644
>> --- a/fs/btrfs/Makefile
>> +++ b/fs/btrfs/Makefile
>> @@ -33,7 +33,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
>>   	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
>>   	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
>>   	   subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
>> -	   lru_cache.o
>> +	   lru_cache.o raid-stripe-tree.o
>>   
>>   btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
>>   btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
>> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
>> index 726592868e9c..2b174865d347 100644
>> --- a/fs/btrfs/bio.c
>> +++ b/fs/btrfs/bio.c
>> @@ -15,6 +15,7 @@
>>   #include "rcu-string.h"
>>   #include "zoned.h"
>>   #include "file-item.h"
>> +#include "raid-stripe-tree.h"
>>   
>>   static struct bio_set btrfs_bioset;
>>   static struct bio_set btrfs_clone_bioset;
>> @@ -348,6 +349,21 @@ static void btrfs_raid56_end_io(struct bio *bio)
>>   	btrfs_put_bioc(bioc);
>>   }
>>   
>> +static void btrfs_raid_stripe_update(struct work_struct *work)
>> +{
>> +	struct btrfs_bio *bbio =
>> +		container_of(work, struct btrfs_bio, end_io_work);
>> +	struct btrfs_io_stripe *stripe = bbio->bio.bi_private;
>> +	struct btrfs_io_context *bioc = stripe->bioc;
>> +	int ret;
>> +
>> +	ret = btrfs_add_ordered_stripe(bioc);
>> +	if (ret)
>> +		bbio->bio.bi_status = errno_to_blk_status(ret);
>> +	btrfs_orig_bbio_end_io(bbio);
>> +	btrfs_put_bioc(bioc);
>> +}
>> +
>>   static void btrfs_orig_write_end_io(struct bio *bio)
>>   {
>>   	struct btrfs_io_stripe *stripe = bio->bi_private;
>> @@ -370,6 +386,16 @@ static void btrfs_orig_write_end_io(struct bio *bio)
>>   	else
>>   		bio->bi_status = BLK_STS_OK;
>>   
>> +	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
>> +		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
>> +
>> +	if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
>> +		INIT_WORK(&bbio->end_io_work, btrfs_raid_stripe_update);
>> +		queue_work(btrfs_end_io_wq(bioc->fs_info, bio),
>> +			&bbio->end_io_work);
> 
> I'm still having the old question, what would happen if the delayed 
> workload happen after the ordered extent finished?
> 
> Since we can not ensure the order between this RST update workload and 
> finish_ordered_io(), there can be an window where we finish ordered io, 
> and then the pages get released (by memory pressure), then a new read 
> happen to the range, then our RST workload happened.
> 
> In that case, we would have read failure.
> 
> 
> Thus I strongly recommened to do the RST tree update inside 
> finish_ordered_io().
> 
> This has several advantages:
> 
> - We don't need in-memory structure as a gap stopper
>    Since read would be blocked if there is a running ordered extent,
>    we don't need an in-memory RST mapping.
> 
> - finish_ordered_io() itself has all the proper context for tree
>    updates.
>    As that's the main location we update the subvolume tree.

The first versions of this patchset did do that, and then you asked me
to create an in-memory structure and do the update at delayed ref time.

How about adding a completion, or something like an atomic_t
ordered_stripes_pending for the RST updates, and having
finish_ordered_io() wait for it?

> The main concern may be the bioc <-> ordered extent mapping, but IIRC 
> for zoned mode one bioc is one ordered extent, thus this shouldn't be a 
> super big deal?

Yep, but I want to be able to use RST for non-zoned devices as well
to attack the RAID56 problems and add erasure coding RAID.

> Otherwise we may need something to trace all the bioc belong to the 
> ordered extent.
Qu Wenruo March 2, 2023, 11:45 a.m. UTC | #3
On 2023/3/2 19:25, Johannes Thumshirn wrote:
> On 02.03.23 11:59, Qu Wenruo wrote:
>>
>>
>> On 2023/3/2 17:45, Johannes Thumshirn wrote:
>>> Add support for inserting stripe extents into the raid stripe tree on
>>> completion of every write that needs an extra logical-to-physical
>>> translation when using RAID.
>>>
>>> Inserting the stripe extents happens after the data I/O has completed,
>>> this is done to a) support zone-append and b) rule out the possibility of
>>> a RAID-write-hole.
>>>
>>> This is done by creating in-memory ordered stripe extents, just like the
>>> in memory ordered extents, on I/O completion and the on-disk raid stripe
>>> extents get created once we're running the delayed_refs for the extent
>>> item this stripe extent is tied to.
>>>
>>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>> ---
>>>    fs/btrfs/Makefile           |   2 +-
>>>    fs/btrfs/bio.c              |  29 +++++
>>>    fs/btrfs/delayed-ref.c      |   6 +-
>>>    fs/btrfs/delayed-ref.h      |   2 +
>>>    fs/btrfs/extent-tree.c      |  60 +++++++++++
>>>    fs/btrfs/inode.c            |  15 ++-
>>>    fs/btrfs/raid-stripe-tree.c | 204 ++++++++++++++++++++++++++++++++++++
>>>    fs/btrfs/raid-stripe-tree.h |  71 +++++++++++++
>>>    fs/btrfs/volumes.c          |   4 +-
>>>    fs/btrfs/volumes.h          |  13 +--
>>>    fs/btrfs/zoned.c            |   3 +
>>>    11 files changed, 397 insertions(+), 12 deletions(-)
>>>    create mode 100644 fs/btrfs/raid-stripe-tree.c
>>>    create mode 100644 fs/btrfs/raid-stripe-tree.h
>>>
>>> diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
>>> index 90d53209755b..3bb869a84e54 100644
>>> --- a/fs/btrfs/Makefile
>>> +++ b/fs/btrfs/Makefile
>>> @@ -33,7 +33,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
>>>    	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
>>>    	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
>>>    	   subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
>>> -	   lru_cache.o
>>> +	   lru_cache.o raid-stripe-tree.o
>>>    
>>>    btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
>>>    btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
>>> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
>>> index 726592868e9c..2b174865d347 100644
>>> --- a/fs/btrfs/bio.c
>>> +++ b/fs/btrfs/bio.c
>>> @@ -15,6 +15,7 @@
>>>    #include "rcu-string.h"
>>>    #include "zoned.h"
>>>    #include "file-item.h"
>>> +#include "raid-stripe-tree.h"
>>>    
>>>    static struct bio_set btrfs_bioset;
>>>    static struct bio_set btrfs_clone_bioset;
>>> @@ -348,6 +349,21 @@ static void btrfs_raid56_end_io(struct bio *bio)
>>>    	btrfs_put_bioc(bioc);
>>>    }
>>>    
>>> +static void btrfs_raid_stripe_update(struct work_struct *work)
>>> +{
>>> +	struct btrfs_bio *bbio =
>>> +		container_of(work, struct btrfs_bio, end_io_work);
>>> +	struct btrfs_io_stripe *stripe = bbio->bio.bi_private;
>>> +	struct btrfs_io_context *bioc = stripe->bioc;
>>> +	int ret;
>>> +
>>> +	ret = btrfs_add_ordered_stripe(bioc);
>>> +	if (ret)
>>> +		bbio->bio.bi_status = errno_to_blk_status(ret);
>>> +	btrfs_orig_bbio_end_io(bbio);
>>> +	btrfs_put_bioc(bioc);
>>> +}
>>> +
>>>    static void btrfs_orig_write_end_io(struct bio *bio)
>>>    {
>>>    	struct btrfs_io_stripe *stripe = bio->bi_private;
>>> @@ -370,6 +386,16 @@ static void btrfs_orig_write_end_io(struct bio *bio)
>>>    	else
>>>    		bio->bi_status = BLK_STS_OK;
>>>    
>>> +	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
>>> +		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
>>> +
>>> +	if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
>>> +		INIT_WORK(&bbio->end_io_work, btrfs_raid_stripe_update);
>>> +		queue_work(btrfs_end_io_wq(bioc->fs_info, bio),
>>> +			&bbio->end_io_work);
>>
>> I'm still having the old question, what would happen if the delayed
>> workload happen after the ordered extent finished?
>>
>> Since we can not ensure the order between this RST update workload and
>> finish_ordered_io(), there can be an window where we finish ordered io,
>> and then the pages get released (by memory pressure), then a new read
>> happen to the range, then our RST workload happened.
>>
>> In that case, we would have read failure.
>>
>>
>> Thus I strongly recommened to do the RST tree update inside
>> finish_ordered_io().
>>
>> This has several advantages:
>>
>> - We don't need in-memory structure as a gap stopper
>>     Since read would be blocked if there is a running ordered extent,
>>     we don't need an in-memory RST mapping.
>>
>> - finish_ordered_io() itself has all the proper context for tree
>>     updates.
>>     As that's the main location we update the subvolume tree.
> 
> The first versions of this patchset did do that and then you asked me
> to create an in-memory structure and do the update at delayed ref time.

I have to admit that I was a total idiot.

At that time I didn't notice at all that a read would block when there is
a running ordered extent...

So, all my fault.

> 
> How about adding a completion, or something like a atomic_t
> ordered_stripes_pending for the RST updates and have
> finish_ordered_io() waiting for it?

That's also a feasible solution.

Although I'm a little concerned about the fact that the RST delayed work 
is also going into fs_info->endio_workers, which is also used by 
finish_ordered_fn().

Thus it can cause a deadlock if the workqueue has a max_active of one and
the currently running work is finish_ordered_fn(), which then waits for
the RST work.

But the RST work can only be executed once endio_workers has finished
its current work, thus leading to a deadlock.
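
Roughly, that pattern boils down to something like the following sketch
(hypothetical names, assuming the workqueue was allocated with
max_active == 1; this is only an illustration, not part of the patch):

#include <linux/workqueue.h>

/* allocated elsewhere with alloc_workqueue("endio", 0, 1), i.e. max_active == 1 */
static struct workqueue_struct *endio_wq;
static struct work_struct rst_work;

static void rst_update_fn(struct work_struct *work)
{
	/* would record the RAID stripe extent here */
}

static void finish_ordered_fn_sketch(struct work_struct *work)
{
	/* this runs on the single worker of endio_wq */
	INIT_WORK(&rst_work, rst_update_fn);
	queue_work(endio_wq, &rst_work);

	/*
	 * The only worker is busy running this very function, so
	 * rst_work can never start and this wait never returns.
	 */
	flush_work(&rst_work);
}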

Thanks,
Qu
> 
>> The main concern may be the bioc <-> ordered extent mapping, but IIRC
>> for zoned mode one bioc is one ordered extent, thus this shouldn't be a
>> super big deal?
> 
> Yep, but I want to be able to use RST for non-zoned devices as well
> to attack the RAID56 problems and add erasure coding RAID.
> 
>> Otherwise we may need something to trace all the bioc belong to the
>> ordered extent.
>
Johannes Thumshirn March 2, 2023, 11:45 a.m. UTC | #4
On 02.03.23 12:25, Johannes Thumshirn wrote:
>> I'm still having the old question, what would happen if the delayed 
>> workload happen after the ordered extent finished?
>>
>> Since we can not ensure the order between this RST update workload and 
>> finish_ordered_io(), there can be an window where we finish ordered io, 
>> and then the pages get released (by memory pressure), then a new read 
>> happen to the range, then our RST workload happened.
>>
>> In that case, we would have read failure.
>>
>>
>> Thus I strongly recommened to do the RST tree update inside 
>> finish_ordered_io().
>>
>> This has several advantages:
>>
>> - We don't need in-memory structure as a gap stopper
>>    Since read would be blocked if there is a running ordered extent,
>>    we don't need an in-memory RST mapping.
>>
>> - finish_ordered_io() itself has all the proper context for tree
>>    updates.
>>    As that's the main location we update the subvolume tree.
> 
> The first versions of this patchset did do that and then you asked me
> to create an in-memory structure and do the update at delayed ref time.
> 
> How about adding a completion, or something like a atomic_t 
> ordered_stripes_pending for the RST updates and have 
> finish_ordered_io() waiting for it?


Something like the following (completely untested):
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 2b174865d347..f96177a501e4 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -391,6 +391,7 @@ static void btrfs_orig_write_end_io(struct bio *bio)
 
 	if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
 		INIT_WORK(&bbio->end_io_work, btrfs_raid_stripe_update);
+		btrfs_add_ordered_stripe_pending(bioc->fs_info);
 		queue_work(btrfs_end_io_wq(bioc->fs_info, bio),
 			&bbio->end_io_work);
 		return;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index abbfd71f2cb6..f88a4c92c248 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3044,6 +3044,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 
 	rwlock_init(&fs_info->stripe_update_lock);
 	fs_info->stripe_update_tree = RB_ROOT;
+	atomic_set(&fs_info->ordered_stripes_pending, 0);
+	init_waitqueue_head(&fs_info->ordered_stripe_wait);
 }
 
 static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index dd151538d2b1..ab25873267ea 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -794,6 +794,8 @@ struct btrfs_fs_info {
 
 	rwlock_t stripe_update_lock;
 	struct rb_root stripe_update_tree;
+	atomic_t ordered_stripes_pending;
+	wait_queue_head_t ordered_stripe_wait;
 
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
 	spinlock_t ref_verify_lock;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index aaa1db90e58b..a84e79ad840e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3230,6 +3230,9 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 		goto out;
 	}
 
+	wait_event(fs_info->ordered_stripe_wait,
+		   !btrfs_ordered_stripes_pending(fs_info));
+
 	ret = add_pending_csums(trans, &ordered_extent->list);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 8799a7abaf38..e06457771976 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -110,6 +110,7 @@ int btrfs_add_ordered_stripe(struct btrfs_io_context *bioc)
 		rb_replace_node(node, &stripe->rb_node,
 				&fs_info->stripe_update_tree);
 	}
+	atomic_dec(&fs_info->ordered_stripes_pending);
 	write_unlock(&fs_info->stripe_update_lock);
 
 	trace_btrfs_ordered_stripe_add(fs_info, stripe);
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index 371409351d60..ecc67126ed62 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -84,4 +84,15 @@ static inline void btrfs_drop_ordered_stripe(struct btrfs_fs_info *fs_info,
 	/* once for the tree */
 	btrfs_put_ordered_stripe(fs_info, stripe);
 }
+
+static inline void btrfs_add_ordered_stripe_pending(struct btrfs_fs_info *fs_info)
+{
+	atomic_inc(&fs_info->ordered_stripes_pending);
+}
+
+static inline bool btrfs_ordered_stripes_pending(struct btrfs_fs_info *fs_info)
+{
+	return atomic_read(&fs_info->ordered_stripes_pending) != 0;
+}
+
 #endif
Johannes Thumshirn March 2, 2023, 11:58 a.m. UTC | #5
On 02.03.23 12:45, Qu Wenruo wrote:
>>
>> How about adding a completion, or something like a atomic_t
>> ordered_stripes_pending for the RST updates and have
>> finish_ordered_io() waiting for it?
> 
> That's also a feasible solution.
> 
> Although I'm a little concerned about the fact that the RST delayed work 
> is also going into fs_info->endio_workers, which is also used by 
> finish_ordered_fn().
> 
> Thus it can cause deadlock if the workqueue has one max_active, and the 
> running one is finish_ordered_fn(), which then can be waiting for the 
> RST work.
> 
> But the RST work can only be executed if the endio_workers has finished 
> its current work, thus leading to a deadlock.

How about adding a new workqueue for RST updates? That should mitigate
the deadlock.
Qu Wenruo March 2, 2023, 12:01 p.m. UTC | #6
On 2023/3/2 19:58, Johannes Thumshirn wrote:
> On 02.03.23 12:45, Qu Wenruo wrote:
>>>
>>> How about adding a completion, or something like a atomic_t
>>> ordered_stripes_pending for the RST updates and have
>>> finish_ordered_io() waiting for it?
>>
>> That's also a feasible solution.
>>
>> Although I'm a little concerned about the fact that the RST delayed work
>> is also going into fs_info->endio_workers, which is also used by
>> finish_ordered_fn().
>>
>> Thus it can cause deadlock if the workqueue has one max_active, and the
>> running one is finish_ordered_fn(), which then can be waiting for the
>> RST work.
>>
>> But the RST work can only be executed if the endio_workers has finished
>> its current work, thus leading to a deadlock.
> 
> How about adding a new workqueue for RST updates? That should mitigate
> the deadlock.
> 
My bad, finish_ordered_io() goes through endio_write_workers, not
endio_workers, thus it should be fine.

Thanks,
Qu
Christoph Hellwig March 2, 2023, 1:59 p.m. UTC | #7
On Thu, Mar 02, 2023 at 11:25:22AM +0000, Johannes Thumshirn wrote:
> > The main concern may be the bioc <-> ordered extent mapping, but IIRC 
> > for zoned mode one bioc is one ordered extent, thus this shouldn't be a 
> > super big deal?
> 
> Yep, but I want to be able to use RST for non-zoned devices as well
> to attack the RAID56 problems and add erasure coding RAID.

I have a series in my queue that limits every btrfs_bio (and thus bioc)
to a single ordered_extent.  A bio spanning ordered_extents is a very
strange corner case that rarely happens but causes a lot of problems.
With that series we'll also gain a pointer to the ordered_extent from
the btrfs_bio, which will remove all the ordered_extent lookups from
the fast path.

So I think you can rework your series to also limit the bio to a single
ordered extent, and if needed split the ordered extent for anything that
uses the raid stripe tree and we'll nicely converge there.
Christoph Hellwig March 2, 2023, 2:01 p.m. UTC | #8
On Thu, Mar 02, 2023 at 11:58:13AM +0000, Johannes Thumshirn wrote:
> > Thus it can cause deadlock if the workqueue has one max_active, and the 
> > running one is finish_ordered_fn(), which then can be waiting for the 
> > RST work.
> > 
> > But the RST work can only be executed if the endio_workers has finished 
> > its current work, thus leading to a deadlock.
> 
> How about adding a new workqueue for RST updates? That should mitigate
> the deadlock.

The amount of weird workqueues in the btrfs end I/O path is worrisome.
What I plan to do, and might be ready to submit about next week, is a
series to actually offload the I/O completion to a workqueue on a
per (original) btrfs_bio basis.  This means that RST updates,
ordered_extent processing, compressed write handling etc can all
run from the same end I/O worker.  As a bonus we remove all irqsave
locking from btrfs.
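
Roughly, the shape would be something like the following sketch (the
helper names are made up for illustration and this is not code from
that series; it only reuses the bbio/bioc fields already used in this
patch):

#include <linux/workqueue.h>

/*
 * Sketch: bounce the original bbio's completion into process context
 * once, so RST updates, ordered extent handling, compressed writes
 * etc. can all run from the same worker without irq-safe locking.
 */
static void btrfs_bbio_end_io_workfn(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);

	/* everything that used to run in bi_end_io (irq) context goes here */
	btrfs_orig_bbio_end_io(bbio);
}

static void btrfs_defer_bbio_end_io(struct btrfs_io_context *bioc,
				    struct btrfs_bio *bbio)
{
	INIT_WORK(&bbio->end_io_work, btrfs_bbio_end_io_workfn);
	queue_work(btrfs_end_io_wq(bioc->fs_info, &bbio->bio),
		   &bbio->end_io_work);
}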
Christoph Hellwig March 2, 2023, 2:03 p.m. UTC | #9
On Thu, Mar 02, 2023 at 11:45:44AM +0000, Johannes Thumshirn wrote:
> +	wait_event(fs_info->ordered_stripe_wait,
> +		   !btrfs_ordered_stripes_pending(fs_info));

Ugg.  Waiting for processing from one workqueue from another one
is really a big fat warning sign you are in workqueue hell.  Let's
take a step back and solve this properly.
Johannes Thumshirn March 2, 2023, 3:29 p.m. UTC | #10
On 02.03.23 14:59, Christoph Hellwig wrote:
> On Thu, Mar 02, 2023 at 11:25:22AM +0000, Johannes Thumshirn wrote:
>>> The main concern may be the bioc <-> ordered extent mapping, but IIRC 
>>> for zoned mode one bioc is one ordered extent, thus this shouldn't be a 
>>> super big deal?
>>
>> Yep, but I want to be able to use RST for non-zoned devices as well
>> to attack the RAID56 problems and add erasure coding RAID.
> 
> I have a series in my queue the limits every btrfs_bio (and thus bioc)
> to a single ordered_extent.  The bio spanning ordered_extents is a very
> strange corner case that rarely happens but causes a lot of problems.
> With that series we'll also gain a pointer to the ordered_extent from
> the btrfs_bio, which will remove all the ordered_extent lookups from
> the fast path.
>
> So I think you can rework your series to also limit the bio to a single
> ordered extent, and if needed split the ordered extent for anything that
> uses the raid stripe tree and we'll nicely converge there.
> 

That does indeed sound like a good idea to me.
Johannes Thumshirn March 2, 2023, 3:31 p.m. UTC | #11
On 02.03.23 15:02, Christoph Hellwig wrote:
> On Thu, Mar 02, 2023 at 11:58:13AM +0000, Johannes Thumshirn wrote:
>>> Thus it can cause deadlock if the workqueue has one max_active, and the 
>>> running one is finish_ordered_fn(), which then can be waiting for the 
>>> RST work.
>>>
>>> But the RST work can only be executed if the endio_workers has finished 
>>> its current work, thus leading to a deadlock.
>>
>> How about adding a new workqueue for RST updates? That should mitigate
>> the deadlock.
> 
> The amount of weird workqueues in the btrfs end I/O path is worrysome.
> What I plan to do, and might be ready to submit about next week is a
> series to actually offload the I/O completion to a workqueue on a
> per (original) btrfs_bio basis.  This means that RST updates,
> ordered_extent processing, compressed write handling etc can all
> run from the same end I/O worker.  As a bonus we remove all irqsave
> locking from btrfs.
> 

If it's all running from the same end I/O worker then we can make sure
the race Qu suspects can be eliminated, can't we?
Qu Wenruo March 2, 2023, 10:35 p.m. UTC | #12
On 2023/3/2 23:31, Johannes Thumshirn wrote:
> On 02.03.23 15:02, Christoph Hellwig wrote:
>> On Thu, Mar 02, 2023 at 11:58:13AM +0000, Johannes Thumshirn wrote:
>>>> Thus it can cause deadlock if the workqueue has one max_active, and the
>>>> running one is finish_ordered_fn(), which then can be waiting for the
>>>> RST work.
>>>>
>>>> But the RST work can only be executed if the endio_workers has finished
>>>> its current work, thus leading to a deadlock.
>>>
>>> How about adding a new workqueue for RST updates? That should mitigate
>>> the deadlock.
>>
>> The amount of weird workqueues in the btrfs end I/O path is worrysome.
>> What I plan to do, and might be ready to submit about next week is a
>> series to actually offload the I/O completion to a workqueue on a
>> per (original) btrfs_bio basis.  This means that RST updates,
>> ordered_extent processing, compressed write handling etc can all
>> run from the same end I/O worker.  As a bonus we remove all irqsave
>> locking from btrfs.
>>
> 
> If it's all running from the same end I/O worker then we can make sure
> the race Qu suspects can be eliminated, can't we?

In fact, waiting for one workqueue from inside another workqueue is
already wq hell...

Just making the in-memory RST update happen in finish_ordered_io() should
be good enough.
Then we can keep the RST tree update in the delayed ref.

Thanks,
Qu
Qu Wenruo March 3, 2023, 12:13 a.m. UTC | #13
On 2023/3/2 21:59, Christoph Hellwig wrote:
> On Thu, Mar 02, 2023 at 11:25:22AM +0000, Johannes Thumshirn wrote:
>>> The main concern may be the bioc <-> ordered extent mapping, but IIRC
>>> for zoned mode one bioc is one ordered extent, thus this shouldn't be a
>>> super big deal?
>>
>> Yep, but I want to be able to use RST for non-zoned devices as well
>> to attack the RAID56 problems and add erasure coding RAID.
> 
> I have a series in my queue the limits every btrfs_bio (and thus bioc)
> to a single ordered_extent.  The bio spanning ordered_extents is a very
> strange corner case that rarely happens but causes a lot of problems.

Really?

A not-so-large write (e.g. 4MiB) for RAID0 (64K stripe len) can easily 
lead to that situation.

If we really split ordered extents to that stripe len, it can cause a
lot of small file extents and thus bloat the size of the subvolume trees.

Thanks,
Qu

> With that series we'll also gain a pointer to the ordered_extent from
> the btrfs_bio, which will remove all the ordered_extent lookups from
> the fast path.
> 
> So I think you can rework your series to also limit the bio to a single
> ordered extent, and if needed split the ordered extent for anything that
> uses the raid stripe tree and we'll nicely converge there.
Johannes Thumshirn March 3, 2023, 11:15 a.m. UTC | #14
On 02.03.23 23:35, Qu Wenruo wrote:
>>
>> If it's all running from the same end I/O worker then we can make sure
>> the race Qu suspects can be eliminated, can't we?
> 
> In fact, waiting other workqueue inside other workqueue is already a wq 
> hell...
> 
> Just make the in-memory RST update happen in finish_ordered_io() should 
> be good enough.
> Then we can keep the RST tree update in delayed ref.

There are two possibilities for handling it:
1) Have a common workfn that handles all the calls in the correct order
2) Do the RST update in btrfs_finish_ordered_io()

To me both are viable options, so I don't mind either way. Both need a bit
of preparation work beforehand, but that's the nature of the beast.

For 2) we need a pointer to the bioc in ordered_extent, so we need to
make sure the lifetimes are in sync. Or the other way around, have
ordered_stripe hold enough information for the RST updates and the
end_io handler insert it in the ordered_stripe (that needs to be
passed into the bioc or bbio).

*Iff* I interpret Christoph's proposal in [1] correctly, option 1) is
easier to implement.

Qu, Christoph and others, what do you think?

[1] https://lore.kernel.org/linux-btrfs/ZACsVI3mfprrj4j6@infradead.org
Qu Wenruo March 3, 2023, 11:42 a.m. UTC | #15
On 2023/3/3 19:15, Johannes Thumshirn wrote:
> On 02.03.23 23:35, Qu Wenruo wrote:
>>>
>>> If it's all running from the same end I/O worker then we can make sure
>>> the race Qu suspects can be eliminated, can't we?
>>
>> In fact, waiting other workqueue inside other workqueue is already a wq
>> hell...
>>
>> Just make the in-memory RST update happen in finish_ordered_io() should
>> be good enough.
>> Then we can keep the RST tree update in delayed ref.
> 
> There's two possibilities how to handle it:
> 1) Have a common workfn that handles all the calls in the correct order
> 2) Do the RST update in btrfs_finish_ordered_io()
> 
> To me both are valuable options, so I don't care. Both need a bit of
> preparation work before, but that's the nature of the beast.
> 
> For 2) we need a pointer to the bioc in ordered_extent, so we need to
> make sure the lifetimes are in sync. Or the other way around, have
> ordered_stripe hold enough information for the RST updates and the
> end_io handler insert it in the ordered_stripe (that needs to be
> passed into the bioc or bbio).
> 
> *Iff* I interpret Christoph's proposal in [1] correctly, options 1) is
> easier to implement.

From my understanding, HCH's proposal is a superset of 2): not only does
it do the ordered IO work in the same workqueue, but also all the other
endio work, so if done properly it can easily handle all the complex
dependencies.

But that still depends on the final patchset.

Thanks,
Qu
> 
> Qu, Christoph and others, what do you think?
> 
> [1] https://lore.kernel.org/linux-btrfs/ZACsVI3mfprrj4j6@infradead.org
>
Christoph Hellwig March 3, 2023, 2:15 p.m. UTC | #16
On Fri, Mar 03, 2023 at 08:13:23AM +0800, Qu Wenruo wrote:
> > I have a series in my queue the limits every btrfs_bio (and thus bioc)
> > to a single ordered_extent.  The bio spanning ordered_extents is a very
> > strange corner case that rarely happens but causes a lot of problems.
> 
> Really?
> 
> A not-so-large write (e.g. 4MiB) for RAID0 (64K stripe len) can easily lead
> to that situation.
> 
> If we really split ordered extents to that stripe len, it can cause a lot of
> small file extents thus bloat the size of subvolume trees.

I could have been a little clearer in my wording.

This is talking about the btrfs_bio submitted by the upper layers using
btrfs_submit_bio, not the ones split out by it at the extent boundaries.
Christoph Hellwig March 3, 2023, 2:15 p.m. UTC | #17
On Thu, Mar 02, 2023 at 03:31:51PM +0000, Johannes Thumshirn wrote:
> If it's all running from the same end I/O worker then we can make sure
> the race Qu suspects can be eliminated, can't we? 

Yes.
Christoph Hellwig March 3, 2023, 2:16 p.m. UTC | #18
On Fri, Mar 03, 2023 at 06:35:30AM +0800, Qu Wenruo wrote:
> Just make the in-memory RST update happen in finish_ordered_io() should be
> good enough.
> Then we can keep the RST tree update in delayed ref.

Independent of the workqueue changes, doing the RST update in
finish_ordered_io feels like the right thing to me, although my
gut feeling might not be properly adjusted to btrfs yet :)
Christoph Hellwig March 3, 2023, 2:17 p.m. UTC | #19
On Fri, Mar 03, 2023 at 11:15:00AM +0000, Johannes Thumshirn wrote:
> There's two possibilities how to handle it:
> 1) Have a common workfn that handles all the calls in the correct order
> 2) Do the RST update in btrfs_finish_ordered_io()
> 
> To me both are valuable options, so I don't care. Both need a bit of
> preparation work before, but that's the nature of the beast.
> 
> For 2) we need a pointer to the bioc in ordered_extent, so we need to
> make sure the lifetimes are in sync. Or the other way around, have
> ordered_stripe hold enough information for the RST updates and the
> end_io handler insert it in the ordered_stripe (that needs to be
> passed into the bioc or bbio).
> 
> *Iff* I interpret Christoph's proposal in [1] correctly, options 1) is
> easier to implement.

1 is probably easier, and should be done for other reasons.  But 2
really feels like the right thing to do in addition to 1.
Christoph Hellwig March 3, 2023, 2:21 p.m. UTC | #20
On Fri, Mar 03, 2023 at 07:42:00PM +0800, Qu Wenruo wrote:
> From my understanding, HCH's proposal is a super set of 2), not only do the
> ordered IO thing in the same workqueue, but also all the other endio works,
> thus if done properly can easily handle all the complex dependency.

I've pushed my current WIP out; some of the commit logs aren't there
yet, and it still does some superfluous offloads for RAID5 reads:

http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/btrfs-io_end-work
Qu Wenruo March 3, 2023, 11:03 p.m. UTC | #21
On 2023/3/3 22:15, Christoph Hellwig wrote:
> On Fri, Mar 03, 2023 at 08:13:23AM +0800, Qu Wenruo wrote:
>>> I have a series in my queue the limits every btrfs_bio (and thus bioc)
>>> to a single ordered_extent.  The bio spanning ordered_extents is a very
>>> strange corner case that rarely happens but causes a lot of problems.
>>
>> Really?
>>
>> A not-so-large write (e.g. 4MiB) for RAID0 (64K stripe len) can easily lead
>> to that situation.
>>
>> If we really split ordered extents to that stripe len, it can cause a lot of
>> small file extents thus bloat the size of subvolume trees.
> 
> I might have been a little more clear in my wording.
> 
> This is talking about the btrfs_bio submitted by the upper layers using
> btrfs_submit_bio, not the ones split out by it at the extent boundaries.

Oh, that indeed solves most of the common cases.

But I'm not familiar enough with the direct IO front; IIRC we have some
recent bugs related to page faulting during larger direct IO writes.

Not sure if that would affect the use case.

Thanks,
Qu
Johannes Thumshirn March 6, 2023, 10:58 a.m. UTC | #22
On 03.03.23 15:21, hch@infradead.org wrote:
> On Fri, Mar 03, 2023 at 07:42:00PM +0800, Qu Wenruo wrote:
>> From my understanding, HCH's proposal is a super set of 2), not only do the
>> ordered IO thing in the same workqueue, but also all the other endio works,
>> thus if done properly can easily handle all the complex dependency.
> 
> I've pushed my current WIP out, some of the commit logs aren't there
> yet, and it still does some superflous offloads for RAID5 reads:
> 
> http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/btrfs-io_end-work
> 


Thanks, I just had a look at it and it seems like I can easily rebase on top
and implement option 1.

For option 2 there is still plumbing needed and I'll look into that as well.
Shouldn't be too hard.
Christoph Hellwig March 6, 2023, 5:11 p.m. UTC | #23
On Sat, Mar 04, 2023 at 07:03:01AM +0800, Qu Wenruo wrote:
> But I'm not familiar enough on the direct IO front, IIRC we have some recent
> bugs related to page faulting during a larger direct IO write.

Yes.  But the fix for that is actually doing some of the same changes
I had planned for my series, so I think we're aligned on the direction.
Johannes Thumshirn March 8, 2023, 9:11 a.m. UTC | #24
On 03.03.23 15:16, hch@infradead.org wrote:
> On Fri, Mar 03, 2023 at 06:35:30AM +0800, Qu Wenruo wrote:
>> Just make the in-memory RST update happen in finish_ordered_io() should be
>> good enough.
>> Then we can keep the RST tree update in delayed ref.
> 
> Independent of the workqueue changes, doing the RST update in
> finish_ordered_io feels like the right thing to me, although my
> gut feeling might not be properly adjusted to btrfs yet :)
> 

Btw, this would look something like the following (untested):

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index ab8f1c21a773..f22e34b4328f 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -352,21 +352,6 @@ static void btrfs_raid56_end_io(struct bio *bio)
        btrfs_put_bioc(bioc);
 }
 
-static void btrfs_raid_stripe_update(struct work_struct *work)
-{
-       struct btrfs_bio *bbio =
-               container_of(work, struct btrfs_bio, end_io_work);
-       struct btrfs_io_stripe *stripe = bbio->bio.bi_private;
-       struct btrfs_io_context *bioc = stripe->bioc;
-       int ret;
-
-       ret = btrfs_add_ordered_stripe(bioc);
-       if (ret)
-               bbio->bio.bi_status = errno_to_blk_status(ret);
-       btrfs_orig_bbio_end_io(bbio);
-       btrfs_put_bioc(bioc);
-}
-
 static void btrfs_orig_write_end_io(struct bio *bio)
 {
        struct btrfs_io_stripe *stripe = bio->bi_private;
@@ -393,10 +378,12 @@ static void btrfs_orig_write_end_io(struct bio *bio)
                stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
 
        if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
-               INIT_WORK(&bbio->end_io_work, btrfs_raid_stripe_update);
-               queue_work(btrfs_end_io_wq(bioc->fs_info, bio),
-                       &bbio->end_io_work);
-               return;
+               struct btrfs_ordered_extent *oe;
+
+               oe = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
+               btrfs_get_bioc(bioc);
+               oe->bioc = bioc;
+               btrfs_put_ordered_extent(oe);
        }
 
        btrfs_orig_bbio_end_io(bbio);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6b0cff5c50fb..704e8705bbb9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3159,6 +3159,11 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                btrfs_rewrite_logical_zoned(ordered_extent);
                btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
                                        ordered_extent->disk_num_bytes);
+       } else if (ordered_extent->bioc) {
+               ret = btrfs_add_ordered_stripe(ordered_extent->bioc);
+               btrfs_put_bioc(ordered_extent->bioc);
+               if (ret)
+                       goto out;
        }
 
        if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 18007f9c00ad..e3939bb8a525 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -157,6 +157,8 @@ struct btrfs_ordered_extent {
         * command in a workqueue context
         */
        u64 physical;
+
+       struct btrfs_io_context *bioc;
 };
 
 static inline void
Qu Wenruo March 8, 2023, 12:01 p.m. UTC | #25
On 2023/3/8 17:11, Johannes Thumshirn wrote:
> On 03.03.23 15:16, hch@infradead.org wrote:
>> On Fri, Mar 03, 2023 at 06:35:30AM +0800, Qu Wenruo wrote:
>>> Just make the in-memory RST update happen in finish_ordered_io() should be
>>> good enough.
>>> Then we can keep the RST tree update in delayed ref.
>>
>> Independent of the workqueue changes, doing the RST update in
>> finish_ordered_io feels like the right thing to me, although my
>> gut feeling might not be properly adjusted to btrfs yet :)
>>
> 
> Btw, this would look something like the following (untested):
> 
> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
> index ab8f1c21a773..f22e34b4328f 100644
> --- a/fs/btrfs/bio.c
> +++ b/fs/btrfs/bio.c
> @@ -352,21 +352,6 @@ static void btrfs_raid56_end_io(struct bio *bio)
>          btrfs_put_bioc(bioc);
>   }
>   
> -static void btrfs_raid_stripe_update(struct work_struct *work)
> -{
> -       struct btrfs_bio *bbio =
> -               container_of(work, struct btrfs_bio, end_io_work);
> -       struct btrfs_io_stripe *stripe = bbio->bio.bi_private;
> -       struct btrfs_io_context *bioc = stripe->bioc;
> -       int ret;
> -
> -       ret = btrfs_add_ordered_stripe(bioc);
> -       if (ret)
> -               bbio->bio.bi_status = errno_to_blk_status(ret);
> -       btrfs_orig_bbio_end_io(bbio);
> -       btrfs_put_bioc(bioc);
> -}
> -
>   static void btrfs_orig_write_end_io(struct bio *bio)
>   {
>          struct btrfs_io_stripe *stripe = bio->bi_private;
> @@ -393,10 +378,12 @@ static void btrfs_orig_write_end_io(struct bio *bio)
>                  stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
>   
>          if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
> -               INIT_WORK(&bbio->end_io_work, btrfs_raid_stripe_update);
> -               queue_work(btrfs_end_io_wq(bioc->fs_info, bio),
> -                       &bbio->end_io_work);
> -               return;
> +               struct btrfs_ordered_extent *oe;
> +
> +               oe = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
> +               btrfs_get_bioc(bioc);
> +               oe->bioc = bioc;

Looks valid to me.

Bioc would get a slightly longer lifespan, but it should be fine I guess?

Thanks,
Qu

> +               btrfs_put_ordered_extent(oe);
>          }
>   
>          btrfs_orig_bbio_end_io(bbio);
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 6b0cff5c50fb..704e8705bbb9 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -3159,6 +3159,11 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
>                  btrfs_rewrite_logical_zoned(ordered_extent);
>                  btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
>                                          ordered_extent->disk_num_bytes);
> +       } else if (ordered_extent->bioc) {
> +               ret = btrfs_add_ordered_stripe(ordered_extent->bioc);
> +               btrfs_put_bioc(ordered_extent->bioc);
> +               if (ret)
> +                       goto out;
>          }
>   
>          if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
> index 18007f9c00ad..e3939bb8a525 100644
> --- a/fs/btrfs/ordered-data.h
> +++ b/fs/btrfs/ordered-data.h
> @@ -157,6 +157,8 @@ struct btrfs_ordered_extent {
>           * command in a workqueue context
>           */
>          u64 physical;
> +
> +       struct btrfs_io_context *bioc;
>   };
>   
>   static inline void
>
Christoph Hellwig March 8, 2023, 2:33 p.m. UTC | #26
On Wed, Mar 08, 2023 at 09:11:54AM +0000, Johannes Thumshirn wrote:
> @@ -393,10 +378,12 @@ static void btrfs_orig_write_end_io(struct bio *bio)
>                 stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
>  
>         if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
> +               struct btrfs_ordered_extent *oe;
> +
> +               oe = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
> +               btrfs_get_bioc(bioc);
> +               oe->bioc = bioc;
> +               btrfs_put_ordered_extent(oe);


> +       } else if (ordered_extent->bioc) {
> +               ret = btrfs_add_ordered_stripe(ordered_extent->bioc);
> +               btrfs_put_bioc(ordered_extent->bioc);
> +               if (ret)
> +                       goto out;

Given that btrfs_add_ordered_stripe only really builds the
btrfs_ordered_stripe structure and inserts it into the tree,
can't we just allocate the btrfs_ordered_stripe structure
in the end_io handler and have the ordered_extent point to it?

Also if you don't want to split the ordered_extent for each bio,
you could instead have a list of btrfs_ordered_stripes in the
ordered_extent and then process all of them in the ordered_extent
completion handling.
Johannes Thumshirn March 9, 2023, 10:53 a.m. UTC | #27
On 08.03.23 15:34, Christoph Hellwig wrote:
> On Wed, Mar 08, 2023 at 09:11:54AM +0000, Johannes Thumshirn wrote:
>> @@ -393,10 +378,12 @@ static void btrfs_orig_write_end_io(struct bio *bio)
>>                 stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
>>  
>>         if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
>> +               struct btrfs_ordered_extent *oe;
>> +
>> +               oe = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
>> +               btrfs_get_bioc(bioc);
>> +               oe->bioc = bioc;
>> +               btrfs_put_ordered_extent(oe);
> 
> 
>> +       } else if (ordered_extent->bioc) {
>> +               ret = btrfs_add_ordered_stripe(ordered_extent->bioc);
>> +               btrfs_put_bioc(ordered_extent->bioc);
>> +               if (ret)
>> +                       goto out;
> 
> Given that btrfs_add_ordered_stripe only really builds the
> btrfs_ordered_stripe structure and inserts it into the tree,
> can't we just allocate the btrfs_ordered_stripe structure
> in the end_io handler and have the ordered_extent point to it?

I wanted to avoid memory allocations in the end_io handler though.
If all is offloaded to a common workqueue, like with your proposal,
that'll be ok for me, but atomic allocations don't look right for
this to me.

> Also if you don't want to split the ordered_extent for each bio,
> you could instead have a list of btrfs_ordered_stripes in the
> ordered_extent and then process all of them in the ordered_extent
> completion handling.
> 

Hmm yeah I need to think about it but theoretically this should be 
doable.
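
Roughly, I'd imagine something like the following (untested sketch, and a
slight variation: link the biocs we already hold instead of allocating
ordered stripes in the end_io handler; the bioc_list/oe_entry names are
made up for illustration and not part of this series):

/* hypothetical member on struct btrfs_ordered_extent */
	struct list_head bioc_list;	/* biocs to record in the RST */

/*
 * end_io side: no allocation, just take a reference and link the bioc
 * (needs a lock or a lockless list against concurrent end_ios).
 */
	oe = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
	btrfs_get_bioc(bioc);
	list_add_tail(&bioc->oe_entry, &oe->bioc_list);
	btrfs_put_ordered_extent(oe);

/*
 * btrfs_finish_ordered_io() side: record all stripes for the RST, then
 * drop the references taken at end_io time.
 */
	list_for_each_entry_safe(bioc, next, &ordered_extent->bioc_list, oe_entry) {
		int ret2 = btrfs_add_ordered_stripe(bioc);

		list_del(&bioc->oe_entry);
		btrfs_put_bioc(bioc);
		if (ret2 && !ret)
			ret = ret2;
	}
	if (ret)
		goto out;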
Christoph Hellwig March 9, 2023, 3:20 p.m. UTC | #28
On Thu, Mar 09, 2023 at 10:53:08AM +0000, Johannes Thumshirn wrote:
> I wanted to avoid memory allocations in the end_io handler though.
> If all is offloaded to a common workqueue, like with your proposal,
> that'll be ok for me, but atomic allocations don't look right for
> this to me.

Indeed.
diff mbox series

Patch

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 90d53209755b..3bb869a84e54 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -33,7 +33,7 @@  btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
 	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
 	   subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
-	   lru_cache.o
+	   lru_cache.o raid-stripe-tree.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 726592868e9c..2b174865d347 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -15,6 +15,7 @@ 
 #include "rcu-string.h"
 #include "zoned.h"
 #include "file-item.h"
+#include "raid-stripe-tree.h"
 
 static struct bio_set btrfs_bioset;
 static struct bio_set btrfs_clone_bioset;
@@ -348,6 +349,21 @@  static void btrfs_raid56_end_io(struct bio *bio)
 	btrfs_put_bioc(bioc);
 }
 
+static void btrfs_raid_stripe_update(struct work_struct *work)
+{
+	struct btrfs_bio *bbio =
+		container_of(work, struct btrfs_bio, end_io_work);
+	struct btrfs_io_stripe *stripe = bbio->bio.bi_private;
+	struct btrfs_io_context *bioc = stripe->bioc;
+	int ret;
+
+	ret = btrfs_add_ordered_stripe(bioc);
+	if (ret)
+		bbio->bio.bi_status = errno_to_blk_status(ret);
+	btrfs_orig_bbio_end_io(bbio);
+	btrfs_put_bioc(bioc);
+}
+
 static void btrfs_orig_write_end_io(struct bio *bio)
 {
 	struct btrfs_io_stripe *stripe = bio->bi_private;
@@ -370,6 +386,16 @@  static void btrfs_orig_write_end_io(struct bio *bio)
 	else
 		bio->bi_status = BLK_STS_OK;
 
+	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
+	if (btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
+		INIT_WORK(&bbio->end_io_work, btrfs_raid_stripe_update);
+		queue_work(btrfs_end_io_wq(bioc->fs_info, bio),
+			&bbio->end_io_work);
+		return;
+	}
+
 	btrfs_orig_bbio_end_io(bbio);
 	btrfs_put_bioc(bioc);
 }
@@ -381,6 +407,8 @@  static void btrfs_clone_write_end_io(struct bio *bio)
 	if (bio->bi_status) {
 		atomic_inc(&stripe->bioc->error);
 		btrfs_log_dev_io_error(bio, stripe->dev);
+	} else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
 	}
 
 	/* Pass on control to the original bio this one was cloned from */
@@ -440,6 +468,7 @@  static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
 	bio->bi_private = &bioc->stripes[dev_nr];
 	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
 	bioc->stripes[dev_nr].bioc = bioc;
+	bioc->size = bio->bi_iter.bi_size;
 	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
 }
 
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 7660ac642c81..261f52ad8e12 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -14,6 +14,7 @@ 
 #include "space-info.h"
 #include "tree-mod-log.h"
 #include "fs.h"
+#include "raid-stripe-tree.h"
 
 struct kmem_cache *btrfs_delayed_ref_head_cachep;
 struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -637,8 +638,11 @@  static int insert_delayed_ref(struct btrfs_trans_handle *trans,
 	exist->ref_mod += mod;
 
 	/* remove existing tail if its ref_mod is zero */
-	if (exist->ref_mod == 0)
+	if (exist->ref_mod == 0) {
+		btrfs_drop_ordered_stripe(trans->fs_info, exist->bytenr);
 		drop_delayed_ref(root, href, exist);
+	}
+
 	spin_unlock(&href->lock);
 	return ret;
 inserted:
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 2eb34abf700f..5096c1a1ed3e 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -51,6 +51,8 @@  struct btrfs_delayed_ref_node {
 	/* is this node still in the rbtree? */
 	unsigned int is_head:1;
 	unsigned int in_tree:1;
+	/* Do we need RAID stripe tree modifications? */
+	unsigned int must_insert_stripe:1;
 };
 
 struct btrfs_delayed_extent_op {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6b6c59e6805c..7441d784fe03 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -42,6 +42,7 @@ 
 #include "file-item.h"
 #include "orphan.h"
 #include "tree-checker.h"
+#include "raid-stripe-tree.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
@@ -1497,6 +1498,56 @@  static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static bool delayed_ref_needs_rst_update(struct btrfs_fs_info *fs_info,
+					 struct btrfs_delayed_ref_head *head)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	bool ret = false;
+
+	if (!btrfs_stripe_tree_root(fs_info))
+		return ret;
+
+	em = btrfs_get_chunk_map(fs_info, head->bytenr, head->num_bytes);
+	if (!em)
+		return ret;
+
+	map = em->map_lookup;
+
+	if (btrfs_need_stripe_tree_update(fs_info, map->type))
+		ret = true;
+
+	free_extent_map(em);
+
+	return ret;
+}
+
+static int add_stripe_entry_for_delayed_ref(struct btrfs_trans_handle *trans,
+					    struct btrfs_delayed_ref_node *node)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_ordered_stripe *stripe;
+	int ret = 0;
+
+	stripe = btrfs_lookup_ordered_stripe(fs_info, node->bytenr);
+	if (!stripe) {
+		btrfs_err(fs_info,
+			  "cannot get stripe extent for address %llu (%llu)",
+			  node->bytenr, node->num_bytes);
+		return -EINVAL;
+	}
+
+	ASSERT(stripe->logical == node->bytenr);
+
+	ret = btrfs_insert_raid_extent(trans, stripe);
+	/* once for us */
+	btrfs_put_ordered_stripe(fs_info, stripe);
+	/* once for the tree */
+	btrfs_put_ordered_stripe(fs_info, stripe);
+
+	return ret;
+}
+
 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_delayed_ref_node *node,
 				struct btrfs_delayed_extent_op *extent_op,
@@ -1527,11 +1578,17 @@  static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 						 flags, ref->objectid,
 						 ref->offset, &ins,
 						 node->ref_mod);
+		if (ret)
+			return ret;
+		if (node->must_insert_stripe)
+			ret = add_stripe_entry_for_delayed_ref(trans, node);
 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
 		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
 					     ref->objectid, ref->offset,
 					     node->ref_mod, extent_op);
 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
+		if (node->must_insert_stripe)
+			btrfs_drop_ordered_stripe(trans->fs_info, node->bytenr);
 		ret = __btrfs_free_extent(trans, node, parent,
 					  ref_root, ref->objectid,
 					  ref->offset, node->ref_mod,
@@ -1901,6 +1958,8 @@  static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_delayed_extent_op *extent_op;
 	struct btrfs_delayed_ref_node *ref;
+	const bool need_rst_update =
+		delayed_ref_needs_rst_update(fs_info, locked_ref);
 	int must_insert_reserved = 0;
 	int ret;
 
@@ -1951,6 +2010,7 @@  static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
 		locked_ref->extent_op = NULL;
 		spin_unlock(&locked_ref->lock);
 
+		ref->must_insert_stripe = need_rst_update;
 		ret = run_one_delayed_ref(trans, ref, extent_op,
 					  must_insert_reserved);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8f07d59e8193..aaa1db90e58b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -70,6 +70,7 @@ 
 #include "verity.h"
 #include "super.h"
 #include "orphan.h"
+#include "raid-stripe-tree.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -9495,12 +9496,17 @@  static struct btrfs_trans_handle *insert_prealloc_file_extent(
 	if (qgroup_released < 0)
 		return ERR_PTR(qgroup_released);
 
+	ret = btrfs_insert_preallocated_raid_stripe(inode->root->fs_info,
+						    start, len);
+	if (ret)
+		goto free_qgroup;
+
 	if (trans) {
 		ret = insert_reserved_file_extent(trans, inode,
 						  file_offset, &stack_fi,
 						  true, qgroup_released);
 		if (ret)
-			goto free_qgroup;
+			goto free_stripe_extent;
 		return trans;
 	}
 
@@ -9518,7 +9524,7 @@  static struct btrfs_trans_handle *insert_prealloc_file_extent(
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
-		goto free_qgroup;
+		goto free_stripe_extent;
 	}
 
 	ret = btrfs_replace_file_extents(inode, path, file_offset,
@@ -9526,9 +9532,12 @@  static struct btrfs_trans_handle *insert_prealloc_file_extent(
 				     &trans);
 	btrfs_free_path(path);
 	if (ret)
-		goto free_qgroup;
+		goto free_stripe_extent;
 	return trans;
 
+free_stripe_extent:
+	btrfs_drop_ordered_stripe(inode->root->fs_info, start);
+
 free_qgroup:
 	/*
 	 * We have released qgroup data range at the beginning of the function,
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
new file mode 100644
index 000000000000..9d3e7bffe6f8
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -0,0 +1,204 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/btrfs_tree.h>
+
+#include "ctree.h"
+#include "fs.h"
+#include "accessors.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "raid-stripe-tree.h"
+#include "volumes.h"
+#include "misc.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+static int ordered_stripe_cmp(const void *key, const struct rb_node *node)
+{
+	struct btrfs_ordered_stripe *stripe =
+		rb_entry(node, struct btrfs_ordered_stripe, rb_node);
+	const u64 *logical = key;
+
+	if (*logical < stripe->logical)
+		return -1;
+	if (*logical >= stripe->logical + stripe->num_bytes)
+		return 1;
+	return 0;
+}
+
+static int ordered_stripe_less(struct rb_node *rba, const struct rb_node *rbb)
+{
+	struct btrfs_ordered_stripe *stripe =
+		rb_entry(rba, struct btrfs_ordered_stripe, rb_node);
+	return ordered_stripe_cmp(&stripe->logical, rbb);
+}
+
+int btrfs_add_ordered_stripe(struct btrfs_io_context *bioc)
+{
+	struct btrfs_fs_info *fs_info = bioc->fs_info;
+	struct btrfs_ordered_stripe *stripe;
+	struct btrfs_io_stripe *tmp;
+	u64 logical = bioc->logical;
+	u64 length = bioc->size;
+	struct rb_node *node;
+	size_t size;
+
+	size = bioc->num_stripes * sizeof(struct btrfs_io_stripe);
+	stripe = kzalloc(sizeof(struct btrfs_ordered_stripe), GFP_NOFS);
+	if (!stripe)
+		return -ENOMEM;
+
+	spin_lock_init(&stripe->lock);
+	tmp = kmemdup(bioc->stripes, size, GFP_NOFS);
+	if (!tmp) {
+		kfree(stripe);
+		return -ENOMEM;
+	}
+
+	stripe->logical = logical;
+	stripe->num_bytes = length;
+	stripe->num_stripes = bioc->num_stripes;
+	spin_lock(&stripe->lock);
+	stripe->stripes = tmp;
+	spin_unlock(&stripe->lock);
+	refcount_set(&stripe->ref, 1);
+
+	write_lock(&fs_info->stripe_update_lock);
+	node = rb_find_add(&stripe->rb_node, &fs_info->stripe_update_tree,
+	       ordered_stripe_less);
+	if (node) {
+		struct btrfs_ordered_stripe *old =
+			rb_entry(node, struct btrfs_ordered_stripe, rb_node);
+
+		btrfs_debug(fs_info, "logical: %llu, length: %llu already exists",
+			  logical, length);
+		ASSERT(logical == old->logical);
+
+		rb_replace_node(node, &stripe->rb_node,
+				&fs_info->stripe_update_tree);
+	}
+	write_unlock(&fs_info->stripe_update_lock);
+
+	return 0;
+}
+
+struct btrfs_ordered_stripe *btrfs_lookup_ordered_stripe(struct btrfs_fs_info *fs_info,
+							 u64 logical)
+{
+	struct rb_root *root = &fs_info->stripe_update_tree;
+	struct btrfs_ordered_stripe *stripe = NULL;
+	struct rb_node *node;
+
+	read_lock(&fs_info->stripe_update_lock);
+	node = rb_find(&logical, root, ordered_stripe_cmp);
+	if (node) {
+		stripe = rb_entry(node, struct btrfs_ordered_stripe, rb_node);
+		refcount_inc(&stripe->ref);
+	}
+	read_unlock(&fs_info->stripe_update_lock);
+
+	return stripe;
+}
+
+void btrfs_put_ordered_stripe(struct btrfs_fs_info *fs_info,
+				 struct btrfs_ordered_stripe *stripe)
+{
+
+	if (refcount_dec_and_test(&stripe->ref)) {
+		struct rb_node *node;
+
+		write_lock(&fs_info->stripe_update_lock);
+
+		node = &stripe->rb_node;
+		rb_erase(node, &fs_info->stripe_update_tree);
+		RB_CLEAR_NODE(node);
+
+		spin_lock(&stripe->lock);
+		kfree(stripe->stripes);
+		spin_unlock(&stripe->lock);
+		kfree(stripe);
+		write_unlock(&fs_info->stripe_update_lock);
+	}
+}
+
+int btrfs_insert_preallocated_raid_stripe(struct btrfs_fs_info *fs_info,
+					  u64 start, u64 len)
+{
+	struct btrfs_io_context *bioc = NULL;
+	struct btrfs_ordered_stripe *stripe;
+	u64 map_length = len;
+	int ret;
+
+	if (!btrfs_stripe_tree_root(fs_info))
+		return 0;
+
+	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, start, &map_length,
+			      &bioc, 0);
+	if (ret)
+		return ret;
+
+	bioc->size = len;
+
+	stripe = btrfs_lookup_ordered_stripe(fs_info, start);
+	if (!stripe) {
+		ret = btrfs_add_ordered_stripe(bioc);
+		if (ret)
+			return ret;
+	} else {
+		spin_lock(&stripe->lock);
+		memcpy(stripe->stripes, bioc->stripes,
+		       bioc->num_stripes * sizeof(struct btrfs_io_stripe));
+		spin_unlock(&stripe->lock);
+		btrfs_put_ordered_stripe(fs_info, stripe);
+	}
+
+	return 0;
+}
+
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+			     struct btrfs_ordered_stripe *stripe)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_key stripe_key;
+	struct btrfs_root *stripe_root = btrfs_stripe_tree_root(fs_info);
+	struct btrfs_stripe_extent *stripe_extent;
+	size_t item_size;
+	int ret;
+
+	item_size = stripe->num_stripes * sizeof(struct btrfs_raid_stride);
+
+	stripe_extent = kzalloc(item_size, GFP_NOFS);
+	if (!stripe_extent) {
+		btrfs_abort_transaction(trans, -ENOMEM);
+		btrfs_end_transaction(trans);
+		return -ENOMEM;
+	}
+
+	spin_lock(&stripe->lock);
+	for (int i = 0; i < stripe->num_stripes; i++) {
+		u64 devid = stripe->stripes[i].dev->devid;
+		u64 physical = stripe->stripes[i].physical;
+		struct btrfs_raid_stride *raid_stride =
+						&stripe_extent->strides[i];
+
+		btrfs_set_stack_raid_stride_devid(raid_stride, devid);
+		btrfs_set_stack_raid_stride_physical(raid_stride, physical);
+	}
+	spin_unlock(&stripe->lock);
+
+	stripe_key.objectid = stripe->logical;
+	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+	stripe_key.offset = stripe->num_bytes;
+
+	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
+				item_size);
+	if (ret)
+		btrfs_abort_transaction(trans, ret);
+
+	kfree(stripe_extent);
+
+	return ret;
+}
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
new file mode 100644
index 000000000000..60d3f8489cc9
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -0,0 +1,71 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022 Western Digital Corporation or its affiliates.
+ */
+
+#ifndef BTRFS_RAID_STRIPE_TREE_H
+#define BTRFS_RAID_STRIPE_TREE_H
+
+#include "disk-io.h"
+#include "messages.h"
+
+struct btrfs_io_context;
+
+struct btrfs_ordered_stripe {
+	struct rb_node rb_node;
+
+	u64 logical;
+	u64 num_bytes;
+	int num_stripes;
+	struct btrfs_io_stripe *stripes;
+	spinlock_t lock;
+	refcount_t ref;
+};
+
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+			     struct btrfs_ordered_stripe *stripe);
+int btrfs_insert_preallocated_raid_stripe(struct btrfs_fs_info *fs_info,
+					  u64 start, u64 len);
+struct btrfs_ordered_stripe *btrfs_lookup_ordered_stripe(
+						 struct btrfs_fs_info *fs_info,
+						 u64 logical);
+int btrfs_add_ordered_stripe(struct btrfs_io_context *bioc);
+void btrfs_put_ordered_stripe(struct btrfs_fs_info *fs_info,
+					    struct btrfs_ordered_stripe *stripe);
+
+static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
+						 u64 map_type)
+{
+	u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
+	u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+	if (!btrfs_stripe_tree_root(fs_info))
+		return false;
+
+	if (type != BTRFS_BLOCK_GROUP_DATA)
+		return false;
+
+	if (profile & BTRFS_BLOCK_GROUP_RAID1_MASK)
+		return true;
+
+	return false;
+}
+
+static inline void btrfs_drop_ordered_stripe(struct btrfs_fs_info *fs_info,
+					     u64 logical)
+{
+	struct btrfs_ordered_stripe *stripe;
+
+	if (!btrfs_stripe_tree_root(fs_info))
+		return;
+
+	stripe = btrfs_lookup_ordered_stripe(fs_info, logical);
+	if (!stripe)
+		return;
+	ASSERT(refcount_read(&stripe->ref) == 2);
+	/* once for us */
+	btrfs_put_ordered_stripe(fs_info, stripe);
+	/* once for the tree */
+	btrfs_put_ordered_stripe(fs_info, stripe);
+}
+#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9d6775c7196f..fee611d1b01d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5879,6 +5879,7 @@  static int find_live_mirror(struct btrfs_fs_info *fs_info,
 }
 
 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+						       u64 logical,
 						       u16 total_stripes)
 {
 	struct btrfs_io_context *bioc;
@@ -5898,6 +5899,7 @@  static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
 	bioc->fs_info = fs_info;
 	bioc->replace_stripe_src = -1;
 	bioc->full_stripe_logical = (u64)-1;
+	bioc->logical = logical;
 
 	return bioc;
 }
@@ -6493,7 +6495,7 @@  int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		goto out;
 	}
 
-	bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
+	bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
 	if (!bioc) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 650e131d079e..114c76c81eda 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -372,12 +372,10 @@  struct btrfs_fs_devices {
 
 struct btrfs_io_stripe {
 	struct btrfs_device *dev;
-	union {
-		/* Block mapping */
-		u64 physical;
-		/* For the endio handler */
-		struct btrfs_io_context *bioc;
-	};
+	/* Block mapping */
+	u64 physical;
+	/* For the endio handler */
+	struct btrfs_io_context *bioc;
 };
 
 struct btrfs_discard_stripe {
@@ -410,6 +408,9 @@  struct btrfs_io_context {
 	atomic_t error;
 	u16 max_errors;
 
+	u64 logical;
+	u64 size;
+
 	/*
 	 * The total number of stripes, including the extra duplicated
 	 * stripe for replace.
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index f95b2c94d619..7e6cfc7a2918 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1692,6 +1692,9 @@  void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
 	u64 chunk_start_phys;
 	u64 logical;
 
+	/* Filesystems with a stripe tree have their own l2p mapping */
+	ASSERT(!btrfs_stripe_tree_root(fs_info));
+
 	em = btrfs_get_chunk_map(fs_info, orig_logical, 1);
 	if (IS_ERR(em))
 		return;