Message ID | b018704727883c27c3368f1cd3ba84daf682b733.1652711187.git.johannes.thumshirn@wdc.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | btrfs: introduce raid-stripe-tree | expand |
On 2022/5/16 22:31, Johannes Thumshirn wrote: > Add boilerplate code to delete entries from the raid-stripe-tree if the > corresponding file extent got deleted. > > Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> > --- > fs/btrfs/ctree.c | 1 + > fs/btrfs/extent-tree.c | 9 +++ > fs/btrfs/file.c | 1 - > fs/btrfs/raid-stripe-tree.c | 111 ++++++++++++++++++++++++++++++++++++ > fs/btrfs/raid-stripe-tree.h | 8 +++ > 5 files changed, 129 insertions(+), 1 deletion(-) > > diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c > index 1e24695ede0a..b7b4e421e9b8 100644 > --- a/fs/btrfs/ctree.c > +++ b/fs/btrfs/ctree.c > @@ -3623,6 +3623,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, > btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); > > BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && > + key.type != BTRFS_RAID_STRIPE_KEY && > key.type != BTRFS_EXTENT_CSUM_KEY); > > if (btrfs_leaf_free_space(leaf) >= ins_len) > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c > index f477035a2ac2..00af3e469881 100644 > --- a/fs/btrfs/extent-tree.c > +++ b/fs/btrfs/extent-tree.c > @@ -36,6 +36,7 @@ > #include "rcu-string.h" > #include "zoned.h" > #include "dev-replace.h" > +#include "raid-stripe-tree.h" > > #undef SCRAMBLE_DELAYED_REFS > > @@ -3199,6 +3200,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, > } > } Considering we're already in __btrfs_free_extents(), and the branch we're in is already for refs == 1 case, which means we're already the last one owning the file extent (and its stripe tree entry). > > + if (is_data) { > + ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); > + if (ret) { > + btrfs_abort_transaction(trans, ret); > + return ret; > + } > + } > + > ret = btrfs_del_items(trans, extent_root, path, path->slots[0], > num_to_del); > if (ret) { > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c > index bd329316945f..6021188dcb9a 100644 > --- a/fs/btrfs/file.c > +++ b/fs/btrfs/file.c > @@ -1009,7 +1009,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, > btrfs_release_path(path); > out: > args->drop_end = found ? min(args->end, last_end) : args->end; > - > return ret; > } > > diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c > index 426066bd7c0d..370ea68fe343 100644 > --- a/fs/btrfs/raid-stripe-tree.c > +++ b/fs/btrfs/raid-stripe-tree.c > @@ -6,6 +6,117 @@ > #include "raid-stripe-tree.h" > #include "volumes.h" > > +int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, > + u64 length) > +{ > + struct btrfs_fs_info *fs_info = trans->fs_info; > + struct btrfs_root *stripe_root = fs_info->stripe_root; > + struct btrfs_path *path; > + struct btrfs_key stripe_key; > + struct btrfs_key found_key; > + struct extent_buffer *leaf; > + u64 end = start + length; > + u64 found_start; > + u64 found_end; > + int slot; > + int ret; > + > + if (!stripe_root) > + return 0; > + > + stripe_key.objectid = start; > + stripe_key.type = BTRFS_RAID_STRIPE_KEY; > + stripe_key.offset = end; > + > + path = btrfs_alloc_path(); > + if (!path) > + return -ENOMEM; > + > + ret = btrfs_search_slot(trans, stripe_root, &stripe_key, path, -1, 1); > + if (ret < 0) > + goto out; > + if (ret == 0) > + goto delete; > + > + leaf = path->nodes[0]; > + slot = path->slots[0]; > + btrfs_item_key_to_cpu(leaf, &found_key, slot); > + found_start = found_key.objectid; > + found_end = found_start + found_key.offset; > + > + /* > + * | -- range to drop --| > + * | ---------- extent ---------- | > + */ Thus I believe we don't need those complex checking. The call site has make sure we're the last one owning the file extent, and since raid stripe is 1:1 mapped to the full extent (not just part of a data extent, like btrfs_file_extent_item can do), we should be safe to just do an ASSERT() without the complex split. Thus, I guess to be extra accurate, the 1:1 mapping is between an (data) EXTENT_ITEM and a raid stripe? Thanks, Qu > +front_split: > + if (start > found_start) { > + struct btrfs_key front_key; > + struct btrfs_dp_stripe *raid_stripe; > + struct extent_buffer *front_leaf; > + struct btrfs_stripe_extent *stripe_extent; > + int num_stripes; > + int i; > + > + front_key.objectid = found_start + length; > + front_key.type = BTRFS_RAID_STRIPE_KEY; > + front_key.offset = found_end - length; > + > + num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot)); > + > + ret = btrfs_duplicate_item(trans, stripe_root, path, &front_key); > + if (ret == -EAGAIN) { > + btrfs_release_path(path); > + goto front_split; > + } > + if (ret < 0) > + goto out; > + front_leaf = path->nodes[0]; > + > + raid_stripe = btrfs_item_ptr(leaf, slot, struct btrfs_dp_stripe); > + stripe_extent = &raid_stripe->extents; > + for (i = 0; i < num_stripes; i++) { > + u64 physical; > + > + physical = btrfs_stripe_extent_offset(leaf, stripe_extent); > + btrfs_set_stripe_extent_offset(front_leaf, stripe_extent, > + physical + length); > + stripe_extent++; > + } > + > + btrfs_mark_buffer_dirty(front_leaf); > + } > + > + /* > + * | -- range to drop --| > + * | ---------- extent ---------- | > + */ > +tail_split: > + if (end < found_end) { > + struct btrfs_key tail_key; > + > + > + tail_key.objectid = start; > + tail_key.type = BTRFS_RAID_STRIPE_KEY; > + tail_key.offset = found_end - end; > + > + ret = btrfs_duplicate_item(trans, stripe_root, path, &tail_key); > + if (ret == -EAGAIN) { > + btrfs_release_path(path); > + goto tail_split; > + } > + if (ret < 0) > + goto out; > + btrfs_mark_buffer_dirty(path->nodes[0]); > + } > + > +delete: > + ret = btrfs_del_item(trans, stripe_root, path); > +out: > + btrfs_free_path(path); > + return ret; > + > +} > + > static void btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, > struct btrfs_io_context *bioc) > { > diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h > index 320a110ecc66..766634df8601 100644 > --- a/fs/btrfs/raid-stripe-tree.h > +++ b/fs/btrfs/raid-stripe-tree.h > @@ -5,8 +5,16 @@ > > #include "volumes.h" > > +int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, > + u64 length); > void btrfs_raid_stripe_tree_fn(struct work_struct *work); > > +static inline int btrfs_num_raid_stripes(u32 item_size) > +{ > + return item_size - offsetof(struct btrfs_dp_stripe, extents) / > + sizeof(struct btrfs_stripe_extent); > +} > + > static inline bool btrfs_need_stripe_tree_update(struct btrfs_io_context *bioc) > { > u64 type = bioc->map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
On 17/05/2022 10:06, Qu Wenruo wrote: >> + ret = btrfs_search_slot(trans, stripe_root, &stripe_key, path, -1, 1); >> + if (ret < 0) >> + goto out; >> + if (ret == 0) >> + goto delete; >> + >> + leaf = path->nodes[0]; >> + slot = path->slots[0]; >> + btrfs_item_key_to_cpu(leaf, &found_key, slot); >> + found_start = found_key.objectid; >> + found_end = found_start + found_key.offset; >> + >> + /* >> + * | -- range to drop --| >> + * | ---------- extent ---------- | >> + */ > Thus I believe we don't need those complex checking. > > The call site has make sure we're the last one owning the file extent, > and since raid stripe is 1:1 mapped to the full extent (not just part of > a data extent, like btrfs_file_extent_item can do), we should be safe to > just do an ASSERT() without the complex split. > > > Thus, I guess to be extra accurate, the 1:1 mapping is between an (data) > EXTENT_ITEM and a raid stripe? Unfortunately not, as we can split extents. I've found out the hard way that we need this. See btrfs_drop_extents() for details.
On 2022/5/17 16:10, Johannes Thumshirn wrote: > On 17/05/2022 10:06, Qu Wenruo wrote: >>> + ret = btrfs_search_slot(trans, stripe_root, &stripe_key, path, -1, 1); >>> + if (ret < 0) >>> + goto out; >>> + if (ret == 0) >>> + goto delete; >>> + >>> + leaf = path->nodes[0]; >>> + slot = path->slots[0]; >>> + btrfs_item_key_to_cpu(leaf, &found_key, slot); >>> + found_start = found_key.objectid; >>> + found_end = found_start + found_key.offset; >>> + >>> + /* >>> + * | -- range to drop --| >>> + * | ---------- extent ---------- | >>> + */ >> Thus I believe we don't need those complex checking. >> >> The call site has make sure we're the last one owning the file extent, >> and since raid stripe is 1:1 mapped to the full extent (not just part of >> a data extent, like btrfs_file_extent_item can do), we should be safe to >> just do an ASSERT() without the complex split. >> >> >> Thus, I guess to be extra accurate, the 1:1 mapping is between an (data) >> EXTENT_ITEM and a raid stripe? > > Unfortunately not, as we can split extents. I've found out the hard way that > we need this. See btrfs_drop_extents() for details. But btrfs extents are immortal, it can only get freed when every bytes are no longer referred to. Btrfs_drop_extents() is complex because it's working on file extent level, but __btrfs_free_extent() is already working on extent level. Or do you mean, the raid stripe is not really bounded to extent level, but really bounded to file extent level? Then I'm not sure if it's a good idea to do such level mapping then. Thanks, Qu
On 17/05/2022 10:15, Qu Wenruo wrote: > But btrfs extents are immortal, it can only get freed when every bytes > are no longer referred to. > > Btrfs_drop_extents() is complex because it's working on file extent > level, but __btrfs_free_extent() is already working on extent level. > > Or do you mean, the raid stripe is not really bounded to extent level, > but really bounded to file extent level? > > Then I'm not sure if it's a good idea to do such level mapping then. It is bound to the file_extent_item as the extent_item is lacking the (logical, lenght) information I need.
On 2022/5/17 16:20, Johannes Thumshirn wrote: > On 17/05/2022 10:15, Qu Wenruo wrote: >> But btrfs extents are immortal, it can only get freed when every bytes >> are no longer referred to. >> >> Btrfs_drop_extents() is complex because it's working on file extent >> level, but __btrfs_free_extent() is already working on extent level. >> >> Or do you mean, the raid stripe is not really bounded to extent level, >> but really bounded to file extent level? >> >> Then I'm not sure if it's a good idea to do such level mapping then. > > It is bound to the file_extent_item as the extent_item is lacking the > (logical, lenght) information I need. Then I guess the it would be way more complex than we though. For file extent we can split it for things like CoW, and it's really too flex and too complex to me. The info of logical and length can still be extracted from from extent item (of course, or we will have way bigger problems). Furthermore, if we are bounded to extent item, then we even have the help from delayed refs update code, to help reduce the IO on stripe tree. Although all those benefits come from the cost of new creative ways to pass the stripe mapping info to delayed refs... Thanks, Qu
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1e24695ede0a..b7b4e421e9b8 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3623,6 +3623,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_RAID_STRIPE_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); if (btrfs_leaf_free_space(leaf) >= ins_len) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f477035a2ac2..00af3e469881 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -36,6 +36,7 @@ #include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" +#include "raid-stripe-tree.h" #undef SCRAMBLE_DELAYED_REFS @@ -3199,6 +3200,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } + if (is_data) { + ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bd329316945f..6021188dcb9a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1009,7 +1009,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, btrfs_release_path(path); out: args->drop_end = found ? min(args->end, last_end) : args->end; - return ret; } diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 426066bd7c0d..370ea68fe343 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -6,6 +6,117 @@ #include "raid-stripe-tree.h" #include "volumes.h" +int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, + u64 length) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *stripe_root = fs_info->stripe_root; + struct btrfs_path *path; + struct btrfs_key stripe_key; + struct btrfs_key found_key; + struct extent_buffer *leaf; + u64 end = start + length; + u64 found_start; + u64 found_end; + int slot; + int ret; + + if (!stripe_root) + return 0; + + stripe_key.objectid = start; + stripe_key.type = BTRFS_RAID_STRIPE_KEY; + stripe_key.offset = end; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, stripe_root, &stripe_key, path, -1, 1); + if (ret < 0) + goto out; + if (ret == 0) + goto delete; + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + found_start = found_key.objectid; + found_end = found_start + found_key.offset; + + /* + * | -- range to drop --| + * | ---------- extent ---------- | + */ +front_split: + if (start > found_start) { + struct btrfs_key front_key; + struct btrfs_dp_stripe *raid_stripe; + struct extent_buffer *front_leaf; + struct btrfs_stripe_extent *stripe_extent; + int num_stripes; + int i; + + front_key.objectid = found_start + length; + front_key.type = BTRFS_RAID_STRIPE_KEY; + front_key.offset = found_end - length; + + num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot)); + + ret = btrfs_duplicate_item(trans, stripe_root, path, &front_key); + if (ret == -EAGAIN) { + btrfs_release_path(path); + goto front_split; + } + if (ret < 0) + goto out; + front_leaf = path->nodes[0]; + + raid_stripe = btrfs_item_ptr(leaf, slot, struct btrfs_dp_stripe); + stripe_extent = &raid_stripe->extents; + for (i = 0; i < num_stripes; i++) { + u64 physical; + + physical = btrfs_stripe_extent_offset(leaf, stripe_extent); + btrfs_set_stripe_extent_offset(front_leaf, stripe_extent, + physical + length); + stripe_extent++; + } + + btrfs_mark_buffer_dirty(front_leaf); + } + + /* + * | -- range to drop --| + * | ---------- extent ---------- | + */ +tail_split: + if (end < found_end) { + struct btrfs_key tail_key; + + + tail_key.objectid = start; + tail_key.type = BTRFS_RAID_STRIPE_KEY; + tail_key.offset = found_end - end; + + ret = btrfs_duplicate_item(trans, stripe_root, path, &tail_key); + if (ret == -EAGAIN) { + btrfs_release_path(path); + goto tail_split; + } + if (ret < 0) + goto out; + btrfs_mark_buffer_dirty(path->nodes[0]); + } + +delete: + ret = btrfs_del_item(trans, stripe_root, path); +out: + btrfs_free_path(path); + return ret; + +} + static void btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_io_context *bioc) { diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index 320a110ecc66..766634df8601 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -5,8 +5,16 @@ #include "volumes.h" +int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, + u64 length); void btrfs_raid_stripe_tree_fn(struct work_struct *work); +static inline int btrfs_num_raid_stripes(u32 item_size) +{ + return item_size - offsetof(struct btrfs_dp_stripe, extents) / + sizeof(struct btrfs_stripe_extent); +} + static inline bool btrfs_need_stripe_tree_update(struct btrfs_io_context *bioc) { u64 type = bioc->map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
Add boilerplate code to delete entries from the raid-stripe-tree if the corresponding file extent got deleted. Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> --- fs/btrfs/ctree.c | 1 + fs/btrfs/extent-tree.c | 9 +++ fs/btrfs/file.c | 1 - fs/btrfs/raid-stripe-tree.c | 111 ++++++++++++++++++++++++++++++++++++ fs/btrfs/raid-stripe-tree.h | 8 +++ 5 files changed, 129 insertions(+), 1 deletion(-)