@@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
- subpage.o tree-mod-log.o
+ subpage.o tree-mod-log.o raid-stripe-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
@@ -1092,6 +1092,9 @@ struct btrfs_fs_info {
/* Updates are not protected by any lock */
struct btrfs_commit_stats commit_stats;
+ struct mutex stripe_update_lock;
+ struct rb_root stripe_update_tree;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -3155,6 +3155,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
+
+ mutex_init(&fs_info->stripe_update_lock);
+ fs_info->stripe_update_tree = RB_ROOT;
}
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
@@ -36,6 +36,7 @@
#include "rcu-string.h"
#include "zoned.h"
#include "dev-replace.h"
+#include "raid-stripe-tree.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -1491,6 +1492,47 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
return ret;
}
+static int add_stripe_entry_for_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct extent_map *em;
+ struct map_lookup *map;
+ int ret;
+
+ if (!fs_info->stripe_root)
+ return 0;
+
+ em = btrfs_get_chunk_map(fs_info, node->bytenr, node->num_bytes);
+ if (!em) {
+ btrfs_err(fs_info,
+ "cannot get chunk map for address %llu",
+ node->bytenr);
+ return -EINVAL;
+ }
+
+ map = em->map_lookup;
+
+ if (btrfs_need_stripe_tree_update(fs_info, map->type)) {
+ struct btrfs_ordered_stripe *stripe;
+
+ stripe = btrfs_lookup_ordered_stripe(fs_info, node->bytenr);
+ if (!stripe) {
+ btrfs_err(fs_info,
+ "cannot get stripe extent for address %llu (%llu)",
+ node->bytenr, node->num_bytes);
+ free_extent_map(em);
+ return -EINVAL;
+ }
+ ASSERT(stripe->logical == node->bytenr);
+ ret = btrfs_insert_raid_extent(trans, stripe);
+ btrfs_put_ordered_stripe(fs_info, stripe);
+ }
+ free_extent_map(em);
+
+ return ret;
+}
+
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
@@ -1521,6 +1563,9 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
flags, ref->objectid,
ref->offset, &ins,
node->ref_mod);
+ if (ret)
+ return ret;
+ ret = add_stripe_entry_for_delayed_ref(trans, node);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
ref->objectid, ref->offset,
new file mode 100644
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/btrfs_tree.h>
+
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "raid-stripe-tree.h"
+#include "volumes.h"
+
+static struct rb_node *stripe_tree_insert(struct rb_root *root, u64 logical,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct btrfs_ordered_stripe *stripe;
+ struct rb_node *parent = NULL;
+
+ while (*p) {
+ parent = *p;
+ stripe = rb_entry(*p, struct btrfs_ordered_stripe, rb_node);
+
+ if (logical < stripe->logical)
+ p = &(*p)->rb_left;
+ else if (logical >= stripe->logical + stripe->num_bytes)
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+static struct btrfs_ordered_stripe *btrfs_add_ordered_stripe(
+ struct btrfs_fs_info *fs_info,
+ u64 logical,
+ u64 length, int num_stripes,
+ struct btrfs_io_stripe *stripes)
+{
+ struct btrfs_ordered_stripe *stripe;
+ struct btrfs_io_stripe *tmp;
+ struct rb_node *node;
+ size_t size;
+
+ size = num_stripes * sizeof(struct btrfs_io_stripe);
+ stripe = kzalloc(sizeof(struct btrfs_ordered_stripe), GFP_NOFS);
+ if (!stripe)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&stripe->lock);
+ tmp = kmemdup(stripes, size, GFP_NOFS);
+ if (!tmp) {
+ kfree(stripe);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ stripe->logical = logical;
+ stripe->num_bytes = length;
+ stripe->num_stripes = num_stripes;
+ spin_lock(&stripe->lock);
+ stripe->stripes = tmp;
+ spin_unlock(&stripe->lock);
+ refcount_set(&stripe->ref, 1);
+
+ mutex_lock(&fs_info->stripe_update_lock);
+ node = stripe_tree_insert(&fs_info->stripe_update_tree, logical,
+ &stripe->rb_node);
+ mutex_unlock(&fs_info->stripe_update_lock);
+
+ if (node) {
+ btrfs_panic(fs_info, -EEXIST,
+ "inconsistency in ordered stripes at offset %llu",
+ logical);
+ kfree(stripe->stripes);
+ kfree(stripe);
+ return ERR_PTR(-EEXIST);
+ }
+
+ return stripe;
+}
+
+struct btrfs_ordered_stripe *btrfs_lookup_ordered_stripe(struct btrfs_fs_info *fs_info,
+ u64 logical)
+{
+ struct rb_root *root = &fs_info->stripe_update_tree;
+ struct btrfs_ordered_stripe *stripe;
+ struct rb_node *n = root->rb_node;
+
+ mutex_lock(&fs_info->stripe_update_lock);
+ while (n) {
+ stripe = rb_entry(n, struct btrfs_ordered_stripe, rb_node);
+
+ if (logical < stripe->logical) {
+ n = n->rb_left;
+ stripe = NULL;
+ } else if (logical >= stripe->logical + stripe->num_bytes) {
+ n = n->rb_right;
+ stripe = NULL;
+ } else {
+ break;
+ }
+ }
+ if (stripe)
+ btrfs_get_ordered_stripe(stripe);
+ mutex_unlock(&fs_info->stripe_update_lock);
+
+ return stripe;
+}
+
+void btrfs_remove_ordered_stripe(struct btrfs_fs_info *fs_info,
+ struct btrfs_ordered_stripe *stripe)
+{
+ struct rb_node *node = &stripe->rb_node;
+
+ mutex_lock(&fs_info->stripe_update_lock);
+ rb_erase(node, &fs_info->stripe_update_tree);
+ RB_CLEAR_NODE(node);
+ mutex_unlock(&fs_info->stripe_update_lock);
+
+ spin_lock(&stripe->lock);
+ kfree(stripe->stripes);
+ spin_unlock(&stripe->lock);
+ kfree(stripe);
+}
+
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_stripe *stripe)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_key stripe_key;
+ struct btrfs_root *stripe_root = fs_info->stripe_root;
+ struct btrfs_dp_stripe *raid_stripe;
+ size_t item_size;
+ int ret;
+
+ item_size = stripe->num_stripes * sizeof(struct btrfs_stripe_extent);
+
+ raid_stripe = kzalloc(item_size, GFP_NOFS);
+ if (!raid_stripe) {
+ btrfs_abort_transaction(trans, -ENOMEM);
+ btrfs_end_transaction(trans);
+ return -ENOMEM;
+ }
+
+ spin_lock(&stripe->lock);
+ for (int i = 0; i < stripe->num_stripes; i++) {
+ u64 devid = stripe->stripes[i].dev->devid;
+ u64 physical = stripe->stripes[i].physical;
+ struct btrfs_stripe_extent *stripe_extent =
+ &raid_stripe->extents[i];
+
+ btrfs_set_stack_stripe_extent_devid(stripe_extent, devid);
+ btrfs_set_stack_stripe_extent_physical(stripe_extent, physical);
+ }
+ spin_unlock(&stripe->lock);
+
+ stripe_key.objectid = stripe->logical;
+ stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+ stripe_key.offset = stripe->num_bytes;
+
+ ret = btrfs_insert_item(trans, stripe_root, &stripe_key, raid_stripe,
+ item_size);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+
+ kfree(raid_stripe);
+
+ return ret;
+}
+
+void btrfs_raid_stripe_update(struct work_struct *work)
+{
+ struct btrfs_io_context *bioc =
+ container_of(work, struct btrfs_io_context, stripe_update_work);
+ struct btrfs_ordered_stripe *stripe;
+ struct bio *bio = bioc->orig_bio;
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
+
+ stripe = btrfs_add_ordered_stripe(fs_info, bioc->logical, bioc->length,
+ bioc->num_stripes, bioc->stripes);
+ if (IS_ERR(stripe)) {
+ btrfs_bio_counter_dec(fs_info);
+ bio->bi_status = errno_to_blk_status(PTR_ERR(stripe));
+ }
+ btrfs_put_bioc(bioc);
+}
+
new file mode 100644
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_RAID_STRIPE_TREE_H
+#define BTRFS_RAID_STRIPE_TREE_H
+
+#include "volumes.h"
+
+struct btrfs_ordered_stripe {
+ struct rb_node rb_node;
+
+ u64 logical;
+ u64 num_bytes;
+ int num_stripes;
+ struct btrfs_io_stripe *stripes;
+ spinlock_t lock;
+ refcount_t ref;
+};
+
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_stripe *stripe);
+void btrfs_raid_stripe_update(struct work_struct *work);
+struct btrfs_ordered_stripe *btrfs_lookup_ordered_stripe(
+ struct btrfs_fs_info *fs_info,
+ u64 logical);
+void btrfs_remove_ordered_stripe(struct btrfs_fs_info *fs_info,
+ struct btrfs_ordered_stripe *stripe);
+
+static inline void btrfs_get_ordered_stripe(struct btrfs_ordered_stripe *stripe)
+{
+ refcount_inc(&stripe->ref);
+}
+
+static inline void btrfs_put_ordered_stripe(struct btrfs_fs_info *fs_info,
+ struct btrfs_ordered_stripe *stripe)
+{
+ if (refcount_dec_and_test(&stripe->ref))
+ btrfs_remove_ordered_stripe(fs_info, stripe);
+}
+
+static inline int btrfs_num_raid_stripes(u32 item_size)
+{
+ return item_size - offsetof(struct btrfs_dp_stripe, extents) /
+ sizeof(struct btrfs_stripe_extent);
+}
+
+static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
+ u64 map_type)
+{
+ u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
+ u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ if (!fs_info->stripe_root)
+ return false;
+
+ // for now
+ if (type != BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ if (profile & BTRFS_BLOCK_GROUP_RAID1_MASK)
+ return true;
+
+ return false;
+}
+
+#endif
@@ -33,6 +33,7 @@
#include "block-group.h"
#include "discard.h"
#include "zoned.h"
+#include "raid-stripe-tree.h"
#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID10 | \
@@ -5897,6 +5898,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
bioc->fs_info = fs_info;
bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
+ INIT_WORK(&bioc->stripe_update_work, btrfs_raid_stripe_update);
return bioc;
}
@@ -6623,6 +6625,7 @@ static void btrfs_end_bio_work(struct work_struct *work)
static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
{
struct bio *orig_bio = bioc->orig_bio;
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_bio *bbio = btrfs_bio(orig_bio);
bbio->mirror_num = bioc->mirror_num;
@@ -6642,6 +6645,12 @@ static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work);
} else {
+ if (btrfs_op(orig_bio) == BTRFS_MAP_WRITE &&
+ btrfs_need_stripe_tree_update(fs_info,
+ bioc->map_type)) {
+ btrfs_get_bioc(bioc);
+ schedule_work(&bioc->stripe_update_work);
+ }
bio_endio(orig_bio);
}
@@ -6667,6 +6676,8 @@ static void btrfs_end_bio(struct bio *bio)
btrfs_dev_stat_inc_and_print(stripe->dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
}
+ } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
}
if (bio != bioc->orig_bio)
@@ -6754,6 +6765,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
bioc->orig_bio = bio;
bioc->private = bio->bi_private;
bioc->end_io = bio->bi_end_io;
+ bioc->logical = logical;
+ bioc->length = length;
atomic_set(&bioc->stripes_pending, total_devs);
if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
@@ -14,6 +14,7 @@
#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
extern struct mutex uuid_mutex;
+struct btrfs_ordered_stripe;
#define BTRFS_STRIPE_LEN SZ_64K
@@ -463,6 +464,9 @@ struct btrfs_io_context {
int mirror_num;
int num_tgtdevs;
int *tgtdev_map;
+ u64 logical;
+ u64 length;
+ struct work_struct stripe_update_work;
/*
* logical block numbers for the start of each stripe
* The last one or two are p/q. These are sorted,
@@ -1641,6 +1641,10 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
u64 *logical = NULL;
int nr, stripe_len;
+ /* Filesystems with a stripe tree have their own l2p mapping */
+ if (fs_info->stripe_root)
+ return;
+
/* Zoned devices should not have partitions. So, we can assume it is 0 */
ASSERT(!bdev_is_partition(ordered->bdev));
if (WARN_ON(!ordered->bdev))
Add boilerplate code to insert raid extents into the raid-stripe-tree on each write to a RAID1 block-group. Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.h | 3 + fs/btrfs/disk-io.c | 3 + fs/btrfs/extent-tree.c | 45 +++++++++ fs/btrfs/raid-stripe-tree.c | 188 ++++++++++++++++++++++++++++++++++++ fs/btrfs/raid-stripe-tree.h | 65 +++++++++++++ fs/btrfs/volumes.c | 13 +++ fs/btrfs/volumes.h | 4 + fs/btrfs/zoned.c | 4 + 9 files changed, 326 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/raid-stripe-tree.c create mode 100644 fs/btrfs/raid-stripe-tree.h