@@ -2326,11 +2326,21 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
/*
* For subpage size support, btree inode tracks EXTENT_UPTODATE for
* its IO.
+ *
+ * Also, never merge extent states so that set/clear operations never need
+ * to allocate memory, except for the initial EXTENT_HAS_TREE_BLOCK bit.
+ * This adds extra ~150 bytes for each extent buffer.
+ *
+ * TODO: Josef's rwsem rework on tree lock would kill the leave_spinning
+ * case, and then we can revert this behavior.
*/
- if (btrfs_is_subpage(fs_info))
+ if (btrfs_is_subpage(fs_info)) {
BTRFS_I(inode)->io_tree.track_uptodate = true;
- else
+ BTRFS_I(inode)->io_tree.never_merge = true;
+ } else {
BTRFS_I(inode)->io_tree.track_uptodate = false;
+ BTRFS_I(inode)->io_tree.never_merge = false;
+ }
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
@@ -71,6 +71,20 @@ struct extent_io_tree {
u64 dirty_bytes;
bool track_uptodate;
+ /*
+ * Never to merge extent_state.
+ *
+ * This allows any set/clear function to be executed in atomic context
+ * without allocating extra memory.
+ * The cost is extra memory usage.
+ *
+ * Should only be used for subpage btree io tree, which mostly adds per
+ * extent buffer memory usage.
+ *
+ * Default: false.
+ */
+ bool never_merge;
+
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
@@ -286,6 +286,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
spin_lock_init(&tree->lock);
tree->private_data = private_data;
tree->owner = owner;
+ tree->never_merge = false;
if (owner == IO_TREE_INODE_FILE_EXTENT)
lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
@@ -481,11 +482,18 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
}
/*
- * utility function to look for merge candidates inside a given range.
+ * Utility function to look for merge candidates inside a given range.
* Any extents with matching state are merged together into a single
- * extent in the tree. Extents with EXTENT_IO in their state field
- * are not merged because the end_io handlers need to be able to do
- * operations on them without sleeping (or doing allocations/splits).
+ * extent in the tree.
+ *
+ * Except the following cases:
+ * - extent_state with EXTENT_LOCKED or EXTENT_BOUNDARY bit set
+ * Those extents are not merged because end_io handlers need to be able
+ * to do operations on them without sleeping (or doing allocations/splits)
+ *
+ * - extent_io_tree with never_merge bit set
+ *   Same reason as above, but extra call sites may have a spinlock/rwlock held,
+ * and we don't want to abuse GFP_ATOMIC.
*
* This should be called with the tree lock held.
*/
@@ -495,7 +503,8 @@ static void merge_state(struct extent_io_tree *tree,
struct extent_state *other;
struct rb_node *other_node;
- if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY) ||
+ tree->never_merge)
return;
other_node = rb_prev(&state->rb_node);
For incoming subpage metadata rw support, prevent extent_state from being merged for the btree io tree. The main cause is set_extent_buffer_dirty(). In the following call chain, we could fall into the situation where we have to call set_extent_dirty() in atomic context: alloc_reserved_tree_block() |- path->leave_spinning = 1; |- btrfs_insert_empty_item() |- btrfs_search_slot() | Now the path has all its tree blocks spin locked |- setup_items_for_insert(); |- btrfs_unlock_up_safe(path, 1); | Now path->nodes[0] is still spin locked |- btrfs_mark_buffer_dirty(leaf); |- set_extent_buffer_dirty() Since set_extent_buffer_dirty() is in fact a pretty common call, just falling back to the GFP_ATOMIC allocation used in __set_extent_bit() may exhaust the pool sooner than we expect. So this patch goes another direction, by not merging any extent_state for the subpage btree io tree. Since for the subpage btree io tree all in-tree extent buffers have the EXTENT_HAS_TREE_BLOCK bit set during their lifespan, as long as extent_states are not merged, each extent buffer has its own extent_state, so that set/clear_extent_bit() can reuse the existing extent buffer extent_state without allocating new memory. The cost is obvious, around 150 bytes per subpage extent buffer. But considering that for a subpage extent buffer we save 15 page pointers, which is 120 bytes, the net cost is just 30 bytes per subpage extent buffer, which should be acceptable. Signed-off-by: Qu Wenruo <wqu@suse.com> --- fs/btrfs/disk-io.c | 14 ++++++++++++-- fs/btrfs/extent-io-tree.h | 14 ++++++++++++++ fs/btrfs/extent_io.c | 19 ++++++++++++++----- 3 files changed, 40 insertions(+), 7 deletions(-)