@@ -1111,8 +1111,8 @@ struct ext4_inode_info {
/* mballoc */
atomic_t i_prealloc_active;
- struct list_head i_prealloc_list;
- spinlock_t i_prealloc_lock;
+ struct rb_root i_prealloc_node;
+ rwlock_t i_prealloc_lock;
/* extents status tree */
struct ext4_es_tree i_es_tree;
@@ -4020,6 +4020,24 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
}
+/*
+ * Pick the child to descend into while walking the inode PA rbtree.
+ * The caller must hold the inode PA rbtree lock (ei->i_prealloc_lock).
+ *
+ * new_start	start of the range being looked up
+ * cur_start	start of the PA stored in @node
+ * node		current rbtree node, dereferenced unconditionally
+ *
+ * Returns the left child if new_start < cur_start, else the right child.
+ * NOTE(review): callers pass ext4_lblk_t values into these int parameters,
+ * so the comparison is signed - confirm this is intended for large lblks.
+ */
+static inline struct rb_node*
+ext4_mb_pa_rb_next_iter(int new_start, int cur_start, struct rb_node *node)
+{
+	return (new_start < cur_start) ? node->rb_left : node->rb_right;
+}
+
static inline void
ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
ext4_lblk_t start, ext4_lblk_t end)
@@ -4028,27 +4046,31 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_prealloc_space *tmp_pa;
ext4_lblk_t tmp_pa_start, tmp_pa_end;
+ struct rb_node *iter;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) {
- spin_lock(&tmp_pa->pa_lock);
- if (tmp_pa->pa_deleted == 0) {
- tmp_pa_start = tmp_pa->pa_lstart;
- tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+ read_lock(&ei->i_prealloc_lock);
+ iter = ei->i_prealloc_node.rb_node;
+ while (iter) {
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ tmp_pa_start = tmp_pa->pa_lstart;
+ tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0)
BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start));
- }
spin_unlock(&tmp_pa->pa_lock);
+
+ iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter);
}
- rcu_read_unlock();
+ read_unlock(&ei->i_prealloc_lock);
}
-
/*
* Given an allocation context "ac" and a range "start", "end", check
* and adjust boundaries if the range overlaps with any of the existing
* preallocatoins stored in the corresponding inode of the allocation context.
*
- *Parameters:
+ * Parameters:
* ac allocation context
* start start of the new range
* end end of the new range
@@ -4060,6 +4082,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *tmp_pa;
+ struct rb_node *iter;
ext4_lblk_t new_start, new_end;
ext4_lblk_t tmp_pa_start, tmp_pa_end;
@@ -4067,19 +4090,29 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
new_end = *end;
/* check we don't cross already preallocated blocks */
- rcu_read_lock();
- list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) {
- if (tmp_pa->pa_deleted)
+ read_lock(&ei->i_prealloc_lock);
+ iter = ei->i_prealloc_node.rb_node;
+ while (iter) {
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ tmp_pa_start = tmp_pa->pa_lstart;
+ tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+
+ /*
+ * If pa is deleted, ignore overlaps and just iterate in rbtree
+ * based on tmp_pa_start
+ */
+ if (tmp_pa->pa_deleted) {
+ iter = ext4_mb_pa_rb_next_iter(new_start, tmp_pa_start, iter);
continue;
+ }
spin_lock(&tmp_pa->pa_lock);
if (tmp_pa->pa_deleted) {
spin_unlock(&tmp_pa->pa_lock);
+ iter = ext4_mb_pa_rb_next_iter(new_start, tmp_pa_start, iter);
continue;
}
- tmp_pa_start = tmp_pa->pa_lstart;
- tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
-
/* PA must not overlap original request */
BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end ||
ac->ac_o_ex.fe_logical < tmp_pa_start));
@@ -4087,6 +4120,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
/* skip PAs this normalized request doesn't overlap with */
if (tmp_pa_start >= new_end || tmp_pa_end <= new_start) {
spin_unlock(&tmp_pa->pa_lock);
+ iter = ext4_mb_pa_rb_next_iter(new_start, tmp_pa_start, iter);
continue;
}
BUG_ON(tmp_pa_start <= new_start && tmp_pa_end >= new_end);
@@ -4100,8 +4134,9 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
new_end = tmp_pa_start;
}
spin_unlock(&tmp_pa->pa_lock);
+ iter = ext4_mb_pa_rb_next_iter(new_start, tmp_pa_start, iter);
}
- rcu_read_unlock();
+ read_unlock(&ei->i_prealloc_lock);
/* XXX: extra loop to check we really don't overlap preallocations */
ext4_mb_pa_assert_overlap(ac, new_start, new_end);
@@ -4228,7 +4263,6 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
ext4_mb_pa_adjust_overlap(ac, &start, &end);
size = end - start;
-
/*
* In this function "start" and "size" are normalized for better
* alignment and length such that we could preallocate more blocks.
@@ -4437,6 +4471,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
struct ext4_locality_group *lg;
struct ext4_prealloc_space *tmp_pa, *cpa = NULL;
ext4_lblk_t tmp_pa_start, tmp_pa_end;
+ struct rb_node *iter;
ext4_fsblk_t goal_block;
/* only data can be preallocated */
@@ -4444,17 +4479,23 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
return false;
/* first, try per-file preallocation */
- rcu_read_lock();
- list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) {
+ read_lock(&ei->i_prealloc_lock);
+ iter = ei->i_prealloc_node.rb_node;
+ while (iter) {
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node);
/* all fields in this condition don't change,
* so we can skip locking for them */
tmp_pa_start = tmp_pa->pa_lstart;
tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+ /* original request start doesn't lie in this PA */
if (ac->ac_o_ex.fe_logical < tmp_pa_start ||
- ac->ac_o_ex.fe_logical >= tmp_pa_end)
+ ac->ac_o_ex.fe_logical >= tmp_pa_end) {
+ iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
+ tmp_pa_start, iter);
continue;
+ }
/* non-extent files can't have physical blocks past 2^32 */
if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
@@ -4474,12 +4515,14 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
ext4_mb_use_inode_pa(ac, tmp_pa);
spin_unlock(&tmp_pa->pa_lock);
ac->ac_criteria = 10;
- rcu_read_unlock();
+ read_unlock(&ei->i_prealloc_lock);
return true;
}
spin_unlock(&tmp_pa->pa_lock);
+ iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
+ tmp_pa_start, iter);
}
- rcu_read_unlock();
+ read_unlock(&ei->i_prealloc_lock);
/* can we use group allocation? */
if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
@@ -4631,6 +4674,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
{
ext4_group_t grp;
ext4_fsblk_t grp_blk;
+ struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
/* in this short window concurrent discard can set pa_deleted */
spin_lock(&pa->pa_lock);
@@ -4676,16 +4720,34 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
ext4_unlock_group(sb, grp);
if (pa->pa_type == MB_INODE_PA) {
- spin_lock(pa->pa_node_lock.inode_lock);
- list_del_rcu(&pa->pa_node.inode_list);
- spin_unlock(pa->pa_node_lock.inode_lock);
+ write_lock(pa->pa_node_lock.inode_lock);
+ rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
+ write_unlock(pa->pa_node_lock.inode_lock);
+ ext4_mb_pa_free(pa);
} else {
spin_lock(pa->pa_node_lock.lg_lock);
list_del_rcu(&pa->pa_node.lg_list);
spin_unlock(pa->pa_node_lock.lg_lock);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
+}
+
+/*
+ * Comparator handed to ext4_mb_rb_insert() for inode PAs: returns 1 if @new
+ * starts before @cur, -1 otherwise (never 0). Starts compare as ext4_grpblk_t.
+ * NOTE(review): callers keep pa_lstart in ext4_lblk_t - confirm the type.
+ */
+static int ext4_mb_pa_cmp(struct rb_node *new, struct rb_node *cur)
+{
+	struct ext4_prealloc_space *new_pa, *cur_pa;
+	ext4_grpblk_t new_lstart, cur_lstart;
+
+	new_pa = rb_entry(new, struct ext4_prealloc_space, pa_node.inode_node);
+	cur_pa = rb_entry(cur, struct ext4_prealloc_space, pa_node.inode_node);
+	new_lstart = new_pa->pa_lstart;
+	cur_lstart = cur_pa->pa_lstart;
- call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+	return (new_lstart < cur_lstart) ? 1 : -1;
}
/*
@@ -4751,7 +4813,6 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len;
spin_lock_init(&pa->pa_lock);
- INIT_LIST_HEAD(&pa->pa_node.inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
pa->pa_type = MB_INODE_PA;
@@ -4771,9 +4832,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
- spin_lock(pa->pa_node_lock.inode_lock);
- list_add_rcu(&pa->pa_node.inode_list, &ei->i_prealloc_list);
- spin_unlock(pa->pa_node_lock.inode_lock);
+ write_lock(pa->pa_node_lock.inode_lock);
+ ext4_mb_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node,
+ ext4_mb_pa_cmp);
+ write_unlock(pa->pa_node_lock.inode_lock);
atomic_inc(&ei->i_prealloc_active);
}
@@ -4939,6 +5001,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
struct ext4_prealloc_space *pa, *tmp;
struct list_head list;
struct ext4_buddy e4b;
+ struct ext4_inode_info *ei;
int err;
int free = 0;
@@ -5002,18 +5065,21 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
list_del_rcu(&pa->pa_node.lg_list);
spin_unlock(pa->pa_node_lock.lg_lock);
} else {
- spin_lock(pa->pa_node_lock.inode_lock);
- list_del_rcu(&pa->pa_node.inode_list);
- spin_unlock(pa->pa_node_lock.inode_lock);
+ write_lock(pa->pa_node_lock.inode_lock);
+ ei = EXT4_I(pa->pa_inode);
+ rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
+ write_unlock(pa->pa_node_lock.inode_lock);
}
- if (pa->pa_type == MB_GROUP_PA)
+ list_del(&pa->u.pa_tmp_list);
+
+ if (pa->pa_type == MB_GROUP_PA) {
ext4_mb_release_group_pa(&e4b, pa);
- else
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ } else {
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
-
- list_del(&pa->u.pa_tmp_list);
- call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ ext4_mb_pa_free(pa);
+ }
}
ext4_unlock_group(sb, group);
@@ -5043,6 +5109,7 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
ext4_group_t group = 0;
struct list_head list;
struct ext4_buddy e4b;
+ struct rb_node *iter;
int err;
if (!S_ISREG(inode->i_mode)) {
@@ -5065,17 +5132,18 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
repeat:
/* first, collect all pa's in the inode */
- spin_lock(&ei->i_prealloc_lock);
- while (!list_empty(&ei->i_prealloc_list) && needed) {
- pa = list_entry(ei->i_prealloc_list.prev,
- struct ext4_prealloc_space, pa_node.inode_list);
+ write_lock(&ei->i_prealloc_lock);
+ for (iter = rb_first(&ei->i_prealloc_node); iter && needed; iter = rb_next(iter)) {
+ pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock);
+
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
/* this shouldn't happen often - nobody should
* use preallocation while we're discarding it */
spin_unlock(&pa->pa_lock);
- spin_unlock(&ei->i_prealloc_lock);
+ write_unlock(&ei->i_prealloc_lock);
ext4_msg(sb, KERN_ERR,
"uh-oh! used pa while discarding");
WARN_ON(1);
@@ -5086,7 +5154,7 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
if (pa->pa_deleted == 0) {
ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
- list_del_rcu(&pa->pa_node.inode_list);
+ rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
list_add(&pa->u.pa_tmp_list, &list);
needed--;
continue;
@@ -5094,7 +5162,7 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
/* someone is deleting pa right now */
spin_unlock(&pa->pa_lock);
- spin_unlock(&ei->i_prealloc_lock);
+ write_unlock(&ei->i_prealloc_lock);
/* we have to wait here because pa_deleted
* doesn't mean pa is already unlinked from
@@ -5111,7 +5179,7 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
schedule_timeout_uninterruptible(HZ);
goto repeat;
}
- spin_unlock(&ei->i_prealloc_lock);
+ write_unlock(&ei->i_prealloc_lock);
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
BUG_ON(pa->pa_type != MB_INODE_PA);
@@ -5143,7 +5211,7 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
put_bh(bitmap_bh);
list_del(&pa->u.pa_tmp_list);
- call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ ext4_mb_pa_free(pa);
}
}
@@ -5516,7 +5584,6 @@ static void ext4_mb_trim_inode_pa(struct inode *inode)
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
struct inode *inode = ac->ac_inode;
- struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
@@ -5543,16 +5610,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
}
}
- if (pa->pa_type == MB_INODE_PA) {
- /*
- * treat per-inode prealloc list as a lru list, then try
- * to trim the least recently used PA.
- */
- spin_lock(pa->pa_node_lock.inode_lock);
- list_move(&pa->pa_node.inode_list, &ei->i_prealloc_list);
- spin_unlock(pa->pa_node_lock.inode_lock);
- }
-
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)
@@ -115,7 +115,7 @@ struct ext4_free_data {
struct ext4_prealloc_space {
union {
- struct list_head inode_list; /* for inode PAs */
+ struct rb_node inode_node; /* for inode PA rbtree */
struct list_head lg_list; /* for lg PAs */
} pa_node;
struct list_head pa_group_list;
@@ -132,10 +132,10 @@ struct ext4_prealloc_space {
ext4_grpblk_t pa_free; /* how many blocks are free */
unsigned short pa_type; /* pa type. inode or group */
union {
- spinlock_t *inode_lock; /* locks the inode list holding this PA */
+ rwlock_t *inode_lock; /* locks the rbtree holding this PA */
spinlock_t *lg_lock; /* locks the lg list holding this PA */
} pa_node_lock;
- struct inode *pa_inode; /* hack, for history only */
+ struct inode *pa_inode; /* used to get the inode during group discard */
};
enum {
@@ -1330,9 +1330,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
inode_set_iversion(&ei->vfs_inode, 1);
spin_lock_init(&ei->i_raw_lock);
- INIT_LIST_HEAD(&ei->i_prealloc_list);
+ ei->i_prealloc_node = RB_ROOT;
atomic_set(&ei->i_prealloc_active, 0);
- spin_lock_init(&ei->i_prealloc_lock);
+ rwlock_init(&ei->i_prealloc_lock);
ext4_es_init_tree(&ei->i_es_tree);
rwlock_init(&ei->i_es_lock);
INIT_LIST_HEAD(&ei->i_es_list);