@@ -522,55 +522,40 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}
-static int btrfs_insert_dev_extents(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct map_lookup *map, u64 stripe_size)
+static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 chunk_offset, u64 physical,
+ u64 stripe_size)
{
- int ret = 0;
- struct btrfs_path path;
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_dev_extent *extent;
struct extent_buffer *leaf;
struct btrfs_key key;
- int i;
- btrfs_init_path(&path);
-
- for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_device *device = map->stripes[i].dev;
-
- key.objectid = device->devid;
- key.offset = map->stripes[i].physical;
- key.type = BTRFS_DEV_EXTENT_KEY;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
- ret = btrfs_insert_empty_item(trans, root, &path, &key,
- sizeof(*extent));
- if (ret < 0)
- goto out;
- leaf = path.nodes[0];
- extent = btrfs_item_ptr(leaf, path.slots[0],
- struct btrfs_dev_extent);
- btrfs_set_dev_extent_chunk_tree(leaf, extent,
+ key.objectid = device->devid;
+ key.offset = physical;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
+ if (ret)
+ goto out;
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
+ btrfs_set_dev_extent_chunk_tree(leaf, extent,
BTRFS_CHUNK_TREE_OBJECTID);
- btrfs_set_dev_extent_chunk_objectid(leaf, extent,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
- btrfs_set_dev_extent_chunk_offset(leaf, extent, map->ce.start);
-
- write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
- (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
- BTRFS_UUID_SIZE);
-
- btrfs_set_dev_extent_length(leaf, extent, stripe_size);
- btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(&path);
-
- device->bytes_used += stripe_size;
- ret = btrfs_update_device(trans, device);
- if (ret < 0)
- goto out;
- }
+ btrfs_set_dev_extent_chunk_objectid(leaf, extent,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+ btrfs_set_dev_extent_length(leaf, extent, stripe_size);
+ btrfs_mark_buffer_dirty(leaf);
out:
- btrfs_release_path(&path);
+ btrfs_free_path(path);
return ret;
}
@@ -812,28 +797,28 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
/ sizeof(struct btrfs_stripe) + 1)
/*
- * Alloc a chunk, will insert dev extents, chunk item, and insert new
- * block group and update space info (so that extent allocator can use
- * newly allocated chunk).
+ * Alloc a chunk mapping.
+ * Does the chunk size calculation and the free dev extent search, and inserts
+ * the chunk mapping into the chunk mapping tree.
+ *
+ * NOTE: This function doesn't insert any chunk item or dev extent; that is
+ * handled later by btrfs_finish_chunk_alloc().
+ * For convert chunks (1:1 mapped, more flexible chunk location), allocation
+ * is handled by __btrfs_alloc_convert_chunk() instead.
+ *
+ * Qu: The block group item is still inserted in this function by
+ * btrfs_make_block_group(), which still differs from the kernel.
*
* @start: return value of allocated chunk start bytenr.
* @num_bytes: return value of allocated chunk size
* @type: chunk type (including both profile and type)
- * @convert: if the chunk is allocated for convert case.
- * If @convert is true, chunk allocator will skip device extent
- * search, but use *start and *num_bytes as chunk start/num_bytes
- * and devive offset, to build a 1:1 chunk mapping for convert.
*/
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *info, u64 *start,
- u64 *num_bytes, u64 type, bool convert)
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *info, u64 *start,
+ u64 *num_bytes, u64 type)
{
- struct btrfs_root *extent_root = info->extent_root;
- struct btrfs_root *chunk_root = info->chunk_root;
struct btrfs_device *device = NULL;
- struct btrfs_chunk *chunk;
struct list_head *dev_list = &info->fs_devices->devices;
- struct btrfs_stripe *stripe;
struct map_lookup *map;
struct btrfs_device_info *devices_info = NULL;
u64 percent_max;
@@ -844,8 +829,6 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
int index;
int ndevs = 0;
int rw_devs = 0;
- int stripe_len = BTRFS_STRIPE_LEN;
- struct btrfs_key key;
u64 offset;
int data_stripes; /* number of stripes that counts for bg size */
int sub_stripes; /* sub_stripes info for map */
@@ -873,34 +856,6 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
devs_increment = btrfs_raid_array[index].devs_increment;
ncopies = btrfs_raid_array[index].ncopies;
- if (convert) {
- /* For convert, profile must be SINGLE */
- if (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
- error("convert only suports SINGLE profile");
- return -EINVAL;
- }
- if (!IS_ALIGNED(*start, info->sectorsize)) {
- error("chunk start not aligned, start=%llu sectorsize=%u",
- *start, info->sectorsize);
- return -EINVAL;
- }
- if (!IS_ALIGNED(*num_bytes, info->sectorsize)) {
- error("chunk size not aligned, size=%llu sectorsize=%u",
- *num_bytes, info->sectorsize);
- return -EINVAL;
- }
- num_stripes = 1;
- data_stripes = 1;
- offset = *start;
- stripe_size = *num_bytes;
- /*
- * For convert, we use 1:1 chunk mapping specified by @start and
- * @num_bytes, so there is no need to go through dev_extent
- * searching.
- */
- goto alloc_chunk;
- }
-
/*
* Chunk size calculation part.
*/
@@ -1046,55 +1001,23 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
/*
* Fill chunk mapping and chunk stripes
*/
-alloc_chunk:
- if (!convert) {
- ret = find_next_chunk(info, &offset);
- if (ret)
- goto out_devices_info;
- }
- key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.type = BTRFS_CHUNK_ITEM_KEY;
- key.offset = offset;
- *num_bytes = stripe_size * data_stripes;
-
- chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
- if (!chunk)
+ ret = find_next_chunk(info, &offset);
+ if (ret)
goto out_devices_info;
+ *num_bytes = stripe_size * data_stripes;
map = kmalloc(btrfs_map_lookup_size(num_stripes), GFP_NOFS);
if (!map)
goto out_chunk_map;
map->num_stripes = num_stripes;
- if (convert) {
- device = list_entry(dev_list->next, struct btrfs_device,
- dev_list);
+ for (i = 0; i < ndevs; i++) {
+ for (j = 0; j < dev_stripes; ++j) {
+ int s = i * dev_stripes + j;
- map->stripes[0].dev = device;
- map->stripes[0].physical = *start;
- stripe = &chunk->stripe;
- btrfs_set_stack_stripe_devid(stripe, device->devid);
- btrfs_set_stack_stripe_offset(stripe, *start);
- memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
- } else {
- for (i = 0; i < ndevs; i++) {
- for (j = 0; j < dev_stripes; ++j) {
- int s = i * dev_stripes + j;
-
- device = devices_info[i].dev;
- map->stripes[s].dev = device;
- map->stripes[s].physical =
- devices_info[i].dev_offset +
- j * stripe_size;
- stripe = &chunk->stripe + s;
- btrfs_set_stack_stripe_devid(stripe,
- device->devid);
- btrfs_set_stack_stripe_offset(stripe,
- devices_info[i].dev_offset +
- j * stripe_size);
- memcpy(stripe->dev_uuid, device->uuid,
- BTRFS_UUID_SIZE);
- }
+ map->stripes[s].dev = devices_info[i].dev;
+ map->stripes[s].physical = devices_info[i].dev_offset +
+ j * stripe_size;
}
}
map->stripe_len = BTRFS_STRIPE_LEN;
@@ -1103,60 +1026,236 @@ alloc_chunk:
map->type = type;
map->sub_stripes = sub_stripes;
map->sector_size = info->sectorsize;
- map->ce.start = key.offset;
+ map->ce.start = offset;
map->ce.size = *num_bytes;
+ kfree(devices_info);
+ /*
+ * Insert chunk mapping and block group
+ */
ret = insert_cache_extent(&info->mapping_tree.cache_tree, &map->ce);
if (ret < 0)
goto out_chunk_map;
- /* key was set above */
- btrfs_set_stack_chunk_length(chunk, *num_bytes);
- btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
- btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
- btrfs_set_stack_chunk_type(chunk, type);
- btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
- btrfs_set_stack_chunk_io_align(chunk, stripe_len);
- btrfs_set_stack_chunk_io_width(chunk, stripe_len);
- btrfs_set_stack_chunk_sector_size(chunk, info->sectorsize);
- btrfs_set_stack_chunk_sub_stripes(chunk, sub_stripes);
+ ret = btrfs_make_block_group(trans, info, 0, type, offset,
+ *num_bytes);
+ *start = offset;
+ return ret;
+
+out_chunk_map:
+ kfree(map);
+out_devices_info:
+ kfree(devices_info);
+ return ret;
+}
+
+/*
+ * Alloc a chunk mapping for convert.
+ * Convert needs a special SINGLE chunk whose logical bytenr is the same as its
+ * physical bytenr.
+ * Chunk start bytenr and size are specified by @start and @num_bytes.
+ *
+ * Just like __btrfs_alloc_chunk(), this only handles the chunk mapping and
+ * block group item.
+ */
+static int __btrfs_alloc_convert_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 start,
+ u64 num_bytes, u64 type)
+{
+ struct list_head *dev_list = &fs_info->fs_devices->devices;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ int ret;
+
+ /* For convert, profile must be SINGLE */
+ if (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ error("convert only suports SINGLE profile");
+ return -EINVAL;
+ }
+ if (!IS_ALIGNED(start, fs_info->sectorsize)) {
+ error("chunk start not aligned, start=%llu sectorsize=%u",
+ start, fs_info->sectorsize);
+ return -EINVAL;
+ }
+ if (!IS_ALIGNED(num_bytes, fs_info->sectorsize)) {
+ error("chunk size not aligned, size=%llu sectorsize=%u",
+ num_bytes, fs_info->sectorsize);
+ return -EINVAL;
+ }
+ if (list_empty(dev_list)) {
+ error("no writable device");
+ return -ENODEV;
+ }
+
+ device = list_entry(dev_list->next, struct btrfs_device, dev_list);
+ map = malloc(btrfs_map_lookup_size(1));
+ if (!map)
+ return -ENOMEM;
+ map->num_stripes = 1;
+ map->stripes[0].dev = device;
+ map->stripes[0].physical = start;
+ map->stripe_len = BTRFS_STRIPE_LEN;
+ map->io_align = BTRFS_STRIPE_LEN;
+ map->io_width = BTRFS_STRIPE_LEN;
+ map->type = type;
+ map->sub_stripes = 1;
+ map->sector_size = fs_info->sectorsize;
+ map->ce.start = start;
+ map->ce.size = num_bytes;
+
+ ret = insert_cache_extent(&fs_info->mapping_tree.cache_tree, &map->ce);
+ if (ret < 0)
+ goto error;
+ ret = btrfs_make_block_group(trans, fs_info, 0, type, start, num_bytes);
+ return ret;
+error:
+ free(map);
+ return ret;
+}
+
+/*
+ * Finish the chunk allocation by inserting the needed chunk item and device
+ * extents, and updating the device used bytes.
+ */
+static int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 chunk_start, u64 chunk_size)
+{
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *chunk_root = fs_info->chunk_root;
+ struct btrfs_key key;
+ struct btrfs_device *device;
+ struct btrfs_chunk *chunk;
+ struct btrfs_stripe *stripe;
+ struct cache_extent *ce;
+ struct map_lookup *map;
+ size_t item_size;
+ u64 dev_offset;
+ u64 stripe_size;
+ int i = 0;
+ int ret = 0;
+
+ ce = search_cache_extent(&fs_info->mapping_tree.cache_tree, chunk_start);
+ if (!ce)
+ return -ENOENT;
+
+ map = container_of(ce, struct map_lookup, ce);
+ item_size = btrfs_chunk_item_size(map->num_stripes);
+ stripe_size = calc_stripe_length(map->type, map->ce.size,
+ map->num_stripes);
+
+ chunk = kzalloc(item_size, GFP_NOFS);
+ if (!chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
/*
- * Insert chunk item and chunk mapping.
+ * Take the device list mutex to prevent races with the final phase of
+ * a device replace operation that replaces the device object associated
+ * with the map's stripes, because the device object's id can change
+ * at any time during that final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()).
*/
- ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
- btrfs_chunk_item_size(num_stripes));
- BUG_ON(ret);
- *start = key.offset;
+ /* mutex_lock(&fs_info->fs_devices->device_list_mutex); */
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
- if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ret = btrfs_add_system_chunk(info, &key,
- chunk, btrfs_chunk_item_size(num_stripes));
- if (ret < 0)
- goto out_chunk;
+ device->bytes_used += stripe_size;
+ ret = btrfs_update_device(trans, device);
+ if (ret)
+ break;
+ ret = btrfs_alloc_dev_extent(trans, device, chunk_start,
+ dev_offset, stripe_size);
+ if (ret)
+ break;
+ }
+ if (ret) {
+ /* mutex_unlock(&fs_info->fs_devices->device_list_mutex); */
+ goto out;
}
+ stripe = &chunk->stripe;
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
+
+ btrfs_set_stack_stripe_devid(stripe, device->devid);
+ btrfs_set_stack_stripe_offset(stripe, dev_offset);
+ memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
+ stripe++;
+ }
+ /* mutex_unlock(&fs_info->fs_devices->device_list_mutex); */
+
+ btrfs_set_stack_chunk_length(chunk, chunk_size);
+ btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+ btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_type(chunk, map->type);
+ btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
+ btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
+ btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = chunk_start;
+
+ ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
+ if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ /*
+ * TODO: Cleanup of inserted chunk root in case of
+ * failure.
+ */
+ ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
+ }
+
+out:
kfree(chunk);
+ return ret;
+}
+
+/*
+ * Alloc a chunk.
+ * Does all the needed work, including searching for free device extents and
+ * inserting the chunk mapping, chunk item, block group item and device extents.
+ *
+ * @start: return value of allocated chunk start bytenr.
+ * @num_bytes: return value of allocated chunk size
+ * @type: chunk type (including both profile and type)
+ * @convert: if the chunk is allocated for the convert case.
+ * If @convert is true, the chunk allocator will skip the device extent
+ * search and use *start and *num_bytes as the chunk start/num_bytes
+ * and device offset, to build a 1:1 chunk mapping for convert.
+ */
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 *start, u64 *num_bytes,
+ u64 type, bool convert)
+{
+ int ret;
/*
- * Insert device extents
+ * Allocate chunk mapping
*/
- ret = btrfs_insert_dev_extents(trans, info, map, stripe_size);
- if (ret < 0)
- goto out_devices_info;
-
- ret = btrfs_make_block_group(trans, info, 0, type, map->ce.start,
- map->ce.size);
- kfree(devices_info);
- return ret;
+ if (convert)
+ ret = __btrfs_alloc_convert_chunk(trans, fs_info, *start,
+ *num_bytes, type);
+ else
+ ret = __btrfs_alloc_chunk(trans, fs_info, start, num_bytes,
+ type);
+ if (ret < 0) {
+ error("failed to allocate chunk mapping: %s", strerror(-ret));
+ return ret;
+ }
-out_chunk_map:
- kfree(map);
-out_chunk:
- kfree(chunk);
-out_devices_info:
- kfree(devices_info);
+ /*
+ * Insert the remaining parts (the various on-disk items)
+ */
+ ret = btrfs_finish_chunk_alloc(trans, fs_info, *start, *num_bytes);
+ if (ret < 0)
+ error("failed to finish chunk allocation: %s", strerror(-ret));
return ret;
}
Kernel uses a delayed chunk allocation behavior for metadata chunks:

KERNEL:
btrfs_alloc_chunk()
|- __btrfs_alloc_chunk():          Only allocate chunk mapping
   |- btrfs_make_block_group():    Add corresponding bg to fs_info->new_bgs

Then at transaction commit time, it finishes the remaining work:

btrfs_start_dirty_block_groups():
|- btrfs_create_pending_block_groups()
   |- btrfs_insert_item():         To insert block group item
   |- btrfs_finish_chunk_alloc():  Insert chunk items/dev extents

Although btrfs-progs btrfs_alloc_chunk() does all the work in one function,
it can still benefit from a function refactor like:

btrfs-progs:
btrfs_alloc_chunk():               Wrapper for both normal and convert chunks
|- __btrfs_alloc_chunk():          Only alloc chunk mapping
|  |- btrfs_make_block_group():    <<Insert bg items into extent tree>>
|- btrfs_finish_chunk_alloc():     Insert chunk items/dev extents

With such a refactor, the following functions can now share most of their
code with the kernel:
__btrfs_alloc_chunk()
btrfs_finish_chunk_alloc()
btrfs_alloc_dev_extent()

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 volumes.c | 421 ++++++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 260 insertions(+), 161 deletions(-)
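
For reference, a minimal sketch of how callers might use the refactored entry
point. Only the btrfs_alloc_chunk() signature and the convert constraints come
from this patch; the caller names, chunk types, and surrounding code are
illustrative assumptions, not part of the change:

/* Hypothetical callers, for illustration only (assumes btrfs-progs headers). */
static int example_alloc_metadata_chunk(struct btrfs_trans_handle *trans,
					struct btrfs_fs_info *fs_info)
{
	u64 start;
	u64 num_bytes;

	/*
	 * Normal case: the allocator picks location and size itself;
	 * @start and @num_bytes are pure return values here.
	 */
	return btrfs_alloc_chunk(trans, fs_info, &start, &num_bytes,
				 BTRFS_BLOCK_GROUP_METADATA, false);
}

static int example_alloc_convert_chunk(struct btrfs_trans_handle *trans,
				       struct btrfs_fs_info *fs_info,
				       u64 disk_bytenr, u64 len)
{
	/*
	 * Convert case: @start/@num_bytes are inputs describing the 1:1
	 * mapped range (both must be sectorsize aligned), and the profile
	 * must be SINGLE; any profile bits are rejected by
	 * __btrfs_alloc_convert_chunk().
	 */
	return btrfs_alloc_chunk(trans, fs_info, &disk_bytenr, &len,
				 BTRFS_BLOCK_GROUP_DATA, true);
}

In both cases btrfs_finish_chunk_alloc() is invoked internally by
btrfs_alloc_chunk(), so callers never insert chunk items or dev extents
themselves.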