Message ID | 1434375680-4115-4-git-send-email-jeffm@suse.com (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
On Mon, Jun 15, 2015 at 2:41 PM, <jeffm@suse.com> wrote: > From: Jeff Mahoney <jeffm@suse.com> > > Since we now clean up block groups automatically as they become > empty, iterating over block groups is no longer sufficient to discard > unused space. > > This patch iterates over the unused chunk space and discards any regions > that are unallocated, regardless of whether they were ever used. This is > a change for btrfs but is consistent with other file systems. > > We do this in a transactionless manner since the discard process can take > a substantial amount of time and a transaction would need to be started > before the acquisition of the device list lock. That would mean a > transaction would be held open across /all/ of the discards collectively. > In order to prevent other threads from allocating or freeing chunks, we > hold the chunks lock across the search and discard calls. We release it > between searches to allow the file system to perform more-or-less > normally. Since the running transaction can commit and disappear while > we're using the transaction pointer, we take a reference to it and > release it after the search. This is safe since it would happen normally > at the end of the transaction commit after any locks are released anyway. > We also take the commit_root_sem to protect against a transaction starting > and committing while we're running. > > Signed-off-by: Jeff Mahoney <jeffm@suse.com> Reviewed-by: Filipe Manana <fdmanana@suse.com> Tested-by: Filipe Manana <fdmanana@suse.com> Side note, this still doesn't apply cleanly on latest integration branch (integration-4.2), results in warnings about casting pointer from different type (btrfs_trans_handle to btrfs_transaction) at btrfs_shrink_device(). > --- > fs/btrfs/extent-tree.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++ > fs/btrfs/volumes.c | 60 ++++++++++++++++++----------- > fs/btrfs/volumes.h | 3 ++ > 3 files changed, 141 insertions(+), 23 deletions(-) > > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c > index 1e44b93..24b48df 100644 > --- a/fs/btrfs/extent-tree.c > +++ b/fs/btrfs/extent-tree.c > @@ -10143,10 +10143,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) > return unpin_extent_range(root, start, end, false); > } > > +/* > + * It used to be that old block groups would be left around forever. > + * Iterating over them would be enough to trim unused space. Since we > + * now automatically remove them, we also need to iterate over unallocated > + * space. > + * > + * We don't want a transaction for this since the discard may take a > + * substantial amount of time. We don't require that a transaction be > + * running, but we do need to take a running transaction into account > + * to ensure that we're not discarding chunks that were released in > + * the current transaction. > + * > + * Holding the chunks lock will prevent other threads from allocating > + * or releasing chunks, but it won't prevent a running transaction > + * from committing and releasing the memory that the pending chunks > + * list head uses. For that, we need to take a reference to the > + * transaction. > + */ > +static int btrfs_trim_free_extents(struct btrfs_device *device, > + u64 minlen, u64 *trimmed) > +{ > + u64 start = 0, len = 0; > + int ret; > + > + *trimmed = 0; > + > + /* Not writeable = nothing to do. */ > + if (!device->writeable) > + return 0; > + > + /* No free space = nothing to do. */ > + if (device->total_bytes <= device->bytes_used) > + return 0; > + > + ret = 0; > + > + while (1) { > + struct btrfs_fs_info *fs_info = device->dev_root->fs_info; > + struct btrfs_transaction *trans; > + u64 bytes; > + > + ret = mutex_lock_interruptible(&fs_info->chunk_mutex); > + if (ret) > + return ret; > + > + down_read(&fs_info->commit_root_sem); > + > + spin_lock(&fs_info->trans_lock); > + trans = fs_info->running_transaction; > + if (trans) > + atomic_inc(&trans->use_count); > + spin_unlock(&fs_info->trans_lock); > + > + ret = find_free_dev_extent_start(trans, device, minlen, start, > + &start, &len); > + if (trans) > + btrfs_put_transaction(trans); > + > + if (ret) { > + up_read(&fs_info->commit_root_sem); > + mutex_unlock(&fs_info->chunk_mutex); > + if (ret == -ENOSPC) > + ret = 0; > + break; > + } > + > + ret = btrfs_issue_discard(device->bdev, start, len, &bytes); > + up_read(&fs_info->commit_root_sem); > + mutex_unlock(&fs_info->chunk_mutex); > + > + if (ret) > + break; > + > + start += len; > + *trimmed += bytes; > + > + if (fatal_signal_pending(current)) { > + ret = -ERESTARTSYS; > + break; > + } > + > + cond_resched(); > + } > + > + return ret; > +} > + > int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) > { > struct btrfs_fs_info *fs_info = root->fs_info; > struct btrfs_block_group_cache *cache = NULL; > + struct btrfs_device *device; > + struct list_head *devices; > u64 group_trimmed; > u64 start; > u64 end; > @@ -10201,6 +10290,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) > cache = next_block_group(fs_info->tree_root, cache); > } > > + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); > + devices = &root->fs_info->fs_devices->alloc_list; > + list_for_each_entry(device, devices, dev_alloc_list) { > + ret = btrfs_trim_free_extents(device, range->minlen, > + &group_trimmed); > + if (ret) > + break; > + > + trimmed += group_trimmed; > + } > + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); > + > range->len = trimmed; > return ret; > } > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index 174f5e1..7fdde31 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -1051,15 +1051,18 @@ out: > return ret; > } > > -static int contains_pending_extent(struct btrfs_trans_handle *trans, > +static int contains_pending_extent(struct btrfs_transaction *transaction, > struct btrfs_device *device, > u64 *start, u64 len) > { > + struct btrfs_fs_info *fs_info = device->dev_root->fs_info; > struct extent_map *em; > - struct list_head *search_list = &trans->transaction->pending_chunks; > + struct list_head *search_list = &fs_info->pinned_chunks; > int ret = 0; > u64 physical_start = *start; > > + if (transaction) > + search_list = &transaction->pending_chunks; > again: > list_for_each_entry(em, search_list, list) { > struct map_lookup *map; > @@ -1078,8 +1081,8 @@ again: > ret = 1; > } > } > - if (search_list == &trans->transaction->pending_chunks) { > - search_list = &trans->root->fs_info->pinned_chunks; > + if (search_list != &fs_info->pinned_chunks) { > + search_list = &fs_info->pinned_chunks; > goto again; > } > > @@ -1088,12 +1091,13 @@ again: > > > /* > - * find_free_dev_extent - find free space in the specified device > - * @device: the device which we search the free space in > - * @num_bytes: the size of the free space that we need > - * @start: store the start of the free space. > - * @len: the size of the free space. that we find, or the size of the max > - * free space if we don't find suitable free space > + * find_free_dev_extent_start - find free space in the specified device > + * @device: the device which we search the free space in > + * @num_bytes: the size of the free space that we need > + * @search_start: the position from which to begin the search > + * @start: store the start of the free space. > + * @len: the size of the free space. that we find, or the size > + * of the max free space if we don't find suitable free space > * > * this uses a pretty simple search, the expectation is that it is > * called very infrequently and that a given device has a small number > @@ -1107,9 +1111,9 @@ again: > * But if we don't find suitable free space, it is used to store the size of > * the max free space. > */ > -int find_free_dev_extent(struct btrfs_trans_handle *trans, > - struct btrfs_device *device, u64 num_bytes, > - u64 *start, u64 *len) > +int find_free_dev_extent_start(struct btrfs_transaction *transaction, > + struct btrfs_device *device, u64 num_bytes, > + u64 search_start, u64 *start, u64 *len) > { > struct btrfs_key key; > struct btrfs_root *root = device->dev_root; > @@ -1119,19 +1123,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, > u64 max_hole_start; > u64 max_hole_size; > u64 extent_end; > - u64 search_start; > u64 search_end = device->total_bytes; > int ret; > int slot; > struct extent_buffer *l; > > - /* FIXME use last free of some kind */ > - > - /* we don't want to overwrite the superblock on the drive, > - * so we make sure to start at an offset of at least 1MB > - */ > - search_start = max(root->fs_info->alloc_start, 1024ull * 1024); > - > path = btrfs_alloc_path(); > if (!path) > return -ENOMEM; > @@ -1192,7 +1188,7 @@ again: > * Have to check before we set max_hole_start, otherwise > * we could end up sending back this offset anyway. > */ > - if (contains_pending_extent(trans, device, > + if (contains_pending_extent(transaction, device, > &search_start, > hole_size)) { > if (key.offset >= search_start) { > @@ -1241,7 +1237,7 @@ next: > if (search_end > search_start) { > hole_size = search_end - search_start; > > - if (contains_pending_extent(trans, device, &search_start, > + if (contains_pending_extent(transaction, device, &search_start, > hole_size)) { > btrfs_release_path(path); > goto again; > @@ -1267,6 +1263,24 @@ out: > return ret; > } > > +int find_free_dev_extent(struct btrfs_trans_handle *trans, > + struct btrfs_device *device, u64 num_bytes, > + u64 *start, u64 *len) > +{ > + struct btrfs_root *root = device->dev_root; > + u64 search_start; > + > + /* FIXME use last free of some kind */ > + > + /* > + * we don't want to overwrite the superblock on the drive, > + * so we make sure to start at an offset of at least 1MB > + */ > + search_start = max(root->fs_info->alloc_start, 1024ull * 1024); > + return find_free_dev_extent_start(trans->transaction, device, > + num_bytes, search_start, start, len); > +} > + > static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, > struct btrfs_device *device, > u64 start, u64 *dev_extent_len) > diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h > index ebc3133..30918a8 100644 > --- a/fs/btrfs/volumes.h > +++ b/fs/btrfs/volumes.h > @@ -449,6 +449,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); > int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); > int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); > int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); > +int find_free_dev_extent_start(struct btrfs_transaction *transaction, > + struct btrfs_device *device, u64 num_bytes, > + u64 search_start, u64 *start, u64 *max_avail); > int find_free_dev_extent(struct btrfs_trans_handle *trans, > struct btrfs_device *device, u64 num_bytes, > u64 *start, u64 *max_avail); > -- > 2.4.3 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1e44b93..24b48df 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10143,10 +10143,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) return unpin_extent_range(root, start, end, false); } +/* + * It used to be that old block groups would be left around forever. + * Iterating over them would be enough to trim unused space. Since we + * now automatically remove them, we also need to iterate over unallocated + * space. + * + * We don't want a transaction for this since the discard may take a + * substantial amount of time. We don't require that a transaction be + * running, but we do need to take a running transaction into account + * to ensure that we're not discarding chunks that were released in + * the current transaction. + * + * Holding the chunks lock will prevent other threads from allocating + * or releasing chunks, but it won't prevent a running transaction + * from committing and releasing the memory that the pending chunks + * list head uses. For that, we need to take a reference to the + * transaction. + */ +static int btrfs_trim_free_extents(struct btrfs_device *device, + u64 minlen, u64 *trimmed) +{ + u64 start = 0, len = 0; + int ret; + + *trimmed = 0; + + /* Not writeable = nothing to do. */ + if (!device->writeable) + return 0; + + /* No free space = nothing to do. */ + if (device->total_bytes <= device->bytes_used) + return 0; + + ret = 0; + + while (1) { + struct btrfs_fs_info *fs_info = device->dev_root->fs_info; + struct btrfs_transaction *trans; + u64 bytes; + + ret = mutex_lock_interruptible(&fs_info->chunk_mutex); + if (ret) + return ret; + + down_read(&fs_info->commit_root_sem); + + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; + if (trans) + atomic_inc(&trans->use_count); + spin_unlock(&fs_info->trans_lock); + + ret = find_free_dev_extent_start(trans, device, minlen, start, + &start, &len); + if (trans) + btrfs_put_transaction(trans); + + if (ret) { + up_read(&fs_info->commit_root_sem); + mutex_unlock(&fs_info->chunk_mutex); + if (ret == -ENOSPC) + ret = 0; + break; + } + + ret = btrfs_issue_discard(device->bdev, start, len, &bytes); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&fs_info->chunk_mutex); + + if (ret) + break; + + start += len; + *trimmed += bytes; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + return ret; +} + int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; + struct btrfs_device *device; + struct list_head *devices; u64 group_trimmed; u64 start; u64 end; @@ -10201,6 +10290,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) cache = next_block_group(fs_info->tree_root, cache); } + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + devices = &root->fs_info->fs_devices->alloc_list; + list_for_each_entry(device, devices, dev_alloc_list) { + ret = btrfs_trim_free_extents(device, range->minlen, + &group_trimmed); + if (ret) + break; + + trimmed += group_trimmed; + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + range->len = trimmed; return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 174f5e1..7fdde31 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1051,15 +1051,18 @@ out: return ret; } -static int contains_pending_extent(struct btrfs_trans_handle *trans, +static int contains_pending_extent(struct btrfs_transaction *transaction, struct btrfs_device *device, u64 *start, u64 len) { + struct btrfs_fs_info *fs_info = device->dev_root->fs_info; struct extent_map *em; - struct list_head *search_list = &trans->transaction->pending_chunks; + struct list_head *search_list = &fs_info->pinned_chunks; int ret = 0; u64 physical_start = *start; + if (transaction) + search_list = &transaction->pending_chunks; again: list_for_each_entry(em, search_list, list) { struct map_lookup *map; @@ -1078,8 +1081,8 @@ again: ret = 1; } } - if (search_list == &trans->transaction->pending_chunks) { - search_list = &trans->root->fs_info->pinned_chunks; + if (search_list != &fs_info->pinned_chunks) { + search_list = &fs_info->pinned_chunks; goto again; } @@ -1088,12 +1091,13 @@ again: /* - * find_free_dev_extent - find free space in the specified device - * @device: the device which we search the free space in - * @num_bytes: the size of the free space that we need - * @start: store the start of the free space. - * @len: the size of the free space. that we find, or the size of the max - * free space if we don't find suitable free space + * find_free_dev_extent_start - find free space in the specified device + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that we need + * @search_start: the position from which to begin the search + * @start: store the start of the free space. + * @len: the size of the free space. that we find, or the size + * of the max free space if we don't find suitable free space * * this uses a pretty simple search, the expectation is that it is * called very infrequently and that a given device has a small number @@ -1107,9 +1111,9 @@ again: * But if we don't find suitable free space, it is used to store the size of * the max free space. */ -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *len) +int find_free_dev_extent_start(struct btrfs_transaction *transaction, + struct btrfs_device *device, u64 num_bytes, + u64 search_start, u64 *start, u64 *len) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; @@ -1119,19 +1123,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, u64 max_hole_start; u64 max_hole_size; u64 extent_end; - u64 search_start; u64 search_end = device->total_bytes; int ret; int slot; struct extent_buffer *l; - /* FIXME use last free of some kind */ - - /* we don't want to overwrite the superblock on the drive, - * so we make sure to start at an offset of at least 1MB - */ - search_start = max(root->fs_info->alloc_start, 1024ull * 1024); - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1192,7 +1188,7 @@ again: * Have to check before we set max_hole_start, otherwise * we could end up sending back this offset anyway. */ - if (contains_pending_extent(trans, device, + if (contains_pending_extent(transaction, device, &search_start, hole_size)) { if (key.offset >= search_start) { @@ -1241,7 +1237,7 @@ next: if (search_end > search_start) { hole_size = search_end - search_start; - if (contains_pending_extent(trans, device, &search_start, + if (contains_pending_extent(transaction, device, &search_start, hole_size)) { btrfs_release_path(path); goto again; @@ -1267,6 +1263,24 @@ out: return ret; } +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *len) +{ + struct btrfs_root *root = device->dev_root; + u64 search_start; + + /* FIXME use last free of some kind */ + + /* + * we don't want to overwrite the superblock on the drive, + * so we make sure to start at an offset of at least 1MB + */ + search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + return find_free_dev_extent_start(trans->transaction, device, + num_bytes, search_start, start, len); +} + static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 start, u64 *dev_extent_len) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ebc3133..30918a8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -449,6 +449,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +int find_free_dev_extent_start(struct btrfs_transaction *transaction, + struct btrfs_device *device, u64 num_bytes, + u64 search_start, u64 *start, u64 *max_avail); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail);