[v2] btrfs: iterate over unused chunk space in FITRIM
diff mbox

Message ID 554909A5.4030307@suse.com
State Superseded
Headers show

Commit Message

Jeff Mahoney May 5, 2015, 6:19 p.m. UTC
Since we now clean up block groups automatically as they become
empty, iterating over block groups is no longer sufficient to discard
unused space.

This patch iterates over the unused chunk space and discards any regions
that are unallocated, regardless of whether they were ever used.  This is
a change for btrfs but is consistent with other file systems.

We do this in a transactionless manner since the discard process can take
a substantial amount of time and a transaction would need to be started
before the acquisition of the device list lock.  That would mean a
transaction would be held open across /all/ of the discards collectively.
In order to prevent other threads from allocating or freeing chunks, we
hold the chunks lock across the search and discard calls.  We release it
between searches to allow the file system to perform more-or-less
normally.  Since the running transaction can commit and disappear while
we're using the transaction pointer, we take a reference to it and
release it after the search.  This is safe since it would happen normally
at the end of the transaction commit after any locks are released anyway.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
---
 fs/btrfs/extent-tree.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.c     | 61 ++++++++++++++++++++------------
 fs/btrfs/volumes.h     |  3 ++
 3 files changed, 137 insertions(+), 23 deletions(-)

Comments

Filipe Manana May 8, 2015, 8:38 p.m. UTC | #1
On Tue, May 5, 2015 at 7:19 PM, Jeff Mahoney <jeffm@suse.com> wrote:
> Since we now clean up block groups automatically as they become
> empty, iterating over block groups is no longer sufficient to discard
> unused space.
>
> This patch iterates over the unused chunk space and discards any regions
> that are unallocated, regardless of whether they were ever used.  This is
> a change for btrfs but is consistent with other file systems.
>
> We do this in a transactionless manner since the discard process can take
> a substantial amount of time and a transaction would need to be started
> before the acquisition of the device list lock.  That would mean a
> transaction would be held open across /all/ of the discards collectively.
> In order to prevent other threads from allocating or freeing chunks, we
> hold the chunks lock across the search and discard calls.  We release it
> between searches to allow the file system to perform more-or-less
> normally.  Since the running transaction can commit and disappear while
> we're using the transaction pointer, we take a reference to it and
> release it after the search.  This is safe since it would happen normally
> at the end of the transaction commit after any locks are released anyway.
>
> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
> ---

Hi Jeff,

Missing changelog describing what changed between v1 and v2.

>  fs/btrfs/extent-tree.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/volumes.c     | 61 ++++++++++++++++++++------------
>  fs/btrfs/volumes.h     |  3 ++
>  3 files changed, 137 insertions(+), 23 deletions(-)
>
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 0ec8e22..6d1d74d 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -10031,10 +10031,94 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
>         return unpin_extent_range(root, start, end, false);
>  }
>
> +/*
> + * It used to be that old block groups would be left around forever.
> + * Iterating over them would be enough to trim unused space.  Since we
> + * now automatically remove them, we also need to iterate over unallocated
> + * space.
> + *
> + * We don't want a transaction for this since the discard may take a
> + * substantial amount of time.  We don't require that a transaction be
> + * running, but we do need to take a running transaction into account
> + * to ensure that we're not discarding chunks that were released in
> + * the current transaction.
> + *
> + * Holding the chunks lock will prevent other threads from allocating
> + * or releasing chunks, but it won't prevent a running transaction
> + * from committing and releasing the memory that the pending chunks
> + * list head uses.  For that, we need to take a reference to the
> + * transaction.
> + */
> +static int btrfs_trim_free_extents(struct btrfs_device *device,
> +                                  u64 minlen, u64 *trimmed)
> +{
> +       u64 start = 0, len = 0;
> +       int ret;
> +
> +       *trimmed = 0;
> +
> +       /* Not writeable = nothing to do. */
> +       if (!device->writeable)
> +               return 0;
> +
> +       /* No free space = nothing to do. */
> +       if (device->total_bytes <= device->bytes_used)
> +               return 0;
> +
> +       ret = 0;
> +
> +       while (1) {
> +               struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
> +               struct btrfs_transaction *trans;
> +
> +               ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
> +               if (ret)
> +                       return ret;
> +
> +               spin_lock(&fs_info->trans_lock);
> +               trans = fs_info->running_transaction;
> +               if (trans)
> +                       atomic_inc(&trans->use_count);
> +               spin_unlock(&fs_info->trans_lock);
> +
> +               ret = find_free_dev_extent_start(trans, device, minlen, start,
> +                                                &start, &len);
> +               if (trans)
> +                       btrfs_put_transaction(trans);
> +
> +               if (ret) {
> +                       mutex_unlock(&fs_info->chunk_mutex);
> +                       if (ret == -ENOSPC)
> +                               ret = 0;
> +                       break;
> +               }
> +
> +               ret = btrfs_issue_discard(device->bdev, start, len);
> +               mutex_unlock(&fs_info->chunk_mutex);
> +
> +               if (ret)
> +                       break;
> +
> +               start += len;
> +               *trimmed += len;
> +
> +               if (fatal_signal_pending(current)) {
> +                       ret = -ERESTARTSYS;
> +                       break;
> +               }
> +
> +               cond_resched();
> +       }
> +
> +       return ret;
> +}
> +
>  int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
>  {
>         struct btrfs_fs_info *fs_info = root->fs_info;
>         struct btrfs_block_group_cache *cache = NULL;
> +       struct btrfs_device *device;
> +       struct list_head *devices;
>         u64 group_trimmed;
>         u64 start;
>         u64 end;
> @@ -10089,6 +10173,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
>                 cache = next_block_group(fs_info->tree_root, cache);
>         }
>
> +       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
> +       devices = &root->fs_info->fs_devices->alloc_list;
> +       list_for_each_entry(device, devices, dev_alloc_list) {
> +               ret = btrfs_trim_free_extents(device, range->minlen,
> +                                             &group_trimmed);
> +               if (ret)
> +                       break;
> +
> +               trimmed += group_trimmed;
> +       }
> +       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
> +
>         range->len = trimmed;
>         return ret;
>  }
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 96aebf3..ada8965 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1051,15 +1051,19 @@ out:
>         return ret;
>  }
>
> -static int contains_pending_extent(struct btrfs_trans_handle *trans,
> +static int contains_pending_extent(struct btrfs_transaction *transaction,
>                                    struct btrfs_device *device,
>                                    u64 *start, u64 len)
>  {
> +       struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
>         struct extent_map *em;
> -       struct list_head *search_list = &trans->transaction->pending_chunks;
> +       struct list_head *search_list = &transaction->pending_chunks;

transaction can be NULL so we need to check for that first (as you do below).

>         int ret = 0;
>         u64 physical_start = *start;
>
> +       if (!transaction)
> +               search_list = &fs_info->pinned_chunks;
> +
>  again:
>         list_for_each_entry(em, search_list, list) {
>                 struct map_lookup *map;
> @@ -1078,8 +1082,8 @@ again:
>                         ret = 1;
>                 }
>         }
> -       if (search_list == &trans->transaction->pending_chunks) {
> -               search_list = &trans->root->fs_info->pinned_chunks;
> +       if (search_list == &transaction->pending_chunks) {
> +               search_list = &fs_info->pinned_chunks;
>                 goto again;
>         }
>
> @@ -1088,12 +1092,13 @@ again:
>
>
>  /*
> - * find_free_dev_extent - find free space in the specified device
> - * @device:    the device which we search the free space in
> - * @num_bytes: the size of the free space that we need
> - * @start:     store the start of the free space.
> - * @len:       the size of the free space. that we find, or the size of the max
> - *             free space if we don't find suitable free space
> + * find_free_dev_extent_start - find free space in the specified device
> + * @device:      the device which we search the free space in
> + * @num_bytes:   the size of the free space that we need
> + * @search_start: the position from which to begin the search
> + * @start:       store the start of the free space.
> + * @len:         the size of the free space. that we find, or the size
> + *               of the max free space if we don't find suitable free space
>   *
>   * this uses a pretty simple search, the expectation is that it is
>   * called very infrequently and that a given device has a small number
> @@ -1107,9 +1112,9 @@ again:
>   * But if we don't find suitable free space, it is used to store the size of
>   * the max free space.
>   */
> -int find_free_dev_extent(struct btrfs_trans_handle *trans,
> -                        struct btrfs_device *device, u64 num_bytes,
> -                        u64 *start, u64 *len)
> +int find_free_dev_extent_start(struct btrfs_transaction *transaction,
> +                              struct btrfs_device *device, u64 num_bytes,
> +                              u64 search_start, u64 *start, u64 *len)
>  {
>         struct btrfs_key key;
>         struct btrfs_root *root = device->dev_root;
> @@ -1119,19 +1124,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
>         u64 max_hole_start;
>         u64 max_hole_size;
>         u64 extent_end;
> -       u64 search_start;
>         u64 search_end = device->total_bytes;
>         int ret;
>         int slot;
>         struct extent_buffer *l;
>
> -       /* FIXME use last free of some kind */
> -
> -       /* we don't want to overwrite the superblock on the drive,
> -        * so we make sure to start at an offset of at least 1MB
> -        */
> -       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
> -
>         path = btrfs_alloc_path();
>         if (!path)
>                 return -ENOMEM;
> @@ -1192,7 +1189,7 @@ again:
>                          * Have to check before we set max_hole_start, otherwise
>                          * we could end up sending back this offset anyway.
>                          */
> -                       if (contains_pending_extent(trans, device,
> +                       if (contains_pending_extent(transaction, device,
>                                                     &search_start,
>                                                     hole_size)) {
>                                 if (key.offset >= search_start) {
> @@ -1241,7 +1238,7 @@ next:
>         if (search_end > search_start) {
>                 hole_size = search_end - search_start;
>
> -               if (contains_pending_extent(trans, device, &search_start,
> +               if (contains_pending_extent(transaction, device, &search_start,
>                                             hole_size)) {
>                         btrfs_release_path(path);
>                         goto again;
> @@ -1267,6 +1264,24 @@ out:
>         return ret;
>  }
>
> +int find_free_dev_extent(struct btrfs_trans_handle *trans,
> +                        struct btrfs_device *device, u64 num_bytes,
> +                        u64 *start, u64 *len)
> +{
> +       struct btrfs_root *root = device->dev_root;
> +       u64 search_start;
> +
> +       /* FIXME use last free of some kind */
> +
> +       /*
> +        * we don't want to overwrite the superblock on the drive,
> +        * so we make sure to start at an offset of at least 1MB
> +        */
> +       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
> +       return find_free_dev_extent_start(trans->transaction, device,
> +                                         num_bytes, search_start, start, len);
> +}
> +
>  static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
>                           struct btrfs_device *device,
>                           u64 start, u64 *dev_extent_len)
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index ebc3133..30918a8 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -449,6 +449,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
>  int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
>  int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
>  int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
> +int find_free_dev_extent_start(struct btrfs_transaction *transaction,
> +                        struct btrfs_device *device, u64 num_bytes,
> +                        u64 search_start, u64 *start, u64 *max_avail);
>  int find_free_dev_extent(struct btrfs_trans_handle *trans,
>                          struct btrfs_device *device, u64 num_bytes,
>                          u64 *start, u64 *max_avail);
> --
> 1.8.5.6
>
>
> --
> Jeff Mahoney
> SUSE Labs
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Filipe Manana May 8, 2015, 9:16 p.m. UTC | #2
On Fri, May 8, 2015 at 9:38 PM, Filipe David Manana <fdmanana@gmail.com> wrote:
> On Tue, May 5, 2015 at 7:19 PM, Jeff Mahoney <jeffm@suse.com> wrote:
>> Since we now clean up block groups automatically as they become
>> empty, iterating over block groups is no longer sufficient to discard
>> unused space.
>>
>> This patch iterates over the unused chunk space and discards any regions
>> that are unallocated, regardless of whether they were ever used.  This is
>> a change for btrfs but is consistent with other file systems.
>>
>> We do this in a transactionless manner since the discard process can take
>> a substantial amount of time and a transaction would need to be started
>> before the acquisition of the device list lock.  That would mean a
>> transaction would be held open across /all/ of the discards collectively.
>> In order to prevent other threads from allocating or freeing chunks, we
>> hold the chunks lock across the search and discard calls.  We release it
>> between searches to allow the file system to perform more-or-less
>> normally.  Since the running transaction can commit and disappear while
>> we're using the transaction pointer, we take a reference to it and
>> release it after the search.  This is safe since it would happen normally
>> at the end of the transaction commit after any locks are released anyway.
>>
>> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
>> ---
>
> Hi Jeff,
>
> Missing changelog describing what changed between v1 and v2.
>
>>  fs/btrfs/extent-tree.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  fs/btrfs/volumes.c     | 61 ++++++++++++++++++++------------
>>  fs/btrfs/volumes.h     |  3 ++
>>  3 files changed, 137 insertions(+), 23 deletions(-)
>>
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index 0ec8e22..6d1d74d 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -10031,10 +10031,94 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
>>         return unpin_extent_range(root, start, end, false);
>>  }
>>
>> +/*
>> + * It used to be that old block groups would be left around forever.
>> + * Iterating over them would be enough to trim unused space.  Since we
>> + * now automatically remove them, we also need to iterate over unallocated
>> + * space.
>> + *
>> + * We don't want a transaction for this since the discard may take a
>> + * substantial amount of time.  We don't require that a transaction be
>> + * running, but we do need to take a running transaction into account
>> + * to ensure that we're not discarding chunks that were released in
>> + * the current transaction.
>> + *
>> + * Holding the chunks lock will prevent other threads from allocating
>> + * or releasing chunks, but it won't prevent a running transaction
>> + * from committing and releasing the memory that the pending chunks
>> + * list head uses.  For that, we need to take a reference to the
>> + * transaction.
>> + */
>> +static int btrfs_trim_free_extents(struct btrfs_device *device,
>> +                                  u64 minlen, u64 *trimmed)
>> +{
>> +       u64 start = 0, len = 0;
>> +       int ret;
>> +
>> +       *trimmed = 0;
>> +
>> +       /* Not writeable = nothing to do. */
>> +       if (!device->writeable)
>> +               return 0;
>> +
>> +       /* No free space = nothing to do. */
>> +       if (device->total_bytes <= device->bytes_used)
>> +               return 0;
>> +
>> +       ret = 0;
>> +
>> +       while (1) {
>> +               struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
>> +               struct btrfs_transaction *trans;
>> +
>> +               ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
>> +               if (ret)
>> +                       return ret;
>> +
>> +               spin_lock(&fs_info->trans_lock);
>> +               trans = fs_info->running_transaction;
>> +               if (trans)
>> +                       atomic_inc(&trans->use_count);
>> +               spin_unlock(&fs_info->trans_lock);
>> +
>> +               ret = find_free_dev_extent_start(trans, device, minlen, start,
>> +                                                &start, &len);
>> +               if (trans)
>> +                       btrfs_put_transaction(trans);
>> +
>> +               if (ret) {
>> +                       mutex_unlock(&fs_info->chunk_mutex);
>> +                       if (ret == -ENOSPC)
>> +                               ret = 0;
>> +                       break;
>> +               }
>> +
>> +               ret = btrfs_issue_discard(device->bdev, start, len);
>> +               mutex_unlock(&fs_info->chunk_mutex);
>> +
>> +               if (ret)
>> +                       break;
>> +
>> +               start += len;
>> +               *trimmed += len;
>> +
>> +               if (fatal_signal_pending(current)) {
>> +                       ret = -ERESTARTSYS;
>> +                       break;
>> +               }
>> +
>> +               cond_resched();
>> +       }
>> +
>> +       return ret;
>> +}
>> +
>>  int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
>>  {
>>         struct btrfs_fs_info *fs_info = root->fs_info;
>>         struct btrfs_block_group_cache *cache = NULL;
>> +       struct btrfs_device *device;
>> +       struct list_head *devices;
>>         u64 group_trimmed;
>>         u64 start;
>>         u64 end;
>> @@ -10089,6 +10173,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
>>                 cache = next_block_group(fs_info->tree_root, cache);
>>         }
>>
>> +       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
>> +       devices = &root->fs_info->fs_devices->alloc_list;
>> +       list_for_each_entry(device, devices, dev_alloc_list) {
>> +               ret = btrfs_trim_free_extents(device, range->minlen,
>> +                                             &group_trimmed);
>> +               if (ret)
>> +                       break;
>> +
>> +               trimmed += group_trimmed;
>> +       }
>> +       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
>> +
>>         range->len = trimmed;
>>         return ret;
>>  }
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 96aebf3..ada8965 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -1051,15 +1051,19 @@ out:
>>         return ret;
>>  }
>>
>> -static int contains_pending_extent(struct btrfs_trans_handle *trans,
>> +static int contains_pending_extent(struct btrfs_transaction *transaction,
>>                                    struct btrfs_device *device,
>>                                    u64 *start, u64 len)
>>  {
>> +       struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
>>         struct extent_map *em;
>> -       struct list_head *search_list = &trans->transaction->pending_chunks;
>> +       struct list_head *search_list = &transaction->pending_chunks;
>
> transaction can be NULL so we need to check for that first (as you do below).
>
>>         int ret = 0;
>>         u64 physical_start = *start;
>>
>> +       if (!transaction)
>> +               search_list = &fs_info->pinned_chunks;
>> +
>>  again:
>>         list_for_each_entry(em, search_list, list) {
>>                 struct map_lookup *map;
>> @@ -1078,8 +1082,8 @@ again:
>>                         ret = 1;
>>                 }
>>         }
>> -       if (search_list == &trans->transaction->pending_chunks) {
>> -               search_list = &trans->root->fs_info->pinned_chunks;
>> +       if (search_list == &transaction->pending_chunks) {

And here missing the null test too, which should be something like:

   if (transaction && search_list == &transaction->pending_chunks) {


>> +               search_list = &fs_info->pinned_chunks;
>>                 goto again;
>>         }
>>
>> @@ -1088,12 +1092,13 @@ again:
>>
>>
>>  /*
>> - * find_free_dev_extent - find free space in the specified device
>> - * @device:    the device which we search the free space in
>> - * @num_bytes: the size of the free space that we need
>> - * @start:     store the start of the free space.
>> - * @len:       the size of the free space. that we find, or the size of the max
>> - *             free space if we don't find suitable free space
>> + * find_free_dev_extent_start - find free space in the specified device
>> + * @device:      the device which we search the free space in
>> + * @num_bytes:   the size of the free space that we need
>> + * @search_start: the position from which to begin the search
>> + * @start:       store the start of the free space.
>> + * @len:         the size of the free space. that we find, or the size
>> + *               of the max free space if we don't find suitable free space
>>   *
>>   * this uses a pretty simple search, the expectation is that it is
>>   * called very infrequently and that a given device has a small number
>> @@ -1107,9 +1112,9 @@ again:
>>   * But if we don't find suitable free space, it is used to store the size of
>>   * the max free space.
>>   */
>> -int find_free_dev_extent(struct btrfs_trans_handle *trans,
>> -                        struct btrfs_device *device, u64 num_bytes,
>> -                        u64 *start, u64 *len)
>> +int find_free_dev_extent_start(struct btrfs_transaction *transaction,
>> +                              struct btrfs_device *device, u64 num_bytes,
>> +                              u64 search_start, u64 *start, u64 *len)
>>  {
>>         struct btrfs_key key;
>>         struct btrfs_root *root = device->dev_root;
>> @@ -1119,19 +1124,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
>>         u64 max_hole_start;
>>         u64 max_hole_size;
>>         u64 extent_end;
>> -       u64 search_start;
>>         u64 search_end = device->total_bytes;
>>         int ret;
>>         int slot;
>>         struct extent_buffer *l;
>>
>> -       /* FIXME use last free of some kind */
>> -
>> -       /* we don't want to overwrite the superblock on the drive,
>> -        * so we make sure to start at an offset of at least 1MB
>> -        */
>> -       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
>> -
>>         path = btrfs_alloc_path();
>>         if (!path)
>>                 return -ENOMEM;
>> @@ -1192,7 +1189,7 @@ again:
>>                          * Have to check before we set max_hole_start, otherwise
>>                          * we could end up sending back this offset anyway.
>>                          */
>> -                       if (contains_pending_extent(trans, device,
>> +                       if (contains_pending_extent(transaction, device,
>>                                                     &search_start,
>>                                                     hole_size)) {
>>                                 if (key.offset >= search_start) {
>> @@ -1241,7 +1238,7 @@ next:
>>         if (search_end > search_start) {
>>                 hole_size = search_end - search_start;
>>
>> -               if (contains_pending_extent(trans, device, &search_start,
>> +               if (contains_pending_extent(transaction, device, &search_start,
>>                                             hole_size)) {
>>                         btrfs_release_path(path);
>>                         goto again;
>> @@ -1267,6 +1264,24 @@ out:
>>         return ret;
>>  }
>>
>> +int find_free_dev_extent(struct btrfs_trans_handle *trans,
>> +                        struct btrfs_device *device, u64 num_bytes,
>> +                        u64 *start, u64 *len)
>> +{
>> +       struct btrfs_root *root = device->dev_root;
>> +       u64 search_start;
>> +
>> +       /* FIXME use last free of some kind */
>> +
>> +       /*
>> +        * we don't want to overwrite the superblock on the drive,
>> +        * so we make sure to start at an offset of at least 1MB
>> +        */
>> +       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
>> +       return find_free_dev_extent_start(trans->transaction, device,
>> +                                         num_bytes, search_start, start, len);
>> +}
>> +
>>  static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
>>                           struct btrfs_device *device,
>>                           u64 start, u64 *dev_extent_len)
>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>> index ebc3133..30918a8 100644
>> --- a/fs/btrfs/volumes.h
>> +++ b/fs/btrfs/volumes.h
>> @@ -449,6 +449,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
>>  int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
>>  int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
>>  int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
>> +int find_free_dev_extent_start(struct btrfs_transaction *transaction,
>> +                        struct btrfs_device *device, u64 num_bytes,
>> +                        u64 search_start, u64 *start, u64 *max_avail);
>>  int find_free_dev_extent(struct btrfs_trans_handle *trans,
>>                          struct btrfs_device *device, u64 num_bytes,
>>                          u64 *start, u64 *max_avail);
>> --
>> 1.8.5.6
>>
>>
>> --
>> Jeff Mahoney
>> SUSE Labs
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
>
> --
> Filipe David Manana,
>
> "Reasonable men adapt themselves to the world.
>  Unreasonable men adapt the world to themselves.
>  That's why all progress depends on unreasonable men."
Jeff Mahoney May 8, 2015, 10:17 p.m. UTC | #3
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 5/8/15 4:38 PM, Filipe David Manana wrote:
> On Tue, May 5, 2015 at 7:19 PM, Jeff Mahoney <jeffm@suse.com>
> wrote:
>> Since we now clean up block groups automatically as they become 
>> empty, iterating over block groups is no longer sufficient to
>> discard unused space.
>> 
>> This patch iterates over the unused chunk space and discards any
>> regions that are unallocated, regardless of whether they were
>> ever used.  This is a change for btrfs but is consistent with
>> other file systems.
>> 
>> We do this in a transactionless manner since the discard process
>> can take a substantial amount of time and a transaction would
>> need to be started before the acquisition of the device list
>> lock.  That would mean a transaction would be held open across
>> /all/ of the discards collectively. In order to prevent other
>> threads from allocating or freeing chunks, we hold the chunks
>> lock across the search and discard calls.  We release it between
>> searches to allow the file system to perform more-or-less 
>> normally.  Since the running transaction can commit and disappear
>> while we're using the transaction pointer, we take a reference to
>> it and release it after the search.  This is safe since it would
>> happen normally at the end of the transaction commit after any
>> locks are released anyway.
>> 
>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> ---
> 
> Hi Jeff,
> 
> Missing changelog describing what changed between v1 and v2.
> 
>> fs/btrfs/extent-tree.c | 96
>> ++++++++++++++++++++++++++++++++++++++++++++++++++ 
>> fs/btrfs/volumes.c     | 61 ++++++++++++++++++++------------ 
>> fs/btrfs/volumes.h     |  3 ++ 3 files changed, 137
>> insertions(+), 23 deletions(-)
>> 
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c 
>> index 0ec8e22..6d1d74d 100644 --- a/fs/btrfs/extent-tree.c +++
>> b/fs/btrfs/extent-tree.c @@ -10031,10 +10031,94 @@ int
>> btrfs_error_unpin_extent_range(struct btrfs_root *root, u64
>> start, u64 end) return unpin_extent_range(root, start, end,
>> false); }
>> 
>> +/* + * It used to be that old block groups would be left around
>> forever. + * Iterating over them would be enough to trim unused
>> space.  Since we + * now automatically remove them, we also need
>> to iterate over unallocated + * space. + * + * We don't want a
>> transaction for this since the discard may take a + * substantial
>> amount of time.  We don't require that a transaction be + *
>> running, but we do need to take a running transaction into
>> account + * to ensure that we're not discarding chunks that were
>> released in + * the current transaction. + * + * Holding the
>> chunks lock will prevent other threads from allocating + * or
>> releasing chunks, but it won't prevent a running transaction + *
>> from committing and releasing the memory that the pending chunks 
>> + * list head uses.  For that, we need to take a reference to
>> the + * transaction. + */ +static int
>> btrfs_trim_free_extents(struct btrfs_device *device, +
>> u64 minlen, u64 *trimmed) +{ +       u64 start = 0, len = 0; +
>> int ret; + +       *trimmed = 0; + +       /* Not writeable =
>> nothing to do. */ +       if (!device->writeable) +
>> return 0; + +       /* No free space = nothing to do. */ +
>> if (device->total_bytes <= device->bytes_used) +
>> return 0; + +       ret = 0; + +       while (1) { +
>> struct btrfs_fs_info *fs_info = device->dev_root->fs_info; +
>> struct btrfs_transaction *trans; + +               ret =
>> mutex_lock_interruptible(&fs_info->chunk_mutex); +
>> if (ret) +                       return ret; + +
>> spin_lock(&fs_info->trans_lock); +               trans =
>> fs_info->running_transaction; +               if (trans) +
>> atomic_inc(&trans->use_count); +
>> spin_unlock(&fs_info->trans_lock); + +               ret =
>> find_free_dev_extent_start(trans, device, minlen, start, +
>> &start, &len); +               if (trans) +
>> btrfs_put_transaction(trans); + +               if (ret) { +
>> mutex_unlock(&fs_info->chunk_mutex); +                       if
>> (ret == -ENOSPC) +                               ret = 0; +
>> break; +               } + +               ret =
>> btrfs_issue_discard(device->bdev, start, len); +
>> mutex_unlock(&fs_info->chunk_mutex); + +               if (ret) +
>> break; + +               start += len; +               *trimmed
>> += len; + +               if (fatal_signal_pending(current)) { +
>> ret = -ERESTARTSYS; +                       break; +
>> } + +               cond_resched(); +       } + +       return
>> ret; +} + int btrfs_trim_fs(struct btrfs_root *root, struct
>> fstrim_range *range) { struct btrfs_fs_info *fs_info =
>> root->fs_info; struct btrfs_block_group_cache *cache = NULL; +
>> struct btrfs_device *device; +       struct list_head *devices; 
>> u64 group_trimmed; u64 start; u64 end; @@ -10089,6 +10173,18 @@
>> int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range
>> *range) cache = next_block_group(fs_info->tree_root, cache); }
>> 
>> +
>> mutex_lock(&root->fs_info->fs_devices->device_list_mutex); +
>> devices = &root->fs_info->fs_devices->alloc_list; +
>> list_for_each_entry(device, devices, dev_alloc_list) { +
>> ret = btrfs_trim_free_extents(device, range->minlen, +
>> &group_trimmed); +               if (ret) +
>> break; + +               trimmed += group_trimmed; +       } +
>> mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + 
>> range->len = trimmed; return ret; } diff --git
>> a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 96aebf3..ada8965
>> 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@
>> -1051,15 +1051,19 @@ out: return ret; }
>> 
>> -static int contains_pending_extent(struct btrfs_trans_handle
>> *trans, +static int contains_pending_extent(struct
>> btrfs_transaction *transaction, struct btrfs_device *device, u64
>> *start, u64 len) { +       struct btrfs_fs_info *fs_info =
>> device->dev_root->fs_info; struct extent_map *em; -       struct
>> list_head *search_list = &trans->transaction->pending_chunks; +
>> struct list_head *search_list = &transaction->pending_chunks;
> 
> transaction can be NULL so we need to check for that first (as you
> do below).

The ordering here is safe. We're not dereferencing the transaction,
we're getting the address of one of the members. search_list will
point to 0x100 or something. We don't ever dereference that without
checking the transaction against NULL.

- -Jeff

>> int ret = 0; u64 physical_start = *start;
>> 
>> +       if (!transaction) +               search_list =
>> &fs_info->pinned_chunks; + again: list_for_each_entry(em,
>> search_list, list) { struct map_lookup *map; @@ -1078,8 +1082,8
>> @@ again: ret = 1; } } -       if (search_list ==
>> &trans->transaction->pending_chunks) { -
>> search_list = &trans->root->fs_info->pinned_chunks; +       if
>> (search_list == &transaction->pending_chunks) { +
>> search_list = &fs_info->pinned_chunks; goto again; }
>> 
>> @@ -1088,12 +1092,13 @@ again:
>> 
>> 
>> /* - * find_free_dev_extent - find free space in the specified
>> device - * @device:    the device which we search the free space
>> in - * @num_bytes: the size of the free space that we need - *
>> @start:     store the start of the free space. - * @len:
>> the size of the free space. that we find, or the size of the max 
>> - *             free space if we don't find suitable free space +
>> * find_free_dev_extent_start - find free space in the specified
>> device + * @device:      the device which we search the free
>> space in + * @num_bytes:   the size of the free space that we
>> need + * @search_start: the position from which to begin the
>> search + * @start:       store the start of the free space. + *
>> @len:         the size of the free space. that we find, or the
>> size + *               of the max free space if we don't find
>> suitable free space * * this uses a pretty simple search, the
>> expectation is that it is * called very infrequently and that a
>> given device has a small number @@ -1107,9 +1112,9 @@ again: *
>> But if we don't find suitable free space, it is used to store the
>> size of * the max free space. */ -int find_free_dev_extent(struct
>> btrfs_trans_handle *trans, -                        struct
>> btrfs_device *device, u64 num_bytes, -                        u64
>> *start, u64 *len) +int find_free_dev_extent_start(struct
>> btrfs_transaction *transaction, +
>> struct btrfs_device *device, u64 num_bytes, +
>> u64 search_start, u64 *start, u64 *len) { struct btrfs_key key; 
>> struct btrfs_root *root = device->dev_root; @@ -1119,19 +1124,11
>> @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, u64
>> max_hole_start; u64 max_hole_size; u64 extent_end; -       u64
>> search_start; u64 search_end = device->total_bytes; int ret; int
>> slot; struct extent_buffer *l;
>> 
>> -       /* FIXME use last free of some kind */ - -       /* we
>> don't want to overwrite the superblock on the drive, -        *
>> so we make sure to start at an offset of at least 1MB -
>> */ -       search_start = max(root->fs_info->alloc_start, 1024ull
>> * 1024); - path = btrfs_alloc_path(); if (!path) return -ENOMEM; 
>> @@ -1192,7 +1189,7 @@ again: * Have to check before we set
>> max_hole_start, otherwise * we could end up sending back this
>> offset anyway. */ -                       if
>> (contains_pending_extent(trans, device, +
>> if (contains_pending_extent(transaction, device, &search_start, 
>> hole_size)) { if (key.offset >= search_start) { @@ -1241,7
>> +1238,7 @@ next: if (search_end > search_start) { hole_size =
>> search_end - search_start;
>> 
>> -               if (contains_pending_extent(trans, device,
>> &search_start, +               if
>> (contains_pending_extent(transaction, device, &search_start, 
>> hole_size)) { btrfs_release_path(path); goto again; @@ -1267,6
>> +1264,24 @@ out: return ret; }
>> 
>> +int find_free_dev_extent(struct btrfs_trans_handle *trans, +
>> struct btrfs_device *device, u64 num_bytes, +
>> u64 *start, u64 *len) +{ +       struct btrfs_root *root =
>> device->dev_root; +       u64 search_start; + +       /* FIXME
>> use last free of some kind */ + +       /* +        * we don't
>> want to overwrite the superblock on the drive, +        * so we
>> make sure to start at an offset of at least 1MB +        */ +
>> search_start = max(root->fs_info->alloc_start, 1024ull * 1024); +
>> return find_free_dev_extent_start(trans->transaction, device, +
>> num_bytes, search_start, start, len); +} + static int
>> btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct
>> btrfs_device *device, u64 start, u64 *dev_extent_len) diff --git
>> a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ebc3133..30918a8
>> 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@
>> -449,6 +449,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info
>> *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info
>> *fs_info); int btrfs_check_uuid_tree(struct btrfs_fs_info
>> *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64
>> chunk_offset); +int find_free_dev_extent_start(struct
>> btrfs_transaction *transaction, +                        struct
>> btrfs_device *device, u64 num_bytes, +                        u64
>> search_start, u64 *start, u64 *max_avail); int
>> find_free_dev_extent(struct btrfs_trans_handle *trans, struct
>> btrfs_device *device, u64 num_bytes, u64 *start, u64
>> *max_avail); -- 1.8.5.6
>> 
>> 
>> -- Jeff Mahoney SUSE Labs -- To unsubscribe from this list: send
>> the line "unsubscribe linux-btrfs" in the body of a message to
>> majordomo@vger.kernel.org More majordomo info at
>> http://vger.kernel.org/majordomo-info.html
> 
> 
> 


- -- 
Jeff Mahoney
SUSE Labs
-----BEGIN PGP SIGNATURE-----
Version: GnuPG/MacGPG2 v2.0.19 (Darwin)

iQIcBAEBAgAGBQJVTTX2AAoJEB57S2MheeWyUGYP/iIl5JchDH1JqFV/AWtdrf4B
huxL1gFYH8hAVj0pr1av7+EnbPp9rPwIFZXanvbJC9KFM3SCGEnF9/IYHeTzxiUl
ZH7UC6E4xnr4xXrgKVT5nhOIDwg5cSdvuBzGfG6HttdL7KxhiHNGATDAXIn/hU1h
XbU+aze8RR5+AHmh4ZboMAJr0/2VRU7C5aHyzvnCJbBwY6IqaI+RLyJ5ClQsIK1H
PTMAgNmKogieEmNcFDq4UKaMZDZ0lj4A3QDJ6XAGvXyFcecncz8vF9QrurusslDO
b074P1QVnzCPlejiIxsalPjT441tvWTPc5tcIa7Is6Zuoe9ezjN3LFwUA1Yvr061
0/D667gN2/+Tjgak5zAvD/efkvzXdZV1GOm9fZdFeR6cBhXVYv2sZVBmf7llEtn6
EAk6gqqHmdJQT1SgG1hOqSGVAKH4FbGUGq40zuKpey8Yna0iOaYxMssYxEnJIeeP
EPt56nIv84Np69pFIsembp1ZB6Evy5yFm6DzjFwUQ8kH8NHbz1GqMDKaOCuCAXPn
hiYG0VK6tNrfwmzthxmgwlTwYtA2wx9ug/i8fnOLwkFyxnOJJxX3RGBRe5Wiqcaf
RKP2a3ZbXSl+OyNguqr0Zg4GXcgtrccSqXj9FGvUqNSVTdYoo5B6ctbKPZtxA0u1
v5n8rcnwDvuLDKJuldG4
=HZMM
-----END PGP SIGNATURE-----
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Mahoney May 8, 2015, 10:45 p.m. UTC | #4
> On May 8, 2015, at 5:16 PM, Filipe David Manana <fdmanana@gmail.com> wrote:
> 
>> On Fri, May 8, 2015 at 9:38 PM, Filipe David Manana <fdmanana@gmail.com> wrote:
>>> On Tue, May 5, 2015 at 7:19 PM, Jeff Mahoney <jeffm@suse.com> wrote:
>>> Since we now clean up block groups automatically as they become
>>> empty, iterating over block groups is no longer sufficient to discard
>>> unused space.
>>> 
>>> This patch iterates over the unused chunk space and discards any regions
>>> that are unallocated, regardless of whether they were ever used.  This is
>>> a change for btrfs but is consistent with other file systems.
>>> 
>>> We do this in a transactionless manner since the discard process can take
>>> a substantial amount of time and a transaction would need to be started
>>> before the acquisition of the device list lock.  That would mean a
>>> transaction would be held open across /all/ of the discards collectively.
>>> In order to prevent other threads from allocating or freeing chunks, we
>>> hold the chunks lock across the search and discard calls.  We release it
>>> between searches to allow the file system to perform more-or-less
>>> normally.  Since the running transaction can commit and disappear while
>>> we're using the transaction pointer, we take a reference to it and
>>> release it after the search.  This is safe since it would happen normally
>>> at the end of the transaction commit after any locks are released anyway.
>>> 
>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
>>> ---
>> 
>> Hi Jeff,
>> 
>> Missing changelog describing what changed between v1 and v2.
>> 
>>> fs/btrfs/extent-tree.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>> fs/btrfs/volumes.c     | 61 ++++++++++++++++++++------------
>>> fs/btrfs/volumes.h     |  3 ++
>>> 3 files changed, 137 insertions(+), 23 deletions(-)
>>> 
>>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>>> index 0ec8e22..6d1d74d 100644
>>> --- a/fs/btrfs/extent-tree.c
>>> +++ b/fs/btrfs/extent-tree.c
>>> @@ -10031,10 +10031,94 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
>>>        return unpin_extent_range(root, start, end, false);
>>> }
>>> 
>>> +/*
>>> + * It used to be that old block groups would be left around forever.
>>> + * Iterating over them would be enough to trim unused space.  Since we
>>> + * now automatically remove them, we also need to iterate over unallocated
>>> + * space.
>>> + *
>>> + * We don't want a transaction for this since the discard may take a
>>> + * substantial amount of time.  We don't require that a transaction be
>>> + * running, but we do need to take a running transaction into account
>>> + * to ensure that we're not discarding chunks that were released in
>>> + * the current transaction.
>>> + *
>>> + * Holding the chunks lock will prevent other threads from allocating
>>> + * or releasing chunks, but it won't prevent a running transaction
>>> + * from committing and releasing the memory that the pending chunks
>>> + * list head uses.  For that, we need to take a reference to the
>>> + * transaction.
>>> + */
>>> +static int btrfs_trim_free_extents(struct btrfs_device *device,
>>> +                                  u64 minlen, u64 *trimmed)
>>> +{
>>> +       u64 start = 0, len = 0;
>>> +       int ret;
>>> +
>>> +       *trimmed = 0;
>>> +
>>> +       /* Not writeable = nothing to do. */
>>> +       if (!device->writeable)
>>> +               return 0;
>>> +
>>> +       /* No free space = nothing to do. */
>>> +       if (device->total_bytes <= device->bytes_used)
>>> +               return 0;
>>> +
>>> +       ret = 0;
>>> +
>>> +       while (1) {
>>> +               struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
>>> +               struct btrfs_transaction *trans;
>>> +
>>> +               ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
>>> +               if (ret)
>>> +                       return ret;
>>> +
>>> +               spin_lock(&fs_info->trans_lock);
>>> +               trans = fs_info->running_transaction;
>>> +               if (trans)
>>> +                       atomic_inc(&trans->use_count);
>>> +               spin_unlock(&fs_info->trans_lock);
>>> +
>>> +               ret = find_free_dev_extent_start(trans, device, minlen, start,
>>> +                                                &start, &len);
>>> +               if (trans)
>>> +                       btrfs_put_transaction(trans);
>>> +
>>> +               if (ret) {
>>> +                       mutex_unlock(&fs_info->chunk_mutex);
>>> +                       if (ret == -ENOSPC)
>>> +                               ret = 0;
>>> +                       break;
>>> +               }
>>> +
>>> +               ret = btrfs_issue_discard(device->bdev, start, len);
>>> +               mutex_unlock(&fs_info->chunk_mutex);
>>> +
>>> +               if (ret)
>>> +                       break;
>>> +
>>> +               start += len;
>>> +               *trimmed += len;
>>> +
>>> +               if (fatal_signal_pending(current)) {
>>> +                       ret = -ERESTARTSYS;
>>> +                       break;
>>> +               }
>>> +
>>> +               cond_resched();
>>> +       }
>>> +
>>> +       return ret;
>>> +}
>>> +
>>> int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
>>> {
>>>        struct btrfs_fs_info *fs_info = root->fs_info;
>>>        struct btrfs_block_group_cache *cache = NULL;
>>> +       struct btrfs_device *device;
>>> +       struct list_head *devices;
>>>        u64 group_trimmed;
>>>        u64 start;
>>>        u64 end;
>>> @@ -10089,6 +10173,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
>>>                cache = next_block_group(fs_info->tree_root, cache);
>>>        }
>>> 
>>> +       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
>>> +       devices = &root->fs_info->fs_devices->alloc_list;
>>> +       list_for_each_entry(device, devices, dev_alloc_list) {
>>> +               ret = btrfs_trim_free_extents(device, range->minlen,
>>> +                                             &group_trimmed);
>>> +               if (ret)
>>> +                       break;
>>> +
>>> +               trimmed += group_trimmed;
>>> +       }
>>> +       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
>>> +
>>>        range->len = trimmed;
>>>        return ret;
>>> }
>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>> index 96aebf3..ada8965 100644
>>> --- a/fs/btrfs/volumes.c
>>> +++ b/fs/btrfs/volumes.c
>>> @@ -1051,15 +1051,19 @@ out:
>>>        return ret;
>>> }
>>> 
>>> -static int contains_pending_extent(struct btrfs_trans_handle *trans,
>>> +static int contains_pending_extent(struct btrfs_transaction *transaction,
>>>                                   struct btrfs_device *device,
>>>                                   u64 *start, u64 len)
>>> {
>>> +       struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
>>>        struct extent_map *em;
>>> -       struct list_head *search_list = &trans->transaction->pending_chunks;
>>> +       struct list_head *search_list = &transaction->pending_chunks;
>> 
>> transaction can be NULL so we need to check for that first (as you do below).
>> 
>>>        int ret = 0;
>>>        u64 physical_start = *start;
>>> 
>>> +       if (!transaction)
>>> +               search_list = &fs_info->pinned_chunks;
>>> +
>>> again:
>>>        list_for_each_entry(em, search_list, list) {
>>>                struct map_lookup *map;
>>> @@ -1078,8 +1082,8 @@ again:
>>>                        ret = 1;
>>>                }
>>>        }
>>> -       if (search_list == &trans->transaction->pending_chunks) {
>>> -               search_list = &trans->root->fs_info->pinned_chunks;
>>> +       if (search_list == &transaction->pending_chunks) {
> 
> And here missing the null test too, which should be something like:
> 
>   if (transaction && search_list == &transaction->pending_chunks) {
> 

That's the same case as above. We're not dereferencing transaction here either. The test in the if statement will fail because &transaction->pending_chunks will be 0x100 or something.

-Jeff


> 
>>> +               search_list = &fs_info->pinned_chunks;
>>>                goto again;
>>>        }
>>> 
>>> @@ -1088,12 +1092,13 @@ again:
>>> 
>>> 
>>> /*
>>> - * find_free_dev_extent - find free space in the specified device
>>> - * @device:    the device which we search the free space in
>>> - * @num_bytes: the size of the free space that we need
>>> - * @start:     store the start of the free space.
>>> - * @len:       the size of the free space. that we find, or the size of the max
>>> - *             free space if we don't find suitable free space
>>> + * find_free_dev_extent_start - find free space in the specified device
>>> + * @device:      the device which we search the free space in
>>> + * @num_bytes:   the size of the free space that we need
>>> + * @search_start: the position from which to begin the search
>>> + * @start:       store the start of the free space.
>>> + * @len:         the size of the free space. that we find, or the size
>>> + *               of the max free space if we don't find suitable free space
>>>  *
>>>  * this uses a pretty simple search, the expectation is that it is
>>>  * called very infrequently and that a given device has a small number
>>> @@ -1107,9 +1112,9 @@ again:
>>>  * But if we don't find suitable free space, it is used to store the size of
>>>  * the max free space.
>>>  */
>>> -int find_free_dev_extent(struct btrfs_trans_handle *trans,
>>> -                        struct btrfs_device *device, u64 num_bytes,
>>> -                        u64 *start, u64 *len)
>>> +int find_free_dev_extent_start(struct btrfs_transaction *transaction,
>>> +                              struct btrfs_device *device, u64 num_bytes,
>>> +                              u64 search_start, u64 *start, u64 *len)
>>> {
>>>        struct btrfs_key key;
>>>        struct btrfs_root *root = device->dev_root;
>>> @@ -1119,19 +1124,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
>>>        u64 max_hole_start;
>>>        u64 max_hole_size;
>>>        u64 extent_end;
>>> -       u64 search_start;
>>>        u64 search_end = device->total_bytes;
>>>        int ret;
>>>        int slot;
>>>        struct extent_buffer *l;
>>> 
>>> -       /* FIXME use last free of some kind */
>>> -
>>> -       /* we don't want to overwrite the superblock on the drive,
>>> -        * so we make sure to start at an offset of at least 1MB
>>> -        */
>>> -       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
>>> -
>>>        path = btrfs_alloc_path();
>>>        if (!path)
>>>                return -ENOMEM;
>>> @@ -1192,7 +1189,7 @@ again:
>>>                         * Have to check before we set max_hole_start, otherwise
>>>                         * we could end up sending back this offset anyway.
>>>                         */
>>> -                       if (contains_pending_extent(trans, device,
>>> +                       if (contains_pending_extent(transaction, device,
>>>                                                    &search_start,
>>>                                                    hole_size)) {
>>>                                if (key.offset >= search_start) {
>>> @@ -1241,7 +1238,7 @@ next:
>>>        if (search_end > search_start) {
>>>                hole_size = search_end - search_start;
>>> 
>>> -               if (contains_pending_extent(trans, device, &search_start,
>>> +               if (contains_pending_extent(transaction, device, &search_start,
>>>                                            hole_size)) {
>>>                        btrfs_release_path(path);
>>>                        goto again;
>>> @@ -1267,6 +1264,24 @@ out:
>>>        return ret;
>>> }
>>> 
>>> +int find_free_dev_extent(struct btrfs_trans_handle *trans,
>>> +                        struct btrfs_device *device, u64 num_bytes,
>>> +                        u64 *start, u64 *len)
>>> +{
>>> +       struct btrfs_root *root = device->dev_root;
>>> +       u64 search_start;
>>> +
>>> +       /* FIXME use last free of some kind */
>>> +
>>> +       /*
>>> +        * we don't want to overwrite the superblock on the drive,
>>> +        * so we make sure to start at an offset of at least 1MB
>>> +        */
>>> +       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
>>> +       return find_free_dev_extent_start(trans->transaction, device,
>>> +                                         num_bytes, search_start, start, len);
>>> +}
>>> +
>>> static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
>>>                          struct btrfs_device *device,
>>>                          u64 start, u64 *dev_extent_len)
>>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>>> index ebc3133..30918a8 100644
>>> --- a/fs/btrfs/volumes.h
>>> +++ b/fs/btrfs/volumes.h
>>> @@ -449,6 +449,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
>>> int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
>>> int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
>>> int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
>>> +int find_free_dev_extent_start(struct btrfs_transaction *transaction,
>>> +                        struct btrfs_device *device, u64 num_bytes,
>>> +                        u64 search_start, u64 *start, u64 *max_avail);
>>> int find_free_dev_extent(struct btrfs_trans_handle *trans,
>>>                         struct btrfs_device *device, u64 num_bytes,
>>>                         u64 *start, u64 *max_avail);
>>> --
>>> 1.8.5.6
>>> 
>>> 
>>> --
>>> Jeff Mahoney
>>> SUSE Labs
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> 
>> 
>> 
>> --
>> Filipe David Manana,
>> 
>> "Reasonable men adapt themselves to the world.
>> Unreasonable men adapt the world to themselves.
>> That's why all progress depends on unreasonable men."
> 
> 
> 
> -- 
> Filipe David Manana,
> 
> "Reasonable men adapt themselves to the world.
> Unreasonable men adapt the world to themselves.
> That's why all progress depends on unreasonable men."
> 

--
Jeff Mahoney
SUSE Labs--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Omar Sandoval May 9, 2015, 12:18 a.m. UTC | #5
On Fri, May 08, 2015 at 06:17:26PM -0400, Jeff Mahoney wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> On 5/8/15 4:38 PM, Filipe David Manana wrote:
> > On Tue, May 5, 2015 at 7:19 PM, Jeff Mahoney <jeffm@suse.com>
> > wrote:
> >> -static int contains_pending_extent(struct btrfs_trans_handle
> >> *trans, +static int contains_pending_extent(struct
> >> btrfs_transaction *transaction, struct btrfs_device *device, u64
> >> *start, u64 len) { +       struct btrfs_fs_info *fs_info =
> >> device->dev_root->fs_info; struct extent_map *em; -       struct
> >> list_head *search_list = &trans->transaction->pending_chunks; +
> >> struct list_head *search_list = &transaction->pending_chunks;
> > 
> > transaction can be NULL so we need to check for that first (as you
> > do below).
> 
> The ordering here is safe. We're not dereferencing the transaction,
> we're getting the address of one of the members. search_list will
> point to 0x100 or something. We don't ever dereference that without
> checking the transaction against NULL.
> 

Hi, Jeff,

I believe I've seen this topic come up before -- see [1]. Although it's
probably fine on any sane compiler, it's apparently not correct from a
language standard standpoint. Probably better safe than sorry.

[1] https://software.intel.com/en-us/blogs/2015/04/20/null-pointer-dereferencing-causes-undefined-behavior

Thanks!
Jeff Mahoney May 9, 2015, 3:57 a.m. UTC | #6
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 5/8/15 8:18 PM, Omar Sandoval wrote:
> On Fri, May 08, 2015 at 06:17:26PM -0400, Jeff Mahoney wrote:
>> -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1
>> 
>> On 5/8/15 4:38 PM, Filipe David Manana wrote:
>>> On Tue, May 5, 2015 at 7:19 PM, Jeff Mahoney <jeffm@suse.com> 
>>> wrote:
>>>> -static int contains_pending_extent(struct
>>>> btrfs_trans_handle *trans, +static int
>>>> contains_pending_extent(struct btrfs_transaction
>>>> *transaction, struct btrfs_device *device, u64 *start, u64
>>>> len) { +       struct btrfs_fs_info *fs_info = 
>>>> device->dev_root->fs_info; struct extent_map *em; -
>>>> struct list_head *search_list =
>>>> &trans->transaction->pending_chunks; + struct list_head
>>>> *search_list = &transaction->pending_chunks;
>>> 
>>> transaction can be NULL so we need to check for that first (as
>>> you do below).
>> 
>> The ordering here is safe. We're not dereferencing the
>> transaction, we're getting the address of one of the members.
>> search_list will point to 0x100 or something. We don't ever
>> dereference that without checking the transaction against NULL.
>> 
> 
> Hi, Jeff,
> 
> I believe I've seen this topic come up before -- see [1]. Although
> it's probably fine on any sane compiler, it's apparently not
> correct from a language standard standpoint. Probably better safe
> than sorry.
> 
> [1]
> https://software.intel.com/en-us/blogs/2015/04/20/null-pointer-derefer
encing-causes-undefined-behavior

I
> 
get that if there are questions now, there will be questions later,
and I should just change it. And I will. But I stand by my argument
that the code is correct. Historically, a zero address being invalid
is convention. It wasn't part of the language spec.

I disagree with the blog post. A straight up NULL pointer may be
un-evaluatable as an object, but by the time it gets passed through
several layers of casting and function parameters, it's no longer
(void *)0.

Accordingly, this:

"""
An lvalue is an expression with an object type or an incomplete type
other than void; if an lvalue does not designate an object when it is
evaluated, the behavior is undefined.
"""

doesn't actually make any sense when the source of the null pointer is
unknown, like with a function parameter.


- -Jeff

- -- 
Jeff Mahoney
SUSE Labs
-----BEGIN PGP SIGNATURE-----
Version: GnuPG/MacGPG2 v2.0.22 (Darwin)

iQIcBAEBAgAGBQJVTYWeAAoJEB57S2MheeWy5U8P/2+VJbtK04pojXmfChdt2y5Z
qJoADUnnboa+20HBgo+dgd/aZDMM72sngss6vle65HSJqMvvagv0ZCEXBqA3HeqM
11Cpsdi122a34vYp2RO6W4ldpLebcsq4IlU06LDSPW71xciFYtCpv/6o5CM6oHIW
3a9yKl1KlhJmmb9l86kF0SJxvGWuyCeL8EQY22be7QR7t6zQERPybXXhW3L6cfin
ya+2pPOwOeJl/u3AgA2U8nCX5KufABMfjQ+6uMcgUXgVnN4YmmivoNh5OJPJhgJi
V4k/0g7IEuLRe4dN5eGjoIShaftAu0aWqgH7Xg9NrNewjbwBCKkqliS+qf0B8Rud
Ma21ovUzpIFJAEXDqJPi8yDXHUy3DZ3zBSpfASI0CdC97USFSp/R36pVPYIHqrbA
0t6UIHoRUokdTgSTErft/oRxnxG4Qze41M3O3Bk04KMqoWay78VWHrOCziIDQ22q
cr0RUCi2vCLFMdg9QKrFm8h+rvusoxiZ176cDkIiFfevYUcG/IiFO5WnHSYBASVD
t1ZPvWEqgeXNhMMXPUX9QOObtef/iXhy44uD5GnVZtdGE0arBSAyIKZaAFkv79qo
eedH7TcANRoSbqYERgK+dwNrrd/bfuguTvQeXk9//hxK6z+Fmmsh/4weduUoV2oA
c8lLTHz46fVECHIXbB9H
=Nare
-----END PGP SIGNATURE-----
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Filipe Manana May 14, 2015, 9:33 a.m. UTC | #7
On Fri, May 8, 2015 at 11:45 PM, Jeff Mahoney <jeffm@suse.com> wrote:
>
>> On May 8, 2015, at 5:16 PM, Filipe David Manana <fdmanana@gmail.com> wrote:
>>
>>> On Fri, May 8, 2015 at 9:38 PM, Filipe David Manana <fdmanana@gmail.com> wrote:
>>>> On Tue, May 5, 2015 at 7:19 PM, Jeff Mahoney <jeffm@suse.com> wrote:
>>>> Since we now clean up block groups automatically as they become
>>>> empty, iterating over block groups is no longer sufficient to discard
>>>> unused space.
>>>>
>>>> This patch iterates over the unused chunk space and discards any regions
>>>> that are unallocated, regardless of whether they were ever used.  This is
>>>> a change for btrfs but is consistent with other file systems.
>>>>
>>>> We do this in a transactionless manner since the discard process can take
>>>> a substantial amount of time and a transaction would need to be started
>>>> before the acquisition of the device list lock.  That would mean a
>>>> transaction would be held open across /all/ of the discards collectively.
>>>> In order to prevent other threads from allocating or freeing chunks, we
>>>> hold the chunks lock across the search and discard calls.  We release it
>>>> between searches to allow the file system to perform more-or-less
>>>> normally.  Since the running transaction can commit and disappear while
>>>> we're using the transaction pointer, we take a reference to it and
>>>> release it after the search.  This is safe since it would happen normally
>>>> at the end of the transaction commit after any locks are released anyway.
>>>>
>>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
>>>> ---
>>>
>>> Hi Jeff,
>>>
>>> Missing changelog describing what changed between v1 and v2.
>>>
>>>> fs/btrfs/extent-tree.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>> fs/btrfs/volumes.c     | 61 ++++++++++++++++++++------------
>>>> fs/btrfs/volumes.h     |  3 ++
>>>> 3 files changed, 137 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>>>> index 0ec8e22..6d1d74d 100644
>>>> --- a/fs/btrfs/extent-tree.c
>>>> +++ b/fs/btrfs/extent-tree.c
>>>> @@ -10031,10 +10031,94 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
>>>>        return unpin_extent_range(root, start, end, false);
>>>> }
>>>>
>>>> +/*
>>>> + * It used to be that old block groups would be left around forever.
>>>> + * Iterating over them would be enough to trim unused space.  Since we
>>>> + * now automatically remove them, we also need to iterate over unallocated
>>>> + * space.
>>>> + *
>>>> + * We don't want a transaction for this since the discard may take a
>>>> + * substantial amount of time.  We don't require that a transaction be
>>>> + * running, but we do need to take a running transaction into account
>>>> + * to ensure that we're not discarding chunks that were released in
>>>> + * the current transaction.
>>>> + *
>>>> + * Holding the chunks lock will prevent other threads from allocating
>>>> + * or releasing chunks, but it won't prevent a running transaction
>>>> + * from committing and releasing the memory that the pending chunks
>>>> + * list head uses.  For that, we need to take a reference to the
>>>> + * transaction.
>>>> + */
>>>> +static int btrfs_trim_free_extents(struct btrfs_device *device,
>>>> +                                  u64 minlen, u64 *trimmed)
>>>> +{
>>>> +       u64 start = 0, len = 0;
>>>> +       int ret;
>>>> +
>>>> +       *trimmed = 0;
>>>> +
>>>> +       /* Not writeable = nothing to do. */
>>>> +       if (!device->writeable)
>>>> +               return 0;
>>>> +
>>>> +       /* No free space = nothing to do. */
>>>> +       if (device->total_bytes <= device->bytes_used)
>>>> +               return 0;
>>>> +
>>>> +       ret = 0;
>>>> +
>>>> +       while (1) {
>>>> +               struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
>>>> +               struct btrfs_transaction *trans;
>>>> +
>>>> +               ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
>>>> +               if (ret)
>>>> +                       return ret;
>>>> +
>>>> +               spin_lock(&fs_info->trans_lock);
>>>> +               trans = fs_info->running_transaction;
>>>> +               if (trans)
>>>> +                       atomic_inc(&trans->use_count);
>>>> +               spin_unlock(&fs_info->trans_lock);
>>>> +
>>>> +               ret = find_free_dev_extent_start(trans, device, minlen, start,
>>>> +                                                &start, &len);
>>>> +               if (trans)
>>>> +                       btrfs_put_transaction(trans);
>>>> +
>>>> +               if (ret) {
>>>> +                       mutex_unlock(&fs_info->chunk_mutex);
>>>> +                       if (ret == -ENOSPC)
>>>> +                               ret = 0;
>>>> +                       break;
>>>> +               }
>>>> +
>>>> +               ret = btrfs_issue_discard(device->bdev, start, len);
>>>> +               mutex_unlock(&fs_info->chunk_mutex);
>>>> +
>>>> +               if (ret)
>>>> +                       break;
>>>> +
>>>> +               start += len;
>>>> +               *trimmed += len;
>>>> +
>>>> +               if (fatal_signal_pending(current)) {
>>>> +                       ret = -ERESTARTSYS;
>>>> +                       break;
>>>> +               }
>>>> +
>>>> +               cond_resched();
>>>> +       }
>>>> +
>>>> +       return ret;
>>>> +}
>>>> +
>>>> int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
>>>> {
>>>>        struct btrfs_fs_info *fs_info = root->fs_info;
>>>>        struct btrfs_block_group_cache *cache = NULL;
>>>> +       struct btrfs_device *device;
>>>> +       struct list_head *devices;
>>>>        u64 group_trimmed;
>>>>        u64 start;
>>>>        u64 end;
>>>> @@ -10089,6 +10173,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
>>>>                cache = next_block_group(fs_info->tree_root, cache);
>>>>        }
>>>>
>>>> +       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
>>>> +       devices = &root->fs_info->fs_devices->alloc_list;
>>>> +       list_for_each_entry(device, devices, dev_alloc_list) {
>>>> +               ret = btrfs_trim_free_extents(device, range->minlen,
>>>> +                                             &group_trimmed);
>>>> +               if (ret)
>>>> +                       break;
>>>> +
>>>> +               trimmed += group_trimmed;
>>>> +       }
>>>> +       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
>>>> +
>>>>        range->len = trimmed;
>>>>        return ret;
>>>> }
>>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>>> index 96aebf3..ada8965 100644
>>>> --- a/fs/btrfs/volumes.c
>>>> +++ b/fs/btrfs/volumes.c
>>>> @@ -1051,15 +1051,19 @@ out:
>>>>        return ret;
>>>> }
>>>>
>>>> -static int contains_pending_extent(struct btrfs_trans_handle *trans,
>>>> +static int contains_pending_extent(struct btrfs_transaction *transaction,
>>>>                                   struct btrfs_device *device,
>>>>                                   u64 *start, u64 len)
>>>> {
>>>> +       struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
>>>>        struct extent_map *em;
>>>> -       struct list_head *search_list = &trans->transaction->pending_chunks;
>>>> +       struct list_head *search_list = &transaction->pending_chunks;
>>>
>>> transaction can be NULL so we need to check for that first (as you do below).
>>>
>>>>        int ret = 0;
>>>>        u64 physical_start = *start;
>>>>
>>>> +       if (!transaction)
>>>> +               search_list = &fs_info->pinned_chunks;
>>>> +
>>>> again:
>>>>        list_for_each_entry(em, search_list, list) {
>>>>                struct map_lookup *map;
>>>> @@ -1078,8 +1082,8 @@ again:
>>>>                        ret = 1;
>>>>                }
>>>>        }
>>>> -       if (search_list == &trans->transaction->pending_chunks) {
>>>> -               search_list = &trans->root->fs_info->pinned_chunks;
>>>> +       if (search_list == &transaction->pending_chunks) {
>>
>> And here missing the null test too, which should be something like:
>>
>>   if (transaction && search_list == &transaction->pending_chunks) {
>>
>
> That's the same case as above. We're not dereferencing transaction here either. The test in the if statement will fail because &transaction->pending_chunks will be 0x100 or something.

Right. Sorry friday evening review. What you are doing is valid then
and something very close to the offsetof implementation.

>
> -Jeff
>
>
>>
>>>> +               search_list = &fs_info->pinned_chunks;
>>>>                goto again;
>>>>        }
>>>>
>>>> @@ -1088,12 +1092,13 @@ again:
>>>>
>>>>
>>>> /*
>>>> - * find_free_dev_extent - find free space in the specified device
>>>> - * @device:    the device which we search the free space in
>>>> - * @num_bytes: the size of the free space that we need
>>>> - * @start:     store the start of the free space.
>>>> - * @len:       the size of the free space. that we find, or the size of the max
>>>> - *             free space if we don't find suitable free space
>>>> + * find_free_dev_extent_start - find free space in the specified device
>>>> + * @device:      the device which we search the free space in
>>>> + * @num_bytes:   the size of the free space that we need
>>>> + * @search_start: the position from which to begin the search
>>>> + * @start:       store the start of the free space.
>>>> + * @len:         the size of the free space. that we find, or the size
>>>> + *               of the max free space if we don't find suitable free space
>>>>  *
>>>>  * this uses a pretty simple search, the expectation is that it is
>>>>  * called very infrequently and that a given device has a small number
>>>> @@ -1107,9 +1112,9 @@ again:
>>>>  * But if we don't find suitable free space, it is used to store the size of
>>>>  * the max free space.
>>>>  */
>>>> -int find_free_dev_extent(struct btrfs_trans_handle *trans,
>>>> -                        struct btrfs_device *device, u64 num_bytes,
>>>> -                        u64 *start, u64 *len)
>>>> +int find_free_dev_extent_start(struct btrfs_transaction *transaction,
>>>> +                              struct btrfs_device *device, u64 num_bytes,
>>>> +                              u64 search_start, u64 *start, u64 *len)
>>>> {
>>>>        struct btrfs_key key;
>>>>        struct btrfs_root *root = device->dev_root;
>>>> @@ -1119,19 +1124,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
>>>>        u64 max_hole_start;
>>>>        u64 max_hole_size;
>>>>        u64 extent_end;
>>>> -       u64 search_start;
>>>>        u64 search_end = device->total_bytes;
>>>>        int ret;
>>>>        int slot;
>>>>        struct extent_buffer *l;
>>>>
>>>> -       /* FIXME use last free of some kind */
>>>> -
>>>> -       /* we don't want to overwrite the superblock on the drive,
>>>> -        * so we make sure to start at an offset of at least 1MB
>>>> -        */
>>>> -       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
>>>> -
>>>>        path = btrfs_alloc_path();
>>>>        if (!path)
>>>>                return -ENOMEM;
>>>> @@ -1192,7 +1189,7 @@ again:
>>>>                         * Have to check before we set max_hole_start, otherwise
>>>>                         * we could end up sending back this offset anyway.
>>>>                         */
>>>> -                       if (contains_pending_extent(trans, device,
>>>> +                       if (contains_pending_extent(transaction, device,
>>>>                                                    &search_start,
>>>>                                                    hole_size)) {
>>>>                                if (key.offset >= search_start) {
>>>> @@ -1241,7 +1238,7 @@ next:
>>>>        if (search_end > search_start) {
>>>>                hole_size = search_end - search_start;
>>>>
>>>> -               if (contains_pending_extent(trans, device, &search_start,
>>>> +               if (contains_pending_extent(transaction, device, &search_start,
>>>>                                            hole_size)) {
>>>>                        btrfs_release_path(path);
>>>>                        goto again;
>>>> @@ -1267,6 +1264,24 @@ out:
>>>>        return ret;
>>>> }
>>>>
>>>> +int find_free_dev_extent(struct btrfs_trans_handle *trans,
>>>> +                        struct btrfs_device *device, u64 num_bytes,
>>>> +                        u64 *start, u64 *len)
>>>> +{
>>>> +       struct btrfs_root *root = device->dev_root;
>>>> +       u64 search_start;
>>>> +
>>>> +       /* FIXME use last free of some kind */
>>>> +
>>>> +       /*
>>>> +        * we don't want to overwrite the superblock on the drive,
>>>> +        * so we make sure to start at an offset of at least 1MB
>>>> +        */
>>>> +       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
>>>> +       return find_free_dev_extent_start(trans->transaction, device,
>>>> +                                         num_bytes, search_start, start, len);
>>>> +}
>>>> +
>>>> static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
>>>>                          struct btrfs_device *device,
>>>>                          u64 start, u64 *dev_extent_len)
>>>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>>>> index ebc3133..30918a8 100644
>>>> --- a/fs/btrfs/volumes.h
>>>> +++ b/fs/btrfs/volumes.h
>>>> @@ -449,6 +449,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
>>>> int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
>>>> int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
>>>> int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
>>>> +int find_free_dev_extent_start(struct btrfs_transaction *transaction,
>>>> +                        struct btrfs_device *device, u64 num_bytes,
>>>> +                        u64 search_start, u64 *start, u64 *max_avail);
>>>> int find_free_dev_extent(struct btrfs_trans_handle *trans,
>>>>                         struct btrfs_device *device, u64 num_bytes,
>>>>                         u64 *start, u64 *max_avail);
>>>> --
>>>> 1.8.5.6
>>>>
>>>>
>>>> --
>>>> Jeff Mahoney
>>>> SUSE Labs
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>>
>>>
>>> --
>>> Filipe David Manana,
>>>
>>> "Reasonable men adapt themselves to the world.
>>> Unreasonable men adapt the world to themselves.
>>> That's why all progress depends on unreasonable men."
>>
>>
>>
>> --
>> Filipe David Manana,
>>
>> "Reasonable men adapt themselves to the world.
>> Unreasonable men adapt the world to themselves.
>> That's why all progress depends on unreasonable men."
>>
>
> --
> Jeff Mahoney
> SUSE Labs
Filipe Manana May 14, 2015, 9:40 a.m. UTC | #8
On Tue, May 5, 2015 at 7:19 PM, Jeff Mahoney <jeffm@suse.com> wrote:
> Since we now clean up block groups automatically as they become
> empty, iterating over block groups is no longer sufficient to discard
> unused space.
>
> This patch iterates over the unused chunk space and discards any regions
> that are unallocated, regardless of whether they were ever used.  This is
> a change for btrfs but is consistent with other file systems.
>
> We do this in a transactionless manner since the discard process can take
> a substantial amount of time and a transaction would need to be started
> before the acquisition of the device list lock.  That would mean a
> transaction would be held open across /all/ of the discards collectively.
> In order to prevent other threads from allocating or freeing chunks, we
> hold the chunks lock across the search and discard calls.  We release it
> between searches to allow the file system to perform more-or-less
> normally.  Since the running transaction can commit and disappear while
> we're using the transaction pointer, we take a reference to it and
> release it after the search.  This is safe since it would happen normally
> at the end of the transaction commit after any locks are released anyway.
>
> Signed-off-by: Jeff Mahoney <jeffm@suse.com>

Hi Jeff,

So I tested this and seems ok so far. Only one comment below.

> ---
>  fs/btrfs/extent-tree.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/volumes.c     | 61 ++++++++++++++++++++------------
>  fs/btrfs/volumes.h     |  3 ++
>  3 files changed, 137 insertions(+), 23 deletions(-)
>
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 0ec8e22..6d1d74d 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -10031,10 +10031,94 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
>         return unpin_extent_range(root, start, end, false);
>  }
>
> +/*
> + * It used to be that old block groups would be left around forever.
> + * Iterating over them would be enough to trim unused space.  Since we
> + * now automatically remove them, we also need to iterate over unallocated
> + * space.
> + *
> + * We don't want a transaction for this since the discard may take a
> + * substantial amount of time.  We don't require that a transaction be
> + * running, but we do need to take a running transaction into account
> + * to ensure that we're not discarding chunks that were released in
> + * the current transaction.
> + *
> + * Holding the chunks lock will prevent other threads from allocating
> + * or releasing chunks, but it won't prevent a running transaction
> + * from committing and releasing the memory that the pending chunks
> + * list head uses.  For that, we need to take a reference to the
> + * transaction.
> + */
> +static int btrfs_trim_free_extents(struct btrfs_device *device,
> +                                  u64 minlen, u64 *trimmed)
> +{
> +       u64 start = 0, len = 0;
> +       int ret;
> +
> +       *trimmed = 0;
> +
> +       /* Not writeable = nothing to do. */
> +       if (!device->writeable)
> +               return 0;
> +
> +       /* No free space = nothing to do. */
> +       if (device->total_bytes <= device->bytes_used)
> +               return 0;
> +
> +       ret = 0;
> +
> +       while (1) {
> +               struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
> +               struct btrfs_transaction *trans;
> +
> +               ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
> +               if (ret)
> +                       return ret;
> +
> +               spin_lock(&fs_info->trans_lock);
> +               trans = fs_info->running_transaction;
> +               if (trans)
> +                       atomic_inc(&trans->use_count);
> +               spin_unlock(&fs_info->trans_lock);
> +
> +               ret = find_free_dev_extent_start(trans, device, minlen, start,
> +                                                &start, &len);

So we can't do this safely without holding a transaction open.
If we don't hold a transaction open, then we need to take the
commit_root_sem because find_free_dev_extent_start() uses the commit
root (and skips tree locking) of the device tree. We need to make sure
the commit root, or any of the nodes/leafs it leads to, doesn't
disappear or gets reused while we're doing the search(es) on the
device tree. Take a look a caching_thread() in extent-tree.c for
example.

Not holding the commit_root_sem while doing the search (calling
find_free_dev_extent_start) can lead to the following:

              CPU 0                                                  CPU 1

     starts transaction N

     at this point root R's
     root node (RN) == commit root node (CRN)

     RN and CRN point to child node eb X which
     has a logical address (X->start) LAX

                                                           (doesn't
join transaction N)

path->search_commit_root = 1

btrfs_search_slot(path)

                                                             get ref
on eb CRN (root->commit_root)

     eb X is deleted (or COWed)
     eb X's logical location is pinned

     btrfs_commit_transaction(N)
       btrfs_finish_extent_commit()
         eb X's range added back to free
         space cache

     transaction N commit finishes

     transaction N + 1 starts

       eb X's old range is allocated for
       some other new eb X' (or cowed eb),
       possibly for some other root

       new stuff written for eb X'

                                                             reads
node at logical address LAX
                                                               --> gets eb X'
                                                               -->
transid/generation check fails

(expected N gets N + 1)
                                                               --> returns -EIO



Otherwise the change looks good to me.
Thanks.

> +               if (trans)
> +                       btrfs_put_transaction(trans);
> +
> +               if (ret) {
> +                       mutex_unlock(&fs_info->chunk_mutex);
> +                       if (ret == -ENOSPC)
> +                               ret = 0;
> +                       break;
> +               }
> +
> +               ret = btrfs_issue_discard(device->bdev, start, len);
> +               mutex_unlock(&fs_info->chunk_mutex);
> +
> +               if (ret)
> +                       break;
> +
> +               start += len;
> +               *trimmed += len;
> +
> +               if (fatal_signal_pending(current)) {
> +                       ret = -ERESTARTSYS;
> +                       break;
> +               }
> +
> +               cond_resched();
> +       }
> +
> +       return ret;
> +}
> +
>  int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
>  {
>         struct btrfs_fs_info *fs_info = root->fs_info;
>         struct btrfs_block_group_cache *cache = NULL;
> +       struct btrfs_device *device;
> +       struct list_head *devices;
>         u64 group_trimmed;
>         u64 start;
>         u64 end;
> @@ -10089,6 +10173,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
>                 cache = next_block_group(fs_info->tree_root, cache);
>         }
>
> +       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
> +       devices = &root->fs_info->fs_devices->alloc_list;
> +       list_for_each_entry(device, devices, dev_alloc_list) {
> +               ret = btrfs_trim_free_extents(device, range->minlen,
> +                                             &group_trimmed);
> +               if (ret)
> +                       break;
> +
> +               trimmed += group_trimmed;
> +       }
> +       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
> +
>         range->len = trimmed;
>         return ret;
>  }
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 96aebf3..ada8965 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1051,15 +1051,19 @@ out:
>         return ret;
>  }
>
> -static int contains_pending_extent(struct btrfs_trans_handle *trans,
> +static int contains_pending_extent(struct btrfs_transaction *transaction,
>                                    struct btrfs_device *device,
>                                    u64 *start, u64 len)
>  {
> +       struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
>         struct extent_map *em;
> -       struct list_head *search_list = &trans->transaction->pending_chunks;
> +       struct list_head *search_list = &transaction->pending_chunks;
>         int ret = 0;
>         u64 physical_start = *start;
>
> +       if (!transaction)
> +               search_list = &fs_info->pinned_chunks;
> +
>  again:
>         list_for_each_entry(em, search_list, list) {
>                 struct map_lookup *map;
> @@ -1078,8 +1082,8 @@ again:
>                         ret = 1;
>                 }
>         }
> -       if (search_list == &trans->transaction->pending_chunks) {
> -               search_list = &trans->root->fs_info->pinned_chunks;
> +       if (search_list == &transaction->pending_chunks) {
> +               search_list = &fs_info->pinned_chunks;
>                 goto again;
>         }
>
> @@ -1088,12 +1092,13 @@ again:
>
>
>  /*
> - * find_free_dev_extent - find free space in the specified device
> - * @device:    the device which we search the free space in
> - * @num_bytes: the size of the free space that we need
> - * @start:     store the start of the free space.
> - * @len:       the size of the free space. that we find, or the size of the max
> - *             free space if we don't find suitable free space
> + * find_free_dev_extent_start - find free space in the specified device
> + * @device:      the device which we search the free space in
> + * @num_bytes:   the size of the free space that we need
> + * @search_start: the position from which to begin the search
> + * @start:       store the start of the free space.
> + * @len:         the size of the free space. that we find, or the size
> + *               of the max free space if we don't find suitable free space
>   *
>   * this uses a pretty simple search, the expectation is that it is
>   * called very infrequently and that a given device has a small number
> @@ -1107,9 +1112,9 @@ again:
>   * But if we don't find suitable free space, it is used to store the size of
>   * the max free space.
>   */
> -int find_free_dev_extent(struct btrfs_trans_handle *trans,
> -                        struct btrfs_device *device, u64 num_bytes,
> -                        u64 *start, u64 *len)
> +int find_free_dev_extent_start(struct btrfs_transaction *transaction,
> +                              struct btrfs_device *device, u64 num_bytes,
> +                              u64 search_start, u64 *start, u64 *len)
>  {
>         struct btrfs_key key;
>         struct btrfs_root *root = device->dev_root;
> @@ -1119,19 +1124,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
>         u64 max_hole_start;
>         u64 max_hole_size;
>         u64 extent_end;
> -       u64 search_start;
>         u64 search_end = device->total_bytes;
>         int ret;
>         int slot;
>         struct extent_buffer *l;
>
> -       /* FIXME use last free of some kind */
> -
> -       /* we don't want to overwrite the superblock on the drive,
> -        * so we make sure to start at an offset of at least 1MB
> -        */
> -       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
> -
>         path = btrfs_alloc_path();
>         if (!path)
>                 return -ENOMEM;
> @@ -1192,7 +1189,7 @@ again:
>                          * Have to check before we set max_hole_start, otherwise
>                          * we could end up sending back this offset anyway.
>                          */
> -                       if (contains_pending_extent(trans, device,
> +                       if (contains_pending_extent(transaction, device,
>                                                     &search_start,
>                                                     hole_size)) {
>                                 if (key.offset >= search_start) {
> @@ -1241,7 +1238,7 @@ next:
>         if (search_end > search_start) {
>                 hole_size = search_end - search_start;
>
> -               if (contains_pending_extent(trans, device, &search_start,
> +               if (contains_pending_extent(transaction, device, &search_start,
>                                             hole_size)) {
>                         btrfs_release_path(path);
>                         goto again;
> @@ -1267,6 +1264,24 @@ out:
>         return ret;
>  }
>
> +int find_free_dev_extent(struct btrfs_trans_handle *trans,
> +                        struct btrfs_device *device, u64 num_bytes,
> +                        u64 *start, u64 *len)
> +{
> +       struct btrfs_root *root = device->dev_root;
> +       u64 search_start;
> +
> +       /* FIXME use last free of some kind */
> +
> +       /*
> +        * we don't want to overwrite the superblock on the drive,
> +        * so we make sure to start at an offset of at least 1MB
> +        */
> +       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
> +       return find_free_dev_extent_start(trans->transaction, device,
> +                                         num_bytes, search_start, start, len);
> +}
> +
>  static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
>                           struct btrfs_device *device,
>                           u64 start, u64 *dev_extent_len)
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index ebc3133..30918a8 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -449,6 +449,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
>  int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
>  int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
>  int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
> +int find_free_dev_extent_start(struct btrfs_transaction *transaction,
> +                        struct btrfs_device *device, u64 num_bytes,
> +                        u64 search_start, u64 *start, u64 *max_avail);
>  int find_free_dev_extent(struct btrfs_trans_handle *trans,
>                          struct btrfs_device *device, u64 num_bytes,
>                          u64 *start, u64 *max_avail);
> --
> 1.8.5.6
>
>
> --
> Jeff Mahoney
> SUSE Labs
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Mahoney May 14, 2015, 12:54 p.m. UTC | #9
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 5/14/15 5:40 AM, Filipe David Manana wrote:
> On Tue, May 5, 2015 at 7:19 PM, Jeff Mahoney <jeffm@suse.com> wrote:
>> Since we now clean up block groups automatically as they become
>> empty, iterating over block groups is no longer sufficient to discard
>> unused space.
>>
>> This patch iterates over the unused chunk space and discards any regions
>> that are unallocated, regardless of whether they were ever used.  This is
>> a change for btrfs but is consistent with other file systems.
>>
>> We do this in a transactionless manner since the discard process can take
>> a substantial amount of time and a transaction would need to be started
>> before the acquisition of the device list lock.  That would mean a
>> transaction would be held open across /all/ of the discards collectively.
>> In order to prevent other threads from allocating or freeing chunks, we
>> hold the chunks lock across the search and discard calls.  We release it
>> between searches to allow the file system to perform more-or-less
>> normally.  Since the running transaction can commit and disappear while
>> we're using the transaction pointer, we take a reference to it and
>> release it after the search.  This is safe since it would happen normally
>> at the end of the transaction commit after any locks are released anyway.
>>
>> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
> 
> Hi Jeff,
> 
> So I tested this and seems ok so far. Only one comment below.
> 
>> ---
>>  fs/btrfs/extent-tree.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  fs/btrfs/volumes.c     | 61 ++++++++++++++++++++------------
>>  fs/btrfs/volumes.h     |  3 ++
>>  3 files changed, 137 insertions(+), 23 deletions(-)
>>
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index 0ec8e22..6d1d74d 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -10031,10 +10031,94 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
>>         return unpin_extent_range(root, start, end, false);
>>  }
>>
>> +/*
>> + * It used to be that old block groups would be left around forever.
>> + * Iterating over them would be enough to trim unused space.  Since we
>> + * now automatically remove them, we also need to iterate over unallocated
>> + * space.
>> + *
>> + * We don't want a transaction for this since the discard may take a
>> + * substantial amount of time.  We don't require that a transaction be
>> + * running, but we do need to take a running transaction into account
>> + * to ensure that we're not discarding chunks that were released in
>> + * the current transaction.
>> + *
>> + * Holding the chunks lock will prevent other threads from allocating
>> + * or releasing chunks, but it won't prevent a running transaction
>> + * from committing and releasing the memory that the pending chunks
>> + * list head uses.  For that, we need to take a reference to the
>> + * transaction.
>> + */
>> +static int btrfs_trim_free_extents(struct btrfs_device *device,
>> +                                  u64 minlen, u64 *trimmed)
>> +{
>> +       u64 start = 0, len = 0;
>> +       int ret;
>> +
>> +       *trimmed = 0;
>> +
>> +       /* Not writeable = nothing to do. */
>> +       if (!device->writeable)
>> +               return 0;
>> +
>> +       /* No free space = nothing to do. */
>> +       if (device->total_bytes <= device->bytes_used)
>> +               return 0;
>> +
>> +       ret = 0;
>> +
>> +       while (1) {
>> +               struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
>> +               struct btrfs_transaction *trans;
>> +
>> +               ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
>> +               if (ret)
>> +                       return ret;
>> +
>> +               spin_lock(&fs_info->trans_lock);
>> +               trans = fs_info->running_transaction;
>> +               if (trans)
>> +                       atomic_inc(&trans->use_count);
>> +               spin_unlock(&fs_info->trans_lock);
>> +
>> +               ret = find_free_dev_extent_start(trans, device, minlen, start,
>> +                                                &start, &len);
> 
> So we can't do this safely without holding a transaction open.
> If we don't hold a transaction open, then we need to take the
> commit_root_sem because find_free_dev_extent_start() uses the commit
> root (and skips tree locking) of the device tree. We need to make sure
> the commit root, or any of the nodes/leafs it leads to, doesn't
> disappear or gets reused while we're doing the search(es) on the
> device tree. Take a look a caching_thread() in extent-tree.c for
> example.
> 
> Not holding the commit_root_sem while doing the search (calling
> find_free_dev_extent_start) can lead to the following:
> 
>               CPU 0                                                  CPU 1
> 
>      starts transaction N
> 
>      at this point root R's
>      root node (RN) == commit root node (CRN)
> 
>      RN and CRN point to child node eb X which
>      has a logical address (X->start) LAX
> 
>                                                            (doesn't
> join transaction N)
> 
> path->search_commit_root = 1
> 
> btrfs_search_slot(path)
> 
>                                                              get ref
> on eb CRN (root->commit_root)
> 
>      eb X is deleted (or COWed)
>      eb X's logical location is pinned
> 
>      btrfs_commit_transaction(N)
>        btrfs_finish_extent_commit()
>          eb X's range added back to free
>          space cache
> 
>      transaction N commit finishes
> 
>      transaction N + 1 starts
> 
>        eb X's old range is allocated for
>        some other new eb X' (or cowed eb),
>        possibly for some other root
> 
>        new stuff written for eb X'
> 
>                                                              reads
> node at logical address LAX
>                                                                --> gets eb X'
>                                                                -->
> transid/generation check fails
> 
> (expected N gets N + 1)
>                                                                --> returns -EIO
> 


Ack. Ok. I think this is the sort of thing we were trying to sort out
when we talked about this. Grabbing the commit root sem is easy enough
to integrate.

- -Jeff


> 
> Otherwise the change looks good to me.
> Thanks.
> 
>> +               if (trans)
>> +                       btrfs_put_transaction(trans);
>> +
>> +               if (ret) {
>> +                       mutex_unlock(&fs_info->chunk_mutex);
>> +                       if (ret == -ENOSPC)
>> +                               ret = 0;
>> +                       break;
>> +               }
>> +
>> +               ret = btrfs_issue_discard(device->bdev, start, len);
>> +               mutex_unlock(&fs_info->chunk_mutex);
>> +
>> +               if (ret)
>> +                       break;
>> +
>> +               start += len;
>> +               *trimmed += len;
>> +
>> +               if (fatal_signal_pending(current)) {
>> +                       ret = -ERESTARTSYS;
>> +                       break;
>> +               }
>> +
>> +               cond_resched();
>> +       }
>> +
>> +       return ret;
>> +}
>> +
>>  int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
>>  {
>>         struct btrfs_fs_info *fs_info = root->fs_info;
>>         struct btrfs_block_group_cache *cache = NULL;
>> +       struct btrfs_device *device;
>> +       struct list_head *devices;
>>         u64 group_trimmed;
>>         u64 start;
>>         u64 end;
>> @@ -10089,6 +10173,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
>>                 cache = next_block_group(fs_info->tree_root, cache);
>>         }
>>
>> +       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
>> +       devices = &root->fs_info->fs_devices->alloc_list;
>> +       list_for_each_entry(device, devices, dev_alloc_list) {
>> +               ret = btrfs_trim_free_extents(device, range->minlen,
>> +                                             &group_trimmed);
>> +               if (ret)
>> +                       break;
>> +
>> +               trimmed += group_trimmed;
>> +       }
>> +       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
>> +
>>         range->len = trimmed;
>>         return ret;
>>  }
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 96aebf3..ada8965 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -1051,15 +1051,19 @@ out:
>>         return ret;
>>  }
>>
>> -static int contains_pending_extent(struct btrfs_trans_handle *trans,
>> +static int contains_pending_extent(struct btrfs_transaction *transaction,
>>                                    struct btrfs_device *device,
>>                                    u64 *start, u64 len)
>>  {
>> +       struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
>>         struct extent_map *em;
>> -       struct list_head *search_list = &trans->transaction->pending_chunks;
>> +       struct list_head *search_list = &transaction->pending_chunks;
>>         int ret = 0;
>>         u64 physical_start = *start;
>>
>> +       if (!transaction)
>> +               search_list = &fs_info->pinned_chunks;
>> +
>>  again:
>>         list_for_each_entry(em, search_list, list) {
>>                 struct map_lookup *map;
>> @@ -1078,8 +1082,8 @@ again:
>>                         ret = 1;
>>                 }
>>         }
>> -       if (search_list == &trans->transaction->pending_chunks) {
>> -               search_list = &trans->root->fs_info->pinned_chunks;
>> +       if (search_list == &transaction->pending_chunks) {
>> +               search_list = &fs_info->pinned_chunks;
>>                 goto again;
>>         }
>>
>> @@ -1088,12 +1092,13 @@ again:
>>
>>
>>  /*
>> - * find_free_dev_extent - find free space in the specified device
>> - * @device:    the device which we search the free space in
>> - * @num_bytes: the size of the free space that we need
>> - * @start:     store the start of the free space.
>> - * @len:       the size of the free space. that we find, or the size of the max
>> - *             free space if we don't find suitable free space
>> + * find_free_dev_extent_start - find free space in the specified device
>> + * @device:      the device which we search the free space in
>> + * @num_bytes:   the size of the free space that we need
>> + * @search_start: the position from which to begin the search
>> + * @start:       store the start of the free space.
>> + * @len:         the size of the free space. that we find, or the size
>> + *               of the max free space if we don't find suitable free space
>>   *
>>   * this uses a pretty simple search, the expectation is that it is
>>   * called very infrequently and that a given device has a small number
>> @@ -1107,9 +1112,9 @@ again:
>>   * But if we don't find suitable free space, it is used to store the size of
>>   * the max free space.
>>   */
>> -int find_free_dev_extent(struct btrfs_trans_handle *trans,
>> -                        struct btrfs_device *device, u64 num_bytes,
>> -                        u64 *start, u64 *len)
>> +int find_free_dev_extent_start(struct btrfs_transaction *transaction,
>> +                              struct btrfs_device *device, u64 num_bytes,
>> +                              u64 search_start, u64 *start, u64 *len)
>>  {
>>         struct btrfs_key key;
>>         struct btrfs_root *root = device->dev_root;
>> @@ -1119,19 +1124,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
>>         u64 max_hole_start;
>>         u64 max_hole_size;
>>         u64 extent_end;
>> -       u64 search_start;
>>         u64 search_end = device->total_bytes;
>>         int ret;
>>         int slot;
>>         struct extent_buffer *l;
>>
>> -       /* FIXME use last free of some kind */
>> -
>> -       /* we don't want to overwrite the superblock on the drive,
>> -        * so we make sure to start at an offset of at least 1MB
>> -        */
>> -       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
>> -
>>         path = btrfs_alloc_path();
>>         if (!path)
>>                 return -ENOMEM;
>> @@ -1192,7 +1189,7 @@ again:
>>                          * Have to check before we set max_hole_start, otherwise
>>                          * we could end up sending back this offset anyway.
>>                          */
>> -                       if (contains_pending_extent(trans, device,
>> +                       if (contains_pending_extent(transaction, device,
>>                                                     &search_start,
>>                                                     hole_size)) {
>>                                 if (key.offset >= search_start) {
>> @@ -1241,7 +1238,7 @@ next:
>>         if (search_end > search_start) {
>>                 hole_size = search_end - search_start;
>>
>> -               if (contains_pending_extent(trans, device, &search_start,
>> +               if (contains_pending_extent(transaction, device, &search_start,
>>                                             hole_size)) {
>>                         btrfs_release_path(path);
>>                         goto again;
>> @@ -1267,6 +1264,24 @@ out:
>>         return ret;
>>  }
>>
>> +int find_free_dev_extent(struct btrfs_trans_handle *trans,
>> +                        struct btrfs_device *device, u64 num_bytes,
>> +                        u64 *start, u64 *len)
>> +{
>> +       struct btrfs_root *root = device->dev_root;
>> +       u64 search_start;
>> +
>> +       /* FIXME use last free of some kind */
>> +
>> +       /*
>> +        * we don't want to overwrite the superblock on the drive,
>> +        * so we make sure to start at an offset of at least 1MB
>> +        */
>> +       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
>> +       return find_free_dev_extent_start(trans->transaction, device,
>> +                                         num_bytes, search_start, start, len);
>> +}
>> +
>>  static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
>>                           struct btrfs_device *device,
>>                           u64 start, u64 *dev_extent_len)
>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>> index ebc3133..30918a8 100644
>> --- a/fs/btrfs/volumes.h
>> +++ b/fs/btrfs/volumes.h
>> @@ -449,6 +449,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
>>  int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
>>  int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
>>  int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
>> +int find_free_dev_extent_start(struct btrfs_transaction *transaction,
>> +                        struct btrfs_device *device, u64 num_bytes,
>> +                        u64 search_start, u64 *start, u64 *max_avail);
>>  int find_free_dev_extent(struct btrfs_trans_handle *trans,
>>                          struct btrfs_device *device, u64 num_bytes,
>>                          u64 *start, u64 *max_avail);
>> --
>> 1.8.5.6
>>
>>
>> --
>> Jeff Mahoney
>> SUSE Labs
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> 


- -- 
Jeff Mahoney
SUSE Labs
-----BEGIN PGP SIGNATURE-----
Version: GnuPG/MacGPG2 v2.0.22 (Darwin)

iQIcBAEBAgAGBQJVVJsJAAoJEB57S2MheeWyllUP/05DsybAH7NbOGSDV6l6Pjec
uPqwebADqgDnGFuHMegDsvn+92cbaJvUNbOkzGws41c6zyh4dga0yCvpdIW0Kie1
i1x1VcbgKQh4sRnd9aZibRTi0Tm1WFareklx3+jIQAUPzJOAjG7B4DHSKGuTF3Y8
XxMSRg5I4vuniZaIZhAAhpCKf5vPkWAh6LqdZhJMAjh0pAfMAkD3KbpKkFwwS+ES
o3hNjKUD17oRZCdIuz7wDe7mS/8fdJ2VLYXMZEz701zIsx2TAZQudZd89tg6f/1M
fX5EPCJmGlYmwUiE7yPMt1iuO4j7kJ9CiPGVgBwMaMMsFkQ2EAvBAAoDjiuIl9em
7Mw+1SJ0+R4EpxhRzOyo6gucgHkyDuN4ynNdoVv54ZKpx/kkuiEkvsSaT6fxSsTk
6ccT3l1hj+u70955TuiDpMQwL32MQMM+0ryoa0r4zAX1AhTxAQ3z9CIVtfv9FsuT
StXEMpj7plYWxzu6mVqg60q5pFlEYxlkJrydyaW1v+Ogc8JAdWSQ+tdQemr78k39
v6JURusA9683psUFXOpdBfdn6LivnNMvqHXvEMI6xOBT0TQWj2BO6h1x6DpBAxrq
6jCGZkvLAf2MmFm1+3T5UOq6WRWkT6E9/Ot0WmMhKeWOUWEpY+Aq2QHNxrusZPOT
r4dCPVzo+Um9xIME+Ran
=DG/0
-----END PGP SIGNATURE-----
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Mahoney May 14, 2015, 1:07 p.m. UTC | #10
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 5/14/15 5:33 AM, Filipe David Manana wrote:
> On Fri, May 8, 2015 at 11:45 PM, Jeff Mahoney <jeffm@suse.com>
> wrote:
>> 
>>> On May 8, 2015, at 5:16 PM, Filipe David Manana
>>> <fdmanana@gmail.com> wrote:
>>> 
>>>> On Fri, May 8, 2015 at 9:38 PM, Filipe David Manana
>>>> <fdmanana@gmail.com> wrote:
>>>>> On Tue, May 5, 2015 at 7:19 PM, Jeff Mahoney
>>>>> <jeffm@suse.com> wrote: Since we now clean up block groups
>>>>> automatically as they become empty, iterating over block
>>>>> groups is no longer sufficient to discard unused space.
>>>>> 
>>>>> This patch iterates over the unused chunk space and
>>>>> discards any regions that are unallocated, regardless of
>>>>> whether they were ever used.  This is a change for btrfs
>>>>> but is consistent with other file systems.
>>>>> 
>>>>> We do this in a transactionless manner since the discard
>>>>> process can take a substantial amount of time and a
>>>>> transaction would need to be started before the acquisition
>>>>> of the device list lock.  That would mean a transaction
>>>>> would be held open across /all/ of the discards
>>>>> collectively. In order to prevent other threads from
>>>>> allocating or freeing chunks, we hold the chunks lock
>>>>> across the search and discard calls.  We release it between
>>>>> searches to allow the file system to perform more-or-less 
>>>>> normally.  Since the running transaction can commit and
>>>>> disappear while we're using the transaction pointer, we
>>>>> take a reference to it and release it after the search.
>>>>> This is safe since it would happen normally at the end of
>>>>> the transaction commit after any locks are released
>>>>> anyway.
>>>>> 
>>>>> Signed-off-by: Jeff Mahoney <jeffm@suse.com> ---
>>>> 
>>>> Hi Jeff,
>>>> 
>>>> Missing changelog describing what changed between v1 and v2.
>>>> 
>>>>> fs/btrfs/extent-tree.c | 96
>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++ 
>>>>> fs/btrfs/volumes.c     | 61
>>>>> ++++++++++++++++++++------------ fs/btrfs/volumes.h     |
>>>>> 3 ++ 3 files changed, 137 insertions(+), 23 deletions(-)
>>>>> 
>>>>> diff --git a/fs/btrfs/extent-tree.c
>>>>> b/fs/btrfs/extent-tree.c index 0ec8e22..6d1d74d 100644 ---
>>>>> a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@
>>>>> -10031,10 +10031,94 @@ int
>>>>> btrfs_error_unpin_extent_range(struct btrfs_root *root, u64
>>>>> start, u64 end) return unpin_extent_range(root, start, end,
>>>>> false); }
>>>>> 
>>>>> +/* + * It used to be that old block groups would be left
>>>>> around forever. + * Iterating over them would be enough to
>>>>> trim unused space.  Since we + * now automatically remove
>>>>> them, we also need to iterate over unallocated + * space. +
>>>>> * + * We don't want a transaction for this since the
>>>>> discard may take a + * substantial amount of time.  We
>>>>> don't require that a transaction be + * running, but we do
>>>>> need to take a running transaction into account + * to
>>>>> ensure that we're not discarding chunks that were released
>>>>> in + * the current transaction. + * + * Holding the chunks
>>>>> lock will prevent other threads from allocating + * or
>>>>> releasing chunks, but it won't prevent a running
>>>>> transaction + * from committing and releasing the memory
>>>>> that the pending chunks + * list head uses.  For that, we
>>>>> need to take a reference to the + * transaction. + */ 
>>>>> +static int btrfs_trim_free_extents(struct btrfs_device
>>>>> *device, +                                  u64 minlen, u64
>>>>> *trimmed) +{ +       u64 start = 0, len = 0; +       int
>>>>> ret; + +       *trimmed = 0; + +       /* Not writeable =
>>>>> nothing to do. */ +       if (!device->writeable) +
>>>>> return 0; + +       /* No free space = nothing to do. */ +
>>>>> if (device->total_bytes <= device->bytes_used) +
>>>>> return 0; + +       ret = 0; + +       while (1) { +
>>>>> struct btrfs_fs_info *fs_info = device->dev_root->fs_info; 
>>>>> +               struct btrfs_transaction *trans; + +
>>>>> ret = mutex_lock_interruptible(&fs_info->chunk_mutex); +
>>>>> if (ret) +                       return ret; + +
>>>>> spin_lock(&fs_info->trans_lock); +               trans =
>>>>> fs_info->running_transaction; +               if (trans) +
>>>>> atomic_inc(&trans->use_count); +
>>>>> spin_unlock(&fs_info->trans_lock); + +               ret =
>>>>> find_free_dev_extent_start(trans, device, minlen, start, +
>>>>> &start, &len); +               if (trans) +
>>>>> btrfs_put_transaction(trans); + +               if (ret) { 
>>>>> +
>>>>> mutex_unlock(&fs_info->chunk_mutex); +
>>>>> if (ret == -ENOSPC) +                               ret =
>>>>> 0; +                       break; +               } + +
>>>>> ret = btrfs_issue_discard(device->bdev, start, len); +
>>>>> mutex_unlock(&fs_info->chunk_mutex); + +               if
>>>>> (ret) +                       break; + +
>>>>> start += len; +               *trimmed += len; + +
>>>>> if (fatal_signal_pending(current)) { +
>>>>> ret = -ERESTARTSYS; +                       break; +
>>>>> } + +               cond_resched(); +       } + +
>>>>> return ret; +} + int btrfs_trim_fs(struct btrfs_root *root,
>>>>> struct fstrim_range *range) { struct btrfs_fs_info *fs_info
>>>>> = root->fs_info; struct btrfs_block_group_cache *cache =
>>>>> NULL; +       struct btrfs_device *device; +       struct
>>>>> list_head *devices; u64 group_trimmed; u64 start; u64 end; 
>>>>> @@ -10089,6 +10173,18 @@ int btrfs_trim_fs(struct
>>>>> btrfs_root *root, struct fstrim_range *range) cache =
>>>>> next_block_group(fs_info->tree_root, cache); }
>>>>> 
>>>>> +
>>>>> mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 
>>>>> +       devices = &root->fs_info->fs_devices->alloc_list; +
>>>>> list_for_each_entry(device, devices, dev_alloc_list) { +
>>>>> ret = btrfs_trim_free_extents(device, range->minlen, +
>>>>> &group_trimmed); +               if (ret) +
>>>>> break; + +               trimmed += group_trimmed; +
>>>>> } +
>>>>> mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
>>>>>
>>>>> 
+
>>>>> range->len = trimmed; return ret; } diff --git
>>>>> a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index
>>>>> 96aebf3..ada8965 100644 --- a/fs/btrfs/volumes.c +++
>>>>> b/fs/btrfs/volumes.c @@ -1051,15 +1051,19 @@ out: return
>>>>> ret; }
>>>>> 
>>>>> -static int contains_pending_extent(struct
>>>>> btrfs_trans_handle *trans, +static int
>>>>> contains_pending_extent(struct btrfs_transaction
>>>>> *transaction, struct btrfs_device *device, u64 *start, u64
>>>>> len) { +       struct btrfs_fs_info *fs_info =
>>>>> device->dev_root->fs_info; struct extent_map *em; -
>>>>> struct list_head *search_list =
>>>>> &trans->transaction->pending_chunks; +       struct
>>>>> list_head *search_list = &transaction->pending_chunks;
>>>> 
>>>> transaction can be NULL so we need to check for that first
>>>> (as you do below).
>>>> 
>>>>> int ret = 0; u64 physical_start = *start;
>>>>> 
>>>>> +       if (!transaction) +               search_list =
>>>>> &fs_info->pinned_chunks; + again: list_for_each_entry(em,
>>>>> search_list, list) { struct map_lookup *map; @@ -1078,8
>>>>> +1082,8 @@ again: ret = 1; } } -       if (search_list ==
>>>>> &trans->transaction->pending_chunks) { -
>>>>> search_list = &trans->root->fs_info->pinned_chunks; +
>>>>> if (search_list == &transaction->pending_chunks) {
>>> 
>>> And here missing the null test too, which should be something
>>> like:
>>> 
>>> if (transaction && search_list == &transaction->pending_chunks)
>>> {
>>> 
>> 
>> That's the same case as above. We're not dereferencing
>> transaction here either. The test in the if statement will fail
>> because &transaction->pending_chunks will be 0x100 or something.
> 
> Right. Sorry friday evening review. What you are doing is valid
> then and something very close to the offsetof implementation.

Exactly. I got enough pushback on it that I just changed it around so
there aren't any questions, though.

- -Jeff


- -- 
Jeff Mahoney
SUSE Labs
-----BEGIN PGP SIGNATURE-----
Version: GnuPG/MacGPG2 v2.0.19 (Darwin)

iQIcBAEBAgAGBQJVVJ4nAAoJEB57S2MheeWyOJUQAKRdRkTzw6QGPBbEHL4mz5fV
u/1EUvnsf3nUf1xvjxhnimAkxdVFJ66jV4Et3MaZXQwx/ejVByn4VfT7wi58AiYD
x3mzbGgMBB2lxHrS4OZD2yO7nh+n9IRSKE3yVuKkqgXWqL0meyJ5FvU3cpVWaj5u
v7ka5pyFEIlaymJqcvFJX57uKYKgUKJkAreGYqEvvCpZYc+BWibmp1svsN66824l
kO0aBLIz0WCPc0CuG3i9YSRXMSIbf8sV6QygpQiaCQ2WfJiDNZINjNwUzK+mwBq9
UtMs3LiyqRMMIINSM6Ey14s842ov6Xh3DsIDS1YzqzyE0it4tYjS6o5lTK2vpmrM
RQVDGxu3btlFlWfTPOP32Ts1tOWdHB6bMOezca53p3I/sLvA5g5snonqbEDPltov
DnHNTskDdFAuh/3QzTpGHU93cPZJE3vp8bpQfc1UQuUs9NzN/b26ghxau1Rto87q
v0KKcp9qnU+2nQ/6TxKekudKBWTT+glene73ddHct0WRwtQWwPIZihL028llzHF4
Iu9JEE3ov879PMTvUzFFfc6cSKPQGy9UNjn/O0E0G6km/GFAW/BHNDeywa0O+Smh
DPJ4GLXYHKw4JHqGYtbuoMwt2KXZd/9XmCUiBPo0s+wRCqz9Ms3J1T1UMmhqJDOt
lLa/DggUBMYfjGOVHS/2
=WAYC
-----END PGP SIGNATURE-----
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0ec8e22..6d1d74d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10031,10 +10031,94 @@  int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 	return unpin_extent_range(root, start, end, false);
 }
 
+/*
+ * It used to be that old block groups would be left around forever.
+ * Iterating over them would be enough to trim unused space.  Since we
+ * now automatically remove them, we also need to iterate over unallocated
+ * space.
+ *
+ * We don't want a transaction for this since the discard may take a
+ * substantial amount of time.  We don't require that a transaction be
+ * running, but we do need to take a running transaction into account
+ * to ensure that we're not discarding chunks that were released in
+ * the current transaction.
+ *
+ * Holding the chunks lock will prevent other threads from allocating
+ * or releasing chunks, but it won't prevent a running transaction
+ * from committing and releasing the memory that the pending chunks
+ * list head uses.  For that, we need to take a reference to the
+ * transaction.
+ */
+static int btrfs_trim_free_extents(struct btrfs_device *device,
+				   u64 minlen, u64 *trimmed)
+{
+	u64 start = 0, len = 0;
+	int ret;
+
+	*trimmed = 0;
+
+	/* Not writeable = nothing to do. */
+	if (!device->writeable)
+		return 0;
+
+	/* No free space = nothing to do. */
+	if (device->total_bytes <= device->bytes_used)
+		return 0;
+
+	ret = 0;
+
+	while (1) {
+		struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+		struct btrfs_transaction *trans;
+
+		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
+		if (ret)
+			return ret;
+
+		spin_lock(&fs_info->trans_lock);
+		trans = fs_info->running_transaction;
+		if (trans)
+			atomic_inc(&trans->use_count);
+		spin_unlock(&fs_info->trans_lock);
+
+		ret = find_free_dev_extent_start(trans, device, minlen, start,
+						 &start, &len);
+		if (trans)
+			btrfs_put_transaction(trans);
+
+		if (ret) {
+			mutex_unlock(&fs_info->chunk_mutex);
+			if (ret == -ENOSPC)
+				ret = 0;
+			break;
+		}
+
+		ret = btrfs_issue_discard(device->bdev, start, len);
+		mutex_unlock(&fs_info->chunk_mutex);
+
+		if (ret)
+			break;
+
+		start += len;
+		*trimmed += len;
+
+		if (fatal_signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		cond_resched();
+	}
+
+	return ret;
+}
+
 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_group_cache *cache = NULL;
+	struct btrfs_device *device;
+	struct list_head *devices;
 	u64 group_trimmed;
 	u64 start;
 	u64 end;
@@ -10089,6 +10173,18 @@  int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 		cache = next_block_group(fs_info->tree_root, cache);
 	}
 
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	devices = &root->fs_info->fs_devices->alloc_list;
+	list_for_each_entry(device, devices, dev_alloc_list) {
+		ret = btrfs_trim_free_extents(device, range->minlen,
+					      &group_trimmed);
+		if (ret)
+			break;
+
+		trimmed += group_trimmed;
+	}
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
 	range->len = trimmed;
 	return ret;
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 96aebf3..ada8965 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1051,15 +1051,19 @@  out:
 	return ret;
 }
 
-static int contains_pending_extent(struct btrfs_trans_handle *trans,
+static int contains_pending_extent(struct btrfs_transaction *transaction,
 				   struct btrfs_device *device,
 				   u64 *start, u64 len)
 {
+	struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
 	struct extent_map *em;
-	struct list_head *search_list = &trans->transaction->pending_chunks;
+	struct list_head *search_list = &transaction->pending_chunks;
 	int ret = 0;
 	u64 physical_start = *start;
 
+	if (!transaction)
+		search_list = &fs_info->pinned_chunks;
+
 again:
 	list_for_each_entry(em, search_list, list) {
 		struct map_lookup *map;
@@ -1078,8 +1082,8 @@  again:
 			ret = 1;
 		}
 	}
-	if (search_list == &trans->transaction->pending_chunks) {
-		search_list = &trans->root->fs_info->pinned_chunks;
+	if (search_list == &transaction->pending_chunks) {
+		search_list = &fs_info->pinned_chunks;
 		goto again;
 	}
 
@@ -1088,12 +1092,13 @@  again:
 
 
 /*
- * find_free_dev_extent - find free space in the specified device
- * @device:	the device which we search the free space in
- * @num_bytes:	the size of the free space that we need
- * @start:	store the start of the free space.
- * @len:	the size of the free space. that we find, or the size of the max
- * 		free space if we don't find suitable free space
+ * find_free_dev_extent_start - find free space in the specified device
+ * @device:	  the device which we search the free space in
+ * @num_bytes:	  the size of the free space that we need
+ * @search_start: the position from which to begin the search
+ * @start:	  store the start of the free space.
+ * @len:	  the size of the free space. that we find, or the size
+ *		  of the max free space if we don't find suitable free space
  *
  * this uses a pretty simple search, the expectation is that it is
  * called very infrequently and that a given device has a small number
@@ -1107,9 +1112,9 @@  again:
  * But if we don't find suitable free space, it is used to store the size of
  * the max free space.
  */
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-			 struct btrfs_device *device, u64 num_bytes,
-			 u64 *start, u64 *len)
+int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+			       struct btrfs_device *device, u64 num_bytes,
+			       u64 search_start, u64 *start, u64 *len)
 {
 	struct btrfs_key key;
 	struct btrfs_root *root = device->dev_root;
@@ -1119,19 +1124,11 @@  int find_free_dev_extent(struct btrfs_trans_handle *trans,
 	u64 max_hole_start;
 	u64 max_hole_size;
 	u64 extent_end;
-	u64 search_start;
 	u64 search_end = device->total_bytes;
 	int ret;
 	int slot;
 	struct extent_buffer *l;
 
-	/* FIXME use last free of some kind */
-
-	/* we don't want to overwrite the superblock on the drive,
-	 * so we make sure to start at an offset of at least 1MB
-	 */
-	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -1192,7 +1189,7 @@  again:
 			 * Have to check before we set max_hole_start, otherwise
 			 * we could end up sending back this offset anyway.
 			 */
-			if (contains_pending_extent(trans, device,
+			if (contains_pending_extent(transaction, device,
 						    &search_start,
 						    hole_size)) {
 				if (key.offset >= search_start) {
@@ -1241,7 +1238,7 @@  next:
 	if (search_end > search_start) {
 		hole_size = search_end - search_start;
 
-		if (contains_pending_extent(trans, device, &search_start,
+		if (contains_pending_extent(transaction, device, &search_start,
 					    hole_size)) {
 			btrfs_release_path(path);
 			goto again;
@@ -1267,6 +1264,24 @@  out:
 	return ret;
 }
 
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+			 struct btrfs_device *device, u64 num_bytes,
+			 u64 *start, u64 *len)
+{
+	struct btrfs_root *root = device->dev_root;
+	u64 search_start;
+
+	/* FIXME use last free of some kind */
+
+	/*
+	 * we don't want to overwrite the superblock on the drive,
+	 * so we make sure to start at an offset of at least 1MB
+	 */
+	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+	return find_free_dev_extent_start(trans->transaction, device,
+					  num_bytes, search_start, start, len);
+}
+
 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 			  struct btrfs_device *device,
 			  u64 start, u64 *dev_extent_len)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ebc3133..30918a8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -449,6 +449,9 @@  int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+			 struct btrfs_device *device, u64 num_bytes,
+			 u64 search_start, u64 *start, u64 *max_avail);
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_device *device, u64 num_bytes,
 			 u64 *start, u64 *max_avail);