[13/19] btrfs: have multiple discard lists
diff mbox series

Message ID 87b5ef751e8febd481065e475bd53b276e670ff6.1570479299.git.dennis@kernel.org
State New
Headers show
Series
  • btrfs: async discard support
Related show

Commit Message

Dennis Zhou Oct. 7, 2019, 8:17 p.m. UTC
Non-block group destruction discarding currently only had a single list
with no minimum discard length. This can lead to caravaning more
meaningful discards behind a heavily fragmented block group.

This adds support for multiple lists with minimum discard lengths to
prevent the caravan effect. We promote block groups back up when we
exceed the BTRFS_DISCARD_MAX_FILTER size, currently we support only 2
lists with filters of 1MB and 32KB respectively.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
---
 fs/btrfs/ctree.h            |  2 +-
 fs/btrfs/discard.c          | 60 +++++++++++++++++++++++++++++++++----
 fs/btrfs/discard.h          |  4 +++
 fs/btrfs/free-space-cache.c | 37 +++++++++++++++--------
 fs/btrfs/free-space-cache.h |  2 +-
 5 files changed, 85 insertions(+), 20 deletions(-)

Comments

Josef Bacik Oct. 10, 2019, 4:51 p.m. UTC | #1
On Mon, Oct 07, 2019 at 04:17:44PM -0400, Dennis Zhou wrote:
> Non-block group destruction discarding currently only had a single list
> with no minimum discard length. This can lead to caravaning more
> meaningful discards behind a heavily fragmented block group.
> 
> This adds support for multiple lists with minimum discard lengths to
> prevent the caravan effect. We promote block groups back up when we
> exceed the BTRFS_DISCARD_MAX_FILTER size, currently we support only 2
> lists with filters of 1MB and 32KB respectively.
> 
> Signed-off-by: Dennis Zhou <dennis@kernel.org>
> ---
>  fs/btrfs/ctree.h            |  2 +-
>  fs/btrfs/discard.c          | 60 +++++++++++++++++++++++++++++++++----
>  fs/btrfs/discard.h          |  4 +++
>  fs/btrfs/free-space-cache.c | 37 +++++++++++++++--------
>  fs/btrfs/free-space-cache.h |  2 +-
>  5 files changed, 85 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index e81f699347e0..b5608f8dc41a 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -439,7 +439,7 @@ struct btrfs_full_stripe_locks_tree {
>  };
>  
>  /* discard control */
> -#define BTRFS_NR_DISCARD_LISTS		2
> +#define BTRFS_NR_DISCARD_LISTS		3
>  
>  struct btrfs_discard_ctl {
>  	struct workqueue_struct *discard_workers;
> diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
> index 072c73f48297..296cbffc5957 100644
> --- a/fs/btrfs/discard.c
> +++ b/fs/btrfs/discard.c
> @@ -20,6 +20,10 @@
>  #define BTRFS_DISCARD_MAX_DELAY		(10000UL)
>  #define BTRFS_DISCARD_MAX_IOPS		(10UL)
>  
> +/* monotonically decreasing filters after 0 */
> +static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {0,
> +	BTRFS_DISCARD_MAX_FILTER, BTRFS_DISCARD_MIN_FILTER};
> +
>  static struct list_head *
>  btrfs_get_discard_list(struct btrfs_discard_ctl *discard_ctl,
>  		       struct btrfs_block_group_cache *cache)
> @@ -120,7 +124,7 @@ find_next_cache(struct btrfs_discard_ctl *discard_ctl, u64 now)
>  }
>  
>  static struct btrfs_block_group_cache *
> -peek_discard_list(struct btrfs_discard_ctl *discard_ctl)
> +peek_discard_list(struct btrfs_discard_ctl *discard_ctl, int *discard_index)
>  {
>  	struct btrfs_block_group_cache *cache;
>  	u64 now = ktime_get_ns();
> @@ -132,6 +136,7 @@ peek_discard_list(struct btrfs_discard_ctl *discard_ctl)
>  
>  	if (cache && now > cache->discard_delay) {
>  		discard_ctl->cache = cache;
> +		*discard_index = cache->discard_index;
>  		if (cache->discard_index == 0 &&
>  		    cache->free_space_ctl->free_space != cache->key.offset) {
>  			__btrfs_add_to_discard_list(discard_ctl, cache);
> @@ -150,6 +155,36 @@ peek_discard_list(struct btrfs_discard_ctl *discard_ctl)
>  	return cache;
>  }
>  
> +void btrfs_discard_check_filter(struct btrfs_block_group_cache *cache,
> +				u64 bytes)
> +{
> +	struct btrfs_discard_ctl *discard_ctl;
> +
> +	if (!cache || !btrfs_test_opt(cache->fs_info, DISCARD_ASYNC))
> +		return;
> +
> +	discard_ctl = &cache->fs_info->discard_ctl;
> +
> +	if (cache && cache->discard_index > 1 &&
> +	    bytes >= BTRFS_DISCARD_MAX_FILTER) {
> +		remove_from_discard_list(discard_ctl, cache);
> +		cache->discard_index = 1;

Really need names here, I have no idea what 1 is.

> +		btrfs_add_to_discard_list(discard_ctl, cache);
> +	}
> +}
> +
> +static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
> +				       struct btrfs_block_group_cache *cache)
> +{
> +	cache->discard_index++;
> +	if (cache->discard_index == BTRFS_NR_DISCARD_LISTS) {
> +		cache->discard_index = 1;
> +		return;
> +	}
> +
> +	btrfs_add_to_discard_list(discard_ctl, cache);
> +}
> +
>  void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
>  			       struct btrfs_block_group_cache *cache)
>  {
> @@ -202,23 +237,34 @@ static void btrfs_discard_workfn(struct work_struct *work)
>  {
>  	struct btrfs_discard_ctl *discard_ctl;
>  	struct btrfs_block_group_cache *cache;
> +	int discard_index = 0;
>  	u64 trimmed = 0;
> +	u64 minlen = 0;
>  
>  	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
>  
>  again:
> -	cache = peek_discard_list(discard_ctl);
> +	cache = peek_discard_list(discard_ctl, &discard_index);
>  	if (!cache || !btrfs_run_discard_work(discard_ctl))
>  		return;
>  
> -	if (btrfs_discard_bitmaps(cache))
> +	minlen = discard_minlen[discard_index];
> +
> +	if (btrfs_discard_bitmaps(cache)) {
> +		u64 maxlen = 0;
> +
> +		if (discard_index)
> +			maxlen = discard_minlen[discard_index - 1];
> +
>  		btrfs_trim_block_group_bitmaps(cache, &trimmed,
>  					       cache->discard_cursor,
>  					       btrfs_block_group_end(cache),
> -					       0, true);
> -	else
> +					       minlen, maxlen, true);
> +	} else {
>  		btrfs_trim_block_group(cache, &trimmed, cache->discard_cursor,
> -				       btrfs_block_group_end(cache), 0, true);
> +				       btrfs_block_group_end(cache),
> +				       minlen, true);
> +	}
>  
>  	discard_ctl->prev_discard = trimmed;
>  
> @@ -231,6 +277,8 @@ static void btrfs_discard_workfn(struct work_struct *work)
>  				 cache->key.offset)
>  				btrfs_add_to_discard_free_list(discard_ctl,
>  							       cache);
> +			else
> +				btrfs_update_discard_index(discard_ctl, cache);
>  		} else {
>  			cache->discard_cursor = cache->key.objectid;
>  			cache->discard_flags |= BTRFS_DISCARD_BITMAPS;
> diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
> index 898dd92dbf8f..1daa8da4a1b5 100644
> --- a/fs/btrfs/discard.h
> +++ b/fs/btrfs/discard.h
> @@ -18,6 +18,8 @@
>  
>  /* discard size limits */
>  #define BTRFS_DISCARD_MAX_SIZE		(SZ_64M)
> +#define BTRFS_DISCARD_MAX_FILTER	(SZ_1M)
> +#define BTRFS_DISCARD_MIN_FILTER	(SZ_32K)
>  
>  /* discard flags */
>  #define BTRFS_DISCARD_RESET_CURSOR	(1UL << 0)
> @@ -39,6 +41,8 @@ void btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
>  			       struct btrfs_block_group_cache *cache);
>  void btrfs_add_to_discard_free_list(struct btrfs_discard_ctl *discard_ctl,
>  				    struct btrfs_block_group_cache *cache);
> +void btrfs_discard_check_filter(struct btrfs_block_group_cache *cache,
> +				u64 bytes);
>  void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info);
>  
>  void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index ce33803a45b2..ed35dc090df6 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -2471,6 +2471,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
>  	if (ret)
>  		kmem_cache_free(btrfs_free_space_cachep, info);
>  out:
> +	btrfs_discard_check_filter(cache, bytes);

So we're only accounting the new space?  What if we merge with a larger area
here?  We should probably make our decision based on the actual trimable area.

>  	btrfs_discard_update_discardable(cache, ctl);
>  	spin_unlock(&ctl->tree_lock);
>  
> @@ -3409,7 +3410,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
>  				goto next;
>  			}
>  			unlink_free_space(ctl, entry);
> -			if (bytes > BTRFS_DISCARD_MAX_SIZE) {
> +			/*
> +			 * Let bytes = BTRFS_DISCARD_MAX_SIZE + X.
> +			 * If X < BTRFS_DISCARD_MIN_FILTER, we won't trim X when
> +			 * we come back around.  So trim it now.
> +			 */
> +			if (bytes > (BTRFS_DISCARD_MAX_SIZE +
> +				     BTRFS_DISCARD_MIN_FILTER)) {
>  				bytes = extent_bytes = BTRFS_DISCARD_MAX_SIZE;
>  				entry->offset += BTRFS_DISCARD_MAX_SIZE;
>  				entry->bytes -= BTRFS_DISCARD_MAX_SIZE;
> @@ -3510,7 +3517,7 @@ static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
>  
>  static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
>  			u64 *total_trimmed, u64 start, u64 end, u64 minlen,
> -			bool async)
> +			u64 maxlen, bool async)
>  {
>  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
>  	struct btrfs_free_space *entry;
> @@ -3535,7 +3542,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
>  		}
>  
>  		entry = tree_search_offset(ctl, offset, 1, 0);
> -		if (!entry || (async && start == offset &&
> +		if (!entry || (async && minlen && start == offset &&
>  			       btrfs_free_space_trimmed(entry))) {

Huh?  Why do we care if minlen is set if our entry is already trimmed?  If we're
already trimmed we should just skip it even with minlen set, right?  Thanks,

Josef
Dennis Zhou Oct. 14, 2019, 8:04 p.m. UTC | #2
On Thu, Oct 10, 2019 at 12:51:01PM -0400, Josef Bacik wrote:
> On Mon, Oct 07, 2019 at 04:17:44PM -0400, Dennis Zhou wrote:
> > Non-block group destruction discarding currently only had a single list
> > with no minimum discard length. This can lead to caravaning more
> > meaningful discards behind a heavily fragmented block group.
> > 
> > This adds support for multiple lists with minimum discard lengths to
> > prevent the caravan effect. We promote block groups back up when we
> > exceed the BTRFS_DISCARD_MAX_FILTER size, currently we support only 2
> > lists with filters of 1MB and 32KB respectively.
> > 
> > Signed-off-by: Dennis Zhou <dennis@kernel.org>
> > ---
> >  fs/btrfs/ctree.h            |  2 +-
> >  fs/btrfs/discard.c          | 60 +++++++++++++++++++++++++++++++++----
> >  fs/btrfs/discard.h          |  4 +++
> >  fs/btrfs/free-space-cache.c | 37 +++++++++++++++--------
> >  fs/btrfs/free-space-cache.h |  2 +-
> >  5 files changed, 85 insertions(+), 20 deletions(-)
> > 
> > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> > index e81f699347e0..b5608f8dc41a 100644
> > --- a/fs/btrfs/ctree.h
> > +++ b/fs/btrfs/ctree.h
> > @@ -439,7 +439,7 @@ struct btrfs_full_stripe_locks_tree {
> >  };
> >  
> >  /* discard control */
> > -#define BTRFS_NR_DISCARD_LISTS		2
> > +#define BTRFS_NR_DISCARD_LISTS		3
> >  
> >  struct btrfs_discard_ctl {
> >  	struct workqueue_struct *discard_workers;
> > diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
> > index 072c73f48297..296cbffc5957 100644
> > --- a/fs/btrfs/discard.c
> > +++ b/fs/btrfs/discard.c
> > @@ -20,6 +20,10 @@
> >  #define BTRFS_DISCARD_MAX_DELAY		(10000UL)
> >  #define BTRFS_DISCARD_MAX_IOPS		(10UL)
> >  
> > +/* monotonically decreasing filters after 0 */
> > +static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {0,
> > +	BTRFS_DISCARD_MAX_FILTER, BTRFS_DISCARD_MIN_FILTER};
> > +
> >  static struct list_head *
> >  btrfs_get_discard_list(struct btrfs_discard_ctl *discard_ctl,
> >  		       struct btrfs_block_group_cache *cache)
> > @@ -120,7 +124,7 @@ find_next_cache(struct btrfs_discard_ctl *discard_ctl, u64 now)
> >  }
> >  
> >  static struct btrfs_block_group_cache *
> > -peek_discard_list(struct btrfs_discard_ctl *discard_ctl)
> > +peek_discard_list(struct btrfs_discard_ctl *discard_ctl, int *discard_index)
> >  {
> >  	struct btrfs_block_group_cache *cache;
> >  	u64 now = ktime_get_ns();
> > @@ -132,6 +136,7 @@ peek_discard_list(struct btrfs_discard_ctl *discard_ctl)
> >  
> >  	if (cache && now > cache->discard_delay) {
> >  		discard_ctl->cache = cache;
> > +		*discard_index = cache->discard_index;
> >  		if (cache->discard_index == 0 &&
> >  		    cache->free_space_ctl->free_space != cache->key.offset) {
> >  			__btrfs_add_to_discard_list(discard_ctl, cache);
> > @@ -150,6 +155,36 @@ peek_discard_list(struct btrfs_discard_ctl *discard_ctl)
> >  	return cache;
> >  }
> >  
> > +void btrfs_discard_check_filter(struct btrfs_block_group_cache *cache,
> > +				u64 bytes)
> > +{
> > +	struct btrfs_discard_ctl *discard_ctl;
> > +
> > +	if (!cache || !btrfs_test_opt(cache->fs_info, DISCARD_ASYNC))
> > +		return;
> > +
> > +	discard_ctl = &cache->fs_info->discard_ctl;
> > +
> > +	if (cache && cache->discard_index > 1 &&
> > +	    bytes >= BTRFS_DISCARD_MAX_FILTER) {
> > +		remove_from_discard_list(discard_ctl, cache);
> > +		cache->discard_index = 1;
> 
> Really need names here, I have no idea what 1 is.
> 

Yep, done.

> > +		btrfs_add_to_discard_list(discard_ctl, cache);
> > +	}
> > +}
> > +
> > +static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
> > +				       struct btrfs_block_group_cache *cache)
> > +{
> > +	cache->discard_index++;
> > +	if (cache->discard_index == BTRFS_NR_DISCARD_LISTS) {
> > +		cache->discard_index = 1;
> > +		return;
> > +	}
> > +
> > +	btrfs_add_to_discard_list(discard_ctl, cache);
> > +}
> > +
> >  void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
> >  			       struct btrfs_block_group_cache *cache)
> >  {
> > @@ -202,23 +237,34 @@ static void btrfs_discard_workfn(struct work_struct *work)
> >  {
> >  	struct btrfs_discard_ctl *discard_ctl;
> >  	struct btrfs_block_group_cache *cache;
> > +	int discard_index = 0;
> >  	u64 trimmed = 0;
> > +	u64 minlen = 0;
> >  
> >  	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
> >  
> >  again:
> > -	cache = peek_discard_list(discard_ctl);
> > +	cache = peek_discard_list(discard_ctl, &discard_index);
> >  	if (!cache || !btrfs_run_discard_work(discard_ctl))
> >  		return;
> >  
> > -	if (btrfs_discard_bitmaps(cache))
> > +	minlen = discard_minlen[discard_index];
> > +
> > +	if (btrfs_discard_bitmaps(cache)) {
> > +		u64 maxlen = 0;
> > +
> > +		if (discard_index)
> > +			maxlen = discard_minlen[discard_index - 1];
> > +
> >  		btrfs_trim_block_group_bitmaps(cache, &trimmed,
> >  					       cache->discard_cursor,
> >  					       btrfs_block_group_end(cache),
> > -					       0, true);
> > -	else
> > +					       minlen, maxlen, true);
> > +	} else {
> >  		btrfs_trim_block_group(cache, &trimmed, cache->discard_cursor,
> > -				       btrfs_block_group_end(cache), 0, true);
> > +				       btrfs_block_group_end(cache),
> > +				       minlen, true);
> > +	}
> >  
> >  	discard_ctl->prev_discard = trimmed;
> >  
> > @@ -231,6 +277,8 @@ static void btrfs_discard_workfn(struct work_struct *work)
> >  				 cache->key.offset)
> >  				btrfs_add_to_discard_free_list(discard_ctl,
> >  							       cache);
> > +			else
> > +				btrfs_update_discard_index(discard_ctl, cache);
> >  		} else {
> >  			cache->discard_cursor = cache->key.objectid;
> >  			cache->discard_flags |= BTRFS_DISCARD_BITMAPS;
> > diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
> > index 898dd92dbf8f..1daa8da4a1b5 100644
> > --- a/fs/btrfs/discard.h
> > +++ b/fs/btrfs/discard.h
> > @@ -18,6 +18,8 @@
> >  
> >  /* discard size limits */
> >  #define BTRFS_DISCARD_MAX_SIZE		(SZ_64M)
> > +#define BTRFS_DISCARD_MAX_FILTER	(SZ_1M)
> > +#define BTRFS_DISCARD_MIN_FILTER	(SZ_32K)
> >  
> >  /* discard flags */
> >  #define BTRFS_DISCARD_RESET_CURSOR	(1UL << 0)
> > @@ -39,6 +41,8 @@ void btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
> >  			       struct btrfs_block_group_cache *cache);
> >  void btrfs_add_to_discard_free_list(struct btrfs_discard_ctl *discard_ctl,
> >  				    struct btrfs_block_group_cache *cache);
> > +void btrfs_discard_check_filter(struct btrfs_block_group_cache *cache,
> > +				u64 bytes);
> >  void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info);
> >  
> >  void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
> > diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> > index ce33803a45b2..ed35dc090df6 100644
> > --- a/fs/btrfs/free-space-cache.c
> > +++ b/fs/btrfs/free-space-cache.c
> > @@ -2471,6 +2471,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
> >  	if (ret)
> >  		kmem_cache_free(btrfs_free_space_cachep, info);
> >  out:
> > +	btrfs_discard_check_filter(cache, bytes);
> 
> So we're only accounting the new space?  What if we merge with a larger area
> here?  We should probably make our decision based on the actual trimable area.
> 

Thanks that's a good catch. I've updated it to be info->bytes if
possible.

> >  	btrfs_discard_update_discardable(cache, ctl);
> >  	spin_unlock(&ctl->tree_lock);
> >  
> > @@ -3409,7 +3410,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
> >  				goto next;
> >  			}
> >  			unlink_free_space(ctl, entry);
> > -			if (bytes > BTRFS_DISCARD_MAX_SIZE) {
> > +			/*
> > +			 * Let bytes = BTRFS_DISCARD_MAX_SIZE + X.
> > +			 * If X < BTRFS_DISCARD_MIN_FILTER, we won't trim X when
> > +			 * we come back around.  So trim it now.
> > +			 */
> > +			if (bytes > (BTRFS_DISCARD_MAX_SIZE +
> > +				     BTRFS_DISCARD_MIN_FILTER)) {
> >  				bytes = extent_bytes = BTRFS_DISCARD_MAX_SIZE;
> >  				entry->offset += BTRFS_DISCARD_MAX_SIZE;
> >  				entry->bytes -= BTRFS_DISCARD_MAX_SIZE;
> > @@ -3510,7 +3517,7 @@ static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
> >  
> >  static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> >  			u64 *total_trimmed, u64 start, u64 end, u64 minlen,
> > -			bool async)
> > +			u64 maxlen, bool async)
> >  {
> >  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> >  	struct btrfs_free_space *entry;
> > @@ -3535,7 +3542,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> >  		}
> >  
> >  		entry = tree_search_offset(ctl, offset, 1, 0);
> > -		if (!entry || (async && start == offset &&
> > +		if (!entry || (async && minlen && start == offset &&
> >  			       btrfs_free_space_trimmed(entry))) {
> 
> Huh?  Why do we care if minlen is set if our entry is already trimmed?  If we're
> already trimmed we should just skip it even with minlen set, right?  Thanks,
> 

Yeah this definitely needs a comment. The reason for this is minlen is
used to check if we're in discard_index 0 which is the free path
discarding. In this case, because bitmaps are lossily marked trimmed,
(we skip lone pages less than BTRFS_ASYNC_DISCARD_MIN_FILTER), we need
to just go back and double check. The goal is to maintain the invariant
everything is discarded when we forget about a block group.

I think in practice, with the FORCE_EXTENT_THRESHOLD patch later on,
this shouldn't really be common case as it should have been picked out
by some coalescing.

Thanks,
Dennis

Patch
diff mbox series

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e81f699347e0..b5608f8dc41a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -439,7 +439,7 @@  struct btrfs_full_stripe_locks_tree {
 };
 
 /* discard control */
-#define BTRFS_NR_DISCARD_LISTS		2
+#define BTRFS_NR_DISCARD_LISTS		3
 
 struct btrfs_discard_ctl {
 	struct workqueue_struct *discard_workers;
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 072c73f48297..296cbffc5957 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -20,6 +20,10 @@ 
 #define BTRFS_DISCARD_MAX_DELAY		(10000UL)
 #define BTRFS_DISCARD_MAX_IOPS		(10UL)
 
+/* monotonically decreasing filters after 0 */
+static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {0,
+	BTRFS_DISCARD_MAX_FILTER, BTRFS_DISCARD_MIN_FILTER};
+
 static struct list_head *
 btrfs_get_discard_list(struct btrfs_discard_ctl *discard_ctl,
 		       struct btrfs_block_group_cache *cache)
@@ -120,7 +124,7 @@  find_next_cache(struct btrfs_discard_ctl *discard_ctl, u64 now)
 }
 
 static struct btrfs_block_group_cache *
-peek_discard_list(struct btrfs_discard_ctl *discard_ctl)
+peek_discard_list(struct btrfs_discard_ctl *discard_ctl, int *discard_index)
 {
 	struct btrfs_block_group_cache *cache;
 	u64 now = ktime_get_ns();
@@ -132,6 +136,7 @@  peek_discard_list(struct btrfs_discard_ctl *discard_ctl)
 
 	if (cache && now > cache->discard_delay) {
 		discard_ctl->cache = cache;
+		*discard_index = cache->discard_index;
 		if (cache->discard_index == 0 &&
 		    cache->free_space_ctl->free_space != cache->key.offset) {
 			__btrfs_add_to_discard_list(discard_ctl, cache);
@@ -150,6 +155,36 @@  peek_discard_list(struct btrfs_discard_ctl *discard_ctl)
 	return cache;
 }
 
+void btrfs_discard_check_filter(struct btrfs_block_group_cache *cache,
+				u64 bytes)
+{
+	struct btrfs_discard_ctl *discard_ctl;
+
+	if (!cache || !btrfs_test_opt(cache->fs_info, DISCARD_ASYNC))
+		return;
+
+	discard_ctl = &cache->fs_info->discard_ctl;
+
+	if (cache && cache->discard_index > 1 &&
+	    bytes >= BTRFS_DISCARD_MAX_FILTER) {
+		remove_from_discard_list(discard_ctl, cache);
+		cache->discard_index = 1;
+		btrfs_add_to_discard_list(discard_ctl, cache);
+	}
+}
+
+static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
+				       struct btrfs_block_group_cache *cache)
+{
+	cache->discard_index++;
+	if (cache->discard_index == BTRFS_NR_DISCARD_LISTS) {
+		cache->discard_index = 1;
+		return;
+	}
+
+	btrfs_add_to_discard_list(discard_ctl, cache);
+}
+
 void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
 			       struct btrfs_block_group_cache *cache)
 {
@@ -202,23 +237,34 @@  static void btrfs_discard_workfn(struct work_struct *work)
 {
 	struct btrfs_discard_ctl *discard_ctl;
 	struct btrfs_block_group_cache *cache;
+	int discard_index = 0;
 	u64 trimmed = 0;
+	u64 minlen = 0;
 
 	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
 
 again:
-	cache = peek_discard_list(discard_ctl);
+	cache = peek_discard_list(discard_ctl, &discard_index);
 	if (!cache || !btrfs_run_discard_work(discard_ctl))
 		return;
 
-	if (btrfs_discard_bitmaps(cache))
+	minlen = discard_minlen[discard_index];
+
+	if (btrfs_discard_bitmaps(cache)) {
+		u64 maxlen = 0;
+
+		if (discard_index)
+			maxlen = discard_minlen[discard_index - 1];
+
 		btrfs_trim_block_group_bitmaps(cache, &trimmed,
 					       cache->discard_cursor,
 					       btrfs_block_group_end(cache),
-					       0, true);
-	else
+					       minlen, maxlen, true);
+	} else {
 		btrfs_trim_block_group(cache, &trimmed, cache->discard_cursor,
-				       btrfs_block_group_end(cache), 0, true);
+				       btrfs_block_group_end(cache),
+				       minlen, true);
+	}
 
 	discard_ctl->prev_discard = trimmed;
 
@@ -231,6 +277,8 @@  static void btrfs_discard_workfn(struct work_struct *work)
 				 cache->key.offset)
 				btrfs_add_to_discard_free_list(discard_ctl,
 							       cache);
+			else
+				btrfs_update_discard_index(discard_ctl, cache);
 		} else {
 			cache->discard_cursor = cache->key.objectid;
 			cache->discard_flags |= BTRFS_DISCARD_BITMAPS;
diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
index 898dd92dbf8f..1daa8da4a1b5 100644
--- a/fs/btrfs/discard.h
+++ b/fs/btrfs/discard.h
@@ -18,6 +18,8 @@ 
 
 /* discard size limits */
 #define BTRFS_DISCARD_MAX_SIZE		(SZ_64M)
+#define BTRFS_DISCARD_MAX_FILTER	(SZ_1M)
+#define BTRFS_DISCARD_MIN_FILTER	(SZ_32K)
 
 /* discard flags */
 #define BTRFS_DISCARD_RESET_CURSOR	(1UL << 0)
@@ -39,6 +41,8 @@  void btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
 			       struct btrfs_block_group_cache *cache);
 void btrfs_add_to_discard_free_list(struct btrfs_discard_ctl *discard_ctl,
 				    struct btrfs_block_group_cache *cache);
+void btrfs_discard_check_filter(struct btrfs_block_group_cache *cache,
+				u64 bytes);
 void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info);
 
 void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ce33803a45b2..ed35dc090df6 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2471,6 +2471,7 @@  int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
 	if (ret)
 		kmem_cache_free(btrfs_free_space_cachep, info);
 out:
+	btrfs_discard_check_filter(cache, bytes);
 	btrfs_discard_update_discardable(cache, ctl);
 	spin_unlock(&ctl->tree_lock);
 
@@ -3409,7 +3410,13 @@  static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
 				goto next;
 			}
 			unlink_free_space(ctl, entry);
-			if (bytes > BTRFS_DISCARD_MAX_SIZE) {
+			/*
+			 * Let bytes = BTRFS_DISCARD_MAX_SIZE + X.
+			 * If X < BTRFS_DISCARD_MIN_FILTER, we won't trim X when
+			 * we come back around.  So trim it now.
+			 */
+			if (bytes > (BTRFS_DISCARD_MAX_SIZE +
+				     BTRFS_DISCARD_MIN_FILTER)) {
 				bytes = extent_bytes = BTRFS_DISCARD_MAX_SIZE;
 				entry->offset += BTRFS_DISCARD_MAX_SIZE;
 				entry->bytes -= BTRFS_DISCARD_MAX_SIZE;
@@ -3510,7 +3517,7 @@  static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
 
 static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
 			u64 *total_trimmed, u64 start, u64 end, u64 minlen,
-			bool async)
+			u64 maxlen, bool async)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *entry;
@@ -3535,7 +3542,7 @@  static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
 		}
 
 		entry = tree_search_offset(ctl, offset, 1, 0);
-		if (!entry || (async && start == offset &&
+		if (!entry || (async && minlen && start == offset &&
 			       btrfs_free_space_trimmed(entry))) {
 			spin_unlock(&ctl->tree_lock);
 			mutex_unlock(&ctl->cache_writeout_mutex);
@@ -3556,10 +3563,10 @@  static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
 		ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
 		if (ret2 || start >= end) {
 			/*
-			 * This keeps the invariant that all bytes are trimmed
-			 * if BTRFS_FSC_TRIMMED is set on a bitmap.
+			 * We lossily consider a bitmap trimmed if we only skip
+			 * over regions <= BTRFS_DISCARD_MIN_FILTER.
 			 */
-			if (ret2 && !minlen)
+			if (ret2 && minlen <= BTRFS_DISCARD_MIN_FILTER)
 				end_trimming_bitmap(ctl, entry);
 			else
 				entry->flags &= ~BTRFS_FSC_TRIMMING_BITMAP;
@@ -3580,14 +3587,19 @@  static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
 		}
 
 		bytes = min(bytes, end - start);
-		if (bytes < minlen) {
-			entry->flags &= ~BTRFS_FSC_TRIMMING_BITMAP;
+		if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
 			spin_unlock(&ctl->tree_lock);
 			mutex_unlock(&ctl->cache_writeout_mutex);
 			goto next;
 		}
 
-		if (async && bytes > BTRFS_DISCARD_MAX_SIZE)
+		/*
+		 * Let bytes = BTRFS_DISCARD_MAX_SIZE + X.
+		 * If X < BTRFS_DISCARD_MIN_FILTER, we won't trim X when we come
+		 * back around.  So trim it now.
+		 */
+		if (async && bytes > (BTRFS_DISCARD_MAX_SIZE +
+				      BTRFS_DISCARD_MIN_FILTER))
 			bytes = BTRFS_DISCARD_MAX_SIZE;
 
 		bitmap_clear_bits(ctl, entry, start, bytes);
@@ -3694,7 +3706,7 @@  int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 	if (ret || async)
 		goto out;
 
-	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, false);
+	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
 	/* if we ended in the middle of a bitmap, reset the trimming flag */
 	if (end % (BITS_PER_BITMAP * ctl->unit))
 		reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
@@ -3705,7 +3717,7 @@  int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 
 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group_cache *block_group,
 				   u64 *trimmed, u64 start, u64 end, u64 minlen,
-				   bool async)
+				   u64 maxlen, bool async)
 {
 	int ret;
 
@@ -3719,7 +3731,8 @@  int btrfs_trim_block_group_bitmaps(struct btrfs_block_group_cache *block_group,
 	btrfs_get_block_group_trimming(block_group);
 	spin_unlock(&block_group->lock);
 
-	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, async);
+	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
+			   async);
 
 	btrfs_put_block_group_trimming(block_group);
 	return ret;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index c5cce44b03af..90abf922f0ba 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -132,7 +132,7 @@  int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 			   bool async);
 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group_cache *block_group,
 				   u64 *trimmed, u64 start, u64 end, u64 minlen,
-				   bool async);
+				   u64 maxlen, bool async);
 
 /* Support functions for running our sanity tests */
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS