[1/2] btrfs: track odirect bytes in flight
diff mbox series

Message ID 20190410195610.84110-2-josef@toxicpanda.com
State New
Headers show
Series
  • ENOSPC refinements
Related show

Commit Message

Josef Bacik April 10, 2019, 7:56 p.m. UTC
When diagnosing a slowdown of generic/224 I noticed we were wasting a
lot of time in shrink_delalloc() despite all writes being O_DIRECT
writes.  O_DIRECT writes still have outstanding extents, but obviously
cannot be directly flushed; instead we need to wait on their
corresponding ordered extents.  Track the outstanding odirect write bytes
and if this amount is higher than the delalloc bytes in the system go
ahead and force us to wait on the ordered extents.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/ctree.h        |  1 +
 fs/btrfs/disk-io.c      | 15 ++++++++++++++-
 fs/btrfs/extent-tree.c  | 17 +++++++++++++++--
 fs/btrfs/ordered-data.c |  9 ++++++++-
 4 files changed, 38 insertions(+), 4 deletions(-)

Comments

Nikolay Borisov April 12, 2019, 10:17 a.m. UTC | #1
On 10.04.19 г. 22:56 ч., Josef Bacik wrote:
> When diagnosing a slowdown of generic/224 I noticed we were wasting a
> lot of time in shrink_delalloc() despite all writes being O_DIRECT
> writes.  O_DIRECT writes still have outstanding extents, but obviously
> cannot be directly flushed, instead we need to wait on their
> corresponding ordered extent.  Track the outstanding odirect write bytes
> and if this amount is higher than the delalloc bytes in the system go
> ahead and force us to wait on the ordered extents.

This is way too sparse. I've been running generic/224 to try and
reproduce your slowdown. So far I can confirm that this test exhibits
drastic swings in performance - I've seen it complete from 30s up to
300s. I've also been taking offcputime[0] measurements in the cases
where high completion times were observed, but so far I haven't really
seen shrink_delalloc standing out.

Provide more information on how you measured said slowdown, as well as
more information in the changelog about why it's happening. At the very
least this could be split into 2 patches:

1. Could add the percpu counter init + modification in ordered extent
routines

2. Should add the logic in shrink_delalloc. Ideally that patch will
include detailed explanation of how the problem manifests.


Slightly off topic:

What purpose do the checks of trans in shrink_delalloc serve? Does it
mean "if there is currently an open transaction don't do any ordered
wait because that's expensive" ?


[0] https://drive.google.com/open?id=1rEtMchqll6LZ0hq7uAzYkC4vY975Mw4i

> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/ctree.h        |  1 +
>  fs/btrfs/disk-io.c      | 15 ++++++++++++++-
>  fs/btrfs/extent-tree.c  | 17 +++++++++++++++--
>  fs/btrfs/ordered-data.c |  9 ++++++++-
>  4 files changed, 38 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 7e774d48c48c..e293d74b2ead 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1016,6 +1016,7 @@ struct btrfs_fs_info {
>  	/* used to keep from writing metadata until there is a nice batch */
>  	struct percpu_counter dirty_metadata_bytes;
>  	struct percpu_counter delalloc_bytes;
> +	struct percpu_counter odirect_bytes;
>  	s32 dirty_metadata_batch;
>  	s32 delalloc_batch;
>  
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 7a88de4be8d7..3f0b1854cedc 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2641,11 +2641,17 @@ int open_ctree(struct super_block *sb,
>  		goto fail;
>  	}
>  
> -	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
> +	ret = percpu_counter_init(&fs_info->odirect_bytes, 0, GFP_KERNEL);
>  	if (ret) {
>  		err = ret;
>  		goto fail_srcu;
>  	}
> +
> +	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
> +	if (ret) {
> +		err = ret;
> +		goto fail_odirect_bytes;
> +	}
>  	fs_info->dirty_metadata_batch = PAGE_SIZE *
>  					(1 + ilog2(nr_cpu_ids));
>  
> @@ -3344,6 +3350,8 @@ int open_ctree(struct super_block *sb,
>  	percpu_counter_destroy(&fs_info->delalloc_bytes);
>  fail_dirty_metadata_bytes:
>  	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
> +fail_odirect_bytes:
> +	percpu_counter_destroy(&fs_info->odirect_bytes);
>  fail_srcu:
>  	cleanup_srcu_struct(&fs_info->subvol_srcu);
>  fail:
> @@ -4025,6 +4033,10 @@ void close_ctree(struct btrfs_fs_info *fs_info)
>  		       percpu_counter_sum(&fs_info->delalloc_bytes));
>  	}
>  
> +	if (percpu_counter_sum(&fs_info->odirect_bytes))
> +		btrfs_info(fs_info, "at unmount odirect count %lld",
> +			   percpu_counter_sum(&fs_info->odirect_bytes));
> +
>  	btrfs_sysfs_remove_mounted(fs_info);
>  	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
>  
> @@ -4056,6 +4068,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
>  
>  	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
>  	percpu_counter_destroy(&fs_info->delalloc_bytes);
> +	percpu_counter_destroy(&fs_info->odirect_bytes);
>  	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
>  	cleanup_srcu_struct(&fs_info->subvol_srcu);
>  
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index d0626f945de2..0982456ebabb 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4727,6 +4727,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
>  	struct btrfs_space_info *space_info;
>  	struct btrfs_trans_handle *trans;
>  	u64 delalloc_bytes;
> +	u64 odirect_bytes;
>  	u64 async_pages;
>  	u64 items;
>  	long time_left;
> @@ -4742,7 +4743,9 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
>  
>  	delalloc_bytes = percpu_counter_sum_positive(
>  						&fs_info->delalloc_bytes);
> -	if (delalloc_bytes == 0) {
> +	odirect_bytes = percpu_counter_sum_positive(
> +						&fs_info->odirect_bytes);
> +	if (delalloc_bytes == 0 && odirect_bytes == 0) {
>  		if (trans)
>  			return;
>  		if (wait_ordered)
> @@ -4750,8 +4753,16 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
>  		return;
>  	}
>  
> +	/*
> +	 * If we are doing more ordered than delalloc we need to just wait on
> +	 * ordered extents, otherwise we'll waste time trying to flush delalloc
> +	 * that likely won't give us the space back we need.
> +	 */
> +	if (odirect_bytes > delalloc_bytes)
> +		wait_ordered = true;
> +
>  	loops = 0;
> -	while (delalloc_bytes && loops < 3) {
> +	while ((delalloc_bytes || odirect_bytes)  && loops < 3) {
>  		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
>  
>  		/*
> @@ -4801,6 +4812,8 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
>  		}
>  		delalloc_bytes = percpu_counter_sum_positive(
>  						&fs_info->delalloc_bytes);
> +		odirect_bytes = percpu_counter_sum_positive(
> +						&fs_info->odirect_bytes);
>  	}
>  }
>  
> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
> index 6fde2b2741ef..967c62b85d77 100644
> --- a/fs/btrfs/ordered-data.c
> +++ b/fs/btrfs/ordered-data.c
> @@ -194,8 +194,11 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
>  	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
>  		set_bit(type, &entry->flags);
>  
> -	if (dio)
> +	if (dio) {
> +		percpu_counter_add_batch(&fs_info->odirect_bytes, len,
> +					 fs_info->delalloc_batch);
>  		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
> +	}
>  
>  	/* one ref for the tree */
>  	refcount_set(&entry->refs, 1);
> @@ -468,6 +471,10 @@ void btrfs_remove_ordered_extent(struct inode *inode,
>  	if (root != fs_info->tree_root)
>  		btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false);
>  
> +	if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
> +		percpu_counter_add_batch(&fs_info->odirect_bytes, -entry->len,
> +					 fs_info->delalloc_batch);
> +
>  	tree = &btrfs_inode->ordered_tree;
>  	spin_lock_irq(&tree->lock);
>  	node = &entry->rb_node;
>
Josef Bacik April 12, 2019, 1:30 p.m. UTC | #2
On Fri, Apr 12, 2019 at 01:17:40PM +0300, Nikolay Borisov wrote:
> 
> 
> On 10.04.19 г. 22:56 ч., Josef Bacik wrote:
> > When diagnosing a slowdown of generic/224 I noticed we were wasting a
> > lot of time in shrink_delalloc() despite all writes being O_DIRECT
> > writes.  O_DIRECT writes still have outstanding extents, but obviously
> > cannot be directly flushed, instead we need to wait on their
> > corresponding ordered extent.  Track the outstanding odirect write bytes
> > and if this amount is higher than the delalloc bytes in the system go
> > ahead and force us to wait on the ordered extents.
> 
> This is way too sparse. I've been running generic/224 to try and
> reproduce your slowdown. So far I can confirm that this test exhibits
> drastic swings in performance - I've seen it complete from 30s up to
> 300s. I've also been taking an offcputime[0] measurements in the case
> where high completion times were observed but so far I haven't really
> seen shrink_delalloc standing out.
> 

It's not; I shouldn't have said "wasting time", I should have said most calls to
shrink_delalloc() did nothing.  The rest is self-explanatory.  shrink_delalloc()
keys off of fs_info->delalloc_bytes, which is 0 if you have nothing but
O_DIRECT.  Thus we need a mechanism for seeing that there's O_DIRECT in flight
and we would benefit from waiting on ordered extents.  The slowdown is addressed
in patch 2, this patch just makes it so shrink_delalloc() actually does
something if we are O_DIRECT.  Thanks,

Josef
David Sterba April 24, 2019, 5:26 p.m. UTC | #3
On Fri, Apr 12, 2019 at 01:17:40PM +0300, Nikolay Borisov wrote:
> 
> 
> On 10.04.19 г. 22:56 ч., Josef Bacik wrote:
> > When diagnosing a slowdown of generic/224 I noticed we were wasting a
> > lot of time in shrink_delalloc() despite all writes being O_DIRECT
> > writes.  O_DIRECT writes still have outstanding extents, but obviously
> > cannot be directly flushed, instead we need to wait on their
> > corresponding ordered extent.  Track the outstanding odirect write bytes
> > and if this amount is higher than the delalloc bytes in the system go
> > ahead and force us to wait on the ordered extents.
> 
> This is way too sparse. I've been running generic/224 to try and
> reproduce your slowdown. So far I can confirm that this test exhibits
> drastic swings in performance - I've seen it complete from 30s up to
> 300s. I've also been taking an offcputime[0] measurements in the case
> where high completion times were observed but so far I haven't really
> seen shrink_delalloc standing out.
> 
> Provide more information how you measured the said slowdown as well as
> more information in the changelog about why it's happening. At the very
> least this could be split into 2 patches:
> 
> 1. Could add the percpu counter init + modification in ordered extent
> routines
> 
> 2. Should add the logic in shrink_delalloc. Ideally that patch will
> include detailed explanation of how the problem manifests.

I don't think splitting the init code is required here, I did not find
it distracting while reading the patch.

The 'ideally' part of your comment is something that we should not
have to ask for. A missing or vague explanation for a change like this is
bad practice. Josef, please update the changelog and resend, thanks.

Patch
diff mbox series

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7e774d48c48c..e293d74b2ead 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1016,6 +1016,7 @@  struct btrfs_fs_info {
 	/* used to keep from writing metadata until there is a nice batch */
 	struct percpu_counter dirty_metadata_bytes;
 	struct percpu_counter delalloc_bytes;
+	struct percpu_counter odirect_bytes;
 	s32 dirty_metadata_batch;
 	s32 delalloc_batch;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7a88de4be8d7..3f0b1854cedc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2641,11 +2641,17 @@  int open_ctree(struct super_block *sb,
 		goto fail;
 	}
 
-	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
+	ret = percpu_counter_init(&fs_info->odirect_bytes, 0, GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_srcu;
 	}
+
+	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
+	if (ret) {
+		err = ret;
+		goto fail_odirect_bytes;
+	}
 	fs_info->dirty_metadata_batch = PAGE_SIZE *
 					(1 + ilog2(nr_cpu_ids));
 
@@ -3344,6 +3350,8 @@  int open_ctree(struct super_block *sb,
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
 fail_dirty_metadata_bytes:
 	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+fail_odirect_bytes:
+	percpu_counter_destroy(&fs_info->odirect_bytes);
 fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
@@ -4025,6 +4033,10 @@  void close_ctree(struct btrfs_fs_info *fs_info)
 		       percpu_counter_sum(&fs_info->delalloc_bytes));
 	}
 
+	if (percpu_counter_sum(&fs_info->odirect_bytes))
+		btrfs_info(fs_info, "at unmount odirect count %lld",
+			   percpu_counter_sum(&fs_info->odirect_bytes));
+
 	btrfs_sysfs_remove_mounted(fs_info);
 	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
 
@@ -4056,6 +4068,7 @@  void close_ctree(struct btrfs_fs_info *fs_info)
 
 	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
+	percpu_counter_destroy(&fs_info->odirect_bytes);
 	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d0626f945de2..0982456ebabb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4727,6 +4727,7 @@  static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 	struct btrfs_space_info *space_info;
 	struct btrfs_trans_handle *trans;
 	u64 delalloc_bytes;
+	u64 odirect_bytes;
 	u64 async_pages;
 	u64 items;
 	long time_left;
@@ -4742,7 +4743,9 @@  static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 
 	delalloc_bytes = percpu_counter_sum_positive(
 						&fs_info->delalloc_bytes);
-	if (delalloc_bytes == 0) {
+	odirect_bytes = percpu_counter_sum_positive(
+						&fs_info->odirect_bytes);
+	if (delalloc_bytes == 0 && odirect_bytes == 0) {
 		if (trans)
 			return;
 		if (wait_ordered)
@@ -4750,8 +4753,16 @@  static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 		return;
 	}
 
+	/*
+	 * If we are doing more ordered than delalloc we need to just wait on
+	 * ordered extents, otherwise we'll waste time trying to flush delalloc
+	 * that likely won't give us the space back we need.
+	 */
+	if (odirect_bytes > delalloc_bytes)
+		wait_ordered = true;
+
 	loops = 0;
-	while (delalloc_bytes && loops < 3) {
+	while ((delalloc_bytes || odirect_bytes)  && loops < 3) {
 		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
 
 		/*
@@ -4801,6 +4812,8 @@  static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 		}
 		delalloc_bytes = percpu_counter_sum_positive(
 						&fs_info->delalloc_bytes);
+		odirect_bytes = percpu_counter_sum_positive(
+						&fs_info->odirect_bytes);
 	}
 }
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6fde2b2741ef..967c62b85d77 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -194,8 +194,11 @@  static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
 		set_bit(type, &entry->flags);
 
-	if (dio)
+	if (dio) {
+		percpu_counter_add_batch(&fs_info->odirect_bytes, len,
+					 fs_info->delalloc_batch);
 		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
+	}
 
 	/* one ref for the tree */
 	refcount_set(&entry->refs, 1);
@@ -468,6 +471,10 @@  void btrfs_remove_ordered_extent(struct inode *inode,
 	if (root != fs_info->tree_root)
 		btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false);
 
+	if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
+		percpu_counter_add_batch(&fs_info->odirect_bytes, -entry->len,
+					 fs_info->delalloc_batch);
+
 	tree = &btrfs_inode->ordered_tree;
 	spin_lock_irq(&tree->lock);
 	node = &entry->rb_node;