diff mbox series

[v5,2/8] btrfs: only let one thread pre-flush delayed refs in commit

Message ID 9e47b11bdfe5b4905fdaa81e952de2e2466c6335.1608319304.git.josef@toxicpanda.com (mailing list archive)
State New, archived
Headers show
Series A variety of lock contention fixes | expand

Commit Message

Josef Bacik Dec. 18, 2020, 7:24 p.m. UTC
I've been running a stress test that runs 20 workers in their own
subvolume, which are running an fsstress instance with 4 threads per
worker, which is 80 total fsstress threads.  In addition to this I'm
running balance in the background as well as creating and deleting
snapshots.  This test takes around 12 hours to run normally, going
slower and slower as the test goes on.

The reason for this is because fsstress is running fsync sometimes, and
because we're messing with block groups we often fall through to
btrfs_commit_transaction, so will often have 20-30 threads all calling
btrfs_commit_transaction at the same time.

These all get stuck contending on the extent tree while they try to run
delayed refs during the initial part of the commit.

This is suboptimal, really because the extent tree is a single point of
failure we only want one thread acting on that tree at once to reduce
lock contention.  Fix this by making the flushing mechanism a bit
operation, to make it easy to use test_and_set_bit() in order to make
sure only one task does this initial flush.

Once we're into the transaction commit we only have one thread doing
delayed ref running, it's just this initial pre-flush that is
problematic.  With this patch my stress test takes around 90 minutes to
run, instead of 12 hours.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/delayed-ref.h | 12 ++++++------
 fs/btrfs/transaction.c | 33 ++++++++++++++++-----------------
 2 files changed, 22 insertions(+), 23 deletions(-)

Comments

David Sterba Jan. 8, 2021, 4:01 p.m. UTC | #1
On Fri, Dec 18, 2020 at 02:24:20PM -0500, Josef Bacik wrote:
> I've been running a stress test that runs 20 workers in their own
> subvolume, which are running an fsstress instance with 4 threads per
> worker, which is 80 total fsstress threads.  In addition to this I'm
> running balance in the background as well as creating and deleting
> snapshots.  This test takes around 12 hours to run normally, going
> slower and slower as the test goes on.
> 
> The reason for this is because fsstress is running fsync sometimes, and
> because we're messing with block groups we often fall through to
> btrfs_commit_transaction, so will often have 20-30 threads all calling
> btrfs_commit_transaction at the same time.
> 
> These all get stuck contending on the extent tree while they try to run
> delayed refs during the initial part of the commit.
> 
> This is suboptimal, really because the extent tree is a single point of
> failure we only want one thread acting on that tree at once to reduce
> lock contention.  Fix this by making the flushing mechanism a bit
> operation, to make it easy to use test_and_set_bit() in order to make
> sure only one task does this initial flush.
> 
> Once we're into the transaction commit we only have one thread doing
> delayed ref running, it's just this initial pre-flush that is
> problematic.  With this patch my stress test takes around 90 minutes to
> run, instead of 12 hours.
> 
> Reviewed-by: Nikolay Borisov <nborisov@suse.com>
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/delayed-ref.h | 12 ++++++------
>  fs/btrfs/transaction.c | 33 ++++++++++++++++-----------------
>  2 files changed, 22 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
> index 1c977e6d45dc..6e414785b56f 100644
> --- a/fs/btrfs/delayed-ref.h
> +++ b/fs/btrfs/delayed-ref.h
> @@ -135,6 +135,11 @@ struct btrfs_delayed_data_ref {
>  	u64 offset;
>  };
>  
> +enum btrfs_delayed_ref_flags {
> +	/* Used to indicate that we are flushing delayed refs for the commit. */
> +	BTRFS_DELAYED_REFS_FLUSHING,
> +};
> +
>  struct btrfs_delayed_ref_root {
>  	/* head ref rbtree */
>  	struct rb_root_cached href_root;
> @@ -158,12 +163,7 @@ struct btrfs_delayed_ref_root {
>  
>  	u64 pending_csums;
>  
> -	/*
> -	 * set when the tree is flushing before a transaction commit,
> -	 * used by the throttling code to decide if new updates need
> -	 * to be run right away
> -	 */
> -	int flushing;
> +	unsigned long flags;
>  
>  	u64 run_delayed_start;
>  
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index f51f9e39bcee..ccd37fbe5db1 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -909,9 +909,9 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
>  {
>  	struct btrfs_transaction *cur_trans = trans->transaction;
>  
> -	smp_mb();
>  	if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
> -	    cur_trans->delayed_refs.flushing)
> +	    test_bit(BTRFS_DELAYED_REFS_FLUSHING,
> +		     &cur_trans->delayed_refs.flags))
>  		return true;
>  
>  	return should_end_transaction(trans);
> @@ -2043,23 +2043,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
>  	btrfs_trans_release_metadata(trans);
>  	trans->block_rsv = NULL;
>  
> -	/* make a pass through all the delayed refs we have so far
> -	 * any runnings procs may add more while we are here
> -	 */
> -	ret = btrfs_run_delayed_refs(trans, 0);
> -	if (ret) {
> -		btrfs_end_transaction(trans);
> -		return ret;
> -	}
> -
> -	cur_trans = trans->transaction;
> -
>  	/*
> -	 * set the flushing flag so procs in this transaction have to
> -	 * start sending their work down.
> +	 * We only want one transaction commit doing the flushing so we do not
> +	 * waste a bunch of time on lock contention on the extent root node.
>  	 */
> -	cur_trans->delayed_refs.flushing = 1;
> -	smp_wmb();

This barrier obviously separates the flushing = 1 and the rest of the
code, now implemented as test_and_set_bit, which implies full barrier.

However, hunk in btrfs_should_end_transaction removes the barrier and
I'm not sure whether this is correct:

-	smp_mb();
 	if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
-	    cur_trans->delayed_refs.flushing)
+	    test_bit(BTRFS_DELAYED_REFS_FLUSHING,
+		     &cur_trans->delayed_refs.flags))
 		return true;

This is never called under locks so we don't have complete
synchronization of neither the transaction state nor the flushing bit.
btrfs_should_end_transaction is merely a hint and not called in critical
places so we could probably afford to keep it without a barrier, or keep
it with comment(s).
Nikolay Borisov Jan. 11, 2021, 8:33 a.m. UTC | #2
On 8.01.21 г. 18:01 ч., David Sterba wrote:
> On Fri, Dec 18, 2020 at 02:24:20PM -0500, Josef Bacik wrote:
>> I've been running a stress test that runs 20 workers in their own
>> subvolume, which are running an fsstress instance with 4 threads per
>> worker, which is 80 total fsstress threads.  In addition to this I'm
>> running balance in the background as well as creating and deleting
>> snapshots.  This test takes around 12 hours to run normally, going
>> slower and slower as the test goes on.
>>
>> The reason for this is because fsstress is running fsync sometimes, and
>> because we're messing with block groups we often fall through to
>> btrfs_commit_transaction, so will often have 20-30 threads all calling
>> btrfs_commit_transaction at the same time.
>>
>> These all get stuck contending on the extent tree while they try to run
>> delayed refs during the initial part of the commit.
>>
>> This is suboptimal, really because the extent tree is a single point of
>> failure we only want one thread acting on that tree at once to reduce
>> lock contention.  Fix this by making the flushing mechanism a bit
>> operation, to make it easy to use test_and_set_bit() in order to make
>> sure only one task does this initial flush.
>>
>> Once we're into the transaction commit we only have one thread doing
>> delayed ref running, it's just this initial pre-flush that is
>> problematic.  With this patch my stress test takes around 90 minutes to
>> run, instead of 12 hours.
>>
>> Reviewed-by: Nikolay Borisov <nborisov@suse.com>
>> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
>> ---
>>  fs/btrfs/delayed-ref.h | 12 ++++++------
>>  fs/btrfs/transaction.c | 33 ++++++++++++++++-----------------
>>  2 files changed, 22 insertions(+), 23 deletions(-)
>>
>> diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
>> index 1c977e6d45dc..6e414785b56f 100644
>> --- a/fs/btrfs/delayed-ref.h
>> +++ b/fs/btrfs/delayed-ref.h
>> @@ -135,6 +135,11 @@ struct btrfs_delayed_data_ref {
>>  	u64 offset;
>>  };
>>  
>> +enum btrfs_delayed_ref_flags {
>> +	/* Used to indicate that we are flushing delayed refs for the commit. */
>> +	BTRFS_DELAYED_REFS_FLUSHING,
>> +};
>> +
>>  struct btrfs_delayed_ref_root {
>>  	/* head ref rbtree */
>>  	struct rb_root_cached href_root;
>> @@ -158,12 +163,7 @@ struct btrfs_delayed_ref_root {
>>  
>>  	u64 pending_csums;
>>  
>> -	/*
>> -	 * set when the tree is flushing before a transaction commit,
>> -	 * used by the throttling code to decide if new updates need
>> -	 * to be run right away
>> -	 */
>> -	int flushing;
>> +	unsigned long flags;
>>  
>>  	u64 run_delayed_start;
>>  
>> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
>> index f51f9e39bcee..ccd37fbe5db1 100644
>> --- a/fs/btrfs/transaction.c
>> +++ b/fs/btrfs/transaction.c
>> @@ -909,9 +909,9 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
>>  {
>>  	struct btrfs_transaction *cur_trans = trans->transaction;
>>  
>> -	smp_mb();
>>  	if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
>> -	    cur_trans->delayed_refs.flushing)
>> +	    test_bit(BTRFS_DELAYED_REFS_FLUSHING,
>> +		     &cur_trans->delayed_refs.flags))
>>  		return true;
>>  
>>  	return should_end_transaction(trans);
>> @@ -2043,23 +2043,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
>>  	btrfs_trans_release_metadata(trans);
>>  	trans->block_rsv = NULL;
>>  
>> -	/* make a pass through all the delayed refs we have so far
>> -	 * any runnings procs may add more while we are here
>> -	 */
>> -	ret = btrfs_run_delayed_refs(trans, 0);
>> -	if (ret) {
>> -		btrfs_end_transaction(trans);
>> -		return ret;
>> -	}
>> -
>> -	cur_trans = trans->transaction;
>> -
>>  	/*
>> -	 * set the flushing flag so procs in this transaction have to
>> -	 * start sending their work down.
>> +	 * We only want one transaction commit doing the flushing so we do not
>> +	 * waste a bunch of time on lock contention on the extent root node.
>>  	 */
>> -	cur_trans->delayed_refs.flushing = 1;
>> -	smp_wmb();
> 
> This barrier obviously separates the flushing = 1 and the rest of the
> code, now implemented as test_and_set_bit, which implies full barrier.
> 
> However, hunk in btrfs_should_end_transaction removes the barrier and
> I'm not sure whether this is correct:
> 
> -	smp_mb();
>  	if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
> -	    cur_trans->delayed_refs.flushing)
> +	    test_bit(BTRFS_DELAYED_REFS_FLUSHING,
> +		     &cur_trans->delayed_refs.flags))
>  		return true;
> 
> This is never called under locks so we don't have complete
> synchronization of neither the transaction state nor the flushing bit.
> btrfs_should_end_transaction is merely a hint and not called in critical
> places so we could probably afford to keep it without a barrier, or keep
> it with comment(s).

I think the point is moot in this case, because the test_bit either sees
the flag or it doesn't. It's not possible for the flag to be set AND
should_end_transaction return false that would be gross violation of
program correctness.

>
David Sterba Jan. 11, 2021, 9:50 p.m. UTC | #3
On Mon, Jan 11, 2021 at 10:33:42AM +0200, Nikolay Borisov wrote:
> On 8.01.21 г. 18:01 ч., David Sterba wrote:
> > On Fri, Dec 18, 2020 at 02:24:20PM -0500, Josef Bacik wrote:
> >> @@ -2043,23 +2043,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
> >>  	btrfs_trans_release_metadata(trans);
> >>  	trans->block_rsv = NULL;
> >>  
> >> -	/* make a pass through all the delayed refs we have so far
> >> -	 * any runnings procs may add more while we are here
> >> -	 */
> >> -	ret = btrfs_run_delayed_refs(trans, 0);
> >> -	if (ret) {
> >> -		btrfs_end_transaction(trans);
> >> -		return ret;
> >> -	}
> >> -
> >> -	cur_trans = trans->transaction;
> >> -
> >>  	/*
> >> -	 * set the flushing flag so procs in this transaction have to
> >> -	 * start sending their work down.
> >> +	 * We only want one transaction commit doing the flushing so we do not
> >> +	 * waste a bunch of time on lock contention on the extent root node.
> >>  	 */
> >> -	cur_trans->delayed_refs.flushing = 1;
> >> -	smp_wmb();
> > 
> > This barrier obviously separates the flushing = 1 and the rest of the
> > code, now implemented as test_and_set_bit, which implies full barrier.
> > 
> > However, hunk in btrfs_should_end_transaction removes the barrier and
> > I'm not sure whether this is correct:
> > 
> > -	smp_mb();
> >  	if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
> > -	    cur_trans->delayed_refs.flushing)
> > +	    test_bit(BTRFS_DELAYED_REFS_FLUSHING,
> > +		     &cur_trans->delayed_refs.flags))
> >  		return true;
> > 
> > This is never called under locks so we don't have complete
> > synchronization of neither the transaction state nor the flushing bit.
> > btrfs_should_end_transaction is merely a hint and not called in critical
> > places so we could probably afford to keep it without a barrier, or keep
> > it with comment(s).
> 
> I think the point is moot in this case, because the test_bit either sees
> the flag or it doesn't. It's not possible for the flag to be set AND
> should_end_transaction return false that would be gross violation of
> program correctness.

So that's for the flushing part, but what about cur_trans->state?
Nikolay Borisov Jan. 12, 2021, 9:17 a.m. UTC | #4
On 11.01.21 г. 23:50 ч., David Sterba wrote:
> On Mon, Jan 11, 2021 at 10:33:42AM +0200, Nikolay Borisov wrote:
>> On 8.01.21 г. 18:01 ч., David Sterba wrote:
>>> On Fri, Dec 18, 2020 at 02:24:20PM -0500, Josef Bacik wrote:
>>>> @@ -2043,23 +2043,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
>>>>  	btrfs_trans_release_metadata(trans);
>>>>  	trans->block_rsv = NULL;
>>>>  
>>>> -	/* make a pass through all the delayed refs we have so far
>>>> -	 * any runnings procs may add more while we are here
>>>> -	 */
>>>> -	ret = btrfs_run_delayed_refs(trans, 0);
>>>> -	if (ret) {
>>>> -		btrfs_end_transaction(trans);
>>>> -		return ret;
>>>> -	}
>>>> -
>>>> -	cur_trans = trans->transaction;
>>>> -
>>>>  	/*
>>>> -	 * set the flushing flag so procs in this transaction have to
>>>> -	 * start sending their work down.
>>>> +	 * We only want one transaction commit doing the flushing so we do not
>>>> +	 * waste a bunch of time on lock contention on the extent root node.
>>>>  	 */
>>>> -	cur_trans->delayed_refs.flushing = 1;
>>>> -	smp_wmb();
>>>
>>> This barrier obviously separates the flushing = 1 and the rest of the
>>> code, now implemented as test_and_set_bit, which implies full barrier.
>>>
>>> However, hunk in btrfs_should_end_transaction removes the barrier and
>>> I'm not sure whether this is correct:
>>>
>>> -	smp_mb();
>>>  	if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
>>> -	    cur_trans->delayed_refs.flushing)
>>> +	    test_bit(BTRFS_DELAYED_REFS_FLUSHING,
>>> +		     &cur_trans->delayed_refs.flags))
>>>  		return true;
>>>
>>> This is never called under locks so we don't have complete
>>> synchronization of neither the transaction state nor the flushing bit.
>>> btrfs_should_end_transaction is merely a hint and not called in critical
>>> places so we could probably afford to keep it without a barrier, or keep
>>> it with comment(s).
>>
>> I think the point is moot in this case, because the test_bit either sees
>> the flag or it doesn't. It's not possible for the flag to be set AND
>> should_end_transaction return false that would be gross violation of
>> program correctness.
> 
> So that's for the flushing part, but what about cur_trans->state?

Looking at the code, the barrier was there to order the publishing of
the delayed_ref.flushing (now replaced by the bit flag) against
surrounding code.

So independently of this patch, let's reason about trans state. In
should_end_transaction it's read without holding any locks. (U)

It's modified in btrfs_cleanup_transaction without holding the
fs_info->trans_lock (U), but the STATE_ERROR flag is going to be set.

set in cleanup_transaction under fs_info->trans_lock (L)
set in btrfs_commit_trans to COMMIT_START under fs_info->trans_lock.(L)
set in btrfs_commit_trans to COMMIT_DOING under fs_info->trans_lock.(L)
set in btrfs_commit_trans to COMMIT_UNBLOCK under fs_info->trans_lock.(L)

set in btrfs_commit_trans to COMMIT_COMPLETED without locks but at this
point the transaction is finished and fs_info->running_trans is NULL (U
but irrelevant).

So by the looks of it we can have a concurrent READ race with a Write,
due to reads not taking a lock. In this case what we want to ensure is
we either see new or old state. I consulted with Will Deacon and he said
that in such a case we'd want to annotate the accesses to ->state with
(READ|WRITE)_ONCE so as to avoid a theoretical tear, in this case I
don't think this could happen but I imagine at some point kcsan would
flag such an access as racy (which it is).

>
David Sterba Jan. 26, 2021, 5:36 p.m. UTC | #5
On Tue, Jan 12, 2021 at 11:17:45AM +0200, Nikolay Borisov wrote:
> 
> 
> On 11.01.21 г. 23:50 ч., David Sterba wrote:
> > On Mon, Jan 11, 2021 at 10:33:42AM +0200, Nikolay Borisov wrote:
> >> On 8.01.21 г. 18:01 ч., David Sterba wrote:
> >>> On Fri, Dec 18, 2020 at 02:24:20PM -0500, Josef Bacik wrote:
> >>>> @@ -2043,23 +2043,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
> >>>>  	btrfs_trans_release_metadata(trans);
> >>>>  	trans->block_rsv = NULL;
> >>>>  
> >>>> -	/* make a pass through all the delayed refs we have so far
> >>>> -	 * any runnings procs may add more while we are here
> >>>> -	 */
> >>>> -	ret = btrfs_run_delayed_refs(trans, 0);
> >>>> -	if (ret) {
> >>>> -		btrfs_end_transaction(trans);
> >>>> -		return ret;
> >>>> -	}
> >>>> -
> >>>> -	cur_trans = trans->transaction;
> >>>> -
> >>>>  	/*
> >>>> -	 * set the flushing flag so procs in this transaction have to
> >>>> -	 * start sending their work down.
> >>>> +	 * We only want one transaction commit doing the flushing so we do not
> >>>> +	 * waste a bunch of time on lock contention on the extent root node.
> >>>>  	 */
> >>>> -	cur_trans->delayed_refs.flushing = 1;
> >>>> -	smp_wmb();
> >>>
> >>> This barrier obviously separates the flushing = 1 and the rest of the
> >>> code, now implemented as test_and_set_bit, which implies full barrier.
> >>>
> >>> However, hunk in btrfs_should_end_transaction removes the barrier and
> >>> I'm not sure whether this is correct:
> >>>
> >>> -	smp_mb();
> >>>  	if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
> >>> -	    cur_trans->delayed_refs.flushing)
> >>> +	    test_bit(BTRFS_DELAYED_REFS_FLUSHING,
> >>> +		     &cur_trans->delayed_refs.flags))
> >>>  		return true;
> >>>
> >>> This is never called under locks so we don't have complete
> >>> synchronization of neither the transaction state nor the flushing bit.
> >>> btrfs_should_end_transaction is merely a hint and not called in critical
> >>> places so we could probably afford to keep it without a barrier, or keep
> >>> it with comment(s).
> >>
> >> I think the point is moot in this case, because the test_bit either sees
> >> the flag or it doesn't. It's not possible for the flag to be set AND
> >> should_end_transaction return false that would be gross violation of
> >> program correctness.
> > 
> > So that's for the flushing part, but what about cur_trans->state?
> 
> Looking at the code, the barrier was there to order the publishing of
> the delayed_ref.flushing (now replaced by the bit flag) against
> surrounding code.
> 
> So independently of this patch, let's reason about trans state. In
> should_end_transaction it's read without holding any locks. (U)
> 
> It's modified in btrfs_cleanup_transaction without holding the
> fs_info->trans_lock (U), but the STATE_ERROR flag is going to be set.
> 
> set in cleanup_transaction under fs_info->trans_lock (L)
> set in btrfs_commit_trans to COMMIT_START under fs_info->trans_lock.(L)
> set in btrfs_commit_trans to COMMIT_DOING under fs_info->trans_lock.(L)
> set in btrfs_commit_trans to COMMIT_UNBLOCK under fs_info->trans_lock.(L)
> 
> set in btrfs_commit_trans to COMMIT_COMPLETED without locks but at this
> point the transaction is finished and fs_info->running_trans is NULL (U
> but irrelevant).
> 
> So by the looks of it we can have a concurrent READ race with a Write,
> due to reads not taking a lock. In this case what we want to ensure is
> we either see new or old state. I consulted with Will Deacon and he said
> that in such a case we'd want to annotate the accesses to ->state with
> (READ|WRITE)_ONCE so as to avoid a theoretical tear, in this case I
> don't think this could happen but I imagine at some point kcsan would
> flag such an access as racy (which it is).

Thanks for the analysis, I've copied it to the changelog as there's
probably no shorter way to explain it.
diff mbox series

Patch

diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 1c977e6d45dc..6e414785b56f 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -135,6 +135,11 @@  struct btrfs_delayed_data_ref {
 	u64 offset;
 };
 
+enum btrfs_delayed_ref_flags {
+	/* Used to indicate that we are flushing delayed refs for the commit. */
+	BTRFS_DELAYED_REFS_FLUSHING,
+};
+
 struct btrfs_delayed_ref_root {
 	/* head ref rbtree */
 	struct rb_root_cached href_root;
@@ -158,12 +163,7 @@  struct btrfs_delayed_ref_root {
 
 	u64 pending_csums;
 
-	/*
-	 * set when the tree is flushing before a transaction commit,
-	 * used by the throttling code to decide if new updates need
-	 * to be run right away
-	 */
-	int flushing;
+	unsigned long flags;
 
 	u64 run_delayed_start;
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f51f9e39bcee..ccd37fbe5db1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -909,9 +909,9 @@  bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
 
-	smp_mb();
 	if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
-	    cur_trans->delayed_refs.flushing)
+	    test_bit(BTRFS_DELAYED_REFS_FLUSHING,
+		     &cur_trans->delayed_refs.flags))
 		return true;
 
 	return should_end_transaction(trans);
@@ -2043,23 +2043,22 @@  int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	btrfs_trans_release_metadata(trans);
 	trans->block_rsv = NULL;
 
-	/* make a pass through all the delayed refs we have so far
-	 * any runnings procs may add more while we are here
-	 */
-	ret = btrfs_run_delayed_refs(trans, 0);
-	if (ret) {
-		btrfs_end_transaction(trans);
-		return ret;
-	}
-
-	cur_trans = trans->transaction;
-
 	/*
-	 * set the flushing flag so procs in this transaction have to
-	 * start sending their work down.
+	 * We only want one transaction commit doing the flushing so we do not
+	 * waste a bunch of time on lock contention on the extent root node.
 	 */
-	cur_trans->delayed_refs.flushing = 1;
-	smp_wmb();
+	if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING,
+			      &cur_trans->delayed_refs.flags)) {
+		/*
+		 * make a pass through all the delayed refs we have so far
+		 * any runnings procs may add more while we are here
+		 */
+		ret = btrfs_run_delayed_refs(trans, 0);
+		if (ret) {
+			btrfs_end_transaction(trans);
+			return ret;
+		}
+	}
 
 	btrfs_create_pending_block_groups(trans);