[2/2] xfs: convert m_active_trans counter to per-cpu

Message ID 20200512025949.1807131-3-david@fromorbit.com (mailing list archive)
State Superseded
Series xfs: fix a couple of performance issues

Commit Message

Dave Chinner May 12, 2020, 2:59 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

It's a global atomic counter, and we are hitting it at a rate of
half a million transactions a second, so it's bouncing the counter
cacheline all over the place on large machines. Convert it to a
per-cpu counter.

And .... oh wow, that was unexpected!

Concurrent create, 50 million inodes, identical 16p/16GB virtual
machines on different physical hosts. Machine A has twice the CPU
cores per socket of machine B:

		unpatched	patched
machine A:	3m45s		2m27s
machine B:	4m13s		4m14s

Create rates:
		unpatched	patched
machine A:	246k+/-15k	384k+/-10k
machine B:	225k+/-13k	223k+/-11k

Concurrent rm of same 50 million inodes:

		unpatched	patched
machine A:	8m30s		3m09s
machine B:	4m02s		4m51s

The transaction rate on the fast machine went from about 250k/sec to
over 600k/sec, which indicates just how much of a bottleneck this
atomic counter was.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_mount.h |  2 +-
 fs/xfs/xfs_super.c | 12 +++++++++---
 fs/xfs/xfs_trans.c |  6 +++---
 3 files changed, 13 insertions(+), 7 deletions(-)
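
For readers unfamiliar with the API the patch switches to, here is a
minimal sketch of the percpu_counter pattern (the foo_* names are
illustrative; the percpu_counter_* calls are the real
<linux/percpu_counter.h> interface the diff below uses):

/*
 * Each CPU updates its own counter, so the hot paths never share a
 * cacheline; exact reads pay for a sum across all CPUs.
 */
#include <linux/percpu_counter.h>
#include <linux/gfp.h>

struct foo {
	struct percpu_counter	active;	/* replaces a global atomic_t */
};

static int foo_init(struct foo *f)
{
	/* allocates per-cpu storage, so it can fail unlike atomic_set() */
	return percpu_counter_init(&f->active, 0, GFP_KERNEL);
}

static void foo_begin(struct foo *f)
{
	percpu_counter_inc(&f->active);	/* touches only this CPU's line */
}

static void foo_end(struct foo *f)
{
	percpu_counter_dec(&f->active);
}

static bool foo_idle(struct foo *f)
{
	/* slow path: an exact count requires summing every CPU */
	return percpu_counter_sum(&f->active) == 0;
}

static void foo_exit(struct foo *f)
{
	percpu_counter_destroy(&f->active);
}

That trade-off is visible in the diff: the increments and decrements in
xfs_trans.c become CPU-local, while xfs_quiesce_attr() pays for exact
reads via percpu_counter_sum().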

Comments

Christoph Hellwig May 12, 2020, 8:16 a.m. UTC | #1
On Tue, May 12, 2020 at 12:59:49PM +1000, Dave Chinner wrote:
> Concurrent rm of same 50 million inodes:
> 
> 		unpatched	patched
> machine A:	8m30s		3m09s
> machine B:	4m02s		4m51s

This actually is a significant slowdown on machine B, which
might be the more common one.  Can you find out how that happens
and if we can mitigate it?
Dave Chinner May 12, 2020, 9:06 a.m. UTC | #2
On Tue, May 12, 2020 at 01:16:08AM -0700, Christoph Hellwig wrote:
> On Tue, May 12, 2020 at 12:59:49PM +1000, Dave Chinner wrote:
> > Concurrent rm of same 50 million inodes:
> > 
> > 		unpatched	patched
> > machine A:	8m30s		3m09s
> > machine B:	4m02s		4m51s
> 
> This actually is a significant slowdown on machine B, which

Oops, that's supposed to read "5m02s", not "4m02s". It was
marginally faster on machine B, as the commentary indicated. See the
log below for raw numbers. First set is unpatched, second set is the
patched kernel.

Cheers,

Dave.
Darrick J. Wong May 12, 2020, 4:03 p.m. UTC | #3
On Tue, May 12, 2020 at 12:59:49PM +1000, Dave Chinner wrote:
> [...]
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index e80bd2c4c279e..bc4853525ce18 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -883,7 +883,7 @@ xfs_quiesce_attr(
>  	int	error = 0;
>  
>  	/* wait for all modifications to complete */
> -	while (atomic_read(&mp->m_active_trans) > 0)
> +	while (percpu_counter_sum(&mp->m_active_trans) > 0)
>  		delay(100);

Hmm.  AFAICT, this counter stops us from quiescing the log while
transactions are still running.  We only quiesce the log for unmount,
remount-ro, and fs freeze.  Given that we now sb_start_write for
xfs_getfsmap and the background freeing threads, I wonder, do we still
need this at all?

--D

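For context on Darrick's question: every transaction except the
XFS_TRANS_NO_WRITECOUNT ones already pins freeze protection for its
whole lifetime. A hedged sketch of that pairing, modeled on
xfs_trans_alloc() and xfs_trans_free() of this era (the example_*
helpers are illustrative, not real XFS functions):

/*
 * Holding SB_FREEZE_FS means a freeze - and hence a log quiesce -
 * cannot complete while the transaction runs, which overlaps with
 * what m_active_trans counts. Illustrative sketch only.
 */
static void example_trans_start(struct xfs_mount *mp, uint flags)
{
	if (!(flags & XFS_TRANS_NO_WRITECOUNT))
		sb_start_intwrite(mp->m_super);
}

static void example_trans_end(struct xfs_trans *tp)
{
	if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
		sb_end_intwrite(tp->t_mountp->m_super);
}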
Dave Chinner May 12, 2020, 9:39 p.m. UTC | #4
On Tue, May 12, 2020 at 09:03:52AM -0700, Darrick J. Wong wrote:
> On Tue, May 12, 2020 at 12:59:49PM +1000, Dave Chinner wrote:
> > [...]
> > @@ -883,7 +883,7 @@ xfs_quiesce_attr(
> >  	int	error = 0;
> >  
> >  	/* wait for all modifications to complete */
> > -	while (atomic_read(&mp->m_active_trans) > 0)
> > +	while (percpu_counter_sum(&mp->m_active_trans) > 0)
> >  		delay(100);
> 
> Hmm.  AFAICT, this counter stops us from quiescing the log while
> transactions are still running.  We only quiesce the log for unmount,
> remount-ro, and fs freeze.  Given that we now sb_start_write for
> xfs_getfsmap and the background freeing threads, I wonder, do we still
> need this at all?

Perhaps not - I didn't look that far. It's basically only needed to
protect against XFS_TRANS_NO_WRITECOUNT transactions, which is
really just xfs_sync_sb() these days. This can come from several
places, but the only one outside of mount/freeze/unmount is the log
worker.  Perhaps the log worker can be cancelled before calling
xfs_quiesce_attr() rather than after?

Cheers,

Dave.
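
The path Dave describes, sketched from the era's xfs_log_worker()
(simplified and hedged - xfs_log_need_covered() is static to
xfs_log.c, so treat this as a paraphrase rather than callable code):

/*
 * The periodic log worker covers an idle log with a superblock sync.
 * xfs_sync_sb() allocates its transaction with
 * XFS_TRANS_NO_WRITECOUNT, i.e. without sb_start_intwrite(), so
 * m_active_trans is the only thing making xfs_quiesce_attr() wait
 * for it.
 */
static void example_log_worker(struct xfs_mount *mp)
{
	if (xfs_log_need_covered(mp))
		xfs_sync_sb(mp, true);	/* NO_WRITECOUNT transaction */
}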
Darrick J. Wong May 12, 2020, 10:59 p.m. UTC | #5
On Wed, May 13, 2020 at 07:39:19AM +1000, Dave Chinner wrote:
> On Tue, May 12, 2020 at 09:03:52AM -0700, Darrick J. Wong wrote:
> > On Tue, May 12, 2020 at 12:59:49PM +1000, Dave Chinner wrote:
> > > [...]
> > > @@ -883,7 +883,7 @@ xfs_quiesce_attr(
> > >  	int	error = 0;
> > >  
> > >  	/* wait for all modifications to complete */
> > > -	while (atomic_read(&mp->m_active_trans) > 0)
> > > +	while (percpu_counter_sum(&mp->m_active_trans) > 0)
> > >  		delay(100);
> > 
> > Hmm.  AFAICT, this counter stops us from quiescing the log while
> > transactions are still running.  We only quiesce the log for unmount,
> > remount-ro, and fs freeze.  Given that we now sb_start_write for
> > xfs_getfsmap and the background freeing threads, I wonder, do we still
> > need this at all?
> 
> Perhaps not - I didn't look that far. It's basically only needed to
> protect against XFS_TRANS_NO_WRITECOUNT transactions, which is
> really just xfs_sync_sb() these days. This can come from several
> places, but the only one outside of mount/freeze/unmount is the log
> worker.  Perhaps the log worker can be cancelled before calling
> xfs_quiesce_attr() rather than after?

What if we skip bumping m_active_trans for NO_WRITECOUNT transactions?
There aren't that many of them, and it'd probably be better for memory
consumption on 1000-core systems. ;)

--D

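A hedged sketch of Darrick's suggestion, applied to the call sites
this patch touches (illustrative only, not a posted patch; the
example_* helpers are not real XFS functions):

/*
 * Only count transactions that also take freeze protection, so the
 * log worker's xfs_sync_sb() never touches m_active_trans at all.
 */
static void example_trans_account(struct xfs_mount *mp, uint flags)
{
	if (!(flags & XFS_TRANS_NO_WRITECOUNT)) {
		sb_start_intwrite(mp->m_super);
		percpu_counter_inc(&mp->m_active_trans);
	}
}

static void example_trans_unaccount(struct xfs_trans *tp)
{
	if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) {
		percpu_counter_dec(&tp->t_mountp->m_active_trans);
		sb_end_intwrite(tp->t_mountp->m_super);
	}
}

With that, the percpu_counter_sum() loop in xfs_quiesce_attr() would
only wait for freeze-protected transactions, which is what quiesce
cares about.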
Darrick J. Wong May 13, 2020, 5:17 p.m. UTC | #6
On Tue, May 12, 2020 at 03:59:18PM -0700, Darrick J. Wong wrote:
> On Wed, May 13, 2020 at 07:39:19AM +1000, Dave Chinner wrote:
> > On Tue, May 12, 2020 at 09:03:52AM -0700, Darrick J. Wong wrote:
> > > On Tue, May 12, 2020 at 12:59:49PM +1000, Dave Chinner wrote:
> > > > [...]
> > > > @@ -883,7 +883,7 @@ xfs_quiesce_attr(
> > > >  	int	error = 0;
> > > >  
> > > >  	/* wait for all modifications to complete */
> > > > -	while (atomic_read(&mp->m_active_trans) > 0)
> > > > +	while (percpu_counter_sum(&mp->m_active_trans) > 0)
> > > >  		delay(100);
> > > 
> > > Hmm.  AFAICT, this counter stops us from quiescing the log while
> > > transactions are still running.  We only quiesce the log for unmount,
> > > remount-ro, and fs freeze.  Given that we now sb_start_write for
> > > xfs_getfsmap and the background freeing threads, I wonder, do we still
> > > need this at all?
> > 
> > Perhaps not - I didn't look that far. It's basically only needed to
> > protect against XFS_TRANS_NO_WRITECOUNT transactions, which is
> > really just xfs_sync_sb() these days. This can come from several
> > places, but the only one outside of mount/freeze/unmount is the log
> > worker.  Perhaps the log worker can be cancelled before calling
> > xfs_quiesce_attr() rather than after?
> 
> What if we skip bumping m_active_trans for NO_WRITECOUNT transactions?
> There aren't that many of them, and it'd probably be better for memory
> consumption on 1000-core systems. ;)

I think the log worker is the only piece remaining that uses
NO_WRITECOUNT transactions without fsfreeze protection, and AFAICT we
should be able to cancel it at the beginning of xfs_quiesce_attr since
the function forces the log out and pushes the AIL manually.  That seems
like it should enable us to remove m_active_trans...

--D
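
A hedged sketch of the ordering Darrick describes (illustrative only,
not a posted patch; in real code the cancel would likely sit behind a
helper in xfs_log.c, since struct xlog and its l_work member are
private to the log code):

/*
 * Stop the log worker up front so no new xfs_sync_sb() transactions
 * can start behind our backs; after that, freeze protection covers
 * everything m_active_trans was counting and the wait loop can go.
 */
static void example_quiesce(struct xfs_mount *mp)
{
	cancel_delayed_work_sync(&mp->m_log->l_work);

	/* force the log and drain the AIL, as the quiesce path does */
	xfs_log_force(mp, XFS_LOG_SYNC);
	xfs_ail_push_all_sync(mp->m_ail);

	xfs_log_quiesce(mp);
}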


Patch

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 712b3e2583316..af3d8b71e9591 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -84,6 +84,7 @@ typedef struct xfs_mount {
 	 * extents or anything related to the rt device.
 	 */
 	struct percpu_counter	m_delalloc_blks;
+	struct percpu_counter	m_active_trans;	/* in progress xact counter */
 
 	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
 	char			*m_rtname;	/* realtime device name */
@@ -164,7 +165,6 @@ typedef struct xfs_mount {
 	uint64_t		m_resblks;	/* total reserved blocks */
 	uint64_t		m_resblks_avail;/* available reserved blocks */
 	uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
-	atomic_t		m_active_trans;	/* number trans frozen */
 	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
 	struct delayed_work	m_eofblocks_work; /* background eof blocks
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e80bd2c4c279e..bc4853525ce18 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -883,7 +883,7 @@ xfs_quiesce_attr(
 	int	error = 0;
 
 	/* wait for all modifications to complete */
-	while (atomic_read(&mp->m_active_trans) > 0)
+	while (percpu_counter_sum(&mp->m_active_trans) > 0)
 		delay(100);
 
 	/* force the log to unpin objects from the now complete transactions */
@@ -902,7 +902,7 @@ xfs_quiesce_attr(
 	 * Just warn here till VFS can correctly support
 	 * read-only remount without racing.
 	 */
-	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
+	WARN_ON(percpu_counter_sum(&mp->m_active_trans) != 0);
 
 	xfs_log_quiesce(mp);
 }
@@ -1027,8 +1027,14 @@ xfs_init_percpu_counters(
 	if (error)
 		goto free_fdblocks;
 
+	error = percpu_counter_init(&mp->m_active_trans, 0, GFP_KERNEL);
+	if (error)
+		goto free_delalloc_blocks;
+
 	return 0;
 
+free_delalloc_blocks:
+	percpu_counter_destroy(&mp->m_delalloc_blks);
 free_fdblocks:
 	percpu_counter_destroy(&mp->m_fdblocks);
 free_ifree:
@@ -1057,6 +1063,7 @@ xfs_destroy_percpu_counters(
 	ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
 	       percpu_counter_sum(&mp->m_delalloc_blks) == 0);
 	percpu_counter_destroy(&mp->m_delalloc_blks);
+	percpu_counter_destroy(&mp->m_active_trans);
 }
 
 static void
@@ -1792,7 +1799,6 @@ static int xfs_init_fs_context(
 	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
 	spin_lock_init(&mp->m_perag_lock);
 	mutex_init(&mp->m_growlock);
-	atomic_set(&mp->m_active_trans, 0);
 	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 	INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 28b983ff8b113..636df5017782e 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -68,7 +68,7 @@ xfs_trans_free(
 	xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
 
 	trace_xfs_trans_free(tp, _RET_IP_);
-	atomic_dec(&tp->t_mountp->m_active_trans);
+	percpu_counter_dec(&tp->t_mountp->m_active_trans);
 	if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
 		sb_end_intwrite(tp->t_mountp->m_super);
 	xfs_trans_free_dqinfo(tp);
@@ -126,7 +126,7 @@ xfs_trans_dup(
 
 	xfs_trans_dup_dqinfo(tp, ntp);
 
-	atomic_inc(&tp->t_mountp->m_active_trans);
+	percpu_counter_inc(&tp->t_mountp->m_active_trans);
 	return ntp;
 }
 
@@ -275,7 +275,7 @@ xfs_trans_alloc(
 	 */
 	WARN_ON(resp->tr_logres > 0 &&
 		mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
-	atomic_inc(&mp->m_active_trans);
+	percpu_counter_inc(&mp->m_active_trans);
 
 	tp->t_magic = XFS_TRANS_HEADER_MAGIC;
 	tp->t_flags = flags;