diff mbox series

[1/5] xfs: separate read-only variables in struct xfs_mount

Message ID 20200512092811.1846252-2-david@fromorbit.com (mailing list archive)
State Superseded, archived
Headers show
Series xfs: fix a couple of performance issues | expand

Commit Message

Dave Chinner May 12, 2020, 9:28 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Seeing massive cpu usage from xfs_agino_range() on one machine;
instruction level profiles look similar to another machine running
the same workload, only one machine is consuming 10x as much CPU as
the other and going much slower. The only real difference between
the two machines is core count per socket. Both are running
identical 16p/16GB virtual machine configurations.

Machine A:

  25.83%  [k] xfs_agino_range
  12.68%  [k] __xfs_dir3_data_check
   6.95%  [k] xfs_verify_ino
   6.78%  [k] xfs_dir2_data_entry_tag_p
   3.56%  [k] xfs_buf_find
   2.31%  [k] xfs_verify_dir_ino
   2.02%  [k] xfs_dabuf_map.constprop.0
   1.65%  [k] xfs_ag_block_count

And takes around 13 minutes to remove 50 million inodes.

Machine B:

  13.90%  [k] __pv_queued_spin_lock_slowpath
   3.76%  [k] do_raw_spin_lock
   2.83%  [k] xfs_dir3_leaf_check_int
   2.75%  [k] xfs_agino_range
   2.51%  [k] __raw_callee_save___pv_queued_spin_unlock
   2.18%  [k] __xfs_dir3_data_check
   2.02%  [k] xfs_log_commit_cil

And takes around 5m30s to remove 50 million inodes.

Suspect is cacheline contention on m_sectbb_log which is used in one
of the macros in xfs_agino_range. This is a read-only variable but
shares a cacheline with m_active_trans which is a global atomic that
gets bounced all around the machine.

The workload is trying to run hundreds of thousands of transactions
per second and hence cacheline contention will be occurring on this
atomic counter. Hence xfs_agino_range() is likely just an
innocent bystander as the cache coherency protocol fights over the
cacheline between CPU cores and sockets.

On machine A, this rearrangement of the struct xfs_mount
results in the profile changing to:

   9.77%  [kernel]  [k] xfs_agino_range
   6.27%  [kernel]  [k] __xfs_dir3_data_check
   5.31%  [kernel]  [k] __pv_queued_spin_lock_slowpath
   4.54%  [kernel]  [k] xfs_buf_find
   3.79%  [kernel]  [k] do_raw_spin_lock
   3.39%  [kernel]  [k] xfs_verify_ino
   2.73%  [kernel]  [k] __raw_callee_save___pv_queued_spin_unlock

Vastly less CPU usage in xfs_agino_range(), but still 3x the amount
of machine B and still runs substantially slower than it should.

Current rm -rf of 50 million files:

		vanilla		patched
machine A	13m20s		8m30s
machine B	5m30s		5m02s

It's an improvement, hence indicating that separation and further
optimisation of read-only global filesystem data is worthwhile, but
it clearly isn't the underlying issue causing this specific
performance degradation.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_mount.h | 50 +++++++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 21 deletions(-)

Comments

Brian Foster May 12, 2020, 12:30 p.m. UTC | #1
On Tue, May 12, 2020 at 07:28:07PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Seeing massive cpu usage from xfs_agino_range() on one machine;
> instruction level profiles look similar to another machine running
> the same workload, only one machine is consuming 10x as much CPU as
> the other and going much slower. The only real difference between
> the two machines is core count per socket. Both are running
> identical 16p/16GB virtual machine configurations
> 
...
> 
> It's an improvement, hence indicating that separation and further
> optimisation of read-only global filesystem data is worthwhile, but
> it clearly isn't the underlying issue causing this specific
> performance degradation.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---

Pretty neat improvement. Could you share your test script that generated
the above? I have an 80 CPU box I'd be interested to give this a whirl
on...

>  fs/xfs/xfs_mount.h | 50 +++++++++++++++++++++++++++-------------------
>  1 file changed, 29 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index aba5a15792792..712b3e2583316 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -88,21 +88,12 @@ typedef struct xfs_mount {
>  	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
>  	char			*m_rtname;	/* realtime device name */
>  	char			*m_logname;	/* external log device name */
> -	int			m_bsize;	/* fs logical block size */
>  	xfs_agnumber_t		m_agfrotor;	/* last ag where space found */
>  	xfs_agnumber_t		m_agirotor;	/* last ag dir inode alloced */
>  	spinlock_t		m_agirotor_lock;/* .. and lock protecting it */
> -	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
> -	uint			m_allocsize_log;/* min write size log bytes */
> -	uint			m_allocsize_blocks; /* min write size blocks */
>  	struct xfs_da_geometry	*m_dir_geo;	/* directory block geometry */
>  	struct xfs_da_geometry	*m_attr_geo;	/* attribute block geometry */
>  	struct xlog		*m_log;		/* log specific stuff */
> -	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
> -	int			m_logbufs;	/* number of log buffers */
> -	int			m_logbsize;	/* size of each log buffer */
> -	uint			m_rsumlevels;	/* rt summary levels */
> -	uint			m_rsumsize;	/* size of rt summary, bytes */
>  	/*
>  	 * Optional cache of rt summary level per bitmap block with the
>  	 * invariant that m_rsum_cache[bbno] <= the minimum i for which
> @@ -117,9 +108,15 @@ typedef struct xfs_mount {
>  	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
>  	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
>  	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
> +
> +	/*
> +	 * Read-only variables that are pre-calculated at mount time.
> +	 */

The intent here is to align the entire section below, right? If so, the
connection with the cache line alignment is a bit tenuous. Could we
tweak and/or add a sentence to the comment to be more explicit? I.e.:

	/*
	 * Align the following pre-calculated fields to a cache line to
	 * prevent cache line bouncing between frequently read and
	 * frequently written fields.
	 */

> +	int ____cacheline_aligned m_bsize;	/* fs logical block size */
>  	uint8_t			m_blkbit_log;	/* blocklog + NBBY */
>  	uint8_t			m_blkbb_log;	/* blocklog - BBSHIFT */
>  	uint8_t			m_agno_log;	/* log #ag's */
> +	uint8_t			m_sectbb_log;	/* sectlog - BBSHIFT */
>  	uint			m_blockmask;	/* sb_blocksize-1 */
>  	uint			m_blockwsize;	/* sb_blocksize in words */
>  	uint			m_blockwmask;	/* blockwsize-1 */
> @@ -138,20 +135,35 @@ typedef struct xfs_mount {
>  	xfs_extlen_t		m_ag_prealloc_blocks; /* reserved ag blocks */
>  	uint			m_alloc_set_aside; /* space we can't use */
>  	uint			m_ag_max_usable; /* max space per AG */
> -	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
> -	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
> -	struct mutex		m_growlock;	/* growfs mutex */
> +	int			m_dalign;	/* stripe unit */
> +	int			m_swidth;	/* stripe width */
> +	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
> +	uint			m_allocsize_log;/* min write size log bytes */
> +	uint			m_allocsize_blocks; /* min write size blocks */
> +	int			m_logbufs;	/* number of log buffers */
> +	int			m_logbsize;	/* size of each log buffer */
> +	uint			m_rsumlevels;	/* rt summary levels */
> +	uint			m_rsumsize;	/* size of rt summary, bytes */
>  	int			m_fixedfsid[2];	/* unchanged for life of FS */
> -	uint64_t		m_flags;	/* global mount flags */
> -	bool			m_finobt_nores; /* no per-AG finobt resv. */
>  	uint			m_qflags;	/* quota status flags */
> +	uint64_t		m_flags;	/* global mount flags */
> +	int64_t			m_low_space[XFS_LOWSP_MAX];
> +	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
>  	struct xfs_trans_resv	m_resv;		/* precomputed res values */
> +						/* low free space thresholds */
> +	bool			m_always_cow;
> +	bool			m_fail_unmount;
> +	bool			m_finobt_nores; /* no per-AG finobt resv. */
> +	/*
> +	 * End of pre-calculated read-only variables
> +	 */

m_always_cow and m_fail_unmount are mutable via sysfs knobs so
technically not read-only. I'm assuming that has no practical impact on
the performance optimization, but it might be worth leaving them where
they are to avoid confusion.

With those nits fixed:

Reviewed-by: Brian Foster <bfoster@redhat.com>

> +
> +	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
> +	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
> +	struct mutex		m_growlock;	/* growfs mutex */
>  	uint64_t		m_resblks;	/* total reserved blocks */
>  	uint64_t		m_resblks_avail;/* available reserved blocks */
>  	uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
> -	int			m_dalign;	/* stripe unit */
> -	int			m_swidth;	/* stripe width */
> -	uint8_t			m_sectbb_log;	/* sectlog - BBSHIFT */
>  	atomic_t		m_active_trans;	/* number trans frozen */
>  	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
>  	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
> @@ -160,8 +172,6 @@ typedef struct xfs_mount {
>  	struct delayed_work	m_cowblocks_work; /* background cow blocks
>  						     trimming */
>  	bool			m_update_sb;	/* sb needs update in mount */
> -	int64_t			m_low_space[XFS_LOWSP_MAX];
> -						/* low free space thresholds */
>  	struct xfs_kobj		m_kobj;
>  	struct xfs_kobj		m_error_kobj;
>  	struct xfs_kobj		m_error_meta_kobj;
> @@ -191,8 +201,6 @@ typedef struct xfs_mount {
>  	 */
>  	uint32_t		m_generation;
>  
> -	bool			m_always_cow;
> -	bool			m_fail_unmount;
>  #ifdef DEBUG
>  	/*
>  	 * Frequency with which errors are injected.  Replaces xfs_etest; the
> -- 
> 2.26.1.301.g55bc3eb7cb9
>
Darrick J. Wong May 12, 2020, 4:09 p.m. UTC | #2
On Tue, May 12, 2020 at 08:30:27AM -0400, Brian Foster wrote:
> On Tue, May 12, 2020 at 07:28:07PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Seeing massive cpu usage from xfs_agino_range() on one machine;
> > instruction level profiles look similar to another machine running
> > the same workload, only one machine is consuming 10x as much CPU as
> > the other and going much slower. The only real difference between
> > the two machines is core count per socket. Both are running
> > identical 16p/16GB virtual machine configurations
> > 
> ...
> > 
> > It's an improvement, hence indicating that separation and further
> > optimisation of read-only global filesystem data is worthwhile, but
> > it clearly isn't the underlying issue causing this specific
> > performance degradation.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> 
> Pretty neat improvement. Could you share your test script that generated
> the above? I have an 80 CPU box I'd be interested to give this a whirl
> on...
> 
> >  fs/xfs/xfs_mount.h | 50 +++++++++++++++++++++++++++-------------------
> >  1 file changed, 29 insertions(+), 21 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > index aba5a15792792..712b3e2583316 100644
> > --- a/fs/xfs/xfs_mount.h
> > +++ b/fs/xfs/xfs_mount.h
> > @@ -88,21 +88,12 @@ typedef struct xfs_mount {
> >  	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
> >  	char			*m_rtname;	/* realtime device name */
> >  	char			*m_logname;	/* external log device name */
> > -	int			m_bsize;	/* fs logical block size */
> >  	xfs_agnumber_t		m_agfrotor;	/* last ag where space found */
> >  	xfs_agnumber_t		m_agirotor;	/* last ag dir inode alloced */
> >  	spinlock_t		m_agirotor_lock;/* .. and lock protecting it */
> > -	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
> > -	uint			m_allocsize_log;/* min write size log bytes */
> > -	uint			m_allocsize_blocks; /* min write size blocks */
> >  	struct xfs_da_geometry	*m_dir_geo;	/* directory block geometry */
> >  	struct xfs_da_geometry	*m_attr_geo;	/* attribute block geometry */
> >  	struct xlog		*m_log;		/* log specific stuff */
> > -	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
> > -	int			m_logbufs;	/* number of log buffers */
> > -	int			m_logbsize;	/* size of each log buffer */
> > -	uint			m_rsumlevels;	/* rt summary levels */
> > -	uint			m_rsumsize;	/* size of rt summary, bytes */
> >  	/*
> >  	 * Optional cache of rt summary level per bitmap block with the
> >  	 * invariant that m_rsum_cache[bbno] <= the minimum i for which
> > @@ -117,9 +108,15 @@ typedef struct xfs_mount {
> >  	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
> >  	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
> >  	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
> > +
> > +	/*
> > +	 * Read-only variables that are pre-calculated at mount time.
> > +	 */
> 
> The intent here is to align the entire section below, right? If so, the
> connection with the cache line alignment is a bit tenuous. Could we
> tweak and/or add a sentence to the comment to be more explicit? I.e.:
> 
> 	/*
> 	 * Align the following pre-calculated fields to a cache line to
> 	 * prevent cache line bouncing between frequently read and
> 	 * frequently written fields.
> 	 */

I kinda wish we laid out via comments each place we cross a 64b boundary
on a 64-bit CPU, but I guess seeing as some of these structures can
change size depending on config option and kernel version that's
probably just asking for confusion and madness.

Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D


> > +	int ____cacheline_aligned m_bsize;	/* fs logical block size */
> >  	uint8_t			m_blkbit_log;	/* blocklog + NBBY */
> >  	uint8_t			m_blkbb_log;	/* blocklog - BBSHIFT */
> >  	uint8_t			m_agno_log;	/* log #ag's */
> > +	uint8_t			m_sectbb_log;	/* sectlog - BBSHIFT */
> >  	uint			m_blockmask;	/* sb_blocksize-1 */
> >  	uint			m_blockwsize;	/* sb_blocksize in words */
> >  	uint			m_blockwmask;	/* blockwsize-1 */
> > @@ -138,20 +135,35 @@ typedef struct xfs_mount {
> >  	xfs_extlen_t		m_ag_prealloc_blocks; /* reserved ag blocks */
> >  	uint			m_alloc_set_aside; /* space we can't use */
> >  	uint			m_ag_max_usable; /* max space per AG */
> > -	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
> > -	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
> > -	struct mutex		m_growlock;	/* growfs mutex */
> > +	int			m_dalign;	/* stripe unit */
> > +	int			m_swidth;	/* stripe width */
> > +	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
> > +	uint			m_allocsize_log;/* min write size log bytes */
> > +	uint			m_allocsize_blocks; /* min write size blocks */
> > +	int			m_logbufs;	/* number of log buffers */
> > +	int			m_logbsize;	/* size of each log buffer */
> > +	uint			m_rsumlevels;	/* rt summary levels */
> > +	uint			m_rsumsize;	/* size of rt summary, bytes */
> >  	int			m_fixedfsid[2];	/* unchanged for life of FS */
> > -	uint64_t		m_flags;	/* global mount flags */
> > -	bool			m_finobt_nores; /* no per-AG finobt resv. */
> >  	uint			m_qflags;	/* quota status flags */
> > +	uint64_t		m_flags;	/* global mount flags */
> > +	int64_t			m_low_space[XFS_LOWSP_MAX];
> > +	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
> >  	struct xfs_trans_resv	m_resv;		/* precomputed res values */
> > +						/* low free space thresholds */
> > +	bool			m_always_cow;
> > +	bool			m_fail_unmount;
> > +	bool			m_finobt_nores; /* no per-AG finobt resv. */
> > +	/*
> > +	 * End of pre-calculated read-only variables
> > +	 */
> 
> m_always_cow and m_fail_unmount are mutable via sysfs knobs so
> technically not read-only. I'm assuming that has no practical impact on
> the performance optimization, but it might be worth leaving them where
> they are to avoid confusion.
> 
> With those nits fixed:
> 
> Reviewed-by: Brian Foster <bfoster@redhat.com>
> 
> > +
> > +	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
> > +	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
> > +	struct mutex		m_growlock;	/* growfs mutex */
> >  	uint64_t		m_resblks;	/* total reserved blocks */
> >  	uint64_t		m_resblks_avail;/* available reserved blocks */
> >  	uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
> > -	int			m_dalign;	/* stripe unit */
> > -	int			m_swidth;	/* stripe width */
> > -	uint8_t			m_sectbb_log;	/* sectlog - BBSHIFT */
> >  	atomic_t		m_active_trans;	/* number trans frozen */
> >  	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
> >  	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
> > @@ -160,8 +172,6 @@ typedef struct xfs_mount {
> >  	struct delayed_work	m_cowblocks_work; /* background cow blocks
> >  						     trimming */
> >  	bool			m_update_sb;	/* sb needs update in mount */
> > -	int64_t			m_low_space[XFS_LOWSP_MAX];
> > -						/* low free space thresholds */
> >  	struct xfs_kobj		m_kobj;
> >  	struct xfs_kobj		m_error_kobj;
> >  	struct xfs_kobj		m_error_meta_kobj;
> > @@ -191,8 +201,6 @@ typedef struct xfs_mount {
> >  	 */
> >  	uint32_t		m_generation;
> >  
> > -	bool			m_always_cow;
> > -	bool			m_fail_unmount;
> >  #ifdef DEBUG
> >  	/*
> >  	 * Frequency with which errors are injected.  Replaces xfs_etest; the
> > -- 
> > 2.26.1.301.g55bc3eb7cb9
> > 
>
Dave Chinner May 12, 2020, 9:43 p.m. UTC | #3
On Tue, May 12, 2020 at 09:09:58AM -0700, Darrick J. Wong wrote:
> On Tue, May 12, 2020 at 08:30:27AM -0400, Brian Foster wrote:
> > On Tue, May 12, 2020 at 07:28:07PM +1000, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > Seeing massive cpu usage from xfs_agino_range() on one machine;
> > > instruction level profiles look similar to another machine running
> > > the same workload, only one machine is consuming 10x as much CPU as
> > > the other and going much slower. The only real difference between
> > > the two machines is core count per socket. Both are running
> > > identical 16p/16GB virtual machine configurations
> > > 
> > ...
> > > 
> > > It's an improvement, hence indicating that separation and further
> > > optimisation of read-only global filesystem data is worthwhile, but
> > > it clearly isn't the underlying issue causing this specific
> > > performance degradation.
> > > 
> > > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > > ---
> > 
> > Pretty neat improvement. Could you share your test script that generated
> > the above? I have an 80 CPU box I'd be interested to give this a whirl
> > on...
> > 
> > >  fs/xfs/xfs_mount.h | 50 +++++++++++++++++++++++++++-------------------
> > >  1 file changed, 29 insertions(+), 21 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > > index aba5a15792792..712b3e2583316 100644
> > > --- a/fs/xfs/xfs_mount.h
> > > +++ b/fs/xfs/xfs_mount.h
> > > @@ -88,21 +88,12 @@ typedef struct xfs_mount {
> > >  	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
> > >  	char			*m_rtname;	/* realtime device name */
> > >  	char			*m_logname;	/* external log device name */
> > > -	int			m_bsize;	/* fs logical block size */
> > >  	xfs_agnumber_t		m_agfrotor;	/* last ag where space found */
> > >  	xfs_agnumber_t		m_agirotor;	/* last ag dir inode alloced */
> > >  	spinlock_t		m_agirotor_lock;/* .. and lock protecting it */
> > > -	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
> > > -	uint			m_allocsize_log;/* min write size log bytes */
> > > -	uint			m_allocsize_blocks; /* min write size blocks */
> > >  	struct xfs_da_geometry	*m_dir_geo;	/* directory block geometry */
> > >  	struct xfs_da_geometry	*m_attr_geo;	/* attribute block geometry */
> > >  	struct xlog		*m_log;		/* log specific stuff */
> > > -	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
> > > -	int			m_logbufs;	/* number of log buffers */
> > > -	int			m_logbsize;	/* size of each log buffer */
> > > -	uint			m_rsumlevels;	/* rt summary levels */
> > > -	uint			m_rsumsize;	/* size of rt summary, bytes */
> > >  	/*
> > >  	 * Optional cache of rt summary level per bitmap block with the
> > >  	 * invariant that m_rsum_cache[bbno] <= the minimum i for which
> > > @@ -117,9 +108,15 @@ typedef struct xfs_mount {
> > >  	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
> > >  	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
> > >  	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
> > > +
> > > +	/*
> > > +	 * Read-only variables that are pre-calculated at mount time.
> > > +	 */
> > 
> > The intent here is to align the entire section below, right? If so, the
> > connection with the cache line alignment is a bit tenuous. Could we
> > tweak and/or add a sentence to the comment to be more explicit? I.e.:
> > 
> > 	/*
> > 	 * Align the following pre-calculated fields to a cache line to
> > 	 * prevent cache line bouncing between frequently read and
> > 	 * frequently written fields.
> > 	 */
> 
> I kinda wish we laid out via comments each place we cross a 64b boundary
> on a 64-bit CPU, but I guess seeing as some of these structures can
> change size depending on config option and kernel version that's
> probably just asking for confusion and madness.

Yup, that's why we have tools like pahole. Stuff like lock debugging
or even different locking options change the size and layout of
structures like this, so you really have to look at pahole output to
determine what sits in the same cacheline for any given kernel
build.

Cheers,

Dave.
Dave Chinner May 12, 2020, 9:53 p.m. UTC | #4
On Tue, May 12, 2020 at 08:30:27AM -0400, Brian Foster wrote:
> On Tue, May 12, 2020 at 07:28:07PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Seeing massive cpu usage from xfs_agino_range() on one machine;
> > instruction level profiles look similar to another machine running
> > the same workload, only one machine is consuming 10x as much CPU as
> > the other and going much slower. The only real difference between
> > the two machines is core count per socket. Both are running
> > identical 16p/16GB virtual machine configurations
> > 
> ...
> > 
> > It's an improvement, hence indicating that separation and further
> > optimisation of read-only global filesystem data is worthwhile, but
> > it clearly isn't the underlying issue causing this specific
> > performance degradation.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> 
> Pretty neat improvement. Could you share your test script that generated
> the above? I have an 80 CPU box I'd be interested to give this a whirl
> on...

I need to punch it out to a git repo somewhere...

> >  fs/xfs/xfs_mount.h | 50 +++++++++++++++++++++++++++-------------------
> >  1 file changed, 29 insertions(+), 21 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > index aba5a15792792..712b3e2583316 100644
> > --- a/fs/xfs/xfs_mount.h
> > +++ b/fs/xfs/xfs_mount.h
> > @@ -88,21 +88,12 @@ typedef struct xfs_mount {
> >  	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
> >  	char			*m_rtname;	/* realtime device name */
> >  	char			*m_logname;	/* external log device name */
> > -	int			m_bsize;	/* fs logical block size */
> >  	xfs_agnumber_t		m_agfrotor;	/* last ag where space found */
> >  	xfs_agnumber_t		m_agirotor;	/* last ag dir inode alloced */
> >  	spinlock_t		m_agirotor_lock;/* .. and lock protecting it */
> > -	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
> > -	uint			m_allocsize_log;/* min write size log bytes */
> > -	uint			m_allocsize_blocks; /* min write size blocks */
> >  	struct xfs_da_geometry	*m_dir_geo;	/* directory block geometry */
> >  	struct xfs_da_geometry	*m_attr_geo;	/* attribute block geometry */
> >  	struct xlog		*m_log;		/* log specific stuff */
> > -	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
> > -	int			m_logbufs;	/* number of log buffers */
> > -	int			m_logbsize;	/* size of each log buffer */
> > -	uint			m_rsumlevels;	/* rt summary levels */
> > -	uint			m_rsumsize;	/* size of rt summary, bytes */
> >  	/*
> >  	 * Optional cache of rt summary level per bitmap block with the
> >  	 * invariant that m_rsum_cache[bbno] <= the minimum i for which
> > @@ -117,9 +108,15 @@ typedef struct xfs_mount {
> >  	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
> >  	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
> >  	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
> > +
> > +	/*
> > +	 * Read-only variables that are pre-calculated at mount time.
> > +	 */
> 
> The intent here is to align the entire section below, right? If so, the
> connection with the cache line alignment is a bit tenuous. Could we
> tweak and/or add a sentence to the comment to be more explicit? I.e.:
> 
> 	/*
> 	 * Align the following pre-calculated fields to a cache line to
> 	 * prevent cache line bouncing between frequently read and
> 	 * frequently written fields.
> 	 */

I can make it more verbose, but we don't tend to verbosely comment
variables tagged with __read_mostly as tagging them implies that
they need separation from frequently written variables. I can't use
__read_mostly here (it's for global variables and affects the data
segment they are placed in) so I just put a simple comment in. I
should have used the term "Read-mostly variables" in the comment
because...

> > +	bool			m_always_cow;
> > +	bool			m_fail_unmount;
> > +	bool			m_finobt_nores; /* no per-AG finobt resv. */
> > +	/*
> > +	 * End of pre-calculated read-only variables
> > +	 */
> 
> m_always_cow and m_fail_unmount are mutable via sysfs knobs so
> technically not read-only.

... yes, these are read-mostly variables, not "read-only". The
optimisation still stands for them, as they may never be changed
after the initial user setup post mount....

I'll do another round on this patch to take all the comments into
account...

Cheers,

Dave.
diff mbox series

Patch

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index aba5a15792792..712b3e2583316 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -88,21 +88,12 @@  typedef struct xfs_mount {
 	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
 	char			*m_rtname;	/* realtime device name */
 	char			*m_logname;	/* external log device name */
-	int			m_bsize;	/* fs logical block size */
 	xfs_agnumber_t		m_agfrotor;	/* last ag where space found */
 	xfs_agnumber_t		m_agirotor;	/* last ag dir inode alloced */
 	spinlock_t		m_agirotor_lock;/* .. and lock protecting it */
-	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
-	uint			m_allocsize_log;/* min write size log bytes */
-	uint			m_allocsize_blocks; /* min write size blocks */
 	struct xfs_da_geometry	*m_dir_geo;	/* directory block geometry */
 	struct xfs_da_geometry	*m_attr_geo;	/* attribute block geometry */
 	struct xlog		*m_log;		/* log specific stuff */
-	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
-	int			m_logbufs;	/* number of log buffers */
-	int			m_logbsize;	/* size of each log buffer */
-	uint			m_rsumlevels;	/* rt summary levels */
-	uint			m_rsumsize;	/* size of rt summary, bytes */
 	/*
 	 * Optional cache of rt summary level per bitmap block with the
 	 * invariant that m_rsum_cache[bbno] <= the minimum i for which
@@ -117,9 +108,15 @@  typedef struct xfs_mount {
 	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
 	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
 	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
+
+	/*
+	 * Read-only variables that are pre-calculated at mount time.
+	 */
+	int ____cacheline_aligned m_bsize;	/* fs logical block size */
 	uint8_t			m_blkbit_log;	/* blocklog + NBBY */
 	uint8_t			m_blkbb_log;	/* blocklog - BBSHIFT */
 	uint8_t			m_agno_log;	/* log #ag's */
+	uint8_t			m_sectbb_log;	/* sectlog - BBSHIFT */
 	uint			m_blockmask;	/* sb_blocksize-1 */
 	uint			m_blockwsize;	/* sb_blocksize in words */
 	uint			m_blockwmask;	/* blockwsize-1 */
@@ -138,20 +135,35 @@  typedef struct xfs_mount {
 	xfs_extlen_t		m_ag_prealloc_blocks; /* reserved ag blocks */
 	uint			m_alloc_set_aside; /* space we can't use */
 	uint			m_ag_max_usable; /* max space per AG */
-	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
-	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
-	struct mutex		m_growlock;	/* growfs mutex */
+	int			m_dalign;	/* stripe unit */
+	int			m_swidth;	/* stripe width */
+	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
+	uint			m_allocsize_log;/* min write size log bytes */
+	uint			m_allocsize_blocks; /* min write size blocks */
+	int			m_logbufs;	/* number of log buffers */
+	int			m_logbsize;	/* size of each log buffer */
+	uint			m_rsumlevels;	/* rt summary levels */
+	uint			m_rsumsize;	/* size of rt summary, bytes */
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
-	uint64_t		m_flags;	/* global mount flags */
-	bool			m_finobt_nores; /* no per-AG finobt resv. */
 	uint			m_qflags;	/* quota status flags */
+	uint64_t		m_flags;	/* global mount flags */
+	int64_t			m_low_space[XFS_LOWSP_MAX];
+	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
 	struct xfs_trans_resv	m_resv;		/* precomputed res values */
+						/* low free space thresholds */
+	bool			m_always_cow;
+	bool			m_fail_unmount;
+	bool			m_finobt_nores; /* no per-AG finobt resv. */
+	/*
+	 * End of pre-calculated read-only variables
+	 */
+
+	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
+	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
+	struct mutex		m_growlock;	/* growfs mutex */
 	uint64_t		m_resblks;	/* total reserved blocks */
 	uint64_t		m_resblks_avail;/* available reserved blocks */
 	uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
-	int			m_dalign;	/* stripe unit */
-	int			m_swidth;	/* stripe width */
-	uint8_t			m_sectbb_log;	/* sectlog - BBSHIFT */
 	atomic_t		m_active_trans;	/* number trans frozen */
 	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
@@ -160,8 +172,6 @@  typedef struct xfs_mount {
 	struct delayed_work	m_cowblocks_work; /* background cow blocks
 						     trimming */
 	bool			m_update_sb;	/* sb needs update in mount */
-	int64_t			m_low_space[XFS_LOWSP_MAX];
-						/* low free space thresholds */
 	struct xfs_kobj		m_kobj;
 	struct xfs_kobj		m_error_kobj;
 	struct xfs_kobj		m_error_meta_kobj;
@@ -191,8 +201,6 @@  typedef struct xfs_mount {
 	 */
 	uint32_t		m_generation;
 
-	bool			m_always_cow;
-	bool			m_fail_unmount;
 #ifdef DEBUG
 	/*
 	 * Frequency with which errors are injected.  Replaces xfs_etest; the