
[02/10] xfs: convert quotacheck to use the new iwalk functions

Message ID 155968498085.1657646.3518168545540841602.stgit@magnolia (mailing list archive)
State Superseded, archived
Series xfs: refactor and improve inode iteration

Commit Message

Darrick J. Wong June 4, 2019, 9:49 p.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Convert quotacheck to use the new iwalk iterator to dig through the
inodes.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_qm.c |   62 ++++++++++++++++++-------------------------------------
 1 file changed, 20 insertions(+), 42 deletions(-)
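
The conversion moves xfs_qm_dqusage_adjust() from the old bulkstat
callback convention (status reported through *res plus a return code)
to the xfs_iwalk convention visible in the diff: return 0 to continue
the walk, a negative errno to abort it, and treat -EINVAL/-ENOENT from
xfs_iget() as an inode that was freed after the inobt lookup and can be
skipped. A minimal sketch of a callback under that contract follows;
xfs_example_iwalk_fn() is a made-up name and the per-inode work is
elided:

/*
 * Sketch of an xfs_iwalk callback following the contract the patch
 * relies on: 0 keeps the walk going, a negative errno aborts it, and
 * an inode freed between the inobt lookup and xfs_iget() shows up as
 * -EINVAL or -ENOENT and is skipped.
 */
STATIC int
xfs_example_iwalk_fn(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	void			*data)
{
	struct xfs_inode	*ip;
	int			error;

	error = xfs_iget(mp, tp, ino, XFS_IGET_DONTCACHE, 0, &ip);
	if (error == -EINVAL || error == -ENOENT)
		return 0;	/* inode went away; skip it */
	if (error)
		return error;	/* abort the walk */

	/* ... per-inode work would go here ... */

	xfs_irele(ip);
	return 0;
}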

Comments

Brian Foster June 10, 2019, 1:58 p.m. UTC | #1
On Tue, Jun 04, 2019 at 02:49:40PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Convert quotacheck to use the new iwalk iterator to dig through the
> inodes.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_qm.c |   62 ++++++++++++++++++-------------------------------------
>  1 file changed, 20 insertions(+), 42 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> index aa6b6db3db0e..a5b2260406a8 100644
> --- a/fs/xfs/xfs_qm.c
> +++ b/fs/xfs/xfs_qm.c
...
> @@ -1136,20 +1135,18 @@ xfs_qm_dqusage_adjust(
>  	 * rootino must have its resources accounted for, not so with the quota
>  	 * inodes.
>  	 */
> -	if (xfs_is_quota_inode(&mp->m_sb, ino)) {
> -		*res = BULKSTAT_RV_NOTHING;
> -		return -EINVAL;
> -	}
> +	if (xfs_is_quota_inode(&mp->m_sb, ino))
> +		return 0;
>  
>  	/*
>  	 * We don't _need_ to take the ilock EXCL here because quotacheck runs
>  	 * at mount time and therefore nobody will be racing chown/chproj.
>  	 */
> -	error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip);
> -	if (error) {
> -		*res = BULKSTAT_RV_NOTHING;
> +	error = xfs_iget(mp, tp, ino, XFS_IGET_DONTCACHE, 0, &ip);

I was wondering if we should start using IGET_UNTRUSTED here, but I
guess we 1.) are protected by quotacheck context and 2.) have the same
record validity semantics as the existing bulkstat walker. LGTM:

Reviewed-by: Brian Foster <bfoster@redhat.com>

> +	if (error == -EINVAL || error == -ENOENT)
> +		return 0;
> +	if (error)
>  		return error;
> -	}
>  
>  	ASSERT(ip->i_delayed_blks == 0);
>  
> @@ -1157,7 +1154,7 @@ xfs_qm_dqusage_adjust(
>  		struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
>  
>  		if (!(ifp->if_flags & XFS_IFEXTENTS)) {
> -			error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
> +			error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
>  			if (error)
>  				goto error0;
>  		}
> @@ -1200,13 +1197,8 @@ xfs_qm_dqusage_adjust(
>  			goto error0;
>  	}
>  
> -	xfs_irele(ip);
> -	*res = BULKSTAT_RV_DIDONE;
> -	return 0;
> -
>  error0:
>  	xfs_irele(ip);
> -	*res = BULKSTAT_RV_GIVEUP;
>  	return error;
>  }
>  
> @@ -1270,18 +1262,13 @@ STATIC int
>  xfs_qm_quotacheck(
>  	xfs_mount_t	*mp)
>  {
> -	int			done, count, error, error2;
> -	xfs_ino_t		lastino;
> -	size_t			structsz;
> +	int			error, error2;
>  	uint			flags;
>  	LIST_HEAD		(buffer_list);
>  	struct xfs_inode	*uip = mp->m_quotainfo->qi_uquotaip;
>  	struct xfs_inode	*gip = mp->m_quotainfo->qi_gquotaip;
>  	struct xfs_inode	*pip = mp->m_quotainfo->qi_pquotaip;
>  
> -	count = INT_MAX;
> -	structsz = 1;
> -	lastino = 0;
>  	flags = 0;
>  
>  	ASSERT(uip || gip || pip);
> @@ -1318,18 +1305,9 @@ xfs_qm_quotacheck(
>  		flags |= XFS_PQUOTA_CHKD;
>  	}
>  
> -	do {
> -		/*
> -		 * Iterate thru all the inodes in the file system,
> -		 * adjusting the corresponding dquot counters in core.
> -		 */
> -		error = xfs_bulkstat(mp, &lastino, &count,
> -				     xfs_qm_dqusage_adjust,
> -				     structsz, NULL, &done);
> -		if (error)
> -			break;
> -
> -	} while (!done);
> +	error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL);
> +	if (error)
> +		goto error_return;
>  
>  	/*
>  	 * We've made all the changes that we need to make incore.  Flush them
>
Darrick J. Wong June 10, 2019, 5:10 p.m. UTC | #2
On Mon, Jun 10, 2019 at 09:58:52AM -0400, Brian Foster wrote:
> On Tue, Jun 04, 2019 at 02:49:40PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Convert quotacheck to use the new iwalk iterator to dig through the
> > inodes.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > Reviewed-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/xfs_qm.c |   62 ++++++++++++++++++-------------------------------------
> >  1 file changed, 20 insertions(+), 42 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> > index aa6b6db3db0e..a5b2260406a8 100644
> > --- a/fs/xfs/xfs_qm.c
> > +++ b/fs/xfs/xfs_qm.c
> ...
> > @@ -1136,20 +1135,18 @@ xfs_qm_dqusage_adjust(
> >  	 * rootino must have its resources accounted for, not so with the quota
> >  	 * inodes.
> >  	 */
> > -	if (xfs_is_quota_inode(&mp->m_sb, ino)) {
> > -		*res = BULKSTAT_RV_NOTHING;
> > -		return -EINVAL;
> > -	}
> > +	if (xfs_is_quota_inode(&mp->m_sb, ino))
> > +		return 0;
> >  
> >  	/*
> >  	 * We don't _need_ to take the ilock EXCL here because quotacheck runs
> >  	 * at mount time and therefore nobody will be racing chown/chproj.
> >  	 */
> > -	error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip);
> > -	if (error) {
> > -		*res = BULKSTAT_RV_NOTHING;
> > +	error = xfs_iget(mp, tp, ino, XFS_IGET_DONTCACHE, 0, &ip);
> 
> I was wondering if we should start using IGET_UNTRUSTED here, but I
> guess we 1.) are protected by quotacheck context and 2.) have the same
> record validity semantics as the existing bulkstat walker. LGTM:

Yep.  There's nothing else running in the fs at quotacheck time so inode
records should not be changing while quotacheck runs.

> Reviewed-by: Brian Foster <bfoster@redhat.com>
> 
> > +	if (error == -EINVAL || error == -ENOENT)
> > +		return 0;
> > +	if (error)
> >  		return error;
> > -	}
> >  
> >  	ASSERT(ip->i_delayed_blks == 0);
> >  
> > @@ -1157,7 +1154,7 @@ xfs_qm_dqusage_adjust(
> >  		struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> >  
> >  		if (!(ifp->if_flags & XFS_IFEXTENTS)) {
> > -			error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
> > +			error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
> >  			if (error)
> >  				goto error0;
> >  		}
> > @@ -1200,13 +1197,8 @@ xfs_qm_dqusage_adjust(
> >  			goto error0;
> >  	}
> >  
> > -	xfs_irele(ip);
> > -	*res = BULKSTAT_RV_DIDONE;
> > -	return 0;
> > -
> >  error0:
> >  	xfs_irele(ip);
> > -	*res = BULKSTAT_RV_GIVEUP;
> >  	return error;
> >  }
> >  
> > @@ -1270,18 +1262,13 @@ STATIC int
> >  xfs_qm_quotacheck(
> >  	xfs_mount_t	*mp)
> >  {
> > -	int			done, count, error, error2;
> > -	xfs_ino_t		lastino;
> > -	size_t			structsz;
> > +	int			error, error2;
> >  	uint			flags;
> >  	LIST_HEAD		(buffer_list);
> >  	struct xfs_inode	*uip = mp->m_quotainfo->qi_uquotaip;
> >  	struct xfs_inode	*gip = mp->m_quotainfo->qi_gquotaip;
> >  	struct xfs_inode	*pip = mp->m_quotainfo->qi_pquotaip;
> >  
> > -	count = INT_MAX;
> > -	structsz = 1;
> > -	lastino = 0;
> >  	flags = 0;
> >  
> >  	ASSERT(uip || gip || pip);
> > @@ -1318,18 +1305,9 @@ xfs_qm_quotacheck(
> >  		flags |= XFS_PQUOTA_CHKD;
> >  	}
> >  
> > -	do {
> > -		/*
> > -		 * Iterate thru all the inodes in the file system,
> > -		 * adjusting the corresponding dquot counters in core.
> > -		 */
> > -		error = xfs_bulkstat(mp, &lastino, &count,
> > -				     xfs_qm_dqusage_adjust,
> > -				     structsz, NULL, &done);
> > -		if (error)
> > -			break;
> > -
> > -	} while (!done);
> > +	error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL);
> > +	if (error)
> > +		goto error_return;
> >  
> >  	/*
> >  	 * We've made all the changes that we need to make incore.  Flush them
> >
Dave Chinner June 11, 2019, 11:23 p.m. UTC | #3
On Mon, Jun 10, 2019 at 09:58:52AM -0400, Brian Foster wrote:
> On Tue, Jun 04, 2019 at 02:49:40PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Convert quotacheck to use the new iwalk iterator to dig through the
> > inodes.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > Reviewed-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/xfs_qm.c |   62 ++++++++++++++++++-------------------------------------
> >  1 file changed, 20 insertions(+), 42 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> > index aa6b6db3db0e..a5b2260406a8 100644
> > --- a/fs/xfs/xfs_qm.c
> > +++ b/fs/xfs/xfs_qm.c
> ...
> > @@ -1136,20 +1135,18 @@ xfs_qm_dqusage_adjust(
> >  	 * rootino must have its resources accounted for, not so with the quota
> >  	 * inodes.
> >  	 */
> > -	if (xfs_is_quota_inode(&mp->m_sb, ino)) {
> > -		*res = BULKSTAT_RV_NOTHING;
> > -		return -EINVAL;
> > -	}
> > +	if (xfs_is_quota_inode(&mp->m_sb, ino))
> > +		return 0;
> >  
> >  	/*
> >  	 * We don't _need_ to take the ilock EXCL here because quotacheck runs
> >  	 * at mount time and therefore nobody will be racing chown/chproj.
> >  	 */
> > -	error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip);
> > -	if (error) {
> > -		*res = BULKSTAT_RV_NOTHING;
> > +	error = xfs_iget(mp, tp, ino, XFS_IGET_DONTCACHE, 0, &ip);
> 
> I was wondering if we should start using IGET_UNTRUSTED here, but I
> guess we 1.) are protected by quotacheck context and 2.) have the same
> record validity semantics as the existing bulkstat walker. LGTM:

FWIW, I'd be wanting to go the other way with bulkstat, i.e. finding
ways of reducing IGET_UNTRUSTED in bulkstat, because it adds
substantial CPU overhead to inode lookup: it has to look up the inobt
to validate the inode number. I.e. we are locking the AGI and doing an
inobt lookup on every inode we bulkstat, because there is a window
between the initial inobt lookup and the xfs_iget() call in which the
inode chunk can get removed.

IOWs, we only need to validate that the inode buffer still contains
inodes before we start instantiating inodes from it, but because we
don't hold any locks across individual inode processing in bulkstat,
we have to revalidate that the buffer contains inodes for every
allocated inode in that buffer. If we had a way of passing a locked
cluster buffer into xfs_iget to avoid having to look it up and read
it, we could do a single inode cluster read after validating that the
inobt record is still valid, then cycle all the remaining inodes
through xfs_iget() without having to use IGET_UNTRUSTED to revalidate,
for every inode, that the cluster still contains valid inodes....

We still need to cycle inodes through the cache (so bulkstat is
coherent with other inode operations), but this would substantially
reduce the per-inode bulkstat CPU overhead, I think....

Cheers,

Dave.
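
A rough sketch of the shape Dave describes, assuming a hypothetical
xfs_iget_from_buf() that instantiates an inode backed by an
already-held cluster buffer, and a hypothetical xfs_read_icluster()
that reads the cluster for an inobt record (neither exists in the
tree; the inobt record fields and macros are real):

/*
 * Sketch only: process one inobt record's worth of inodes with a
 * single cluster read and no per-inode IGET_UNTRUSTED revalidation.
 */
STATIC int
xfs_bulkstat_ichunk_sketch(
	struct xfs_mount		*mp,
	xfs_agnumber_t			agno,
	struct xfs_inobt_rec_incore	*irec)
{
	struct xfs_buf		*bp;
	struct xfs_inode	*ip;
	xfs_ino_t		ino;
	int			i;
	int			error;

	/* One buffer lookup and read per cluster, not per inode. */
	error = xfs_read_icluster(mp, agno, irec, &bp);	/* hypothetical */
	if (error)
		return error;

	for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
		if (irec->ir_free & XFS_INOBT_MASK(i))
			continue;	/* skip freed inodes */

		/* The held cluster buffer vouches for this inode. */
		ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + i);
		error = xfs_iget_from_buf(mp, ino, bp, &ip); /* hypothetical */
		if (error)
			break;

		/* ... stat the inode here ... */
		xfs_irele(ip);
	}
	xfs_buf_relse(bp);
	return error;
}
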
Darrick J. Wong June 12, 2019, 12:32 a.m. UTC | #4
On Wed, Jun 12, 2019 at 09:23:47AM +1000, Dave Chinner wrote:
> On Mon, Jun 10, 2019 at 09:58:52AM -0400, Brian Foster wrote:
> > On Tue, Jun 04, 2019 at 02:49:40PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Convert quotacheck to use the new iwalk iterator to dig through the
> > > inodes.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > Reviewed-by: Dave Chinner <dchinner@redhat.com>
> > > ---
> > >  fs/xfs/xfs_qm.c |   62 ++++++++++++++++++-------------------------------------
> > >  1 file changed, 20 insertions(+), 42 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> > > index aa6b6db3db0e..a5b2260406a8 100644
> > > --- a/fs/xfs/xfs_qm.c
> > > +++ b/fs/xfs/xfs_qm.c
> > ...
> > > @@ -1136,20 +1135,18 @@ xfs_qm_dqusage_adjust(
> > >  	 * rootino must have its resources accounted for, not so with the quota
> > >  	 * inodes.
> > >  	 */
> > > -	if (xfs_is_quota_inode(&mp->m_sb, ino)) {
> > > -		*res = BULKSTAT_RV_NOTHING;
> > > -		return -EINVAL;
> > > -	}
> > > +	if (xfs_is_quota_inode(&mp->m_sb, ino))
> > > +		return 0;
> > >  
> > >  	/*
> > >  	 * We don't _need_ to take the ilock EXCL here because quotacheck runs
> > >  	 * at mount time and therefore nobody will be racing chown/chproj.
> > >  	 */
> > > -	error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip);
> > > -	if (error) {
> > > -		*res = BULKSTAT_RV_NOTHING;
> > > +	error = xfs_iget(mp, tp, ino, XFS_IGET_DONTCACHE, 0, &ip);
> > 
> > I was wondering if we should start using IGET_UNTRUSTED here, but I
> > guess we 1.) are protected by quotacheck context and 2.) have the same
> > record validity semantics as the existing bulkstat walker. LGTM:
> 
> FWIW, I'd be wanting to go the other way with bulkstat, i.e. finding
> ways of reducing IGET_UNTRUSTED in bulkstat, because it adds
> substantial CPU overhead to inode lookup: it has to look up the inobt
> to validate the inode number. I.e. we are locking the AGI and doing an
> inobt lookup on every inode we bulkstat, because there is a window
> between the initial inobt lookup and the xfs_iget() call in which the
> inode chunk can get removed.
> 
> IOWs, we only need to validate that the inode buffer still contains
> inodes before we start instantiating inodes from it, but because we
> don't hold any locks across individual inode processing in bulkstat,
> we have to revalidate that the buffer contains inodes for every
> allocated inode in that buffer. If we had a way of passing a locked
> cluster buffer into xfs_iget to avoid having to look it up and read
> it, we could do a single inode cluster read after validating that the
> inobt record is still valid, then cycle all the remaining inodes
> through xfs_iget() without having to use IGET_UNTRUSTED to revalidate,
> for every inode, that the cluster still contains valid inodes....
> 
> We still need to cycle inodes through the cache (so bulkstat is
> coherent with other inode operations), but this would substantially
> reduce the per-inode bulkstat CPU overhead, I think....

I'll think about this as an addendum to the series, because I suspect
that remodelling the existing users is going to be an entire series on
its own.  (IOWs, my brain is too tired for today)

--D

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Brian Foster June 12, 2019, 12:55 p.m. UTC | #5
On Tue, Jun 11, 2019 at 05:32:19PM -0700, Darrick J. Wong wrote:
> On Wed, Jun 12, 2019 at 09:23:47AM +1000, Dave Chinner wrote:
> > On Mon, Jun 10, 2019 at 09:58:52AM -0400, Brian Foster wrote:
> > > On Tue, Jun 04, 2019 at 02:49:40PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Convert quotacheck to use the new iwalk iterator to dig through the
> > > > inodes.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > Reviewed-by: Dave Chinner <dchinner@redhat.com>
> > > > ---
> > > >  fs/xfs/xfs_qm.c |   62 ++++++++++++++++++-------------------------------------
> > > >  1 file changed, 20 insertions(+), 42 deletions(-)
> > > > 
> > > > 
> > > > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> > > > index aa6b6db3db0e..a5b2260406a8 100644
> > > > --- a/fs/xfs/xfs_qm.c
> > > > +++ b/fs/xfs/xfs_qm.c
> > > ...
> > > > @@ -1136,20 +1135,18 @@ xfs_qm_dqusage_adjust(
> > > >  	 * rootino must have its resources accounted for, not so with the quota
> > > >  	 * inodes.
> > > >  	 */
> > > > -	if (xfs_is_quota_inode(&mp->m_sb, ino)) {
> > > > -		*res = BULKSTAT_RV_NOTHING;
> > > > -		return -EINVAL;
> > > > -	}
> > > > +	if (xfs_is_quota_inode(&mp->m_sb, ino))
> > > > +		return 0;
> > > >  
> > > >  	/*
> > > >  	 * We don't _need_ to take the ilock EXCL here because quotacheck runs
> > > >  	 * at mount time and therefore nobody will be racing chown/chproj.
> > > >  	 */
> > > > -	error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip);
> > > > -	if (error) {
> > > > -		*res = BULKSTAT_RV_NOTHING;
> > > > +	error = xfs_iget(mp, tp, ino, XFS_IGET_DONTCACHE, 0, &ip);
> > > 
> > > I was wondering if we should start using IGET_UNTRUSTED here, but I
> > > guess we 1.) are protected by quotacheck context and 2.) have the same
> > > record validity semantics as the existing bulkstat walker. LGTM:
> > 
> > FWIW, I'd be wanting to go the other way with bulkstat, i.e. finding
> > ways of reducing IGET_UNTRUSTED in bulkstat, because it adds
> > substantial CPU overhead to inode lookup: it has to look up the inobt
> > to validate the inode number. I.e. we are locking the AGI and doing an
> > inobt lookup on every inode we bulkstat, because there is a window
> > between the initial inobt lookup and the xfs_iget() call in which the
> > inode chunk can get removed.
> > 

Right.. note that I was just surmising whether IGET_UNTRUSTED was
required for correctness here due to the semantics of the iwalk.

> > IOWs, we only need to validate that the inode buffer still contains
> > inodes before we start instantiating inodes from it, but because we
> > don't hold any locks across individual inode processing in bulkstat,
> > we have to revalidate that the buffer contains inodes for every
> > allocated inode in that buffer. If we had a way of passing a locked
> > cluster buffer into xfs_iget to avoid having to look it up and read
> > it, we could do a single inode cluster read after validating that the
> > inobt record is still valid, then cycle all the remaining inodes
> > through xfs_iget() without having to use IGET_UNTRUSTED to revalidate,
> > for every inode, that the cluster still contains valid inodes....
> > 

Yep, that sounds like a nice idea to me. Perhaps we could cache the
current cluster buffer somewhere in the iteration structure and use that
to validate incoming inode numbers until we move out of the cluster.

> > We still need to cycle inodes through the cache (so bulkstat is
> > coherent with other inode operations), but this would substantially
> > reduce the per-inode bulkstat CPU overhead, I think....
> 

Since we're already discussing tweaks to readahead, another approach to
this problem could be to try and readahead all the way into the inode
cache. For example, consider a mechanism where a cluster buffer
readahead sets a flag on the buffer that effectively triggers an iget of
each allocated inode in the buffer. Darrick has already shown that the
inode memory allocation and iget itself has considerable overhead even
when the cluster buffer is already cached. We know that's not due to
btree lookups because quotacheck isn't using IGET_UNTRUSTED, so perhaps
we could amortize more of this cost via readahead.

The caveat is that this would probably be more involved than something
that just caches the current cluster buffer and passes it into the iget
path. We'd have to reconcile readahead-populated in-core inodes with
the DONTCACHE inodes used by bulkstat, for example, though I don't
think that would be too difficult to address via a new inode readahead
flag or some such to preserve existing DONTCACHE behavior.

It's also likely that passing the buffer into iget would already address
most of the overhead associated with the buffer lookup, so there might
not be enough tangible benefit at that point. The positive is that it's
probably an incremental step on top of an "iget from an existing cluster
buffer" mechanism and so could be easily prototyped by hacking in a read
side b_iodone handler or something.

Anyways, just thinking out loud a bit..

> I'll think about this as an addendum to the series, because I suspect
> that remodelling the existing users is going to be an entire series on
> its own.  (IOWs, my brain is too tired for today)
> 

Heh, definitely something for a separate patch (series). :)

Brian

> --D
> 
> > Cheers,
> > 
> > Dave.
> > -- 
> > Dave Chinner
> > david@fromorbit.com
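
The cluster buffer caching idea Brian floats above could be sketched
roughly as follows; the cache struct and xfs_iget_from_buf() are
hypothetical, while xfs_imap() and xfs_trans_read_buf() are existing
helpers. The caller zeroes the struct before the walk and releases any
cached buffer when the walk ends:

/*
 * Sketch only: keep the most recently used inode cluster buffer in
 * the walk state so consecutive inodes in the same cluster reuse it
 * instead of looking it up (and revalidating it) again.
 */
struct xfs_iwalk_cluster_cache {
	struct xfs_buf	*bp;		/* cached cluster buffer */
	xfs_daddr_t	daddr;		/* its disk address */
};

STATIC int
xfs_iwalk_iget_cached(
	struct xfs_mount		*mp,
	struct xfs_iwalk_cluster_cache	*cc,
	xfs_ino_t			ino,
	struct xfs_inode		**ipp)
{
	struct xfs_imap		imap;
	int			error;

	/* Find the on-disk cluster holding this inode. */
	error = xfs_imap(mp, NULL, ino, &imap, 0);
	if (error)
		return error;

	/* Crossed into a new cluster?  Swap the cached buffer. */
	if (!cc->bp || imap.im_blkno != cc->daddr) {
		if (cc->bp)
			xfs_buf_relse(cc->bp);
		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
				imap.im_blkno, imap.im_len, 0,
				&cc->bp, &xfs_inode_buf_ops);
		if (error)
			return error;
		cc->daddr = imap.im_blkno;
	}

	/* Trusted lookup backed by the buffer we hold (hypothetical). */
	return xfs_iget_from_buf(mp, ino, cc->bp, ipp);
}
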
Dave Chinner June 12, 2019, 11:33 p.m. UTC | #6
On Wed, Jun 12, 2019 at 08:55:06AM -0400, Brian Foster wrote:
> On Tue, Jun 11, 2019 at 05:32:19PM -0700, Darrick J. Wong wrote:
> > On Wed, Jun 12, 2019 at 09:23:47AM +1000, Dave Chinner wrote:
> Since we're already discussing tweaks to readahead, another approach to
> this problem could be to try and readahead all the way into the inode
> cache. For example, consider a mechanism where a cluster buffer
> readahead sets a flag on the buffer that effectively triggers an iget of
> each allocated inode in the buffer. Darrick has already shown that the
> inode memory allocation and iget itself has considerable overhead even
> when the cluster buffer is already cached. We know that's not due to
> btree lookups because quotacheck isn't using IGET_UNTRUSTED, so perhaps
> we could amortize more of this cost via readahead.

The DONTCACHE inode caching semantics of bulkstat tend to conflict
with "readahead all the way to the inode cache".

> The caveat is that this would probably be more involved than something
> that just caches the current cluster buffer and passes it into the iget
> path. We'd have to reconcile readahead-populated in-core inodes with
> the DONTCACHE inodes used by bulkstat, for example, though I don't
> think that would be too difficult to address via a new inode readahead
> flag or some such to preserve existing DONTCACHE behavior.

I did try that once; the cache thrashing was .... difficult to
contain under memory pressure. bulkstat pushes hundreds of thousands
of inodes a second through the inode cache, and under memory
pressure it will still cause working set perturbation with DONTCACHE
being set. Holding DONTCACHE inodes for some time in the cache kinda
defeats the simple purpose it has, and relying on cache hits to
convert "readahead" to "dont cache" becomes really nasty when we
try to use inode readahead for other things (like speeding up
directory traversals by having xfs_readdir() issue inode readahead).

The largest delay in bulkstat is the inode cluster IO latency.
Getting rid of that is where the biggest win is (hence cluster
read-ahead). The second largest overhead is the CPU burnt doing
inode lookups, and on filesystems with lots of inodes, a significant
amount of that is in the IGET_UNTRUSTED inobt lookup. IOWs, avoiding
IGET_UNTRUSTED is relatively low-hanging fruit.

The next limitation for bulkstat is the superblock inode list lock
contention. Getting rid of the IGET_UNTRUSTED overhead is likely to
push the lock contention into the severe range (the lock is already
the largest CPU consumer at 16 threads bulkstating 600,000 inodes/s
on a 16p machine) so until we get rid of that lock contention, there
isn't much point in doing major rework to the bulkstat algorithm as
it doesn't address the limitations that the current algorithm has.

> It's also likely that passing the buffer into iget would already address
> most of the overhead associated with the buffer lookup, so there might
> not be enough tangible benefit at that point. The positive is that it's
> probably an incremental step on top of an "iget from an existing cluster
> buffer" mechanism and so could be easily prototyped by hacking in a read
> side b_iodone handler or something.

We don't want to put inode cache insertion into an IO completion
routine. Tried it, caused horrible problems with metadata read IO
latency and substantially increased inode cache lock contention by
bouncing the radix trees around both submission and completion CPU
contexts...

/me has spent many, many years trying lots of different ways to make
the inode cache in XFS go faster and has failed most of the time....

Cheers,

Dave.
Brian Foster June 13, 2019, 6:34 p.m. UTC | #7
On Thu, Jun 13, 2019 at 09:33:02AM +1000, Dave Chinner wrote:
> On Wed, Jun 12, 2019 at 08:55:06AM -0400, Brian Foster wrote:
> > On Tue, Jun 11, 2019 at 05:32:19PM -0700, Darrick J. Wong wrote:
> > > On Wed, Jun 12, 2019 at 09:23:47AM +1000, Dave Chinner wrote:
> > Since we're already discussing tweaks to readahead, another approach to
> > this problem could be to try and readahead all the way into the inode
> > cache. For example, consider a mechanism where a cluster buffer
> > readahead sets a flag on the buffer that effectively triggers an iget of
> > each allocated inode in the buffer. Darrick has already shown that the
> > inode memory allocation and iget itself has considerable overhead even
> > when the cluster buffer is already cached. We know that's not due to
> > btree lookups because quotacheck isn't using IGET_UNTRUSTED, so perhaps
> > we could amortize more of this cost via readahead.
> 
> The DONTCACHE inode caching semantics of bulkstat tend to conflict
> with "readahead all the way to the inode cache".
> 

Yep..

> > The caveat is that this would probably be more involved than something
> > that just caches the current cluster buffer and passes it into the iget
> > path. We'd have to reconcile readahead-populated in-core inodes with
> > the DONTCACHE inodes used by bulkstat, for example, though I don't
> > think that would be too difficult to address via a new inode readahead
> > flag or some such to preserve existing DONTCACHE behavior.
> 
> I did try that once; the cache thrashing was .... difficult to
> contain under memory pressure. bulkstat pushes hundreds of thousands
> of inodes a second through the inode cache, and under memory
> pressure it will still cause working set perturbation with DONTCACHE
> being set. Holding DONTCACHE inodes for some time in the cache kinda
> defeats the simple purpose it has, and relying on cache hits to
> convert "readahead" to "dont cache" becomes really nasty when we
> try to use inode readahead for other things (like speeding up
> directory traversals by having xfs_readdir() issue inode readahead).
> 

Yeah, though you're taking things a bit further than I was originally
thinking by applying this to bulkstat in general. I conflated bulkstat
with quotacheck above (because the latter uses the former), but I
probably should have just referred to quotacheck since the bulkstat
callback is where we actually grab the inode anyways.

The experiment I was thinking about was really just intended for
quotacheck because it's already running in a fairly isolated context.
There's no other working set, no contention to worry about with other
tasks, etc., so it's a simple environment to evaluate this kind of
(hacky) experiment. FWIW the motivation is related, since when qc is
required the user basically has to sit there and wait for it to complete
before the fs is usable (as opposed to bulkstat, which I just chalk up
to being a long running fs operation requested by the user).

Of course if there's simply no benefit in the quotacheck context, then
there's clearly not much reason to consider similar behavior for
bulkstat in general. But even if there was a benefit to qc, I agree with
your point that things get a whole lot more complex when we have to
consider working set and workload of an operational fs. Either way, I'm
still curious if it helps in qc context. :)

> The largest delay in bulkstat is the inode cluster IO latency.
> Getting rid of that is where the biggest win is (hence cluster
> read-ahead). The second largest overhead is the CPU burnt doing
> inode lookups, and on filesystems with lots of inodes, a significant
> amount of that is in the IGET_UNTRUSTED inobt lookup. IOWs, avoiding
> IGET_UNTRUSTED is relatively low-hanging fruit.
> 

Agreed, but this again is more applicable to general bulkstat. Have you
looked at Darrick's quotacheck flame graph? It shows the majority of
quotacheck overhead in the inode memory allocation and grabbing the
(presumably already in-core, though that is not visible in the graphic)
cluster buffer. IGET_UNTRUSTED is not a factor here because the special
quotacheck context allows us to elide it.

BTW, a related point here is that perhaps anything that speeds up
xfs_buf_find() might also speed this up (hmm.. _less_ read-ahead
perhaps?) without resorting to special case inode preallocation hacks.
It's probably worth collecting more detailed perf data on that qc
codepath before getting too far into the weeds...

> The next limitation for bulkstat is the superblock inode list lock
> contention. Getting rid of the IGET_UNTRUSTED overhead is likely to
> push the lock contention into the severe range (the lock is already
> the largest CPU consumer at 16 threads bulkstating 600,000 inodes/s
> on a 16p machine) so until we get rid of that lock contention, there
> isn't much point in doing major rework to the bulkstat algorithm as
> it doesn't address the limitations that the current algorithm has.
> 
> > It's also likely that passing the buffer into iget would already address
> > most of the overhead associated with the buffer lookup, so there might
> > not be enough tangible benefit at that point. The positive is that it's
> > probably an incremental step on top of an "iget from an existing cluster
> > buffer" mechanism and so could be easily prototyped by hacking in a read
> > side b_iodone handler or something.
> 
> We don't want to put inode cache insertion into an IO completion
> routine. Tried it, caused horrible problems with metadata read IO
> latency and substantially increased inode cache lock contention by
> bouncing the radix trees around both submission and completion CPU
> contexts...
> 

Agreed, that certainly makes sense. It would be kind of crazy to have
single inode lookups waiting on inode cache population of entire cluster
buffers worth of inodes.

> /me has spent many, many years trying lots of different ways to make
> the inode cache in XFS go faster and has failed most of the time....
> 

Heh. :)

Brian

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

Patch

diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index aa6b6db3db0e..a5b2260406a8 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -26,6 +26,7 @@ 
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_cksum.h"
+#include "xfs_iwalk.h"
 
 /*
  * The global quota manager. There is only one of these for the entire
@@ -1118,17 +1119,15 @@  xfs_qm_quotacheck_dqadjust(
 /* ARGSUSED */
 STATIC int
 xfs_qm_dqusage_adjust(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	void		__user *buffer,	/* not used */
-	int		ubsize,		/* not used */
-	int		*ubused,	/* not used */
-	int		*res)		/* result code value */
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_ino_t		ino,
+	void			*data)
 {
-	xfs_inode_t	*ip;
-	xfs_qcnt_t	nblks;
-	xfs_filblks_t	rtblks = 0;	/* total rt blks */
-	int		error;
+	struct xfs_inode	*ip;
+	xfs_qcnt_t		nblks;
+	xfs_filblks_t		rtblks = 0;	/* total rt blks */
+	int			error;
 
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
@@ -1136,20 +1135,18 @@  xfs_qm_dqusage_adjust(
 	 * rootino must have its resources accounted for, not so with the quota
 	 * inodes.
 	 */
-	if (xfs_is_quota_inode(&mp->m_sb, ino)) {
-		*res = BULKSTAT_RV_NOTHING;
-		return -EINVAL;
-	}
+	if (xfs_is_quota_inode(&mp->m_sb, ino))
+		return 0;
 
 	/*
 	 * We don't _need_ to take the ilock EXCL here because quotacheck runs
 	 * at mount time and therefore nobody will be racing chown/chproj.
 	 */
-	error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip);
-	if (error) {
-		*res = BULKSTAT_RV_NOTHING;
+	error = xfs_iget(mp, tp, ino, XFS_IGET_DONTCACHE, 0, &ip);
+	if (error == -EINVAL || error == -ENOENT)
+		return 0;
+	if (error)
 		return error;
-	}
 
 	ASSERT(ip->i_delayed_blks == 0);
 
@@ -1157,7 +1154,7 @@  xfs_qm_dqusage_adjust(
 		struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 
 		if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-			error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+			error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
 			if (error)
 				goto error0;
 		}
@@ -1200,13 +1197,8 @@  xfs_qm_dqusage_adjust(
 			goto error0;
 	}
 
-	xfs_irele(ip);
-	*res = BULKSTAT_RV_DIDONE;
-	return 0;
-
 error0:
 	xfs_irele(ip);
-	*res = BULKSTAT_RV_GIVEUP;
 	return error;
 }
 
@@ -1270,18 +1262,13 @@  STATIC int
 xfs_qm_quotacheck(
 	xfs_mount_t	*mp)
 {
-	int			done, count, error, error2;
-	xfs_ino_t		lastino;
-	size_t			structsz;
+	int			error, error2;
 	uint			flags;
 	LIST_HEAD		(buffer_list);
 	struct xfs_inode	*uip = mp->m_quotainfo->qi_uquotaip;
 	struct xfs_inode	*gip = mp->m_quotainfo->qi_gquotaip;
 	struct xfs_inode	*pip = mp->m_quotainfo->qi_pquotaip;
 
-	count = INT_MAX;
-	structsz = 1;
-	lastino = 0;
 	flags = 0;
 
 	ASSERT(uip || gip || pip);
@@ -1318,18 +1305,9 @@  xfs_qm_quotacheck(
 		flags |= XFS_PQUOTA_CHKD;
 	}
 
-	do {
-		/*
-		 * Iterate thru all the inodes in the file system,
-		 * adjusting the corresponding dquot counters in core.
-		 */
-		error = xfs_bulkstat(mp, &lastino, &count,
-				     xfs_qm_dqusage_adjust,
-				     structsz, NULL, &done);
-		if (error)
-			break;
-
-	} while (!done);
+	error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL);
+	if (error)
+		goto error_return;
 
 	/*
 	 * We've made all the changes that we need to make incore.  Flush them