diff mbox series

[21/26] xfs: make the RT allocator rtgroup aware

Message ID 172437088886.60592.11418423460788700576.stgit@frogsfrogsfrogs (mailing list archive)
State New
Headers show
Series [01/26] xfs: define the format of rt groups | expand

Commit Message

Darrick J. Wong Aug. 23, 2024, 12:26 a.m. UTC
From: Christoph Hellwig <hch@lst.de>

Make the allocator rtgroup aware by either picking a specific group if
there is a hint, or loop over all groups otherwise.  A simple rotor is
provided to pick the placement for initial allocations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.c     |   13 +++++-
 fs/xfs/libxfs/xfs_rtbitmap.c |    6 ++-
 fs/xfs/xfs_mount.h           |    1 
 fs/xfs/xfs_rtalloc.c         |   98 ++++++++++++++++++++++++++++++++++++++----
 4 files changed, 105 insertions(+), 13 deletions(-)

Comments

Dave Chinner Aug. 26, 2024, 4:56 a.m. UTC | #1
On Thu, Aug 22, 2024 at 05:26:38PM -0700, Darrick J. Wong wrote:
> From: Christoph Hellwig <hch@lst.de>
> 
> Make the allocator rtgroup aware by either picking a specific group if
> there is a hint, or loop over all groups otherwise.  A simple rotor is
> provided to pick the placement for initial allocations.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  fs/xfs/libxfs/xfs_bmap.c     |   13 +++++-
>  fs/xfs/libxfs/xfs_rtbitmap.c |    6 ++-
>  fs/xfs/xfs_mount.h           |    1 
>  fs/xfs/xfs_rtalloc.c         |   98 ++++++++++++++++++++++++++++++++++++++----
>  4 files changed, 105 insertions(+), 13 deletions(-)
> 
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 126a0d253654a..88c62e1158ac7 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -3151,8 +3151,17 @@ xfs_bmap_adjacent_valid(
>  	struct xfs_mount	*mp = ap->ip->i_mount;
>  
>  	if (XFS_IS_REALTIME_INODE(ap->ip) &&
> -	    (ap->datatype & XFS_ALLOC_USERDATA))
> -		return x < mp->m_sb.sb_rblocks;
> +	    (ap->datatype & XFS_ALLOC_USERDATA)) {
> +		if (x >= mp->m_sb.sb_rblocks)
> +			return false;
> +		if (!xfs_has_rtgroups(mp))
> +			return true;
> +
> +		return xfs_rtb_to_rgno(mp, x) == xfs_rtb_to_rgno(mp, y) &&
> +			xfs_rtb_to_rgno(mp, x) < mp->m_sb.sb_rgcount &&
> +			xfs_rtb_to_rtx(mp, x) < mp->m_sb.sb_rgextents;

WHy do we need the xfs_has_rtgroups() check here? The new rtg logic will
return true for an old school rt device here, right?

> diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
> index 3fedc552b51b0..2b57ff2687bf6 100644
> --- a/fs/xfs/xfs_rtalloc.c
> +++ b/fs/xfs/xfs_rtalloc.c
> @@ -1661,8 +1661,9 @@ xfs_rtalloc_align_minmax(
>  }
>  
>  static int
> -xfs_rtallocate(
> +xfs_rtallocate_rtg(
>  	struct xfs_trans	*tp,
> +	xfs_rgnumber_t		rgno,
>  	xfs_rtblock_t		bno_hint,
>  	xfs_rtxlen_t		minlen,
>  	xfs_rtxlen_t		maxlen,
> @@ -1682,16 +1683,33 @@ xfs_rtallocate(
>  	xfs_rtxlen_t		len = 0;
>  	int			error = 0;
>  
> -	args.rtg = xfs_rtgroup_grab(args.mp, 0);
> +	args.rtg = xfs_rtgroup_grab(args.mp, rgno);
>  	if (!args.rtg)
>  		return -ENOSPC;
>  
>  	/*
> -	 * Lock out modifications to both the RT bitmap and summary inodes.
> +	 * We need to lock out modifications to both the RT bitmap and summary
> +	 * inodes for finding free space in xfs_rtallocate_extent_{near,size}
> +	 * and join the bitmap and summary inodes for the actual allocation
> +	 * down in xfs_rtallocate_range.
> +	 *
> +	 * For RTG-enabled file system we don't want to join the inodes to the
> +	 * transaction until we are committed to allocate to allocate from this
> +	 * RTG so that only one inode of each type is locked at a time.
> +	 *
> +	 * But for pre-RTG file systems we need to already to join the bitmap
> +	 * inode to the transaction for xfs_rtpick_extent, which bumps the
> +	 * sequence number in it, so we'll have to join the inode to the
> +	 * transaction early here.
> +	 *
> +	 * This is all a bit messy, but at least the mess is contained in
> +	 * this function.
>  	 */
>  	if (!*rtlocked) {
>  		xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP);
> -		xfs_rtgroup_trans_join(tp, args.rtg, XFS_RTGLOCK_BITMAP);
> +		if (!xfs_has_rtgroups(args.mp))
> +			xfs_rtgroup_trans_join(tp, args.rtg,
> +					XFS_RTGLOCK_BITMAP);
>  		*rtlocked = true;
>  	}
>  
> @@ -1701,7 +1719,7 @@ xfs_rtallocate(
>  	 */
>  	if (bno_hint)
>  		start = xfs_rtb_to_rtx(args.mp, bno_hint);
> -	else if (initial_user_data)
> +	else if (!xfs_has_rtgroups(args.mp) && initial_user_data)
>  		start = xfs_rtpick_extent(args.rtg, tp, maxlen);

Check initial_user_data first - we don't care if there are rtgroups
enabled if initial_user_data is not true, and we only ever allocate
initial data on an inode once...

> @@ -1741,6 +1767,53 @@ xfs_rtallocate(
>  	return error;
>  }
>  
> +static int
> +xfs_rtallocate_rtgs(
> +	struct xfs_trans	*tp,
> +	xfs_fsblock_t		bno_hint,
> +	xfs_rtxlen_t		minlen,
> +	xfs_rtxlen_t		maxlen,
> +	xfs_rtxlen_t		prod,
> +	bool			wasdel,
> +	bool			initial_user_data,
> +	xfs_rtblock_t		*bno,
> +	xfs_extlen_t		*blen)
> +{
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	xfs_rgnumber_t		start_rgno, rgno;
> +	int			error;
> +
> +	/*
> +	 * For now this just blindly iterates over the RTGs for an initial
> +	 * allocation.  We could try to keep an in-memory rtg_longest member
> +	 * to avoid the locking when just looking for big enough free space,
> +	 * but for now this keep things simple.
> +	 */
> +	if (bno_hint != NULLFSBLOCK)
> +		start_rgno = xfs_rtb_to_rgno(mp, bno_hint);
> +	else
> +		start_rgno = (atomic_inc_return(&mp->m_rtgrotor) - 1) %
> +				mp->m_sb.sb_rgcount;
> +
> +	rgno = start_rgno;
> +	do {
> +		bool		rtlocked = false;
> +
> +		error = xfs_rtallocate_rtg(tp, rgno, bno_hint, minlen, maxlen,
> +				prod, wasdel, initial_user_data, &rtlocked,
> +				bno, blen);
> +		if (error != -ENOSPC)
> +			return error;
> +		ASSERT(!rtlocked);
> +
> +		if (++rgno == mp->m_sb.sb_rgcount)
> +			rgno = 0;
> +		bno_hint = NULLFSBLOCK;
> +	} while (rgno != start_rgno);
> +
> +	return -ENOSPC;
> +}
> +
>  static int
>  xfs_rtallocate_align(
>  	struct xfs_bmalloca	*ap,
> @@ -1835,9 +1908,16 @@ xfs_bmap_rtalloc(
>  	if (xfs_bmap_adjacent(ap))
>  		bno_hint = ap->blkno;
>  
> -	error = xfs_rtallocate(ap->tp, bno_hint, raminlen, ralen, prod,
> -			ap->wasdel, initial_user_data, &rtlocked,
> -			&ap->blkno, &ap->length);
> +	if (xfs_has_rtgroups(ap->ip->i_mount)) {
> +		error = xfs_rtallocate_rtgs(ap->tp, bno_hint, raminlen, ralen,
> +				prod, ap->wasdel, initial_user_data,
> +				&ap->blkno, &ap->length);
> +	} else {
> +		error = xfs_rtallocate_rtg(ap->tp, 0, bno_hint, raminlen, ralen,
> +				prod, ap->wasdel, initial_user_data,
> +				&rtlocked, &ap->blkno, &ap->length);
> +	}

The xfs_has_rtgroups() check is unnecessary.  The iterator in
xfs_rtallocate_rtgs() will do the right thing for the
!xfs_has_rtgroups() case - it'll set start_rgno = 0 and break out
after a single call to xfs_rtallocate_rtg() with rgno = 0.

Another thing that probably should be done here is push all the
constant value calculations a couple of functions down the stack to
where they are used. Then we only need to pass two parameters down
through the rg iterator here, not 11...

-Dave.
Darrick J. Wong Aug. 26, 2024, 7:40 p.m. UTC | #2
On Mon, Aug 26, 2024 at 02:56:37PM +1000, Dave Chinner wrote:
> On Thu, Aug 22, 2024 at 05:26:38PM -0700, Darrick J. Wong wrote:
> > From: Christoph Hellwig <hch@lst.de>
> > 
> > Make the allocator rtgroup aware by either picking a specific group if
> > there is a hint, or loop over all groups otherwise.  A simple rotor is
> > provided to pick the placement for initial allocations.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > ---
> >  fs/xfs/libxfs/xfs_bmap.c     |   13 +++++-
> >  fs/xfs/libxfs/xfs_rtbitmap.c |    6 ++-
> >  fs/xfs/xfs_mount.h           |    1 
> >  fs/xfs/xfs_rtalloc.c         |   98 ++++++++++++++++++++++++++++++++++++++----
> >  4 files changed, 105 insertions(+), 13 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> > index 126a0d253654a..88c62e1158ac7 100644
> > --- a/fs/xfs/libxfs/xfs_bmap.c
> > +++ b/fs/xfs/libxfs/xfs_bmap.c
> > @@ -3151,8 +3151,17 @@ xfs_bmap_adjacent_valid(
> >  	struct xfs_mount	*mp = ap->ip->i_mount;
> >  
> >  	if (XFS_IS_REALTIME_INODE(ap->ip) &&
> > -	    (ap->datatype & XFS_ALLOC_USERDATA))
> > -		return x < mp->m_sb.sb_rblocks;
> > +	    (ap->datatype & XFS_ALLOC_USERDATA)) {
> > +		if (x >= mp->m_sb.sb_rblocks)
> > +			return false;
> > +		if (!xfs_has_rtgroups(mp))
> > +			return true;
> > +
> > +		return xfs_rtb_to_rgno(mp, x) == xfs_rtb_to_rgno(mp, y) &&
> > +			xfs_rtb_to_rgno(mp, x) < mp->m_sb.sb_rgcount &&
> > +			xfs_rtb_to_rtx(mp, x) < mp->m_sb.sb_rgextents;
> 
> WHy do we need the xfs_has_rtgroups() check here? The new rtg logic will
> return true for an old school rt device here, right?

The incore sb_rgextents is zero on !rtg filesystems, so we need the
xfs_has_rtgroups.

> > diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
> > index 3fedc552b51b0..2b57ff2687bf6 100644
> > --- a/fs/xfs/xfs_rtalloc.c
> > +++ b/fs/xfs/xfs_rtalloc.c
> > @@ -1661,8 +1661,9 @@ xfs_rtalloc_align_minmax(
> >  }
> >  
> >  static int
> > -xfs_rtallocate(
> > +xfs_rtallocate_rtg(
> >  	struct xfs_trans	*tp,
> > +	xfs_rgnumber_t		rgno,
> >  	xfs_rtblock_t		bno_hint,
> >  	xfs_rtxlen_t		minlen,
> >  	xfs_rtxlen_t		maxlen,
> > @@ -1682,16 +1683,33 @@ xfs_rtallocate(
> >  	xfs_rtxlen_t		len = 0;
> >  	int			error = 0;
> >  
> > -	args.rtg = xfs_rtgroup_grab(args.mp, 0);
> > +	args.rtg = xfs_rtgroup_grab(args.mp, rgno);
> >  	if (!args.rtg)
> >  		return -ENOSPC;
> >  
> >  	/*
> > -	 * Lock out modifications to both the RT bitmap and summary inodes.
> > +	 * We need to lock out modifications to both the RT bitmap and summary
> > +	 * inodes for finding free space in xfs_rtallocate_extent_{near,size}
> > +	 * and join the bitmap and summary inodes for the actual allocation
> > +	 * down in xfs_rtallocate_range.
> > +	 *
> > +	 * For RTG-enabled file system we don't want to join the inodes to the
> > +	 * transaction until we are committed to allocate to allocate from this
> > +	 * RTG so that only one inode of each type is locked at a time.
> > +	 *
> > +	 * But for pre-RTG file systems we need to already to join the bitmap
> > +	 * inode to the transaction for xfs_rtpick_extent, which bumps the
> > +	 * sequence number in it, so we'll have to join the inode to the
> > +	 * transaction early here.
> > +	 *
> > +	 * This is all a bit messy, but at least the mess is contained in
> > +	 * this function.
> >  	 */
> >  	if (!*rtlocked) {
> >  		xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP);
> > -		xfs_rtgroup_trans_join(tp, args.rtg, XFS_RTGLOCK_BITMAP);
> > +		if (!xfs_has_rtgroups(args.mp))
> > +			xfs_rtgroup_trans_join(tp, args.rtg,
> > +					XFS_RTGLOCK_BITMAP);
> >  		*rtlocked = true;
> >  	}
> >  
> > @@ -1701,7 +1719,7 @@ xfs_rtallocate(
> >  	 */
> >  	if (bno_hint)
> >  		start = xfs_rtb_to_rtx(args.mp, bno_hint);
> > -	else if (initial_user_data)
> > +	else if (!xfs_has_rtgroups(args.mp) && initial_user_data)
> >  		start = xfs_rtpick_extent(args.rtg, tp, maxlen);
> 
> Check initial_user_data first - we don't care if there are rtgroups
> enabled if initial_user_data is not true, and we only ever allocate
> initial data on an inode once...

<nod>

> > @@ -1741,6 +1767,53 @@ xfs_rtallocate(
> >  	return error;
> >  }
> >  
> > +static int
> > +xfs_rtallocate_rtgs(
> > +	struct xfs_trans	*tp,
> > +	xfs_fsblock_t		bno_hint,
> > +	xfs_rtxlen_t		minlen,
> > +	xfs_rtxlen_t		maxlen,
> > +	xfs_rtxlen_t		prod,
> > +	bool			wasdel,
> > +	bool			initial_user_data,
> > +	xfs_rtblock_t		*bno,
> > +	xfs_extlen_t		*blen)
> > +{
> > +	struct xfs_mount	*mp = tp->t_mountp;
> > +	xfs_rgnumber_t		start_rgno, rgno;
> > +	int			error;
> > +
> > +	/*
> > +	 * For now this just blindly iterates over the RTGs for an initial
> > +	 * allocation.  We could try to keep an in-memory rtg_longest member
> > +	 * to avoid the locking when just looking for big enough free space,
> > +	 * but for now this keep things simple.
> > +	 */
> > +	if (bno_hint != NULLFSBLOCK)
> > +		start_rgno = xfs_rtb_to_rgno(mp, bno_hint);
> > +	else
> > +		start_rgno = (atomic_inc_return(&mp->m_rtgrotor) - 1) %
> > +				mp->m_sb.sb_rgcount;
> > +
> > +	rgno = start_rgno;
> > +	do {
> > +		bool		rtlocked = false;
> > +
> > +		error = xfs_rtallocate_rtg(tp, rgno, bno_hint, minlen, maxlen,
> > +				prod, wasdel, initial_user_data, &rtlocked,
> > +				bno, blen);
> > +		if (error != -ENOSPC)
> > +			return error;
> > +		ASSERT(!rtlocked);
> > +
> > +		if (++rgno == mp->m_sb.sb_rgcount)
> > +			rgno = 0;
> > +		bno_hint = NULLFSBLOCK;
> > +	} while (rgno != start_rgno);
> > +
> > +	return -ENOSPC;
> > +}
> > +
> >  static int
> >  xfs_rtallocate_align(
> >  	struct xfs_bmalloca	*ap,
> > @@ -1835,9 +1908,16 @@ xfs_bmap_rtalloc(
> >  	if (xfs_bmap_adjacent(ap))
> >  		bno_hint = ap->blkno;
> >  
> > -	error = xfs_rtallocate(ap->tp, bno_hint, raminlen, ralen, prod,
> > -			ap->wasdel, initial_user_data, &rtlocked,
> > -			&ap->blkno, &ap->length);
> > +	if (xfs_has_rtgroups(ap->ip->i_mount)) {
> > +		error = xfs_rtallocate_rtgs(ap->tp, bno_hint, raminlen, ralen,
> > +				prod, ap->wasdel, initial_user_data,
> > +				&ap->blkno, &ap->length);
> > +	} else {
> > +		error = xfs_rtallocate_rtg(ap->tp, 0, bno_hint, raminlen, ralen,
> > +				prod, ap->wasdel, initial_user_data,
> > +				&rtlocked, &ap->blkno, &ap->length);
> > +	}
> 
> The xfs_has_rtgroups() check is unnecessary.  The iterator in
> xfs_rtallocate_rtgs() will do the right thing for the
> !xfs_has_rtgroups() case - it'll set start_rgno = 0 and break out
> after a single call to xfs_rtallocate_rtg() with rgno = 0.
> 
> Another thing that probably should be done here is push all the
> constant value calculations a couple of functions down the stack to
> where they are used. Then we only need to pass two parameters down
> through the rg iterator here, not 11...

..and pass the ap itself too, to remove three of the parameters?

--D

> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
Dave Chinner Aug. 27, 2024, 1:56 a.m. UTC | #3
On Mon, Aug 26, 2024 at 12:40:28PM -0700, Darrick J. Wong wrote:
> On Mon, Aug 26, 2024 at 02:56:37PM +1000, Dave Chinner wrote:
> > On Thu, Aug 22, 2024 at 05:26:38PM -0700, Darrick J. Wong wrote:
> > > From: Christoph Hellwig <hch@lst.de>
> > > 
> > > Make the allocator rtgroup aware by either picking a specific group if
> > > there is a hint, or loop over all groups otherwise.  A simple rotor is
> > > provided to pick the placement for initial allocations.
> > > 
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > ---
> > >  fs/xfs/libxfs/xfs_bmap.c     |   13 +++++-
> > >  fs/xfs/libxfs/xfs_rtbitmap.c |    6 ++-
> > >  fs/xfs/xfs_mount.h           |    1 
> > >  fs/xfs/xfs_rtalloc.c         |   98 ++++++++++++++++++++++++++++++++++++++----
> > >  4 files changed, 105 insertions(+), 13 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> > > index 126a0d253654a..88c62e1158ac7 100644
> > > --- a/fs/xfs/libxfs/xfs_bmap.c
> > > +++ b/fs/xfs/libxfs/xfs_bmap.c
> > > @@ -3151,8 +3151,17 @@ xfs_bmap_adjacent_valid(
> > >  	struct xfs_mount	*mp = ap->ip->i_mount;
> > >  
> > >  	if (XFS_IS_REALTIME_INODE(ap->ip) &&
> > > -	    (ap->datatype & XFS_ALLOC_USERDATA))
> > > -		return x < mp->m_sb.sb_rblocks;
> > > +	    (ap->datatype & XFS_ALLOC_USERDATA)) {
> > > +		if (x >= mp->m_sb.sb_rblocks)
> > > +			return false;
> > > +		if (!xfs_has_rtgroups(mp))
> > > +			return true;
> > > +
> > > +		return xfs_rtb_to_rgno(mp, x) == xfs_rtb_to_rgno(mp, y) &&
> > > +			xfs_rtb_to_rgno(mp, x) < mp->m_sb.sb_rgcount &&
> > > +			xfs_rtb_to_rtx(mp, x) < mp->m_sb.sb_rgextents;
> > 
> > WHy do we need the xfs_has_rtgroups() check here? The new rtg logic will
> > return true for an old school rt device here, right?
> 
> The incore sb_rgextents is zero on !rtg filesystems, so we need the
> xfs_has_rtgroups.

Hmmm. Could we initialise it in memory only for !rtg filesystems,
and make sure we never write it back via a check in the
xfs_sb_to_disk() formatter function?

That would remove one of the problematic in-memory differences
between old skool rtdev setups and the new rtg-based setups...

> > > @@ -1835,9 +1908,16 @@ xfs_bmap_rtalloc(
> > >  	if (xfs_bmap_adjacent(ap))
> > >  		bno_hint = ap->blkno;
> > >  
> > > -	error = xfs_rtallocate(ap->tp, bno_hint, raminlen, ralen, prod,
> > > -			ap->wasdel, initial_user_data, &rtlocked,
> > > -			&ap->blkno, &ap->length);
> > > +	if (xfs_has_rtgroups(ap->ip->i_mount)) {
> > > +		error = xfs_rtallocate_rtgs(ap->tp, bno_hint, raminlen, ralen,
> > > +				prod, ap->wasdel, initial_user_data,
> > > +				&ap->blkno, &ap->length);
> > > +	} else {
> > > +		error = xfs_rtallocate_rtg(ap->tp, 0, bno_hint, raminlen, ralen,
> > > +				prod, ap->wasdel, initial_user_data,
> > > +				&rtlocked, &ap->blkno, &ap->length);
> > > +	}
> > 
> > The xfs_has_rtgroups() check is unnecessary.  The iterator in
> > xfs_rtallocate_rtgs() will do the right thing for the
> > !xfs_has_rtgroups() case - it'll set start_rgno = 0 and break out
> > after a single call to xfs_rtallocate_rtg() with rgno = 0.
> > 
> > Another thing that probably should be done here is push all the
> > constant value calculations a couple of functions down the stack to
> > where they are used. Then we only need to pass two parameters down
> > through the rg iterator here, not 11...
> 
> ..and pass the ap itself too, to remove three of the parameters?

Yeah, I was thinking that the iterator only needs the bno_hint
to determine which group to start iterating. Everything else is
derived from information in the ap structure and so doesn't need to
be calculated above the iterator.

Though we could just lift the xfs_rtalloc_args() up to this level
and stuff all the parameters into that structure and pass it down
instead (like we do with xfs_alloc_args for the btree allocator).
Then we only need to pass args through xfs_rtallocate(),
xfs_rtallocate_extent_near/size() and all the other helper
functions, too.

That's a much bigger sort of cleanup, though, but I think it would
be worth doing a some point because it would bring the rtalloc code
closer to how the btalloc code is structured. And, perhaps, allow us
to potentially share group selection and iteration code between
the bt and rt allocators in future...

-Dave.
Darrick J. Wong Aug. 27, 2024, 2:16 a.m. UTC | #4
On Tue, Aug 27, 2024 at 11:56:31AM +1000, Dave Chinner wrote:
> On Mon, Aug 26, 2024 at 12:40:28PM -0700, Darrick J. Wong wrote:
> > On Mon, Aug 26, 2024 at 02:56:37PM +1000, Dave Chinner wrote:
> > > On Thu, Aug 22, 2024 at 05:26:38PM -0700, Darrick J. Wong wrote:
> > > > From: Christoph Hellwig <hch@lst.de>
> > > > 
> > > > Make the allocator rtgroup aware by either picking a specific group if
> > > > there is a hint, or loop over all groups otherwise.  A simple rotor is
> > > > provided to pick the placement for initial allocations.
> > > > 
> > > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > > ---
> > > >  fs/xfs/libxfs/xfs_bmap.c     |   13 +++++-
> > > >  fs/xfs/libxfs/xfs_rtbitmap.c |    6 ++-
> > > >  fs/xfs/xfs_mount.h           |    1 
> > > >  fs/xfs/xfs_rtalloc.c         |   98 ++++++++++++++++++++++++++++++++++++++----
> > > >  4 files changed, 105 insertions(+), 13 deletions(-)
> > > > 
> > > > 
> > > > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> > > > index 126a0d253654a..88c62e1158ac7 100644
> > > > --- a/fs/xfs/libxfs/xfs_bmap.c
> > > > +++ b/fs/xfs/libxfs/xfs_bmap.c
> > > > @@ -3151,8 +3151,17 @@ xfs_bmap_adjacent_valid(
> > > >  	struct xfs_mount	*mp = ap->ip->i_mount;
> > > >  
> > > >  	if (XFS_IS_REALTIME_INODE(ap->ip) &&
> > > > -	    (ap->datatype & XFS_ALLOC_USERDATA))
> > > > -		return x < mp->m_sb.sb_rblocks;
> > > > +	    (ap->datatype & XFS_ALLOC_USERDATA)) {
> > > > +		if (x >= mp->m_sb.sb_rblocks)
> > > > +			return false;
> > > > +		if (!xfs_has_rtgroups(mp))
> > > > +			return true;
> > > > +
> > > > +		return xfs_rtb_to_rgno(mp, x) == xfs_rtb_to_rgno(mp, y) &&
> > > > +			xfs_rtb_to_rgno(mp, x) < mp->m_sb.sb_rgcount &&
> > > > +			xfs_rtb_to_rtx(mp, x) < mp->m_sb.sb_rgextents;
> > > 
> > > WHy do we need the xfs_has_rtgroups() check here? The new rtg logic will
> > > return true for an old school rt device here, right?
> > 
> > The incore sb_rgextents is zero on !rtg filesystems, so we need the
> > xfs_has_rtgroups.
> 
> Hmmm. Could we initialise it in memory only for !rtg filesystems,
> and make sure we never write it back via a check in the
> xfs_sb_to_disk() formatter function?

Only if the incore sb_rgextents becomes u64, which will then cause the
incore and ondisk superblock structures not to match anymore.  There's
probably not much reason to keep them the same anymore.  That said, up
until recently the metadir patchset actually broke the two apart, but
then hch and I put things back to reduce our own confusion.

> That would remove one of the problematic in-memory differences
> between old skool rtdev setups and the new rtg-based setups...
> 
> > > > @@ -1835,9 +1908,16 @@ xfs_bmap_rtalloc(
> > > >  	if (xfs_bmap_adjacent(ap))
> > > >  		bno_hint = ap->blkno;
> > > >  
> > > > -	error = xfs_rtallocate(ap->tp, bno_hint, raminlen, ralen, prod,
> > > > -			ap->wasdel, initial_user_data, &rtlocked,
> > > > -			&ap->blkno, &ap->length);
> > > > +	if (xfs_has_rtgroups(ap->ip->i_mount)) {
> > > > +		error = xfs_rtallocate_rtgs(ap->tp, bno_hint, raminlen, ralen,
> > > > +				prod, ap->wasdel, initial_user_data,
> > > > +				&ap->blkno, &ap->length);
> > > > +	} else {
> > > > +		error = xfs_rtallocate_rtg(ap->tp, 0, bno_hint, raminlen, ralen,
> > > > +				prod, ap->wasdel, initial_user_data,
> > > > +				&rtlocked, &ap->blkno, &ap->length);
> > > > +	}
> > > 
> > > The xfs_has_rtgroups() check is unnecessary.  The iterator in
> > > xfs_rtallocate_rtgs() will do the right thing for the
> > > !xfs_has_rtgroups() case - it'll set start_rgno = 0 and break out
> > > after a single call to xfs_rtallocate_rtg() with rgno = 0.
> > > 
> > > Another thing that probably should be done here is push all the
> > > constant value calculations a couple of functions down the stack to
> > > where they are used. Then we only need to pass two parameters down
> > > through the rg iterator here, not 11...
> > 
> > ..and pass the ap itself too, to remove three of the parameters?
> 
> Yeah, I was thinking that the iterator only needs the bno_hint
> to determine which group to start iterating. Everything else is
> derived from information in the ap structure and so doesn't need to
> be calculated above the iterator.
> 
> Though we could just lift the xfs_rtalloc_args() up to this level
> and stuff all the parameters into that structure and pass it down
> instead (like we do with xfs_alloc_args for the btree allocator).
> Then we only need to pass args through xfs_rtallocate(),
> xfs_rtallocate_extent_near/size() and all the other helper
> functions, too.
> 
> That's a much bigger sort of cleanup, though, but I think it would
> be worth doing a some point because it would bring the rtalloc code
> closer to how the btalloc code is structured. And, perhaps, allow us
> to potentially share group selection and iteration code between
> the bt and rt allocators in future...

Well we're already tearing the rt allocator to pieces and rebuilding it,
so why not...

--D

> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Christoph Hellwig Aug. 27, 2024, 4:59 a.m. UTC | #5
On Mon, Aug 26, 2024 at 02:56:37PM +1000, Dave Chinner wrote:
> > +	if (xfs_has_rtgroups(ap->ip->i_mount)) {
> > +		error = xfs_rtallocate_rtgs(ap->tp, bno_hint, raminlen, ralen,
> > +				prod, ap->wasdel, initial_user_data,
> > +				&ap->blkno, &ap->length);
> > +	} else {
> > +		error = xfs_rtallocate_rtg(ap->tp, 0, bno_hint, raminlen, ralen,
> > +				prod, ap->wasdel, initial_user_data,
> > +				&rtlocked, &ap->blkno, &ap->length);
> > +	}
> 
> The xfs_has_rtgroups() check is unnecessary.  The iterator in
> xfs_rtallocate_rtgs() will do the right thing for the
> !xfs_has_rtgroups() case - it'll set start_rgno = 0 and break out
> after a single call to xfs_rtallocate_rtg() with rgno = 0.

The iterator itself does, but the start_rgno calculation does not.
But we can make that conditional, which shouldn't be too bad especially
if we merge xfs_rtallocate_rtgs into xfs_bmap_rtalloc.

> Another thing that probably should be done here is push all the
> constant value calculations a couple of functions down the stack to
> where they are used. Then we only need to pass two parameters down
> through the rg iterator here, not 11...

Well, not too much of that actually is constant.
Christoph Hellwig Aug. 27, 2024, 5 a.m. UTC | #6
On Mon, Aug 26, 2024 at 12:40:28PM -0700, Darrick J. Wong wrote:
> ..and pass the ap itself too, to remove three of the parameters?

I tried that earlier, but it breaks the allocation added from the
repair code later in your tree.  We could fake up a partial
xfs_bmalloca there, but that seemed pretty ugly.
Christoph Hellwig Aug. 27, 2024, 5 a.m. UTC | #7
On Mon, Aug 26, 2024 at 07:16:09PM -0700, Darrick J. Wong wrote:
> > Hmmm. Could we initialise it in memory only for !rtg filesystems,
> > and make sure we never write it back via a check in the
> > xfs_sb_to_disk() formatter function?
> 
> Only if the incore sb_rgextents becomes u64, which will then cause the
> incore and ondisk superblock structures not to match anymore.  There's
> probably not much reason to keep them the same anymore.  That said, up
> until recently the metadir patchset actually broke the two apart, but
> then hch and I put things back to reduce our own confusion.

Note that the incore sb really isn't much of a thing.  It's a random
structure that only exists embedded into the XFS mount.  The only
reason we keep adding fields to it is because some of the conversion
functions from/to disk are a mess.  The RT growfs cleanups earlier
in this patchbomb actually take care of a large part of that, so
we should be able to retire it in the not too distant future.
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 126a0d253654a..88c62e1158ac7 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3151,8 +3151,17 @@  xfs_bmap_adjacent_valid(
 	struct xfs_mount	*mp = ap->ip->i_mount;
 
 	if (XFS_IS_REALTIME_INODE(ap->ip) &&
-	    (ap->datatype & XFS_ALLOC_USERDATA))
-		return x < mp->m_sb.sb_rblocks;
+	    (ap->datatype & XFS_ALLOC_USERDATA)) {
+		if (x >= mp->m_sb.sb_rblocks)
+			return false;
+		if (!xfs_has_rtgroups(mp))
+			return true;
+
+		return xfs_rtb_to_rgno(mp, x) == xfs_rtb_to_rgno(mp, y) &&
+			xfs_rtb_to_rgno(mp, x) < mp->m_sb.sb_rgcount &&
+			xfs_rtb_to_rtx(mp, x) < mp->m_sb.sb_rgextents;
+
+	}
 
 	return XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) &&
 		XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount &&
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index c8958d3e0abe0..ef94b67feccd7 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1084,11 +1084,13 @@  xfs_rtfree_extent(
 	 * Mark more blocks free in the superblock.
 	 */
 	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+
 	/*
 	 * If we've now freed all the blocks, reset the file sequence
-	 * number to 0.
+	 * number to 0 for pre-RTG file systems.
 	 */
-	if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
+	if (!xfs_has_rtgroups(mp) &&
+	    tp->t_frextents_delta + mp->m_sb.sb_frextents ==
 	    mp->m_sb.sb_rextents) {
 		if (!(rbmip->i_diflags & XFS_DIFLAG_NEWRTBM))
 			rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index c4e4f5414a299..7e68812db1be7 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -223,6 +223,7 @@  typedef struct xfs_mount {
 #endif
 	xfs_agnumber_t		m_agfrotor;	/* last ag where space found */
 	atomic_t		m_agirotor;	/* last ag dir inode alloced */
+	atomic_t		m_rtgrotor;	/* last rtgroup rtpicked */
 
 	/* Memory shrinker to throttle and reprioritize inodegc */
 	struct shrinker		*m_inodegc_shrinker;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 3fedc552b51b0..2b57ff2687bf6 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1661,8 +1661,9 @@  xfs_rtalloc_align_minmax(
 }
 
 static int
-xfs_rtallocate(
+xfs_rtallocate_rtg(
 	struct xfs_trans	*tp,
+	xfs_rgnumber_t		rgno,
 	xfs_rtblock_t		bno_hint,
 	xfs_rtxlen_t		minlen,
 	xfs_rtxlen_t		maxlen,
@@ -1682,16 +1683,33 @@  xfs_rtallocate(
 	xfs_rtxlen_t		len = 0;
 	int			error = 0;
 
-	args.rtg = xfs_rtgroup_grab(args.mp, 0);
+	args.rtg = xfs_rtgroup_grab(args.mp, rgno);
 	if (!args.rtg)
 		return -ENOSPC;
 
 	/*
-	 * Lock out modifications to both the RT bitmap and summary inodes.
+	 * We need to lock out modifications to both the RT bitmap and summary
+	 * inodes for finding free space in xfs_rtallocate_extent_{near,size}
+	 * and join the bitmap and summary inodes for the actual allocation
+	 * down in xfs_rtallocate_range.
+	 *
+	 * For RTG-enabled file system we don't want to join the inodes to the
+	 * transaction until we are committed to allocate to allocate from this
+	 * RTG so that only one inode of each type is locked at a time.
+	 *
+	 * But for pre-RTG file systems we need to already to join the bitmap
+	 * inode to the transaction for xfs_rtpick_extent, which bumps the
+	 * sequence number in it, so we'll have to join the inode to the
+	 * transaction early here.
+	 *
+	 * This is all a bit messy, but at least the mess is contained in
+	 * this function.
 	 */
 	if (!*rtlocked) {
 		xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP);
-		xfs_rtgroup_trans_join(tp, args.rtg, XFS_RTGLOCK_BITMAP);
+		if (!xfs_has_rtgroups(args.mp))
+			xfs_rtgroup_trans_join(tp, args.rtg,
+					XFS_RTGLOCK_BITMAP);
 		*rtlocked = true;
 	}
 
@@ -1701,7 +1719,7 @@  xfs_rtallocate(
 	 */
 	if (bno_hint)
 		start = xfs_rtb_to_rtx(args.mp, bno_hint);
-	else if (initial_user_data)
+	else if (!xfs_has_rtgroups(args.mp) && initial_user_data)
 		start = xfs_rtpick_extent(args.rtg, tp, maxlen);
 
 	if (start) {
@@ -1722,8 +1740,16 @@  xfs_rtallocate(
 				prod, &rtx);
 	}
 
-	if (error)
+	if (error) {
+		if (xfs_has_rtgroups(args.mp)) {
+			xfs_rtgroup_unlock(args.rtg, XFS_RTGLOCK_BITMAP);
+			*rtlocked = false;
+		}
 		goto out_release;
+	}
+
+	if (xfs_has_rtgroups(args.mp))
+		xfs_rtgroup_trans_join(tp, args.rtg, XFS_RTGLOCK_BITMAP);
 
 	error = xfs_rtallocate_range(&args, rtx, len);
 	if (error)
@@ -1741,6 +1767,53 @@  xfs_rtallocate(
 	return error;
 }
 
+static int
+xfs_rtallocate_rtgs(
+	struct xfs_trans	*tp,
+	xfs_fsblock_t		bno_hint,
+	xfs_rtxlen_t		minlen,
+	xfs_rtxlen_t		maxlen,
+	xfs_rtxlen_t		prod,
+	bool			wasdel,
+	bool			initial_user_data,
+	xfs_rtblock_t		*bno,
+	xfs_extlen_t		*blen)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	xfs_rgnumber_t		start_rgno, rgno;
+	int			error;
+
+	/*
+	 * For now this just blindly iterates over the RTGs for an initial
+	 * allocation.  We could try to keep an in-memory rtg_longest member
+	 * to avoid the locking when just looking for big enough free space,
+	 * but for now this keep things simple.
+	 */
+	if (bno_hint != NULLFSBLOCK)
+		start_rgno = xfs_rtb_to_rgno(mp, bno_hint);
+	else
+		start_rgno = (atomic_inc_return(&mp->m_rtgrotor) - 1) %
+				mp->m_sb.sb_rgcount;
+
+	rgno = start_rgno;
+	do {
+		bool		rtlocked = false;
+
+		error = xfs_rtallocate_rtg(tp, rgno, bno_hint, minlen, maxlen,
+				prod, wasdel, initial_user_data, &rtlocked,
+				bno, blen);
+		if (error != -ENOSPC)
+			return error;
+		ASSERT(!rtlocked);
+
+		if (++rgno == mp->m_sb.sb_rgcount)
+			rgno = 0;
+		bno_hint = NULLFSBLOCK;
+	} while (rgno != start_rgno);
+
+	return -ENOSPC;
+}
+
 static int
 xfs_rtallocate_align(
 	struct xfs_bmalloca	*ap,
@@ -1835,9 +1908,16 @@  xfs_bmap_rtalloc(
 	if (xfs_bmap_adjacent(ap))
 		bno_hint = ap->blkno;
 
-	error = xfs_rtallocate(ap->tp, bno_hint, raminlen, ralen, prod,
-			ap->wasdel, initial_user_data, &rtlocked,
-			&ap->blkno, &ap->length);
+	if (xfs_has_rtgroups(ap->ip->i_mount)) {
+		error = xfs_rtallocate_rtgs(ap->tp, bno_hint, raminlen, ralen,
+				prod, ap->wasdel, initial_user_data,
+				&ap->blkno, &ap->length);
+	} else {
+		error = xfs_rtallocate_rtg(ap->tp, 0, bno_hint, raminlen, ralen,
+				prod, ap->wasdel, initial_user_data,
+				&rtlocked, &ap->blkno, &ap->length);
+	}
+
 	if (error == -ENOSPC) {
 		if (!noalign) {
 			/*