
[11/24] xfs: create incore realtime group structures

Message ID 172437087433.59588.10419191726395528458.stgit@frogsfrogsfrogs
State New
Series [01/24] xfs: clean up the ISVALID macro in xfs_bmap_adjacent

Commit Message

Darrick J. Wong Aug. 23, 2024, 12:17 a.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

Create an incore object that will contain information about a realtime
allocation group.  This will eventually enable us to shard the realtime
section in a similar manner to how we shard the data section, but for
now just a single object for the entire RT subvolume is created.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile             |    1 
 fs/xfs/libxfs/xfs_format.h  |    3 +
 fs/xfs/libxfs/xfs_rtgroup.c |  196 ++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rtgroup.h |  212 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_sb.c      |    7 +
 fs/xfs/libxfs/xfs_types.h   |    4 +
 fs/xfs/xfs_log_recover.c    |   20 ++++
 fs/xfs/xfs_mount.c          |   16 +++
 fs/xfs/xfs_mount.h          |   14 +++
 fs/xfs/xfs_rtalloc.c        |    6 +
 fs/xfs/xfs_super.c          |    1 
 fs/xfs/xfs_trace.c          |    1 
 fs/xfs/xfs_trace.h          |   38 ++++++++
 13 files changed, 517 insertions(+), 2 deletions(-)
 create mode 100644 fs/xfs/libxfs/xfs_rtgroup.c
 create mode 100644 fs/xfs/libxfs/xfs_rtgroup.h

Comments

Christoph Hellwig Aug. 23, 2024, 5:01 a.m. UTC | #1
Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Dave Chinner Aug. 25, 2024, 11:56 p.m. UTC | #2
On Thu, Aug 22, 2024 at 05:17:31PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Create an incore object that will contain information about a realtime
> allocation group.  This will eventually enable us to shard the realtime
> section in a similar manner to how we shard the data section, but for
> now just a single object for the entire RT subvolume is created.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  fs/xfs/Makefile             |    1 
>  fs/xfs/libxfs/xfs_format.h  |    3 +
>  fs/xfs/libxfs/xfs_rtgroup.c |  196 ++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_rtgroup.h |  212 +++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_sb.c      |    7 +
>  fs/xfs/libxfs/xfs_types.h   |    4 +
>  fs/xfs/xfs_log_recover.c    |   20 ++++
>  fs/xfs/xfs_mount.c          |   16 +++
>  fs/xfs/xfs_mount.h          |   14 +++
>  fs/xfs/xfs_rtalloc.c        |    6 +
>  fs/xfs/xfs_super.c          |    1 
>  fs/xfs/xfs_trace.c          |    1 
>  fs/xfs/xfs_trace.h          |   38 ++++++++
>  13 files changed, 517 insertions(+), 2 deletions(-)
>  create mode 100644 fs/xfs/libxfs/xfs_rtgroup.c
>  create mode 100644 fs/xfs/libxfs/xfs_rtgroup.h

Ok, how is the global address space for real time extents laid out
across rt groups? i.e. is it sparse similar to how fsbnos and inode
numbers are created for the data device like so?

	fsbno = (agno << agblklog) | agbno

Or is it something different? I can't find that defined anywhere in
this patch, so I can't determine if the unit conversion code and
validation is correct or not...

> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 4d8ca08cdd0ec..388b5cef48ca5 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -60,6 +60,7 @@ xfs-y				+= $(addprefix libxfs/, \
>  # xfs_rtbitmap is shared with libxfs
>  xfs-$(CONFIG_XFS_RT)		+= $(addprefix libxfs/, \
>  				   xfs_rtbitmap.o \
> +				   xfs_rtgroup.o \
>  				   )
>  
>  # highlevel code
> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> index 16a7bc02aa5f5..fa5cfc8265d92 100644
> --- a/fs/xfs/libxfs/xfs_format.h
> +++ b/fs/xfs/libxfs/xfs_format.h
> @@ -176,6 +176,9 @@ typedef struct xfs_sb {
>  
>  	xfs_ino_t	sb_metadirino;	/* metadata directory tree root */
>  
> +	xfs_rgnumber_t	sb_rgcount;	/* number of realtime groups */
> +	xfs_rtxlen_t	sb_rgextents;	/* size of a realtime group in rtx */

So min/max rtgroup size is defined by the sb_rextsize field? What
redundant metadata do we end up with that allows us to validate
the sb_rextsize field is still valid w.r.t. rtgroups geometry?

Also, rtgroup lengths are defined by "rtx counts", but the
definitions in the xfs_mount later on are "m_rtblklog" and
"m_rgblocks" and we use xfs_rgblock_t and rgbno all over the place.

Just from the context of this patch, it is somewhat confusing trying
to work out what the difference is...


>  	/* must be padded to 64 bit alignment */
>  } xfs_sb_t;
>  
> diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
> new file mode 100644
> index 0000000000000..2bad1ecb811eb
> --- /dev/null
> +++ b/fs/xfs/libxfs/xfs_rtgroup.c
> @@ -0,0 +1,196 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
> + * Author: Darrick J. Wong <djwong@kernel.org>
> + */
> +#include "xfs.h"
> +#include "xfs_fs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_bit.h"
> +#include "xfs_sb.h"
> +#include "xfs_mount.h"
> +#include "xfs_btree.h"
> +#include "xfs_alloc_btree.h"
> +#include "xfs_rmap_btree.h"
> +#include "xfs_alloc.h"
> +#include "xfs_ialloc.h"
> +#include "xfs_rmap.h"
> +#include "xfs_ag.h"
> +#include "xfs_ag_resv.h"
> +#include "xfs_health.h"
> +#include "xfs_error.h"
> +#include "xfs_bmap.h"
> +#include "xfs_defer.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans.h"
> +#include "xfs_trace.h"
> +#include "xfs_inode.h"
> +#include "xfs_icache.h"
> +#include "xfs_rtgroup.h"
> +#include "xfs_rtbitmap.h"
> +
> +/*
> + * Passive reference counting access wrappers to the rtgroup structures.  If
> + * the rtgroup structure is to be freed, the freeing code is responsible for
> + * cleaning up objects with passive references before freeing the structure.
> + */
> +struct xfs_rtgroup *
> +xfs_rtgroup_get(
> +	struct xfs_mount	*mp,
> +	xfs_rgnumber_t		rgno)
> +{
> +	struct xfs_rtgroup	*rtg;
> +
> +	rcu_read_lock();
> +	rtg = xa_load(&mp->m_rtgroups, rgno);
> +	if (rtg) {
> +		trace_xfs_rtgroup_get(rtg, _RET_IP_);
> +		ASSERT(atomic_read(&rtg->rtg_ref) >= 0);
> +		atomic_inc(&rtg->rtg_ref);
> +	}
> +	rcu_read_unlock();
> +	return rtg;
> +}
> +
> +/* Get a passive reference to the given rtgroup. */
> +struct xfs_rtgroup *
> +xfs_rtgroup_hold(
> +	struct xfs_rtgroup	*rtg)
> +{
> +	ASSERT(atomic_read(&rtg->rtg_ref) > 0 ||
> +	       atomic_read(&rtg->rtg_active_ref) > 0);
> +
> +	trace_xfs_rtgroup_hold(rtg, _RET_IP_);
> +	atomic_inc(&rtg->rtg_ref);
> +	return rtg;
> +}
> +
> +void
> +xfs_rtgroup_put(
> +	struct xfs_rtgroup	*rtg)
> +{
> +	trace_xfs_rtgroup_put(rtg, _RET_IP_);
> +	ASSERT(atomic_read(&rtg->rtg_ref) > 0);
> +	atomic_dec(&rtg->rtg_ref);
> +}
> +
> +/*
> + * Active references for rtgroup structures. This is for short term access to
> + * the rtgroup structures for walking trees or accessing state. If an rtgroup
> + * is being shrunk or is offline, then this will fail to find that group and
> + * return NULL instead.
> + */
> +struct xfs_rtgroup *
> +xfs_rtgroup_grab(
> +	struct xfs_mount	*mp,
> +	xfs_agnumber_t		agno)
> +{
> +	struct xfs_rtgroup	*rtg;
> +
> +	rcu_read_lock();
> +	rtg = xa_load(&mp->m_rtgroups, agno);
> +	if (rtg) {
> +		trace_xfs_rtgroup_grab(rtg, _RET_IP_);
> +		if (!atomic_inc_not_zero(&rtg->rtg_active_ref))
> +			rtg = NULL;
> +	}
> +	rcu_read_unlock();
> +	return rtg;
> +}
> +
> +void
> +xfs_rtgroup_rele(
> +	struct xfs_rtgroup	*rtg)
> +{
> +	trace_xfs_rtgroup_rele(rtg, _RET_IP_);
> +	if (atomic_dec_and_test(&rtg->rtg_active_ref))
> +		wake_up(&rtg->rtg_active_wq);
> +}

This is all duplicates of the xfs_perag code. Can you put together a
patchset to abstract this into a "xfs_group" and embed them in both
the perag and rtgroup structures?

That way we only need one set of lookup and iterator infrastructure,
and it will work for both data and rt groups...

> +
> +/* Compute the number of rt extents in this realtime group. */
> +xfs_rtxnum_t
> +xfs_rtgroup_extents(
> +	struct xfs_mount	*mp,
> +	xfs_rgnumber_t		rgno)
> +{
> +	xfs_rgnumber_t		rgcount = mp->m_sb.sb_rgcount;
> +
> +	ASSERT(rgno < rgcount);
> +	if (rgno == rgcount - 1)
> +		return mp->m_sb.sb_rextents -
> +			((xfs_rtxnum_t)rgno * mp->m_sb.sb_rgextents);

Urk. So this relies on a non-rtgroup filesystem doing a
multiplication by zero of a field that the on-disk format does not
understand to get the right result.  I think this is copying a bad
pattern we've been slowly trying to remove from the normal
allocation group code.

> +
> +	ASSERT(xfs_has_rtgroups(mp));
> +	return mp->m_sb.sb_rgextents;
> +}

We already embed the length of the rtgroup in the rtgroup structure.
This should be looking up the rtgroup (or being passed the rtgroup
the caller already has) and doing the right thing. i.e.

	if (!rtg || !xfs_has_rtgroups(rtg->rtg_mount))
		return mp->m_sb.sb_rextents;
	return rtg->rtg_extents;


> diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
> new file mode 100644
> index 0000000000000..2c09ecfc50328
> --- /dev/null
> +++ b/fs/xfs/libxfs/xfs_rtgroup.h
> @@ -0,0 +1,212 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +/*
> + * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
> + * Author: Darrick J. Wong <djwong@kernel.org>
> + */
> +#ifndef __LIBXFS_RTGROUP_H
> +#define __LIBXFS_RTGROUP_H 1
> +
> +struct xfs_mount;
> +struct xfs_trans;
> +
> +/*
> + * Realtime group incore structure, similar to the per-AG structure.
> + */
> +struct xfs_rtgroup {
> +	struct xfs_mount	*rtg_mount;
> +	xfs_rgnumber_t		rtg_rgno;
> +	atomic_t		rtg_ref;	/* passive reference count */
> +	atomic_t		rtg_active_ref;	/* active reference count */
> +	wait_queue_head_t	rtg_active_wq;/* woken active_ref falls to zero */

Yeah, that's all common with xfs_perag....

....
> +/*
> + * rt group iteration APIs
> + */
> +static inline struct xfs_rtgroup *
> +xfs_rtgroup_next(
> +	struct xfs_rtgroup	*rtg,
> +	xfs_rgnumber_t		*rgno,
> +	xfs_rgnumber_t		end_rgno)
> +{
> +	struct xfs_mount	*mp = rtg->rtg_mount;
> +
> +	*rgno = rtg->rtg_rgno + 1;
> +	xfs_rtgroup_rele(rtg);
> +	if (*rgno > end_rgno)
> +		return NULL;
> +	return xfs_rtgroup_grab(mp, *rgno);
> +}
> +
> +#define for_each_rtgroup_range(mp, rgno, end_rgno, rtg) \
> +	for ((rtg) = xfs_rtgroup_grab((mp), (rgno)); \
> +		(rtg) != NULL; \
> +		(rtg) = xfs_rtgroup_next((rtg), &(rgno), (end_rgno)))
> +
> +#define for_each_rtgroup_from(mp, rgno, rtg) \
> +	for_each_rtgroup_range((mp), (rgno), (mp)->m_sb.sb_rgcount - 1, (rtg))
> +
> +
> +#define for_each_rtgroup(mp, rgno, rtg) \
> +	(rgno) = 0; \
> +	for_each_rtgroup_from((mp), (rgno), (rtg))

Yup, that's all common with xfs_perag iteration, too. Can you put
together a patchset to unify these, please?

> +static inline bool
> +xfs_verify_rgbno(
> +	struct xfs_rtgroup	*rtg,
> +	xfs_rgblock_t		rgbno)

Ok, what's the difference between an xfs_rgblock_t and a "rtx"?

OH.... The penny just dropped - it's another "single letter
difference that's really, really hard to spot" problem. You've
defined "xfs_r*g*block_t" to be like an a*g*bno, but we have
xfs_r*t*block_t for the global 64bit block number instead of a
xfs_fsbno_t.

We just had a bug caused by exactly this sort of confusion with a
patch that mixed xfs_[f]inobt changes together and one of the
conversions was incorrect. Nobody spotted the single incorrect
letter in the bigger patch, and I can see -exactly- the same sort of
confusion happening with rtblock vs rgblock causing implicit 32/64
bit integer promotion bugs...
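A contrived example of the trap (hypothetical snippet, purely to
illustrate the promotion):

	xfs_rgnumber_t	rgno;
	xfs_rgblock_t	rgbno;
	xfs_rtblock_t	rtbno;

	/* 32 bit multiply wraps before the 64 bit assignment */
	rtbno = rgno * mp->m_rgblocks + rgbno;

	/* correct - force 64 bit arithmetic up front */
	rtbno = (xfs_rtblock_t)rgno * mp->m_rgblocks + rgbno;

Both lines compile without a warning, and the missing cast is just
as hard to spot as a single wrong letter in a type name.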

> +{
> +	struct xfs_mount	*mp = rtg->rtg_mount;
> +
> +	if (rgbno >= rtg->rtg_extents * mp->m_sb.sb_rextsize)
> +		return false;

Why isn't the max valid "rgbno" stored in the rtgroup instead of
having to multiply the extent count by extent size every time we
have to verify a rgbno? (i.e. same as pag->block_count).

We know from the agbno verification this will be a -very- hot path,
and so precalculating all the constants and storing them in the rtg
should be done right from the start here.

> +	if (xfs_has_rtsb(mp) && rtg->rtg_rgno == 0 &&
> +	    rgbno < mp->m_sb.sb_rextsize)
> +		return false;

Same here - this value is stored in pag->min_block...

> +	return true;
> +}

And then, if we put the max_bno and min_bno in the generic
"xfs_group" structure, we suddenly have a generic "group bno"
verification mechanism that is independent of whether the group is
an AG or an rtg:

static inline bool
xfs_verify_gbno(
     struct xfs_group      *g,
     xfs_gblock_t         gbno)
{
     if (gbno >= g->block_count)
             return false;
     if (gbno < g->min_block)
             return false;
     return true;
}

And the rest of these functions fall out the same way....
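e.g. assuming the group structure is embedded as rtg->g, the rt
wrapper collapses to a one-liner (sketch only):

	static inline bool
	xfs_verify_rgbno(
		struct xfs_rtgroup	*rtg,
		xfs_rgblock_t		rgbno)
	{
		return xfs_verify_gbno(&rtg->g, rgbno);
	}

and the perag equivalent is identical apart from the types.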


> +static inline xfs_rtblock_t
> +xfs_rgno_start_rtb(
> +	struct xfs_mount	*mp,
> +	xfs_rgnumber_t		rgno)
> +{
> +	if (mp->m_rgblklog >= 0)
> +		return ((xfs_rtblock_t)rgno << mp->m_rgblklog);
> +	return ((xfs_rtblock_t)rgno * mp->m_rgblocks);
> +}

Where does mp->m_rgblklog come from? That wasn't added to the
on-disk superblock structure and it is always initialised to zero
in this patch.

When will m_rgblklog be zero and when will it be non-zero? If it's
only going to be zero for existing non-rtg realtime systems,
then this code makes little sense (again, relying on multiplication
by zero to get the right result). If it's not always used for
rtg enabled filesytsems, then the reason for that has not been
explained and I can't work out why this would ever need to be done.


> +static inline xfs_rtblock_t
> +xfs_rgbno_to_rtb(
> +	struct xfs_mount	*mp,
> +	xfs_rgnumber_t		rgno,
> +	xfs_rgblock_t		rgbno)
> +{
> +	return xfs_rgno_start_rtb(mp, rgno) + rgbno;
> +}
> +
> +static inline xfs_rgnumber_t
> +xfs_rtb_to_rgno(
> +	struct xfs_mount	*mp,
> +	xfs_rtblock_t		rtbno)
> +{
> +	if (!xfs_has_rtgroups(mp))
> +		return 0;
> +
> +	if (mp->m_rgblklog >= 0)
> +		return rtbno >> mp->m_rgblklog;
> +
> +	return div_u64(rtbno, mp->m_rgblocks);
> +}

Ah, now I'm really confused, because m_rgblklog is completely
bypassed for legacy rt filesystems.

And I just realised, this "if (mp->m_rgblklog >= 0)" implies that
m_rgblklog can have negative values and there's no comments anywhere
about why that can happen and what would trigger it. 

We validate sb_agblklog during the superblock verifier, and so once
the filesystem is mounted we never, ever need to check whether
sb_agblklog is in range. Why is the rtblklog being handled so
differently here?


> +
> +static inline uint64_t
> +__xfs_rtb_to_rgbno(
> +	struct xfs_mount	*mp,
> +	xfs_rtblock_t		rtbno)
> +{
> +	uint32_t		rem;
> +
> +	if (!xfs_has_rtgroups(mp))
> +		return rtbno;
> +
> +	if (mp->m_rgblklog >= 0)
> +		return rtbno & mp->m_rgblkmask;
> +
> +	div_u64_rem(rtbno, mp->m_rgblocks, &rem);
> +	return rem;
> +}

Why is this function returning a uint64_t - a xfs_rgblock_t is only
a 32 bit type...

> +
> +static inline xfs_rgblock_t
> +xfs_rtb_to_rgbno(
> +	struct xfs_mount	*mp,
> +	xfs_rtblock_t		rtbno)
> +{
> +	return __xfs_rtb_to_rgbno(mp, rtbno);
> +}
> +
> +static inline xfs_daddr_t
> +xfs_rtb_to_daddr(
> +	struct xfs_mount	*mp,
> +	xfs_rtblock_t		rtbno)
> +{
> +	return rtbno << mp->m_blkbb_log;
> +}
> +
> +static inline xfs_rtblock_t
> +xfs_daddr_to_rtb(
> +	struct xfs_mount	*mp,
> +	xfs_daddr_t		daddr)
> +{
> +	return daddr >> mp->m_blkbb_log;
> +}

Ah. This code doesn't sparsify the xfs_rtblock_t address space for
rtgroups. xfs_rtblock_t is still direct physical encoding of the
location on disk.

I really think that needs to be changed to match how xfs_fsbno_t is
a sparse encoding before these changes get merged. It shouldn't
affect any of the other code in the patch set - the existing rt code
has a rtgno of 0, so it will always be a direct physical encoding
even when using a sparse xfs_rtblock_t address space.

All that moving to a sparse encoding means is that the addresses
stored in the BMBT are logical addresses rather than physical
addresses.  It should not affect any of the other code, just what
ends up stored on disk for global 64-bit rt extent addresses...
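i.e. something like this, mirroring the fsbno encoding (sketch,
assuming a rounded-up group size log similar to sb_agblklog):

	rtbno = ((xfs_rtblock_t)rgno << rgblklog) | rgbno;
	rgno  = rtbno >> rgblklog;
	rgbno = rtbno & ((1ULL << rgblklog) - 1);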

In doing this, I think we can greatly simplify all this group
management stuff as most of the verification, type conversion and
iteration infrastructure can then be shared between the existing perag
and the new rtg infrastructure....

> diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
> index a8cd44d03ef64..1ce4b9eb16f47 100644
> --- a/fs/xfs/libxfs/xfs_types.h
> +++ b/fs/xfs/libxfs/xfs_types.h
> @@ -9,10 +9,12 @@
>  typedef uint32_t	prid_t;		/* project ID */
>  
>  typedef uint32_t	xfs_agblock_t;	/* blockno in alloc. group */
> +typedef uint32_t	xfs_rgblock_t;	/* blockno in realtime group */

Is that right? The rtg length is 2^32 * rtextsize, and rtextsize can
be 2^20 bytes:

#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024)

Hence for a 4kB fsbno filesystem, the actual maximum size of an rtg
in filesystem blocks far exceeds what we can address with a 32 bit
variable.
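e.g. taking that reading, a 2^32 rtextent group with a 1GiB
rtextsize on a 4kiB block filesystem would span 2^32 * 2^18 = 2^50
filesystem blocks.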

If xfs_rgblock_t is actually indexing multi-fsbno rtextents, then it
is an extent number index, not a "block" index. An extent number
index won't overflow 32 bits (because the rtg has a max of 2^32 - 1
rtextents)

IOWs, shouldn't this be named something like:

typedef uint32_t	xfs_rgext_t;	/* extent number in realtime group */

>  typedef uint32_t	xfs_agino_t;	/* inode # within allocation grp */
>  typedef uint32_t	xfs_extlen_t;	/* extent length in blocks */
>  typedef uint32_t	xfs_rtxlen_t;	/* file extent length in rtextents */
>  typedef uint32_t	xfs_agnumber_t;	/* allocation group number */
> +typedef uint32_t	xfs_rgnumber_t;	/* realtime group number */
>  typedef uint64_t	xfs_extnum_t;	/* # of extents in a file */
>  typedef uint32_t	xfs_aextnum_t;	/* # extents in an attribute fork */
>  typedef int64_t		xfs_fsize_t;	/* bytes in a file */
> @@ -53,7 +55,9 @@ typedef void *		xfs_failaddr_t;
>  #define	NULLFILEOFF	((xfs_fileoff_t)-1)
>  
>  #define	NULLAGBLOCK	((xfs_agblock_t)-1)
> +#define NULLRGBLOCK	((xfs_rgblock_t)-1)
>  #define	NULLAGNUMBER	((xfs_agnumber_t)-1)
> +#define	NULLRGNUMBER	((xfs_rgnumber_t)-1)

What's the maximum valid rtg number? We're not ever going to be
supporting 2^32 - 2 rtgs, so what is a realistic maximum we can cap
this at and validate it at?

>  #define NULLCOMMITLSN	((xfs_lsn_t)-1)
>  
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index 4423dd344239b..c627cde3bb1e0 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -28,6 +28,7 @@
>  #include "xfs_ag.h"
>  #include "xfs_quota.h"
>  #include "xfs_reflink.h"
> +#include "xfs_rtgroup.h"
>  
>  #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
>  
> @@ -3346,6 +3347,7 @@ xlog_do_recover(
>  	struct xfs_mount	*mp = log->l_mp;
>  	struct xfs_buf		*bp = mp->m_sb_bp;
>  	struct xfs_sb		*sbp = &mp->m_sb;
> +	xfs_rgnumber_t		old_rgcount = sbp->sb_rgcount;
>  	int			error;
>  
>  	trace_xfs_log_recover(log, head_blk, tail_blk);
> @@ -3399,6 +3401,24 @@ xlog_do_recover(
>  		xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
>  		return error;
>  	}
> +
> +	if (sbp->sb_rgcount < old_rgcount) {
> +		xfs_warn(mp, "rgcount shrink not supported");
> +		return -EINVAL;
> +	}
> +	if (sbp->sb_rgcount > old_rgcount) {
> +		xfs_rgnumber_t		rgno;
> +
> +		for (rgno = old_rgcount; rgno < sbp->sb_rgcount; rgno++) {
> +			error = xfs_rtgroup_alloc(mp, rgno);
> +			if (error) {
> +				xfs_warn(mp,
> +	"Failed post-recovery rtgroup init: %d",
> +						error);
> +				return error;
> +			}
> +		}
> +	}

Please factor this out into a separate function with all the other
rtgroup init/teardown code. That means we don't have to care about
how rtgrowfs functions in recovery code, similar to the
xfs_initialize_perag() already in this function for handling
recovery of data device growing...

>  	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
>  
>  	/* Normal transactions can now occur */
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index b0ea88acdb618..e1e849101cdd4 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -36,6 +36,7 @@
>  #include "xfs_ag.h"
>  #include "xfs_rtbitmap.h"
>  #include "xfs_metafile.h"
> +#include "xfs_rtgroup.h"
>  #include "scrub/stats.h"
>  
>  static DEFINE_MUTEX(xfs_uuid_table_mutex);
> @@ -664,6 +665,7 @@ xfs_mountfs(
>  	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
>  	uint			quotamount = 0;
>  	uint			quotaflags = 0;
> +	xfs_rgnumber_t		rgno;
>  	int			error = 0;
>  
>  	xfs_sb_mount_common(mp, sbp);
> @@ -830,10 +832,18 @@ xfs_mountfs(
>  		goto out_free_dir;
>  	}
>  
> +	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
> +		error = xfs_rtgroup_alloc(mp, rgno);
> +		if (error) {
> +			xfs_warn(mp, "Failed rtgroup init: %d", error);
> +			goto out_free_rtgroup;
> +		}
> +	}

Same - factor this to a xfs_rtgroup_init() function located with the
rest of the rtgroup infrastructure...
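Something like this, perhaps (sketch only, name made up):

	int
	xfs_rtgroup_init(
		struct xfs_mount	*mp,
		xfs_rgnumber_t		first_rgno,
		xfs_rgnumber_t		end_rgno)
	{
		xfs_rgnumber_t		rgno;
		int			error;

		/* instantiate incore rtgroups in [first_rgno, end_rgno) */
		for (rgno = first_rgno; rgno < end_rgno; rgno++) {
			error = xfs_rtgroup_alloc(mp, rgno);
			if (error)
				return error;
		}
		return 0;
	}

so the mount and recovery paths share the loop and just supply
their own warning messages on failure.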

> +
>  	if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
>  		xfs_warn(mp, "no log defined");
>  		error = -EFSCORRUPTED;
> -		goto out_free_perag;
> +		goto out_free_rtgroup;
>  	}
>  
>  	error = xfs_inodegc_register_shrinker(mp);
> @@ -1068,7 +1078,8 @@ xfs_mountfs(
>  	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
>  		xfs_buftarg_drain(mp->m_logdev_targp);
>  	xfs_buftarg_drain(mp->m_ddev_targp);
> - out_free_perag:
> + out_free_rtgroup:
> +	xfs_free_rtgroups(mp, rgno);
>  	xfs_free_perag(mp);
>   out_free_dir:
>  	xfs_da_unmount(mp);
> @@ -1152,6 +1163,7 @@ xfs_unmountfs(
>  	xfs_errortag_clearall(mp);
>  #endif
>  	shrinker_free(mp->m_inodegc_shrinker);
> +	xfs_free_rtgroups(mp, mp->m_sb.sb_rgcount);

... like you've already for the cleanup side ;)

....

> @@ -1166,6 +1169,9 @@ xfs_rtmount_inodes(
>  	if (error)
>  		goto out_rele_summary;
>  
> +	for_each_rtgroup(mp, rgno, rtg)
> +		rtg->rtg_extents = xfs_rtgroup_extents(mp, rtg->rtg_rgno);
> +

This also needs to be done after recovery has initialised new rtgs
as a result of replaying a sb growfs modification, right?

Which leads to the next question: if there are thousands of rtgs,
this requires walking every rtg at mount time, right? We know that
walking thousands of static structures at mount time is a
scalability issue, so can we please avoid this if at all possible?
i.e. do demand loading of per-rtg metadata when it is first required
(like we do with agf/agi information) rather than doing it all at
mount time...
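e.g. something along these lines (sketch; init-once serialisation
omitted for brevity):

	static inline xfs_rtxnum_t
	xfs_rtgroup_get_extents(
		struct xfs_rtgroup	*rtg)
	{
		/* defer the calculation until the group is first used */
		if (!rtg->rtg_extents)
			rtg->rtg_extents = xfs_rtgroup_extents(
					rtg->rtg_mount, rtg->rtg_rgno);
		return rtg->rtg_extents;
	}

so the first user of a given rtg pays the cost instead of the
mount path walking every group up front.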

-Dave.
Darrick J. Wong Aug. 26, 2024, 7:14 p.m. UTC | #3
On Mon, Aug 26, 2024 at 09:56:08AM +1000, Dave Chinner wrote:
> On Thu, Aug 22, 2024 at 05:17:31PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > Create an incore object that will contain information about a realtime
> > allocation group.  This will eventually enable us to shard the realtime
> > section in a similar manner to how we shard the data section, but for
> > now just a single object for the entire RT subvolume is created.
> > 
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > ---
> >  fs/xfs/Makefile             |    1 
> >  fs/xfs/libxfs/xfs_format.h  |    3 +
> >  fs/xfs/libxfs/xfs_rtgroup.c |  196 ++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/libxfs/xfs_rtgroup.h |  212 +++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/libxfs/xfs_sb.c      |    7 +
> >  fs/xfs/libxfs/xfs_types.h   |    4 +
> >  fs/xfs/xfs_log_recover.c    |   20 ++++
> >  fs/xfs/xfs_mount.c          |   16 +++
> >  fs/xfs/xfs_mount.h          |   14 +++
> >  fs/xfs/xfs_rtalloc.c        |    6 +
> >  fs/xfs/xfs_super.c          |    1 
> >  fs/xfs/xfs_trace.c          |    1 
> >  fs/xfs/xfs_trace.h          |   38 ++++++++
> >  13 files changed, 517 insertions(+), 2 deletions(-)
> >  create mode 100644 fs/xfs/libxfs/xfs_rtgroup.c
> >  create mode 100644 fs/xfs/libxfs/xfs_rtgroup.h
> 
> Ok, how is the global address space for real time extents laid out
> across rt groups? i.e. is it sparse similar to how fsbnos and inode
> numbers are created for the data device like so?
> 
> 	fsbno = (agno << agblklog) | agbno
> 
> Or is it something different? I can't find that defined anywhere in
> this patch, so I can't determine if the unit conversion code and
> validation is correct or not...

They're not sparse like fsbnos on the data device, they're laid end to
end.  IOWs, it's a straight linear translation.  If you have an rtgroup
that is 50 blocks long, then rtgroup 1 starts at (50 * blocksize).
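In code terms it's just (matching the conversion helpers later in
this patch):

	rtbno = (xfs_rtblock_t)rgno * mp->m_rgblocks + rgbno;

with no unused bits between groups.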

This patch, FWIW, refactors the existing rt code so that a !rtgroups
filesystem is represented by one large "group", with xfs_rtxnum_t now
indexing rt extents within a group.  Probably it should be renamed to
xfs_rgxnum_t.

Note that we haven't defined the rtgroup ondisk format yet, so I'll go
amend that patch to spell out the ondisk format of the brave new world.

> > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> > index 4d8ca08cdd0ec..388b5cef48ca5 100644
> > --- a/fs/xfs/Makefile
> > +++ b/fs/xfs/Makefile
> > @@ -60,6 +60,7 @@ xfs-y				+= $(addprefix libxfs/, \
> >  # xfs_rtbitmap is shared with libxfs
> >  xfs-$(CONFIG_XFS_RT)		+= $(addprefix libxfs/, \
> >  				   xfs_rtbitmap.o \
> > +				   xfs_rtgroup.o \
> >  				   )
> >  
> >  # highlevel code
> > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> > index 16a7bc02aa5f5..fa5cfc8265d92 100644
> > --- a/fs/xfs/libxfs/xfs_format.h
> > +++ b/fs/xfs/libxfs/xfs_format.h
> > @@ -176,6 +176,9 @@ typedef struct xfs_sb {
> >  
> >  	xfs_ino_t	sb_metadirino;	/* metadata directory tree root */
> >  
> > +	xfs_rgnumber_t	sb_rgcount;	/* number of realtime groups */
> > +	xfs_rtxlen_t	sb_rgextents;	/* size of a realtime group in rtx */
> 
> So min/max rtgroup size is defined by the sb_rextsize field? What
> redundant metadata do we end up with that allows us to validate
> the sb_rextsize field is still valid w.r.t. rtgroups geometry?
> 
> Also, rtgroup lengths are defined by "rtx counts", but the
> definitions in the xfs_mount later on are "m_rtblklog" and
> "m_rgblocks" and we use xfs_rgblock_t and rgbno all over the place.
> 
> Just from the context of this patch, it is somewhat confusing trying
> to work out what the difference is...
> 
> 
> >  	/* must be padded to 64 bit alignment */
> >  } xfs_sb_t;
> >  
> > diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
> > new file mode 100644
> > index 0000000000000..2bad1ecb811eb
> > --- /dev/null
> > +++ b/fs/xfs/libxfs/xfs_rtgroup.c
> > @@ -0,0 +1,196 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +/*
> > + * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
> > + * Author: Darrick J. Wong <djwong@kernel.org>
> > + */
> > +#include "xfs.h"
> > +#include "xfs_fs.h"
> > +#include "xfs_shared.h"
> > +#include "xfs_format.h"
> > +#include "xfs_trans_resv.h"
> > +#include "xfs_bit.h"
> > +#include "xfs_sb.h"
> > +#include "xfs_mount.h"
> > +#include "xfs_btree.h"
> > +#include "xfs_alloc_btree.h"
> > +#include "xfs_rmap_btree.h"
> > +#include "xfs_alloc.h"
> > +#include "xfs_ialloc.h"
> > +#include "xfs_rmap.h"
> > +#include "xfs_ag.h"
> > +#include "xfs_ag_resv.h"
> > +#include "xfs_health.h"
> > +#include "xfs_error.h"
> > +#include "xfs_bmap.h"
> > +#include "xfs_defer.h"
> > +#include "xfs_log_format.h"
> > +#include "xfs_trans.h"
> > +#include "xfs_trace.h"
> > +#include "xfs_inode.h"
> > +#include "xfs_icache.h"
> > +#include "xfs_rtgroup.h"
> > +#include "xfs_rtbitmap.h"
> > +
> > +/*
> > + * Passive reference counting access wrappers to the rtgroup structures.  If
> > + * the rtgroup structure is to be freed, the freeing code is responsible for
> > + * cleaning up objects with passive references before freeing the structure.
> > + */
> > +struct xfs_rtgroup *
> > +xfs_rtgroup_get(
> > +	struct xfs_mount	*mp,
> > +	xfs_rgnumber_t		rgno)
> > +{
> > +	struct xfs_rtgroup	*rtg;
> > +
> > +	rcu_read_lock();
> > +	rtg = xa_load(&mp->m_rtgroups, rgno);
> > +	if (rtg) {
> > +		trace_xfs_rtgroup_get(rtg, _RET_IP_);
> > +		ASSERT(atomic_read(&rtg->rtg_ref) >= 0);
> > +		atomic_inc(&rtg->rtg_ref);
> > +	}
> > +	rcu_read_unlock();
> > +	return rtg;
> > +}
> > +
> > +/* Get a passive reference to the given rtgroup. */
> > +struct xfs_rtgroup *
> > +xfs_rtgroup_hold(
> > +	struct xfs_rtgroup	*rtg)
> > +{
> > +	ASSERT(atomic_read(&rtg->rtg_ref) > 0 ||
> > +	       atomic_read(&rtg->rtg_active_ref) > 0);
> > +
> > +	trace_xfs_rtgroup_hold(rtg, _RET_IP_);
> > +	atomic_inc(&rtg->rtg_ref);
> > +	return rtg;
> > +}
> > +
> > +void
> > +xfs_rtgroup_put(
> > +	struct xfs_rtgroup	*rtg)
> > +{
> > +	trace_xfs_rtgroup_put(rtg, _RET_IP_);
> > +	ASSERT(atomic_read(&rtg->rtg_ref) > 0);
> > +	atomic_dec(&rtg->rtg_ref);
> > +}
> > +
> > +/*
> > + * Active references for rtgroup structures. This is for short term access to
> > + * the rtgroup structures for walking trees or accessing state. If an rtgroup
> > + * is being shrunk or is offline, then this will fail to find that group and
> > + * return NULL instead.
> > + */
> > +struct xfs_rtgroup *
> > +xfs_rtgroup_grab(
> > +	struct xfs_mount	*mp,
> > +	xfs_agnumber_t		agno)
> > +{
> > +	struct xfs_rtgroup	*rtg;
> > +
> > +	rcu_read_lock();
> > +	rtg = xa_load(&mp->m_rtgroups, agno);
> > +	if (rtg) {
> > +		trace_xfs_rtgroup_grab(rtg, _RET_IP_);
> > +		if (!atomic_inc_not_zero(&rtg->rtg_active_ref))
> > +			rtg = NULL;
> > +	}
> > +	rcu_read_unlock();
> > +	return rtg;
> > +}
> > +
> > +void
> > +xfs_rtgroup_rele(
> > +	struct xfs_rtgroup	*rtg)
> > +{
> > +	trace_xfs_rtgroup_rele(rtg, _RET_IP_);
> > +	if (atomic_dec_and_test(&rtg->rtg_active_ref))
> > +		wake_up(&rtg->rtg_active_wq);
> > +}
> 
> This is all duplicates of the xfs_perag code. Can you put together a
> patchset to abstract this into a "xfs_group" and embed them in both
> the perag and rtgroup structures?
> 
> That way we only need one set of lookup and iterator infrastructure,
> and it will work for both data and rt groups...

How will that work with perags still using the radix tree and rtgroups
using the xarray?  Yes, we should move the perags to use the xarray too
(and indeed hch already has a series on list to do that) but here's
really not the time to do that because I don't want to frontload a bunch
more core changes onto this already huge patchset.

> > +
> > +/* Compute the number of rt extents in this realtime group. */
> > +xfs_rtxnum_t
> > +xfs_rtgroup_extents(
> > +	struct xfs_mount	*mp,
> > +	xfs_rgnumber_t		rgno)
> > +{
> > +	xfs_rgnumber_t		rgcount = mp->m_sb.sb_rgcount;
> > +
> > +	ASSERT(rgno < rgcount);
> > +	if (rgno == rgcount - 1)
> > +		return mp->m_sb.sb_rextents -
> > +			((xfs_rtxnum_t)rgno * mp->m_sb.sb_rgextents);
> 
> Urk. So this relies on a non-rtgroup filesystem doing a
> multiplication by zero of a field that the on-disk format does not
> understand to get the right result.  I think this is copying a bad
> pattern we've been slowly trying to remove from the normal
> allocation group code.
> 
> > +
> > +	ASSERT(xfs_has_rtgroups(mp));
> > +	return mp->m_sb.sb_rgextents;
> > +}
> 
> We already embed the length of the rtgroup in the rtgroup structure.
> This should be looking up the rtgroup (or being passed the rtgroup
> the caller already has) and doing the right thing. i.e.
> 
> 	if (!rtg || !xfs_has_rtgroups(rtg->rtg_mount))
> 		return mp->m_sb.sb_rextents;
> 	return rtg->rtg_extents;

xfs_rtgroup_extents is the function that we use to set rtg->rtg_extents.

> > diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
> > new file mode 100644
> > index 0000000000000..2c09ecfc50328
> > --- /dev/null
> > +++ b/fs/xfs/libxfs/xfs_rtgroup.h
> > @@ -0,0 +1,212 @@
> > +/* SPDX-License-Identifier: GPL-2.0-or-later */
> > +/*
> > + * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
> > + * Author: Darrick J. Wong <djwong@kernel.org>
> > + */
> > +#ifndef __LIBXFS_RTGROUP_H
> > +#define __LIBXFS_RTGROUP_H 1
> > +
> > +struct xfs_mount;
> > +struct xfs_trans;
> > +
> > +/*
> > + * Realtime group incore structure, similar to the per-AG structure.
> > + */
> > +struct xfs_rtgroup {
> > +	struct xfs_mount	*rtg_mount;
> > +	xfs_rgnumber_t		rtg_rgno;
> > +	atomic_t		rtg_ref;	/* passive reference count */
> > +	atomic_t		rtg_active_ref;	/* active reference count */
> > +	wait_queue_head_t	rtg_active_wq;/* woken active_ref falls to zero */
> 
> Yeah, that's all common with xfs_perag....
> 
> ....
> > +/*
> > + * rt group iteration APIs
> > + */
> > +static inline struct xfs_rtgroup *
> > +xfs_rtgroup_next(
> > +	struct xfs_rtgroup	*rtg,
> > +	xfs_rgnumber_t		*rgno,
> > +	xfs_rgnumber_t		end_rgno)
> > +{
> > +	struct xfs_mount	*mp = rtg->rtg_mount;
> > +
> > +	*rgno = rtg->rtg_rgno + 1;
> > +	xfs_rtgroup_rele(rtg);
> > +	if (*rgno > end_rgno)
> > +		return NULL;
> > +	return xfs_rtgroup_grab(mp, *rgno);
> > +}
> > +
> > +#define for_each_rtgroup_range(mp, rgno, end_rgno, rtg) \
> > +	for ((rtg) = xfs_rtgroup_grab((mp), (rgno)); \
> > +		(rtg) != NULL; \
> > +		(rtg) = xfs_rtgroup_next((rtg), &(rgno), (end_rgno)))
> > +
> > +#define for_each_rtgroup_from(mp, rgno, rtg) \
> > +	for_each_rtgroup_range((mp), (rgno), (mp)->m_sb.sb_rgcount - 1, (rtg))
> > +
> > +
> > +#define for_each_rtgroup(mp, rgno, rtg) \
> > +	(rgno) = 0; \
> > +	for_each_rtgroup_from((mp), (rgno), (rtg))
> 
> Yup, that's all common with xfs_perag iteration, too. Can you put
> together a patchset to unify these, please?

Here's the first part of that, to convert perags to xarrays...
https://lore.kernel.org/linux-xfs/20240821063901.650776-1-hch@lst.de/

> > +static inline bool
> > +xfs_verify_rgbno(
> > +	struct xfs_rtgroup	*rtg,
> > +	xfs_rgblock_t		rgbno)
> 
> Ok, what's the difference between an xfs_rgblock_t and a "rtx"?
> 
> OH.... The penny just dropped - it's another "single letter
> difference that's really, really hard to spot" problem. You've
> defined "xfs_r*g*block_t" to be like an a*g*bno, but we have
> xfs_r*t*block_t for the global 64bit block number instead of a
> xfs_fsbno_t.
> 
> We just had a bug caused by exactly this sort of confusion with a
> patch that mixed xfs_[f]inobt changes together and one of the
> conversions was incorrect. Nobody spotted the single incorrect
> letter in the bigger patch, and I can see -exactly- the same sort of
> confusion happening with rtblock vs rgblock causing implicit 32/64
> bit integer promotion bugs...
> 
> > +{
> > +	struct xfs_mount	*mp = rtg->rtg_mount;
> > +
> > +	if (rgbno >= rtg->rtg_extents * mp->m_sb.sb_rextsize)
> > +		return false;
> 
> Why isn't the max valid "rgbno" stored in the rtgroup instead of
> having to multiply the extent count by extent size every time we
> have to verify a rgbno? (i.e. same as pag->block_count).
> 
> We know from the agbno verification this will be a -very- hot path,
> and so precalculating all the constants and storing them in the rtg
> should be done right from the start here.
> 
> > +	if (xfs_has_rtsb(mp) && rtg->rtg_rgno == 0 &&
> > +	    rgbno < mp->m_sb.sb_rextsize)
> > +		return false;
> 
> Same here - this value is stored in pag->min_block...
> 
> > +	return true;
> > +}
> 
> And then, if we put the max_bno and min_bno in the generic
> "xfs_group" structure, we suddenly have a generic "group bno"
> verification mechanism that is independent of whether the group is
> an AG or an rtg:
> 
> static inline bool
> xfs_verify_gbno(
>      struct xfs_group      *g,
>      xfs_gblock_t         gbno)
> {
>      if (gbno >= g->block_count)
>              return false;
>      if (gbno < g->min_block)
>              return false;
>      return true;
> }
> 
> And the rest of these functions fall out the same way....
> 
> 
> > +static inline xfs_rtblock_t
> > +xfs_rgno_start_rtb(
> > +	struct xfs_mount	*mp,
> > +	xfs_rgnumber_t		rgno)
> > +{
> > +	if (mp->m_rgblklog >= 0)
> > +		return ((xfs_rtblock_t)rgno << mp->m_rgblklog);
> > +	return ((xfs_rtblock_t)rgno * mp->m_rgblocks);
> > +}
> 
> Where does mp->m_rgblklog come from? That wasn't added to the
> on-disk superblock structure and it is always initialised to zero
> in this patch.
> 
> When will m_rgblklog be zero and when will it be non-zero? If it's

As I mentioned before, this patch merely ports non-rtg filesystems to
use the rtgroup structure.  m_rgblklog will be set to nonzero values
when we get to defining the ondisk rtgroup structure.

But, to cut ahead here, m_rgblklog will be set to a non-negative value
if the rtgroup size (in blocks) is a power of two.  Then these unit
conversion functions can use shifts instead of expensive multiplication
and divisions.  The same goes for rt extent to {fs,rt}block conversions.

> only going to be zero for existing non-rtg realtime systems,
> then this code makes little sense (again, relying on multiplication
> by zero to get the right result). If it's not always used for
> rtg enabled filesytsems, then the reason for that has not been
> explained and I can't work out why this would ever need to be done.

https://lore.kernel.org/linux-xfs/172437088534.60592.14072619855969226822.stgit@frogsfrogsfrogs/

> 
> > +static inline xfs_rtblock_t
> > +xfs_rgbno_to_rtb(
> > +	struct xfs_mount	*mp,
> > +	xfs_rgnumber_t		rgno,
> > +	xfs_rgblock_t		rgbno)
> > +{
> > +	return xfs_rgno_start_rtb(mp, rgno) + rgbno;
> > +}
> > +
> > +static inline xfs_rgnumber_t
> > +xfs_rtb_to_rgno(
> > +	struct xfs_mount	*mp,
> > +	xfs_rtblock_t		rtbno)
> > +{
> > +	if (!xfs_has_rtgroups(mp))
> > +		return 0;
> > +
> > +	if (mp->m_rgblklog >= 0)
> > +		return rtbno >> mp->m_rgblklog;
> > +
> > +	return div_u64(rtbno, mp->m_rgblocks);
> > +}
> 
> Ah, now I'm really confused, because m_rgblklog is completely
> bypassed for legacy rt filesystems.
> 
> And I just realised, this "if (mp->m_rgblklog >= 0)" implies that
> m_rgblklog can have negative values and there's no comments anywhere
> about why that can happen and what would trigger it. 

-1 is the magic value for "the rtgroup size is not a power of two, so
you have to use slow integer division and multiplication".
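i.e. the mount-time setup will look roughly like this (sketch, not
the exact code from the later patch):

	if (is_power_of_2(mp->m_rgblocks))
		mp->m_rgblklog = ilog2(mp->m_rgblocks);
	else
		mp->m_rgblklog = -1;	/* force the div/mult paths */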

> We validate sb_agblklog during the superblock verifier, and so once
> the filesystem is mounted we never, ever need to check whether
> sb_agblklog is in range. Why is the rtblklog being handled so
> differently here?

This all could be documented better across the patches.  Originally the
incore and ondisk patches were adjacent and it was at least somewhat
easier to figure these things out, but hch really wanted to shard the
rtbitmaps, so now the patchset has grown even larger and harder to
understand if you only read one patch at a time.

> > +
> > +static inline uint64_t
> > +__xfs_rtb_to_rgbno(
> > +	struct xfs_mount	*mp,
> > +	xfs_rtblock_t		rtbno)
> > +{
> > +	uint32_t		rem;
> > +
> > +	if (!xfs_has_rtgroups(mp))
> > +		return rtbno;
> > +
> > +	if (mp->m_rgblklog >= 0)
> > +		return rtbno & mp->m_rgblkmask;
> > +
> > +	div_u64_rem(rtbno, mp->m_rgblocks, &rem);
> > +	return rem;
> > +}
> 
> Why is this function returning a uint64_t - a xfs_rgblock_t is only
> a 32 bit type...

group 0 on a !rtg filesystem can be 64-bits in block/rt count.  This is
a /very/ annoying pain point -- if you actually created such a
filesystem it would never work because the rtsummary file would
be created undersized due to an integer overflow, but the verifiers
never checked any of that, and due to the same overflow the rtallocator
would search the wrong places and (eventually) fall back to a dumb
linear scan.

Soooooo this is an obnoxious usecase (broken large !rtg filesystems)
that we can't just drop, though I'm pretty sure there aren't any systems
in the wild.

> > +
> > +static inline xfs_rgblock_t
> > +xfs_rtb_to_rgbno(
> > +	struct xfs_mount	*mp,
> > +	xfs_rtblock_t		rtbno)
> > +{
> > +	return __xfs_rtb_to_rgbno(mp, rtbno);
> > +}
> > +
> > +static inline xfs_daddr_t
> > +xfs_rtb_to_daddr(
> > +	struct xfs_mount	*mp,
> > +	xfs_rtblock_t		rtbno)
> > +{
> > +	return rtbno << mp->m_blkbb_log;
> > +}
> > +
> > +static inline xfs_rtblock_t
> > +xfs_daddr_to_rtb(
> > +	struct xfs_mount	*mp,
> > +	xfs_daddr_t		daddr)
> > +{
> > +	return daddr >> mp->m_blkbb_log;
> > +}
> 
> Ah. This code doesn't sparsify the xfs_rtblock_t address space for
> rtgroups. xfs_rtblock_t is still direct physical encoding of the
> location on disk.

Yes.

> I really think that needs to be changed to match how xfs_fsbno_t is
> a sparse encoding before these changes get merged. It shouldn't
> affect any of the other code in the patch set - the existing rt code
> has a rtgno of 0, so it will always be a direct physical encoding
> even when using a sparse xfs_rtblock_t address space.
> 
> All that moving to a sparse encoding means is that the addresses
> stored in the BMBT are logical addresses rather than physical
> addresses.  It should not affect any of the other code, just what
> ends up stored on disk for global 64-bit rt extent addresses...
> 
> In doing this, I think we can greatly simplify all this group
> management stuff as most of the verification, type conversion and
> iteration infrastructure can then be shared between the existing perag
> and the new rtg infrastructure....
> 
> > diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
> > index a8cd44d03ef64..1ce4b9eb16f47 100644
> > --- a/fs/xfs/libxfs/xfs_types.h
> > +++ b/fs/xfs/libxfs/xfs_types.h
> > @@ -9,10 +9,12 @@
> >  typedef uint32_t	prid_t;		/* project ID */
> >  
> >  typedef uint32_t	xfs_agblock_t;	/* blockno in alloc. group */
> > +typedef uint32_t	xfs_rgblock_t;	/* blockno in realtime group */
> 
> Is that right? The rtg length is 2^32 * rtextsize, and rtextsize can
> be 2^20 bytes:
> 
> #define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024)

No, the maximum rtgroup length is 2^32-1 blocks.

> Hence for a 4kB fsbno filesystem, the actual maximum size of an rtg
> in filesystem blocks far exceeds what we can address with a 32 bit
> variable.
> 
> If xfs_rgblock_t is actually indexing multi-fsbno rtextents, then it
> is an extent number index, not a "block" index. An extent number
> index won't overflow 32 bits (because the rtg has a max of 2^32 - 1
> rtextents)
> 
> IOWs, shouldn't this be named something like:
> 
> typedef uint32_t	xfs_rgext_t;	/* extent number in realtime group */

and again, we can't do that because we emulate !rtg filesystems with a
single "rtgroup" that can be more than 2^32 rtx long.

> >  typedef uint32_t	xfs_agino_t;	/* inode # within allocation grp */
> >  typedef uint32_t	xfs_extlen_t;	/* extent length in blocks */
> >  typedef uint32_t	xfs_rtxlen_t;	/* file extent length in rtextents */
> >  typedef uint32_t	xfs_agnumber_t;	/* allocation group number */
> > +typedef uint32_t	xfs_rgnumber_t;	/* realtime group number */
> >  typedef uint64_t	xfs_extnum_t;	/* # of extents in a file */
> >  typedef uint32_t	xfs_aextnum_t;	/* # extents in an attribute fork */
> >  typedef int64_t		xfs_fsize_t;	/* bytes in a file */
> > @@ -53,7 +55,9 @@ typedef void *		xfs_failaddr_t;
> >  #define	NULLFILEOFF	((xfs_fileoff_t)-1)
> >  
> >  #define	NULLAGBLOCK	((xfs_agblock_t)-1)
> > +#define NULLRGBLOCK	((xfs_rgblock_t)-1)
> >  #define	NULLAGNUMBER	((xfs_agnumber_t)-1)
> > +#define	NULLRGNUMBER	((xfs_rgnumber_t)-1)
> 
> What's the maximum valid rtg number? We're not ever going to be
> supporting 2^32 - 2 rtgs, so what is a realistic maximum we can cap
> this at and validate it at?

/me shrugs -- the smallest AG size on the data device is 16M, which
technically speaking means that one /could/ format 2^(63-24) groups,
or order 39.

Realistically with the maximum rtgroup size of 2^31 blocks, we probably
only need 2^(63 - (31 + 10)) = 2^22 rtgroups max on a 1k fsblock fs.
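So if we wanted a cap, something like this would be defensible
(hypothetical constant, per the arithmetic above):

	/* max realtime groups we'll ever allow in a filesystem */
	#define XFS_MAX_RGNUMBER	((xfs_rgnumber_t)(1U << 22))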

> >  #define NULLCOMMITLSN	((xfs_lsn_t)-1)
> >  
> > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> > index 4423dd344239b..c627cde3bb1e0 100644
> > --- a/fs/xfs/xfs_log_recover.c
> > +++ b/fs/xfs/xfs_log_recover.c
> > @@ -28,6 +28,7 @@
> >  #include "xfs_ag.h"
> >  #include "xfs_quota.h"
> >  #include "xfs_reflink.h"
> > +#include "xfs_rtgroup.h"
> >  
> >  #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
> >  
> > @@ -3346,6 +3347,7 @@ xlog_do_recover(
> >  	struct xfs_mount	*mp = log->l_mp;
> >  	struct xfs_buf		*bp = mp->m_sb_bp;
> >  	struct xfs_sb		*sbp = &mp->m_sb;
> > +	xfs_rgnumber_t		old_rgcount = sbp->sb_rgcount;
> >  	int			error;
> >  
> >  	trace_xfs_log_recover(log, head_blk, tail_blk);
> > @@ -3399,6 +3401,24 @@ xlog_do_recover(
> >  		xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
> >  		return error;
> >  	}
> > +
> > +	if (sbp->sb_rgcount < old_rgcount) {
> > +		xfs_warn(mp, "rgcount shrink not supported");
> > +		return -EINVAL;
> > +	}
> > +	if (sbp->sb_rgcount > old_rgcount) {
> > +		xfs_rgnumber_t		rgno;
> > +
> > +		for (rgno = old_rgcount; rgno < sbp->sb_rgcount; rgno++) {
> > +			error = xfs_rtgroup_alloc(mp, rgno);
> > +			if (error) {
> > +				xfs_warn(mp,
> > +	"Failed post-recovery rtgroup init: %d",
> > +						error);
> > +				return error;
> > +			}
> > +		}
> > +	}
> 
> Please factor this out into a separate function with all the other
> rtgroup init/teardown code. That means we don't have to care about
> how rtgrowfs functions in recovery code, similar to the
> xfs_initialize_perag() already in this function for handling
> recovery of data device growing...
> 
> >  	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
> >  
> >  	/* Normal transactions can now occur */
> > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > index b0ea88acdb618..e1e849101cdd4 100644
> > --- a/fs/xfs/xfs_mount.c
> > +++ b/fs/xfs/xfs_mount.c
> > @@ -36,6 +36,7 @@
> >  #include "xfs_ag.h"
> >  #include "xfs_rtbitmap.h"
> >  #include "xfs_metafile.h"
> > +#include "xfs_rtgroup.h"
> >  #include "scrub/stats.h"
> >  
> >  static DEFINE_MUTEX(xfs_uuid_table_mutex);
> > @@ -664,6 +665,7 @@ xfs_mountfs(
> >  	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
> >  	uint			quotamount = 0;
> >  	uint			quotaflags = 0;
> > +	xfs_rgnumber_t		rgno;
> >  	int			error = 0;
> >  
> >  	xfs_sb_mount_common(mp, sbp);
> > @@ -830,10 +832,18 @@ xfs_mountfs(
> >  		goto out_free_dir;
> >  	}
> >  
> > +	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
> > +		error = xfs_rtgroup_alloc(mp, rgno);
> > +		if (error) {
> > +			xfs_warn(mp, "Failed rtgroup init: %d", error);
> > +			goto out_free_rtgroup;
> > +		}
> > +	}
> 
> Same - factor this to a xfs_rtgroup_init() function located with the
> rest of the rtgroup infrastructure...
> 
> > +
> >  	if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
> >  		xfs_warn(mp, "no log defined");
> >  		error = -EFSCORRUPTED;
> > -		goto out_free_perag;
> > +		goto out_free_rtgroup;
> >  	}
> >  
> >  	error = xfs_inodegc_register_shrinker(mp);
> > @@ -1068,7 +1078,8 @@ xfs_mountfs(
> >  	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
> >  		xfs_buftarg_drain(mp->m_logdev_targp);
> >  	xfs_buftarg_drain(mp->m_ddev_targp);
> > - out_free_perag:
> > + out_free_rtgroup:
> > +	xfs_free_rtgroups(mp, rgno);
> >  	xfs_free_perag(mp);
> >   out_free_dir:
> >  	xfs_da_unmount(mp);
> > @@ -1152,6 +1163,7 @@ xfs_unmountfs(
> >  	xfs_errortag_clearall(mp);
> >  #endif
> >  	shrinker_free(mp->m_inodegc_shrinker);
> > +	xfs_free_rtgroups(mp, mp->m_sb.sb_rgcount);
> 
> ... like you've already for the cleanup side ;)
> 
> ....
> 
> > @@ -1166,6 +1169,9 @@ xfs_rtmount_inodes(
> >  	if (error)
> >  		goto out_rele_summary;
> >  
> > +	for_each_rtgroup(mp, rgno, rtg)
> > +		rtg->rtg_extents = xfs_rtgroup_extents(mp, rtg->rtg_rgno);
> > +
> 
> This also needs to be done after recovery has initialised new rtgs
> as a result of replaying a sb growfs modification, right?
> 
> Which leads to the next question: if there are thousands of rtgs,
> this requires walking every rtg at mount time, right? We know that
> walking thousands of static structures at mount time is a
> scalability issue, so can we please avoid this if at all possible?
> i.e. do demand loading of per-rtg metadata when it is first required
> (like we do with agf/agi information) rather than doing it all at
> mount time...

Sounds like a reasonable optimization patch...

--D

> 
> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
Dave Chinner Aug. 27, 2024, 12:57 a.m. UTC | #4
On Mon, Aug 26, 2024 at 12:14:04PM -0700, Darrick J. Wong wrote:
> On Mon, Aug 26, 2024 at 09:56:08AM +1000, Dave Chinner wrote:
> > On Thu, Aug 22, 2024 at 05:17:31PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <djwong@kernel.org>
> > > 
> > > Create an incore object that will contain information about a realtime
> > > allocation group.  This will eventually enable us to shard the realtime
> > > section in a similar manner to how we shard the data section, but for
> > > now just a single object for the entire RT subvolume is created.
> > > 
> > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > ---
> > >  fs/xfs/Makefile             |    1 
> > >  fs/xfs/libxfs/xfs_format.h  |    3 +
> > >  fs/xfs/libxfs/xfs_rtgroup.c |  196 ++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/libxfs/xfs_rtgroup.h |  212 +++++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/libxfs/xfs_sb.c      |    7 +
> > >  fs/xfs/libxfs/xfs_types.h   |    4 +
> > >  fs/xfs/xfs_log_recover.c    |   20 ++++
> > >  fs/xfs/xfs_mount.c          |   16 +++
> > >  fs/xfs/xfs_mount.h          |   14 +++
> > >  fs/xfs/xfs_rtalloc.c        |    6 +
> > >  fs/xfs/xfs_super.c          |    1 
> > >  fs/xfs/xfs_trace.c          |    1 
> > >  fs/xfs/xfs_trace.h          |   38 ++++++++
> > >  13 files changed, 517 insertions(+), 2 deletions(-)
> > >  create mode 100644 fs/xfs/libxfs/xfs_rtgroup.c
> > >  create mode 100644 fs/xfs/libxfs/xfs_rtgroup.h
> > 
> > Ok, how is the global address space for real time extents laid out
> > across rt groups? i.e. is it sparse similar to how fsbnos and inode
> > numbers are created for the data device like so?
> > 
> > 	fsbno = (agno << agblklog) | agbno
> > 
> > Or is it something different? I can't find that defined anywhere in
> > this patch, so I can't determine if the unit conversion code and
> > validation is correct or not...
> 
> They're not sparse like fsbnos on the data device, they're laid end to
> end.  IOWs, it's a straight linear translation.  If you have an rtgroup
> that is 50 blocks long, then rtgroup 1 starts at (50 * blocksize).

Yes, I figured that out later. I think that's less than optimal,
because it essentially repeats the problems we have with AGs being
fixed size without the potential for fixing it easily. i.e. the
global sharded fsbno address space is sparse, so we can actually
space out the sparse address regions to allow future flexibility in
group size and location work.

By having the rtgroup addressing being purely physical, we're
completely stuck with fixed sized rtgroups and there is no way
around that. IOWs, the physical address space sharding repeats the
existing grow and shrink problems we have with the existing fixed
size AGs.

We're discussing how to use the sparse fsbno addressing to allow
resizing of AGs, but we will not be able to do that at all with
rtgroups as they stand. The limitation is that a 64 bit global rt extent
address is essentially the physical address of the extent in the block
device LBA space.

> 
> This patch, FWIW, refactors the existing rt code so that a !rtgroups
> filesystem is represented by one large "group", with xfs_rtxnum_t now
> indexing rt extents within a group. 

Right, we can do that regardless of whether we use logical or
physical addressing for the global rtbno for sharded rtgroup layout.
The rtgno of 0 for that rtg always results in logical = physical
addressing.

> Probably it should be renamed to xfs_rgxnum_t.

That might be a good idea.

> Note that we haven't defined the rtgroup ondisk format yet, so I'll go
> amend that patch to spell out the ondisk format of the brave new world.

Yes, please! That would have made working out all the differences
between all the combinations of rt, rtx, rg, num, len, blk, etc a
whole lot easier.
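For my own reference while reviewing, my reading of the terms so
far: rtx/xfs_rtxnum_t is a realtime extent number, xfs_rtxlen_t a
length in rt extents, xfs_rgblock_t a block offset within an
rtgroup, xfs_rtblock_t the global 64 bit rt block number, and
xfs_rgnumber_t an rtgroup number.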

> > > +struct xfs_rtgroup *
> > > +xfs_rtgroup_grab(
> > > +	struct xfs_mount	*mp,
> > > +	xfs_agnumber_t		agno)
> > > +{
> > > +	struct xfs_rtgroup	*rtg;
> > > +
> > > +	rcu_read_lock();
> > > +	rtg = xa_load(&mp->m_rtgroups, agno);
> > > +	if (rtg) {
> > > +		trace_xfs_rtgroup_grab(rtg, _RET_IP_);
> > > +		if (!atomic_inc_not_zero(&rtg->rtg_active_ref))
> > > +			rtg = NULL;
> > > +	}
> > > +	rcu_read_unlock();
> > > +	return rtg;
> > > +}
> > > +
> > > +void
> > > +xfs_rtgroup_rele(
> > > +	struct xfs_rtgroup	*rtg)
> > > +{
> > > +	trace_xfs_rtgroup_rele(rtg, _RET_IP_);
> > > +	if (atomic_dec_and_test(&rtg->rtg_active_ref))
> > > +		wake_up(&rtg->rtg_active_wq);
> > > +}
> > 
> > This is all duplicates of the xfs_perag code. Can you put together a
> > patchset to abstract this into a "xfs_group" and embed them in both
> > the perag and rtgroup structures?
> > 
> > That way we only need one set of lookup and iterator infrastructure,
> > and it will work for both data and rt groups...
> 
> How will that work with perags still using the radix tree and rtgroups
> using the xarray?  Yes, we should move the perags to use the xarray too
> (and indeed hch already has a series on list to do that) but now is
> really not the time to do that because I don't want to frontload a bunch
> more core changes onto this already huge patchset.

Let's first assume they both use xarray (that's just a matter of
time, yes?) so it's easier to reason about. Then we have something
like this:

/*
 * xfs_group - a contiguous 32 bit block address space group
 */
struct xfs_group {
	struct xarray		xarr;
	u32			num_groups;
};

struct xfs_group_item {
	struct xfs_group	*group; /* so put/rele don't need any other context */
	u32			gno;
	atomic_t		passive_refs;
	atomic_t		active_refs;
	wait_queue_head_t	active_wq;
	unsigned long		opstate;

	u32			blocks;		/* length in fsb */
	u32			extents;	/* length in extents */
	u32			blk_log;	/* extent size in fsb */

	/* limits for min/max valid addresses */
	u32			max_addr;
	u32			min_addr;
};

And then we define:

struct xfs_perag {
	struct xfs_group_item	g;

	/* perag specific stuff follows */
	....
};

struct xfs_rtgroup {
	struct xfs_group_item	g;

	/* rtg specific stuff follows */
	.....

}

And then a couple of generic macros:

#define to_grpi(grpi, gi)	container_of((gi), typeof(*(grpi)), g)
#define to_gi(grpi)		(&(grpi)->g)

though this might be better as just typed macros:

#define gi_to_pag(gi)	container_of((gi), struct xfs_perag, g)
#define gi_to_rtg(gi)	container_of((gi), struct xfs_rtgroup, g)

And then all the grab/rele/get/put stuff becomes:

	rtg = to_grpi(rtg, xfs_group_grab(mp->m_rtgroups, rgno));
	pag = to_grpi(pag, xfs_group_grab(mp->m_perags, agno));
	....
	xfs_group_put(&rtg->g);
	xfs_group_put(&pag->g);


or

	rtg = gi_to_rtg(xfs_group_grab(mp->m_rtgroups, rgno));
	pag = gi_to_pag(xfs_group_grab(mp->m_perags, agno));
	....
	xfs_group_put(&rtg->g);
	xfs_group_put(&pag->g);


then we pass the group to each of the "for_each_group..." iterators
like so:

	for_each_group(&mp->m_perags, agno, pag) {
		/* do stuff with pag */
	}

or
	for_each_group(&mp->m_rtgroups, rtgno, rtg) {
		/* do stuff with rtg */
	}

And we use typeof() and container_of() to access the group structure
within the pag/rtg. Something like:

#define to_grpi(grpi, gi)	container_of((gi), typeof(*(grpi)), g)
#define to_gi(grpi)		(&(grpi)->g)

#define for_each_group(grp, gno, grpi)					\
	(gno) = 0;							\
	for ((grpi) = to_grpi((grpi), xfs_group_grab((grp), (gno)));	\
	     (grpi) != NULL;						\
	     (grpi) = to_grpi(grpi, xfs_group_next((grp), to_gi(grpi),	\
					&(gno), (grp)->num_groups)))

And now we essentially have common group infrastructure for
access, iteration, geometry and address verification purposes...

> 
> > > +
> > > +/* Compute the number of rt extents in this realtime group. */
> > > +xfs_rtxnum_t
> > > +xfs_rtgroup_extents(
> > > +	struct xfs_mount	*mp,
> > > +	xfs_rgnumber_t		rgno)
> > > +{
> > > +	xfs_rgnumber_t		rgcount = mp->m_sb.sb_rgcount;
> > > +
> > > +	ASSERT(rgno < rgcount);
> > > +	if (rgno == rgcount - 1)
> > > +		return mp->m_sb.sb_rextents -
> > > +			((xfs_rtxnum_t)rgno * mp->m_sb.sb_rgextents);
> > 
> > Urk. So this relies on a non-rtgroup filesystem doing a
> > multiplication by zero of a field that the on-disk format does not
> > understand to get the right result.  I think this is copying a bad
> > pattern we've been slowly trying to remove from the normal
> > allocation group code.
> > 
> > > +
> > > +	ASSERT(xfs_has_rtgroups(mp));
> > > +	return mp->m_sb.sb_rgextents;
> > > +}
> > 
> > We already embed the length of the rtgroup in the rtgroup structure.
> > This should be looking up the rtgroup (or being passed the rtgroup
> > the caller already has) and doing the right thing. i.e.
> > 
> > 	if (!rtg || !xfs_has_rtgroups(rtg->rtg_mount))
> > 		return mp->m_sb.sb_rextents;
> > 	return rtg->rtg_extents;
> 
> xfs_rtgroup_extents is the function that we use to set rtg->rtg_extents.

That wasn't clear from the context of the patch. Perhaps a better
name would be xfs_rtgroup_calc_extents(), to indicate that it is a setup
function, not something that should be regularly called at runtime?

> 
> > > +static inline xfs_rtblock_t
> > > +xfs_rgno_start_rtb(
> > > +	struct xfs_mount	*mp,
> > > +	xfs_rgnumber_t		rgno)
> > > +{
> > > +	if (mp->m_rgblklog >= 0)
> > > +		return ((xfs_rtblock_t)rgno << mp->m_rgblklog);
> > > +	return ((xfs_rtblock_t)rgno * mp->m_rgblocks);
> > > +}
> > 
> > Where does mp->m_rgblklog come from? That wasn't added to the
> > on-disk superblock structure and it is always initialised to zero
> > in this patch.
> > 
> > When will m_rgblklog be zero and when will it be non-zero? If it's
> 
> As I mentioned before, this patch merely ports non-rtg filesystems to
> use the rtgroup structure.  m_rgblklog will be set to nonzero values
> when we get to defining the ondisk rtgroup structure.

Yeah, which makes some of the context in the patch hard to
understand... :/

> But, to cut ahead here, m_rgblklog will be set to a non-negative value
> if the rtgroup size (in blocks) is a power of two.  Then these unit
> conversion functions can use shifts instead of expensive multiplication
> and divisions.  The same goes for rt extent to {fs,rt}block conversions.

yeah, so mp->m_rgblklog is not equivalent to mp->m_agblklog at all.
It took me some time to understand that - the names are the same,
they are used in similar address conversions, but they have
completely different functions.

I suspect we need some better naming here, regardless of the
rtgroups global address space layout discussion...
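
To make the difference concrete, something like this is what falls out
(rough sketch only, keeping today's names):

	/* data device: m_agblklog is always a valid shift */
	agno = XFS_FSB_TO_AGNO(mp, fsbno);

	/* rt device: m_rgblklog is only a shift when the rtgroup size
	 * is a power of two, otherwise we're back to 64 bit division. */
	if (mp->m_rgblklog >= 0)
		rgno = rtbno >> mp->m_rgblklog;
	else
		rgno = div_u64(rtbno, mp->m_rgblocks);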

> > > +
> > > +static inline uint64_t
> > > +__xfs_rtb_to_rgbno(
> > > +	struct xfs_mount	*mp,
> > > +	xfs_rtblock_t		rtbno)
> > > +{
> > > +	uint32_t		rem;
> > > +
> > > +	if (!xfs_has_rtgroups(mp))
> > > +		return rtbno;
> > > +
> > > +	if (mp->m_rgblklog >= 0)
> > > +		return rtbno & mp->m_rgblkmask;
> > > +
> > > +	div_u64_rem(rtbno, mp->m_rgblocks, &rem);
> > > +	return rem;
> > > +}
> > 
> > Why is this function returning a uint64_t - a xfs_rgblock_t is only
> > a 32 bit type...
> 
> group 0 on a !rtg filesystem can be 64-bits in block/rt count.  This is
> a /very/ annoying pain point -- if you actually created such a
> filesystem it actually would never work because the rtsummary file would
> be created undersized due to an integer overflow, but the verifiers
> never checked any of that, and due to the same underflow the rtallocator
> would search the wrong places and (eventually) fall back to a dumb
> linear scan.
> 
> Soooooo this is an obnoxious usecase (broken large !rtg filesystems)
> that we can't just drop, though I'm pretty sure there aren't any systems
> in the wild.

Ugh. That definitely needs to be a comment somewhere in the code to
explain this. :/
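
Something like this, perhaps (wording is only a first draft):

	/*
	 * Group 0 on a pre-rtgroups filesystem covers the entire rt
	 * device, so its block and extent counts can exceed 32 bits
	 * even though real rtgroups are capped at 2^32 - 1 blocks.
	 * Keep these conversions 64 bit clean for that legacy case.
	 */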

> > > diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
> > > index a8cd44d03ef64..1ce4b9eb16f47 100644
> > > --- a/fs/xfs/libxfs/xfs_types.h
> > > +++ b/fs/xfs/libxfs/xfs_types.h
> > > @@ -9,10 +9,12 @@
> > >  typedef uint32_t	prid_t;		/* project ID */
> > >  
> > >  typedef uint32_t	xfs_agblock_t;	/* blockno in alloc. group */
> > > +typedef uint32_t	xfs_rgblock_t;	/* blockno in realtime group */
> > 
> > Is that right? The rtg length is 2^32 * rtextsize, and rtextsize can
> > be 2^20 bytes:
> > 
> > #define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024)
> 
> No, the maximum rtgroup length is 2^32-1 blocks.

I couldn't tell if the max length was being defined as the maximum
number of rt extents that the rtgroup could index, or whether it was
the maximum number of filesystem blocks (i.e. data device fsblock
size) that an rtgroup could index...


> > Hence for a 4kB fsbno filesystem, the actual maximum size of an rtg
> > in filesystem blocks far exceeds what we can address with a 32 bit
> > variable.
> > 
> > If xfs_rgblock_t is actually indexing multi-fsbno rtextents, then it
> > is an extent number index, not a "block" index. An extent number
> > index won't overflow 32 bits (because the rtg has a max of 2^32 - 1
> > rtextents)
> > 
IOWs, shouldn't this be named something like:
> > 
> > typedef uint32_t	xfs_rgext_t;	/* extent number in realtime group */
> 
> and again, we can't do that because we emulate !rtg filesystems with a
> single "rtgroup" that can be more than 2^32 rtx long.

*nod*

> > >  typedef uint32_t	xfs_agino_t;	/* inode # within allocation grp */
> > >  typedef uint32_t	xfs_extlen_t;	/* extent length in blocks */
> > >  typedef uint32_t	xfs_rtxlen_t;	/* file extent length in rtextents */
> > >  typedef uint32_t	xfs_agnumber_t;	/* allocation group number */
> > > +typedef uint32_t	xfs_rgnumber_t;	/* realtime group number */
> > >  typedef uint64_t	xfs_extnum_t;	/* # of extents in a file */
> > >  typedef uint32_t	xfs_aextnum_t;	/* # extents in an attribute fork */
> > >  typedef int64_t		xfs_fsize_t;	/* bytes in a file */
> > > @@ -53,7 +55,9 @@ typedef void *		xfs_failaddr_t;
> > >  #define	NULLFILEOFF	((xfs_fileoff_t)-1)
> > >  
> > >  #define	NULLAGBLOCK	((xfs_agblock_t)-1)
> > > +#define NULLRGBLOCK	((xfs_rgblock_t)-1)
> > >  #define	NULLAGNUMBER	((xfs_agnumber_t)-1)
> > > +#define	NULLRGNUMBER	((xfs_rgnumber_t)-1)
> > 
> > What's the maximum valid rtg number? We're not ever going to be
> > supporting 2^32 - 2 rtgs, so what is a realistic maximum we can cap
> > this at and validate it at?
> 
> /me shrugs -- the smallest AG size on the data device is 16M, which
> technically speaking means that one /could/ format 2^(63-24) groups,
> or order 39.
> 
> Realistically with the maximum rtgroup size of 2^31 blocks, we probably
> only need 2^(63 - (31 + 10)) = 2^22 rtgroups max on a 1k fsblock fs.

Right, those are the theoretical maximums. Practically speaking,
though, mkfs and mount iteration of all AGs means millions to
billions of IOs need to be done before the filesystem can even be
fully mounted. Hence the practical limit to AG count is closer to a
few tens of thousands, not hundreds of billions.

Hence I'm wondering if we should actually cap the maximum number of
rtgroups. We're just about at BS > PS, so with a 64k block size a
single rtgroup can index 2^32 * 2^16 bytes which puts individual
rtgs at 256TB in size. Unless there are use cases for rtgroup sizes
smaller than a few GBs, I just don't see the need to support
theoretical maximum counts on tiny block size filesystems. Thirty
thousand rtgs at 256TB per rtg puts us at 64 bit device size limits,
and we hit those limits on 4kB block sizes at around 500,000 rtgs.

So do we need to support millions of rtgs? I'd say no....

-Dave.
Darrick J. Wong Aug. 27, 2024, 1:55 a.m. UTC | #5
On Tue, Aug 27, 2024 at 10:57:34AM +1000, Dave Chinner wrote:
> On Mon, Aug 26, 2024 at 12:14:04PM -0700, Darrick J. Wong wrote:
> > On Mon, Aug 26, 2024 at 09:56:08AM +1000, Dave Chinner wrote:
> > > On Thu, Aug 22, 2024 at 05:17:31PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <djwong@kernel.org>
> > > > 
> > > > Create an incore object that will contain information about a realtime
> > > > allocation group.  This will eventually enable us to shard the realtime
> > > > section in a similar manner to how we shard the data section, but for
> > > > now just a single object for the entire RT subvolume is created.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > > ---
> > > >  fs/xfs/Makefile             |    1 
> > > >  fs/xfs/libxfs/xfs_format.h  |    3 +
> > > >  fs/xfs/libxfs/xfs_rtgroup.c |  196 ++++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/libxfs/xfs_rtgroup.h |  212 +++++++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/libxfs/xfs_sb.c      |    7 +
> > > >  fs/xfs/libxfs/xfs_types.h   |    4 +
> > > >  fs/xfs/xfs_log_recover.c    |   20 ++++
> > > >  fs/xfs/xfs_mount.c          |   16 +++
> > > >  fs/xfs/xfs_mount.h          |   14 +++
> > > >  fs/xfs/xfs_rtalloc.c        |    6 +
> > > >  fs/xfs/xfs_super.c          |    1 
> > > >  fs/xfs/xfs_trace.c          |    1 
> > > >  fs/xfs/xfs_trace.h          |   38 ++++++++
> > > >  13 files changed, 517 insertions(+), 2 deletions(-)
> > > >  create mode 100644 fs/xfs/libxfs/xfs_rtgroup.c
> > > >  create mode 100644 fs/xfs/libxfs/xfs_rtgroup.h
> > > 
> > > Ok, how is the global address space for real time extents laid out
> > > across rt groups? i.e. is it sparse similar to how fsbnos and inode
> > > numbers are created for the data device like so?
> > > 
> > > 	fsbno = (agno << agblklog) | agbno
> > > 
> > > Or is it something different? I can't find that defined anywhere in
> > > this patch, so I can't determine if the unit conversion code and
> > > validation is correct or not...
> > 
> > They're not sparse like fsbnos on the data device, they're laid end to
> > end.  IOWs, it's a straight linear translation.  If you have an rtgroup
> > that is 50 blocks long, then rtgroup 1 starts at (50 * blocksize).
> 
> Yes, I figured that out later. I think that's less than optimal,
> because it essentially repeats the problems we have with AGs being
> fixed size without the potential for fixing it easily. i.e. the
> global sharded fsbno address space is sparse, so we can actually
> space out the sparse address regions to allow future flexibility in
> group size and location work.
> 
> By making the rtgroup addressing purely physical, we're
> completely stuck with fixed sized rtgroups and there is no way
> around that. IOWs, the physical address space sharding repeats the
> existing grow and shrink problems we have with the existing fixed
> size AGs.
> 
> We're discussing how to use the sparse fsbno addressing to allow
> resizing of AGs, but we will not be able to do that at all with
> rtgroups as they stand. The limitation is that a 64 bit global rt extent
> address is essentially the physical address of the extent in the block
> device LBA space.

<nod> I /think/ it's pretty simple to convert the rtgroups rtblock
numbers to sparse ala xfs_fsblock_t -- all we have to do is make sure
that mp->m_rgblklog is set to highbit64(rtgroup block count) and then
delete all the multiply/divide code, just like we do on the data device.
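
i.e. something like this, assuming we mirror the data device encoding
exactly (sketch only):

	/* sparse rt address space: rtbno = (rgno << rgblklog) | rgbno */
	rtbno = ((xfs_rtblock_t)rgno << mp->m_rgblklog) | rgbno;
	rgno = rtbno >> mp->m_rgblklog;
	rgbno = rtbno & mp->m_rgblkmask;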

The thing I *don't* know is how will this affect hch's zoned device
support -- he's mentioned that rtgroups will eventually have both a size
and a "capacity" to keep the zones aligned to groups, or groups aligned
to zones, I don't remember which.  I don't know if segmenting
br_startblock for rt mappings makes things better or worse for that.

> > This patch, FWIW, refactors the existing rt code so that a !rtgroups
> > filesystem is represented by one large "group", with xfs_rtxnum_t now
> > indexing rt extents within a group. 
> 
> Right, we can do that regardless of whether we use logical or
> physical addressing for the global rtbno for sharded rtgroup layout.
> the rtgno of 0 for that rtg always results in logical = physical
> addressing.
> 
> > Probably it should be renamed to xfs_rgxnum_t.
> 
> That might be a good idea.
> 
> > Note that we haven't defined the rtgroup ondisk format yet, so I'll go
> > amend that patch to spell out the ondisk format of the brave new world.
> 
> Yes, please! That would have made all the differences between the
> combinations of rt, rtx, rg, num, len, blk, etc a whole lot easier
> to work out.

<Nod> I'll go work all that out.

> > > > +struct xfs_rtgroup *
> > > > +xfs_rtgroup_grab(
> > > > +	struct xfs_mount	*mp,
> > > > +	xfs_rgnumber_t		rgno)
> > > > +{
> > > > +	struct xfs_rtgroup	*rtg;
> > > > +
> > > > +	rcu_read_lock();
> > > > +	rtg = xa_load(&mp->m_rtgroups, rgno);
> > > > +	if (rtg) {
> > > > +		trace_xfs_rtgroup_grab(rtg, _RET_IP_);
> > > > +		if (!atomic_inc_not_zero(&rtg->rtg_active_ref))
> > > > +			rtg = NULL;
> > > > +	}
> > > > +	rcu_read_unlock();
> > > > +	return rtg;
> > > > +}
> > > > +
> > > > +void
> > > > +xfs_rtgroup_rele(
> > > > +	struct xfs_rtgroup	*rtg)
> > > > +{
> > > > +	trace_xfs_rtgroup_rele(rtg, _RET_IP_);
> > > > +	if (atomic_dec_and_test(&rtg->rtg_active_ref))
> > > > +		wake_up(&rtg->rtg_active_wq);
> > > > +}
> > > 
> > > This is all duplicates of the xfs_perag code. Can you put together a
> > > patchset to abstract this into a "xfs_group" and embed them in both
> > > the perag and and rtgroup structures?
> > > 
> > > That way we only need one set of lookup and iterator infrastructure,
> > > and it will work for both data and rt groups...
> > 
> > How will that work with perags still using the radix tree and rtgroups
> > using the xarray?  Yes, we should move the perags to use the xarray too
> > (and indeed hch already has a series on list to do that) but here's
> > really not the time to do that because I don't want to frontload a bunch
> > more core changes onto this already huge patchset.
> 
> Let's first assume they both use xarray (that's just a matter of
> time, yes?) so it's easier to reason about. Then we have something
> like this:
> 
> /*
>  * xfs_group - a contiguous 32 bit block address space group
>  */
> struct xfs_group {
> 	struct xarray		xarr;
> 	u32			num_groups;
> };

Ah, that's the group head.  I might call this struct xfs_groups?

So ... would it theoretically make more sense to use an rhashtable here?
Insofar as the only place that totally falls down is if you want to
iterate tagged groups; and that's only done for AGs.

I'm ok with using an xarray here, fwiw.

> struct xfs_group_item {
> 	struct xfs_group	*group; /* so put/rele don't need any other context */
> 	u32			gno;
> 	atomic_t		passive_refs;
> 	atomic_t		active_refs;
> 	wait_queue_head_t	active_wq;
> 	unsigned long		opstate;
> 
> 	u32			blocks;		/* length in fsb */
> 	u32			extents;	/* length in extents */
> 	u32			blk_log;	/* extent size in fsb */
> 
> 	/* limits for min/max valid addresses */
> 	u32			max_addr;
> 	u32			min_addr;
> };

Yeah, that's pretty much what I had in the prototype that I shredded an
hour ago.

> And then we define:
> 
> struct xfs_perag {
> 	struct xfs_group_item	g;
> 
> 	/* perag specific stuff follows */
> 	....
> };
> 
> struct xfs_rtgroup {
> 	struct xfs_group_item	g;
> 
> 	/* rtg specific stuff follows */
> 	.....
> 
> }
> 
> And then a couple of generic macros:
> 
> #define to_grpi(grpi, gi)	container_of((gi), typeof(*(grpi)), g)
> #define to_gi(grpi)		(&(grpi)->g)
> 
> though this might be better as just typed macros:
> 
> #define gi_to_pag(gi)	container_of((gi), struct xfs_perag, g)
> #define gi_to_rtg(gi)	container_of((gi), struct xfs_rtgroup, g)
> 
> And then all the grab/rele/get/put stuff becomes:
> 
> 	rtg = to_grpi(rtg, xfs_group_grab(mp->m_rtgroups, rgno));
> 	pag = to_grpi(pag, xfs_group_grab(mp->m_perags, agno));
> 	....
> 	xfs_group_put(&rtg->g);
> 	xfs_group_put(&pag->g);
> 
> 
> or
> 
> 	rtg = gi_to_rtg(xfs_group_grab(mp->m_rtgroups, rgno));
> 	pag = gi_to_pag(xfs_group_grab(mp->m_perags, agno));
> 	....
> 	xfs_group_put(&rtg->g);
> 	xfs_group_put(&pag->g);
> 
> 
> then we pass the group to each of the "for_each_group..." iterators
> like so:
> 
> 	for_each_group(&mp->m_perags, agno, pag) {
> 		/* do stuff with pag */
> 	}
> 
> or
> 	for_each_group(&mp->m_rtgroups, rtgno, rtg) {
> 		/* do stuff with rtg */
> 	}
> 
> And we use typeof() and container_of() to access the group structure
> within the pag/rtg. Something like:
> 
> #define to_grpi(grpi, gi)	container_of((gi), typeof(*(grpi)), g)
> #define to_gi(grpi)		(&(grpi)->g)
> 
> #define for_each_group(grp, gno, grpi)					\
> 	(gno) = 0;							\
> 	for ((grpi) = to_grpi((grpi), xfs_group_grab((grp), (gno)));	\
> 	     (grpi) != NULL;						\
> 	     (grpi) = to_grpi(grpi, xfs_group_next((grp), to_gi(grpi),	\
> 					&(gno), (grp)->num_groups)))
> 
> And now we essentially have common group infrastructure for
> access, iteration, geometry and address verification purposes...

<nod> That's pretty much what I had drafted, albeit with different
helper macros since I kept the for_each_{perag,rtgroup} things around
for type safety.  Though I think for_each_perag just becomes:

#define for_each_perag(mp, agno, pag) \
	for_each_group((mp)->m_perags, (agno), (pag))

Right?

> > 
> > > > +
> > > > +/* Compute the number of rt extents in this realtime group. */
> > > > +xfs_rtxnum_t
> > > > +xfs_rtgroup_extents(
> > > > +	struct xfs_mount	*mp,
> > > > +	xfs_rgnumber_t		rgno)
> > > > +{
> > > > +	xfs_rgnumber_t		rgcount = mp->m_sb.sb_rgcount;
> > > > +
> > > > +	ASSERT(rgno < rgcount);
> > > > +	if (rgno == rgcount - 1)
> > > > +		return mp->m_sb.sb_rextents -
> > > > +			((xfs_rtxnum_t)rgno * mp->m_sb.sb_rgextents);
> > > 
> > > Urk. So this relies on a non-rtgroup filesystem doing a
> > > multiplication by zero of a field that the on-disk format does not
> > > understand to get the right result.  I think this is a copying a bad
> > > pattern we've been slowly trying to remove from the normal
> > > allocation group code.
> > > 
> > > > +
> > > > +	ASSERT(xfs_has_rtgroups(mp));
> > > > +	return mp->m_sb.sb_rgextents;
> > > > +}
> > > 
> > > We already embed the length of the rtgroup in the rtgroup structure.
> > > THis should be looking up the rtgroup (or being passed the rtgroup
> > > the caller already has) and doing the right thing. i.e.
> > > 
> > > 	if (!rtg || !xfs_has_rtgroups(rtg->rtg_mount))
> > > 		return mp->m_sb.sb_rextents;
> > > 	return rtg->rtg_extents;
> > 
> > xfs_rtgroup_extents is the function that we use to set rtg->rtg_extents.
> 
> That wasn't clear from the context of the patch. Perhaps a better
> name would be xfs_rtgroup_calc_extents(), to indicate that it is a setup
> function, not something that should be regularly called at runtime?

<nod>

> > 
> > > > +static inline xfs_rtblock_t
> > > > +xfs_rgno_start_rtb(
> > > > +	struct xfs_mount	*mp,
> > > > +	xfs_rgnumber_t		rgno)
> > > > +{
> > > > +	if (mp->m_rgblklog >= 0)
> > > > +		return ((xfs_rtblock_t)rgno << mp->m_rgblklog);
> > > > +	return ((xfs_rtblock_t)rgno * mp->m_rgblocks);
> > > > +}
> > > 
> > > Where does mp->m_rgblklog come from? That wasn't added to the
> > > on-disk superblock structure and it is always initialised to zero
> > > in this patch.
> > > 
> > > When will m_rgblklog be zero and when will it be non-zero? If it's
> > 
> > As I mentioned before, this patch merely ports non-rtg filesystems to
> > use the rtgroup structure.  m_rgblklog will be set to nonzero values
> > when we get to defining the ondisk rtgroup structure.
> 
> Yeah, which makes some of the context in the patch hard to
> understand... :/
> 
> > But, to cut ahead here, m_rgblklog will be set to a non-negative value
> > if the rtgroup size (in blocks) is a power of two.  Then these unit
> > conversion functions can use shifts instead of expensive multiplication
> > and divisions.  The same goes for rt extent to {fs,rt}block conversions.
> 
> yeah, so mp->m_rgblklog is not equivalent to mp->m_agblklog at all.
> It took me some time to understand that - the names are the same,
> they are used in similar address conversions, but they have
> completely different functions.
> 
> I suspect we need some better naming here, regardless of the
> rtgroups global address space layout discussion...

Or just make xfs_rtblock_t sparse, in which case I think m_rgblklog
usage patterns become exactly the same as m_agblklog.

> > > > +
> > > > +static inline uint64_t
> > > > +__xfs_rtb_to_rgbno(
> > > > +	struct xfs_mount	*mp,
> > > > +	xfs_rtblock_t		rtbno)
> > > > +{
> > > > +	uint32_t		rem;
> > > > +
> > > > +	if (!xfs_has_rtgroups(mp))
> > > > +		return rtbno;
> > > > +
> > > > +	if (mp->m_rgblklog >= 0)
> > > > +		return rtbno & mp->m_rgblkmask;
> > > > +
> > > > +	div_u64_rem(rtbno, mp->m_rgblocks, &rem);
> > > > +	return rem;
> > > > +}
> > > 
> > > Why is this function returning a uint64_t - a xfs_rgblock_t is only
> > > a 32 bit type...
> > 
> > group 0 on a !rtg filesystem can be 64-bits in block/rt count.  This is
> > a /very/ annoying pain point -- if you actually created such a
> > filesystem it actually would never work because the rtsummary file would
> > be created undersized due to an integer overflow, but the verifiers
> > never checked any of that, and due to the same underflow the rtallocator
> > would search the wrong places and (eventually) fall back to a dumb
> > linear scan.
> > 
> > Soooooo this is an obnoxious usecase (broken large !rtg filesystems)
> > that we can't just drop, though I'm pretty sure there aren't any systems
> > in the wild.
> 
> Ugh. That definitely needs to be a comment somewhere in the code to
> explain this. :/

Well it's all in the commit that fixed the rtsummary for those things.

> > > > diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
> > > > index a8cd44d03ef64..1ce4b9eb16f47 100644
> > > > --- a/fs/xfs/libxfs/xfs_types.h
> > > > +++ b/fs/xfs/libxfs/xfs_types.h
> > > > @@ -9,10 +9,12 @@
> > > >  typedef uint32_t	prid_t;		/* project ID */
> > > >  
> > > >  typedef uint32_t	xfs_agblock_t;	/* blockno in alloc. group */
> > > > +typedef uint32_t	xfs_rgblock_t;	/* blockno in realtime group */
> > > 
> > > Is that right? The rtg length is 2^32 * rtextsize, and rtextsize can
> > > be 2^20 bytes:
> > > 
> > > #define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024)
> > 
> > No, the maximum rtgroup length is 2^32-1 blocks.
> 
> I couldn't tell if the max length was being defined as the maximum
> number of rt extents that the rtgroup could index, or whether it was
> the maximum number of filesystem blocks (i.e. data device fsblock
> size) that an rtgroup could index...

The max rtgroup length is defined in blocks; the min is defined in rt
extents.  I might want to bump up the minimum a bit, but I think
Christoph should weigh in on that first -- I think his zns patchset
currently assigns one rtgroup to each zone?  Because he was muttering
about how 130,000x 256MB rtgroups really sucks.  Would it be very messy
to have a minimum size of (say) 1GB?

> > > Hence for a 4kB fsbno filesystem, the actual maximum size of an rtg
> > > in filesystem blocks far exceeds what we can address with a 32 bit
> > > variable.
> > > 
> > > If xfs_rgblock_t is actually indexing multi-fsbno rtextents, then it
> > > is an extent number index, not a "block" index. An extent number
> > > index won't overflow 32 bits (because the rtg has a max of 2^32 - 1
> > > rtextents)
> > > 
> > > IOWs, shouldn't this be named something like:
> > > 
> > > typedef uint32_t	xfs_rgext_t;	/* extent number in realtime group */
> > 
> > and again, we can't do that because we emulate !rtg filesystems with a
> > single "rtgroup" that can be more than 2^32 rtx long.
> 
> *nod*
> 
> > > >  typedef uint32_t	xfs_agino_t;	/* inode # within allocation grp */
> > > >  typedef uint32_t	xfs_extlen_t;	/* extent length in blocks */
> > > >  typedef uint32_t	xfs_rtxlen_t;	/* file extent length in rtextents */
> > > >  typedef uint32_t	xfs_agnumber_t;	/* allocation group number */
> > > > +typedef uint32_t	xfs_rgnumber_t;	/* realtime group number */
> > > >  typedef uint64_t	xfs_extnum_t;	/* # of extents in a file */
> > > >  typedef uint32_t	xfs_aextnum_t;	/* # extents in an attribute fork */
> > > >  typedef int64_t		xfs_fsize_t;	/* bytes in a file */
> > > > @@ -53,7 +55,9 @@ typedef void *		xfs_failaddr_t;
> > > >  #define	NULLFILEOFF	((xfs_fileoff_t)-1)
> > > >  
> > > >  #define	NULLAGBLOCK	((xfs_agblock_t)-1)
> > > > +#define NULLRGBLOCK	((xfs_rgblock_t)-1)
> > > >  #define	NULLAGNUMBER	((xfs_agnumber_t)-1)
> > > > +#define	NULLRGNUMBER	((xfs_rgnumber_t)-1)
> > > 
> > > What's the maximum valid rtg number? We're not ever going to be
> > > supporting 2^32 - 2 rtgs, so what is a realistic maximum we can cap
> > > this at and validate it at?
> > 
> > /me shrugs -- the smallest AG size on the data device is 16M, which
> > technically speaking means that one /could/ format 2^(63-24) groups,
> > or order 39.
> > 
> > Realistically with the maximum rtgroup size of 2^31 blocks, we probably
> > only need 2^(63 - (31 + 10)) = 2^22 rtgroups max on a 1k fsblock fs.
> 
> Right, those are the theoretical maximums. Practically speaking,
> though, mkfs and mount iteration of all AGs means millions to
> billions of IOs need to be done before the filesystem can even be
> fully mounted. Hence the practical limit to AG count is closer to a
> few tens of thousands, not hundreds of billions.
> 
> Hence I'm wondering if we should actually cap the maximum number of
> rtgroups. We're just about at BS > PS, so with a 64k block size a
> single rtgroup can index 2^32 * 2^16 bytes which puts individual
> rtgs at 256TB in size. Unless there are use cases for rtgroup sizes
> smaller than a few GBs, I just don't see the need to support
> theoretical maximum counts on tiny block size filesystems. Thirty
> thousand rtgs at 256TB per rtg puts us at 64 bit device size limits,
> and we hit those limits on 4kB block sizes at around 500,000 rtgs.
> 
> So do we need to support millions of rtgs? I'd say no....

...but we might.  Christoph, how gnarly does zns support get if you have
to be able to pack multiple SMR zones into a single rtgroup?

--D

> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner Aug. 27, 2024, 3 a.m. UTC | #6
On Mon, Aug 26, 2024 at 06:55:58PM -0700, Darrick J. Wong wrote:
> On Tue, Aug 27, 2024 at 10:57:34AM +1000, Dave Chinner wrote:
> > On Mon, Aug 26, 2024 at 12:14:04PM -0700, Darrick J. Wong wrote:
> > > On Mon, Aug 26, 2024 at 09:56:08AM +1000, Dave Chinner wrote:
> > > > On Thu, Aug 22, 2024 at 05:17:31PM -0700, Darrick J. Wong wrote:
> > > > > From: Darrick J. Wong <djwong@kernel.org>
> > > > > 
> > > > > Create an incore object that will contain information about a realtime
> > > > > allocation group.  This will eventually enable us to shard the realtime
> > > > > section in a similar manner to how we shard the data section, but for
> > > > > now just a single object for the entire RT subvolume is created.
> > > > > 
> > > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > > > ---
> > > > >  fs/xfs/Makefile             |    1 
> > > > >  fs/xfs/libxfs/xfs_format.h  |    3 +
> > > > >  fs/xfs/libxfs/xfs_rtgroup.c |  196 ++++++++++++++++++++++++++++++++++++++++
> > > > >  fs/xfs/libxfs/xfs_rtgroup.h |  212 +++++++++++++++++++++++++++++++++++++++++++
> > > > >  fs/xfs/libxfs/xfs_sb.c      |    7 +
> > > > >  fs/xfs/libxfs/xfs_types.h   |    4 +
> > > > >  fs/xfs/xfs_log_recover.c    |   20 ++++
> > > > >  fs/xfs/xfs_mount.c          |   16 +++
> > > > >  fs/xfs/xfs_mount.h          |   14 +++
> > > > >  fs/xfs/xfs_rtalloc.c        |    6 +
> > > > >  fs/xfs/xfs_super.c          |    1 
> > > > >  fs/xfs/xfs_trace.c          |    1 
> > > > >  fs/xfs/xfs_trace.h          |   38 ++++++++
> > > > >  13 files changed, 517 insertions(+), 2 deletions(-)
> > > > >  create mode 100644 fs/xfs/libxfs/xfs_rtgroup.c
> > > > >  create mode 100644 fs/xfs/libxfs/xfs_rtgroup.h
> > > > 
> > > > Ok, how is the global address space for real time extents laid out
> > > > across rt groups? i.e. is it sparse similar to how fsbnos and inode
> > > > numbers are created for the data device like so?
> > > > 
> > > > 	fsbno = (agno << agblklog) | agbno
> > > > 
> > > > Or is it something different? I can't find that defined anywhere in
> > > > this patch, so I can't determine if the unit conversion code and
> > > > validation is correct or not...
> > > 
> > > They're not sparse like fsbnos on the data device, they're laid end to
> > > end.  IOWs, it's a straight linear translation.  If you have an rtgroup
> > > that is 50 blocks long, then rtgroup 1 starts at (50 * blocksize).
> > 
> > Yes, I figured that out later. I think that's less than optimal,
> > because it essentially repeats the problems we have with AGs being
> > fixed size without the potential for fixing it easily. i.e. the
> > global sharded fsbno address space is sparse, so we can actually
> > space out the sparse address regions to allow future flexibility in
> > group size and location work.
> > 
> > By making the rtgroup addressing purely physical, we're
> > completely stuck with fixed sized rtgroups and there is no way
> > around that. IOWs, the physical address space sharding repeats the
> > existing grow and shrink problems we have with the existing fixed
> > size AGs.
> > 
> > We're discussing how to use the sparse fsbno addressing to allow
> > resizing of AGs, but we will not be able to do that at all with
> > rtgroups as they stand. The limitation is that a 64 bit global rt extent
> > address is essentially the physical address of the extent in the block
> > device LBA space.
> 
> <nod> I /think/ it's pretty simple to convert the rtgroups rtblock
> numbers to sparse ala xfs_fsblock_t -- all we have to do is make sure
> that mp->m_rgblklog is set to highbit64(rtgroup block count) and then
> delete all the multiply/divide code, just like we do on the data device.
> 
> The thing I *don't* know is how will this affect hch's zoned device
> support -- he's mentioned that rtgroups will eventually have both a size
> and a "capacity" to keep the zones aligned to groups, or groups aligned
> to zones, I don't remember which.  I don't know if segmenting
> br_startblock for rt mappings makes things better or worse for that.

I can't really comment on that because I haven't heard anything
about this requirement. It kinda sounds like sparse addressing just
with different names, but I'm just guessing there. Maybe Christoph
can educate us here...

> > > > This is all duplicates of the xfs_perag code. Can you put together a
> > > > patchset to abstract this into a "xfs_group" and embed them in both
> > > > the perag and and rtgroup structures?
> > > > 
> > > > That way we only need one set of lookup and iterator infrastructure,
> > > > and it will work for both data and rt groups...
> > > 
> > > How will that work with perags still using the radix tree and rtgroups
> > > using the xarray?  Yes, we should move the perags to use the xarray too
> > > (and indeed hch already has a series on list to do that) but here's
> > > really not the time to do that because I don't want to frontload a bunch
> > > more core changes onto this already huge patchset.
> > 
> > Let's first assume they both use xarray (that's just a matter of
> > time, yes?) so it's easier to reason about. Then we have something
> > like this:
> > 
> > /*
> >  * xfs_group - a contiguous 32 bit block address space group
> >  */
> > struct xfs_group {
> > 	struct xarray		xarr;
> > 	u32			num_groups;
> > };
> 
> Ah, that's the group head.  I might call this struct xfs_groups?

Sure.

> 
> So ... would it theoretically make more sense to use an rhashtable here?
> Insofar as the only place that totally falls down is if you want to
> iterate tagged groups; and that's only done for AGs.

The index is contiguous and starts at zero, so it packs extremely
well into an xarray. For small numbers of groups (i.e. the vast
majority of installations) item lookup is essentially O(1) (single
node), and it scales out at O(log N) for large numbers and random
access.  It also has efficient sequential iteration, which is what
we mostly do with groups.
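
i.e. in-order iteration is about as simple as it gets (sketch, plain
xarray API):

	unsigned long		gno;
	struct xfs_group_item	*gi;

	xa_for_each(&grp->xarr, gno, gi) {
		/* entries come back in ascending gno order */
	}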

rhashtable has an advantage at scale of being mostly O(1), but it
comes at an increased memory footprint and has terrible ordered
iteration behaviour even ignoring tags (essentially random memory
access).

> I'm ok with using an xarray here, fwiw.

OK.

> > then we pass the group to each of the "for_each_group..." iterators
> > like so:
> > 
> > 	for_each_group(&mp->m_perags, agno, pag) {
> > 		/* do stuff with pag */
> > 	}
> > 
> > or
> > 	for_each_group(&mp->m_rtgroups, rtgno, rtg) {
> > 		/* do stuff with rtg */
> > 	}
> > 
> > And we use typeof() and container_of() to access the group structure
> > within the pag/rtg. Something like:
> > 
> > #define to_grpi(grpi, gi)	container_of((gi), typeof(*(grpi)), g)
> > #define to_gi(grpi)		(&(grpi)->g)
> > 
> > #define for_each_group(grp, gno, grpi)					\
> > 	(gno) = 0;							\
> > 	for ((grpi) = to_grpi((grpi), xfs_group_grab((grp), (gno)));	\
> > 	     (grpi) != NULL;						\
> > 	     (grpi) = to_grpi(grpi, xfs_group_next((grp), to_gi(grpi),	\
> > 					&(gno), (grp)->num_groups)))
> > 
> > And now we essentially have common group infrastructure for
> > access, iteration, geometry and address verification purposes...
> 
> <nod> That's pretty much what I had drafted, albeit with different
> helper macros since I kept the for_each_{perag,rtgroup} things around
> for type safety.  Though I think for_each_perag just becomes:
> 
> #define for_each_perag(mp, agno, pag) \
> 	for_each_group((mp)->m_perags, (agno), (pag))
> 
> Right?

Yeah, that's what I thought of doing first, but then figured a little
bit of compiler magic gets rid of the need for the type specific
iterator wrappers altogether...

> > > > > diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
> > > > > index a8cd44d03ef64..1ce4b9eb16f47 100644
> > > > > --- a/fs/xfs/libxfs/xfs_types.h
> > > > > +++ b/fs/xfs/libxfs/xfs_types.h
> > > > > @@ -9,10 +9,12 @@
> > > > >  typedef uint32_t	prid_t;		/* project ID */
> > > > >  
> > > > >  typedef uint32_t	xfs_agblock_t;	/* blockno in alloc. group */
> > > > > +typedef uint32_t	xfs_rgblock_t;	/* blockno in realtime group */
> > > > 
> > > > Is that right? The rtg length is 2^32 * rtextsize, and rtextsize can
> > > > be 2^20 bytes:
> > > > 
> > > > #define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024)
> > > 
> > > No, the maximum rtgroup length is 2^32-1 blocks.
> > 
> > I couldn't tell if the max length was being defined as the maximum
> > number of rt extents that the rtgroup could index, or whether it was
> > the maximum number of filesystem blocks (i.e. data device fsblock
> > size) that an rtgroup could index...
> 
> The max rtgroup length is defined in blocks; the min is defined in rt
> extents.

I think that's part of the problem - can we define min and max in
the same units? Or have two sets of definitions - one for each unit?

> I might want to bump up the minimum a bit, but I think
> Christoph should weigh in on that first -- I think his zns patchset
> currently assigns one rtgroup to each zone?  Because he was muttering
> about how 130,000x 256MB rtgroups really sucks.

Ah, that might be the capacity vs size thing - to allow rtgroups to
be sized as an integer multiple of the zone capacity and so have an
rtgroup for every N contiguous zones....

> Would it be very messy
> to have a minimum size of (say) 1GB?

I was thinking of larger than that, but the question comes down to
how *small* do we need to support for rtg based rtdevs? I was
thinking that hundreds of GB would be the smallest size device we
might deploy this sort of feature on, in which case somewhere around
50GB would be the typical minimum rtg size...

I'm kinda worried that 1GB sizes still allow the crazy growfs small
to huge capacity problems we have with AGs. It's probably a good
place to start, but I think larger would be better...

-Dave.
Christoph Hellwig Aug. 27, 2024, 4:27 a.m. UTC | #7
On Mon, Aug 26, 2024 at 12:14:04PM -0700, Darrick J. Wong wrote:
> They're not sparse like fsbnos on the data device, they're laid end to
> end.  IOWs, it's a straight linear translation.  If you have an rtgroup
> that is 50 blocks long, then rtgroup 1 starts at (50 * blocksize).

Except with the zone capacity features on ZNS devices, where they
already are sparse.  But that's like 200 patches away from the state
here..

> group 0 on a !rtg filesystem can be 64-bits in block/rt count.  This is
> a /very/ annoying pain point -- if you actually created such a
> filesystem it actually would never work because the rtsummary file would
> be created undersized due to an integer overflow, but the verifiers
> never checked any of that, and due to the same underflow the rtallocator
> would search the wrong places and (eventually) fall back to a dumb
> linear scan.
> 
> Soooooo this is an obnoxious usecase (broken large !rtg filesystems)
> that we can't just drop, though I'm pretty sure there aren't any systems
> in the wild.

So, do we really need to support that?  I think we've always supported
a 64-bit block count, so we'll have to support that, but if a > 32bit
extent count was always broken maybe we should simply stop pretending
to support it?

> > What's the maximum valid rtg number? We're not ever going to be
> > supporting 2^32 - 2 rtgs, so what is a realistic maximum we can cap
> > this at and validate it at?
> 
> /me shrugs -- the smallest AG size on the data device is 16M, which
> technically speaking means that one /could/ format 2^(63-24) groups,
> or order 39.
> 
> Realistically with the maximum rtgroup size of 2^31 blocks, we probably
> only need 2^(63 - (31 + 10)) = 2^22 rtgroups max on a 1k fsblock fs.

Note that with zoned file systems later on we are bound by hardware
size.  SMR HDDs by convention come with 256MB zones.  This is a bit
on the small side, but grouping multiple of those into an RT group
would be a major pain.  I hope the hardware size will eventually
increase, maybe when they move to 3-digit TB capacity points.
Christoph Hellwig Aug. 27, 2024, 4:38 a.m. UTC | #8
On Tue, Aug 27, 2024 at 10:57:34AM +1000, Dave Chinner wrote:
> We're discussing how to use the sparse fsbno addressing to allow
> resizing of AGs, but we will not be able to do that at all with
> rtgroups as they stand. The limitation is that a 64 bit global rt extent
> address is essentially the physical address of the extent in the block
> device LBA space.

With this series there are no global RT extent addresses; the extents
are always relative to the group and are an entity only used in the
allocator.

> /*
>  * xfs_group - a contiguous 32 bit block address space group
>  */
> struct xfs_group {
> 	struct xarray		xarr;
> 	u32			num_groups;
> };
> 
> struct xfs_group_item {
> 	struct xfs_group	*group; /* so put/rele don't need any other context */
> 	u32			gno;
> 	atomic_t		passive_refs;
> 	atomic_t		active_refs;

What is the point of splitting the group and group_item?  This isn't
done in the current perag structure either.

> Hence I'm wondering if we should actually cap the maximum number of
> rtgroups. We're just about at BS > PS, so with a 64k block size a
> single rtgroup can index 2^32 * 2^16 bytes which puts individual
> rtgs at 256TB in size. Unless there are use cases for rtgroup sizes
> smaller than a few GBs, I just don't see the need to support
> theoretical maximum counts on tiny block size filesystems. Thirty
> thousand rtgs at 256TB per rtg puts us at 64 bit device size limits,
> and we hit those limits on 4kB block sizes at around 500,000 rtgs.
> 
> So do we need to support millions of rtgs? I'd say no....

As said before, hardware is having a word with the 256GB hardware
zone size in SMR HDDs.  I hope that size will eventually increase, but
I would not bet my house on it.
Christoph Hellwig Aug. 27, 2024, 4:44 a.m. UTC | #9
On Mon, Aug 26, 2024 at 06:55:58PM -0700, Darrick J. Wong wrote:
> The thing I *don't* know is how will this affect hch's zoned device
> support -- he's mentioned that rtgroups will eventually have both a size
> and a "capacity" to keep the zones aligned to groups, or groups aligned
> to zones, I don't remember which.  I don't know if segmenting
> br_startblock for rt mappings makes things better or worse for that.

This should be fine.  The ZNS zone capacity feature, where zones have
a size (LBA space allocated to it) and a capacity (LBAs that can
actually be written to), is the hardware equivalent of this.
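
In rtgroup terms that would look something like this (hypothetical
field names, purely for illustration):

	struct xfs_rtgroup {
		...
		xfs_rgblock_t	rtg_size;	/* zone aligned LBA space */
		xfs_rgblock_t	rtg_capacity;	/* writable blocks, <= size */
	};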

> So ... would it theoretically make more sense to use an rhashtable here?
> Insofar as the only place that totally falls down is if you want to
> iterate tagged groups; and that's only done for AGs.

It also is an important part of garbage collection for zoned XFS, where
we'll use it on RTGs.

> > 
> > #define for_each_group(grp, gno, grpi)					\
> > 	(gno) = 0;							\
> > 	for ((grpi) = to_grpi((grpi), xfs_group_grab((grp), (gno)));	\
> > 	     (grpi) != NULL;						\
> > 	     (grpi) = to_grpi(grpi, xfs_group_next((grp), to_gi(grpi),	\
> > 					&(gno), (grp)->num_groups)))
> > 
> > And now we essentially have common group infrastructure for
> > access, iteration, geometry and address verification purposes...
> 
> <nod> That's pretty much what I had drafted, albeit with different
> helper macros since I kept the for_each_{perag,rtgroup} things around
> for type safety.  Though I think for_each_perag just becomes:
> 
> #define for_each_perag(mp, agno, pag) \
> 	for_each_group((mp)->m_perags, (agno), (pag))
> 
> Right?

Btw, if we touch all of this anyway I'd drop the agno argument.
We can get the group number from the group struct (see my perag xarray
conversion series for an example where I'm doing this for the tagged
iteration).
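
i.e. something like this (sketch, reusing the group embedding from
above):

	for_each_group(&mp->m_perags, pag) {
		xfs_agnumber_t	agno = to_gi(pag)->gno;

		/* do stuff with pag */
	}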

> 
> The max rtgroup length is defined in blocks; the min is defined in rt
> extents.  I might want to bump up the minimum a bit, but I think
> Christoph should weigh in on that first -- I think his zns patchset
> currently assigns one rtgroup to each zone?  Because he was muttering
> about how 130,000x 256MB rtgroups really sucks.  Would it be very messy
> to have a minimum size of (say) 1GB?

Very messy.  I can live with a minimum of 256 MB, but no byte less :)
This is the size used by all shipping SMR hard drives.  For ZNS SSDs
there are samples with very small zone sizes that are basically open
channel devices in disguise - no sane person would want them and they
don't make sense to support in XFS as they require extensive erasure
encoding and error correction.  The ZNS drives with full data integrity
support have zone sizes and capacities way over 1GB and growing.

> > and we hit those limits on 4kB block sizes at around 500,000 rtgs.
> > 
> > So do we need to support millions of rtgs? I'd say no....
> 
> ...but we might.  Christoph, how gnarly does zns support get if you have
> to be able to pack multiple SMR zones into a single rtgroup?

I thought about it, but it creates real accounting nightmares.  It's
not entirely undoable, but really messy.
Darrick J. Wong Aug. 27, 2024, 5:17 a.m. UTC | #10
On Mon, Aug 26, 2024 at 09:38:16PM -0700, Christoph Hellwig wrote:
> On Tue, Aug 27, 2024 at 10:57:34AM +1000, Dave Chinner wrote:
> > We're discussing how to use the sparse fsbno addressing to allow
> > resizing of AGs, but we will not be able to do that at all with
> > rtgroups as they stand. The limitation is a 64 bit global rt extent
> > address is essential the physical address of the extent in the block
> > device LBA space.
> 
> With this series there are not global RT extent addresses, the extents
> are always relative to the group and an entity only used in the
> allocator.
> 
> > /*
> >  * xfs_group - a contiguous 32 bit block address space group
> >  */
> > struct xfs_group {
> > 	struct xarray		xarr;
> > 	u32			num_groups;
> > };
> > 
> > struct xfs_group_item {
> > 	struct xfs_group	*group; /* so put/rele don't need any other context */
> > 	u32			gno;
> > 	atomic_t		passive_refs;
> > 	atomic_t		active_refs;
> 
> What is the point of splitting the group and group_item?  This isn't
> done in the current perag structure either.

I think xfs_group encapsulates/replaces the radix tree root in struct
xfs_mount, and the xarray inside it points to xfs_group_item objects.

> > Hence I'm wondering if we should actually cap the maximum number of
> > rtgroups. We're just about at BS > PS, so with a 64k block size a
> > single rtgroup can index 2^32 * 2^16 bytes which puts individual
> > rtgs at 256TB in size. Unless there are use cases for rtgroup sizes
> > smaller than a few GBs, I just don't see the need to support
> > theoretical maximum counts on tiny block size filesystems. Thirty
> > thousand rtgs at 256TB per rtg puts us at 64 bit device size limits,
> > and we hit those limits on 4kB block sizes at around 500,000 rtgs.
> > 
> > So do we need to support millions of rtgs? I'd say no....
> 
> As said before, hardware is having a word with the 256GB hardware
> zone size in SMR HDDs.  I hope that size will eventually increase, but
> I would not bet my house on it.

Wait, 256 *gigabytes*?  That wouldn't be such a bad minimum.

--D
Christoph Hellwig Aug. 27, 2024, 5:18 a.m. UTC | #11
On Mon, Aug 26, 2024 at 10:17:19PM -0700, Darrick J. Wong wrote:
> > allocator.
> > 
> > > /*
> > >  * xfs_group - a contiguous 32 bit block address space group
> > >  */
> > > struct xfs_group {
> > > 	struct xarray		xarr;
> > > 	u32			num_groups;
> > > };
> > > 
> > > struct xfs_group_item {
> > > 	struct xfs_group	*group; /* so put/rele don't need any other context */
> > > 	u32			gno;
> > > 	atomic_t		passive_refs;
> > > 	atomic_t		active_refs;
> > 
> > What is the point of splitting the group and group_item?  This isn't
> > done in the current perag struture either.
> 
> I think xfs_group encapsulates/replaces the radix tree root in struct
> xfs_mount, and the xarray inside it points to xfs_group_item objects.

Ahh.  So it's not a xfs_group structure, but a xfs_groups one,
with the group item really being xfs_group.

> 
> > > Hence I'm wondering if we should actually cap the maximum number of
> > > rtgroups. We're just about at BS > PS, so with a 64k block size a
> > > single rtgroup can index 2^32 * 2^16 bytes which puts individual
> > > rtgs at 256TB in size. Unless there are use cases for rtgroup sizes
> > > smaller than a few GBs, I just don't see the need to support
> > > theoretical maximum counts on tiny block size filesystems. Thirty
> > > thousand rtgs at 256TB per rtg puts us at 64 bit device size limits,
> > > and we hit those limits on 4kB block sizes at around 500,000 rtgs.
> > > 
> > > So do we need to support millions of rtgs? I'd say no....
> > 
> > As said before hardware is having a word with with the 256GB hardware
> > zone size in SMR HDDs.  I hope that size will eventually increase, but
> > I would not bet my house on it.
> 
> Wait, 256 *gigabytes*?  That wouldn't be such a bad minimum.

Sorry, MB.  My units really suck this morning :)
Darrick J. Wong Aug. 27, 2024, 5:19 a.m. UTC | #12
On Mon, Aug 26, 2024 at 09:27:03PM -0700, Christoph Hellwig wrote:
> On Mon, Aug 26, 2024 at 12:14:04PM -0700, Darrick J. Wong wrote:
> > They're not sparse like fsbnos on the data device, they're laid end to
> > end.  IOWs, it's a straight linear translation.  If you have an rtgroup
> > that is 50 blocks long, then rtgroup 1 starts at (50 * blocksize).
> 
> Except with the zone capacity features on ZNS devices, where they
> already are sparse.  But that's like 200 patches away from the state
> here..

Heh.

> > group 0 on a !rtg filesystem can be 64-bits in block/rt count.  This is
> > a /very/ annoying pain point -- if you actually created such a
> > filesystem it actually would never work because the rtsummary file would
> > be created undersized due to an integer overflow, but the verifiers
> > never checked any of that, and due to the same underflow the rtallocator
> > would search the wrong places and (eventually) fall back to a dumb
> > linear scan.
> > 
> > Soooooo this is an obnoxious usecase (broken large !rtg filesystems)
> > that we can't just drop, though I'm pretty sure there aren't any systems
> > in the wild.
> 
> So, do we really need to support that?  I think we've always supported
> a 64-bit block count, so we'll have to support that, but if a > 32bit
> extent count was always broken maybe we should simply stop to pretend
> to support it?

I'm in favor of that.  The rextslog computation only got fixed in 6.8,
which means none of the LTS kernels really have it yet.  And the ones
that do are migrating verrrrry slowly due to the global rtbmp lock.

> > > What's the maximum valid rtg number? We're not ever going to be
> > > supporting 2^32 - 2 rtgs, so what is a realistic maximum we can cap
> > > this at and validate it at?
> > 
> > /me shrugs -- the smallest AG size on the data device is 16M, which
> > technically speaking means that one /could/ format 2^(63-24) groups,
> > or order 39.
> > 
> > Realistically with the maximum rtgroup size of 2^31 blocks, we probably
> > only need 2^(63 - (31 + 10)) = 2^22 rtgroups max on a 1k fsblock fs.
> 
> Note that with zoned file systems later on we are bound by hardware
> size.  SMR HDDs by convention come with 256MB zones.  This is a bit
> on the small side, but grouping multiple of those into an RT group
> would be a major pain.  I hope the hardware size will eventually
> increase, maybe when they move to 3-digit TB capacity points.

<nod>

--D
diff mbox series

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 4d8ca08cdd0ec..388b5cef48ca5 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -60,6 +60,7 @@  xfs-y				+= $(addprefix libxfs/, \
 # xfs_rtbitmap is shared with libxfs
 xfs-$(CONFIG_XFS_RT)		+= $(addprefix libxfs/, \
 				   xfs_rtbitmap.o \
+				   xfs_rtgroup.o \
 				   )
 
 # highlevel code
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 16a7bc02aa5f5..fa5cfc8265d92 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -176,6 +176,9 @@  typedef struct xfs_sb {
 
 	xfs_ino_t	sb_metadirino;	/* metadata directory tree root */
 
+	xfs_rgnumber_t	sb_rgcount;	/* number of realtime groups */
+	xfs_rtxlen_t	sb_rgextents;	/* size of a realtime group in rtx */
+
 	/* must be padded to 64 bit alignment */
 } xfs_sb_t;
 
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
new file mode 100644
index 0000000000000..2bad1ecb811eb
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -0,0 +1,199 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_health.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+
+/*
+ * Passive reference counting access wrappers to the rtgroup structures.  If
+ * the rtgroup structure is to be freed, the freeing code is responsible for
+ * cleaning up objects with passive references before freeing the structure.
+ */
+struct xfs_rtgroup *
+xfs_rtgroup_get(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	struct xfs_rtgroup	*rtg;
+
+	rcu_read_lock();
+	rtg = xa_load(&mp->m_rtgroups, rgno);
+	if (rtg) {
+		trace_xfs_rtgroup_get(rtg, _RET_IP_);
+		ASSERT(atomic_read(&rtg->rtg_ref) >= 0);
+		atomic_inc(&rtg->rtg_ref);
+	}
+	rcu_read_unlock();
+	return rtg;
+}
+
+/* Get a passive reference to the given rtgroup. */
+struct xfs_rtgroup *
+xfs_rtgroup_hold(
+	struct xfs_rtgroup	*rtg)
+{
+	ASSERT(atomic_read(&rtg->rtg_ref) > 0 ||
+	       atomic_read(&rtg->rtg_active_ref) > 0);
+
+	trace_xfs_rtgroup_hold(rtg, _RET_IP_);
+	atomic_inc(&rtg->rtg_ref);
+	return rtg;
+}
+
+void
+xfs_rtgroup_put(
+	struct xfs_rtgroup	*rtg)
+{
+	trace_xfs_rtgroup_put(rtg, _RET_IP_);
+	ASSERT(atomic_read(&rtg->rtg_ref) > 0);
+	atomic_dec(&rtg->rtg_ref);
+}
+
+/*
+ * Active references for rtgroup structures. This is for short term access to
+ * the rtgroup structures for walking trees or accessing state. If an rtgroup
+ * is being shrunk or is offline, then this will fail to find that group and
+ * return NULL instead.
+ */
+struct xfs_rtgroup *
+xfs_rtgroup_grab(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	struct xfs_rtgroup	*rtg;
+
+	rcu_read_lock();
+	rtg = xa_load(&mp->m_rtgroups, rgno);
+	if (rtg) {
+		trace_xfs_rtgroup_grab(rtg, _RET_IP_);
+		if (!atomic_inc_not_zero(&rtg->rtg_active_ref))
+			rtg = NULL;
+	}
+	rcu_read_unlock();
+	return rtg;
+}
+
+void
+xfs_rtgroup_rele(
+	struct xfs_rtgroup	*rtg)
+{
+	trace_xfs_rtgroup_rele(rtg, _RET_IP_);
+	if (atomic_dec_and_test(&rtg->rtg_active_ref))
+		wake_up(&rtg->rtg_active_wq);
+}
+
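+/* Allocate and insert a new incore realtime group structure. */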
+int
+xfs_rtgroup_alloc(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	struct xfs_rtgroup	*rtg;
+	int			error;
+
+	rtg = kzalloc(sizeof(struct xfs_rtgroup), GFP_KERNEL);
+	if (!rtg)
+		return -ENOMEM;
+	rtg->rtg_rgno = rgno;
+	rtg->rtg_mount = mp;
+
+	error = xa_insert(&mp->m_rtgroups, rgno, rtg, GFP_KERNEL);
+	if (error) {
+		WARN_ON_ONCE(error == -EBUSY);
+		goto out_free_rtg;
+	}
+
+#ifdef __KERNEL__
+	/* Place kernel structure only init below this point. */
+	spin_lock_init(&rtg->rtg_state_lock);
+	init_waitqueue_head(&rtg->rtg_active_wq);
+#endif /* __KERNEL__ */
+
+	/* Active ref owned by mount indicates rtgroup is online. */
+	atomic_set(&rtg->rtg_active_ref, 1);
+	return 0;
+
+out_free_rtg:
+	kfree(rtg);
+	return error;
+}
+
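+/* Remove an incore realtime group structure from the xarray and free it. */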
+void
+xfs_rtgroup_free(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	struct xfs_rtgroup	*rtg;
+
+	rtg = xa_erase(&mp->m_rtgroups, rgno);
+	if (!rtg) /* can happen when growfs fails */
+		return;
+
+	XFS_IS_CORRUPT(mp, atomic_read(&rtg->rtg_ref) != 0);
+
+	/* drop the mount's active reference */
+	xfs_rtgroup_rele(rtg);
+	XFS_IS_CORRUPT(mp, atomic_read(&rtg->rtg_active_ref) != 0);
+
+	kfree_rcu_mightsleep(rtg);
+}
+
+/*
+ * Free up the rtgroup resources associated with the mount structure.
+ */
+void
+xfs_free_rtgroups(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgcount)
+{
+	xfs_rgnumber_t		rgno;
+
+	for (rgno = 0; rgno < rgcount; rgno++)
+		xfs_rtgroup_free(mp, rgno);
+}
+
+/* Compute the number of rt extents in this realtime group. */
+xfs_rtxnum_t
+xfs_rtgroup_extents(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	xfs_rgnumber_t		rgcount = mp->m_sb.sb_rgcount;
+
+	ASSERT(rgno < rgcount);
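+	/* The last group gets whatever remains of sb_rextents. */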
+	if (rgno == rgcount - 1)
+		return mp->m_sb.sb_rextents -
+			((xfs_rtxnum_t)rgno * mp->m_sb.sb_rgextents);
+
+	ASSERT(xfs_has_rtgroups(mp));
+	return mp->m_sb.sb_rgextents;
+}
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
new file mode 100644
index 0000000000000..2c09ecfc50328
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -0,0 +1,213 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __LIBXFS_RTGROUP_H
+#define __LIBXFS_RTGROUP_H 1
+
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Realtime group incore structure, similar to the per-AG structure.
+ */
+struct xfs_rtgroup {
+	struct xfs_mount	*rtg_mount;
+	xfs_rgnumber_t		rtg_rgno;
+	atomic_t		rtg_ref;	/* passive reference count */
+	atomic_t		rtg_active_ref;	/* active reference count */
+	wait_queue_head_t	rtg_active_wq;	/* woken when active_ref falls to zero */
+
+	/* Number of rt extents in this group */
+	xfs_rtxnum_t		rtg_extents;
+
+#ifdef __KERNEL__
+	/* -- kernel only structures below this line -- */
+	spinlock_t		rtg_state_lock;
+#endif /* __KERNEL__ */
+};
+
+#ifdef CONFIG_XFS_RT
+/* Passive rtgroup references */
+struct xfs_rtgroup *xfs_rtgroup_get(struct xfs_mount *mp, xfs_rgnumber_t rgno);
+struct xfs_rtgroup *xfs_rtgroup_hold(struct xfs_rtgroup *rtg);
+void xfs_rtgroup_put(struct xfs_rtgroup *rtg);
+
+/* Active rtgroup references */
+struct xfs_rtgroup *xfs_rtgroup_grab(struct xfs_mount *mp, xfs_rgnumber_t rgno);
+void xfs_rtgroup_rele(struct xfs_rtgroup *rtg);
+
+int xfs_rtgroup_alloc(struct xfs_mount *mp, xfs_rgnumber_t rgno);
+void xfs_rtgroup_free(struct xfs_mount *mp, xfs_rgnumber_t rgno);
+void xfs_free_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t rgcount);
+#else /* CONFIG_XFS_RT */
+static inline struct xfs_rtgroup *xfs_rtgroup_get(struct xfs_mount *mp,
+		xfs_rgnumber_t rgno)
+{
+	return NULL;
+}
+static inline struct xfs_rtgroup *xfs_rtgroup_hold(struct xfs_rtgroup *rtg)
+{
+	ASSERT(rtg == NULL);
+	return NULL;
+}
+static inline void xfs_rtgroup_put(struct xfs_rtgroup *rtg)
+{
+}
+static inline int xfs_rtgroup_alloc(struct xfs_mount *mp,
+		xfs_rgnumber_t rgno)
+{
+	return 0;
+}
+static inline void xfs_free_rtgroups(struct xfs_mount *mp,
+		xfs_rgnumber_t rgcount)
+{
+}
+#define xfs_rtgroup_grab			xfs_rtgroup_get
+#define xfs_rtgroup_rele			xfs_rtgroup_put
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * rt group iteration APIs
+ */
+static inline struct xfs_rtgroup *
+xfs_rtgroup_next(
+	struct xfs_rtgroup	*rtg,
+	xfs_rgnumber_t		*rgno,
+	xfs_rgnumber_t		end_rgno)
+{
+	struct xfs_mount	*mp = rtg->rtg_mount;
+
+	*rgno = rtg->rtg_rgno + 1;
+	xfs_rtgroup_rele(rtg);
+	if (*rgno > end_rgno)
+		return NULL;
+	return xfs_rtgroup_grab(mp, *rgno);
+}
+
+#define for_each_rtgroup_range(mp, rgno, end_rgno, rtg) \
+	for ((rtg) = xfs_rtgroup_grab((mp), (rgno)); \
+		(rtg) != NULL; \
+		(rtg) = xfs_rtgroup_next((rtg), &(rgno), (end_rgno)))
+
+#define for_each_rtgroup_from(mp, rgno, rtg) \
+	for_each_rtgroup_range((mp), (rgno), (mp)->m_sb.sb_rgcount - 1, (rtg))
+
+#define for_each_rtgroup(mp, rgno, rtg) \
+	(rgno) = 0; \
+	for_each_rtgroup_from((mp), (rgno), (rtg))
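+
+/*
+ * A minimal usage sketch: walk every rtgroup with an active reference
+ * held across each iteration, as xfs_rtmount_inodes() does when it
+ * computes the extent count of each group:
+ *
+ *	struct xfs_rtgroup	*rtg;
+ *	xfs_rgnumber_t		rgno;
+ *
+ *	for_each_rtgroup(mp, rgno, rtg)
+ *		rtg->rtg_extents = xfs_rtgroup_extents(mp, rtg->rtg_rgno);
+ */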
+
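+/*
+ * Check that a group block number is within the group.  If the rt
+ * superblock feature is enabled, the first rt extent of group 0 is
+ * reserved for it and therefore never a valid rgbno.
+ */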
+static inline bool
+xfs_verify_rgbno(
+	struct xfs_rtgroup	*rtg,
+	xfs_rgblock_t		rgbno)
+{
+	struct xfs_mount	*mp = rtg->rtg_mount;
+
+	if (rgbno >= rtg->rtg_extents * mp->m_sb.sb_rextsize)
+		return false;
+	if (xfs_has_rtsb(mp) && rtg->rtg_rgno == 0 &&
+	    rgbno < mp->m_sb.sb_rextsize)
+		return false;
+	return true;
+}
+
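+/*
+ * Check that an extent of @len blocks starting at @rgbno lies entirely
+ * within the group and does not overflow xfs_rgblock_t.
+ */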
+static inline bool
+xfs_verify_rgbext(
+	struct xfs_rtgroup	*rtg,
+	xfs_rgblock_t		rgbno,
+	xfs_rgblock_t		len)
+{
+	if (rgbno + len <= rgbno)
+		return false;
+
+	if (!xfs_verify_rgbno(rtg, rgbno))
+		return false;
+
+	return xfs_verify_rgbno(rtg, rgbno + len - 1);
+}
+
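+/*
+ * rt block numbers are laid out densely across the groups:
+ * rtbno = rgno * m_rgblocks + rgbno, reduced to a shift by m_rgblklog
+ * when the group size is a power of two.
+ */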
+static inline xfs_rtblock_t
+xfs_rgno_start_rtb(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno)
+{
+	if (mp->m_rgblklog >= 0)
+		return ((xfs_rtblock_t)rgno << mp->m_rgblklog);
+	return ((xfs_rtblock_t)rgno * mp->m_rgblocks);
+}
+
+static inline xfs_rtblock_t
+xfs_rgbno_to_rtb(
+	struct xfs_mount	*mp,
+	xfs_rgnumber_t		rgno,
+	xfs_rgblock_t		rgbno)
+{
+	return xfs_rgno_start_rtb(mp, rgno) + rgbno;
+}
+
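+/* Extract the group number from an rt block number. */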
+static inline xfs_rgnumber_t
+xfs_rtb_to_rgno(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtbno)
+{
+	if (!xfs_has_rtgroups(mp))
+		return 0;
+
+	if (mp->m_rgblklog >= 0)
+		return rtbno >> mp->m_rgblklog;
+
+	return div_u64(rtbno, mp->m_rgblocks);
+}
+
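+/*
+ * Returns uint64_t because without rtgroups the whole rt section acts
+ * as a single group and the offset may not fit in xfs_rgblock_t.
+ */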
+static inline uint64_t
+__xfs_rtb_to_rgbno(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtbno)
+{
+	uint32_t		rem;
+
+	if (!xfs_has_rtgroups(mp))
+		return rtbno;
+
+	if (mp->m_rgblklog >= 0)
+		return rtbno & mp->m_rgblkmask;
+
+	div_u64_rem(rtbno, mp->m_rgblocks, &rem);
+	return rem;
+}
+
+static inline xfs_rgblock_t
+xfs_rtb_to_rgbno(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtbno)
+{
+	return __xfs_rtb_to_rgbno(mp, rtbno);
+}
+
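+/*
+ * Convert an rt block number to a 512-byte basic block address on the
+ * rt device, and vice versa below.
+ */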
+static inline xfs_daddr_t
+xfs_rtb_to_daddr(
+	struct xfs_mount	*mp,
+	xfs_rtblock_t		rtbno)
+{
+	return rtbno << mp->m_blkbb_log;
+}
+
+static inline xfs_rtblock_t
+xfs_daddr_to_rtb(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		daddr)
+{
+	return daddr >> mp->m_blkbb_log;
+}
+
+#ifdef CONFIG_XFS_RT
+xfs_rtxnum_t xfs_rtgroup_extents(struct xfs_mount *mp, xfs_rgnumber_t rgno);
+#else
+# define xfs_rtgroup_extents(mp, rgno)		(0)
+#endif /* CONFIG_XFS_RT */
+
+#endif /* __LIBXFS_RTGROUP_H */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index b83ce29640511..f1cdffb2f3392 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -696,6 +696,9 @@  __xfs_sb_from_disk(
 		to->sb_metadirino = be64_to_cpu(from->sb_metadirino);
 	else
 		to->sb_metadirino = NULLFSINO;
+
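+	/* No rtgroups on disk yet; treat the rt section as a single group. */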
+	to->sb_rgcount = 1;
+	to->sb_rgextents = 0;
 }
 
 void
@@ -982,6 +985,10 @@  xfs_mount_sb_set_rextsize(
 {
 	mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
 	mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
+
+	mp->m_rgblocks = 0;
+	mp->m_rgblklog = 0;
+	mp->m_rgblkmask = 0;
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index a8cd44d03ef64..1ce4b9eb16f47 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -9,10 +9,12 @@ 
 typedef uint32_t	prid_t;		/* project ID */
 
 typedef uint32_t	xfs_agblock_t;	/* blockno in alloc. group */
+typedef uint32_t	xfs_rgblock_t;	/* blockno in realtime group */
 typedef uint32_t	xfs_agino_t;	/* inode # within allocation grp */
 typedef uint32_t	xfs_extlen_t;	/* extent length in blocks */
 typedef uint32_t	xfs_rtxlen_t;	/* file extent length in rtextents */
 typedef uint32_t	xfs_agnumber_t;	/* allocation group number */
+typedef uint32_t	xfs_rgnumber_t;	/* realtime group number */
 typedef uint64_t	xfs_extnum_t;	/* # of extents in a file */
 typedef uint32_t	xfs_aextnum_t;	/* # extents in an attribute fork */
 typedef int64_t		xfs_fsize_t;	/* bytes in a file */
@@ -53,7 +55,9 @@  typedef void *		xfs_failaddr_t;
 #define	NULLFILEOFF	((xfs_fileoff_t)-1)
 
 #define	NULLAGBLOCK	((xfs_agblock_t)-1)
+#define NULLRGBLOCK	((xfs_rgblock_t)-1)
 #define	NULLAGNUMBER	((xfs_agnumber_t)-1)
+#define	NULLRGNUMBER	((xfs_rgnumber_t)-1)
 
 #define NULLCOMMITLSN	((xfs_lsn_t)-1)
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 4423dd344239b..c627cde3bb1e0 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -28,6 +28,7 @@ 
 #include "xfs_ag.h"
 #include "xfs_quota.h"
 #include "xfs_reflink.h"
+#include "xfs_rtgroup.h"
 
 #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
 
@@ -3346,6 +3347,7 @@  xlog_do_recover(
 	struct xfs_mount	*mp = log->l_mp;
 	struct xfs_buf		*bp = mp->m_sb_bp;
 	struct xfs_sb		*sbp = &mp->m_sb;
+	xfs_rgnumber_t		old_rgcount = sbp->sb_rgcount;
 	int			error;
 
 	trace_xfs_log_recover(log, head_blk, tail_blk);
@@ -3399,6 +3401,24 @@  xlog_do_recover(
 		xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
 		return error;
 	}
+
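+	/*
+	 * Recovery may have increased sb_rgcount (e.g. a replayed growfs);
+	 * instantiate incore structures for any new groups.
+	 */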
+	if (sbp->sb_rgcount < old_rgcount) {
+		xfs_warn(mp, "rgcount shrink not supported");
+		return -EINVAL;
+	}
+	if (sbp->sb_rgcount > old_rgcount) {
+		xfs_rgnumber_t		rgno;
+
+		for (rgno = old_rgcount; rgno < sbp->sb_rgcount; rgno++) {
+			error = xfs_rtgroup_alloc(mp, rgno);
+			if (error) {
+				xfs_warn(mp,
+	"Failed post-recovery rtgroup init: %d",
+						error);
+				return error;
+			}
+		}
+	}
 	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
 
 	/* Normal transactions can now occur */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b0ea88acdb618..e1e849101cdd4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -36,6 +36,7 @@ 
 #include "xfs_ag.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_metafile.h"
+#include "xfs_rtgroup.h"
 #include "scrub/stats.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -664,6 +665,7 @@  xfs_mountfs(
 	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
 	uint			quotamount = 0;
 	uint			quotaflags = 0;
+	xfs_rgnumber_t		rgno;
 	int			error = 0;
 
 	xfs_sb_mount_common(mp, sbp);
@@ -830,10 +832,18 @@  xfs_mountfs(
 		goto out_free_dir;
 	}
 
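+	/* Set up an incore object for each rt allocation group. */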
+	for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
+		error = xfs_rtgroup_alloc(mp, rgno);
+		if (error) {
+			xfs_warn(mp, "Failed rtgroup init: %d", error);
+			goto out_free_rtgroup;
+		}
+	}
+
 	if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
 		xfs_warn(mp, "no log defined");
 		error = -EFSCORRUPTED;
-		goto out_free_perag;
+		goto out_free_rtgroup;
 	}
 
 	error = xfs_inodegc_register_shrinker(mp);
@@ -1068,7 +1078,8 @@  xfs_mountfs(
 	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
 		xfs_buftarg_drain(mp->m_logdev_targp);
 	xfs_buftarg_drain(mp->m_ddev_targp);
- out_free_perag:
+ out_free_rtgroup:
+	xfs_free_rtgroups(mp, rgno);
 	xfs_free_perag(mp);
  out_free_dir:
 	xfs_da_unmount(mp);
@@ -1152,6 +1163,7 @@  xfs_unmountfs(
 	xfs_errortag_clearall(mp);
 #endif
 	shrinker_free(mp->m_inodegc_shrinker);
+	xfs_free_rtgroups(mp, mp->m_sb.sb_rgcount);
 	xfs_free_perag(mp);
 
 	xfs_errortag_del(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9e883d2159fd9..f69da6802e8c1 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -121,6 +121,7 @@  typedef struct xfs_mount {
 	uint8_t			m_agno_log;	/* log #ag's */
 	uint8_t			m_sectbb_log;	/* sectlog - BBSHIFT */
 	int8_t			m_rtxblklog;	/* log2 of rextsize, if possible */
+	int8_t			m_rgblklog;	/* log2 of rt group sz if possible */
 	uint			m_blockmask;	/* sb_blocksize-1 */
 	uint			m_blockwsize;	/* sb_blocksize in words */
 	uint			m_blockwmask;	/* blockwsize-1 */
@@ -149,12 +150,14 @@  typedef struct xfs_mount {
 	int			m_logbsize;	/* size of each log buffer */
 	uint			m_rsumlevels;	/* rt summary levels */
 	xfs_filblks_t		m_rsumblocks;	/* size of rt summary, FSBs */
+	uint32_t		m_rgblocks;	/* size of rtgroup in rtblocks */
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_qflags;	/* quota status flags */
 	uint64_t		m_features;	/* active filesystem features */
 	uint64_t		m_low_space[XFS_LOWSP_MAX];
 	uint64_t		m_low_rtexts[XFS_LOWSP_MAX];
 	uint64_t		m_rtxblkmask;	/* rt extent block mask */
+	uint64_t		m_rgblkmask;	/* rt group block mask */
 	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
 	struct xfs_trans_resv	m_resv;		/* precomputed res values */
 						/* low free space thresholds */
@@ -209,6 +212,7 @@  typedef struct xfs_mount {
 	 */
 	atomic64_t		m_allocbt_blks;
 
+	struct xarray		m_rtgroups;	/* per-rt group info */
 	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
 	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
 	uint64_t		m_resblks;	/* total reserved blocks */
@@ -358,6 +362,16 @@  __XFS_HAS_FEAT(large_extent_counts, NREXT64)
 __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
 __XFS_HAS_FEAT(metadir, METADIR)
 
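+/*
+ * These helpers always return false for now; the rtgroups and rt
+ * superblock feature bits are wired up later in the series.
+ */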
+static inline bool xfs_has_rtgroups(struct xfs_mount *mp)
+{
+	return false;
+}
+
+static inline bool xfs_has_rtsb(struct xfs_mount *mp)
+{
+	return false;
+}
+
 /*
  * Some features are always on for v5 file systems, allow the compiler to
 * eliminate dead code when building without v4 support.
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 46a920b192d19..59898117f817d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -27,6 +27,7 @@ 
 #include "xfs_health.h"
 #include "xfs_da_format.h"
 #include "xfs_metafile.h"
+#include "xfs_rtgroup.h"
 
 /*
  * Return whether there are any free extents in the size range given
@@ -1136,6 +1137,8 @@  xfs_rtmount_inodes(
 {
 	struct xfs_trans	*tp;
 	struct xfs_sb		*sbp = &mp->m_sb;
+	struct xfs_rtgroup	*rtg;
+	xfs_rgnumber_t		rgno;
 	int			error;
 
 	error = xfs_trans_alloc_empty(mp, &tp);
@@ -1166,6 +1169,9 @@  xfs_rtmount_inodes(
 	if (error)
 		goto out_rele_summary;
 
+	for_each_rtgroup(mp, rgno, rtg)
+		rtg->rtg_extents = xfs_rtgroup_extents(mp, rtg->rtg_rgno);
+
 	error = xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
 	if (error)
 		goto out_rele_summary;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 34066b50585e8..cee64c1a7d650 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2015,6 +2015,7 @@  static int xfs_init_fs_context(
 	spin_lock_init(&mp->m_sb_lock);
 	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
 	spin_lock_init(&mp->m_perag_lock);
+	xa_init(&mp->m_rtgroups);
 	mutex_init(&mp->m_growlock);
 	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index c5f818cf40c29..f888d41e3283f 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -46,6 +46,7 @@ 
 #include "xfs_refcount.h"
 #include "xfs_metafile.h"
 #include "xfs_metadir.h"
+#include "xfs_rtgroup.h"
 
 /*
  * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 7f259891ebcaa..4401a7c6230df 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -94,6 +94,7 @@  struct xfs_extent_free_item;
 struct xfs_rmap_intent;
 struct xfs_refcount_intent;
 struct xfs_metadir_update;
+struct xfs_rtgroup;
 
 #define XFS_ATTR_FILTER_FLAGS \
 	{ XFS_ATTR_ROOT,	"ROOT" }, \
@@ -220,6 +221,43 @@  DEFINE_PERAG_REF_EVENT(xfs_perag_rele);
 DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
 
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xfs_rtgroup_class,
+	TP_PROTO(struct xfs_rtgroup *rtg, unsigned long caller_ip),
+	TP_ARGS(rtg, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_rgnumber_t, rgno)
+		__field(int, refcount)
+		__field(int, active_refcount)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = rtg->rtg_mount->m_super->s_dev;
+		__entry->rgno = rtg->rtg_rgno;
+		__entry->refcount = atomic_read(&rtg->rtg_ref);
+		__entry->active_refcount = atomic_read(&rtg->rtg_active_ref);
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d rgno 0x%x passive refs %d active refs %d caller %pS",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rgno,
+		  __entry->refcount,
+		  __entry->active_refcount,
+		  (char *)__entry->caller_ip)
+);
+
+#define DEFINE_RTGROUP_REF_EVENT(name)	\
+DEFINE_EVENT(xfs_rtgroup_class, name,	\
+	TP_PROTO(struct xfs_rtgroup *rtg, unsigned long caller_ip), \
+	TP_ARGS(rtg, caller_ip))
+DEFINE_RTGROUP_REF_EVENT(xfs_rtgroup_get);
+DEFINE_RTGROUP_REF_EVENT(xfs_rtgroup_hold);
+DEFINE_RTGROUP_REF_EVENT(xfs_rtgroup_put);
+DEFINE_RTGROUP_REF_EVENT(xfs_rtgroup_grab);
+DEFINE_RTGROUP_REF_EVENT(xfs_rtgroup_rele);
+#endif /* CONFIG_XFS_RT */
+
 TRACE_EVENT(xfs_inodegc_worker,
 	TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
 	TP_ARGS(mp, shrinker_hits),