@@ -93,4 +93,10 @@ howmany_64(uint64_t x, uint32_t y)
return x;
}
+static inline __attribute__((const))
+int is_power_of_2(unsigned long n)
+{
+ return (n != 0 && ((n & (n - 1)) == 0));
+}
+
#endif /* LIBFROG_DIV64_H_ */
@@ -36,3 +36,15 @@ memchr_inv(const void *start, int c, size_t bytes)
return NULL;
}
+
+unsigned int
+log2_rounddown(unsigned long long i)
+{
+ int rval;
+
+ for (rval = NBBY * sizeof(i) - 1; rval >= 0; rval--) {
+ if ((1ULL << rval) < i)
+ break;
+ }
+ return rval;
+}
@@ -7,6 +7,7 @@
#define __LIBFROG_UTIL_H__
unsigned int log2_roundup(unsigned int i);
+unsigned int log2_rounddown(unsigned long long i);
void *memchr_inv(const void *start, int c, size_t bytes);
@@ -80,6 +80,7 @@
#define xfs_btree_update libxfs_btree_update
#define xfs_btree_space_to_height libxfs_btree_space_to_height
#define xfs_btree_visit_blocks libxfs_btree_visit_blocks
+#define xfs_buf_delwri_queue libxfs_buf_delwri_queue
#define xfs_buf_delwri_submit libxfs_buf_delwri_submit
#define xfs_buf_get libxfs_buf_get
#define xfs_buf_get_uncached libxfs_buf_get_uncached
@@ -337,12 +337,6 @@ find_next_zero_bit(const unsigned long *addr, unsigned long size,
}
#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0)
-static inline __attribute__((const))
-int is_power_of_2(unsigned long n)
-{
- return (n != 0 && ((n & (n - 1)) == 0));
-}
-
/*
* xfs_iroundup: round up argument to next power of two
*/
@@ -89,6 +89,48 @@ calc_default_ag_geometry(
*agcount = dblocks / blocks + (dblocks % blocks != 0);
}
+void
+calc_default_rtgroup_geometry(
+ int blocklog,
+ uint64_t rblocks,
+ uint64_t *rgsize,
+ uint64_t *rgcount)
+{
+ uint64_t blocks = 0;
+ int shift = 0;
+
+ /*
+ * For a single underlying storage device over 4TB in size use the
+ * maximum rtgroup size. Between 128MB and 4TB, just use 4 rtgroups
+ * and scale up smoothly between min/max rtgroup sizes.
+ */
+ if (rblocks >= TERABYTES(4, blocklog)) {
+ blocks = XFS_MAX_RGBLOCKS;
+ goto done;
+ }
+ if (rblocks >= MEGABYTES(128, blocklog)) {
+ shift = XFS_NOMULTIDISK_AGLOG;
+ goto calc_blocks;
+ }
+
+ /*
+ * If rblocks is not evenly divisible by the number of desired rt
+ * groups, round "blocks" up so we don't lose the last bit of the
+ * filesystem. The same principle applies to the rt group count, so we
+ * don't lose the last rt group!
+ */
+calc_blocks:
+ ASSERT(shift >= 0 && shift <= XFS_MULTIDISK_AGLOG);
+ blocks = rblocks >> shift;
+ if (rblocks & xfs_mask32lo(shift)) {
+ if (blocks < XFS_MAX_RGBLOCKS)
+ blocks++;
+ }
+done:
+ *rgsize = blocks;
+ *rgcount = rblocks / blocks + (rblocks % blocks != 0);
+}
+
/*
* Check for existing filesystem or partition table on device.
* Returns:
@@ -32,6 +32,9 @@ calc_default_ag_geometry(
uint64_t *agsize,
uint64_t *agcount);
+void calc_default_rtgroup_geometry(int blocklog, uint64_t rblocks,
+ uint64_t *rgsize, uint64_t *rgcount);
+
extern int
check_overwrite(
const char *device);
@@ -418,6 +418,7 @@ xfs_sb_has_ro_compat_feature(
XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
XFS_SB_FEAT_INCOMPAT_NREXT64| \
XFS_SB_FEAT_INCOMPAT_PARENT | \
+ XFS_SB_FEAT_INCOMPAT_RTGROUPS | \
XFS_SB_FEAT_INCOMPAT_METADIR)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
@@ -1116,6 +1116,50 @@ or logical volume containing the section.
.BI noalign
This option disables stripe size detection, enforcing a realtime device with no
stripe geometry.
+.TP
+.BI rtgroups= value
+This feature breaks the realtime section into multiple allocation groups for
+improved scalability.
+This feature is only available if the metadata directory tree feature is
+enabled.
+.IP
+By default,
+.B mkfs.xfs
+will not enable this feature.
+If the option
+.B \-r rtgroups=0
+is used, the rt group feature is not supported and is disabled.
+.TP
+.BI rgcount=
+This is used to specify the number of allocation groups in the realtime
+section.
+The realtime section of the filesystem can be divided into allocation groups to
+improve the performance of XFS.
+More allocation groups imply that more parallelism can be achieved when
+allocating blocks.
+The minimum allocation group size is 2 realtime extents; the maximum size is
+2^31 blocks.
+The rt section of the filesystem is divided into
+.I value
+allocation groups (default value is scaled automatically based
+on the underlying device size).
+.TP
+.BI rgsize= value
+This is an alternative to using the
+.B rgcount
+suboption. The
+.I value
+is the desired size of the realtime allocation group expressed in bytes
+(usually using the
+.BR m " or " g
+suffixes).
+This value must be a multiple of the realtime extent size,
+must be at least two realtime extents, and no more than 2^31 blocks.
+The
+.B rgcount
+and
+.B rgsize
+suboptions are mutually exclusive.
.RE
.PP
.PD 0
@@ -1001,6 +1001,46 @@ rtsummary_init(
}
}
+static void
+rtfreesp_init_groups(
+ struct xfs_mount *mp)
+{
+ xfs_rgnumber_t rgno;
+ int error;
+
+ for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) {
+ struct xfs_trans *tp;
+ xfs_rtblock_t rtbno;
+ xfs_rtxnum_t start_rtx;
+ xfs_rtxnum_t next_rtx;
+
+ rtbno = xfs_rgbno_to_rtb(mp, rgno, mp->m_sb.sb_rextsize);
+ start_rtx = xfs_rtb_to_rtx(mp, rtbno);
+
+ rtbno = xfs_rgbno_to_rtb(mp, rgno + 1, 0);
+ next_rtx = xfs_rtb_to_rtx(mp, rtbno);
+ next_rtx = min(next_rtx, mp->m_sb.sb_rextents);
+
+ error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ 0, 0, 0, &tp);
+ if (error)
+ res_failed(error);
+
+ libxfs_trans_ijoin(tp, mp->m_rbmip, 0);
+ error = -libxfs_rtfree_extent(tp, start_rtx,
+ next_rtx - start_rtx);
+ if (error) {
+ fail(_("Error initializing the realtime space"),
+ error);
+ }
+ error = -libxfs_trans_commit(tp);
+ if (error)
+ fail(_("Initialization of the realtime space failed"),
+ error);
+
+ }
+}
+
/*
* Free the whole realtime area using transactions.
* Do one transaction per bitmap block.
@@ -1049,7 +1089,10 @@ rtinit(
rtbitmap_init(mp);
rtsummary_init(mp);
- rtfreesp_init(mp);
+ if (xfs_has_rtgroups(mp))
+ rtfreesp_init_groups(mp);
+ else
+ rtfreesp_init(mp);
}
static long
@@ -131,6 +131,9 @@ enum {
R_FILE,
R_NAME,
R_NOALIGN,
+ R_RTGROUPS,
+ R_RGCOUNT,
+ R_RGSIZE,
R_MAX_OPTS,
};
@@ -718,6 +721,9 @@ static struct opt_params ropts = {
[R_FILE] = "file",
[R_NAME] = "name",
[R_NOALIGN] = "noalign",
+ [R_RTGROUPS] = "rtgroups",
+ [R_RGCOUNT] = "rgcount",
+ [R_RGSIZE] = "rgsize",
[R_MAX_OPTS] = NULL,
},
.subopt_params = {
@@ -757,6 +763,27 @@ static struct opt_params ropts = {
.defaultval = 1,
.conflicts = { { NULL, LAST_CONFLICT } },
},
+ { .index = R_RTGROUPS,
+ .conflicts = { { NULL, LAST_CONFLICT } },
+ .minval = 0,
+ .maxval = 1,
+ .defaultval = 1,
+ },
+ { .index = R_RGCOUNT,
+ .conflicts = { { &dopts, R_RGSIZE },
+ { NULL, LAST_CONFLICT } },
+ .minval = 1,
+ .maxval = XFS_MAX_RGNUMBER,
+ .defaultval = SUBOPT_NEEDS_VAL,
+ },
+ { .index = R_RGSIZE,
+ .conflicts = { { &dopts, R_RGCOUNT },
+ { NULL, LAST_CONFLICT } },
+ .convert = true,
+ .minval = 0,
+ .maxval = (unsigned long long)XFS_MAX_RGBLOCKS << XFS_MAX_BLOCKSIZE_LOG,
+ .defaultval = SUBOPT_NEEDS_VAL,
+ },
},
};
@@ -922,6 +949,7 @@ struct cli_params {
/* parameters that depend on sector/block size being validated. */
char *dsize;
char *agsize;
+ char *rgsize;
char *dsu;
char *dirblocksize;
char *logsize;
@@ -943,6 +971,7 @@ struct cli_params {
/* parameters where 0 is not a valid value */
int64_t agcount;
+ int64_t rgcount;
int inodesize;
int inopblock;
int imaxpct;
@@ -999,6 +1028,9 @@ struct mkfs_params {
uint64_t agsize;
uint64_t agcount;
+ uint64_t rgsize;
+ uint64_t rgcount;
+
int imaxpct;
bool loginternal;
@@ -1055,7 +1087,8 @@ usage( void )
/* no-op info only */ [-N]\n\
/* prototype file */ [-p fname]\n\
/* quiet */ [-q]\n\
-/* realtime subvol */ [-r extsize=num,size=num,rtdev=xxx]\n\
+/* realtime subvol */ [-r extsize=num,size=num,rtdev=xxx,rtgroups=0|1,\n\
+ rgcount=n,rgsize=n]\n\
/* sectorsize */ [-s size=num]\n\
/* version */ [-V]\n\
devicename\n\
@@ -1952,6 +1985,15 @@ rtdev_opts_parser(
case R_NOALIGN:
cli->sb_feat.nortalign = getnum(value, opts, subopt);
break;
+ case R_RTGROUPS:
+ cli->sb_feat.rtgroups = getnum(value, opts, subopt);
+ break;
+ case R_RGCOUNT:
+ cli->rgcount = getnum(value, opts, subopt);
+ break;
+ case R_RGSIZE:
+ cli->rgsize = getstr(value, opts, subopt);
+ break;
default:
return -EINVAL;
}
@@ -2447,6 +2489,15 @@ _("cowextsize not supported without reflink support\n"));
usage();
}
+ if (cli->sb_feat.rtgroups && !cli->sb_feat.metadir) {
+ if (cli_opt_set(&mopts, M_METADIR)) {
+ fprintf(stderr,
+_("realtime groups not supported without metadata directory support\n"));
+ usage();
+ }
+ cli->sb_feat.metadir = true;
+ }
+
/*
* Copy features across to config structure now.
*/
@@ -3413,6 +3464,181 @@ an AG size that is one stripe unit smaller or larger, for example %llu.\n"),
cfg->agsize, cfg->agcount);
}
+static uint64_t
+calc_rgsize_extsize_nonpower(
+ struct mkfs_params *cfg)
+{
+ uint64_t try_rgsize, rgsize, rgcount;
+
+ /*
+ * For non-power-of-two rt extent sizes, round the rtgroup size down to
+ * the nearest extent.
+ */
+ calc_default_rtgroup_geometry(cfg->blocklog, cfg->rtblocks, &rgsize,
+ &rgcount);
+ rgsize -= rgsize % cfg->rtextblocks;
+ rgsize = min(XFS_MAX_RGBLOCKS, rgsize);
+
+ /*
+ * If we would be left with a too-small rtgroup, increase or decrease
+ * the size of the group until we have a working geometry.
+ */
+ for (try_rgsize = rgsize;
+ try_rgsize <= XFS_MAX_RGBLOCKS - cfg->rtextblocks;
+ try_rgsize += cfg->rtextblocks) {
+ if (cfg->rtblocks % try_rgsize >= (2 * cfg->rtextblocks))
+ return try_rgsize;
+ }
+ for (try_rgsize = rgsize;
+ try_rgsize > (2 * cfg->rtextblocks);
+ try_rgsize -= cfg->rtextblocks) {
+ if (cfg->rtblocks % try_rgsize >= (2 * cfg->rtextblocks))
+ return try_rgsize;
+ }
+
+ fprintf(stderr,
+_("realtime group size (%llu) not at all congruent with extent size (%llu)\n"),
+ (unsigned long long)rgsize,
+ (unsigned long long)cfg->rtextblocks);
+ usage();
+ return 0;
+}
+
+static uint64_t
+calc_rgsize_extsize_power(
+ struct mkfs_params *cfg)
+{
+ uint64_t try_rgsize, rgsize, rgcount;
+ unsigned int rgsizelog;
+
+ /*
+ * Find the rt group size that is both a power of two and yields at
+ * least as many rt groups as the default geometry specified.
+ */
+ calc_default_rtgroup_geometry(cfg->blocklog, cfg->rtblocks, &rgsize,
+ &rgcount);
+ rgsizelog = log2_rounddown(rgsize);
+ rgsize = min(XFS_MAX_RGBLOCKS, 1U << rgsizelog);
+
+ /*
+ * If we would be left with a too-small rtgroup, increase or decrease
+ * the size of the group by powers of 2 until we have a working
+ * geometry. If that doesn't work, try bumping by the extent size.
+ */
+ for (try_rgsize = rgsize;
+ try_rgsize <= XFS_MAX_RGBLOCKS - cfg->rtextblocks;
+ try_rgsize <<= 2) {
+ if (cfg->rtblocks % try_rgsize >= (2 * cfg->rtextblocks))
+ return try_rgsize;
+ }
+ for (try_rgsize = rgsize;
+ try_rgsize > (2 * cfg->rtextblocks);
+ try_rgsize >>= 2) {
+ if (cfg->rtblocks % try_rgsize >= (2 * cfg->rtextblocks))
+ return try_rgsize;
+ }
+ for (try_rgsize = rgsize;
+ try_rgsize <= XFS_MAX_RGBLOCKS - cfg->rtextblocks;
+ try_rgsize += cfg->rtextblocks) {
+ if (cfg->rtblocks % try_rgsize >= (2 * cfg->rtextblocks))
+ return try_rgsize;
+ }
+ for (try_rgsize = rgsize;
+ try_rgsize > (2 * cfg->rtextblocks);
+ try_rgsize -= cfg->rtextblocks) {
+ if (cfg->rtblocks % try_rgsize >= (2 * cfg->rtextblocks))
+ return try_rgsize;
+ }
+
+ fprintf(stderr,
+_("realtime group size (%llu) not at all congruent with extent size (%llu)\n"),
+ (unsigned long long)rgsize,
+ (unsigned long long)cfg->rtextblocks);
+ usage();
+ return 0;
+}
+
+static void
+calculate_rtgroup_geometry(
+ struct mkfs_params *cfg,
+ struct cli_params *cli)
+{
+ if (!cli->sb_feat.rtgroups) {
+ cfg->rgcount = 0;
+ cfg->rgsize = 0;
+ return;
+ }
+
+ if (cli->rgsize) { /* User-specified rtgroup size */
+ cfg->rgsize = getnum(cli->rgsize, &ropts, R_RGSIZE);
+
+ /*
+ * Check specified agsize is a multiple of blocksize.
+ */
+ if (cfg->rgsize % cfg->blocksize) {
+ fprintf(stderr,
+_("rgsize (%s) not a multiple of fs blk size (%d)\n"),
+ cli->rgsize, cfg->blocksize);
+ usage();
+ }
+ cfg->rgsize /= cfg->blocksize;
+ cfg->rgcount = cfg->rtblocks / cfg->rgsize +
+ (cfg->rtblocks % cfg->rgsize != 0);
+
+ } else if (cli->rgcount) { /* User-specified rtgroup count */
+ cfg->rgcount = cli->rgcount;
+ cfg->rgsize = cfg->rtblocks / cfg->rgcount +
+ (cfg->rtblocks % cfg->rgcount != 0);
+ } else if (cfg->rtblocks == 0) {
+ /*
+ * If nobody specified a realtime device or the rtgroup size,
+ * try 1TB, rounded down to the nearest rt extent.
+ */
+ cfg->rgsize = TERABYTES(1, cfg->blocklog);
+ cfg->rgsize -= cfg->rgsize % cfg->rtextblocks;
+ cfg->rgcount = 0;
+ } else if (!is_power_of_2(cfg->rtextblocks)) {
+ cfg->rgsize = calc_rgsize_extsize_nonpower(cfg);
+ cfg->rgcount = cfg->rtblocks / cfg->rgsize +
+ (cfg->rtblocks % cfg->rgsize != 0);
+ } else {
+ cfg->rgsize = calc_rgsize_extsize_power(cfg);
+ cfg->rgcount = cfg->rtblocks / cfg->rgsize +
+ (cfg->rtblocks % cfg->rgsize != 0);
+ }
+
+ if (cfg->rgsize > XFS_MAX_RGBLOCKS) {
+ fprintf(stderr,
+_("realtime group size (%llu) must be less than the maximum (%u)\n"),
+ (unsigned long long)cfg->rgsize,
+ XFS_MAX_RGBLOCKS);
+ usage();
+ }
+
+ if (cfg->rgsize % cfg->rtextblocks != 0) {
+ fprintf(stderr,
+_("realtime group size (%llu) not a multiple of rt extent size (%llu)\n"),
+ (unsigned long long)cfg->rgsize,
+ (unsigned long long)cfg->rtextblocks);
+ usage();
+ }
+
+ if (cfg->rgsize <= cfg->rtextblocks) {
+ fprintf(stderr,
+_("realtime group size (%llu) must be at least two realtime extents\n"),
+ (unsigned long long)cfg->rgsize);
+ usage();
+ }
+
+ if (cfg->rgcount > XFS_MAX_RGNUMBER) {
+ fprintf(stderr,
+_("realtime group count (%llu) must be less than the maximum (%u)\n"),
+ (unsigned long long)cfg->rgcount,
+ XFS_MAX_RGNUMBER);
+ usage();
+ }
+}
+
static void
calculate_imaxpct(
struct mkfs_params *cfg,
@@ -3552,6 +3778,12 @@ sb_set_features(
if (fp->nrext64)
sbp->sb_features_incompat |= XFS_SB_FEAT_INCOMPAT_NREXT64;
+
+ if (fp->rtgroups) {
+ sbp->sb_features_incompat |= XFS_SB_FEAT_INCOMPAT_RTGROUPS;
+ sbp->sb_rgcount = cfg->rgcount;
+ sbp->sb_rgblocks = cfg->rgsize;
+ }
}
/*
@@ -4327,6 +4559,7 @@ main(
char **argv)
{
xfs_agnumber_t agno;
+ xfs_rgnumber_t rgno;
struct xfs_buf *buf;
int c;
int dry_run = 0;
@@ -4536,6 +4769,7 @@ main(
*/
calculate_initial_ag_geometry(&cfg, &cli, &xi);
align_ag_geometry(&cfg);
+ calculate_rtgroup_geometry(&cfg, &cli);
calculate_imaxpct(&cfg, &cli);
@@ -4636,6 +4870,42 @@ main(
exit(1);
}
+ /* Write all the realtime group superblocks. */
+ for (rgno = 0; rgno < cfg.rgcount; rgno++) {
+ struct xfs_buf *rtsb_bp;
+ struct xfs_buf *sb_bp = libxfs_getsb(mp);
+
+ if (!sb_bp) {
+ fprintf(stderr,
+ _("%s: couldn't grab buffers to write primary rt superblock\n"), progname);
+ exit(1);
+ }
+
+ error = -libxfs_buf_get_uncached(mp->m_rtdev_targp,
+ XFS_FSB_TO_BB(mp, 1), 0,
+ &rtsb_bp);
+ if (error) {
+ fprintf(stderr,
+ _("%s: couldn't grab primary rt superblock\n"), progname);
+ exit(1);
+ }
+ rtsb_bp->b_maps[0].bm_bn = XFS_RTSB_DADDR;
+ rtsb_bp->b_ops = &xfs_rtsb_buf_ops;
+
+ libxfs_rtgroup_update_super(rtsb_bp, sb_bp);
+ libxfs_buf_mark_dirty(rtsb_bp);
+ libxfs_buf_relse(rtsb_bp);
+ libxfs_buf_relse(sb_bp);
+
+ error = -libxfs_rtgroup_update_secondary_sbs(mp);
+ if (error) {
+ fprintf(stderr,
+ _("%s: writing secondary rtgroup headers failed, err=%d\n"),
+ progname, error);
+ exit(1);
+ }
+ }
+
/*
* Initialise the freespace freelists (i.e. AGFLs) in each AG.
*/