Message ID | 20250206064511.2323878-11-hch@lst.de (mailing list archive) |
---|---|
State | Not Applicable, archived |
Series | [01/43] xfs: factor out a xfs_rt_check_size helper |
On Thu, Feb 06, 2025 at 07:44:26AM +0100, Christoph Hellwig wrote: > Currently each metabtree inode has it's own space reservation to ensure > it can be expanded to the maximum size. This is not very efficient as it > requires a large number of blocks to be set aside that can't be used at > all by other btrees. > > Switch to a model that uses a global pool instead in preparation for > reducing the amount of reserved space. ...because inodes draw from the global pool, so there's no reason to compartmentalize each rt rmap btree's reservation like we do for per-AG rmap btrees, right? > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > fs/xfs/libxfs/xfs_metafile.c | 164 ++++++++++++++++++++----------- > fs/xfs/libxfs/xfs_metafile.h | 6 +- > fs/xfs/scrub/reap.c | 12 ++- > fs/xfs/scrub/repair.c | 54 ---------- > fs/xfs/scrub/repair.h | 1 - > fs/xfs/scrub/rtrefcount_repair.c | 34 ++----- > fs/xfs/scrub/rtrmap_repair.c | 29 +----- > fs/xfs/xfs_fsops.c | 19 ++-- > fs/xfs/xfs_inode.h | 16 +-- > fs/xfs/xfs_mount.h | 9 ++ > fs/xfs/xfs_reflink.c | 12 +-- > fs/xfs/xfs_rtalloc.c | 43 +------- > fs/xfs/xfs_rtalloc.h | 5 - > fs/xfs/xfs_super.c | 1 + > fs/xfs/xfs_trace.h | 23 ++--- > 15 files changed, 160 insertions(+), 268 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c > index 7625e694eb8d..88f011750add 100644 > --- a/fs/xfs/libxfs/xfs_metafile.c > +++ b/fs/xfs/libxfs/xfs_metafile.c > @@ -21,6 +21,9 @@ > #include "xfs_errortag.h" > #include "xfs_error.h" > #include "xfs_alloc.h" > +#include "xfs_rtgroup.h" > +#include "xfs_rtrmap_btree.h" > +#include "xfs_rtrefcount_btree.h" > > static const struct { > enum xfs_metafile_type mtype; > @@ -74,12 +77,11 @@ xfs_metafile_clear_iflag( > } > > /* > - * Is the amount of space that could be allocated towards a given metadata > - * file at or beneath a certain threshold? > + * Is the metafile reservations at or beneath a certain threshold? > */ > static inline bool > xfs_metafile_resv_can_cover( > - struct xfs_inode *ip, > + struct xfs_mount *mp, > int64_t rhs) > { > /* > @@ -88,43 +90,38 @@ xfs_metafile_resv_can_cover( > * global free block count. Take care of the first case to avoid > * touching the per-cpu counter. > */ > - if (ip->i_delayed_blks >= rhs) > + if (mp->m_metafile_resv_avail >= rhs) > return true; > > /* > * There aren't enough blocks left in the inode's reservation, but it > * isn't critical unless there also isn't enough free space. > */ > - return xfs_compare_freecounter(ip->i_mount, XC_FREE_BLOCKS, > - rhs - ip->i_delayed_blks, 2048) >= 0; > + return xfs_compare_freecounter(mp, XC_FREE_BLOCKS, > + rhs - mp->m_metafile_resv_avail, 2048) >= 0; > } > > /* > - * Is this metadata file critically low on blocks? For now we'll define that > - * as the number of blocks we can get our hands on being less than 10% of what > - * we reserved or less than some arbitrary number (maximum btree height). > + * Is the metafile reservation critically low on blocks? For now we'll define > + * that as the number of blocks we can get our hands on being less than 10% of > + * what we reserved or less than some arbitrary number (maximum btree height). 
> */ > bool > xfs_metafile_resv_critical( > - struct xfs_inode *ip) > + struct xfs_mount *mp) > { > - uint64_t asked_low_water; > + ASSERT(xfs_has_metadir(mp)); > > - if (!ip) > - return false; > - > - ASSERT(xfs_is_metadir_inode(ip)); > - trace_xfs_metafile_resv_critical(ip, 0); > + trace_xfs_metafile_resv_critical(mp, 0); > > - if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels)) > + if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels)) > return true; > > - asked_low_water = div_u64(ip->i_meta_resv_asked, 10); > - if (!xfs_metafile_resv_can_cover(ip, asked_low_water)) > + if (!xfs_metafile_resv_can_cover(mp, > + div_u64(mp->m_metafile_resv_target, 10))) > return true; > > - return XFS_TEST_ERROR(false, ip->i_mount, > - XFS_ERRTAG_METAFILE_RESV_CRITICAL); > + return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); > } > > /* Allocate a block from the metadata file's reservation. */ > @@ -133,22 +130,24 @@ xfs_metafile_resv_alloc_space( > struct xfs_inode *ip, > struct xfs_alloc_arg *args) > { > + struct xfs_mount *mp = ip->i_mount; > int64_t len = args->len; > > ASSERT(xfs_is_metadir_inode(ip)); > ASSERT(args->resv == XFS_AG_RESV_METAFILE); > > - trace_xfs_metafile_resv_alloc_space(ip, args->len); > + trace_xfs_metafile_resv_alloc_space(mp, args->len); > > /* > * Allocate the blocks from the metadata inode's block reservation > * and update the ondisk sb counter. > */ > - if (ip->i_delayed_blks > 0) { > + mutex_lock(&mp->m_metafile_resv_lock); > + if (mp->m_metafile_resv_avail > 0) { > int64_t from_resv; > > - from_resv = min_t(int64_t, len, ip->i_delayed_blks); > - ip->i_delayed_blks -= from_resv; > + from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail); > + mp->m_metafile_resv_avail -= from_resv; > xfs_mod_delalloc(ip, 0, -from_resv); > xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, > -from_resv); > @@ -175,6 +174,9 @@ xfs_metafile_resv_alloc_space( > xfs_trans_mod_sb(args->tp, field, -len); > } > > + mp->m_metafile_resv_used += args->len; > + mutex_unlock(&mp->m_metafile_resv_lock); > + > ip->i_nblocks += args->len; > xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE); > } > @@ -186,26 +188,33 @@ xfs_metafile_resv_free_space( > struct xfs_trans *tp, > xfs_filblks_t len) > { > + struct xfs_mount *mp = ip->i_mount; > int64_t to_resv; > > ASSERT(xfs_is_metadir_inode(ip)); > - trace_xfs_metafile_resv_free_space(ip, len); > + > + trace_xfs_metafile_resv_free_space(mp, len); > > ip->i_nblocks -= len; > xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); > > + mutex_lock(&mp->m_metafile_resv_lock); > + mp->m_metafile_resv_used -= len; > + > /* > * Add the freed blocks back into the inode's delalloc reservation > * until it reaches the maximum size. Update the ondisk fdblocks only. > */ > - to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks); > + to_resv = mp->m_metafile_resv_target - > + (mp->m_metafile_resv_used + mp->m_metafile_resv_avail); This separates accounting of the usage of the metadata btree (e.g. rtrmapbt) from i_blocks, which is a convenient way to fix an accounting bug if ever there's a metadir inode with a big enough xattr structure that it blows out into a real ondisk block. So far I don't think that happens in practice, but it's not impossible. Clearly, the mapping of before to after is: i_meta_resv_asked -> m_metafile_resv_target i_nblocks -> m_metafile_resv_used i_delayed_blks -> m_metafile_resv_avail Makes sense. 
And I guess we still consider the reservations as "delayed allocations" in the sense that we subtract from fdblocks and put the same quantity into m_delalloc_blks. > if (to_resv > 0) { > to_resv = min_t(int64_t, to_resv, len); > - ip->i_delayed_blks += to_resv; > + mp->m_metafile_resv_avail += to_resv; > xfs_mod_delalloc(ip, 0, to_resv); > xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv); > len -= to_resv; > } > + mutex_unlock(&mp->m_metafile_resv_lock); > > /* > * Everything else goes back to the filesystem, so update the in-core > @@ -215,61 +224,96 @@ xfs_metafile_resv_free_space( > xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len); > } > > -/* Release a metadata file's space reservation. */ > +static void > +__xfs_metafile_resv_free( > + struct xfs_mount *mp) > +{ > + if (mp->m_metafile_resv_avail) { > + xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail); > + xfs_add_fdblocks(mp, mp->m_metafile_resv_avail); > + } > + mp->m_metafile_resv_avail = 0; > + mp->m_metafile_resv_used = 0; > + mp->m_metafile_resv_target = 0; > +} > + > +/* Release unused metafile space reservation. */ > void > xfs_metafile_resv_free( > - struct xfs_inode *ip) > + struct xfs_mount *mp) > { > - /* Non-btree metadata inodes don't need space reservations. */ > - if (!ip || !ip->i_meta_resv_asked) > + if (!xfs_has_metadir(mp)) > return; > > - ASSERT(xfs_is_metadir_inode(ip)); > - trace_xfs_metafile_resv_free(ip, 0); > + trace_xfs_metafile_resv_free(mp, 0); > > - if (ip->i_delayed_blks) { > - xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks); > - xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks); > - ip->i_delayed_blks = 0; > - } > - ip->i_meta_resv_asked = 0; > + mutex_lock(&mp->m_metafile_resv_lock); > + __xfs_metafile_resv_free(mp); > + mutex_unlock(&mp->m_metafile_resv_lock); > } > > -/* Set up a metadata file's space reservation. */ > +/* Set up a metafile space reservation. */ > int > xfs_metafile_resv_init( > - struct xfs_inode *ip, > - xfs_filblks_t ask) > + struct xfs_mount *mp) > { > + struct xfs_rtgroup *rtg = NULL; > + xfs_filblks_t used = 0, target = 0; > xfs_filblks_t hidden_space; > - xfs_filblks_t used; > - int error; > + int error = 0; > > - if (!ip || ip->i_meta_resv_asked > 0) > + if (!xfs_has_metadir(mp)) > return 0; > > - ASSERT(xfs_is_metadir_inode(ip)); > + /* > + * Free any previous reservation to have a clean slate. > + */ > + mutex_lock(&mp->m_metafile_resv_lock); > + __xfs_metafile_resv_free(mp); > + > + /* > + * Currently the only btree metafiles that require reservations are the > + * rtrmap and the rtrefcount. Anything new will have to be added here > + * as well. > + */ > + while ((rtg = xfs_rtgroup_next(mp, rtg))) { > + if (xfs_has_rtrmapbt(mp)) { > + used += rtg_rmap(rtg)->i_nblocks; > + target += xfs_rtrmapbt_calc_reserves(mp); > + } > + if (xfs_has_rtreflink(mp)) { > + used += rtg_refcount(rtg)->i_nblocks; > + target += xfs_rtrefcountbt_calc_reserves(mp); > + } > + } > + > + if (!target) > + goto out_unlock; > > /* > - * Space taken by all other metadata btrees are accounted on-disk as > + * Space taken by the per-AG metadata btrees are accounted on-disk as > * used space. We therefore only hide the space that is reserved but > * not used by the trees. 
> */ > - used = ip->i_nblocks; > - if (used > ask) > - ask = used; > - hidden_space = ask - used; > + if (used > target) > + target = used; > + hidden_space = target - used; > > - error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true); > + error = xfs_dec_fdblocks(mp, hidden_space, true); > if (error) { > - trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_); > - return error; > + trace_xfs_metafile_resv_init_error(mp, 0); > + goto out_unlock; > } > > - xfs_mod_delalloc(ip, 0, hidden_space); > - ip->i_delayed_blks = hidden_space; > - ip->i_meta_resv_asked = ask; > + xfs_mod_sb_delalloc(mp, hidden_space); > + > + mp->m_metafile_resv_target = target; > + mp->m_metafile_resv_used = used; > + mp->m_metafile_resv_avail = hidden_space; > + > + trace_xfs_metafile_resv_init(mp, target); > > - trace_xfs_metafile_resv_init(ip, ask); > - return 0; > +out_unlock: > + mutex_unlock(&mp->m_metafile_resv_lock); > + return error; > } > diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h > index 95af4b52e5a7..ae6f9e779b98 100644 > --- a/fs/xfs/libxfs/xfs_metafile.h > +++ b/fs/xfs/libxfs/xfs_metafile.h > @@ -26,13 +26,13 @@ void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip); > /* Space reservations for metadata inodes. */ > struct xfs_alloc_arg; > > -bool xfs_metafile_resv_critical(struct xfs_inode *ip); > +bool xfs_metafile_resv_critical(struct xfs_mount *mp); > void xfs_metafile_resv_alloc_space(struct xfs_inode *ip, > struct xfs_alloc_arg *args); > void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp, > xfs_filblks_t len); > -void xfs_metafile_resv_free(struct xfs_inode *ip); > -int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask); > +void xfs_metafile_resv_free(struct xfs_mount *mp); > +int xfs_metafile_resv_init(struct xfs_mount *mp); > > /* Code specific to kernel/userspace; must be provided externally. */ > > diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c > index b32fb233cf84..cb66918832a9 100644 > --- a/fs/xfs/scrub/reap.c > +++ b/fs/xfs/scrub/reap.c > @@ -932,13 +932,17 @@ xrep_reap_metadir_fsblocks( > xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); > > error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); > - if (error) > + if (error || !xreap_dirty(&rs)) > return error; > > - if (xreap_dirty(&rs)) > - return xrep_defer_finish(sc); > + error = xrep_defer_finish(sc); > + if (error) > + return error; > > - return 0; > + /* > + * Resize the reservations so that we don't fail to expand the btree. > + */ > + return xfs_metafile_resv_init(sc->mp); I'm not sure this fixes the reservation correctly -- let's say we have a busted btree inode with a 100 fsblock reservation that was using 20 fsb. At mount, we'll have called xfs_dec_fdblocks for 100 - 20 = 80 fsb: hidden_space = target - used; xfs_dec_fdblocks(mp, hidden_space, true); Then we rebuild it with a more compact 18 fsb btree. Now we run xfs_metafile_resv_init and it thinks it needs to call xfs_dec_fdblocks for 100 - 18 = 82 fsb. But we already reserved 80 for this file, so the correct thing to do here is to xfs_dec_fdblocks 2fsb from fdblocks, not another 82. --D > } > > /* > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c > index 3b5288d3ef4e..5cdaf82aee7b 100644 > --- a/fs/xfs/scrub/repair.c > +++ b/fs/xfs/scrub/repair.c > @@ -1380,57 +1380,3 @@ xrep_inode_set_nblocks( > xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, > delta); > } > - > -/* Reset the block reservation for a metadata inode. 
*/ > -int > -xrep_reset_metafile_resv( > - struct xfs_scrub *sc) > -{ > - struct xfs_inode *ip = sc->ip; > - int64_t delta; > - int error; > - > - delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked; > - if (delta == 0) > - return 0; > - > - /* > - * Too many blocks have been reserved, transfer some from the incore > - * reservation back to the filesystem. > - */ > - if (delta > 0) { > - int64_t give_back; > - > - give_back = min_t(uint64_t, delta, ip->i_delayed_blks); > - if (give_back > 0) { > - xfs_mod_delalloc(ip, 0, -give_back); > - xfs_add_fdblocks(ip->i_mount, give_back); > - ip->i_delayed_blks -= give_back; > - } > - > - return 0; > - } > - > - /* > - * Not enough reservation; try to take some blocks from the filesystem > - * to the metadata inode. @delta is negative here, so invert the sign. > - */ > - delta = -delta; > - error = xfs_dec_fdblocks(sc->mp, delta, true); > - while (error == -ENOSPC) { > - delta--; > - if (delta == 0) { > - xfs_warn(sc->mp, > -"Insufficient free space to reset space reservation for inode 0x%llx after repair.", > - ip->i_ino); > - return 0; > - } > - error = xfs_dec_fdblocks(sc->mp, delta, true); > - } > - if (error) > - return error; > - > - xfs_mod_delalloc(ip, 0, delta); > - ip->i_delayed_blks += delta; > - return 0; > -} > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h > index 823c00d1a502..342bae348b47 100644 > --- a/fs/xfs/scrub/repair.h > +++ b/fs/xfs/scrub/repair.h > @@ -186,7 +186,6 @@ void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp); > > bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops); > void xrep_inode_set_nblocks(struct xfs_scrub *sc, int64_t new_blocks); > -int xrep_reset_metafile_resv(struct xfs_scrub *sc); > > #else > > diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c > index 257cfb24beb4..983362447826 100644 > --- a/fs/xfs/scrub/rtrefcount_repair.c > +++ b/fs/xfs/scrub/rtrefcount_repair.c > @@ -697,32 +697,6 @@ xrep_rtrefc_build_new_tree( > return error; > } > > -/* > - * Now that we've logged the roots of the new btrees, invalidate all of the > - * old blocks and free them. > - */ > -STATIC int > -xrep_rtrefc_remove_old_tree( > - struct xrep_rtrefc *rr) > -{ > - int error; > - > - /* > - * Free all the extents that were allocated to the former rtrefcountbt > - * and aren't cross-linked with something else. > - */ > - error = xrep_reap_metadir_fsblocks(rr->sc, > - &rr->old_rtrefcountbt_blocks); > - if (error) > - return error; > - > - /* > - * Ensure the proper reservation for the rtrefcount inode so that we > - * don't fail to expand the btree. > - */ > - return xrep_reset_metafile_resv(rr->sc); > -} > - > /* Rebuild the rt refcount btree. */ > int > xrep_rtrefcountbt( > @@ -769,8 +743,12 @@ xrep_rtrefcountbt( > if (error) > goto out_bitmap; > > - /* Kill the old tree. */ > - error = xrep_rtrefc_remove_old_tree(rr); > + /* > + * Free all the extents that were allocated to the former rtrefcountbt > + * and aren't cross-linked with something else. > + */ > + error = xrep_reap_metadir_fsblocks(rr->sc, > + &rr->old_rtrefcountbt_blocks); > if (error) > goto out_bitmap; > > diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c > index f2fdd7a9fc24..fc2592c53af5 100644 > --- a/fs/xfs/scrub/rtrmap_repair.c > +++ b/fs/xfs/scrub/rtrmap_repair.c > @@ -810,28 +810,6 @@ xrep_rtrmap_build_new_tree( > > /* Reaping the old btree. */ > > -/* Reap the old rtrmapbt blocks. 
*/ > -STATIC int > -xrep_rtrmap_remove_old_tree( > - struct xrep_rtrmap *rr) > -{ > - int error; > - > - /* > - * Free all the extents that were allocated to the former rtrmapbt and > - * aren't cross-linked with something else. > - */ > - error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); > - if (error) > - return error; > - > - /* > - * Ensure the proper reservation for the rtrmap inode so that we don't > - * fail to expand the new btree. > - */ > - return xrep_reset_metafile_resv(rr->sc); > -} > - > static inline bool > xrep_rtrmapbt_want_live_update( > struct xchk_iscan *iscan, > @@ -995,8 +973,11 @@ xrep_rtrmapbt( > if (error) > goto out_records; > > - /* Kill the old tree. */ > - error = xrep_rtrmap_remove_old_tree(rr); > + /* > + * Free all the extents that were allocated to the former rtrmapbt and > + * aren't cross-linked with something else. > + */ > + error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); > if (error) > goto out_records; > > diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c > index 73275d7a6ec0..22d50f8acd92 100644 > --- a/fs/xfs/xfs_fsops.c > +++ b/fs/xfs/xfs_fsops.c > @@ -24,6 +24,7 @@ > #include "xfs_rtalloc.h" > #include "xfs_rtrmap_btree.h" > #include "xfs_rtrefcount_btree.h" > +#include "xfs_metafile.h" > > /* > * Write new AG headers to disk. Non-transactional, but need to be > @@ -561,15 +562,13 @@ xfs_fs_reserve_ag_blocks( > return error; > } > > - if (xfs_has_realtime(mp)) { > - err2 = xfs_rt_resv_init(mp); > - if (err2 && err2 != -ENOSPC) { > - xfs_warn(mp, > - "Error %d reserving realtime metadata reserve pool.", err2); > - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); > - } > + err2 = xfs_metafile_resv_init(mp); > + if (err2 && err2 != -ENOSPC) { > + xfs_warn(mp, > + "Error %d reserving realtime metadata reserve pool.", err2); > + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); > > - if (err2 && !error) > + if (!error) > error = err2; > } > > @@ -585,9 +584,7 @@ xfs_fs_unreserve_ag_blocks( > { > struct xfs_perag *pag = NULL; > > - if (xfs_has_realtime(mp)) > - xfs_rt_resv_free(mp); > - > + xfs_metafile_resv_free(mp); > while ((pag = xfs_perag_next(mp, pag))) > xfs_ag_resv_free(pag); > } > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index c08093a65352..1648dc5a8068 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -25,19 +25,9 @@ struct xfs_dquot; > typedef struct xfs_inode { > /* Inode linking and identification information. */ > struct xfs_mount *i_mount; /* fs mount struct ptr */ > - union { > - struct { > - struct xfs_dquot *i_udquot; /* user dquot */ > - struct xfs_dquot *i_gdquot; /* group dquot */ > - struct xfs_dquot *i_pdquot; /* project dquot */ > - }; > - > - /* > - * Space that has been set aside to accomodate expansions of a > - * metadata btree rooted in this file. 
> - */ > - uint64_t i_meta_resv_asked; > - }; > + struct xfs_dquot *i_udquot; /* user dquot */ > + struct xfs_dquot *i_gdquot; /* group dquot */ > + struct xfs_dquot *i_pdquot; /* project dquot */ > > /* Inode location stuff */ > xfs_ino_t i_ino; /* inode number (agno/agino)*/ > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h > index d4a57e2fdcc5..9bfb5e08715d 100644 > --- a/fs/xfs/xfs_mount.h > +++ b/fs/xfs/xfs_mount.h > @@ -263,6 +263,11 @@ typedef struct xfs_mount { > atomic_t m_agirotor; /* last ag dir inode alloced */ > atomic_t m_rtgrotor; /* last rtgroup rtpicked */ > > + struct mutex m_metafile_resv_lock; > + uint64_t m_metafile_resv_target; > + uint64_t m_metafile_resv_used; > + uint64_t m_metafile_resv_avail; > + > /* Memory shrinker to throttle and reprioritize inodegc */ > struct shrinker *m_inodegc_shrinker; > /* > @@ -737,5 +742,9 @@ int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature); > bool xfs_clear_incompat_log_features(struct xfs_mount *mp); > void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta, > int64_t ind_delta); > +static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta) > +{ > + percpu_counter_add(&mp->m_delalloc_blks, delta); > +} > > #endif /* __XFS_MOUNT_H__ */ > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c > index 3e778e077d09..fd65e5d7994a 100644 > --- a/fs/xfs/xfs_reflink.c > +++ b/fs/xfs/xfs_reflink.c > @@ -1207,15 +1207,9 @@ xfs_reflink_ag_has_free_space( > if (!xfs_has_rmapbt(mp)) > return 0; > if (XFS_IS_REALTIME_INODE(ip)) { > - struct xfs_rtgroup *rtg; > - xfs_rgnumber_t rgno; > - > - rgno = xfs_rtb_to_rgno(mp, fsb); > - rtg = xfs_rtgroup_get(mp, rgno); > - if (xfs_metafile_resv_critical(rtg_rmap(rtg))) > - error = -ENOSPC; > - xfs_rtgroup_put(rtg); > - return error; > + if (xfs_metafile_resv_critical(mp)) > + return -ENOSPC; > + return 0; > } > > agno = XFS_FSB_TO_AGNO(mp, fsb); > diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c > index 8da2498417f5..f5a0dbc46a14 100644 > --- a/fs/xfs/xfs_rtalloc.c > +++ b/fs/xfs/xfs_rtalloc.c > @@ -1398,8 +1398,7 @@ xfs_growfs_rt( > error = error2; > > /* Reset the rt metadata btree space reservations. */ > - xfs_rt_resv_free(mp); > - error2 = xfs_rt_resv_init(mp); > + error2 = xfs_metafile_resv_init(mp); > if (error2 && error2 != -ENOSPC) > error = error2; > } > @@ -1523,46 +1522,6 @@ xfs_rtalloc_reinit_frextents( > return 0; > } > > -/* Free space reservations for rt metadata inodes. */ > -void > -xfs_rt_resv_free( > - struct xfs_mount *mp) > -{ > - struct xfs_rtgroup *rtg = NULL; > - unsigned int i; > - > - while ((rtg = xfs_rtgroup_next(mp, rtg))) { > - for (i = 0; i < XFS_RTGI_MAX; i++) > - xfs_metafile_resv_free(rtg->rtg_inodes[i]); > - } > -} > - > -/* Reserve space for rt metadata inodes' space expansion. */ > -int > -xfs_rt_resv_init( > - struct xfs_mount *mp) > -{ > - struct xfs_rtgroup *rtg = NULL; > - xfs_filblks_t ask; > - int error = 0; > - > - while ((rtg = xfs_rtgroup_next(mp, rtg))) { > - int err2; > - > - ask = xfs_rtrmapbt_calc_reserves(mp); > - err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask); > - if (err2 && !error) > - error = err2; > - > - ask = xfs_rtrefcountbt_calc_reserves(mp); > - err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask); > - if (err2 && !error) > - error = err2; > - } > - > - return error; > -} > - > /* > * Read in the bmbt of an rt metadata inode so that we never have to load them > * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. 
Use > diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h > index 0d95b29092c9..78a690b489ed 100644 > --- a/fs/xfs/xfs_rtalloc.h > +++ b/fs/xfs/xfs_rtalloc.h > @@ -34,9 +34,6 @@ int /* error */ > xfs_rtmount_inodes( > struct xfs_mount *mp); /* file system mount structure */ > > -void xfs_rt_resv_free(struct xfs_mount *mp); > -int xfs_rt_resv_init(struct xfs_mount *mp); > - > /* > * Grow the realtime area of the filesystem. > */ > @@ -65,8 +62,6 @@ xfs_rtmount_init( > } > # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS)) > # define xfs_rtunmount_inodes(m) > -# define xfs_rt_resv_free(mp) ((void)0) > -# define xfs_rt_resv_init(mp) (0) > > static inline int > xfs_growfs_check_rtgeom(const struct xfs_mount *mp, > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > index 5c9a2a0826ff..4414c8542144 100644 > --- a/fs/xfs/xfs_super.c > +++ b/fs/xfs/xfs_super.c > @@ -2089,6 +2089,7 @@ xfs_init_fs_context( > for (i = 0; i < XG_TYPE_MAX; i++) > xa_init(&mp->m_groups[i].xa); > mutex_init(&mp->m_growlock); > + mutex_init(&mp->m_metafile_resv_lock); > INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); > INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); > mp->m_kobj.kobject.kset = xfs_kset; > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > index 740e0a8c3eca..a02129c202b2 100644 > --- a/fs/xfs/xfs_trace.h > +++ b/fs/xfs/xfs_trace.h > @@ -5605,11 +5605,10 @@ DEFINE_METADIR_EVENT(xfs_metadir_lookup); > /* metadata inode space reservations */ > > DECLARE_EVENT_CLASS(xfs_metafile_resv_class, > - TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), > - TP_ARGS(ip, len), > + TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), > + TP_ARGS(mp, len), > TP_STRUCT__entry( > __field(dev_t, dev) > - __field(xfs_ino_t, ino) > __field(unsigned long long, freeblks) > __field(unsigned long long, reserved) > __field(unsigned long long, asked) > @@ -5617,19 +5616,15 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, > __field(unsigned long long, len) > ), > TP_fast_assign( > - struct xfs_mount *mp = ip->i_mount; > - > __entry->dev = mp->m_super->s_dev; > - __entry->ino = ip->i_ino; > __entry->freeblks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); > - __entry->reserved = ip->i_delayed_blks; > - __entry->asked = ip->i_meta_resv_asked; > - __entry->used = ip->i_nblocks; > + __entry->reserved = mp->m_metafile_resv_avail; > + __entry->asked = mp->m_metafile_resv_target; > + __entry->used = mp->m_metafile_resv_used; > __entry->len = len; > ), > - TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu", > + TP_printk("dev %d:%d freeblks %llu resv %llu ask %llu used %llu len %llu", > MAJOR(__entry->dev), MINOR(__entry->dev), > - __entry->ino, > __entry->freeblks, > __entry->reserved, > __entry->asked, > @@ -5638,14 +5633,14 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, > ) > #define DEFINE_METAFILE_RESV_EVENT(name) \ > DEFINE_EVENT(xfs_metafile_resv_class, name, \ > - TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \ > - TP_ARGS(ip, len)) > + TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), \ > + TP_ARGS(mp, len)) > DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init); > DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free); > DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space); > DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space); > DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical); > -DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error); > +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init_error); > > #ifdef CONFIG_XFS_RT 
> TRACE_EVENT(xfs_growfs_check_rtgeom, > -- > 2.45.2 > >
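For readers following the accounting in the patch quoted above: the global pool keeps target, used, and avail counters on the xfs_mount, with allocations drawing down avail first and frees refilling it only up to the target. Below is a minimal, self-contained C sketch of that bookkeeping. The struct, helper names, and numbers are illustrative only; the real code in xfs_metafile.c additionally updates the transaction, the on-disk superblock counters, and the percpu delalloc/fdblocks counters under m_metafile_resv_lock.

```c
/*
 * Illustrative model of the global metafile reservation pool.  This is
 * a sketch only: no transactions, no percpu fdblocks counter, and no
 * locking.  Field names mirror the xfs_mount fields from the patch.
 */
#include <assert.h>
#include <stdint.h>

struct resv_pool {
	uint64_t target;	/* m_metafile_resv_target */
	uint64_t used;		/* m_metafile_resv_used   */
	uint64_t avail;		/* m_metafile_resv_avail  */
};

/* Allocating btree blocks consumes the reservation first. */
static void resv_alloc(struct resv_pool *p, uint64_t len)
{
	uint64_t from_resv = len < p->avail ? len : p->avail;

	p->avail -= from_resv;	/* the rest would come out of fdblocks */
	p->used += len;
}

/* Freed blocks refill the reservation, but only up to the target. */
static void resv_free(struct resv_pool *p, uint64_t len)
{
	uint64_t room;

	p->used -= len;
	room = p->target > p->used + p->avail ?
			p->target - (p->used + p->avail) : 0;
	if (room > len)
		room = len;
	p->avail += room;	/* the remainder returns to fdblocks */
}

int main(void)
{
	/* 50-block target, the btrees currently hold 10 blocks. */
	struct resv_pool p = { .target = 50, .used = 10, .avail = 40 };

	resv_alloc(&p, 5);			/* btree grows by 5 */
	assert(p.used == 15 && p.avail == 35);

	resv_free(&p, 10);			/* btree shrinks by 10 */
	assert(p.used == 5 && p.avail == 45);	/* back at the target */
	return 0;
}
```

Note that in this patch the target is still the sum of the per-rtgroup rtrmapbt/rtrefcountbt estimates; per the commit message, the switch to a single pool is preparation for shrinking that total in a later patch.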
On Thu, Feb 06, 2025 at 12:50:21PM -0800, Darrick J. Wong wrote: > On Thu, Feb 06, 2025 at 07:44:26AM +0100, Christoph Hellwig wrote: > > Currently each metabtree inode has it's own space reservation to ensure > > it can be expanded to the maximum size. This is not very efficient as it > > requires a large number of blocks to be set aside that can't be used at > > all by other btrees. > > > > Switch to a model that uses a global pool instead in preparation for > > reducing the amount of reserved space. > > ...because inodes draw from the global pool, so there's no reason to > compartmentalize each rt rmap btree's reservation like we do for per-AG > rmap btrees, right? Yes. > > - to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks); > > + to_resv = mp->m_metafile_resv_target - > > + (mp->m_metafile_resv_used + mp->m_metafile_resv_avail); > > This separates accounting of the usage of the metadata btree (e.g. > rtrmapbt) from i_blocks, which is a convenient way to fix an accounting > bug if ever there's a metadir inode with a big enough xattr structure > that it blows out into a real ondisk block. So far I don't think that > happens in practice, but it's not impossible. > Heh. I didn't think of that, but yes even if very theoretical at the moment. > Clearly, the mapping of before to after is: > > i_meta_resv_asked -> m_metafile_resv_target > i_nblocks -> m_metafile_resv_used > i_delayed_blks -> m_metafile_resv_avail > > Makes sense. And I guess we still consider the reservations as "delayed > allocations" in the sense that we subtract from fdblocks and put the > same quantity into m_delalloc_blks. Yes. > > @@ -932,13 +932,17 @@ xrep_reap_metadir_fsblocks( > > xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); ... > > + /* > > + * Resize the reservations so that we don't fail to expand the btree. > > + */ > > + return xfs_metafile_resv_init(sc->mp); > > I'm not sure this fixes the reservation correctly -- let's say we have a > busted btree inode with a 100 fsblock reservation that was using 20 fsb. > At mount, we'll have called xfs_dec_fdblocks for 100 - 20 = 80 fsb: > > hidden_space = target - used; > xfs_dec_fdblocks(mp, hidden_space, true); > > Then we rebuild it with a more compact 18 fsb btree. Now we run > xfs_metafile_resv_init and it thinks it needs to call xfs_dec_fdblocks > for 100 - 18 = 82 fsb. But we already reserved 80 for this file, so the > correct thing to do here is to xfs_dec_fdblocks 2fsb from fdblocks, not > another 82. Hmm, yes. If only we had a good test for this :)
On Fri, Feb 07, 2025 at 05:18:48AM +0100, Christoph Hellwig wrote: > On Thu, Feb 06, 2025 at 12:50:21PM -0800, Darrick J. Wong wrote: > > On Thu, Feb 06, 2025 at 07:44:26AM +0100, Christoph Hellwig wrote: > > > Currently each metabtree inode has it's own space reservation to ensure > > > it can be expanded to the maximum size. This is not very efficient as it > > > requires a large number of blocks to be set aside that can't be used at > > > all by other btrees. > > > > > > Switch to a model that uses a global pool instead in preparation for > > > reducing the amount of reserved space. > > > > ...because inodes draw from the global pool, so there's no reason to > > compartmentalize each rt rmap btree's reservation like we do for per-AG > > rmap btrees, right? > > Yes. > > > > - to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks); > > > + to_resv = mp->m_metafile_resv_target - > > > + (mp->m_metafile_resv_used + mp->m_metafile_resv_avail); > > > > This separates accounting of the usage of the metadata btree (e.g. > > rtrmapbt) from i_blocks, which is a convenient way to fix an accounting > > bug if ever there's a metadir inode with a big enough xattr structure > > that it blows out into a real ondisk block. So far I don't think that > > happens in practice, but it's not impossible. > > > > Heh. I didn't think of that, but yes even if very theoretical at the > moment. > > > Clearly, the mapping of before to after is: > > > > i_meta_resv_asked -> m_metafile_resv_target > > i_nblocks -> m_metafile_resv_used > > i_delayed_blks -> m_metafile_resv_avail > > > > Makes sense. And I guess we still consider the reservations as "delayed > > allocations" in the sense that we subtract from fdblocks and put the > > same quantity into m_delalloc_blks. > > Yes. > > > > @@ -932,13 +932,17 @@ xrep_reap_metadir_fsblocks( > > > xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); > > ... > > > > + /* > > > + * Resize the reservations so that we don't fail to expand the btree. > > > + */ > > > + return xfs_metafile_resv_init(sc->mp); > > > > I'm not sure this fixes the reservation correctly -- let's say we have a > > busted btree inode with a 100 fsblock reservation that was using 20 fsb. > > At mount, we'll have called xfs_dec_fdblocks for 100 - 20 = 80 fsb: > > > > hidden_space = target - used; > > xfs_dec_fdblocks(mp, hidden_space, true); > > > > Then we rebuild it with a more compact 18 fsb btree. Now we run > > xfs_metafile_resv_init and it thinks it needs to call xfs_dec_fdblocks > > for 100 - 18 = 82 fsb. But we already reserved 80 for this file, so the > > correct thing to do here is to xfs_dec_fdblocks 2fsb from fdblocks, not > > another 82. > > Hmm, yes. If only we had a good test for this :) You could populate an rt filesystem with a lot of small mappings, then use magic debug switches to xfs_repair to rebuild the rt btrees with half full leaf and node blocks, and then force online repair to rebuild it. Then it'll rebuild them with 75% full blocks. Come to think of it, IIRC online repair also has secret debug knobs for that kind of thing. --D
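To keep the arithmetic of the repair scenario discussed above in one place, here it is as a tiny standalone check, using the numbers from Darrick's example (a 100-block target, 20 blocks used at mount, 18 after the rebuild). This only restates the example; whether xfs_metafile_resv_init actually ends up hiding the 80 blocks a second time depends on how the previous reservation is released before re-initialization, which is the open question in the thread.

```c
/* Worked numbers for the re-reservation concern raised in the review. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t target = 100;		/* reservation target for the btree */
	uint64_t used_at_mount = 20;	/* blocks in the (damaged) btree    */
	uint64_t used_after_repair = 18;/* blocks in the rebuilt btree      */

	/* At mount, resv_init hides target - used blocks from fdblocks. */
	uint64_t hidden_at_mount = target - used_at_mount;	  /* 80 */

	/* Re-running init from scratch would want to hide this much.   */
	uint64_t hidden_on_reinit = target - used_after_repair;  /* 82 */

	/* Only the difference should come out of fdblocks after repair. */
	uint64_t delta = hidden_on_reinit - hidden_at_mount;	  /* 2 */

	assert(hidden_at_mount == 80);
	assert(hidden_on_reinit == 82);
	assert(delta == 2);
	return 0;
}
```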