Message ID | 1483107598-6779-1-git-send-email-hch@lst.de (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
On Fri, Dec 30, 2016 at 03:19:58PM +0100, Christoph Hellwig wrote: > Currently we try to rely on the global reserved block pool for block > allocations for the free inode btree, but I have customer reports > (fairly complex workload, need to find an easier reproducer) where that > is not enough as the AG where we free an inode that requires a new > finobt block is entirely full. This causes us to cancel a dirty > transaction and thus a file system shutdown. > > I think the right way to guard against this is to treat the finot the same > way as the refcount btree and have a per-AG reservations for the possible > worst case size of it, and the patch below implements that. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > fs/xfs/libxfs/xfs_ag_resv.c | 6 +++ > fs/xfs/libxfs/xfs_ialloc.h | 1 - > fs/xfs/libxfs/xfs_ialloc_btree.c | 96 ++++++++++++++++++++++++++++++++++++++-- > fs/xfs/libxfs/xfs_ialloc_btree.h | 3 ++ > fs/xfs/libxfs/xfs_trans_space.h | 3 -- > fs/xfs/xfs_inode.c | 25 ++--------- > 6 files changed, 105 insertions(+), 29 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c > index e5ebc37..592754b 100644 > --- a/fs/xfs/libxfs/xfs_ag_resv.c > +++ b/fs/xfs/libxfs/xfs_ag_resv.c > @@ -39,6 +39,7 @@ > #include "xfs_rmap_btree.h" > #include "xfs_btree.h" > #include "xfs_refcount_btree.h" > +#include "xfs_ialloc_btree.h" > > /* > * Per-AG Block Reservations > @@ -236,6 +237,11 @@ xfs_ag_resv_init( > if (error) > goto out; > > + error = xfs_finobt_calc_reserves(pag->pag_mount, > + pag->pag_agno, &ask, &used); > + if (error) > + goto out; > + > error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, > ask, used); > if (error) > diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h > index 0bb8966..3f24725 100644 > --- a/fs/xfs/libxfs/xfs_ialloc.h > +++ b/fs/xfs/libxfs/xfs_ialloc.h > @@ -168,5 +168,4 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, > int xfs_read_agi(struct xfs_mount *mp, 
struct xfs_trans *tp, > xfs_agnumber_t agno, struct xfs_buf **bpp); > > - > #endif /* __XFS_IALLOC_H__ */ > diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c > index 0fd086d..3aba153 100644 > --- a/fs/xfs/libxfs/xfs_ialloc_btree.c > +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c > @@ -82,11 +82,12 @@ xfs_finobt_set_root( > } > > STATIC int > -xfs_inobt_alloc_block( > +__xfs_inobt_alloc_block( > struct xfs_btree_cur *cur, > union xfs_btree_ptr *start, > union xfs_btree_ptr *new, > - int *stat) > + int *stat, > + enum xfs_ag_resv_type resv) > { > xfs_alloc_arg_t args; /* block allocation args */ > int error; /* error return value */ > @@ -103,6 +104,7 @@ xfs_inobt_alloc_block( > args.maxlen = 1; > args.prod = 1; > args.type = XFS_ALLOCTYPE_NEAR_BNO; > + args.resv = resv; > > error = xfs_alloc_vextent(&args); > if (error) { > @@ -123,6 +125,27 @@ xfs_inobt_alloc_block( > } > > STATIC int > +xfs_inobt_alloc_block( > + struct xfs_btree_cur *cur, > + union xfs_btree_ptr *start, > + union xfs_btree_ptr *new, > + int *stat) > +{ > + return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE); > +} > + > +STATIC int > +xfs_finobt_alloc_block( > + struct xfs_btree_cur *cur, > + union xfs_btree_ptr *start, > + union xfs_btree_ptr *new, > + int *stat) > +{ > + return __xfs_inobt_alloc_block(cur, start, new, stat, > + XFS_AG_RESV_METADATA); > +} > + > +STATIC int > xfs_inobt_free_block( > struct xfs_btree_cur *cur, > struct xfs_buf *bp) > @@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { > > .dup_cursor = xfs_inobt_dup_cursor, > .set_root = xfs_finobt_set_root, > - .alloc_block = xfs_inobt_alloc_block, > + .alloc_block = xfs_finobt_alloc_block, > .free_block = xfs_inobt_free_block, > .get_minrecs = xfs_inobt_get_minrecs, > .get_maxrecs = xfs_inobt_get_maxrecs, > @@ -480,3 +503,70 @@ xfs_inobt_rec_check_count( > return 0; > } > #endif /* DEBUG */ > + > +static xfs_extlen_t > +xfs_inobt_calc_size( > + struct xfs_mount *mp, 
> + unsigned long long len) > +{ > + return xfs_btree_calc_size(mp, mp->m_inobt_mnr, len); This call figures out worst-case how many blocks we need to store a given number of records... > +} > + > +static xfs_extlen_t > +xfs_inobt_max_size( > + struct xfs_mount *mp) > +{ > + /* Bail out if we're uninitialized, which can happen in mkfs. */ > + if (mp->m_inobt_mxr[0] == 0) > + return 0; > + > + return xfs_inobt_calc_size(mp, mp->m_sb.sb_agblocks); ...and so here we calculate the number of blocks needed to store the maximum number of finobt records possible for an AG. IIRC, each *inobt record refers to a single chunk of 64 inodes (or at least a theoretical chunk in the spinodes=1 case), so I think we can reduce the reservation to... nr = m_sb.sb_agblocks * m_sb.sb_inopblock / XFS_INODES_PER_CHUNK; return xfs_inobt_calc_size(mp, nr); ...right? For rmap/refcount the maximum number of records is the number of blocks in the AG. > +} > + > +static int > +xfs_inobt_count_blocks( > + struct xfs_mount *mp, > + xfs_agnumber_t agno, > + xfs_btnum_t btnum, > + xfs_extlen_t *tree_blocks) > +{ > + struct xfs_buf *agbp; > + struct xfs_btree_cur *cur; > + int error; > + > + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); > + if (error) > + return error; > + > + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum); > + error = xfs_btree_count_blocks(cur, tree_blocks); > + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); > + xfs_buf_relse(agbp); > + > + return error; > +} > + > +/* > + * Figure out how many blocks to reserve and how many are used by this btree. 
> + */ > +int > +xfs_finobt_calc_reserves( > + struct xfs_mount *mp, > + xfs_agnumber_t agno, > + xfs_extlen_t *ask, > + xfs_extlen_t *used) > +{ > + xfs_extlen_t tree_len = 0; > + int error; > + > + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) > + return 0; > + > + error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len); This requires us to traverse all the blocks in the finobt at mount time, which isn't necessarily quick. For refcount/rmap we cache the number of tree blocks in the AGF to speed this up... but it was easy to sneak that into the disk format. :) For finobt I wonder if one could defer the block counting work to a separate thread if the AG has enough free blocks to cover, say, 10x the maximum reservation? Though that could be racy and maybe finobts are small enough that the impact on mount time is low anyway? There's also the unsolved problem of what happens if we mount and find agf_freeblks < (sum(ask) - sum(used)) -- right now we eat that state and hope that we don't later ENOSPC and crash. For reflink and rmap we will have always had the AG reservation and therefore it should never happen that we have fewer free blocks than reserved blocks. (Unless the user does something pathological like using CoW to create many billions of separate rmap records...) But as for retroactively adding AG reservations for an existing tree, I guess we'll have to come up with a strategy for dealing with insufficient free blocks. I suppose one could try to use xfs_fsr to move large contiguous extents to a less full AG, if there are any... (I actually hit the "insufficient freeblks for AG reservation" case over the holiday when I "upgraded" an XFS to rmap+reflink the foolish way and neglected to ensure the free space...) 
--D > + if (error) > + return error; > + > + *ask += xfs_inobt_max_size(mp); > + *used += tree_len; > + return 0; > +} > diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h > index bd88453..aa81e2e 100644 > --- a/fs/xfs/libxfs/xfs_ialloc_btree.h > +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h > @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, > #define xfs_inobt_rec_check_count(mp, rec) 0 > #endif /* DEBUG */ > > +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, > + xfs_extlen_t *ask, xfs_extlen_t *used); > + > #endif /* __XFS_IALLOC_BTREE_H__ */ > diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h > index 7917f6e..6db3e2d 100644 > --- a/fs/xfs/libxfs/xfs_trans_space.h > +++ b/fs/xfs/libxfs/xfs_trans_space.h > @@ -94,8 +94,5 @@ > (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) > #define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ > (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) > -#define XFS_IFREE_SPACE_RES(mp) \ > - (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0) > - > > #endif /* __XFS_TRANS_SPACE_H__ */ > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index b955779..0283ab6 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -1792,30 +1792,11 @@ xfs_inactive_ifree( > int error; > > /* > - * The ifree transaction might need to allocate blocks for record > - * insertion to the finobt. We don't want to fail here at ENOSPC, so > - * allow ifree to dip into the reserved block pool if necessary. > - * > - * Freeing large sets of inodes generally means freeing inode chunks, > - * directory and file data blocks, so this should be relatively safe. > - * Only under severe circumstances should it be possible to free enough > - * inodes to exhaust the reserve block pool via finobt expansion while > - * at the same time not creating free space in the filesystem. 
> - * > - * Send a warning if the reservation does happen to fail, as the inode > - * now remains allocated and sits on the unlinked list until the fs is > - * repaired. > + * We use a per-AG reservation for any block needed by the finobt tree. > */ > - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, > - XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); > + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp); > if (error) { > - if (error == -ENOSPC) { > - xfs_warn_ratelimited(mp, > - "Failed to remove inode(s) from unlinked list. " > - "Please free space, unmount and run xfs_repair."); > - } else { > - ASSERT(XFS_FORCED_SHUTDOWN(mp)); > - } > + ASSERT(XFS_FORCED_SHUTDOWN(mp)); > return error; > } > > -- > 2.1.4 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 3, 2017 at 9:24 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote: ... > > There's also the unsolved problem of what happens if we mount and find > agf_freeblks < (sum(ask) - sum(used)) -- right now we eat that state and > hope that we don't later ENOSPC and crash. For reflink and rmap we will > have always had the AG reservation and therefore it should never happen > that we have fewer free blocks than reserved blocks. (Unless the user > does something pathological like using CoW to create many billions of > separate rmap records...) > Darrick, Can you explain the "Unless the user..." part? Is it not possible to actively limit the user from getting to the pathologic case? If AG reservation size is a function of the maximum block refcount, then an operation that is about to increase the maximum block refcount to a size that will increase the worst case reservation beyond a certain percentage of the AG (or beyond available space for that matter) should be denied by a conservative ENOSPC. I imagine it would be much easier and also understandable from user POV to get a preventative ENOSPC for over cloning, then to get it some time in the far future for trying to delete or deduplicate blocks. Amir. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 03, 2017 at 11:24:26AM -0800, Darrick J. Wong wrote: > On Fri, Dec 30, 2016 at 03:19:58PM +0100, Christoph Hellwig wrote: > > Currently we try to rely on the global reserved block pool for block > > allocations for the free inode btree, but I have customer reports > > (fairly complex workload, need to find an easier reproducer) where that > > is not enough as the AG where we free an inode that requires a new > > finobt block is entirely full. This causes us to cancel a dirty > > transaction and thus a file system shutdown. > > > > I think the right way to guard against this is to treat the finot the same > > way as the refcount btree and have a per-AG reservations for the possible > > worst case size of it, and the patch below implements that. > > > > Signed-off-by: Christoph Hellwig <hch@lst.de> > > --- > > fs/xfs/libxfs/xfs_ag_resv.c | 6 +++ > > fs/xfs/libxfs/xfs_ialloc.h | 1 - > > fs/xfs/libxfs/xfs_ialloc_btree.c | 96 ++++++++++++++++++++++++++++++++++++++-- > > fs/xfs/libxfs/xfs_ialloc_btree.h | 3 ++ > > fs/xfs/libxfs/xfs_trans_space.h | 3 -- > > fs/xfs/xfs_inode.c | 25 ++--------- > > 6 files changed, 105 insertions(+), 29 deletions(-) > > ... > > diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c > > index 0fd086d..3aba153 100644 > > --- a/fs/xfs/libxfs/xfs_ialloc_btree.c > > +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c ... > > @@ -480,3 +503,70 @@ xfs_inobt_rec_check_count( ... > > +static int > > +xfs_inobt_count_blocks( > > + struct xfs_mount *mp, > > + xfs_agnumber_t agno, > > + xfs_btnum_t btnum, > > + xfs_extlen_t *tree_blocks) > > +{ > > + struct xfs_buf *agbp; > > + struct xfs_btree_cur *cur; > > + int error; > > + > > + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); > > + if (error) > > + return error; > > + > > + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum); > > + error = xfs_btree_count_blocks(cur, tree_blocks); > > + xfs_btree_del_cursor(cur, error ? 
XFS_BTREE_ERROR : XFS_BTREE_NOERROR); > > + xfs_buf_relse(agbp); > > + > > + return error; > > +} > > + > > +/* > > + * Figure out how many blocks to reserve and how many are used by this btree. > > + */ > > +int > > +xfs_finobt_calc_reserves( > > + struct xfs_mount *mp, > > + xfs_agnumber_t agno, > > + xfs_extlen_t *ask, > > + xfs_extlen_t *used) > > +{ > > + xfs_extlen_t tree_len = 0; > > + int error; > > + > > + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) > > + return 0; > > + > > + error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len); > > This requires us to traverse all the blocks in the finobt at mount time, > which isn't necessarily quick. For refcount/rmap we cache the number of > tree blocks in the AGF to speed this up... but it was easy to sneak that > into the disk format. :) > > For finobt I wonder if one could defer the block counting work to a > separate thread if the AG has enough free blocks to cover, say, 10x the > maximum reservation? Though that could be racy and maybe finobts are > small enough that the impact on mount time is low anyway? > While the finobt is a subset of the inobt, I don't think we can really assume it's small, at least enough that it wouldn't ever cause mount delays. It might require an uncommon workload to cause, but if we have one that causes this problem in the first place then I wouldn't rule it out. ;P How accurate does the reservation really need to be? I'm wondering if we could get away with a conservative estimate of how many blocks are used based on tree level or something (and/or perhaps use that as a heuristic to determine when to count vs. estimate). We reserve up to the theoretical max and may never consume those blocks with the finobt, so it seems to me we're already affecting when the user hits ENOSPC in most cases. I'd just be leery about how much further we could throw that off to the point that ENOSPC could occur annoyingly prematurely. 
> > There's also the unsolved problem of what happens if we mount and find > agf_freeblks < (sum(ask) - sum(used)) -- right now we eat that state and > hope that we don't later ENOSPC and crash. For reflink and rmap we will > have always had the AG reservation and therefore it should never happen > that we have fewer free blocks than reserved blocks. (Unless the user > does something pathological like using CoW to create many billions of > separate rmap records...) > This might be less severe/rare enough than the reflink use case so as to not require a hard failure. This is the first I've heard of such a condition since finobt was introduced (and I'm interested in a reproducer or workload description, if possible). Note that we also had similar behavior with the global reserve pool in low free space conditions and only discovered that bug by accident. :P Perhaps a mount time warning for the finobt-specific bit of the reservation would suffice? That could indicate to the user to free up space, grow the fs or risk the consequences. :P Taking a quick look, however, it looks like that could introduce problems for reflink=1 fs' because everything is pooled into one allocation request. So I guess even that would require some effort to support subsequent/additive reservation requests with different failure constraints. :/ Brian > But as for retroactively adding AG reservations for an existing tree, I > guess we'll have to come up with a strategy for dealing with > insufficient free blocks. I suppose one could try to use xfs_fsr to > move large contiguous extents to a less full AG, if there are any... > > (I actually hit the "insufficient freeblks for AG reservation" case over > the holiday when I "upgraded" an XFS to rmap+reflink the foolish way and > neglected to ensure the free space...) 
> > --D > > > + if (error) > > + return error; > > + > > + *ask += xfs_inobt_max_size(mp); > > + *used += tree_len; > > + return 0; > > +} > > diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h > > index bd88453..aa81e2e 100644 > > --- a/fs/xfs/libxfs/xfs_ialloc_btree.h > > +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h > > @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, > > #define xfs_inobt_rec_check_count(mp, rec) 0 > > #endif /* DEBUG */ > > > > +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, > > + xfs_extlen_t *ask, xfs_extlen_t *used); > > + > > #endif /* __XFS_IALLOC_BTREE_H__ */ > > diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h > > index 7917f6e..6db3e2d 100644 > > --- a/fs/xfs/libxfs/xfs_trans_space.h > > +++ b/fs/xfs/libxfs/xfs_trans_space.h > > @@ -94,8 +94,5 @@ > > (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) > > #define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ > > (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) > > -#define XFS_IFREE_SPACE_RES(mp) \ > > - (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0) > > - > > > > #endif /* __XFS_TRANS_SPACE_H__ */ > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > > index b955779..0283ab6 100644 > > --- a/fs/xfs/xfs_inode.c > > +++ b/fs/xfs/xfs_inode.c > > @@ -1792,30 +1792,11 @@ xfs_inactive_ifree( > > int error; > > > > /* > > - * The ifree transaction might need to allocate blocks for record > > - * insertion to the finobt. We don't want to fail here at ENOSPC, so > > - * allow ifree to dip into the reserved block pool if necessary. > > - * > > - * Freeing large sets of inodes generally means freeing inode chunks, > > - * directory and file data blocks, so this should be relatively safe. 
> > - * Only under severe circumstances should it be possible to free enough > > - * inodes to exhaust the reserve block pool via finobt expansion while > > - * at the same time not creating free space in the filesystem. > > - * > > - * Send a warning if the reservation does happen to fail, as the inode > > - * now remains allocated and sits on the unlinked list until the fs is > > - * repaired. > > + * We use a per-AG reservation for any block needed by the finobt tree. > > */ > > - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, > > - XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); > > + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp); > > if (error) { > > - if (error == -ENOSPC) { > > - xfs_warn_ratelimited(mp, > > - "Failed to remove inode(s) from unlinked list. " > > - "Please free space, unmount and run xfs_repair."); > > - } else { > > - ASSERT(XFS_FORCED_SHUTDOWN(mp)); > > - } > > + ASSERT(XFS_FORCED_SHUTDOWN(mp)); > > return error; > > } > > > > -- > > 2.1.4 > > > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jan 04, 2017 at 12:21:26PM +0200, Amir Goldstein wrote: > On Tue, Jan 3, 2017 at 9:24 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote: > ... > > > > There's also the unsolved problem of what happens if we mount and find > > agf_freeblks < (sum(ask) - sum(used)) -- right now we eat that state and > > hope that we don't later ENOSPC and crash. For reflink and rmap we will > > have always had the AG reservation and therefore it should never happen > > that we have fewer free blocks than reserved blocks. (Unless the user > > does something pathological like using CoW to create many billions of > > separate rmap records...) > > > > Darrick, > > Can you explain the "Unless the user..." part? > > Is it not possible to actively limit the user from getting to the > pathologic case? It's difficult to avoid the pathologic case for CoW operations. Say we have a shared extent in AG0, which is full. A CoW operation finds that AG0 is full and allocates the new blocks in AG1, but updating the reference counts might cause AG0's refcount btree to split, which we can't do because there are no free blocks left in AG0. > If AG reservation size is a function of the maximum block refcount, then > an operation that is about to increase the maximum block refcount to a size > that will increase the worst case reservation beyond a certain percentage > of the AG (or beyond available space for that matter) should be denied > by a conservative ENOSPC. For the refcount btree there's an upper bound on how many refcount records we'll ever need to store, so we introduced a per-AG block reservation system to hide blocks from the allocators unless the allocation request has the magic key. In other words, we permanently reserve all the blocks we'll ever need for the refcount tree, which prevents us ever from encountering ENOSPC. This costs us 0.6% of the filesystem. (So, yeah, XFS does what you outlined, though somewhat differently.) However, it's the rmap btree that's the problem. 
Any inode can reflink the same block to all possible file block offsets, which (roughly) means that (theoretically) we could need to store 2^63 * 2^51 * 2^31 = 2^145 records. That exceeds the size of any AG, so we can't just hide some blocks and expect that everything will be all right. In theory we could, for each CoW reservation, calculate the worst case per-AG btree block requirements for each AG in write_begin/page_mkwrite, fail with ENOSPC if there's not enough space, and track the reservation all the way to the conclusion in CoW remap. That's problematic if we keep CoW reservations around long term, which we do to reduce fragmentation -- rarely do we actually /need/ the worst case. Maybe we'll end up doing something like that some day, but for now the focus is on reducing fragmentation and preventing clones on really full AGs (see below) to try to keep the rmap size sane. By default we reserve enough space that each AG block can have its own rmap record. > I imagine it would be much easier and also understandable from user > POV to get a preventative ENOSPC for over cloning, then to get it > some time in the far future for trying to delete or deduplicate blocks. XFS sends the user a preemptive ENOSPC in response to a reflink request when the per-AG reservation dips below 10% or the maximum number of blocks needed for a full btree split in the hopes that the copy operation will fall back to regular copy. We also established a default CoW extent allocation size hint of 32 blocks to reduce fragmentation, which should reduce the pressure somewhat. (So, yes.) Keep in mind that reflink and rmap will be experimental for a while, so we can tweak the reservations and/or re-engineer deficient parts. The upcoming xfs_spaceman will make it easier to monitor which AGs are getting low on space while the fs is mounted. --D > > Amir. 
> -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 03, 2017 at 11:24:26AM -0800, Darrick J. Wong wrote: > ...and so here we calculate the number of blocks needed to store the > maximum number of finobt records possible for an AG. IIRC, each *inobt > record refers to a single chunk of 64 inodes (or at least a theoretical > chunk in the spinodes=1 case), so I think we can reduce the reservation > to... > > nr = m_sb.sb_agblocks * m_sb.sb_inopblock / XFS_INODES_PER_CHUNK; > return xfs_inobt_calc_size(mp, nr); > > ...right? Yes, that should reduce the reservation quite a bit. > This requires us to traverse all the blocks in the finobt at mount time, > which isn't necessarily quick. For refcount/rmap we cache the number of > tree blocks in the AGF to speed this up... but it was easy to sneak that > into the disk format. :) But for finobt it's too late to do that without another incompatible feature flag. > For finobt I wonder if one could defer the block counting work to a > separate thread if the AG has enough free blocks to cover, say, 10x the > maximum reservation? Though that could be racy and maybe finobts are > small enough that the impact on mount time is low anyway? Usually they are small. And if they aren't - well that's life. I don't think anync counting for a reservation is a good idea. If we see a problem with the time needed to count in practice we'll have to keep a count an introduce a feature flag. > There's also the unsolved problem of what happens if we mount and find > agf_freeblks < (sum(ask) - sum(used)) -- right now we eat that state and > hope that we don't later ENOSPC and crash. Yes. Which is exactly the situation we would have without this patch anyway.. > But as for retroactively adding AG reservations for an existing tree, I > guess we'll have to come up with a strategy for dealing with > insufficient free blocks. I suppose one could try to use xfs_fsr to > move large contiguous extents to a less full AG, if there are any... Eww. 
We could just fall back to the old code before this patch, which would then eventually shut down.. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 13, 2017 at 06:43:00PM +0100, Christoph Hellwig wrote: > On Tue, Jan 03, 2017 at 11:24:26AM -0800, Darrick J. Wong wrote: > > ...and so here we calculate the number of blocks needed to store the > > maximum number of finobt records possible for an AG. IIRC, each *inobt > > record refers to a single chunk of 64 inodes (or at least a theoretical > > chunk in the spinodes=1 case), so I think we can reduce the reservation > > to... > > > > nr = m_sb.sb_agblocks * m_sb.sb_inopblock / XFS_INODES_PER_CHUNK; > > return xfs_inobt_calc_size(mp, nr); > > > > ...right? > > Yes, that should reduce the reservation quite a bit. > > > This requires us to traverse all the blocks in the finobt at mount time, > > which isn't necessarily quick. For refcount/rmap we cache the number of > > tree blocks in the AGF to speed this up... but it was easy to sneak that > > into the disk format. :) > > But for finobt it's too late to do that without another incompatible > feature flag. Agree. > > For finobt I wonder if one could defer the block counting work to a > > separate thread if the AG has enough free blocks to cover, say, 10x the > > maximum reservation? Though that could be racy and maybe finobts are > > small enough that the impact on mount time is low anyway? > > Usually they are small. And if they aren't - well that's life. > > I don't think anync counting for a reservation is a good idea. If we > see a problem with the time needed to count in practice we'll have to > keep a count an introduce a feature flag. Yeah, that seems less tricky to get right. > > There's also the unsolved problem of what happens if we mount and find > > agf_freeblks < (sum(ask) - sum(used)) -- right now we eat that state and > > hope that we don't later ENOSPC and crash. > > Yes. Which is exactly the situation we would have without this > patch anyway.. 
> > > But as for retroactively adding AG reservations for an existing tree, I > > guess we'll have to come up with a strategy for dealing with > > insufficient free blocks. I suppose one could try to use xfs_fsr to > > move large contiguous extents to a less full AG, if there are any... > > Eww. We could just fall back to the old code before this patch, > which would then eventually shut down.. For now I'm tempted just to xfs_info a warning that some AG is low on space and this could potentially cause the fs to go down. It's not like we currently have the ability to silently move data out of an AG to spread the load more evenly. --D > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e5ebc37..592754b 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -39,6 +39,7 @@ #include "xfs_rmap_btree.h" #include "xfs_btree.h" #include "xfs_refcount_btree.h" +#include "xfs_ialloc_btree.h" /* * Per-AG Block Reservations @@ -236,6 +237,11 @@ xfs_ag_resv_init( if (error) goto out; + error = xfs_finobt_calc_reserves(pag->pag_mount, + pag->pag_agno, &ask, &used); + if (error) + goto out; + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, ask, used); if (error) diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 0bb8966..3f24725 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -168,5 +168,4 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); - #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 0fd086d..3aba153 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -82,11 +82,12 @@ xfs_finobt_set_root( } STATIC int -xfs_inobt_alloc_block( +__xfs_inobt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int *stat) + int *stat, + enum xfs_ag_resv_type resv) { xfs_alloc_arg_t args; /* block allocation args */ int error; /* error return value */ @@ -103,6 +104,7 @@ xfs_inobt_alloc_block( args.maxlen = 1; args.prod = 1; args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.resv = resv; error = xfs_alloc_vextent(&args); if (error) { @@ -123,6 +125,27 @@ xfs_inobt_alloc_block( } STATIC int +xfs_inobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE); +} + +STATIC int +xfs_finobt_alloc_block( + struct xfs_btree_cur *cur, + 
union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + return __xfs_inobt_alloc_block(cur, start, new, stat, + XFS_AG_RESV_METADATA); +} + +STATIC int xfs_inobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) @@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_finobt_set_root, - .alloc_block = xfs_inobt_alloc_block, + .alloc_block = xfs_finobt_alloc_block, .free_block = xfs_inobt_free_block, .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, @@ -480,3 +503,70 @@ xfs_inobt_rec_check_count( return 0; } #endif /* DEBUG */ + +static xfs_extlen_t +xfs_inobt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp, mp->m_inobt_mnr, len); +} + +static xfs_extlen_t +xfs_inobt_max_size( + struct xfs_mount *mp) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_inobt_mxr[0] == 0) + return 0; + + return xfs_inobt_calc_size(mp, mp->m_sb.sb_agblocks); +} + +static int +xfs_inobt_count_blocks( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_btnum_t btnum, + xfs_extlen_t *tree_blocks) +{ + struct xfs_buf *agbp; + struct xfs_btree_cur *cur; + int error; + + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + if (error) + return error; + + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum); + error = xfs_btree_count_blocks(cur, tree_blocks); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + + return error; +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. 
+ */ +int +xfs_finobt_calc_reserves( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_extlen_t *ask, + xfs_extlen_t *used) +{ + xfs_extlen_t tree_len = 0; + int error; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return 0; + + error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len); + if (error) + return error; + + *ask += xfs_inobt_max_size(mp); + *used += tree_len; + return 0; +} diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index bd88453..aa81e2e 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, #define xfs_inobt_rec_check_count(mp, rec) 0 #endif /* DEBUG */ +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_extlen_t *ask, xfs_extlen_t *used); + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 7917f6e..6db3e2d 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -94,8 +94,5 @@ (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) #define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) -#define XFS_IFREE_SPACE_RES(mp) \ - (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0) - #endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b955779..0283ab6 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1792,30 +1792,11 @@ xfs_inactive_ifree( int error; /* - * The ifree transaction might need to allocate blocks for record - * insertion to the finobt. We don't want to fail here at ENOSPC, so - * allow ifree to dip into the reserved block pool if necessary. - * - * Freeing large sets of inodes generally means freeing inode chunks, - * directory and file data blocks, so this should be relatively safe. 
- * Only under severe circumstances should it be possible to free enough - * inodes to exhaust the reserve block pool via finobt expansion while - * at the same time not creating free space in the filesystem. - * - * Send a warning if the reservation does happen to fail, as the inode - * now remains allocated and sits on the unlinked list until the fs is - * repaired. + * We use a per-AG reservation for any block needed by the finobt tree. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, - XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp); if (error) { - if (error == -ENOSPC) { - xfs_warn_ratelimited(mp, - "Failed to remove inode(s) from unlinked list. " - "Please free space, unmount and run xfs_repair."); - } else { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - } + ASSERT(XFS_FORCED_SHUTDOWN(mp)); return error; }
Currently we try to rely on the global reserved block pool for block allocations for the free inode btree, but I have customer reports (fairly complex workload, need to find an easier reproducer) where that is not enough as the AG where we free an inode that requires a new finobt block is entirely full. This causes us to cancel a dirty transaction and thus a file system shutdown. I think the right way to guard against this is to treat the finobt the same way as the refcount btree and have a per-AG reservation for the possible worst case size of it, and the patch below implements that. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/libxfs/xfs_ag_resv.c | 6 +++ fs/xfs/libxfs/xfs_ialloc.h | 1 - fs/xfs/libxfs/xfs_ialloc_btree.c | 96 ++++++++++++++++++++++++++++++++++++++-- fs/xfs/libxfs/xfs_ialloc_btree.h | 3 ++ fs/xfs/libxfs/xfs_trans_space.h | 3 -- fs/xfs/xfs_inode.c | 25 ++--------- 6 files changed, 105 insertions(+), 29 deletions(-)