Message ID | 1485194742-23185-1-git-send-email-hch@lst.de (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
On Mon, Jan 23, 2017 at 07:05:42PM +0100, Christoph Hellwig wrote: > Currently we try to rely on the global reserved block pool for block > allocations for the free inode btree, but I have customer reports > (fairly complex workload, need to find an easier reproducer) where that > is not enough as the AG where we free an inode that requires a new > finobt block is entirely full. This causes us to cancel a dirty > transaction and thus a file system shutdown. > > I think the right way to guard against this is to treat the finot the same > way as the refcount btree and have a per-AG reservations for the possible > worst case size of it, and the patch below implements that. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > > --- > > Changes since V1: > - reduce the size of the reservation > - warn and fall back to the old somewhat buggy code if we can't > get the reservation at mount time > - read the AGF before asserting based on values in it > --- > fs/xfs/libxfs/xfs_ag_resv.c | 47 ++++++++++++++++++--- > fs/xfs/libxfs/xfs_ialloc.h | 1 - > fs/xfs/libxfs/xfs_ialloc_btree.c | 90 ++++++++++++++++++++++++++++++++++++++-- > fs/xfs/libxfs/xfs_ialloc_btree.h | 3 ++ > fs/xfs/xfs_inode.c | 23 +++++----- > fs/xfs/xfs_mount.h | 1 + > 6 files changed, 144 insertions(+), 21 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c > index d346d42..4773c1e 100644 > --- a/fs/xfs/libxfs/xfs_ag_resv.c > +++ b/fs/xfs/libxfs/xfs_ag_resv.c > @@ -39,6 +39,7 @@ > #include "xfs_rmap_btree.h" > #include "xfs_btree.h" > #include "xfs_refcount_btree.h" > +#include "xfs_ialloc_btree.h" > > /* > * Per-AG Block Reservations > @@ -223,6 +224,8 @@ int > xfs_ag_resv_init( > struct xfs_perag *pag) > { > + struct xfs_mount *mp = pag->pag_mount; > + xfs_agnumber_t agno = pag->pag_agno; > xfs_extlen_t ask; > xfs_extlen_t used; > int error = 0; > @@ -231,23 +234,48 @@ xfs_ag_resv_init( > if (pag->pag_meta_resv.ar_asked == 0) { > ask = used = 0; > > - error = xfs_refcountbt_calc_reserves(pag->pag_mount, > - pag->pag_agno, &ask, &used); > + error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used); > if (error) > goto out; > > - error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, > - ask, used); > + error = xfs_finobt_calc_reserves(mp, agno, &ask, &used); > if (error) > goto out; > + > + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, > + ask, used); > + if (error) { > + /* > + * Because we didn't have per-AG reservations when the > + * finobt feature was added we might not ne able to > + * reservere all needed blocks. Warn and fall back to > + * the old and potentially buggy code in that case, > + * but ensure we do have the reservation for the > + * refcountbt. > + */ > + ask = used = 0; > + > + error = xfs_refcountbt_calc_reserves(mp, agno, &ask, > + &used); > + if (error) > + goto out; > + > + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, > + ask, used); But... _init decreases m_ag_max_usable prior to calling _mod_fdblocks, so double-calling _init decreases m_ag_max_usable twice. OTOH if the _mod_fdblocks call fails then m_fdblocks hasn't been updated, which means that we can't call _ag_resv_free to reset m_ag_max_usable prior to retrying _init. I think we could solve this by increasing m_ag_max_usable by ar_asked at the very top of __xfs_ag_resv_init. The perag structures are kzalloc'd, so this won't have any effect on a freshly mounting filesystem. I also want to warn more loudly about AGs that don't actually have enough space to handle the reservation. --D > + if (error) > + goto out; > + > + xfs_warn(mp, > +"Per-AG reservation for finobt failed. File System may run out of space.\n"); > + mp->m_inotbt_nores = true; > + } > } > > /* Create the AGFL metadata reservation */ > if (pag->pag_agfl_resv.ar_asked == 0) { > ask = used = 0; > > - error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, > - &ask, &used); > + error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used); > if (error) > goto out; > > @@ -256,9 +284,16 @@ xfs_ag_resv_init( > goto out; > } > > +#ifdef DEBUG > + /* need to read in the AGF for the ASSERT below to work */ > + error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0); > + if (error) > + return error; > + > ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + > xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <= > pag->pagf_freeblks + pag->pagf_flcount); > +#endif > out: > return error; > } > diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h > index 0bb8966..3f24725 100644 > --- a/fs/xfs/libxfs/xfs_ialloc.h > +++ b/fs/xfs/libxfs/xfs_ialloc.h > @@ -168,5 +168,4 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, > int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, > xfs_agnumber_t agno, struct xfs_buf **bpp); > > - > #endif /* __XFS_IALLOC_H__ */ > diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c > index 0fd086d..7c47188 100644 > --- a/fs/xfs/libxfs/xfs_ialloc_btree.c > +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c > @@ -82,11 +82,12 @@ xfs_finobt_set_root( > } > > STATIC int > -xfs_inobt_alloc_block( > +__xfs_inobt_alloc_block( > struct xfs_btree_cur *cur, > union xfs_btree_ptr *start, > union xfs_btree_ptr *new, > - int *stat) > + int *stat, > + enum xfs_ag_resv_type resv) > { > xfs_alloc_arg_t args; /* block allocation args */ > int error; /* error return value */ > @@ -103,6 +104,7 @@ xfs_inobt_alloc_block( > args.maxlen = 1; > args.prod = 1; > args.type = XFS_ALLOCTYPE_NEAR_BNO; > + args.resv = resv; > > error = xfs_alloc_vextent(&args); > if (error) { > @@ -123,6 +125,27 @@ xfs_inobt_alloc_block( > } > > STATIC int > +xfs_inobt_alloc_block( > + struct xfs_btree_cur *cur, > + union xfs_btree_ptr *start, > + union xfs_btree_ptr *new, > + int *stat) > +{ > + return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE); > +} > + > +STATIC int > +xfs_finobt_alloc_block( > + struct xfs_btree_cur *cur, > + union xfs_btree_ptr *start, > + union xfs_btree_ptr *new, > + int *stat) > +{ > + return __xfs_inobt_alloc_block(cur, start, new, stat, > + XFS_AG_RESV_METADATA); > +} > + > +STATIC int > xfs_inobt_free_block( > struct xfs_btree_cur *cur, > struct xfs_buf *bp) > @@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { > > .dup_cursor = xfs_inobt_dup_cursor, > .set_root = xfs_finobt_set_root, > - .alloc_block = xfs_inobt_alloc_block, > + .alloc_block = xfs_finobt_alloc_block, > .free_block = xfs_inobt_free_block, > .get_minrecs = xfs_inobt_get_minrecs, > .get_maxrecs = xfs_inobt_get_maxrecs, > @@ -480,3 +503,64 @@ xfs_inobt_rec_check_count( > return 0; > } > #endif /* DEBUG */ > + > +static xfs_extlen_t > +xfs_inobt_max_size( > + struct xfs_mount *mp) > +{ > + /* Bail out if we're uninitialized, which can happen in mkfs. */ > + if (mp->m_inobt_mxr[0] == 0) > + return 0; > + > + return xfs_btree_calc_size(mp, mp->m_inobt_mnr, > + (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / > + XFS_INODES_PER_CHUNK); > +} > + > +static int > +xfs_inobt_count_blocks( > + struct xfs_mount *mp, > + xfs_agnumber_t agno, > + xfs_btnum_t btnum, > + xfs_extlen_t *tree_blocks) > +{ > + struct xfs_buf *agbp; > + struct xfs_btree_cur *cur; > + int error; > + > + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); > + if (error) > + return error; > + > + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum); > + error = xfs_btree_count_blocks(cur, tree_blocks); > + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); > + xfs_buf_relse(agbp); > + > + return error; > +} > + > +/* > + * Figure out how many blocks to reserve and how many are used by this btree. > + */ > +int > +xfs_finobt_calc_reserves( > + struct xfs_mount *mp, > + xfs_agnumber_t agno, > + xfs_extlen_t *ask, > + xfs_extlen_t *used) > +{ > + xfs_extlen_t tree_len = 0; > + int error; > + > + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) > + return 0; > + > + error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len); > + if (error) > + return error; > + > + *ask += xfs_inobt_max_size(mp); > + *used += tree_len; > + return 0; > +} > diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h > index bd88453..aa81e2e 100644 > --- a/fs/xfs/libxfs/xfs_ialloc_btree.h > +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h > @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, > #define xfs_inobt_rec_check_count(mp, rec) 0 > #endif /* DEBUG */ > > +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, > + xfs_extlen_t *ask, xfs_extlen_t *used); > + > #endif /* __XFS_IALLOC_BTREE_H__ */ > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index b955779..de32f0f 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -1792,22 +1792,23 @@ xfs_inactive_ifree( > int error; > > /* > - * The ifree transaction might need to allocate blocks for record > - * insertion to the finobt. We don't want to fail here at ENOSPC, so > - * allow ifree to dip into the reserved block pool if necessary. > - * > - * Freeing large sets of inodes generally means freeing inode chunks, > - * directory and file data blocks, so this should be relatively safe. > - * Only under severe circumstances should it be possible to free enough > - * inodes to exhaust the reserve block pool via finobt expansion while > - * at the same time not creating free space in the filesystem. > + * We try to use a per-AG reservation for any block needed by the finobt > + * tree, but as the finobt feature predates the per-AG reservation > + * support a degraded file system might not have enough space for the > + * reservation at mount time. In that case try to dip into the reserved > + * pool and pray. > * > * Send a warning if the reservation does happen to fail, as the inode > * now remains allocated and sits on the unlinked list until the fs is > * repaired. > */ > - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, > - XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); > + if (unlikely(mp->m_inotbt_nores)) { > + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, > + XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, > + &tp); > + } else { > + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp); > + } > if (error) { > if (error == -ENOSPC) { > xfs_warn_ratelimited(mp, > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h > index 84f7852..7f351f7 100644 > --- a/fs/xfs/xfs_mount.h > +++ b/fs/xfs/xfs_mount.h > @@ -140,6 +140,7 @@ typedef struct xfs_mount { > int m_fixedfsid[2]; /* unchanged for life of FS */ > uint m_dmevmask; /* DMI events for this FS */ > __uint64_t m_flags; /* global mount flags */ > + bool m_inotbt_nores; /* no per-AG finobt resv. */ > int m_ialloc_inos; /* inodes in inode allocation */ > int m_ialloc_blks; /* blocks in inode allocation */ > int m_ialloc_min_blks;/* min blocks in sparse inode > -- > 2.1.4 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jan 23, 2017 at 11:28:47AM -0800, Darrick J. Wong wrote: > I also want to warn more loudly about AGs that don't actually have > enough space to handle the reservation. > > + xfs_warn(mp, > > +"Per-AG reservation for finobt failed. File System may run out of space.\n"); > > + mp->m_inotbt_nores = true; > > + } Any suggestion for a better warning than this one? Except for this bit I'm ready to resend the series. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jan 23, 2017 at 07:05:42PM +0100, Christoph Hellwig wrote: > Currently we try to rely on the global reserved block pool for block > allocations for the free inode btree, but I have customer reports > (fairly complex workload, need to find an easier reproducer) where that > is not enough as the AG where we free an inode that requires a new > finobt block is entirely full. This causes us to cancel a dirty > transaction and thus a file system shutdown. > > I think the right way to guard against this is to treat the finot the same > way as the refcount btree and have a per-AG reservations for the possible > worst case size of it, and the patch below implements that. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > > --- > > Changes since V1: > - reduce the size of the reservation > - warn and fall back to the old somewhat buggy code if we can't > get the reservation at mount time > - read the AGF before asserting based on values in it > --- > fs/xfs/libxfs/xfs_ag_resv.c | 47 ++++++++++++++++++--- > fs/xfs/libxfs/xfs_ialloc.h | 1 - > fs/xfs/libxfs/xfs_ialloc_btree.c | 90 ++++++++++++++++++++++++++++++++++++++-- > fs/xfs/libxfs/xfs_ialloc_btree.h | 3 ++ > fs/xfs/xfs_inode.c | 23 +++++----- > fs/xfs/xfs_mount.h | 1 + > 6 files changed, 144 insertions(+), 21 deletions(-) > ... > diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h > index 0bb8966..3f24725 100644 > --- a/fs/xfs/libxfs/xfs_ialloc.h > +++ b/fs/xfs/libxfs/xfs_ialloc.h ... > @@ -480,3 +503,64 @@ xfs_inobt_rec_check_count( ... > +static int > +xfs_inobt_count_blocks( > + struct xfs_mount *mp, > + xfs_agnumber_t agno, > + xfs_btnum_t btnum, > + xfs_extlen_t *tree_blocks) > +{ > + struct xfs_buf *agbp; > + struct xfs_btree_cur *cur; > + int error; > + > + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); > + if (error) > + return error; > + > + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum); > + error = xfs_btree_count_blocks(cur, tree_blocks); > + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); > + xfs_buf_relse(agbp); > + Darrick called out in the previous version that this requires traversal of the entire tree at mount time. Do you have any test results on what kind of worst case mount delays we could be looking at here? As mentioned in the previous version, I don't think we can just assume that the finobt is always going to be small. The purpose of the feature is to facilitate constant time lookups for free inode records in filesystems with extremely large inode counts. The number of free inodes at any particular time is probably just a question of allocation patterns. We can hope that the inode frees are not fragmented across records, but even then we still have to take the ikeep mount option into consideration (which is probably an easy way to test this, btw). Brian > + return error; > +} > + > +/* > + * Figure out how many blocks to reserve and how many are used by this btree. > + */ > +int > +xfs_finobt_calc_reserves( > + struct xfs_mount *mp, > + xfs_agnumber_t agno, > + xfs_extlen_t *ask, > + xfs_extlen_t *used) > +{ > + xfs_extlen_t tree_len = 0; > + int error; > + > + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) > + return 0; > + > + error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len); > + if (error) > + return error; > + > + *ask += xfs_inobt_max_size(mp); > + *used += tree_len; > + return 0; > +} > diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h > index bd88453..aa81e2e 100644 > --- a/fs/xfs/libxfs/xfs_ialloc_btree.h > +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h > @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, > #define xfs_inobt_rec_check_count(mp, rec) 0 > #endif /* DEBUG */ > > +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, > + xfs_extlen_t *ask, xfs_extlen_t *used); > + > #endif /* __XFS_IALLOC_BTREE_H__ */ > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index b955779..de32f0f 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -1792,22 +1792,23 @@ xfs_inactive_ifree( > int error; > > /* > - * The ifree transaction might need to allocate blocks for record > - * insertion to the finobt. We don't want to fail here at ENOSPC, so > - * allow ifree to dip into the reserved block pool if necessary. > - * > - * Freeing large sets of inodes generally means freeing inode chunks, > - * directory and file data blocks, so this should be relatively safe. > - * Only under severe circumstances should it be possible to free enough > - * inodes to exhaust the reserve block pool via finobt expansion while > - * at the same time not creating free space in the filesystem. > + * We try to use a per-AG reservation for any block needed by the finobt > + * tree, but as the finobt feature predates the per-AG reservation > + * support a degraded file system might not have enough space for the > + * reservation at mount time. In that case try to dip into the reserved > + * pool and pray. > * > * Send a warning if the reservation does happen to fail, as the inode > * now remains allocated and sits on the unlinked list until the fs is > * repaired. > */ > - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, > - XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); > + if (unlikely(mp->m_inotbt_nores)) { > + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, > + XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, > + &tp); > + } else { > + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp); > + } > if (error) { > if (error == -ENOSPC) { > xfs_warn_ratelimited(mp, > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h > index 84f7852..7f351f7 100644 > --- a/fs/xfs/xfs_mount.h > +++ b/fs/xfs/xfs_mount.h > @@ -140,6 +140,7 @@ typedef struct xfs_mount { > int m_fixedfsid[2]; /* unchanged for life of FS */ > uint m_dmevmask; /* DMI events for this FS */ > __uint64_t m_flags; /* global mount flags */ > + bool m_inotbt_nores; /* no per-AG finobt resv. */ > int m_ialloc_inos; /* inodes in inode allocation */ > int m_ialloc_blks; /* blocks in inode allocation */ > int m_ialloc_min_blks;/* min blocks in sparse inode > -- > 2.1.4 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 09:06:49AM -0500, Brian Foster wrote: > Darrick called out in the previous version that this requires traversal > of the entire tree at mount time. Do you have any test results on what > kind of worst case mount delays we could be looking at here? Even with pretty horribly fragmented file systems I've not seen major delays. But I don't have a setup with a lot of actual disks but mostly SSDs these days, so this might not statistically significant. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 03:49:37PM +0100, Christoph Hellwig wrote: > On Tue, Jan 24, 2017 at 09:06:49AM -0500, Brian Foster wrote: > > Darrick called out in the previous version that this requires traversal > > of the entire tree at mount time. Do you have any test results on what > > kind of worst case mount delays we could be looking at here? > > Even with pretty horribly fragmented file systems I've not seen > major delays. But I don't have a setup with a lot of actual disks > but mostly SSDs these days, so this might not statistically significant. Heh, I might have some systems with slow storage around. ;P It may take a little time to populate a large enough fs with inodes though.. Brian > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 03:02:47PM +0100, Christoph Hellwig wrote: > On Mon, Jan 23, 2017 at 11:28:47AM -0800, Darrick J. Wong wrote: > > I also want to warn more loudly about AGs that don't actually have > > enough space to handle the reservation. > > > > + xfs_warn(mp, > > > +"Per-AG reservation for finobt failed. File System may run out of space.\n"); > > > + mp->m_inotbt_nores = true; > > > + } > > Any suggestion for a better warning than this one? Except for this > bit I'm ready to resend the series. I'd just print out: "Per-AG reservation for AG ${agno} failed. Filesystem may run out of space." if any of the (now three) invocations of __xfs_ag_resv_init() returns ENOSPC, since the AG is more than 98% full. Some day, a sysadmin could then use the fsmap data to identify which inodes are using the most space in that AG and arrange to migrate/remove/etc the data to another AG. --D -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 11:19:18AM -0500, Brian Foster wrote: > On Tue, Jan 24, 2017 at 03:49:37PM +0100, Christoph Hellwig wrote: > > On Tue, Jan 24, 2017 at 09:06:49AM -0500, Brian Foster wrote: > > > Darrick called out in the previous version that this requires traversal > > > of the entire tree at mount time. Do you have any test results on what > > > kind of worst case mount delays we could be looking at here? > > > > Even with pretty horribly fragmented file systems I've not seen > > major delays. But I don't have a setup with a lot of actual disks > > but mostly SSDs these days, so this might not statistically significant. > > Heh, I might have some systems with slow storage around. ;P It may take > a little time to populate a large enough fs with inodes though.. <anecdote> So on this laptop, we have: $ df -i /storage/; df /storage/ Filesystem Inodes IUsed IFree IUse% Mounted on /dev/mapper/birch_disk-storage 466M 1.8M 464M 1% /storage Filesystem Size Used Avail Use% Mounted on /dev/mapper/birch_disk-storage 931G 525G 407G 57% /storage $ sudo xfs_io -c 'fsmap -v -n 1024' . | grep 'inode btree' | \ awk '{moo[$6] += $8}END{for (x=0;x<=255;x++) if (x in moo) print x, moo[x]}' 0 144 1 160 2 152 3 136 4 152 5 152 6 168 7 152 So on average we have ~160 sectors (or about 20 blocks) of inobt/finobt in each of 8 AGs. </anecdote> --D > > Brian > > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 08:48:37AM -0800, Darrick J. Wong wrote: > I'd just print out: > > "Per-AG reservation for AG ${agno} failed. Filesystem may run out of space." Sure, I can do that. > > if any of the (now three) invocations of __xfs_ag_resv_init() returns > ENOSPC, since the AG is more than 98% full. Some day, a sysadmin could > then use the fsmap data to identify which inodes are using the most > space in that AG and arrange to migrate/remove/etc the data to another > AG. But for all but the reflink one this can't happen as the reservations have been there since the beginning. And if they fail neverless we fail the mount. Or am I missing something here? -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 08:50:50PM +0100, Christoph Hellwig wrote: > On Tue, Jan 24, 2017 at 08:48:37AM -0800, Darrick J. Wong wrote: > > I'd just print out: > > > > "Per-AG reservation for AG ${agno} failed. Filesystem may run out of space." > > Sure, I can do that. > > > > > if any of the (now three) invocations of __xfs_ag_resv_init() returns > > ENOSPC, since the AG is more than 98% full. Some day, a sysadmin could > > then use the fsmap data to identify which inodes are using the most > > space in that AG and arrange to migrate/remove/etc the data to another > > AG. > > But for all but the reflink one this can't happen as the reservations > have been there since the beginning. And if they fail neverless we fail > the mount. Or am I missing something here? The code I wrote doesn't fail the mount (or remount) on ENOSPC so that it's still possible to delete files off the fs even when space is low. In theory it wasn't possible to run out, but I've been thinking that it's better to warn noisily just in case we /do/ run out of space later, since it's not entirely obvious when the fs goes offline because we tried to expand a btree and hit ENOSPC. (Also if people perform underhanded "upgrades" then they can hit this problem, but maybe that was just me. :P) --D -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index d346d42..4773c1e 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -39,6 +39,7 @@ #include "xfs_rmap_btree.h" #include "xfs_btree.h" #include "xfs_refcount_btree.h" +#include "xfs_ialloc_btree.h" /* * Per-AG Block Reservations @@ -223,6 +224,8 @@ int xfs_ag_resv_init( struct xfs_perag *pag) { + struct xfs_mount *mp = pag->pag_mount; + xfs_agnumber_t agno = pag->pag_agno; xfs_extlen_t ask; xfs_extlen_t used; int error = 0; @@ -231,23 +234,48 @@ xfs_ag_resv_init( if (pag->pag_meta_resv.ar_asked == 0) { ask = used = 0; - error = xfs_refcountbt_calc_reserves(pag->pag_mount, - pag->pag_agno, &ask, &used); + error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; - error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, - ask, used); + error = xfs_finobt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, + ask, used); + if (error) { + /* + * Because we didn't have per-AG reservations when the + * finobt feature was added we might not ne able to + * reservere all needed blocks. Warn and fall back to + * the old and potentially buggy code in that case, + * but ensure we do have the reservation for the + * refcountbt. + */ + ask = used = 0; + + error = xfs_refcountbt_calc_reserves(mp, agno, &ask, + &used); + if (error) + goto out; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, + ask, used); + if (error) + goto out; + + xfs_warn(mp, +"Per-AG reservation for finobt failed. File System may run out of space.\n"); + mp->m_inotbt_nores = true; + } } /* Create the AGFL metadata reservation */ if (pag->pag_agfl_resv.ar_asked == 0) { ask = used = 0; - error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, - &ask, &used); + error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; @@ -256,9 +284,16 @@ xfs_ag_resv_init( goto out; } +#ifdef DEBUG + /* need to read in the AGF for the ASSERT below to work */ + error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0); + if (error) + return error; + ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <= pag->pagf_freeblks + pag->pagf_flcount); +#endif out: return error; } diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 0bb8966..3f24725 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -168,5 +168,4 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); - #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 0fd086d..7c47188 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -82,11 +82,12 @@ xfs_finobt_set_root( } STATIC int -xfs_inobt_alloc_block( +__xfs_inobt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int *stat) + int *stat, + enum xfs_ag_resv_type resv) { xfs_alloc_arg_t args; /* block allocation args */ int error; /* error return value */ @@ -103,6 +104,7 @@ xfs_inobt_alloc_block( args.maxlen = 1; args.prod = 1; args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.resv = resv; error = xfs_alloc_vextent(&args); if (error) { @@ -123,6 +125,27 @@ xfs_inobt_alloc_block( } STATIC int +xfs_inobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE); +} + +STATIC int +xfs_finobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + return __xfs_inobt_alloc_block(cur, start, new, stat, + XFS_AG_RESV_METADATA); +} + +STATIC int xfs_inobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) @@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_finobt_set_root, - .alloc_block = xfs_inobt_alloc_block, + .alloc_block = xfs_finobt_alloc_block, .free_block = xfs_inobt_free_block, .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, @@ -480,3 +503,64 @@ xfs_inobt_rec_check_count( return 0; } #endif /* DEBUG */ + +static xfs_extlen_t +xfs_inobt_max_size( + struct xfs_mount *mp) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_inobt_mxr[0] == 0) + return 0; + + return xfs_btree_calc_size(mp, mp->m_inobt_mnr, + (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / + XFS_INODES_PER_CHUNK); +} + +static int +xfs_inobt_count_blocks( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_btnum_t btnum, + xfs_extlen_t *tree_blocks) +{ + struct xfs_buf *agbp; + struct xfs_btree_cur *cur; + int error; + + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + if (error) + return error; + + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum); + error = xfs_btree_count_blocks(cur, tree_blocks); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + + return error; +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +int +xfs_finobt_calc_reserves( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_extlen_t *ask, + xfs_extlen_t *used) +{ + xfs_extlen_t tree_len = 0; + int error; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return 0; + + error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len); + if (error) + return error; + + *ask += xfs_inobt_max_size(mp); + *used += tree_len; + return 0; +} diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index bd88453..aa81e2e 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, #define xfs_inobt_rec_check_count(mp, rec) 0 #endif /* DEBUG */ +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_extlen_t *ask, xfs_extlen_t *used); + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b955779..de32f0f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1792,22 +1792,23 @@ xfs_inactive_ifree( int error; /* - * The ifree transaction might need to allocate blocks for record - * insertion to the finobt. We don't want to fail here at ENOSPC, so - * allow ifree to dip into the reserved block pool if necessary. - * - * Freeing large sets of inodes generally means freeing inode chunks, - * directory and file data blocks, so this should be relatively safe. - * Only under severe circumstances should it be possible to free enough - * inodes to exhaust the reserve block pool via finobt expansion while - * at the same time not creating free space in the filesystem. + * We try to use a per-AG reservation for any block needed by the finobt + * tree, but as the finobt feature predates the per-AG reservation + * support a degraded file system might not have enough space for the + * reservation at mount time. In that case try to dip into the reserved + * pool and pray. * * Send a warning if the reservation does happen to fail, as the inode * now remains allocated and sits on the unlinked list until the fs is * repaired. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, - XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + if (unlikely(mp->m_inotbt_nores)) { + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, + &tp); + } else { + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp); + } if (error) { if (error == -ENOSPC) { xfs_warn_ratelimited(mp, diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 84f7852..7f351f7 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -140,6 +140,7 @@ typedef struct xfs_mount { int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ __uint64_t m_flags; /* global mount flags */ + bool m_inotbt_nores; /* no per-AG finobt resv. */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ int m_ialloc_min_blks;/* min blocks in sparse inode
Currently we try to rely on the global reserved block pool for block allocations for the free inode btree, but I have customer reports (fairly complex workload, need to find an easier reproducer) where that is not enough as the AG where we free an inode that requires a new finobt block is entirely full. This causes us to cancel a dirty transaction and thus a file system shutdown. I think the right way to guard against this is to treat the finot the same way as the refcount btree and have a per-AG reservations for the possible worst case size of it, and the patch below implements that. Signed-off-by: Christoph Hellwig <hch@lst.de> --- Changes since V1: - reduce the size of the reservation - warn and fall back to the old somewhat buggy code if we can't get the reservation at mount time - read the AGF before asserting based on values in it --- fs/xfs/libxfs/xfs_ag_resv.c | 47 ++++++++++++++++++--- fs/xfs/libxfs/xfs_ialloc.h | 1 - fs/xfs/libxfs/xfs_ialloc_btree.c | 90 ++++++++++++++++++++++++++++++++++++++-- fs/xfs/libxfs/xfs_ialloc_btree.h | 3 ++ fs/xfs/xfs_inode.c | 23 +++++----- fs/xfs/xfs_mount.h | 1 + 6 files changed, 144 insertions(+), 21 deletions(-)