diff mbox

[v2] xfs: use per-AG reservations for the finobt

Message ID 1485194742-23185-1-git-send-email-hch@lst.de (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Christoph Hellwig Jan. 23, 2017, 6:05 p.m. UTC
Currently we try to rely on the global reserved block pool for block
allocations for the free inode btree, but I have customer reports
(fairly complex workload, need to find an easier reproducer) where that
is not enough as the AG where we free an inode that requires a new
finobt block is entirely full.  This causes us to cancel a dirty
transaction and thus a file system shutdown.

I think the right way to guard against this is to treat the finot the same
way as the refcount btree and have a per-AG reservations for the possible
worst case size of it, and the patch below implements that.

Signed-off-by: Christoph Hellwig <hch@lst.de>

---

Changes since V1:
 - reduce the size of the reservation
 - warn and fall back to the old somewhat buggy code if we can't
   get the reservation at mount time
 - read the AGF before asserting based on values in it
---
 fs/xfs/libxfs/xfs_ag_resv.c      | 47 ++++++++++++++++++---
 fs/xfs/libxfs/xfs_ialloc.h       |  1 -
 fs/xfs/libxfs/xfs_ialloc_btree.c | 90 ++++++++++++++++++++++++++++++++++++++--
 fs/xfs/libxfs/xfs_ialloc_btree.h |  3 ++
 fs/xfs/xfs_inode.c               | 23 +++++-----
 fs/xfs/xfs_mount.h               |  1 +
 6 files changed, 144 insertions(+), 21 deletions(-)

Comments

Darrick J. Wong Jan. 23, 2017, 7:28 p.m. UTC | #1
On Mon, Jan 23, 2017 at 07:05:42PM +0100, Christoph Hellwig wrote:
> Currently we try to rely on the global reserved block pool for block
> allocations for the free inode btree, but I have customer reports
> (fairly complex workload, need to find an easier reproducer) where that
> is not enough as the AG where we free an inode that requires a new
> finobt block is entirely full.  This causes us to cancel a dirty
> transaction and thus a file system shutdown.
> 
> I think the right way to guard against this is to treat the finot the same
> way as the refcount btree and have a per-AG reservations for the possible
> worst case size of it, and the patch below implements that.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> ---
> 
> Changes since V1:
>  - reduce the size of the reservation
>  - warn and fall back to the old somewhat buggy code if we can't
>    get the reservation at mount time
>  - read the AGF before asserting based on values in it
> ---
>  fs/xfs/libxfs/xfs_ag_resv.c      | 47 ++++++++++++++++++---
>  fs/xfs/libxfs/xfs_ialloc.h       |  1 -
>  fs/xfs/libxfs/xfs_ialloc_btree.c | 90 ++++++++++++++++++++++++++++++++++++++--
>  fs/xfs/libxfs/xfs_ialloc_btree.h |  3 ++
>  fs/xfs/xfs_inode.c               | 23 +++++-----
>  fs/xfs/xfs_mount.h               |  1 +
>  6 files changed, 144 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
> index d346d42..4773c1e 100644
> --- a/fs/xfs/libxfs/xfs_ag_resv.c
> +++ b/fs/xfs/libxfs/xfs_ag_resv.c
> @@ -39,6 +39,7 @@
>  #include "xfs_rmap_btree.h"
>  #include "xfs_btree.h"
>  #include "xfs_refcount_btree.h"
> +#include "xfs_ialloc_btree.h"
>  
>  /*
>   * Per-AG Block Reservations
> @@ -223,6 +224,8 @@ int
>  xfs_ag_resv_init(
>  	struct xfs_perag		*pag)
>  {
> +	struct xfs_mount		*mp = pag->pag_mount;
> +	xfs_agnumber_t			agno = pag->pag_agno;
>  	xfs_extlen_t			ask;
>  	xfs_extlen_t			used;
>  	int				error = 0;
> @@ -231,23 +234,48 @@ xfs_ag_resv_init(
>  	if (pag->pag_meta_resv.ar_asked == 0) {
>  		ask = used = 0;
>  
> -		error = xfs_refcountbt_calc_reserves(pag->pag_mount,
> -				pag->pag_agno, &ask, &used);
> +		error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used);
>  		if (error)
>  			goto out;
>  
> -		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
> -				ask, used);
> +		error = xfs_finobt_calc_reserves(mp, agno, &ask, &used);
>  		if (error)
>  			goto out;
> +
> +		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
> +				ask, used);
> +		if (error) {
> +			/*
> +			 * Because we didn't have per-AG reservations when the
> +			 * finobt feature was added we might not ne able to
> +			 * reservere all needed blocks.  Warn and fall back to
> +			 * the old and potentially buggy code in that case,
> +			 * but ensure we do have the reservation for the
> +			 * refcountbt.
> +			 */
> +			ask = used = 0;
> +
> +			error = xfs_refcountbt_calc_reserves(mp, agno, &ask,
> +					&used);
> +			if (error)
> +				goto out;
> +
> +			error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
> +					ask, used);

But... _init decreases m_ag_max_usable prior to calling _mod_fdblocks,
so double-calling _init decreases m_ag_max_usable twice.

OTOH if the _mod_fdblocks call fails then m_fdblocks hasn't been
updated, which means that we can't call _ag_resv_free to reset
m_ag_max_usable prior to retrying _init.

I think we could solve this by increasing m_ag_max_usable by ar_asked at
the very top of __xfs_ag_resv_init.  The perag structures are kzalloc'd,
so this won't have any effect on a freshly mounting filesystem.

I also want to warn more loudly about AGs that don't actually have
enough space to handle the reservation.

--D

> +			if (error)
> +				goto out;
> +
> +			xfs_warn(mp,
> +"Per-AG reservation for finobt failed.  File System may run out of space.\n");
> +			mp->m_inotbt_nores = true;
> +		}
>  	}
>  
>  	/* Create the AGFL metadata reservation */
>  	if (pag->pag_agfl_resv.ar_asked == 0) {
>  		ask = used = 0;
>  
> -		error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
> -				&ask, &used);
> +		error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used);
>  		if (error)
>  			goto out;
>  
> @@ -256,9 +284,16 @@ xfs_ag_resv_init(
>  			goto out;
>  	}
>  
> +#ifdef DEBUG
> +	/* need to read in the AGF for the ASSERT below to work */
> +	error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0);
> +	if (error)
> +		return error;
> +
>  	ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
>  	       xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <=
>  	       pag->pagf_freeblks + pag->pagf_flcount);
> +#endif
>  out:
>  	return error;
>  }
> diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
> index 0bb8966..3f24725 100644
> --- a/fs/xfs/libxfs/xfs_ialloc.h
> +++ b/fs/xfs/libxfs/xfs_ialloc.h
> @@ -168,5 +168,4 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
>  int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
>  		xfs_agnumber_t agno, struct xfs_buf **bpp);
>  
> -
>  #endif	/* __XFS_IALLOC_H__ */
> diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
> index 0fd086d..7c47188 100644
> --- a/fs/xfs/libxfs/xfs_ialloc_btree.c
> +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
> @@ -82,11 +82,12 @@ xfs_finobt_set_root(
>  }
>  
>  STATIC int
> -xfs_inobt_alloc_block(
> +__xfs_inobt_alloc_block(
>  	struct xfs_btree_cur	*cur,
>  	union xfs_btree_ptr	*start,
>  	union xfs_btree_ptr	*new,
> -	int			*stat)
> +	int			*stat,
> +	enum xfs_ag_resv_type	resv)
>  {
>  	xfs_alloc_arg_t		args;		/* block allocation args */
>  	int			error;		/* error return value */
> @@ -103,6 +104,7 @@ xfs_inobt_alloc_block(
>  	args.maxlen = 1;
>  	args.prod = 1;
>  	args.type = XFS_ALLOCTYPE_NEAR_BNO;
> +	args.resv = resv;
>  
>  	error = xfs_alloc_vextent(&args);
>  	if (error) {
> @@ -123,6 +125,27 @@ xfs_inobt_alloc_block(
>  }
>  
>  STATIC int
> +xfs_inobt_alloc_block(
> +	struct xfs_btree_cur	*cur,
> +	union xfs_btree_ptr	*start,
> +	union xfs_btree_ptr	*new,
> +	int			*stat)
> +{
> +	return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
> +}
> +
> +STATIC int
> +xfs_finobt_alloc_block(
> +	struct xfs_btree_cur	*cur,
> +	union xfs_btree_ptr	*start,
> +	union xfs_btree_ptr	*new,
> +	int			*stat)
> +{
> +	return __xfs_inobt_alloc_block(cur, start, new, stat,
> +			XFS_AG_RESV_METADATA);
> +}
> +
> +STATIC int
>  xfs_inobt_free_block(
>  	struct xfs_btree_cur	*cur,
>  	struct xfs_buf		*bp)
> @@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
>  
>  	.dup_cursor		= xfs_inobt_dup_cursor,
>  	.set_root		= xfs_finobt_set_root,
> -	.alloc_block		= xfs_inobt_alloc_block,
> +	.alloc_block		= xfs_finobt_alloc_block,
>  	.free_block		= xfs_inobt_free_block,
>  	.get_minrecs		= xfs_inobt_get_minrecs,
>  	.get_maxrecs		= xfs_inobt_get_maxrecs,
> @@ -480,3 +503,64 @@ xfs_inobt_rec_check_count(
>  	return 0;
>  }
>  #endif	/* DEBUG */
> +
> +static xfs_extlen_t
> +xfs_inobt_max_size(
> +	struct xfs_mount	*mp)
> +{
> +	/* Bail out if we're uninitialized, which can happen in mkfs. */
> +	if (mp->m_inobt_mxr[0] == 0)
> +		return 0;
> +
> +	return xfs_btree_calc_size(mp, mp->m_inobt_mnr,
> +		(uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
> +				XFS_INODES_PER_CHUNK);
> +}
> +
> +static int
> +xfs_inobt_count_blocks(
> +	struct xfs_mount	*mp,
> +	xfs_agnumber_t		agno,
> +	xfs_btnum_t		btnum,
> +	xfs_extlen_t		*tree_blocks)
> +{
> +	struct xfs_buf		*agbp;
> +	struct xfs_btree_cur	*cur;
> +	int			error;
> +
> +	error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
> +	if (error)
> +		return error;
> +
> +	cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum);
> +	error = xfs_btree_count_blocks(cur, tree_blocks);
> +	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
> +	xfs_buf_relse(agbp);
> +
> +	return error;
> +}
> +
> +/*
> + * Figure out how many blocks to reserve and how many are used by this btree.
> + */
> +int
> +xfs_finobt_calc_reserves(
> +	struct xfs_mount	*mp,
> +	xfs_agnumber_t		agno,
> +	xfs_extlen_t		*ask,
> +	xfs_extlen_t		*used)
> +{
> +	xfs_extlen_t		tree_len = 0;
> +	int			error;
> +
> +	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
> +		return 0;
> +
> +	error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len);
> +	if (error)
> +		return error;
> +
> +	*ask += xfs_inobt_max_size(mp);
> +	*used += tree_len;
> +	return 0;
> +}
> diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
> index bd88453..aa81e2e 100644
> --- a/fs/xfs/libxfs/xfs_ialloc_btree.h
> +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
> @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
>  #define xfs_inobt_rec_check_count(mp, rec)	0
>  #endif	/* DEBUG */
>  
> +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
> +		xfs_extlen_t *ask, xfs_extlen_t *used);
> +
>  #endif	/* __XFS_IALLOC_BTREE_H__ */
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index b955779..de32f0f 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1792,22 +1792,23 @@ xfs_inactive_ifree(
>  	int			error;
>  
>  	/*
> -	 * The ifree transaction might need to allocate blocks for record
> -	 * insertion to the finobt. We don't want to fail here at ENOSPC, so
> -	 * allow ifree to dip into the reserved block pool if necessary.
> -	 *
> -	 * Freeing large sets of inodes generally means freeing inode chunks,
> -	 * directory and file data blocks, so this should be relatively safe.
> -	 * Only under severe circumstances should it be possible to free enough
> -	 * inodes to exhaust the reserve block pool via finobt expansion while
> -	 * at the same time not creating free space in the filesystem.
> +	 * We try to use a per-AG reservation for any block needed by the finobt
> +	 * tree, but as the finobt feature predates the per-AG reservation
> +	 * support a degraded file system might not have enough space for the
> +	 * reservation at mount time.  In that case try to dip into the reserved
> +	 * pool and pray.
>  	 *
>  	 * Send a warning if the reservation does happen to fail, as the inode
>  	 * now remains allocated and sits on the unlinked list until the fs is
>  	 * repaired.
>  	 */
> -	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
> -			XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
> +	if (unlikely(mp->m_inotbt_nores)) {
> +		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
> +				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
> +				&tp);
> +	} else {
> +		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
> +	}
>  	if (error) {
>  		if (error == -ENOSPC) {
>  			xfs_warn_ratelimited(mp,
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 84f7852..7f351f7 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -140,6 +140,7 @@ typedef struct xfs_mount {
>  	int			m_fixedfsid[2];	/* unchanged for life of FS */
>  	uint			m_dmevmask;	/* DMI events for this FS */
>  	__uint64_t		m_flags;	/* global mount flags */
> +	bool			m_inotbt_nores; /* no per-AG finobt resv. */
>  	int			m_ialloc_inos;	/* inodes in inode allocation */
>  	int			m_ialloc_blks;	/* blocks in inode allocation */
>  	int			m_ialloc_min_blks;/* min blocks in sparse inode
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Jan. 24, 2017, 2:02 p.m. UTC | #2
On Mon, Jan 23, 2017 at 11:28:47AM -0800, Darrick J. Wong wrote:
> I also want to warn more loudly about AGs that don't actually have
> enough space to handle the reservation.

> > +			xfs_warn(mp,
> > +"Per-AG reservation for finobt failed.  File System may run out of space.\n");
> > +			mp->m_inotbt_nores = true;
> > +		}

Any suggestion for a better warning than this one?  Except for this
bit I'm ready to resend the series.
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Jan. 24, 2017, 2:06 p.m. UTC | #3
On Mon, Jan 23, 2017 at 07:05:42PM +0100, Christoph Hellwig wrote:
> Currently we try to rely on the global reserved block pool for block
> allocations for the free inode btree, but I have customer reports
> (fairly complex workload, need to find an easier reproducer) where that
> is not enough as the AG where we free an inode that requires a new
> finobt block is entirely full.  This causes us to cancel a dirty
> transaction and thus a file system shutdown.
> 
> I think the right way to guard against this is to treat the finot the same
> way as the refcount btree and have a per-AG reservations for the possible
> worst case size of it, and the patch below implements that.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> ---
> 
> Changes since V1:
>  - reduce the size of the reservation
>  - warn and fall back to the old somewhat buggy code if we can't
>    get the reservation at mount time
>  - read the AGF before asserting based on values in it
> ---
>  fs/xfs/libxfs/xfs_ag_resv.c      | 47 ++++++++++++++++++---
>  fs/xfs/libxfs/xfs_ialloc.h       |  1 -
>  fs/xfs/libxfs/xfs_ialloc_btree.c | 90 ++++++++++++++++++++++++++++++++++++++--
>  fs/xfs/libxfs/xfs_ialloc_btree.h |  3 ++
>  fs/xfs/xfs_inode.c               | 23 +++++-----
>  fs/xfs/xfs_mount.h               |  1 +
>  6 files changed, 144 insertions(+), 21 deletions(-)
> 
...
> diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
> index 0bb8966..3f24725 100644
> --- a/fs/xfs/libxfs/xfs_ialloc.h
> +++ b/fs/xfs/libxfs/xfs_ialloc.h
...
> @@ -480,3 +503,64 @@ xfs_inobt_rec_check_count(
...
> +static int
> +xfs_inobt_count_blocks(
> +	struct xfs_mount	*mp,
> +	xfs_agnumber_t		agno,
> +	xfs_btnum_t		btnum,
> +	xfs_extlen_t		*tree_blocks)
> +{
> +	struct xfs_buf		*agbp;
> +	struct xfs_btree_cur	*cur;
> +	int			error;
> +
> +	error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
> +	if (error)
> +		return error;
> +
> +	cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum);
> +	error = xfs_btree_count_blocks(cur, tree_blocks);
> +	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
> +	xfs_buf_relse(agbp);
> +

Darrick called out in the previous version that this requires traversal
of the entire tree at mount time. Do you have any test results on what
kind of worst case mount delays we could be looking at here?

As mentioned in the previous version, I don't think we can just assume
that the finobt is always going to be small. The purpose of the feature
is to facilitate constant time lookups for free inode records in
filesystems with extremely large inode counts. The number of free inodes
at any particular time is probably just a question of allocation
patterns. We can hope that the inode frees are not fragmented across
records, but even then we still have to take the ikeep mount option into
consideration (which is probably an easy way to test this, btw).

Brian

> +	return error;
> +}
> +
> +/*
> + * Figure out how many blocks to reserve and how many are used by this btree.
> + */
> +int
> +xfs_finobt_calc_reserves(
> +	struct xfs_mount	*mp,
> +	xfs_agnumber_t		agno,
> +	xfs_extlen_t		*ask,
> +	xfs_extlen_t		*used)
> +{
> +	xfs_extlen_t		tree_len = 0;
> +	int			error;
> +
> +	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
> +		return 0;
> +
> +	error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len);
> +	if (error)
> +		return error;
> +
> +	*ask += xfs_inobt_max_size(mp);
> +	*used += tree_len;
> +	return 0;
> +}
> diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
> index bd88453..aa81e2e 100644
> --- a/fs/xfs/libxfs/xfs_ialloc_btree.h
> +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
> @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
>  #define xfs_inobt_rec_check_count(mp, rec)	0
>  #endif	/* DEBUG */
>  
> +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
> +		xfs_extlen_t *ask, xfs_extlen_t *used);
> +
>  #endif	/* __XFS_IALLOC_BTREE_H__ */
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index b955779..de32f0f 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1792,22 +1792,23 @@ xfs_inactive_ifree(
>  	int			error;
>  
>  	/*
> -	 * The ifree transaction might need to allocate blocks for record
> -	 * insertion to the finobt. We don't want to fail here at ENOSPC, so
> -	 * allow ifree to dip into the reserved block pool if necessary.
> -	 *
> -	 * Freeing large sets of inodes generally means freeing inode chunks,
> -	 * directory and file data blocks, so this should be relatively safe.
> -	 * Only under severe circumstances should it be possible to free enough
> -	 * inodes to exhaust the reserve block pool via finobt expansion while
> -	 * at the same time not creating free space in the filesystem.
> +	 * We try to use a per-AG reservation for any block needed by the finobt
> +	 * tree, but as the finobt feature predates the per-AG reservation
> +	 * support a degraded file system might not have enough space for the
> +	 * reservation at mount time.  In that case try to dip into the reserved
> +	 * pool and pray.
>  	 *
>  	 * Send a warning if the reservation does happen to fail, as the inode
>  	 * now remains allocated and sits on the unlinked list until the fs is
>  	 * repaired.
>  	 */
> -	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
> -			XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
> +	if (unlikely(mp->m_inotbt_nores)) {
> +		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
> +				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
> +				&tp);
> +	} else {
> +		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
> +	}
>  	if (error) {
>  		if (error == -ENOSPC) {
>  			xfs_warn_ratelimited(mp,
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 84f7852..7f351f7 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -140,6 +140,7 @@ typedef struct xfs_mount {
>  	int			m_fixedfsid[2];	/* unchanged for life of FS */
>  	uint			m_dmevmask;	/* DMI events for this FS */
>  	__uint64_t		m_flags;	/* global mount flags */
> +	bool			m_inotbt_nores; /* no per-AG finobt resv. */
>  	int			m_ialloc_inos;	/* inodes in inode allocation */
>  	int			m_ialloc_blks;	/* blocks in inode allocation */
>  	int			m_ialloc_min_blks;/* min blocks in sparse inode
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Jan. 24, 2017, 2:49 p.m. UTC | #4
On Tue, Jan 24, 2017 at 09:06:49AM -0500, Brian Foster wrote:
> Darrick called out in the previous version that this requires traversal
> of the entire tree at mount time. Do you have any test results on what
> kind of worst case mount delays we could be looking at here?

Even with pretty horribly fragmented file systems I've not seen
major delays.  But I don't have a setup with a lot of actual disks
but mostly SSDs these days, so this might not statistically significant.
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Jan. 24, 2017, 4:19 p.m. UTC | #5
On Tue, Jan 24, 2017 at 03:49:37PM +0100, Christoph Hellwig wrote:
> On Tue, Jan 24, 2017 at 09:06:49AM -0500, Brian Foster wrote:
> > Darrick called out in the previous version that this requires traversal
> > of the entire tree at mount time. Do you have any test results on what
> > kind of worst case mount delays we could be looking at here?
> 
> Even with pretty horribly fragmented file systems I've not seen
> major delays.  But I don't have a setup with a lot of actual disks
> but mostly SSDs these days, so this might not statistically significant.

Heh, I might have some systems with slow storage around. ;P It may take
a little time to populate a large enough fs with inodes though..

Brian

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Jan. 24, 2017, 4:48 p.m. UTC | #6
On Tue, Jan 24, 2017 at 03:02:47PM +0100, Christoph Hellwig wrote:
> On Mon, Jan 23, 2017 at 11:28:47AM -0800, Darrick J. Wong wrote:
> > I also want to warn more loudly about AGs that don't actually have
> > enough space to handle the reservation.
> 
> > > +			xfs_warn(mp,
> > > +"Per-AG reservation for finobt failed.  File System may run out of space.\n");
> > > +			mp->m_inotbt_nores = true;
> > > +		}
> 
> Any suggestion for a better warning than this one?  Except for this
> bit I'm ready to resend the series.

I'd just print out:

"Per-AG reservation for AG ${agno} failed.  Filesystem may run out of space."

if any of the (now three) invocations of __xfs_ag_resv_init() returns
ENOSPC, since the AG is more than 98% full.  Some day, a sysadmin could
then use the fsmap data to identify which inodes are using the most
space in that AG and arrange to migrate/remove/etc the data to another
AG.

--D
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Jan. 24, 2017, 6:27 p.m. UTC | #7
On Tue, Jan 24, 2017 at 11:19:18AM -0500, Brian Foster wrote:
> On Tue, Jan 24, 2017 at 03:49:37PM +0100, Christoph Hellwig wrote:
> > On Tue, Jan 24, 2017 at 09:06:49AM -0500, Brian Foster wrote:
> > > Darrick called out in the previous version that this requires traversal
> > > of the entire tree at mount time. Do you have any test results on what
> > > kind of worst case mount delays we could be looking at here?
> > 
> > Even with pretty horribly fragmented file systems I've not seen
> > major delays.  But I don't have a setup with a lot of actual disks
> > but mostly SSDs these days, so this might not statistically significant.
> 
> Heh, I might have some systems with slow storage around. ;P It may take
> a little time to populate a large enough fs with inodes though..

<anecdote>

So on this laptop, we have:

$ df -i /storage/; df /storage/
Filesystem                     Inodes IUsed IFree IUse% Mounted on
/dev/mapper/birch_disk-storage   466M  1.8M  464M    1% /storage
Filesystem                      Size  Used Avail Use% Mounted on
/dev/mapper/birch_disk-storage  931G  525G  407G  57% /storage

$ sudo xfs_io -c 'fsmap -v -n 1024' . | grep 'inode btree' | \
	awk '{moo[$6] += $8}END{for (x=0;x<=255;x++) if (x in moo) print x, moo[x]}'
0	144
1	160
2	152
3	136
4	152
5	152
6	168
7	152

So on average we have ~160 sectors (or about 20 blocks) of inobt/finobt
in each of 8 AGs.

</anecdote>

--D

> 
> Brian
> 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Jan. 24, 2017, 7:50 p.m. UTC | #8
On Tue, Jan 24, 2017 at 08:48:37AM -0800, Darrick J. Wong wrote:
> I'd just print out:
> 
> "Per-AG reservation for AG ${agno} failed.  Filesystem may run out of space."

Sure, I can do that.

> 
> if any of the (now three) invocations of __xfs_ag_resv_init() returns
> ENOSPC, since the AG is more than 98% full.  Some day, a sysadmin could
> then use the fsmap data to identify which inodes are using the most
> space in that AG and arrange to migrate/remove/etc the data to another
> AG.

But for all but the reflink one this can't happen as the reservations
have been there since the beginning.  And if they fail neverless we fail
the mount.  Or am I missing something here?
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Jan. 24, 2017, 8:05 p.m. UTC | #9
On Tue, Jan 24, 2017 at 08:50:50PM +0100, Christoph Hellwig wrote:
> On Tue, Jan 24, 2017 at 08:48:37AM -0800, Darrick J. Wong wrote:
> > I'd just print out:
> > 
> > "Per-AG reservation for AG ${agno} failed.  Filesystem may run out of space."
> 
> Sure, I can do that.
> 
> > 
> > if any of the (now three) invocations of __xfs_ag_resv_init() returns
> > ENOSPC, since the AG is more than 98% full.  Some day, a sysadmin could
> > then use the fsmap data to identify which inodes are using the most
> > space in that AG and arrange to migrate/remove/etc the data to another
> > AG.
> 
> But for all but the reflink one this can't happen as the reservations
> have been there since the beginning.  And if they fail neverless we fail
> the mount.  Or am I missing something here?

The code I wrote doesn't fail the mount (or remount) on ENOSPC so that
it's still possible to delete files off the fs even when space is low.

In theory it wasn't possible to run out, but I've been thinking that
it's better to warn noisily just in case we /do/ run out of space later,
since it's not entirely obvious when the fs goes offline because we
tried to expand a btree and hit ENOSPC.

(Also if people perform underhanded "upgrades" then they can hit this
problem, but maybe that was just me. :P)

--D
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index d346d42..4773c1e 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -39,6 +39,7 @@ 
 #include "xfs_rmap_btree.h"
 #include "xfs_btree.h"
 #include "xfs_refcount_btree.h"
+#include "xfs_ialloc_btree.h"
 
 /*
  * Per-AG Block Reservations
@@ -223,6 +224,8 @@  int
 xfs_ag_resv_init(
 	struct xfs_perag		*pag)
 {
+	struct xfs_mount		*mp = pag->pag_mount;
+	xfs_agnumber_t			agno = pag->pag_agno;
 	xfs_extlen_t			ask;
 	xfs_extlen_t			used;
 	int				error = 0;
@@ -231,23 +234,48 @@  xfs_ag_resv_init(
 	if (pag->pag_meta_resv.ar_asked == 0) {
 		ask = used = 0;
 
-		error = xfs_refcountbt_calc_reserves(pag->pag_mount,
-				pag->pag_agno, &ask, &used);
+		error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used);
 		if (error)
 			goto out;
 
-		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
-				ask, used);
+		error = xfs_finobt_calc_reserves(mp, agno, &ask, &used);
 		if (error)
 			goto out;
+
+		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+				ask, used);
+		if (error) {
+			/*
+			 * Because we didn't have per-AG reservations when the
+			 * finobt feature was added we might not ne able to
+			 * reservere all needed blocks.  Warn and fall back to
+			 * the old and potentially buggy code in that case,
+			 * but ensure we do have the reservation for the
+			 * refcountbt.
+			 */
+			ask = used = 0;
+
+			error = xfs_refcountbt_calc_reserves(mp, agno, &ask,
+					&used);
+			if (error)
+				goto out;
+
+			error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+					ask, used);
+			if (error)
+				goto out;
+
+			xfs_warn(mp,
+"Per-AG reservation for finobt failed.  File System may run out of space.\n");
+			mp->m_inotbt_nores = true;
+		}
 	}
 
 	/* Create the AGFL metadata reservation */
 	if (pag->pag_agfl_resv.ar_asked == 0) {
 		ask = used = 0;
 
-		error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
-				&ask, &used);
+		error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used);
 		if (error)
 			goto out;
 
@@ -256,9 +284,16 @@  xfs_ag_resv_init(
 			goto out;
 	}
 
+#ifdef DEBUG
+	/* need to read in the AGF for the ASSERT below to work */
+	error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0);
+	if (error)
+		return error;
+
 	ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
 	       xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <=
 	       pag->pagf_freeblks + pag->pagf_flcount);
+#endif
 out:
 	return error;
 }
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 0bb8966..3f24725 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -168,5 +168,4 @@  int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
 int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
 		xfs_agnumber_t agno, struct xfs_buf **bpp);
 
-
 #endif	/* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 0fd086d..7c47188 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -82,11 +82,12 @@  xfs_finobt_set_root(
 }
 
 STATIC int
-xfs_inobt_alloc_block(
+__xfs_inobt_alloc_block(
 	struct xfs_btree_cur	*cur,
 	union xfs_btree_ptr	*start,
 	union xfs_btree_ptr	*new,
-	int			*stat)
+	int			*stat,
+	enum xfs_ag_resv_type	resv)
 {
 	xfs_alloc_arg_t		args;		/* block allocation args */
 	int			error;		/* error return value */
@@ -103,6 +104,7 @@  xfs_inobt_alloc_block(
 	args.maxlen = 1;
 	args.prod = 1;
 	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	args.resv = resv;
 
 	error = xfs_alloc_vextent(&args);
 	if (error) {
@@ -123,6 +125,27 @@  xfs_inobt_alloc_block(
 }
 
 STATIC int
+xfs_inobt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
+}
+
+STATIC int
+xfs_finobt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	return __xfs_inobt_alloc_block(cur, start, new, stat,
+			XFS_AG_RESV_METADATA);
+}
+
+STATIC int
 xfs_inobt_free_block(
 	struct xfs_btree_cur	*cur,
 	struct xfs_buf		*bp)
@@ -328,7 +351,7 @@  static const struct xfs_btree_ops xfs_finobt_ops = {
 
 	.dup_cursor		= xfs_inobt_dup_cursor,
 	.set_root		= xfs_finobt_set_root,
-	.alloc_block		= xfs_inobt_alloc_block,
+	.alloc_block		= xfs_finobt_alloc_block,
 	.free_block		= xfs_inobt_free_block,
 	.get_minrecs		= xfs_inobt_get_minrecs,
 	.get_maxrecs		= xfs_inobt_get_maxrecs,
@@ -480,3 +503,64 @@  xfs_inobt_rec_check_count(
 	return 0;
 }
 #endif	/* DEBUG */
+
+static xfs_extlen_t
+xfs_inobt_max_size(
+	struct xfs_mount	*mp)
+{
+	/* Bail out if we're uninitialized, which can happen in mkfs. */
+	if (mp->m_inobt_mxr[0] == 0)
+		return 0;
+
+	return xfs_btree_calc_size(mp, mp->m_inobt_mnr,
+		(uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
+				XFS_INODES_PER_CHUNK);
+}
+
+static int
+xfs_inobt_count_blocks(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_btnum_t		btnum,
+	xfs_extlen_t		*tree_blocks)
+{
+	struct xfs_buf		*agbp;
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
+	if (error)
+		return error;
+
+	cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum);
+	error = xfs_btree_count_blocks(cur, tree_blocks);
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	xfs_buf_relse(agbp);
+
+	return error;
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_finobt_calc_reserves(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_extlen_t		*ask,
+	xfs_extlen_t		*used)
+{
+	xfs_extlen_t		tree_len = 0;
+	int			error;
+
+	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+		return 0;
+
+	error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len);
+	if (error)
+		return error;
+
+	*ask += xfs_inobt_max_size(mp);
+	*used += tree_len;
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index bd88453..aa81e2e 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -72,4 +72,7 @@  int xfs_inobt_rec_check_count(struct xfs_mount *,
 #define xfs_inobt_rec_check_count(mp, rec)	0
 #endif	/* DEBUG */
 
+int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
+		xfs_extlen_t *ask, xfs_extlen_t *used);
+
 #endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b955779..de32f0f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1792,22 +1792,23 @@  xfs_inactive_ifree(
 	int			error;
 
 	/*
-	 * The ifree transaction might need to allocate blocks for record
-	 * insertion to the finobt. We don't want to fail here at ENOSPC, so
-	 * allow ifree to dip into the reserved block pool if necessary.
-	 *
-	 * Freeing large sets of inodes generally means freeing inode chunks,
-	 * directory and file data blocks, so this should be relatively safe.
-	 * Only under severe circumstances should it be possible to free enough
-	 * inodes to exhaust the reserve block pool via finobt expansion while
-	 * at the same time not creating free space in the filesystem.
+	 * We try to use a per-AG reservation for any block needed by the finobt
+	 * tree, but as the finobt feature predates the per-AG reservation
+	 * support a degraded file system might not have enough space for the
+	 * reservation at mount time.  In that case try to dip into the reserved
+	 * pool and pray.
 	 *
 	 * Send a warning if the reservation does happen to fail, as the inode
 	 * now remains allocated and sits on the unlinked list until the fs is
 	 * repaired.
 	 */
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
-			XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+	if (unlikely(mp->m_inotbt_nores)) {
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
+				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
+				&tp);
+	} else {
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
+	}
 	if (error) {
 		if (error == -ENOSPC) {
 			xfs_warn_ratelimited(mp,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 84f7852..7f351f7 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -140,6 +140,7 @@  typedef struct xfs_mount {
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_dmevmask;	/* DMI events for this FS */
 	__uint64_t		m_flags;	/* global mount flags */
+	bool			m_inotbt_nores; /* no per-AG finobt resv. */
 	int			m_ialloc_inos;	/* inodes in inode allocation */
 	int			m_ialloc_blks;	/* blocks in inode allocation */
 	int			m_ialloc_min_blks;/* min blocks in sparse inode