
[6/8] btrfs: loop in inode_rsv_refill

Message ID 20181203152459.21630-7-josef@toxicpanda.com (mailing list archive)
State New, archived
Series Enospc cleanups and fixes

Commit Message

Josef Bacik Dec. 3, 2018, 3:24 p.m. UTC
With severe fragmentation we can end up with our inode rsv size being
huge during writeout, which would cause us to need to make very large
metadata reservations.  However we may not actually need that much once
writeout is complete.  So instead try to make our reservation, and if we
couldn't make it, re-calculate our new reservation size and try again.
If our reservation size doesn't change between tries then we know we are
actually out of space and can error out.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 58 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 43 insertions(+), 15 deletions(-)
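
In outline, the change turns the single reservation attempt into a loop:
recompute how far the rsv is short, try to reserve that amount, and stop
once either the reservation succeeds or the recomputed shortfall stops
shrinking.  Below is a minimal standalone model of that loop (hypothetical
userspace C; shortfall() and try_reserve() are stand-ins for the patch's
__get_refill_bytes() and reserve_metadata_bytes(), not kernel APIs):

#include <stdint.h>
#include <stdio.h>

struct rsv {
	uint64_t size;		/* bytes the rsv currently wants */
	uint64_t reserved;	/* bytes already reserved */
};

/* Stand-in for __get_refill_bytes(): how far short is the rsv? */
static uint64_t shortfall(const struct rsv *r)
{
	return r->size > r->reserved ? r->size - r->reserved : 0;
}

/*
 * Stand-in for reserve_metadata_bytes() with FLUSH_ALL: a too-large
 * request fails, but the flushing it triggers completes writeout and
 * shrinks the rsv size, so the next recomputed request is smaller.
 */
static int try_reserve(struct rsv *r, uint64_t bytes)
{
	if (bytes > 64) {	/* pretend only 64 bytes are free */
		r->size /= 2;	/* "flushing" shrank the rsv size */
		return -1;	/* ENOSPC */
	}
	r->reserved += bytes;
	return 0;
}

int main(void)
{
	struct rsv r = { .size = 256, .reserved = 0 };
	uint64_t bytes = shortfall(&r), last = 0;
	int ret;

	do {
		ret = try_reserve(&r, bytes);
		printf("tried %3llu -> %s\n", (unsigned long long)bytes,
		       ret ? "failed, recompute and retry" : "reserved");
		if (ret) {
			last = bytes;
			bytes = shortfall(&r);	/* recompute after flushing */
			if (bytes == 0)		/* flushing satisfied the rsv */
				break;
		}
	} while (ret && last != bytes);	/* unchanged shortfall: truly ENOSPC */

	return ret && bytes ? 1 : 0;
}

The "last != bytes" exit condition mirrors the patch's "last != num_bytes"
check and is what bounds the loop: either flushing shrinks the shortfall
each round, or the loop gives up and errors out with -ENOSPC.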

Comments

Nikolay Borisov Dec. 12, 2018, 4:01 p.m. UTC | #1
On 3.12.18 at 17:24, Josef Bacik wrote:
> With severe fragmentation we can end up with our inode rsv size being
> huge during writeout, which would cause us to need to make very large
> metadata reservations.  However we may not actually need that much once

The sentence beginning with "However" needs more information: why might
we not need that much once writeout is complete?
> writeout is complete.  So instead try to make our reservation, and if we
> couldn't make it, re-calculate our new reservation size and try again.

Why do you think that recalculating the requested bytes will be
different the 2nd time?

> If our reservation size doesn't change between tries then we know we are
> actually out of space and can error out.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/extent-tree.c | 58 +++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 43 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 0ee77a98f867..0e1a499035ac 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -5787,6 +5787,21 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
>  	return ret;
>  }
>  
> +static inline void __get_refill_bytes(struct btrfs_block_rsv *block_rsv,
> +				      u64 *metadata_bytes, u64 *qgroup_bytes)

This function needs a better name. Something like calc_required_bytes or
calc_refill_bytes

> +{
> +	*metadata_bytes = 0;
> +	*qgroup_bytes = 0;
> +
> +	spin_lock(&block_rsv->lock);
> +	if (block_rsv->reserved < block_rsv->size)
> +		*metadata_bytes = block_rsv->size - block_rsv->reserved;
> +	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
> +		*qgroup_bytes = block_rsv->qgroup_rsv_size -
> +			block_rsv->qgroup_rsv_reserved;
> +	spin_unlock(&block_rsv->lock);
> +}
> +
>  /**
>   * btrfs_inode_rsv_refill - refill the inode block rsv.
>   * @inode - the inode we are refilling.
> @@ -5802,25 +5817,39 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
>  {
>  	struct btrfs_root *root = inode->root;
>  	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
> -	u64 num_bytes = 0;
> +	u64 num_bytes = 0, last = 0;
>  	u64 qgroup_num_bytes = 0;
>  	int ret = -ENOSPC;
>  
> -	spin_lock(&block_rsv->lock);
> -	if (block_rsv->reserved < block_rsv->size)
> -		num_bytes = block_rsv->size - block_rsv->reserved;
> -	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
> -		qgroup_num_bytes = block_rsv->qgroup_rsv_size -
> -				   block_rsv->qgroup_rsv_reserved;
> -	spin_unlock(&block_rsv->lock);
> -
> +	__get_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
>  	if (num_bytes == 0)
>  		return 0;
>  
> -	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
> -	if (ret)
> -		return ret;
> -	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
> +	do {
> +		ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
> +		if (ret)
> +			return ret;
> +		ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
> +		if (ret) {
> +			btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
> +			last = num_bytes;
> +			/*
> +			 * If we are fragmented we can end up with a lot of
> +			 * outstanding extents which will make our size be much
> +			 * larger than our reserved amount.  If we happen to
> +			 * try to do a reservation here that may result in us
> +			 * trying to do a pretty hefty reservation, which we may
> +			 * not need once delalloc flushing happens.  If this is

The "If we happen" sentence needs to be reworded because it's -ENOPARSE.
Perhaps one of the "to do a reservation" should go away?

> +			 * the case try and do the reserve again.
> +			 */
> +			if (flush == BTRFS_RESERVE_FLUSH_ALL)
> +				__get_refill_bytes(block_rsv, &num_bytes,
> +						   &qgroup_num_bytes);
> +			if (num_bytes == 0)
> +				return 0;
> +		}
> +	} while (ret && last != num_bytes);
> +
>  	if (!ret) {
>  		block_rsv_add_bytes(block_rsv, num_bytes, false);
>  		trace_btrfs_space_reservation(root->fs_info, "delalloc",
> @@ -5830,8 +5859,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
>  		spin_lock(&block_rsv->lock);
>  		block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
>  		spin_unlock(&block_rsv->lock);
> -	} else
> -		btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
> +	}
>  	return ret;
>  }
>  
>
David Sterba Jan. 30, 2019, 4:41 p.m. UTC | #2
On Mon, Dec 03, 2018 at 10:24:57AM -0500, Josef Bacik wrote:
> With severe fragmentation we can end up with our inode rsv size being
> huge during writeout, which would cause us to need to make very large
> metadata reservations.  However we may not actually need that much once
> writeout is complete.  So instead try to make our reservation, and if we
> couldn't make it, re-calculate our new reservation size and try again.
> If our reservation size doesn't change between tries then we know we are
> actually out of space and can error out.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>

This patch still has some non-trivial feedback without a response; the
other patches are ready for merge.
David Sterba Feb. 6, 2019, 6:20 p.m. UTC | #3
On Wed, Dec 12, 2018 at 06:01:57PM +0200, Nikolay Borisov wrote:
> 
> 
> On 3.12.18 at 17:24, Josef Bacik wrote:
> > With severe fragmentation we can end up with our inode rsv size being
> > huge during writeout, which would cause us to need to make very large
> > metadata reservations.  However we may not actually need that much once
> 
> The sentence beginning with "However" needs more information: why might
> we not need that much once writeout is complete?

Updated in changelog

> > writeout is complete.  So instead try to make our reservation, and if we
> > couldn't make it, re-calculate our new reservation size and try again.
> 
> Why do you think that recalculating the requested bytes will be
> different the 2nd time?

Partly answered in the comment in the code

> 
> > If our reservation size doesn't change between tries then we know we are
> > actually out of space and can error out.
> > 
> > Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > ---
> >  fs/btrfs/extent-tree.c | 58 +++++++++++++++++++++++++++++++++++++-------------
> >  1 file changed, 43 insertions(+), 15 deletions(-)
> > 
> > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> > index 0ee77a98f867..0e1a499035ac 100644
> > --- a/fs/btrfs/extent-tree.c
> > +++ b/fs/btrfs/extent-tree.c
> > @@ -5787,6 +5787,21 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
> >  	return ret;
> >  }
> >  
> > +static inline void __get_refill_bytes(struct btrfs_block_rsv *block_rsv,
> > +				      u64 *metadata_bytes, u64 *qgroup_bytes)
> 
> This function needs a better name. Something like calc_required_bytes or
> calc_refill_bytes

renamed to calc_refill_bytes

> 
> > +{
> > +	*metadata_bytes = 0;
> > +	*qgroup_bytes = 0;
> > +
> > +	spin_lock(&block_rsv->lock);
> > +	if (block_rsv->reserved < block_rsv->size)
> > +		*metadata_bytes = block_rsv->size - block_rsv->reserved;
> > +	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
> > +		*qgroup_bytes = block_rsv->qgroup_rsv_size -
> > +			block_rsv->qgroup_rsv_reserved;
> > +	spin_unlock(&block_rsv->lock);
> > +}
> > +
> >  /**
> >   * btrfs_inode_rsv_refill - refill the inode block rsv.
> >   * @inode - the inode we are refilling.
> > @@ -5802,25 +5817,39 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
> >  {
> >  	struct btrfs_root *root = inode->root;
> >  	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
> > -	u64 num_bytes = 0;
> > +	u64 num_bytes = 0, last = 0;
> >  	u64 qgroup_num_bytes = 0;
> >  	int ret = -ENOSPC;
> >  
> > -	spin_lock(&block_rsv->lock);
> > -	if (block_rsv->reserved < block_rsv->size)
> > -		num_bytes = block_rsv->size - block_rsv->reserved;
> > -	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
> > -		qgroup_num_bytes = block_rsv->qgroup_rsv_size -
> > -				   block_rsv->qgroup_rsv_reserved;
> > -	spin_unlock(&block_rsv->lock);
> > -
> > +	__get_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
> >  	if (num_bytes == 0)
> >  		return 0;
> >  
> > -	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
> > -	if (ret)
> > -		return ret;
> > -	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
> > +	do {
> > +		ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
> > +		if (ret)
> > +			return ret;
> > +		ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
> > +		if (ret) {
> > +			btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
> > +			last = num_bytes;
> > +			/*
> > +			 * If we are fragmented we can end up with a lot of
> > +			 * outstanding extents which will make our size be much
> > +			 * larger than our reserved amount.  If we happen to
> > +			 * try to do a reservation here that may result in us
> > +			 * trying to do a pretty hefty reservation, which we may
> > +			 * not need once delalloc flushing happens.  If this is
> 
> The "If we happen" sentence needs to be reworded because it's -ENOPARSE.
> Perhaps one of the "to do a reservation" should go away?

Reworded

Patch

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0ee77a98f867..0e1a499035ac 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5787,6 +5787,21 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 	return ret;
 }
 
+static inline void __get_refill_bytes(struct btrfs_block_rsv *block_rsv,
+				      u64 *metadata_bytes, u64 *qgroup_bytes)
+{
+	*metadata_bytes = 0;
+	*qgroup_bytes = 0;
+
+	spin_lock(&block_rsv->lock);
+	if (block_rsv->reserved < block_rsv->size)
+		*metadata_bytes = block_rsv->size - block_rsv->reserved;
+	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
+		*qgroup_bytes = block_rsv->qgroup_rsv_size -
+			block_rsv->qgroup_rsv_reserved;
+	spin_unlock(&block_rsv->lock);
+}
+
 /**
  * btrfs_inode_rsv_refill - refill the inode block rsv.
  * @inode - the inode we are refilling.
@@ -5802,25 +5817,39 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 {
 	struct btrfs_root *root = inode->root;
 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
-	u64 num_bytes = 0;
+	u64 num_bytes = 0, last = 0;
 	u64 qgroup_num_bytes = 0;
 	int ret = -ENOSPC;
 
-	spin_lock(&block_rsv->lock);
-	if (block_rsv->reserved < block_rsv->size)
-		num_bytes = block_rsv->size - block_rsv->reserved;
-	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
-		qgroup_num_bytes = block_rsv->qgroup_rsv_size -
-				   block_rsv->qgroup_rsv_reserved;
-	spin_unlock(&block_rsv->lock);
-
+	__get_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
 	if (num_bytes == 0)
 		return 0;
 
-	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
-	if (ret)
-		return ret;
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+	do {
+		ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
+		if (ret)
+			return ret;
+		ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+		if (ret) {
+			btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+			last = num_bytes;
+			/*
+			 * If we are fragmented we can end up with a lot of
+			 * outstanding extents which will make our size be much
+			 * larger than our reserved amount.  If we happen to
+			 * try to do a reservation here that may result in us
+			 * trying to do a pretty hefty reservation, which we may
+			 * not need once delalloc flushing happens.  If this is
+			 * the case try and do the reserve again.
+			 */
+			if (flush == BTRFS_RESERVE_FLUSH_ALL)
+				__get_refill_bytes(block_rsv, &num_bytes,
+						   &qgroup_num_bytes);
+			if (num_bytes == 0)
+				return 0;
+		}
+	} while (ret && last != num_bytes);
+
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, false);
 		trace_btrfs_space_reservation(root->fs_info, "delalloc",
@@ -5830,8 +5859,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 		spin_lock(&block_rsv->lock);
 		block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
 		spin_unlock(&block_rsv->lock);
-	} else
-		btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+	}
 	return ret;
 }