diff mbox

[v2,08/12] Btrfs: fix ENOSPC caused by orphan items reservations

Message ID a0b43dd0428248dc1ff98e0986b142ad9eb418e4.1525997057.git.osandov@fb.com (mailing list archive)
State New, archived
Headers show

Commit Message

Omar Sandoval May 11, 2018, 12:11 a.m. UTC
From: Omar Sandoval <osandov@fb.com>

Currently, we keep space reserved for all inode orphan items until the
inode is evicted (i.e., all references to it are dropped). We hit an
issue where an application would keep a bunch of deleted files open (by
design) and thus keep a large amount of space reserved, causing ENOSPC
errors when other operations tried to reserve space. This long-standing
reservation isn't absolutely necessary for a couple of reasons:

- We can almost always make the reservation we need or steal from the
  global reserve for the orphan item
- If we can't, it's not the end of the world if we drop the orphan item
  on the floor and let the next mount clean it up

So, get rid of persistent reservation and just reserve space in
btrfs_evict_inode().

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 fs/btrfs/btrfs_inode.h |  19 +++---
 fs/btrfs/inode.c       | 142 ++++++++++++-----------------------------
 2 files changed, 50 insertions(+), 111 deletions(-)

Comments

Nikolay Borisov May 11, 2018, 6:38 a.m. UTC | #1
On 11.05.2018 03:11, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> Currently, we keep space reserved for all inode orphan items until the
> inode is evicted (i.e., all references to it are dropped). We hit an
> issue where an application would keep a bunch of deleted files open (by
> design) and thus keep a large amount of space reserved, causing ENOSPC
> errors when other operations tried to reserve space. This long-standing
> reservation isn't absolutely necessary for a couple of reasons:
> 
> - We can almost always make the reservation we need or steal from the
>   global reserve for the orphan item
> - If we can't, it's not the end of the world if we drop the orphan item
>   on the floor and let the next mount clean it up
> 
> So, get rid of persistent reservation and just reserve space in
> btrfs_evict_inode().
> 
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
>  fs/btrfs/btrfs_inode.h |  19 +++---
>  fs/btrfs/inode.c       | 142 ++++++++++++-----------------------------
>  2 files changed, 50 insertions(+), 111 deletions(-)
> 
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 234bae55b85d..2f466cf55790 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -20,16 +20,15 @@
>   * new data the application may have written before commit.
>   */
>  #define BTRFS_INODE_ORDERED_DATA_CLOSE		0
> -#define BTRFS_INODE_ORPHAN_META_RESERVED	1
> -#define BTRFS_INODE_DUMMY			2
> -#define BTRFS_INODE_IN_DEFRAG			3
> -#define BTRFS_INODE_HAS_ORPHAN_ITEM		4
> -#define BTRFS_INODE_HAS_ASYNC_EXTENT		5
> -#define BTRFS_INODE_NEEDS_FULL_SYNC		6
> -#define BTRFS_INODE_COPY_EVERYTHING		7
> -#define BTRFS_INODE_IN_DELALLOC_LIST		8
> -#define BTRFS_INODE_READDIO_NEED_LOCK		9
> -#define BTRFS_INODE_HAS_PROPS		        10
> +#define BTRFS_INODE_DUMMY			1
> +#define BTRFS_INODE_IN_DEFRAG			2
> +#define BTRFS_INODE_HAS_ORPHAN_ITEM		3
> +#define BTRFS_INODE_HAS_ASYNC_EXTENT		4
> +#define BTRFS_INODE_NEEDS_FULL_SYNC		5
> +#define BTRFS_INODE_COPY_EVERYTHING		6
> +#define BTRFS_INODE_IN_DELALLOC_LIST		7
> +#define BTRFS_INODE_READDIO_NEED_LOCK		8
> +#define BTRFS_INODE_HAS_PROPS		        9
>  
>  /* in memory btrfs inode */
>  struct btrfs_inode {
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 348dc57920f5..b9a046b8c72c 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -3343,88 +3343,25 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
>  /*
>   * This creates an orphan entry for the given inode in case something goes wrong
>   * in the middle of an unlink.
> - *
> - * NOTE: caller of this function should reserve 5 units of metadata for
> - *	 this function.
>   */
>  int btrfs_orphan_add(struct btrfs_trans_handle *trans,
>  		struct btrfs_inode *inode)
>  {
> -	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
>  	struct btrfs_root *root = inode->root;
> -	struct btrfs_block_rsv *block_rsv = NULL;
> -	int reserve = 0;
> -	bool insert = false;
>  	int ret;
>  
> -	if (!root->orphan_block_rsv) {
> -		block_rsv = btrfs_alloc_block_rsv(fs_info,
> -						  BTRFS_BLOCK_RSV_TEMP);
> -		if (!block_rsv)
> -			return -ENOMEM;
> -	}
> -
> -	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> -			      &inode->runtime_flags))
> -		insert = true;
> -
> -	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> -			      &inode->runtime_flags))
> -		reserve = 1;
> -
> -	spin_lock(&root->orphan_lock);
> -	/* If someone has created ->orphan_block_rsv, be happy to use it. */
> -	if (!root->orphan_block_rsv) {
> -		root->orphan_block_rsv = block_rsv;
> -	} else if (block_rsv) {
> -		btrfs_free_block_rsv(fs_info, block_rsv);
> -		block_rsv = NULL;
> -	}
> +	if (test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> +			     &inode->runtime_flags))
> +		return 0;

How come this can return true? Shouldn't btrfs_orphan_add always be
called for an inode which doesn't have an orphan item? Having this check
seems to indicate there is some non-determinism in the lifetime of
orphan items.

>  
> -	if (insert)
> -		atomic_inc(&root->orphan_inodes);
> -	spin_unlock(&root->orphan_lock);
> +	atomic_inc(&root->orphan_inodes);
>  
> -	/* grab metadata reservation from transaction handle */
> -	if (reserve) {
> -		ret = btrfs_orphan_reserve_metadata(trans, inode);
> -		ASSERT(!ret);
> -		if (ret) {
> -			/*
> -			 * dec doesn't need spin_lock as ->orphan_block_rsv
> -			 * would be released only if ->orphan_inodes is
> -			 * zero.
> -			 */
> -			atomic_dec(&root->orphan_inodes);
> -			clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> -				  &inode->runtime_flags);
> -			if (insert)
> -				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> -					  &inode->runtime_flags);
> -			return ret;
> -		}
> -	}
> -
> -	/* insert an orphan item to track this unlinked file */
> -	if (insert) {
> -		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
> -		if (ret && ret != -EEXIST) {
> -			if (reserve) {
> -				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> -					  &inode->runtime_flags);
> -				btrfs_orphan_release_metadata(inode);
> -			}
> -			/*
> -			 * btrfs_orphan_commit_root may race with us and set
> -			 * ->orphan_block_rsv to zero, in order to avoid that,
> -			 * decrease ->orphan_inodes after everything is done.
> -			 */
> -			atomic_dec(&root->orphan_inodes);
> -			clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> -				  &inode->runtime_flags);
> -			btrfs_abort_transaction(trans, ret);
> -			return ret;
> -		}
> +	ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
> +	if (ret && ret != -EEXIST) {
> +		atomic_dec(&root->orphan_inodes);
> +		clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &inode->runtime_flags);
> +		btrfs_abort_transaction(trans, ret);
> +		return ret;
>  	}
>  
>  	return 0;
> @@ -3438,27 +3375,16 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
>  			    struct btrfs_inode *inode)
>  {
>  	struct btrfs_root *root = inode->root;
> -	int delete_item = 0;
>  	int ret = 0;
>  
> -	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> -			       &inode->runtime_flags))
> -		delete_item = 1;
> +	if (!test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> +				&inode->runtime_flags))
> +		return 0;

Similar comment as in btrfs_orphan_del. Shouldn't this always follow a
successful btrfs_orphan_add, guaranteeing there is an item for this
inode? Are there some "benign" races that could happen in the mean time
which could trigger this check ?

>  
> -	if (delete_item && trans)
> +	if (trans)
>  		ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
>  
> -	if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> -			       &inode->runtime_flags))
> -		btrfs_orphan_release_metadata(inode);
> -
> -	/*
> -	 * btrfs_orphan_commit_root may race with us and set ->orphan_block_rsv
> -	 * to zero, in order to avoid that, decrease ->orphan_inodes after
> -	 * everything is done.
> -	 */
> -	if (delete_item)
> -		atomic_dec(&root->orphan_inodes);
> +	atomic_dec(&root->orphan_inodes);
>  
>  	return ret;
>  }
> @@ -5339,10 +5265,10 @@ void btrfs_evict_inode(struct inode *inode)
>  		trans->block_rsv = rsv;
>  
>  		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
> +		trans->block_rsv = &fs_info->trans_block_rsv;
> +		btrfs_end_transaction(trans);
> +		btrfs_btree_balance_dirty(fs_info);
>  		if (ret) {
> -			trans->block_rsv = &fs_info->trans_block_rsv;
> -			btrfs_end_transaction(trans);
> -			btrfs_btree_balance_dirty(fs_info);
>  			if (ret != -ENOSPC && ret != -EAGAIN) {
>  				btrfs_orphan_del(NULL, BTRFS_I(inode));
>  				btrfs_free_block_rsv(fs_info, rsv);
> @@ -5353,22 +5279,36 @@ void btrfs_evict_inode(struct inode *inode)
>  		}
>  	}
>  
> -	btrfs_free_block_rsv(fs_info, rsv);
> -
>  	/*
> -	 * Errors here aren't a big deal, it just means we leave orphan items
> -	 * in the tree.  They will be cleaned up on the next mount.
> +	 * Errors here aren't a big deal, it just means we leave orphan items in
> +	 * the tree. They will be cleaned up on the next mount. If the inode
> +	 * number gets reused, cleanup deletes the orphan item without doing
> +	 * anything, and unlink reuses the existing orphan item.
> +	 *
> +	 * If it turns out that we are dropping too many of these, we might want
> +	 * to add a mechanism for retrying these after a commit.
>  	 */
> -	trans->block_rsv = root->orphan_block_rsv;
> -	btrfs_orphan_del(trans, BTRFS_I(inode));
> +	trans = evict_refill_and_join(root, rsv, min_size);
> +	if (IS_ERR(trans)) {
> +		btrfs_orphan_del(NULL, BTRFS_I(inode));
> +	} else {
> +		trans->block_rsv = rsv;
> +		btrfs_orphan_del(trans, BTRFS_I(inode));
> +		trans->block_rsv = &fs_info->trans_block_rsv;
> +		btrfs_end_transaction(trans);
> +	}
> +
> +	btrfs_free_block_rsv(fs_info, rsv);
>  
> -	trans->block_rsv = &fs_info->trans_block_rsv;
>  	if (!(root == fs_info->tree_root ||
>  	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
>  		btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
>  
> -	btrfs_end_transaction(trans);
> -	btrfs_btree_balance_dirty(fs_info);
> +	/*
> +	 * If we didn't successfully delete, the orphan item will still be in
> +	 * the tree and we'll retry on the next mount. Again, we might also want
> +	 * to retry these periodically in the future.
> +	 */
>  no_delete:
>  	btrfs_remove_delayed_node(BTRFS_I(inode));
>  	clear_inode(inode);
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Omar Sandoval May 11, 2018, 6:51 a.m. UTC | #2
On Fri, May 11, 2018 at 09:38:15AM +0300, Nikolay Borisov wrote:
> 
> 
> On 11.05.2018 03:11, Omar Sandoval wrote:
> > From: Omar Sandoval <osandov@fb.com>
> > 
> > Currently, we keep space reserved for all inode orphan items until the
> > inode is evicted (i.e., all references to it are dropped). We hit an
> > issue where an application would keep a bunch of deleted files open (by
> > design) and thus keep a large amount of space reserved, causing ENOSPC
> > errors when other operations tried to reserve space. This long-standing
> > reservation isn't absolutely necessary for a couple of reasons:
> > 
> > - We can almost always make the reservation we need or steal from the
> >   global reserve for the orphan item
> > - If we can't, it's not the end of the world if we drop the orphan item
> >   on the floor and let the next mount clean it up
> > 
> > So, get rid of persistent reservation and just reserve space in
> > btrfs_evict_inode().
> > 
> > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > ---
> >  fs/btrfs/btrfs_inode.h |  19 +++---
> >  fs/btrfs/inode.c       | 142 ++++++++++++-----------------------------
> >  2 files changed, 50 insertions(+), 111 deletions(-)
> > 
> > diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> > index 234bae55b85d..2f466cf55790 100644
> > --- a/fs/btrfs/btrfs_inode.h
> > +++ b/fs/btrfs/btrfs_inode.h
> > @@ -20,16 +20,15 @@
> >   * new data the application may have written before commit.
> >   */
> >  #define BTRFS_INODE_ORDERED_DATA_CLOSE		0
> > -#define BTRFS_INODE_ORPHAN_META_RESERVED	1
> > -#define BTRFS_INODE_DUMMY			2
> > -#define BTRFS_INODE_IN_DEFRAG			3
> > -#define BTRFS_INODE_HAS_ORPHAN_ITEM		4
> > -#define BTRFS_INODE_HAS_ASYNC_EXTENT		5
> > -#define BTRFS_INODE_NEEDS_FULL_SYNC		6
> > -#define BTRFS_INODE_COPY_EVERYTHING		7
> > -#define BTRFS_INODE_IN_DELALLOC_LIST		8
> > -#define BTRFS_INODE_READDIO_NEED_LOCK		9
> > -#define BTRFS_INODE_HAS_PROPS		        10
> > +#define BTRFS_INODE_DUMMY			1
> > +#define BTRFS_INODE_IN_DEFRAG			2
> > +#define BTRFS_INODE_HAS_ORPHAN_ITEM		3
> > +#define BTRFS_INODE_HAS_ASYNC_EXTENT		4
> > +#define BTRFS_INODE_NEEDS_FULL_SYNC		5
> > +#define BTRFS_INODE_COPY_EVERYTHING		6
> > +#define BTRFS_INODE_IN_DELALLOC_LIST		7
> > +#define BTRFS_INODE_READDIO_NEED_LOCK		8
> > +#define BTRFS_INODE_HAS_PROPS		        9
> >  
> >  /* in memory btrfs inode */
> >  struct btrfs_inode {
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index 348dc57920f5..b9a046b8c72c 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -3343,88 +3343,25 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
> >  /*
> >   * This creates an orphan entry for the given inode in case something goes wrong
> >   * in the middle of an unlink.
> > - *
> > - * NOTE: caller of this function should reserve 5 units of metadata for
> > - *	 this function.
> >   */
> >  int btrfs_orphan_add(struct btrfs_trans_handle *trans,
> >  		struct btrfs_inode *inode)
> >  {
> > -	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
> >  	struct btrfs_root *root = inode->root;
> > -	struct btrfs_block_rsv *block_rsv = NULL;
> > -	int reserve = 0;
> > -	bool insert = false;
> >  	int ret;
> >  
> > -	if (!root->orphan_block_rsv) {
> > -		block_rsv = btrfs_alloc_block_rsv(fs_info,
> > -						  BTRFS_BLOCK_RSV_TEMP);
> > -		if (!block_rsv)
> > -			return -ENOMEM;
> > -	}
> > -
> > -	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> > -			      &inode->runtime_flags))
> > -		insert = true;
> > -
> > -	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> > -			      &inode->runtime_flags))
> > -		reserve = 1;
> > -
> > -	spin_lock(&root->orphan_lock);
> > -	/* If someone has created ->orphan_block_rsv, be happy to use it. */
> > -	if (!root->orphan_block_rsv) {
> > -		root->orphan_block_rsv = block_rsv;
> > -	} else if (block_rsv) {
> > -		btrfs_free_block_rsv(fs_info, block_rsv);
> > -		block_rsv = NULL;
> > -	}
> > +	if (test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> > +			     &inode->runtime_flags))
> > +		return 0;
> 
> How come this can return true? Shouldn't btrfs_orphan_add always be
> called for an inode which doesn't have an orphan item? Having this check
> seems to indicate there is some non-determinism in the lifetime of
> orphan items.

Another great point, this was needed when we also inserted orphan items
for truncate, but it's not needed anymore. I'll clean that up, too.

> > -	if (insert)
> > -		atomic_inc(&root->orphan_inodes);
> > -	spin_unlock(&root->orphan_lock);
> > +	atomic_inc(&root->orphan_inodes);
> >  
> > -	/* grab metadata reservation from transaction handle */
> > -	if (reserve) {
> > -		ret = btrfs_orphan_reserve_metadata(trans, inode);
> > -		ASSERT(!ret);
> > -		if (ret) {
> > -			/*
> > -			 * dec doesn't need spin_lock as ->orphan_block_rsv
> > -			 * would be released only if ->orphan_inodes is
> > -			 * zero.
> > -			 */
> > -			atomic_dec(&root->orphan_inodes);
> > -			clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> > -				  &inode->runtime_flags);
> > -			if (insert)
> > -				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> > -					  &inode->runtime_flags);
> > -			return ret;
> > -		}
> > -	}
> > -
> > -	/* insert an orphan item to track this unlinked file */
> > -	if (insert) {
> > -		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
> > -		if (ret && ret != -EEXIST) {
> > -			if (reserve) {
> > -				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> > -					  &inode->runtime_flags);
> > -				btrfs_orphan_release_metadata(inode);
> > -			}
> > -			/*
> > -			 * btrfs_orphan_commit_root may race with us and set
> > -			 * ->orphan_block_rsv to zero, in order to avoid that,
> > -			 * decrease ->orphan_inodes after everything is done.
> > -			 */
> > -			atomic_dec(&root->orphan_inodes);
> > -			clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> > -				  &inode->runtime_flags);
> > -			btrfs_abort_transaction(trans, ret);
> > -			return ret;
> > -		}
> > +	ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
> > +	if (ret && ret != -EEXIST) {
> > +		atomic_dec(&root->orphan_inodes);
> > +		clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &inode->runtime_flags);
> > +		btrfs_abort_transaction(trans, ret);
> > +		return ret;
> >  	}
> >  
> >  	return 0;
> > @@ -3438,27 +3375,16 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
> >  			    struct btrfs_inode *inode)
> >  {
> >  	struct btrfs_root *root = inode->root;
> > -	int delete_item = 0;
> >  	int ret = 0;
> >  
> > -	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> > -			       &inode->runtime_flags))
> > -		delete_item = 1;
> > +	if (!test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> > +				&inode->runtime_flags))
> > +		return 0;
> 
> Similar comment as in btrfs_orphan_del. Shouldn't this always follow a
> successful btrfs_orphan_add, guaranteeing there is an item for this
> inode? Are there some "benign" races that could happen in the mean time
> which could trigger this check ?

Same thing here, and it's even easier to convince myself here that it's
unnecessary: btrfs_evict_inode() ignores errors if something magical did
happen, and we really expect the orphan item to be there for an
O_TMPFILE in btrfs_link().
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 234bae55b85d..2f466cf55790 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -20,16 +20,15 @@ 
  * new data the application may have written before commit.
  */
 #define BTRFS_INODE_ORDERED_DATA_CLOSE		0
-#define BTRFS_INODE_ORPHAN_META_RESERVED	1
-#define BTRFS_INODE_DUMMY			2
-#define BTRFS_INODE_IN_DEFRAG			3
-#define BTRFS_INODE_HAS_ORPHAN_ITEM		4
-#define BTRFS_INODE_HAS_ASYNC_EXTENT		5
-#define BTRFS_INODE_NEEDS_FULL_SYNC		6
-#define BTRFS_INODE_COPY_EVERYTHING		7
-#define BTRFS_INODE_IN_DELALLOC_LIST		8
-#define BTRFS_INODE_READDIO_NEED_LOCK		9
-#define BTRFS_INODE_HAS_PROPS		        10
+#define BTRFS_INODE_DUMMY			1
+#define BTRFS_INODE_IN_DEFRAG			2
+#define BTRFS_INODE_HAS_ORPHAN_ITEM		3
+#define BTRFS_INODE_HAS_ASYNC_EXTENT		4
+#define BTRFS_INODE_NEEDS_FULL_SYNC		5
+#define BTRFS_INODE_COPY_EVERYTHING		6
+#define BTRFS_INODE_IN_DELALLOC_LIST		7
+#define BTRFS_INODE_READDIO_NEED_LOCK		8
+#define BTRFS_INODE_HAS_PROPS		        9
 
 /* in memory btrfs inode */
 struct btrfs_inode {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 348dc57920f5..b9a046b8c72c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3343,88 +3343,25 @@  void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 /*
  * This creates an orphan entry for the given inode in case something goes wrong
  * in the middle of an unlink.
- *
- * NOTE: caller of this function should reserve 5 units of metadata for
- *	 this function.
  */
 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
 		struct btrfs_inode *inode)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 	struct btrfs_root *root = inode->root;
-	struct btrfs_block_rsv *block_rsv = NULL;
-	int reserve = 0;
-	bool insert = false;
 	int ret;
 
-	if (!root->orphan_block_rsv) {
-		block_rsv = btrfs_alloc_block_rsv(fs_info,
-						  BTRFS_BLOCK_RSV_TEMP);
-		if (!block_rsv)
-			return -ENOMEM;
-	}
-
-	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-			      &inode->runtime_flags))
-		insert = true;
-
-	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
-			      &inode->runtime_flags))
-		reserve = 1;
-
-	spin_lock(&root->orphan_lock);
-	/* If someone has created ->orphan_block_rsv, be happy to use it. */
-	if (!root->orphan_block_rsv) {
-		root->orphan_block_rsv = block_rsv;
-	} else if (block_rsv) {
-		btrfs_free_block_rsv(fs_info, block_rsv);
-		block_rsv = NULL;
-	}
+	if (test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+			     &inode->runtime_flags))
+		return 0;
 
-	if (insert)
-		atomic_inc(&root->orphan_inodes);
-	spin_unlock(&root->orphan_lock);
+	atomic_inc(&root->orphan_inodes);
 
-	/* grab metadata reservation from transaction handle */
-	if (reserve) {
-		ret = btrfs_orphan_reserve_metadata(trans, inode);
-		ASSERT(!ret);
-		if (ret) {
-			/*
-			 * dec doesn't need spin_lock as ->orphan_block_rsv
-			 * would be released only if ->orphan_inodes is
-			 * zero.
-			 */
-			atomic_dec(&root->orphan_inodes);
-			clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
-				  &inode->runtime_flags);
-			if (insert)
-				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-					  &inode->runtime_flags);
-			return ret;
-		}
-	}
-
-	/* insert an orphan item to track this unlinked file */
-	if (insert) {
-		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
-		if (ret && ret != -EEXIST) {
-			if (reserve) {
-				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
-					  &inode->runtime_flags);
-				btrfs_orphan_release_metadata(inode);
-			}
-			/*
-			 * btrfs_orphan_commit_root may race with us and set
-			 * ->orphan_block_rsv to zero, in order to avoid that,
-			 * decrease ->orphan_inodes after everything is done.
-			 */
-			atomic_dec(&root->orphan_inodes);
-			clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-				  &inode->runtime_flags);
-			btrfs_abort_transaction(trans, ret);
-			return ret;
-		}
+	ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
+	if (ret && ret != -EEXIST) {
+		atomic_dec(&root->orphan_inodes);
+		clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &inode->runtime_flags);
+		btrfs_abort_transaction(trans, ret);
+		return ret;
 	}
 
 	return 0;
@@ -3438,27 +3375,16 @@  static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
 			    struct btrfs_inode *inode)
 {
 	struct btrfs_root *root = inode->root;
-	int delete_item = 0;
 	int ret = 0;
 
-	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-			       &inode->runtime_flags))
-		delete_item = 1;
+	if (!test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+				&inode->runtime_flags))
+		return 0;
 
-	if (delete_item && trans)
+	if (trans)
 		ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
 
-	if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
-			       &inode->runtime_flags))
-		btrfs_orphan_release_metadata(inode);
-
-	/*
-	 * btrfs_orphan_commit_root may race with us and set ->orphan_block_rsv
-	 * to zero, in order to avoid that, decrease ->orphan_inodes after
-	 * everything is done.
-	 */
-	if (delete_item)
-		atomic_dec(&root->orphan_inodes);
+	atomic_dec(&root->orphan_inodes);
 
 	return ret;
 }
@@ -5339,10 +5265,10 @@  void btrfs_evict_inode(struct inode *inode)
 		trans->block_rsv = rsv;
 
 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+		trans->block_rsv = &fs_info->trans_block_rsv;
+		btrfs_end_transaction(trans);
+		btrfs_btree_balance_dirty(fs_info);
 		if (ret) {
-			trans->block_rsv = &fs_info->trans_block_rsv;
-			btrfs_end_transaction(trans);
-			btrfs_btree_balance_dirty(fs_info);
 			if (ret != -ENOSPC && ret != -EAGAIN) {
 				btrfs_orphan_del(NULL, BTRFS_I(inode));
 				btrfs_free_block_rsv(fs_info, rsv);
@@ -5353,22 +5279,36 @@  void btrfs_evict_inode(struct inode *inode)
 		}
 	}
 
-	btrfs_free_block_rsv(fs_info, rsv);
-
 	/*
-	 * Errors here aren't a big deal, it just means we leave orphan items
-	 * in the tree.  They will be cleaned up on the next mount.
+	 * Errors here aren't a big deal, it just means we leave orphan items in
+	 * the tree. They will be cleaned up on the next mount. If the inode
+	 * number gets reused, cleanup deletes the orphan item without doing
+	 * anything, and unlink reuses the existing orphan item.
+	 *
+	 * If it turns out that we are dropping too many of these, we might want
+	 * to add a mechanism for retrying these after a commit.
 	 */
-	trans->block_rsv = root->orphan_block_rsv;
-	btrfs_orphan_del(trans, BTRFS_I(inode));
+	trans = evict_refill_and_join(root, rsv, min_size);
+	if (IS_ERR(trans)) {
+		btrfs_orphan_del(NULL, BTRFS_I(inode));
+	} else {
+		trans->block_rsv = rsv;
+		btrfs_orphan_del(trans, BTRFS_I(inode));
+		trans->block_rsv = &fs_info->trans_block_rsv;
+		btrfs_end_transaction(trans);
+	}
+
+	btrfs_free_block_rsv(fs_info, rsv);
 
-	trans->block_rsv = &fs_info->trans_block_rsv;
 	if (!(root == fs_info->tree_root ||
 	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
 		btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
 
-	btrfs_end_transaction(trans);
-	btrfs_btree_balance_dirty(fs_info);
+	/*
+	 * If we didn't successfully delete, the orphan item will still be in
+	 * the tree and we'll retry on the next mount. Again, we might also want
+	 * to retry these periodically in the future.
+	 */
 no_delete:
 	btrfs_remove_delayed_node(BTRFS_I(inode));
 	clear_inode(inode);