diff mbox series

[v5.2,3/3] xfs: fix an incore inode UAF in xfs_bui_recover

Message ID 20201002042236.GV49547@magnolia (mailing list archive)
State Superseded
Headers show
Series None | expand

Commit Message

Darrick J. Wong Oct. 2, 2020, 4:22 a.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

In xfs_bui_item_recover, there exists a use-after-free bug with regards
to the inode that is involved in the bmap replay operation.  If the
mapping operation does not complete, we call xfs_bmap_unmap_extent to
create a deferred op to finish the unmapping work, and we retain a
pointer to the incore inode.

Unfortunately, the very next thing we do is commit the transaction and
drop the inode.  If reclaim tears down the inode before we try to finish
the defer ops, we dereference garbage and blow up.  Therefore, create a
way to join inodes to the defer ops freezer so that we can maintain the
xfs_inode reference until we're done with the inode.

Note: This imposes the requirement that there be enough memory to keep
every incore inode in memory throughout recovery.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
v5.2: rebase on updated defer capture patches
---
 fs/xfs/libxfs/xfs_defer.c  |   55 ++++++++++++++++++++++++++++++++++++++------
 fs/xfs/libxfs/xfs_defer.h  |   11 +++++++--
 fs/xfs/xfs_bmap_item.c     |    8 ++----
 fs/xfs/xfs_extfree_item.c  |    2 +-
 fs/xfs/xfs_log_recover.c   |    7 +++++-
 fs/xfs/xfs_refcount_item.c |    2 +-
 fs/xfs/xfs_rmap_item.c     |    2 +-
 7 files changed, 67 insertions(+), 20 deletions(-)

Comments

Christoph Hellwig Oct. 2, 2020, 7:30 a.m. UTC | #1
On Thu, Oct 01, 2020 at 09:22:36PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> In xfs_bui_item_recover, there exists a use-after-free bug with regards
> to the inode that is involved in the bmap replay operation.  If the
> mapping operation does not complete, we call xfs_bmap_unmap_extent to
> create a deferred op to finish the unmapping work, and we retain a
> pointer to the incore inode.
> 
> Unfortunately, the very next thing we do is commit the transaction and
> drop the inode.  If reclaim tears down the inode before we try to finish
> the defer ops, we dereference garbage and blow up.  Therefore, create a
> way to join inodes to the defer ops freezer so that we can maintain the
> xfs_inode reference until we're done with the inode.
> 
> Note: This imposes the requirement that there be enough memory to keep
> every incore inode in memory throughout recovery.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> v5.2: rebase on updated defer capture patches
> ---
>  fs/xfs/libxfs/xfs_defer.c  |   55 ++++++++++++++++++++++++++++++++++++++------
>  fs/xfs/libxfs/xfs_defer.h  |   11 +++++++--
>  fs/xfs/xfs_bmap_item.c     |    8 ++----
>  fs/xfs/xfs_extfree_item.c  |    2 +-
>  fs/xfs/xfs_log_recover.c   |    7 +++++-
>  fs/xfs/xfs_refcount_item.c |    2 +-
>  fs/xfs/xfs_rmap_item.c     |    2 +-
>  7 files changed, 67 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
> index e19dc1ced7e6..4af5752f9830 100644
> --- a/fs/xfs/libxfs/xfs_defer.c
> +++ b/fs/xfs/libxfs/xfs_defer.c
> @@ -16,6 +16,7 @@
>  #include "xfs_inode.h"
>  #include "xfs_inode_item.h"
>  #include "xfs_trace.h"
> +#include "xfs_icache.h"
>  
>  /*
>   * Deferred Operations in XFS
> @@ -553,10 +554,14 @@ xfs_defer_move(
>   * deferred ops state is transferred to the capture structure and the
>   * transaction is then ready for the caller to commit it.  If there are no
>   * intent items to capture, this function returns NULL.
> + *
> + * If inodes are passed in and this function returns a capture structure, the
> + * inodes are now owned by the capture structure.
>   */
>  static struct xfs_defer_capture *
>  xfs_defer_ops_capture(
> -	struct xfs_trans		*tp)
> +	struct xfs_trans		*tp,
> +	struct xfs_inode		*ip)
>  {
>  	struct xfs_defer_capture	*dfc;
>  
> @@ -582,6 +587,12 @@ xfs_defer_ops_capture(
>  	/* Preserve the log reservation size. */
>  	dfc->dfc_logres = tp->t_log_res;
>  
> +	/*
> +	 * Transfer responsibility for unlocking and releasing the inodes to
> +	 * the capture structure.
> +	 */
> +	dfc->dfc_ip = ip;
> +

Maybe rename ip to capture_ip?

> +	ASSERT(ip == NULL || xfs_isilocked(ip, XFS_ILOCK_EXCL));
> +
>  	/* If we don't capture anything, commit transaction and exit. */
> +	dfc = xfs_defer_ops_capture(tp, ip);
> +	if (!dfc) {
> +		error = xfs_trans_commit(tp);
> +		if (ip) {
> +			xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +			xfs_irele(ip);
> +		}
> +		return error;
> +	}

Instead of coming up with our own inode unlocking and release schemes,
can't we just require that the inode is joinged by passing the lock
flags to xfs_trans_ijoin, and piggy back on xfs_trans_commit unlocking
it in that case?
Darrick J. Wong Oct. 2, 2020, 4:29 p.m. UTC | #2
On Fri, Oct 02, 2020 at 09:30:06AM +0200, Christoph Hellwig wrote:
> On Thu, Oct 01, 2020 at 09:22:36PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > In xfs_bui_item_recover, there exists a use-after-free bug with regards
> > to the inode that is involved in the bmap replay operation.  If the
> > mapping operation does not complete, we call xfs_bmap_unmap_extent to
> > create a deferred op to finish the unmapping work, and we retain a
> > pointer to the incore inode.
> > 
> > Unfortunately, the very next thing we do is commit the transaction and
> > drop the inode.  If reclaim tears down the inode before we try to finish
> > the defer ops, we dereference garbage and blow up.  Therefore, create a
> > way to join inodes to the defer ops freezer so that we can maintain the
> > xfs_inode reference until we're done with the inode.
> > 
> > Note: This imposes the requirement that there be enough memory to keep
> > every incore inode in memory throughout recovery.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> > v5.2: rebase on updated defer capture patches
> > ---
> >  fs/xfs/libxfs/xfs_defer.c  |   55 ++++++++++++++++++++++++++++++++++++++------
> >  fs/xfs/libxfs/xfs_defer.h  |   11 +++++++--
> >  fs/xfs/xfs_bmap_item.c     |    8 ++----
> >  fs/xfs/xfs_extfree_item.c  |    2 +-
> >  fs/xfs/xfs_log_recover.c   |    7 +++++-
> >  fs/xfs/xfs_refcount_item.c |    2 +-
> >  fs/xfs/xfs_rmap_item.c     |    2 +-
> >  7 files changed, 67 insertions(+), 20 deletions(-)
> > 
> > diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
> > index e19dc1ced7e6..4af5752f9830 100644
> > --- a/fs/xfs/libxfs/xfs_defer.c
> > +++ b/fs/xfs/libxfs/xfs_defer.c
> > @@ -16,6 +16,7 @@
> >  #include "xfs_inode.h"
> >  #include "xfs_inode_item.h"
> >  #include "xfs_trace.h"
> > +#include "xfs_icache.h"
> >  
> >  /*
> >   * Deferred Operations in XFS
> > @@ -553,10 +554,14 @@ xfs_defer_move(
> >   * deferred ops state is transferred to the capture structure and the
> >   * transaction is then ready for the caller to commit it.  If there are no
> >   * intent items to capture, this function returns NULL.
> > + *
> > + * If inodes are passed in and this function returns a capture structure, the
> > + * inodes are now owned by the capture structure.
> >   */
> >  static struct xfs_defer_capture *
> >  xfs_defer_ops_capture(
> > -	struct xfs_trans		*tp)
> > +	struct xfs_trans		*tp,
> > +	struct xfs_inode		*ip)
> >  {
> >  	struct xfs_defer_capture	*dfc;
> >  
> > @@ -582,6 +587,12 @@ xfs_defer_ops_capture(
> >  	/* Preserve the log reservation size. */
> >  	dfc->dfc_logres = tp->t_log_res;
> >  
> > +	/*
> > +	 * Transfer responsibility for unlocking and releasing the inodes to
> > +	 * the capture structure.
> > +	 */
> > +	dfc->dfc_ip = ip;
> > +
> 
> Maybe rename ip to capture_ip?

Ok.

> > +	ASSERT(ip == NULL || xfs_isilocked(ip, XFS_ILOCK_EXCL));
> > +
> >  	/* If we don't capture anything, commit transaction and exit. */
> > +	dfc = xfs_defer_ops_capture(tp, ip);
> > +	if (!dfc) {
> > +		error = xfs_trans_commit(tp);
> > +		if (ip) {
> > +			xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > +			xfs_irele(ip);
> > +		}
> > +		return error;
> > +	}
> 
> Instead of coming up with our own inode unlocking and release schemes,
> can't we just require that the inode is joinged by passing the lock
> flags to xfs_trans_ijoin, and piggy back on xfs_trans_commit unlocking
> it in that case?

Yes, and let's also xfs_iget(capture_ip->i_ino) to increase the incore
inode's refcount, which would make it so that the caller would still
unlock and rele the reference that they got.

--D
Christoph Hellwig Oct. 5, 2020, 6:25 a.m. UTC | #3
On Fri, Oct 02, 2020 at 09:29:58AM -0700, Darrick J. Wong wrote:
> > Instead of coming up with our own inode unlocking and release schemes,
> > can't we just require that the inode is joinged by passing the lock
> > flags to xfs_trans_ijoin, and piggy back on xfs_trans_commit unlocking
> > it in that case?
> 
> Yes, and let's also xfs_iget(capture_ip->i_ino) to increase the incore
> inode's refcount, which would make it so that the caller would still
> unlock and rele the reference that they got.

Please use ihold(VFS_I(capture_ip)) as that is a lot more efficient.

Can you resend the whole 2 series?  I'm lost with all the incremental
updates for individual patches.
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index e19dc1ced7e6..4af5752f9830 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -16,6 +16,7 @@ 
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * Deferred Operations in XFS
@@ -553,10 +554,14 @@  xfs_defer_move(
  * deferred ops state is transferred to the capture structure and the
  * transaction is then ready for the caller to commit it.  If there are no
  * intent items to capture, this function returns NULL.
+ *
+ * If inodes are passed in and this function returns a capture structure, the
+ * inodes are now owned by the capture structure.
  */
 static struct xfs_defer_capture *
 xfs_defer_ops_capture(
-	struct xfs_trans		*tp)
+	struct xfs_trans		*tp,
+	struct xfs_inode		*ip)
 {
 	struct xfs_defer_capture	*dfc;
 
@@ -582,6 +587,12 @@  xfs_defer_ops_capture(
 	/* Preserve the log reservation size. */
 	dfc->dfc_logres = tp->t_log_res;
 
+	/*
+	 * Transfer responsibility for unlocking and releasing the inodes to
+	 * the capture structure.
+	 */
+	dfc->dfc_ip = ip;
+
 	return dfc;
 }
 
@@ -592,29 +603,49 @@  xfs_defer_ops_release(
 	struct xfs_defer_capture	*dfc)
 {
 	xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
+	if (dfc->dfc_ip)
+		xfs_irele(dfc->dfc_ip);
 	kmem_free(dfc);
 }
 
 /*
  * Capture any deferred ops and commit the transaction.  This is the last step
- * needed to finish a log intent item that we recovered from the log.
+ * needed to finish a log intent item that we recovered from the log.  If any
+ * of the deferred ops operate on an inode, the caller must pass in that inode
+ * so that the reference can be transferred to the capture structure.  The
+ * caller must hold ILOCK_EXCL on the inode, and must not touch the inode after
+ * this call returns.
  */
 int
 xfs_defer_ops_capture_and_commit(
 	struct xfs_trans		*tp,
+	struct xfs_inode		*ip,
 	struct list_head		*capture_list)
 {
 	struct xfs_mount		*mp = tp->t_mountp;
 	struct xfs_defer_capture	*dfc;
 	int				error;
 
+	ASSERT(ip == NULL || xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
 	/* If we don't capture anything, commit transaction and exit. */
-	dfc = xfs_defer_ops_capture(tp);
-	if (!dfc)
-		return xfs_trans_commit(tp);
+	dfc = xfs_defer_ops_capture(tp, ip);
+	if (!dfc) {
+		error = xfs_trans_commit(tp);
+		if (ip) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			xfs_irele(ip);
+		}
+		return error;
+	}
 
-	/* Commit the transaction and add the capture structure to the list. */
+	/*
+	 * Commit the transaction and add the capture structure to the list.
+	 * Once that's done, we can unlock the inode.
+	 */
 	error = xfs_trans_commit(tp);
+	if (ip)
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	if (error) {
 		xfs_defer_ops_release(mp, dfc);
 		return error;
@@ -626,16 +657,24 @@  xfs_defer_ops_capture_and_commit(
 
 /*
  * Attach a chain of captured deferred ops to a new transaction and free the
- * capture structure.
+ * capture structure.  A captured inode will be passed back to the caller.
  */
 void
 xfs_defer_ops_continue(
 	struct xfs_defer_capture	*dfc,
-	struct xfs_trans		*tp)
+	struct xfs_trans		*tp,
+	struct xfs_inode		**ipp)
 {
 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
 	ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
 
+	/* Lock and join the captured inode to the new transaction. */
+	if (dfc->dfc_ip) {
+		xfs_ilock(dfc->dfc_ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, dfc->dfc_ip, 0);
+	}
+	*ipp = dfc->dfc_ip;
+
 	/* Move captured dfops chain and state to the transaction. */
 	list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
 	tp->t_flags |= dfc->dfc_tpflags;
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 6cde6f0713f7..04d4def50b19 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -82,15 +82,22 @@  struct xfs_defer_capture {
 
 	/* Log reservation saved from the transaction. */
 	unsigned int		dfc_logres;
+
+	/*
+	 * An inode reference that must be maintained to complete the deferred
+	 * work.
+	 */
+	struct xfs_inode	*dfc_ip;
 };
 
 /*
  * Functions to capture a chain of deferred operations and continue them later.
  * This doesn't normally happen except log recovery.
  */
-int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
+int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, struct xfs_inode *ip,
 		struct list_head *capture_list);
-void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp);
+void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
+		struct xfs_inode **ipp);
 void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d);
 
 #endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 1c9cb5a04bb5..0ffbc75bafe1 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -513,15 +513,11 @@  xfs_bui_item_recover(
 		xfs_bmap_unmap_extent(tp, ip, &irec);
 	}
 
-	/* Commit transaction, which frees tp. */
-	error = xfs_defer_ops_capture_and_commit(tp, capture_list);
-	if (error)
-		goto err_unlock;
-	return 0;
+	/* Commit transaction, which frees the transaction and the inode. */
+	return xfs_defer_ops_capture_and_commit(tp, ip, capture_list);
 
 err_cancel:
 	xfs_trans_cancel(tp);
-err_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 err_rele:
 	xfs_irele(ip);
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 17d36fe5cfd0..3920542f5736 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -627,7 +627,7 @@  xfs_efi_item_recover(
 
 	}
 
-	return xfs_defer_ops_capture_and_commit(tp, capture_list);
+	return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
 
 abort_error:
 	xfs_trans_cancel(tp);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 001e1585ddc6..a8289adc1b29 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2439,6 +2439,7 @@  xlog_finish_defer_ops(
 {
 	struct xfs_defer_capture *dfc, *next;
 	struct xfs_trans	*tp;
+	struct xfs_inode	*ip;
 	int			error = 0;
 
 	list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
@@ -2464,9 +2465,13 @@  xlog_finish_defer_ops(
 		 * from recovering a single intent item.
 		 */
 		list_del_init(&dfc->dfc_list);
-		xfs_defer_ops_continue(dfc, tp);
+		xfs_defer_ops_continue(dfc, tp, &ip);
 
 		error = xfs_trans_commit(tp);
+		if (ip) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			xfs_irele(ip);
+		}
 		if (error)
 			return error;
 	}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 0478374add64..ad895b48f365 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -544,7 +544,7 @@  xfs_cui_item_recover(
 	}
 
 	xfs_refcount_finish_one_cleanup(tp, rcur, error);
-	return xfs_defer_ops_capture_and_commit(tp, capture_list);
+	return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
 
 abort_error:
 	xfs_refcount_finish_one_cleanup(tp, rcur, error);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 0d8fa707f079..1163f32c3e62 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -567,7 +567,7 @@  xfs_rui_item_recover(
 	}
 
 	xfs_rmap_finish_one_cleanup(tp, rcur, error);
-	return xfs_defer_ops_capture_and_commit(tp, capture_list);
+	return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
 
 abort_error:
 	xfs_rmap_finish_one_cleanup(tp, rcur, error);