diff mbox series

[1/2] xfs: reserve quota for dir expansion when linking/unlinking files

Message ID 164685374682.495923.2923492909223420951.stgit@magnolia (mailing list archive)
State Superseded
Headers show
Series xfs: make quota reservations for directory changes | expand

Commit Message

Darrick J. Wong March 9, 2022, 7:22 p.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

XFS does not reserve quota for directory expansion when linking or
unlinking children from a directory.  This means that we don't reject
the expansion with EDQUOT when we're at or near a hard limit, which
means that unprivileged userspace can use link()/unlink() to exceed
quota.

The fix for this is nuanced -- link operations don't always expand the
directory, and we allow a link to proceed with no space reservation if
we don't need to add a block to the directory to handle the addition.
Unlink operations generally do not expand the directory (you'd have to
free a block and then cause a btree split) and we can defer the
directory block freeing if there is no space reservation.

Moreover, there is a further bug in that we do not trigger the blockgc
workers to try to clear space when we're out of quota.

To fix both cases, create a new xfs_trans_alloc_dir function that
allocates the transaction, locks and joins the inodes, and reserves
quota for the directory.  If there isn't sufficient space or quota,
we'll switch the caller to reservationless mode.  This should prevent
quota usage overruns with the least restriction in functionality.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_inode.c |   30 +++++-------------
 fs/xfs/xfs_trans.c |   86 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_trans.h |    3 ++
 3 files changed, 97 insertions(+), 22 deletions(-)

Comments

Dave Chinner March 9, 2022, 9:48 p.m. UTC | #1
On Wed, Mar 09, 2022 at 11:22:26AM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> XFS does not reserve quota for directory expansion when linking or
> unlinking children from a directory.  This means that we don't reject
> the expansion with EDQUOT when we're at or near a hard limit, which
> means that unprivileged userspace can use link()/unlink() to exceed
> quota.
> 
> The fix for this is nuanced -- link operations don't always expand the
> directory, and we allow a link to proceed with no space reservation if
> we don't need to add a block to the directory to handle the addition.
> Unlink operations generally do not expand the directory (you'd have to
> free a block and then cause a btree split) and we can defer the
> directory block freeing if there is no space reservation.
> 
> Moreover, there is a further bug in that we do not trigger the blockgc
> workers to try to clear space when we're out of quota.
> 
> To fix both cases, create a new xfs_trans_alloc_dir function that
> allocates the transaction, locks and joins the inodes, and reserves
> quota for the directory.  If there isn't sufficient space or quota,
> we'll switch the caller to reservationless mode.  This should prevent
> quota usage overruns with the least restriction in functionality.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  fs/xfs/xfs_inode.c |   30 +++++-------------
>  fs/xfs/xfs_trans.c |   86 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_trans.h |    3 ++
>  3 files changed, 97 insertions(+), 22 deletions(-)

Overall looks good, minor nits below:

> 
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 04bf467b1090..a131bbfe74e4 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1217,7 +1217,7 @@ xfs_link(
>  {
>  	xfs_mount_t		*mp = tdp->i_mount;
>  	xfs_trans_t		*tp;
> -	int			error;
> +	int			error, space_error;
>  	int			resblks;
>  
>  	trace_xfs_link(tdp, target_name);
> @@ -1236,19 +1236,11 @@ xfs_link(
>  		goto std_return;
>  
>  	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
> -	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
> -	if (error == -ENOSPC) {
> -		resblks = 0;
> -		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
> -	}
> +	error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
> +			&tp, &space_error);

It's the nospace_error, isn't it? The code reads a lot better when
it's called that, too.


>  	if (error)
>  		goto std_return;
>  
> -	xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
> -
> -	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
> -	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
> -
>  	error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
>  			XFS_IEXT_DIR_MANIP_CNT(mp));
>  	if (error)
> @@ -1267,6 +1259,8 @@ xfs_link(
>  
>  	if (!resblks) {
>  		error = xfs_dir_canenter(tp, tdp, target_name);
> +		if (error == -ENOSPC && space_error)
> +			error = space_error;

This  would be better in the error_return stack, I think. That way
the transformation only has to be done once, and it will be done for
all functions that can potentially return ENOSPC.

>  		if (error)
>  			goto error_return;
>  	}
> @@ -2755,6 +2749,7 @@ xfs_remove(
>  	xfs_mount_t		*mp = dp->i_mount;
>  	xfs_trans_t             *tp = NULL;
>  	int			is_dir = S_ISDIR(VFS_I(ip)->i_mode);
> +	int			dontcare;
>  	int                     error = 0;
>  	uint			resblks;
>  
> @@ -2781,22 +2776,13 @@ xfs_remove(
>  	 * block from the directory.
>  	 */
>  	resblks = XFS_REMOVE_SPACE_RES(mp);
> -	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
> -	if (error == -ENOSPC) {
> -		resblks = 0;
> -		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
> -				&tp);
> -	}
> +	error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
> +			&tp, &dontcare);
>  	if (error) {
>  		ASSERT(error != -ENOSPC);
>  		goto std_return;
>  	}

So we just ignore -EDQUOT when it is returned in @dontcare? I'd like
a comment to explain why we don't care about EDQUOT here, because
the next time I look at this I will have forgotten all about this...

Cheers,

Dave.
Darrick J. Wong March 9, 2022, 11:33 p.m. UTC | #2
On Thu, Mar 10, 2022 at 08:48:21AM +1100, Dave Chinner wrote:
> On Wed, Mar 09, 2022 at 11:22:26AM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > XFS does not reserve quota for directory expansion when linking or
> > unlinking children from a directory.  This means that we don't reject
> > the expansion with EDQUOT when we're at or near a hard limit, which
> > means that unprivileged userspace can use link()/unlink() to exceed
> > quota.
> > 
> > The fix for this is nuanced -- link operations don't always expand the
> > directory, and we allow a link to proceed with no space reservation if
> > we don't need to add a block to the directory to handle the addition.
> > Unlink operations generally do not expand the directory (you'd have to
> > free a block and then cause a btree split) and we can defer the
> > directory block freeing if there is no space reservation.
> > 
> > Moreover, there is a further bug in that we do not trigger the blockgc
> > workers to try to clear space when we're out of quota.
> > 
> > To fix both cases, create a new xfs_trans_alloc_dir function that
> > allocates the transaction, locks and joins the inodes, and reserves
> > quota for the directory.  If there isn't sufficient space or quota,
> > we'll switch the caller to reservationless mode.  This should prevent
> > quota usage overruns with the least restriction in functionality.
> > 
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > ---
> >  fs/xfs/xfs_inode.c |   30 +++++-------------
> >  fs/xfs/xfs_trans.c |   86 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/xfs_trans.h |    3 ++
> >  3 files changed, 97 insertions(+), 22 deletions(-)
> 
> Overall looks good, minor nits below:
> 
> > 
> > 
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 04bf467b1090..a131bbfe74e4 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -1217,7 +1217,7 @@ xfs_link(
> >  {
> >  	xfs_mount_t		*mp = tdp->i_mount;
> >  	xfs_trans_t		*tp;
> > -	int			error;
> > +	int			error, space_error;
> >  	int			resblks;
> >  
> >  	trace_xfs_link(tdp, target_name);
> > @@ -1236,19 +1236,11 @@ xfs_link(
> >  		goto std_return;
> >  
> >  	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
> > -	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
> > -	if (error == -ENOSPC) {
> > -		resblks = 0;
> > -		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
> > -	}
> > +	error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
> > +			&tp, &space_error);
> 
> It's the nospace_error, isn't it? The code reads a lot better when
> it's called that, too.

Fixed.

> 
> >  	if (error)
> >  		goto std_return;
> >  
> > -	xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
> > -
> > -	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
> > -	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
> > -
> >  	error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
> >  			XFS_IEXT_DIR_MANIP_CNT(mp));
> >  	if (error)
> > @@ -1267,6 +1259,8 @@ xfs_link(
> >  
> >  	if (!resblks) {
> >  		error = xfs_dir_canenter(tp, tdp, target_name);
> > +		if (error == -ENOSPC && space_error)
> > +			error = space_error;
> 
> This  would be better in the error_return stack, I think. That way
> the transformation only has to be done once, and it will be done for
> all functions that can potentially return ENOSPC.

Ok.

> >  		if (error)
> >  			goto error_return;
> >  	}
> > @@ -2755,6 +2749,7 @@ xfs_remove(
> >  	xfs_mount_t		*mp = dp->i_mount;
> >  	xfs_trans_t             *tp = NULL;
> >  	int			is_dir = S_ISDIR(VFS_I(ip)->i_mode);
> > +	int			dontcare;
> >  	int                     error = 0;
> >  	uint			resblks;
> >  
> > @@ -2781,22 +2776,13 @@ xfs_remove(
> >  	 * block from the directory.
> >  	 */
> >  	resblks = XFS_REMOVE_SPACE_RES(mp);
> > -	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
> > -	if (error == -ENOSPC) {
> > -		resblks = 0;
> > -		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
> > -				&tp);
> > -	}
> > +	error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
> > +			&tp, &dontcare);
> >  	if (error) {
> >  		ASSERT(error != -ENOSPC);
> >  		goto std_return;
> >  	}
> 
> So we just ignore -EDQUOT when it is returned in @dontcare? I'd like
> a comment to explain why we don't care about EDQUOT here, because
> the next time I look at this I will have forgotten all about this...

Ok.  How about:

	/*
	 * We try to get the real space reservation first, allowing for
	 * directory btree deletion(s) implying possible bmap insert(s).
	 * If we can't get the space reservation then we use 0 instead,
	 * and avoid the bmap btree insert(s) in the directory code by,
	 * if the bmap insert tries to happen, instead trimming the LAST
	 * block from the directory.
	 *
	 * Ignore EDQUOT and ENOSPC being returned via nospace_error
	 * because the directory code can handle a reservationless
	 * update and we don't want to prevent a user from trying to
	 * free space by deleting things.
	 */
	error = xfs_trans_alloc_dir(...);

--D

> Cheers,
> 
> Dave.
> 
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner March 10, 2022, 1:50 a.m. UTC | #3
On Wed, Mar 09, 2022 at 03:33:02PM -0800, Darrick J. Wong wrote:
> On Thu, Mar 10, 2022 at 08:48:21AM +1100, Dave Chinner wrote:
> > On Wed, Mar 09, 2022 at 11:22:26AM -0800, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <djwong@kernel.org>
> > >  		if (error)
> > >  			goto error_return;
> > >  	}
> > > @@ -2755,6 +2749,7 @@ xfs_remove(
> > >  	xfs_mount_t		*mp = dp->i_mount;
> > >  	xfs_trans_t             *tp = NULL;
> > >  	int			is_dir = S_ISDIR(VFS_I(ip)->i_mode);
> > > +	int			dontcare;
> > >  	int                     error = 0;
> > >  	uint			resblks;
> > >  
> > > @@ -2781,22 +2776,13 @@ xfs_remove(
> > >  	 * block from the directory.
> > >  	 */
> > >  	resblks = XFS_REMOVE_SPACE_RES(mp);
> > > -	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
> > > -	if (error == -ENOSPC) {
> > > -		resblks = 0;
> > > -		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
> > > -				&tp);
> > > -	}
> > > +	error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
> > > +			&tp, &dontcare);
> > >  	if (error) {
> > >  		ASSERT(error != -ENOSPC);
> > >  		goto std_return;
> > >  	}
> > 
> > So we just ignore -EDQUOT when it is returned in @dontcare? I'd like
> > a comment to explain why we don't care about EDQUOT here, because
> > the next time I look at this I will have forgotten all about this...
> 
> Ok.  How about:
> 
> 	/*
> 	 * We try to get the real space reservation first, allowing for
> 	 * directory btree deletion(s) implying possible bmap insert(s).
> 	 * If we can't get the space reservation then we use 0 instead,
> 	 * and avoid the bmap btree insert(s) in the directory code by,
> 	 * if the bmap insert tries to happen, instead trimming the LAST
> 	 * block from the directory.
> 	 *
> 	 * Ignore EDQUOT and ENOSPC being returned via nospace_error
> 	 * because the directory code can handle a reservationless
> 	 * update and we don't want to prevent a user from trying to
> 	 * free space by deleting things.
> 	 */
> 	error = xfs_trans_alloc_dir(...);

Yeah, that looks good.

Cheers,

Dave.
diff mbox series

Patch

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 04bf467b1090..a131bbfe74e4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1217,7 +1217,7 @@  xfs_link(
 {
 	xfs_mount_t		*mp = tdp->i_mount;
 	xfs_trans_t		*tp;
-	int			error;
+	int			error, space_error;
 	int			resblks;
 
 	trace_xfs_link(tdp, target_name);
@@ -1236,19 +1236,11 @@  xfs_link(
 		goto std_return;
 
 	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
-	if (error == -ENOSPC) {
-		resblks = 0;
-		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
-	}
+	error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
+			&tp, &space_error);
 	if (error)
 		goto std_return;
 
-	xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
-
-	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
-
 	error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
 			XFS_IEXT_DIR_MANIP_CNT(mp));
 	if (error)
@@ -1267,6 +1259,8 @@  xfs_link(
 
 	if (!resblks) {
 		error = xfs_dir_canenter(tp, tdp, target_name);
+		if (error == -ENOSPC && space_error)
+			error = space_error;
 		if (error)
 			goto error_return;
 	}
@@ -2755,6 +2749,7 @@  xfs_remove(
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_trans_t             *tp = NULL;
 	int			is_dir = S_ISDIR(VFS_I(ip)->i_mode);
+	int			dontcare;
 	int                     error = 0;
 	uint			resblks;
 
@@ -2781,22 +2776,13 @@  xfs_remove(
 	 * block from the directory.
 	 */
 	resblks = XFS_REMOVE_SPACE_RES(mp);
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
-	if (error == -ENOSPC) {
-		resblks = 0;
-		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
-				&tp);
-	}
+	error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
+			&tp, &dontcare);
 	if (error) {
 		ASSERT(error != -ENOSPC);
 		goto std_return;
 	}
 
-	xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
-
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
 	/*
 	 * If we're removing a directory perform some additional validation.
 	 */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 59e2f9031b9f..75327f2aeb99 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1210,3 +1210,89 @@  xfs_trans_alloc_ichange(
 	xfs_trans_cancel(tp);
 	return error;
 }
+
+/*
+ * Allocate an transaction, lock and join the directory and child inodes to it,
+ * and reserve quota for a directory update.  If there isn't sufficient space,
+ * @dblocks will be set to zero for a reservationless directory update and
+ * @space_error will be set to a negative errno describing the space constraint
+ * we hit.
+ *
+ * The caller must ensure that the on-disk dquots attached to this inode have
+ * already been allocated and initialized.  The ILOCKs will be dropped when the
+ * transaction is committed or cancelled.
+ */
+int
+xfs_trans_alloc_dir(
+	struct xfs_inode	*dp,
+	struct xfs_trans_res	*resv,
+	struct xfs_inode	*ip,
+	unsigned int		*dblocks,
+	struct xfs_trans	**tpp,
+	int			*space_error)
+{
+	struct xfs_trans	*tp;
+	struct xfs_mount	*mp = ip->i_mount;
+	unsigned int		resblks;
+	bool			retried = false;
+	int			error;
+
+retry:
+	*space_error = 0;
+	resblks = *dblocks;
+	error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
+	if (error == -ENOSPC) {
+		*space_error = error;
+		resblks = 0;
+		error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
+	}
+	if (error)
+		return error;
+
+	xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
+
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	error = xfs_qm_dqattach_locked(dp, false);
+	if (error) {
+		/* Caller should have allocated the dquots! */
+		ASSERT(error != -ENOENT);
+		goto out_cancel;
+	}
+
+	error = xfs_qm_dqattach_locked(ip, false);
+	if (error) {
+		/* Caller should have allocated the dquots! */
+		ASSERT(error != -ENOENT);
+		goto out_cancel;
+	}
+
+	if (resblks == 0)
+		goto done;
+
+	error = xfs_trans_reserve_quota_nblks(tp, dp, resblks, 0, false);
+	if (error == -EDQUOT || error == -ENOSPC) {
+		if (!retried) {
+			xfs_trans_cancel(tp);
+			xfs_blockgc_free_quota(dp, 0);
+			retried = true;
+			goto retry;
+		}
+
+		*space_error = error;
+		resblks = 0;
+		error = 0;
+	}
+	if (error)
+		goto out_cancel;
+
+done:
+	*tpp = tp;
+	*dblocks = resblks;
+	return 0;
+
+out_cancel:
+	xfs_trans_cancel(tp);
+	return error;
+}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a487b264a9eb..3b9128d6e214 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -259,6 +259,9 @@  int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv,
 int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
 		struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
 		struct xfs_trans **tpp);
+int xfs_trans_alloc_dir(struct xfs_inode *dp, struct xfs_trans_res *resv,
+		struct xfs_inode *ip, unsigned int *dblocks,
+		struct xfs_trans **tpp, int *space_error);
 
 static inline void
 xfs_trans_set_context(