
[07/12] xfs: don't preallocate a transaction for file size updates

Message ID 20190624055253.31183-8-hch@lst.de (mailing list archive)
State New, archived
Series [01/12] list.h: add a list_pop helper

Commit Message

Christoph Hellwig June 24, 2019, 5:52 a.m. UTC
We have historically decided that we want to preallocate the xfs_trans
structure at writeback time so that we don't have to allocate one in
the I/O completion handler.  But we already treat unwritten extent and
COW fork conversions differently, which proves that the transaction
allocations in the end I/O handler are not a problem.  Removing the
preallocation gets rid of a lot of corner case code, and also ensures
we only allocate and log a transaction when actually required, as the
ioend merging can reduce the number of actual i_size updates
significantly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 110 +++++-----------------------------------------
 fs/xfs/xfs_aops.h |   1 -
 2 files changed, 12 insertions(+), 99 deletions(-)
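
Condensed, the post-patch flow looks roughly like this (a sketch pieced
together from the hunks below, not a verbatim copy; the body that checks
xfs_new_eof() and logs the inode core is elided):

    /* xfs_setfilesize() after the patch: allocate the transaction on demand */
    int
    xfs_setfilesize(
            struct xfs_inode        *ip,
            xfs_off_t               offset,
            size_t                  size)
    {
            struct xfs_mount        *mp = ip->i_mount;
            struct xfs_trans        *tp;
            int                     error;

            /* allocate and reserve log space only when a size update is needed */
            error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
            if (error)
                    return error;

            xfs_ilock(ip, XFS_ILOCK_EXCL);
            /* check xfs_new_eof(), update and log the on-disk size (elided) */
            return xfs_trans_commit(tp);
    }

    /* in xfs_end_ioend(), after the COW / unwritten conversion: */
    if (ioend->io_fork == XFS_COW_FORK)
            error = xfs_reflink_end_cow(ip, offset, size);
    else if (ioend->io_type == IOMAP_UNWRITTEN)
            error = xfs_iomap_write_unwritten(ip, offset, size, false);
    if (!error && xfs_ioend_is_append(ioend))
            error = xfs_setfilesize(ip, offset, size);

There is no io_append_trans hand-off any more, and no freeze-protection or
PF_MEMALLOC_NOFS juggling between the submission and completion contexts.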

Comments

Darrick J. Wong June 24, 2019, 4:17 p.m. UTC | #1
On Mon, Jun 24, 2019 at 07:52:48AM +0200, Christoph Hellwig wrote:
> We have historically decided that we want to preallocate the xfs_trans
> structure at writeback time so that we don't have to allocate one in
> the I/O completion handler.  But we already treat unwritten extent and
> COW fork conversions differently, which proves that the transaction
> allocations in the end I/O handler are not a problem.  Removing the
> preallocation gets rid of a lot of corner case code, and also ensures
> we only allocate and log a transaction when actually required, as the
> ioend merging can reduce the number of actual i_size updates
> significantly.

That's what I thought when I wrote the ioend merging patches, but IIRC
Dave objected on the grounds that most file writes are trivial file
extending writes and therefore we should leave this alone to avoid
slowing down the ioend path even if it came at a cost of cancelling a
lot of empty transactions.

I wasn't 100% convinced it mattered but ran out of time in the
development window and never got around to researching if it made any
difference.

So, uh, how much of a hit do we take for having to allocate a
transaction for a file size extension?  Particularly since we can
combine those things now?

--D

Dave Chinner June 24, 2019, 11:15 p.m. UTC | #2
On Mon, Jun 24, 2019 at 09:17:20AM -0700, Darrick J. Wong wrote:
> On Mon, Jun 24, 2019 at 07:52:48AM +0200, Christoph Hellwig wrote:
> > We have historically decided that we want to preallocate the xfs_trans
> > structure at writeback time so that we don't have to allocate one in
> > the I/O completion handler.  But we already treat unwritten extent and
> > COW fork conversions differently, which proves that the transaction
> > allocations in the end I/O handler are not a problem.  Removing the
> > preallocation gets rid of a lot of corner case code, and also ensures
> > we only allocate and log a transaction when actually required, as the
> > ioend merging can reduce the number of actual i_size updates
> > significantly.
> 
> That's what I thought when I wrote the ioend merging patches, but IIRC
> Dave objected on the grounds that most file writes are trivial file
> extending writes and therefore we should leave this alone to avoid
> slowing down the ioend path even if it came at a cost of cancelling a
> lot of empty transactions.

The issue is stuff like extracting a tarball, where we might write a
hundred thousand files and they are all written in a single IO. i.e.
there is no IO completion merging at all.

> I wasn't 100% convinced it mattered but ran out of time in the
> development window and never got around to researching if it made any
> difference.

Yeah, it's not all that simple :/

In these cases, we always have to allocate a transaction for every
file being written. If we do it before we submit the IO, then all
the transactions are allocated from the single writeback context. If
we don't have log space, data writeback pauses while the tail of the
AIL is pushed, metadata writeback occurs, and then the transaction
allocation for data writeback is woken, and data writeback
submission continues. It's fairly orderly, and we don't end up
trying to write back data while we are doing bulk metadata flushing
from the AIL.

If we delay the transaction allocation to the ioend context and we
are low on log space, we end up blocking a kworker on a transaction
allocation which then has to wait for metadata writeback. The
kworker infrastructure will then issue the next ioend work, which
then blocks on transaction allocation. Delayed allocation can cause
thousands of small file IOs to be inflight at the same time due to
intra-file contiguous allocation and IO merging in the block layer,
hence we can have thousands of individual IO completions that
require transaction allocation to be completed.

> So, uh, how much of a hit do we take for having to allocate a
> transaction for a file size extension?  Particularly since we can
> combine those things now?

Unless we are out of log space, the transaction allocation and free
should be largely uncontended and so it's just a small amount of CPU
usage, i.e. it's a slab allocation/free and then lockless space
reservation/free. If we are out of log space, then we sleep waiting
for space - the issue really comes down to where it is better to
sleep in that case....

Cheers,

Dave.
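
To put that trade-off in code terms: with the preallocation gone, every
size-extending completion runs something like the fragment below (a sketch
assuming the usual xfs_trans_alloc() reservation behaviour Dave describes
above, not code taken verbatim from the patch):

    /*
     * A slab allocation plus a log space reservation.  Normally both are
     * uncontended and cheap; if the log is out of space this call sleeps
     * until the tail of the AIL has been pushed.  After this patch that
     * sleep happens in the completion kworker rather than in the writeback
     * submission context, which is the behaviour change under discussion.
     */
    error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
    if (error)
            return error;

    /* update and log the inode size, then commit (or cancel on error) */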
Christoph Hellwig June 25, 2019, 10:25 a.m. UTC | #3
On Tue, Jun 25, 2019 at 09:15:23AM +1000, Dave Chinner wrote:
> > So, uh, how much of a hit do we take for having to allocate a
> > transaction for a file size extension?  Particularly since we can
> > combine those things now?
> 
> Unless we are out of log space, the transaction allocation and free
> should be largely uncontended and so it's just a small amount of CPU
> usage. i.e it's a slab allocation/free and then lockless space
> reservation/free. If we are out of log space, then we sleep waiting
> for space - the issue really comes down to where it is better to
> sleep in that case....

I see the general point, but we'll still have the same issue with
unwritten extent conversion and cow completions, and I don't remember
seeing any issue in that regard.  And we'd hit exactly that case
with random writes to preallocated or COW files, i.e. the typical image
file workload.
Dave Chinner June 27, 2019, 10:23 p.m. UTC | #4
On Tue, Jun 25, 2019 at 12:25:07PM +0200, Christoph Hellwig wrote:
> On Tue, Jun 25, 2019 at 09:15:23AM +1000, Dave Chinner wrote:
> > > So, uh, how much of a hit do we take for having to allocate a
> > > transaction for a file size extension?  Particularly since we can
> > > combine those things now?
> > 
> > Unless we are out of log space, the transaction allocation and free
> > should be largely uncontended and so it's just a small amount of CPU
> > usage. i.e it's a slab allocation/free and then lockless space
> > reservation/free. If we are out of log space, then we sleep waiting
> > for space - the issue really comes down to where it is better to
> > sleep in that case....
> 
> I see the general point, but we'll still have the same issue with
> unwritten extent conversion and cow completions, and I don't remember
> seeing any issue in that regard.

These are relatively rare for small file workloads - I'm really
talking about the effect of delalloc and how we've optimised
allocation during writeback to merge small, cross-file writeback
into much larger physical IOs. Neither unwritten extents nor COW are
used in these (common) cases, and if they are, then the allocation
patterns prevent the cross-file IO merging in the block layer, so
we don't get the "hundred ioends for a hundred inodes from a single
physical IO completion" thundering herd problem....

> And we'd hit exactly that case
> with random writes to preallocated or COW files, i.e. the typical image
> file workload.

I do see a noticeable amount of IO completion overhead in the host
when hitting unwritten extents in VM image workloads. I'll see if I
can track the number of kworkers we're stalling under some of
these workloads, but I think it's still largely bound by the request
queue depth of the IO stack inside the VM because there is no IO
merging in these cases.

Cheers,

Dave.

Patch

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 633baaaff7ae..017b87b7765f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -130,44 +130,23 @@  static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
 		XFS_I(ioend->io_inode)->i_d.di_size;
 }
 
-STATIC int
-xfs_setfilesize_trans_alloc(
-	struct xfs_ioend	*ioend)
-{
-	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
-	struct xfs_trans	*tp;
-	int			error;
-
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
-	if (error)
-		return error;
-
-	ioend->io_append_trans = tp;
-
-	/*
-	 * We may pass freeze protection with a transaction.  So tell lockdep
-	 * we released it.
-	 */
-	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
-	/*
-	 * We hand off the transaction to the completion thread now, so
-	 * clear the flag here.
-	 */
-	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-	return 0;
-}
-
 /*
  * Update on-disk file size now that data has been written to disk.
  */
-STATIC int
-__xfs_setfilesize(
+int
+xfs_setfilesize(
 	struct xfs_inode	*ip,
-	struct xfs_trans	*tp,
 	xfs_off_t		offset,
 	size_t			size)
 {
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
 	xfs_fsize_t		isize;
+	int			error;
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+	if (error)
+		return error;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	isize = xfs_new_eof(ip, offset + size);
@@ -186,48 +165,6 @@  __xfs_setfilesize(
 	return xfs_trans_commit(tp);
 }
 
-int
-xfs_setfilesize(
-	struct xfs_inode	*ip,
-	xfs_off_t		offset,
-	size_t			size)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_trans	*tp;
-	int			error;
-
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
-	if (error)
-		return error;
-
-	return __xfs_setfilesize(ip, tp, offset, size);
-}
-
-STATIC int
-xfs_setfilesize_ioend(
-	struct xfs_ioend	*ioend,
-	int			error)
-{
-	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
-	struct xfs_trans	*tp = ioend->io_append_trans;
-
-	/*
-	 * The transaction may have been allocated in the I/O submission thread,
-	 * thus we need to mark ourselves as being in a transaction manually.
-	 * Similarly for freeze protection.
-	 */
-	current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
-
-	/* we abort the update if there was an IO error */
-	if (error) {
-		xfs_trans_cancel(tp);
-		return error;
-	}
-
-	return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
-}
-
 /*
  * IO write completion.
  */
@@ -267,12 +204,9 @@  xfs_end_ioend(
 		error = xfs_reflink_end_cow(ip, offset, size);
 	else if (ioend->io_type == IOMAP_UNWRITTEN)
 		error = xfs_iomap_write_unwritten(ip, offset, size, false);
-	else
-		ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
-
+	if (!error && xfs_ioend_is_append(ioend))
+		error = xfs_setfilesize(ip, offset, size);
 done:
-	if (ioend->io_append_trans)
-		error = xfs_setfilesize_ioend(ioend, error);
 	list_replace_init(&ioend->io_list, &ioend_list);
 	xfs_destroy_ioend(ioend, error);
 
@@ -307,8 +241,6 @@  xfs_ioend_can_merge(
 		return false;
 	if (ioend->io_offset + ioend->io_size != next->io_offset)
 		return false;
-	if (xfs_ioend_is_append(ioend) != xfs_ioend_is_append(next))
-		return false;
 	return true;
 }
 
@@ -320,7 +252,6 @@  xfs_ioend_try_merge(
 {
 	struct xfs_ioend	*next_ioend;
 	int			ioend_error;
-	int			error;
 
 	if (list_empty(more_ioends))
 		return;
@@ -334,10 +265,6 @@  xfs_ioend_try_merge(
 			break;
 		list_move_tail(&next_ioend->io_list, &ioend->io_list);
 		ioend->io_size += next_ioend->io_size;
-		if (ioend->io_append_trans) {
-			error = xfs_setfilesize_ioend(next_ioend, 1);
-			ASSERT(error == 1);
-		}
 	}
 }
 
@@ -398,7 +325,7 @@  xfs_end_bio(
 
 	if (ioend->io_fork == XFS_COW_FORK ||
 	    ioend->io_type == IOMAP_UNWRITTEN ||
-	    ioend->io_append_trans != NULL) {
+	    xfs_ioend_is_append(ioend)) {
 		spin_lock_irqsave(&ip->i_ioend_lock, flags);
 		if (list_empty(&ip->i_ioend_list))
 			WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
@@ -660,18 +587,6 @@  xfs_submit_ioend(
 		memalloc_nofs_restore(nofs_flag);
 	}
 
-	/* Reserve log space if we might write beyond the on-disk inode size. */
-	if (!status &&
-	    (ioend->io_fork == XFS_COW_FORK ||
-	     ioend->io_type != IOMAP_UNWRITTEN) &&
-	    xfs_ioend_is_append(ioend) &&
-	    !ioend->io_append_trans) {
-		unsigned nofs_flag = memalloc_nofs_save();
-
-		status = xfs_setfilesize_trans_alloc(ioend);
-		memalloc_nofs_restore(nofs_flag);
-	}
-
 	ioend->io_bio->bi_private = ioend;
 	ioend->io_bio->bi_end_io = xfs_end_bio;
 
@@ -715,7 +630,6 @@  xfs_alloc_ioend(
 	ioend->io_inode = inode;
 	ioend->io_size = 0;
 	ioend->io_offset = offset;
-	ioend->io_append_trans = NULL;
 	ioend->io_bio = bio;
 	return ioend;
 }
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 72e30d1c3bdf..23c087f0bcbf 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,7 +18,6 @@  struct xfs_ioend {
 	struct inode		*io_inode;	/* file being written to */
 	size_t			io_size;	/* size of the extent */
 	xfs_off_t		io_offset;	/* offset in the file */
-	struct xfs_trans	*io_append_trans;/* xact. for size update */
 	struct bio		*io_bio;	/* bio being built */
 	struct bio		io_inline_bio;	/* MUST BE LAST! */
 };